From 3bb3fd19ae115484878e421649dba4b9549aa42a Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Thu, 12 Dec 2024 18:20:13 +0100 Subject: [PATCH 001/183] Fixup opt to reduce the amount of odd if statements. (#2833) * Fixup opt to reduce the amount of odd if statements. * Fixing cargo lock --- Cargo.lock | 14 +++++++------- .../models/custom_modeling/opt_modeling.py | 11 ++++++++--- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 9551ae2d9..f0b756f9b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4367,7 +4367,7 @@ dependencies = [ [[package]] name = "text-generation-backends-trtllm" -version = "3.0.1-dev0" +version = "3.0.2-dev0" dependencies = [ "async-stream", "async-trait", @@ -4391,7 +4391,7 @@ dependencies = [ [[package]] name = "text-generation-benchmark" -version = "3.0.1-dev0" +version = "3.0.2-dev0" dependencies = [ "average", "clap 4.5.21", @@ -4411,7 +4411,7 @@ dependencies = [ [[package]] name = "text-generation-client" -version = "3.0.1-dev0" +version = "3.0.2-dev0" dependencies = [ "async-trait", "base64 0.22.1", @@ -4429,7 +4429,7 @@ dependencies = [ [[package]] name = "text-generation-launcher" -version = "3.0.1-dev0" +version = "3.0.2-dev0" dependencies = [ "clap 4.5.21", "ctrlc", @@ -4450,7 +4450,7 @@ dependencies = [ [[package]] name = "text-generation-router" -version = "3.0.1-dev0" +version = "3.0.2-dev0" dependencies = [ "anyhow", "async-stream", @@ -4501,7 +4501,7 @@ dependencies = [ [[package]] name = "text-generation-router-v2" -version = "3.0.1-dev0" +version = "3.0.2-dev0" dependencies = [ "async-stream", "async-trait", @@ -4550,7 +4550,7 @@ dependencies = [ [[package]] name = "text-generation-router-v3" -version = "3.0.1-dev0" +version = "3.0.2-dev0" dependencies = [ "async-stream", "async-trait", diff --git a/server/text_generation_server/models/custom_modeling/opt_modeling.py b/server/text_generation_server/models/custom_modeling/opt_modeling.py index a6348b5b9..db73ae84e 100644 --- a/server/text_generation_server/models/custom_modeling/opt_modeling.py +++ b/server/text_generation_server/models/custom_modeling/opt_modeling.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" PyTorch OPT model.""" +"""PyTorch OPT model.""" + import random from typing import List, Optional, Tuple, Union @@ -317,7 +318,6 @@ class OPTDecoderLayer(nn.Module): super().__init__() self.process_group = weights.process_group self.hidden_size = config.hidden_size - prefix = f"{prefix if prefix else ''}decoder.layers.{layer_id}" self.self_attn = OPTAttention( config, prefix=f"{prefix}.self_attn", @@ -478,7 +478,12 @@ class OPTDecoder(OPTPreTrainedModel): self.layers = nn.ModuleList( [ - OPTDecoderLayer(layer_id, prefix, config, weights) + OPTDecoderLayer( + layer_id, + prefix=f"{prefix}decoder.layers.{layer_id}", + config=config, + weights=weights, + ) for layer_id in range(config.num_hidden_layers) ] ) From ea7f4082c454e00e52d2fe06a757675c2f84843f Mon Sep 17 00:00:00 2001 From: Funtowicz Morgan Date: Fri, 13 Dec 2024 15:50:59 +0100 Subject: [PATCH 002/183] TensorRT-LLM backend bump to latest version + misc fixes (#2791) * misc(cmake) update dependencies * feat(hardware) enable new hardware.hpp and unittests * test(ctest) enable address sanitizer * feat(backend): initial rewrite of the backend for simplicity * feat(backend): remove all the logs from hardware.hpp * feat(backend): added some logging * feat(backend): enable compiler warning if support for RVO not applying * feat(backend): missing return statement * feat(backend): introduce backend_workspace_t to store precomputed information from the engine folder * feat(backend): delete previous backend impl * feat(backend): more impl * feat(backend): use latest trtllm main version to have g++ >= 13 compatibility * feat(backend): allow overriding which Python to use * feat(backend): fix backend_exception_t -> backend_error_t naming * feat(backend): impl missing generation_step_t as return value of pull_tokens * feat(backend): make backend_workspace_t::engines_folder constexpr * feat(backend): fix main.rs retrieving the tokenizer * feat(backend): add guard to multiple header definitions * test(backend): add more unittest * feat(backend): remove constexpr from par * feat(backend): remove constexpig * test(backend): more test coverage * chore(trtllm): update dependency towards 0.15.0 * effectively cancel the request on the executor * feat(backend) fix moving backend when pulling * feat(backend): make sure we can easily cancel request on the executor * feat(backend): fix missing "0" field access * misc(backend): fix reborrowing Pin<&mut T> as described in the doc https://doc.rust-lang.org/stable/std/pin/struct.Pin.html#method.as_mut * chore: Add doc and CI for TRTLLM (#2799) * chore: Add doc and CI for TRTLLM * chore: Add doc and CI for TRTLLM * chore: Add doc and CI for TRTLLM * chore: Add doc and CI for TRTLLM * doc: Formatting * misc(backend): indent --------- Co-authored-by: Hugo Larcher --- .github/workflows/build.yaml | 10 + .github/workflows/ci_build.yaml | 2 +- Cargo.lock | 55 +---- Dockerfile_trtllm | 21 +- backends/trtllm/CMakeLists.txt | 54 ++++- backends/trtllm/Cargo.toml | 9 +- backends/trtllm/build.rs | 34 +-- backends/trtllm/cmake/fmt.cmake | 6 - backends/trtllm/cmake/json.cmake | 4 +- backends/trtllm/cmake/spdlog.cmake | 4 +- backends/trtllm/cmake/trtllm.cmake | 9 +- backends/trtllm/csrc/backend.cpp | 79 +++++++ backends/trtllm/csrc/backend.hpp | 231 ++++++++++++++++++++ backends/trtllm/csrc/ffi.hpp | 162 ++++++++++++++ backends/trtllm/csrc/hardware.hpp | 81 +++++++ backends/trtllm/include/backend.h | 144 ------------ backends/trtllm/include/ffi.h | 75 ------- backends/trtllm/include/hardware.h | 59 ----- 
backends/trtllm/lib/backend.cpp | 203 ----------------- backends/trtllm/scripts/install_tensorrt.sh | 2 +- backends/trtllm/src/ffi.cpp | 89 -------- backends/trtllm/src/lib.rs | 22 +- backends/trtllm/src/looper.rs | 223 ++++++++----------- backends/trtllm/src/main.rs | 154 ++++++++----- backends/trtllm/tests/infer_test.cpp | 14 -- backends/trtllm/tests/test_backend.cpp | 152 +++++++++++++ backends/trtllm/tests/test_hardware.cpp | 82 +++++++ docs/source/_toctree.yml | 6 + docs/source/architecture.md | 4 +- docs/source/backends/trtllm.md | 81 +++++++ docs/source/multi_backend_support.md | 13 ++ router/src/server.rs | 4 +- 32 files changed, 1192 insertions(+), 896 deletions(-) delete mode 100644 backends/trtllm/cmake/fmt.cmake create mode 100644 backends/trtllm/csrc/backend.cpp create mode 100644 backends/trtllm/csrc/backend.hpp create mode 100644 backends/trtllm/csrc/ffi.hpp create mode 100644 backends/trtllm/csrc/hardware.hpp delete mode 100644 backends/trtllm/include/backend.h delete mode 100644 backends/trtllm/include/ffi.h delete mode 100644 backends/trtllm/include/hardware.h delete mode 100644 backends/trtllm/lib/backend.cpp delete mode 100644 backends/trtllm/src/ffi.cpp delete mode 100644 backends/trtllm/tests/infer_test.cpp create mode 100644 backends/trtllm/tests/test_backend.cpp create mode 100644 backends/trtllm/tests/test_hardware.cpp create mode 100644 docs/source/backends/trtllm.md create mode 100644 docs/source/multi_backend_support.md diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 6d867ebb7..c0199a66a 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -8,6 +8,7 @@ on: description: Hardware # options: # - cuda + # - cuda-trtllm # - rocm # - intel required: true @@ -52,6 +53,15 @@ jobs: export platform="" export extra_pytest="" ;; + cuda-trtllm) + export dockerfile="Dockerfile_trtllm" + export label_extension="-trtllm" + export docker_volume="/mnt/cache" + export docker_devices="" + export runs_on="ubuntu-latest" + export platform="" + export extra_pytest="" + ;; rocm) export dockerfile="Dockerfile_amd" export label_extension="-rocm" diff --git a/.github/workflows/ci_build.yaml b/.github/workflows/ci_build.yaml index 5190f3217..0d87cb29d 100644 --- a/.github/workflows/ci_build.yaml +++ b/.github/workflows/ci_build.yaml @@ -37,7 +37,7 @@ jobs: # fail-fast is true by default fail-fast: false matrix: - hardware: ["cuda", "rocm", "intel-xpu", "intel-cpu"] + hardware: ["cuda", "cuda-trtllm", "rocm", "intel-xpu", "intel-cpu"] uses: ./.github/workflows/build.yaml # calls the one above ^ permissions: contents: write diff --git a/Cargo.lock b/Cargo.lock index f0b756f9b..74ae6e162 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2850,20 +2850,6 @@ dependencies = [ "urlencoding", ] -[[package]] -name = "opentelemetry" -version = "0.24.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c365a63eec4f55b7efeceb724f1336f26a9cf3427b70e59e2cd2a5b947fba96" -dependencies = [ - "futures-core", - "futures-sink", - "js-sys", - "once_cell", - "pin-project-lite", - "thiserror", -] - [[package]] name = "opentelemetry-otlp" version = "0.13.0" @@ -2963,24 +2949,6 @@ dependencies = [ "thiserror", ] -[[package]] -name = "opentelemetry_sdk" -version = "0.24.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "692eac490ec80f24a17828d49b40b60f5aeaccdfe6a503f939713afd22bc28df" -dependencies = [ - "async-trait", - "futures-channel", - "futures-executor", - "futures-util", - "glob", - "once_cell", 
- "opentelemetry 0.24.0", - "percent-encoding", - "rand", - "thiserror", -] - [[package]] name = "option-ext" version = "0.2.0" @@ -4369,7 +4337,6 @@ dependencies = [ name = "text-generation-backends-trtllm" version = "3.0.2-dev0" dependencies = [ - "async-stream", "async-trait", "clap 4.5.21", "cmake", @@ -4377,16 +4344,14 @@ dependencies = [ "cxx-build", "hashbrown 0.14.5", "hf-hub", - "log", "pkg-config", + "pyo3", "text-generation-router", "thiserror", "tokenizers", "tokio", "tokio-stream", "tracing", - "tracing-opentelemetry 0.25.0", - "tracing-subscriber", ] [[package]] @@ -5086,24 +5051,6 @@ dependencies = [ "web-time 0.2.4", ] -[[package]] -name = "tracing-opentelemetry" -version = "0.25.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a9784ed4da7d921bc8df6963f8c80a0e4ce34ba6ba76668acadd3edbd985ff3b" -dependencies = [ - "js-sys", - "once_cell", - "opentelemetry 0.24.0", - "opentelemetry_sdk 0.24.1", - "smallvec", - "tracing", - "tracing-core", - "tracing-log 0.2.0", - "tracing-subscriber", - "web-time 1.1.0", -] - [[package]] name = "tracing-opentelemetry-instrumentation-sdk" version = "0.16.0" diff --git a/Dockerfile_trtllm b/Dockerfile_trtllm index 3ccb0310b..b4523ea59 100644 --- a/Dockerfile_trtllm +++ b/Dockerfile_trtllm @@ -1,5 +1,5 @@ ARG CUDA_ARCH_LIST="75-real;80-real;86-real;89-real;90-real" -ARG OMPI_VERSION="4.1.6" +ARG OMPI_VERSION="4.1.7rc1" # Build dependencies resolver stage FROM lukemathwalker/cargo-chef:latest AS chef @@ -10,7 +10,7 @@ COPY . . RUN cargo chef prepare --recipe-path recipe.json # CUDA dependent dependencies resolver stage -FROM nvidia/cuda:12.6.1-cudnn-devel-ubuntu22.04 AS cuda-builder +FROM nvidia/cuda:12.6.3-cudnn-devel-ubuntu24.04 AS cuda-builder RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ --mount=type=cache,target=/var/lib/apt,sharing=locked \ @@ -18,18 +18,21 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ build-essential \ cmake \ curl \ - gcc \ - g++ \ + gcc-14 \ + g++-14 \ git \ git-lfs \ libssl-dev \ + libucx-dev \ ninja-build \ pkg-config \ + pipx \ python3 \ python3-dev \ python3-setuptools \ tar \ - wget + wget && \ + pipx ensurepath ENV TGI_INSTALL_PREFIX=/usr/local/tgi ENV TENSORRT_INSTALL_PREFIX=/usr/local/tensorrt @@ -83,13 +86,15 @@ RUN mkdir $TGI_INSTALL_PREFIX && mkdir "$TGI_INSTALL_PREFIX/include" && mkdir "$ cd backends/trtllm && \ CMAKE_INSTALL_PREFIX=$TGI_INSTALL_PREFIX cargo build --release -FROM nvidia/cuda:12.6.1-cudnn-runtime-ubuntu22.04 AS runtime -RUN apt update && apt install -y python3-minimal python3-dev python3-pip && \ +FROM nvidia/cuda:12.6.3-cudnn-runtime-ubuntu24.04 AS runtime +RUN apt update && apt install -y libucx0 pipx python3-minimal python3-dev python3-pip python3-venv && \ rm -rf /var/lib/{apt,dpkg,cache,log}/ && \ - python3 -m pip install transformers tokenizers + pipx ensurepath && \ + pipx install --include-deps transformers tokenizers WORKDIR /usr/local/tgi/bin +ENV PATH=/root/.local/share/pipx/venvs/transformers/bin/:$PATH ENV LD_LIBRARY_PATH="/usr/local/tgi/lib:/usr/local/mpi/lib:/usr/local/tensorrt/lib:/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH" ENV TOKENIZERS_PARALLELISM=false ENV OMPI_MCA_plm_rsh_agent="" diff --git a/backends/trtllm/CMakeLists.txt b/backends/trtllm/CMakeLists.txt index 831372cdf..9c1f34360 100644 --- a/backends/trtllm/CMakeLists.txt +++ b/backends/trtllm/CMakeLists.txt @@ -13,10 +13,11 @@ if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.24.0") endif () project(tgi-trtllm-backend VERSION 1.0.0) -set(CMAKE_CXX_STANDARD 
20) +set(CMAKE_CXX_STANDARD 23) include(FetchContent) include(ExternalProject) +include(CheckCXXCompilerFlag) option(TGI_TRTLLM_BACKEND_BUILD_TESTS "Enable building the unittests suite" OFF) option(TGI_TRTLLM_BACKEND_BUILD_EXAMPLES "Enable building the examples suite" OFF) @@ -29,11 +30,20 @@ set(TGI_TRTLLM_BACKEND_TRT_LIB_DIR "${TGI_TRTLLM_BACKEND_TRT_ROOT}/lib" CACHE ST find_package(CUDAToolkit 12.6 REQUIRED COMPONENTS CUDA::cudart CUDA::nvml) #### External dependencies #### -include(cmake/fmt.cmake) include(cmake/json.cmake) include(cmake/spdlog.cmake) include(cmake/trtllm.cmake) +if(${CMAKE_BUILD_TYPE} STREQUAL "Debug") + add_compile_definitions(TGI_TRTLLM_BACKEND_DEBUG=1) +endif() + +# This attempt to detect if the compiler can emit warning if it can't apply return value optimization from a function +check_cxx_compiler_flag("-Wnrvo" COMPILER_SUPPORT_WARNING_ON_NVRO) +if(${COMPILER_SUPPORT_WARNING_ON_NVRO}) + set(CMAKE_CXX_FLAGS "{CMAKE_CXX_FLAGS} -Wnvro") +endif() + # Let's build TRTLLM as part of CMake add_subdirectory("${trtllm_SOURCE_DIR}/cpp" "${trtllm_SOURCE_DIR}/..") @@ -41,15 +51,21 @@ add_subdirectory("${trtllm_SOURCE_DIR}/cpp" "${trtllm_SOURCE_DIR}/..") set_target_properties(executorWorker PROPERTIES SKIP_BUILD_RPATH TRUE) # TGI TRTLLM Backend definition -add_library(tgi_trtllm_backend_impl STATIC include/backend.h lib/backend.cpp include/hardware.h) +add_library(tgi_trtllm_backend_impl STATIC csrc/hardware.hpp csrc/backend.hpp csrc/backend.cpp) include_directories(${TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR}) target_include_directories(tgi_trtllm_backend_impl PRIVATE - $ - $ + $ +# $ ) target_include_directories(tgi_trtllm_backend_impl PUBLIC "${trtllm_SOURCE_DIR}/cpp/include") -target_link_libraries(tgi_trtllm_backend_impl PRIVATE tensorrt_llm nvinfer_plugin_tensorrt_llm tensorrt_llm_nvrtc_wrapper CUDA::cudart CUDA::nvml) -target_link_libraries(tgi_trtllm_backend_impl PUBLIC nlohmann_json::nlohmann_json spdlog::spdlog fmt::fmt) +target_link_libraries(tgi_trtllm_backend_impl PRIVATE CUDA::cudart CUDA::nvml) +target_link_libraries(tgi_trtllm_backend_impl PUBLIC nlohmann_json::nlohmann_json spdlog::spdlog) + +if(${CMAKE_BUILD_TYPE} STREQUAL "Debug") + target_link_libraries(tgi_trtllm_backend_impl PRIVATE tensorrt_llm nvinfer_plugin_tensorrt_llm) +else() + target_link_libraries(tgi_trtllm_backend_impl PRIVATE tensorrt_llm nvinfer_plugin_tensorrt_llm tensorrt_llm_nvrtc_wrapperm) +endif () # This install all the artifacts in CMAKE_INSTALL_PREFIX under include/ lib/ bin/ to make easy to link / find it back install(TARGETS tgi_trtllm_backend_impl tensorrt_llm nvinfer_plugin_tensorrt_llm decoder_attention executorWorker) @@ -60,16 +76,30 @@ if (${TGI_TRTLLM_BACKEND_BUILD_TESTS}) message(STATUS "Building tests") FetchContent_Declare( Catch2 - GIT_REPOSITORY https://github.com/catchorg/Catch2 - GIT_TAG v3.6.0 + URL https://github.com/catchorg/Catch2/archive/refs/tags/v3.7.1.tar.gz ) FetchContent_MakeAvailable(Catch2) - # add_executable(tgi_trtllm_backend_tests tests/infer_test.cpp) - # target_link_libraries(tgi_trtllm_backend_tests PRIVATE tgi_trtllm_backend_impl Catch2::Catch2WithMain nlohmann_json::nlohmann_json spdlog::spdlog fmt::fmt CUDA::cudart CUDA::nvml) + add_executable(tgi_trtllm_backend_tests tests/test_hardware.cpp tests/test_backend.cpp) + target_include_directories(tgi_trtllm_backend_tests PUBLIC "${trtllm_SOURCE_DIR}/cpp/include") + target_include_directories(tgi_trtllm_backend_tests PUBLIC "csrc/") + target_link_libraries(tgi_trtllm_backend_tests PRIVATE ${TRTLLM_LIBS} 
CUDA::cudart CUDA::nvml) + target_link_libraries(tgi_trtllm_backend_tests PUBLIC Catch2::Catch2WithMain nlohmann_json::nlohmann_json spdlog::spdlog tgi_trtllm_backend_impl) + + if(${CMAKE_BUILD_TYPE} STREQUAL "Debug") + target_link_libraries(tgi_trtllm_backend_tests PRIVATE tensorrt_llm nvinfer_plugin_tensorrt_llm) + else() + target_link_libraries(tgi_trtllm_backend_tests PRIVATE tensorrt_llm nvinfer_plugin_tensorrt_llm tensorrt_llm_nvrtc_wrapperm) + endif () + + if(CMAKE_BUILD_TYPE MATCHES "Debug") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Werror -fsanitize=undefined -fsanitize=address") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror -fsanitize=undefined -fsanitize=address") + target_link_options(tgi_trtllm_backend_tests BEFORE PUBLIC -fsanitize=undefined PUBLIC -fsanitize=address) + endif() list(APPEND CMAKE_MODULE_PATH ${catch2_SOURCE_DIR}/extras) include(CTest) include(Catch) - # catch_discover_tests(tgi_trtllm_backend_tests) + catch_discover_tests(tgi_trtllm_backend_tests) endif () diff --git a/backends/trtllm/Cargo.toml b/backends/trtllm/Cargo.toml index 97ef1a768..5d907109a 100644 --- a/backends/trtllm/Cargo.toml +++ b/backends/trtllm/Cargo.toml @@ -7,20 +7,21 @@ homepage.workspace = true [dependencies] async-trait = "0.1" -async-stream = "0.3" +#async-stream = "0.3" clap = { version = "4.5", features = ["derive"] } cxx = "1.0" hashbrown = "0.14" hf-hub = { workspace = true } -log = { version = "0.4", features = [] } +#log = { version = "0.4", features = [] } text-generation-router = { path = "../../router" } tokenizers = { workspace = true } tokio = { version = "1.39", features = ["rt", "rt-multi-thread", "parking_lot", "signal", "sync"] } tokio-stream = "0.1.15" thiserror = "1.0.63" tracing = "0.1" -tracing-opentelemetry = "0.25" -tracing-subscriber = { version = "0.3", features = ["json", "env-filter"] } +#tracing-opentelemetry = "0.25" +#tracing-subscriber = { version = "0.3", features = ["json", "env-filter"] } +pyo3 = { workspace = true } [build-dependencies] cmake = "0.1" diff --git a/backends/trtllm/build.rs b/backends/trtllm/build.rs index 985019260..0a0f6e6bf 100644 --- a/backends/trtllm/build.rs +++ b/backends/trtllm/build.rs @@ -4,7 +4,7 @@ use std::env; use std::env::consts::ARCH; use std::path::{absolute, PathBuf}; -const ADDITIONAL_BACKEND_LINK_LIBRARIES: [&str; 2] = ["spdlog", "fmt"]; +const ADDITIONAL_BACKEND_LINK_LIBRARIES: [&str; 1] = ["spdlog"]; const CUDA_ARCH_LIST: Option<&str> = option_env!("CUDA_ARCH_LIST"); const CUDA_REQUIRED_VERSION: &str = "12.6"; const MPI_REQUIRED_VERSION: &str = "4.1"; @@ -43,8 +43,8 @@ fn build_backend(is_debug: bool, opt_level: &str, out_dir: &PathBuf) -> (PathBuf install_path = absolute(out_dir).expect("cannot happen").join(install_path); } - let _ = cmake::Config::new(".") - .uses_cxx11() + let mut config = cmake::Config::new("."); + config.uses_cxx11() .generator("Ninja") .profile(match is_debug { true => "Debug", @@ -53,9 +53,16 @@ fn build_backend(is_debug: bool, opt_level: &str, out_dir: &PathBuf) -> (PathBuf .env("OPT_LEVEL", opt_level) .define("CMAKE_INSTALL_PREFIX", &install_path) .define("CMAKE_CUDA_COMPILER", "/usr/local/cuda/bin/nvcc") + .define("Python3_ROOT_DIR", "../venv") .define("TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST", cuda_arch_list) - .define("TGI_TRTLLM_BACKEND_TRT_ROOT", tensorrt_path) - .build(); + .define("TGI_TRTLLM_BACKEND_TRT_ROOT", tensorrt_path); + + // Allow to override which Python to use ... 
+ if let Some(python3) = option_env!("Python3_EXECUTABLE") { + config.define("Python3_EXECUTABLE", python3); + } + + config.build(); // Additional transitive CMake dependencies let deps_folder = out_dir.join("build").join("_deps"); @@ -90,26 +97,25 @@ fn build_ffi_layer(deps_folder: &PathBuf, is_debug: bool) { CFG.include_prefix = "backends/trtllm"; cxx_build::bridge("src/lib.rs") .static_flag(true) - .include(deps_folder.join("fmt-src").join("include")) + .std("c++23") .include(deps_folder.join("spdlog-src").join("include")) .include(deps_folder.join("json-src").join("include")) .include(deps_folder.join("trtllm-src").join("cpp").join("include")) .include("/usr/local/cuda/include") .include("/usr/local/tensorrt/include") - .file("src/ffi.cpp") - .std("c++20") - .define("NDEBUG", ndebug) + .include("csrc/") + .file("csrc/ffi.hpp") + .define("TGI_TRTLLM_BACKEND_DEBUG", ndebug) .compile("tgi_trtllm_backend"); println!("cargo:rerun-if-changed=CMakeLists.txt"); println!("cargo:rerun-if-changed=cmake/trtllm.cmake"); println!("cargo:rerun-if-changed=cmake/json.cmake"); - println!("cargo:rerun-if-changed=cmake/fmt.cmake"); println!("cargo:rerun-if-changed=cmake/spdlog.cmake"); - println!("cargo:rerun-if-changed=include/backend.h"); - println!("cargo:rerun-if-changed=lib/backend.cpp"); - println!("cargo:rerun-if-changed=include/ffi.h"); - println!("cargo:rerun-if-changed=src/ffi.cpp"); + println!("cargo:rerun-if-changed=csrc/backend.hpp"); + println!("cargo:rerun-if-changed=csrc/backend.cpp"); + println!("cargo:rerun-if-changed=csrc/hardware.hpp"); + println!("cargo:rerun-if-changed=csrc/ffi.hpp"); } fn main() { diff --git a/backends/trtllm/cmake/fmt.cmake b/backends/trtllm/cmake/fmt.cmake deleted file mode 100644 index afd6ea5f0..000000000 --- a/backends/trtllm/cmake/fmt.cmake +++ /dev/null @@ -1,6 +0,0 @@ -FetchContent_Declare( - fmt - DOWNLOAD_EXTRACT_TIMESTAMP - URL https://github.com/fmtlib/fmt/archive/refs/tags/11.0.2.tar.gz -) -FetchContent_MakeAvailable(fmt) diff --git a/backends/trtllm/cmake/json.cmake b/backends/trtllm/cmake/json.cmake index 67eff2fe6..d6cdbe3aa 100644 --- a/backends/trtllm/cmake/json.cmake +++ b/backends/trtllm/cmake/json.cmake @@ -1,6 +1,6 @@ fetchcontent_declare( json - DOWNLOAD_EXTRACT_TIMESTAMP - URL https://github.com/nlohmann/json/releases/download/v3.11.3/json.tar.xz +# DOWNLOAD_EXTRACT_TIMESTAMP + URL https://github.com/nlohmann/json/archive/refs/tags/v3.11.3.tar.gz ) fetchcontent_makeavailable(json) diff --git a/backends/trtllm/cmake/spdlog.cmake b/backends/trtllm/cmake/spdlog.cmake index 7f529a7d2..45e6790a8 100644 --- a/backends/trtllm/cmake/spdlog.cmake +++ b/backends/trtllm/cmake/spdlog.cmake @@ -1,6 +1,6 @@ set(SPDLOG_USE_FMT ON) set(SPDLOG_BUILD_SHARED OFF) -set(SPDLOG_FMT_EXTERNAL ON) +set(SPDLOG_FMT_EXTERNAL OFF) # Define the level at which SPDLOG_ compilation level is defined if (${CMAKE_BUILD_TYPE} STREQUAL "Debug") @@ -11,7 +11,7 @@ endif () fetchcontent_declare( spdlog - DOWNLOAD_EXTRACT_TIMESTAMP +# DOWNLOAD_EXTRACT_TIMESTAMP URL https://github.com/gabime/spdlog/archive/refs/tags/v1.14.1.tar.gz ) fetchcontent_makeavailable(spdlog) diff --git a/backends/trtllm/cmake/trtllm.cmake b/backends/trtllm/cmake/trtllm.cmake index 5f1b6c19c..4217892b1 100644 --- a/backends/trtllm/cmake/trtllm.cmake +++ b/backends/trtllm/cmake/trtllm.cmake @@ -11,6 +11,7 @@ set(CMAKE_CUDA_ARCHITECTURES ${TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST}) message(STATUS "Building for CUDA Architectures: ${CMAKE_CUDA_ARCHITECTURES}") +set(ENABLE_UCX OFF) if (${CMAKE_BUILD_TYPE} STREQUAL 
"Debug") set(FAST_BUILD ON) set(NVTX_DISABLE OFF) @@ -20,11 +21,13 @@ else () set(NVTX_DISABLE ON) endif () +find_package(Python3 REQUIRED Interpreter) + fetchcontent_declare( trtllm - GIT_REPOSITORY https://github.com/NVIDIA/TensorRT-LLM.git - GIT_TAG 201135e58aa525af7e523d091d4c9584229524bc - GIT_SHALLOW FALSE + GIT_REPOSITORY https://github.com/huggingface/TensorRT-LLM.git + GIT_TAG 1bb9ca4688805444f203647674bac1d7219d0579 + GIT_SHALLOW ON DOWNLOAD_EXTRACT_TIMESTAMP ) fetchcontent_makeavailable(trtllm) diff --git a/backends/trtllm/csrc/backend.cpp b/backends/trtllm/csrc/backend.cpp new file mode 100644 index 000000000..b50044d87 --- /dev/null +++ b/backends/trtllm/csrc/backend.cpp @@ -0,0 +1,79 @@ +#include + +#include +#include + +#include "backend.hpp" +#include "hardware.hpp" + +namespace huggingface::tgi::backends::trtllm { + tle::ParallelConfig backend_workspace_t::parallel_config() const { + // Single engine (TP = PP = 1) -> using leader mode (no MPI involved) + const auto world_size = config_["/pretrained_config/mapping/world_size"_json_pointer].get(); + + auto mode = tle::CommunicationMode::kLEADER; + std::optional orchestratorConfig = std::nullopt; + + if (world_size > 1) { + SPDLOG_INFO("Detected sharded engine deployment, using orchestrator mode"); + mode = tle::CommunicationMode::kORCHESTRATOR; + orchestratorConfig = std::make_optional(true, executor_worker_path_, nullptr, true); + } else { + SPDLOG_INFO("Detected single engine deployment, using leader mode"); + } + + return tle::ParallelConfig(tle::CommunicationType::kMPI, mode, std::nullopt, std::nullopt, orchestratorConfig); + } + + + tle::ExecutorConfig backend_workspace_t::executor_config() const { + // Retrieve the compute capabilities to enable some options at runtime + const auto compute_capabilities = hardware::cuda::compute_capabilities_t(); + + // Allocate the config + tle::ExecutorConfig executor_config(/* maxBeamWidth = */ 1); + + // Set the parallel config as inferred + executor_config.setParallelConfig(parallel_config()); + + // Define some configuration variables + executor_config.setKvCacheConfig(tle::KvCacheConfig(true)); + executor_config.setEnableChunkedContext(compute_capabilities.is_at_least_ampere()); + executor_config.setSchedulerConfig(tle::SchedulerConfig(tle::CapacitySchedulerPolicy::kMAX_UTILIZATION)); + return executor_config; + } + + backend_t::backend_t(std::filesystem::path &engines_folder, std::filesystem::path &executor_worker_path) + : workspace(engines_folder, executor_worker_path), executor_(executor_factory_initializer(workspace)) {} + + size_t backend_t::num_tokens_ready() const noexcept { + return executor_.getNumResponsesReady(); + } + + std::expected + backend_t::submit(std::span token_ids, const generation_params_t generation_params, const sampling_params_t sampling_params) noexcept { + SPDLOG_DEBUG("Submitting {:d} tokens to the executor for scheduling ({}, {})", token_ids.size(), generation_params, sampling_params); + return executor_.enqueueRequest(tle::Request { + {token_ids.begin(), token_ids.end()}, // Making actual copy of the tokens + static_cast(generation_params.max_new_tokens), + true, + (tle::SamplingConfig) sampling_params, + tle::OutputConfig { /* returnLogProbs= */ true }, + std::nullopt, + std::nullopt, + std::nullopt, + std::nullopt, + workspace.generation_config().stop_words + }); + } + + std::vector backend_t::pull_tokens() noexcept { + SPDLOG_TRACE(FMT_STRING("Pulling out tokens ({:d} available)"), num_tokens_ready()); + return executor_.awaitResponses(); + } + + 
void backend_t::cancel(request_id_t request_id) noexcept { + SPDLOG_TRACE(FMT_STRING("Cancelling request: {:d}"), request_id); + executor_.cancelRequest(request_id); + } +} diff --git a/backends/trtllm/csrc/backend.hpp b/backends/trtllm/csrc/backend.hpp new file mode 100644 index 000000000..f49c437a2 --- /dev/null +++ b/backends/trtllm/csrc/backend.hpp @@ -0,0 +1,231 @@ +#ifndef TGI_BACKEND_TRTLLM +#define TGI_BACKEND_TRTLLM + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +namespace huggingface::tgi::backends::trtllm { + namespace tle = tensorrt_llm::executor; + using json = nlohmann::json; + using request_id_t = uint64_t; + using token_id_t = tle::TokenIdType; + + /** + * Represent the parameters used for generation + */ + struct generation_params_t { + uint32_t max_new_tokens; + }; + + /** + * Represent the parameters used to sample tokens from the logit distribution + */ + struct sampling_params_t { + uint32_t top_k; + float_t top_p; + float_t repetition_penalty; + float_t frequency_penalty; + float_t temperature; + uint64_t seed; + + constexpr explicit operator tle::SamplingConfig() const { + return tle::SamplingConfig{ + 1, + top_k, + top_p, + std::nullopt, + std::nullopt, + std::nullopt, + seed, + temperature, + std::nullopt, + std::nullopt, + repetition_penalty, + std::nullopt, + frequency_penalty, + std::nullopt + }; + } + }; + + /** + * Represent possible values from transformers generation `generation_config.json`. + * It usually stores default sampling parameters to use, such as top_p, temperature, etc. + */ + struct generation_config_t { + float_t top_p; + float_t temperature; + std::list> stop_words; + + constexpr explicit generation_config_t(const json &config) : + top_p(config.value("top_p", 1.0f)), temperature(config.value("temperature", 1.0f)), stop_words(0) { + if (config.contains("/eos_token_id"_json_pointer) && config["/eos_token_id"_json_pointer].is_array()) { + const auto &eos_token_id = config["/eos_token_id"_json_pointer]; + std::for_each(eos_token_id.begin(), eos_token_id.end(), [this](const auto token_id) { + stop_words.emplace_back(1, token_id.template get()); + }); + + SPDLOG_DEBUG("Detected {:d} predefined stop_words from generation_config.json", stop_words.size()); + } + } + }; + + /** + * Helper class representing various items which are stored within the TensorRT-LLM engines folder and + * can be retrieved at runtime + */ + class backend_workspace_t { + private: + constexpr static auto as_json = [](const std::filesystem::path &path) -> json { + std::ifstream config_f(path); + return json::parse(config_f); + }; + + std::filesystem::path engines_folder_; + std::filesystem::path executor_worker_path_; + json config_; + generation_config_t generation_config_; + + public: + backend_workspace_t(std::filesystem::path &engines_folder, std::filesystem::path &executor_worker_path) : + engines_folder_(engines_folder), + executor_worker_path_(executor_worker_path), + config_(as_json(engines_folder / "config.json")), + generation_config_(as_json(engines_folder / "generation_config.json")) {}; + + backend_workspace_t(std::filesystem::path &&engines_folder, std::filesystem::path &&executor_worker_path) : + engines_folder_(engines_folder), + executor_worker_path_(executor_worker_path), + config_(as_json(engines_folder / "config.json")), + generation_config_(as_json(engines_folder / "generation_config.json")) {}; + + /** + * Path to the folder containing the TensorRT-LLM engines + * @return local filesystem path to 
the folder + */ + [[nodiscard]] constexpr std::filesystem::path engines_folder() const { return engines_folder_; } + + /** + * Hugging Face transformers' generated `generation_config_t` mapping information stored in the + * `generation_config.json` holding default generation parameters. + * @return `generation_config_t` + */ + [[nodiscard]] constexpr const generation_config_t &generation_config() const { return generation_config_; } + + /** + * Factory method returning new `tensorrt_llm::executor::ParallelConfig` instance used + * to initialize `tensorrt_llm::executor::Executor` with multi-instance communication information + * @return `tensorrt_llm::executor::ParallelConfig` instance + */ + [[nodiscard]] tle::ParallelConfig parallel_config() const; + + /** + * Factory method returning new `tensorrt_llm::executor::ExecutorConfig` instance used + * to initialize `tensorrt_llm::executor::Executor` + * @return `tensorrt_llm::executor::ExecutorConfig` instance + */ + [[nodiscard]] tle::ExecutorConfig executor_config() const; + }; + + /** + * Error raised by the underlying backend implementation + */ + enum backend_error_t { + EXECUTOR_NOT_READY = 3, + EXECUTOR_SCHEDULING_FAILED = 4, + }; + + + /** + * Actual TensorRT-LLM backend implementation interacting with TensorRT-LLM Executor service to + * - schedule new request + * - pull status of submitted request(s) + * - cancel submitted request(s) + */ + class backend_t { + private: + backend_workspace_t workspace; + tle::Executor executor_; + + public: + backend_t(std::filesystem::path &engines_folder, std::filesystem::path &executor_worker_path); + + backend_t(std::filesystem::path &&engines_folder, std::filesystem::path &&executor_worker_path) + : backend_t(engines_folder, executor_worker_path) {}; + + /** + * Submit a new request to the executor + * @param token_ids + * @param generation_params + * @param sampling_params + * @return Either newly submitted request's id or the error why it failed to submit + */ + [[nodiscard("Discarded executor request_id needs to be assigned")]] + std::expected + submit(std::span token_ids, generation_params_t generation_params, + sampling_params_t sampling_params) noexcept; + + /** + * Query the number of tokens available across all in-flight generations + * @return + */ + [[nodiscard("Pulling out the number of tokens")]] + size_t num_tokens_ready() const noexcept; + + /** + * Pull out newly generated tokens from the executor + * @return + */ + [[nodiscard("")]] + std::vector pull_tokens() noexcept; + + /** + * Cancel the specified request on the executor' set + * @param request_id Request's Identifier to remove from the in-flight executor + */ + void cancel(request_id_t) noexcept; + }; + + /** + * Create a TensorRT-LLM executor from a workspace + */ + const auto executor_factory_initializer = [](const backend_workspace_t &workspace) -> tle::Executor { + return {workspace.engines_folder(), tensorrt_llm::executor::ModelType::kDECODER_ONLY, + workspace.executor_config()}; + }; +} + +/** + * Helper structures to define formatting strategies for various types in the backend + */ +template<> +struct fmt::formatter : formatter { + auto format(huggingface::tgi::backends::trtllm::generation_params_t const &c, + format_context &ctx) const -> format_context::iterator { + return fmt::format_to(ctx.out(), "generation_params_t{{ max_new_tokens={:d} }}", c.max_new_tokens); + } +}; + +template<> +struct fmt::formatter : formatter { + auto format(huggingface::tgi::backends::trtllm::sampling_params_t const &c, + format_context 
&ctx) const -> format_context::iterator { + return fmt::format_to( + ctx.out(), + "sampling_params_t{{ top_k={:d}, top_p={:.3f}, repetition_penalty={:.3f}, frequency_penalty={:.3f}, temperature={:.3f}, seed={:d} }}", + c.top_k, c.top_p, c.repetition_penalty, c.frequency_penalty, c.temperature, c.seed + ); + } +}; + +#endif \ No newline at end of file diff --git a/backends/trtllm/csrc/ffi.hpp b/backends/trtllm/csrc/ffi.hpp new file mode 100644 index 000000000..de2333afe --- /dev/null +++ b/backends/trtllm/csrc/ffi.hpp @@ -0,0 +1,162 @@ +#ifndef TGI_BACKEND_TRTLLM_FFI +#define TGI_BACKEND_TRTLLM_FFI + +#include +#include + +#include +#include +#include + +#include + +#include +#include + +namespace rust::behavior { + template + static void trycatch(Try &&func, Fail &&fail) noexcept try { + func(); + } catch (tensorrt_llm::common::TllmException &e) { + fail(e.what()); + } +} + +namespace huggingface::tgi::backends::trtllm { + class tensorrt_llm_backend_t; +} + +#include "backends/trtllm/src/lib.rs.h" + +namespace huggingface::tgi::backends::trtllm { + std::once_flag backend_initialized_flag; + + class tensorrt_llm_backend_t { + private: + backend_t inner_; + + public: + tensorrt_llm_backend_t(std::filesystem::path &&engine_folder, std::filesystem::path &&executor_worker_path) + : inner_(engine_folder, executor_worker_path) {} + + size_t num_tokens_ready() const noexcept { + return inner_.num_tokens_ready(); + } + + request_id_t submit( + rust::Slice tokens, + uint32_t max_new_tokens, + uint32_t top_k, + float_t top_p, + float_t temperature, + float_t repetition_penalty, + float_t frequency_penalty, + uint64_t seed + ) { + // This is enabled only if using add_compile_definitions(SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_TRACE) + SPDLOG_TRACE(FMT_STRING("[FFI] Submitting {:d} prompt tokens to the executor")); + + // Submit the request to the executor and get back a potential request_id used to track request status + const auto signed_tokens = std::vector(tokens.begin(), tokens.end()); + const auto maybe_request_id = inner_.submit( + signed_tokens, + {max_new_tokens}, + {top_k, top_p, repetition_penalty, frequency_penalty, temperature, seed} + ); + + // If we do have a value, let's return the request_id + if(maybe_request_id.has_value()) [[likely]] { + return *maybe_request_id; + } else { + SPDLOG_WARN("[FFI] Failed to submit request to the executor"); + return maybe_request_id.error(); + } + } + + std::unique_ptr> pull_tokens() noexcept { + if(num_tokens_ready() > 0) [[likely]] { + const auto responses = inner_.pull_tokens(); + + SPDLOG_TRACE("[FFI] Successfully pulled out {:d} responses from executor", responses.size()); + // Transform tle::Response to GenerationStep + auto steps = std::make_unique>(); + std::ranges::transform(responses.begin(), responses.end(), std::back_inserter(*steps), [](const tle::Response &r) { + const auto reqId = r.getRequestId(); + if (!r.hasError()) [[likely]] { + const auto result = r.getResult(); + return generation_step_t{ + reqId, + static_cast(result.outputTokenIds[0][0]), + result.logProbs.value()[0][0], + result.isFinal, + false, + std::string() + }; + } else { + return generation_step_t{ + reqId, + 0, + 0.0, + true, + true, + std::move(r.getErrorMsg()) + }; + } + }); + return steps; + + } else { + return std::make_unique>(); + } + } + + void cancel(request_id_t requestId) noexcept { + SPDLOG_DEBUG("[FFI] cancelling request {:d}", requestId); + inner_.cancel(requestId); + } + }; + + void initialize_logging() { +#ifndef TGI_TRTLLM_BACKEND_DEBUG + if (const auto 
TRTLLM_LOG_LEVEL_CSTR = std::getenv("TRTLLM_LOG_LEVEL")) { + std::string log_level(TRTLLM_LOG_LEVEL_CSTR); + std::transform(log_level.begin(), log_level.end(), log_level.begin(), [](unsigned char c) { + return std::tolower(c); + }); + + if (log_level == "debug") + spdlog::set_level(spdlog::level::debug); + else + spdlog::set_level(spdlog::level::info); + } +#else + spdlog::set_level(spdlog::level::debug); +#endif + } + + void initialize_tensorrt_llm_backend() { + SPDLOG_INFO("Initializing TGI - TensoRT-LLM Backend (v{})", tle::version()); + + // Initialize everyone + initialize_logging(); + nvmlInit_v2(); + initTrtLlmPlugins(); + + const auto numGpus = huggingface::tgi::hardware::cuda::get_device_count(); + if (numGpus.has_value()) { + SPDLOG_INFO("[FFI] Detected {:d} Nvidia GPU(s)", *numGpus); + } else { + SPDLOG_WARN("[FFI] Failed to detected Nvidia GPU(s) on the system"); + // todo: throw + } + } + + std::unique_ptr create_backend_from_engine_folder(const rust::Str engines_folder, const rust::Str executor_worker_path) { + std::call_once(backend_initialized_flag, initialize_tensorrt_llm_backend); + return std::make_unique( + std::filesystem::path(std::string_view(engines_folder.begin(), engines_folder.end()), std::filesystem::path::format::auto_format), + std::filesystem::path(std::string_view(executor_worker_path.begin(), executor_worker_path.end()), std::filesystem::path::format::auto_format) + ); + } +} +#endif \ No newline at end of file diff --git a/backends/trtllm/csrc/hardware.hpp b/backends/trtllm/csrc/hardware.hpp new file mode 100644 index 000000000..8e5fa696d --- /dev/null +++ b/backends/trtllm/csrc/hardware.hpp @@ -0,0 +1,81 @@ +#ifndef TGI_HARDWARE_CUDA +#define TGI_HARDWARE_CUDA +#include +#include + +#include + +namespace huggingface::tgi::hardware::cuda { + static constexpr auto VOLTA = std::make_tuple(7u, 0u); + static constexpr auto TURING = std::make_tuple(7u, 5u); + static constexpr auto AMPERE = std::make_tuple(8u, 0u); + static constexpr auto HOPPER = std::make_tuple(9u, 0u); + static constexpr auto ADA_LOVELACE = std::make_tuple(8u, 9u); + + /** + * Get the number of GPUs on the local machine + * @return std::nullopt if no device is available, otherwise >= 1 + */ + inline std::optional get_device_count() { + uint32_t numGpus = 0; + if (nvmlDeviceGetCount_v2(&numGpus) == NVML_SUCCESS) { + return numGpus; + } + return std::nullopt; + } + + /** + * Store information about the version of the CUDA Compute Capabilities detected on the device + */ + struct compute_capabilities_t { + int32_t major; + int32_t minor; + + compute_capabilities_t(): compute_capabilities_t(0) {} + explicit compute_capabilities_t(size_t device_idx): major(-1), minor(-1) { + nvmlDevice_t device; + if (nvmlDeviceGetHandleByIndex_v2(device_idx, &device) == NVML_SUCCESS) { + nvmlDeviceGetCudaComputeCapability(device, &major, &minor); + } + }; + compute_capabilities_t(int32_t major, int32_t minor): major(major), minor(minor) {} + + /** + * Evaluate if the underlying capabilities is at least greater or equals to the provided 2-tuple (major, minor) + * @param sm Architecture version (major, minor) + * @return True if greater or equals to the underlying compute capabilities + */ + [[nodiscard]] constexpr auto is_at_least(std::tuple sm) const -> decltype(auto) { return std::tie(major, minor) >= sm; } + + /** + * Check if the capabilities match at least Volta architecture (sm_70) + * @return true if at least Volta (>= sm_70), false otherwise + */ + [[nodiscard]] constexpr bool is_at_least_volta() const { 
return is_at_least(VOLTA); } + + /** + * Check if the capabilities match at least Turing architecture (sm_75) + * @return true if at least Turing (>= sm_75), false otherwise + */ + [[nodiscard]] constexpr bool is_at_least_turing() const { return is_at_least(TURING); } + + /** + * Check if the capabilities match at least Ampere architecture (sm_80) + * @return true if at least Ampere (>= sm_80), false otherwise + */ + [[nodiscard]] constexpr bool is_at_least_ampere() const { return is_at_least(AMPERE); } + + /** + * Check if the capabilities match at least Ada Lovelace architecture (sm_89) + * @return true if at least Ada Lovelace (>= sm_89), false otherwise + */ + [[nodiscard]] constexpr bool is_at_least_ada_lovelace() const { return is_at_least(ADA_LOVELACE); } + + /** + * Check if the capabilities match at least Hopper architecture (sm_90) + * @return true if at least Hopper (>= sm_90), false otherwise + */ + [[nodiscard]] constexpr bool is_at_least_hopper() const { return is_at_least(HOPPER); } + }; +} +#endif \ No newline at end of file diff --git a/backends/trtllm/include/backend.h b/backends/trtllm/include/backend.h deleted file mode 100644 index d23f62889..000000000 --- a/backends/trtllm/include/backend.h +++ /dev/null @@ -1,144 +0,0 @@ -// -// Created by Morgan Funtowicz on 6/30/24. -// - -#ifndef TGI_TRTLLM_BACKEND_H -#define TGI_TRTLLM_BACKEND_H - -#include -#include -#include -#include -#include - -#include - -#include -#include -#include - -using json = nlohmann::json; -namespace tle = tensorrt_llm::executor; - - -#define CAST_SIZETYPE(x) static_cast(x) - -namespace huggingface::tgi::backends { - using RequestId = tle::IdType; - using TokenId = tle::TokenIdType; - - const static auto OUTPUT_CONFIG = tle::OutputConfig(true, false, false, true, false); - constexpr auto FMT_NOT_ENOUGH_GPUS = FMT_STRING( - "Not enough GPUs to allocate requested model (detected: {:d}, required: {:d})"); - constexpr auto FMT_EXECUTOR_STATS = FMT_STRING( - "Submitting inference [{}] to the executor ({:d} already in-flight)"); - constexpr auto FMT_SAMPLING_CONFIG = FMT_STRING( - "Sampling: topK={:d}, topP={:.1f}, temperature={:.1f}, repetition_penalty={:.1f}, frequency_penalty={:.1f}, seed={:d}"); - - /** - * Initialize all the components required by TRTLLM. 
- * It is required to call this function before attempting to load any engine - */ - void InitializeBackend(); - - /** - * Initialize logging mechanism - */ - void InitializeLogging(); - - - /** - * - * @param config TensorRT-LLM configuration object - * @param workerPath Path to the "executorWorker" provided by TensorRT-LLM when using orchestrator mode - * @return - */ - tle::ExecutorConfig GetExecutorConfig(const json &config, const std::string &workerPath); - - /** - * - * @param worldSize - * @param workerPath - * @return - */ - tle::ParallelConfig GetParallelConfig(size_t worldSize, std::string workerPath) noexcept; - - /** - * Get the sampling configuration from the parameters provided by TGI - * @param topK - * @param topP - * @param temperature - * @param repetition_penalty - * @param frequency_penalty - * @param seed - * @return - */ - tle::SamplingConfig GetSamplingConfig( - uint32_t topK, - float_t topP, - float_t temperature, - float_t repetition_penalty, - float_t frequency_penalty, - uint64_t seed - ) noexcept; - - /** - * Attempt to retrieve the - * @param generationConfigPath - * @return - */ - std::optional>> - GetStopWordsFromConfig(const std::filesystem::path &generationConfigPath) noexcept; - - /** - * - */ - class TensorRtLlmBackend { - private: - const json config; - tle::Executor executor; - - /** Frequently accessed variables cached here **/ - uint32_t maxNumTokens; - std::list> stopWords; - - public: - explicit TensorRtLlmBackend( - const std::filesystem::path &engineFolder, - const std::filesystem::path &executorWorker - ); - - /** - * Query the executor for the number of token available for pulling - * @return - */ - [[nodiscard]] size_t NumResponsesReady() const; - - /** - * Submit a new generation task to the executor - * @param tokens - * @param topK - * @param topP - * @param temperature - * @param repetitionPenalty - * @param frequencyPenalty - * @param seed - * @return Request id related to this generation for reference - */ - [[nodiscard]] RequestId Submit( - const std::vector &tokens, - uint32_t maxNewTokens, - int32_t topK, - float_t topP, - float_t temperature, - float_t repetitionPenalty, - float_t frequencyPenalty, - uint64_t seed - ); - - [[nodiscard]] std::vector PullNewTokens(); - }; -} - - -#endif //TGI_TRTLLM_BACKEND_H diff --git a/backends/trtllm/include/ffi.h b/backends/trtllm/include/ffi.h deleted file mode 100644 index 449bcd4d7..000000000 --- a/backends/trtllm/include/ffi.h +++ /dev/null @@ -1,75 +0,0 @@ -// -// Created by mfuntowicz on 7/11/24. 
-// - -#ifndef TGI_TRTLLM_BACKEND_FFI_H -#define TGI_TRTLLM_BACKEND_FFI_H - -#include -#include -#include -#include "backend.h" - -namespace huggingface::tgi::backends { - class TensorRtLlmBackendImpl; -} - -// Template to support returning error from TllmException back to Rust in a Result<> -#include - -namespace rust::behavior { - template - static void trycatch(Try &&func, Fail &&fail) noexcept try { - func(); - } catch (tensorrt_llm::common::TllmException &e) { - fail(e.what()); - } -} - -#include "backends/trtllm/src/lib.rs.h" - -namespace huggingface::tgi::backends { - - class TensorRtLlmBackendImpl : public TensorRtLlmBackend { - public: - /*** - * - * @param engineFolder - * @param executorWorker - */ - TensorRtLlmBackendImpl(const std::string_view &engineFolder, const std::string_view &executorWorker); - - /*** - * - * @param tokens - * @param maxNewTokens - * @param topK - * @param topP - * @param temperature - * @param repetition_penalty - * @param frequency_penalty - * @param seed - * @return - */ - [[nodiscard("returned request id should be used to refer to the request's generation result later on")]] - uint64_t - Submit(rust::Slice tokens, uint32_t maxNewTokens, - int32_t topK, float_t topP, float_t temperature, - float_t repetition_penalty, float_t frequency_penalty, uint64_t seed); - - /*** - * - * @return - */ - std::unique_ptr> PullTokens(); - }; - - /*** - * - * @param engineFolder - * @return - */ - std::unique_ptr CreateTensorRtLlmBackend(rust::Str engineFolder, rust::Str executorWorker); -} - -#endif //TGI_TRTLLM_BACKEND_FFI_H diff --git a/backends/trtllm/include/hardware.h b/backends/trtllm/include/hardware.h deleted file mode 100644 index 9633495f4..000000000 --- a/backends/trtllm/include/hardware.h +++ /dev/null @@ -1,59 +0,0 @@ -// -// Created by mfuntowicz on 7/23/24. -// - -#ifndef TGI_TRTLLM_BACKEND_HARDWARE_H -#define TGI_TRTLLM_BACKEND_HARDWARE_H - -#include -#include -#include -#include -#include - -namespace huggingface::hardware::cuda { - -#define AMPERE_SM_MAJOR 8 -#define HOPPER_SM_MAJOR 9 - - /** - * Store information about the version of the CUDA Compute Capabilities detected on the device - */ - struct CudaComputeCapabilities { - int32_t major; - int32_t minor; - - [[nodiscard]] constexpr bool IsPostAmpere() const { return major >= AMPERE_SM_MAJOR; } - - [[nodiscard]] constexpr bool IsPostHopper() const { return major >= HOPPER_SM_MAJOR; } - }; - - CudaComputeCapabilities GetCudaComputeCapabilities() { - // Get the compute capabilities of the current hardware - nvmlDevice_t device; - CudaComputeCapabilities capabilities{0, 0}; - if (nvmlDeviceGetHandleByIndex_v2(0, &device) == NVML_SUCCESS) { - SPDLOG_DEBUG("Successfully acquired nvmlDevice_t = 0"); - if (nvmlDeviceGetCudaComputeCapability(device, &capabilities.major, &capabilities.minor) == NVML_SUCCESS) { - SPDLOG_INFO("Detected sm_{:d}{:d} compute capabilities", capabilities.major, capabilities.minor); - } - } - - return capabilities; - } - - /** - * Return the number of GPU detected. 
If no GPU is detected, return size_t::max() - * @return - */ - std::optional GetNumDevices() { - uint32_t numGpus = 0; - if (nvmlDeviceGetCount_v2(&numGpus) == NVML_SUCCESS) { - return std::optional(numGpus); - } else { - return std::nullopt; - } - } -} - -#endif //TGI_TRTLLM_BACKEND_HARDWARE_H diff --git a/backends/trtllm/lib/backend.cpp b/backends/trtllm/lib/backend.cpp deleted file mode 100644 index 4dd41de00..000000000 --- a/backends/trtllm/lib/backend.cpp +++ /dev/null @@ -1,203 +0,0 @@ -#include -#include - -#include -#include -#include - -#include "backend.h" -#include "hardware.h" - - -void huggingface::tgi::backends::InitializeLogging() { -#ifdef NDEBUG - if (const auto TRTLLM_LOG_LEVEL_CSTR = std::getenv("TRTLLM_LOG_LEVEL")) { - std::string log_level(TRTLLM_LOG_LEVEL_CSTR); - std::transform(log_level.begin(), log_level.end(), log_level.begin(), [](unsigned char c) { - return std::tolower(c); - }); - - if (log_level == "debug") - spdlog::set_level(spdlog::level::debug); - else - spdlog::set_level(spdlog::level::info); - } -#else - spdlog::set_level(spdlog::level::debug); -#endif -} - -void huggingface::tgi::backends::InitializeBackend() { - SPDLOG_INFO("Initializing Backend..."); - nvmlInit_v2(); - initTrtLlmPlugins(); - - InitializeLogging(); - - SPDLOG_INFO("Backend Executor Version: {}", tle::version()); - const auto numGpus = huggingface::hardware::cuda::GetNumDevices(); - if (numGpus.has_value()) { - SPDLOG_INFO("Detected {:d} Nvidia GPU(s)", numGpus.value()); - } else { - SPDLOG_WARN("Failed to detected Nvidia GPU(s) on the system"); - } -} - -[[nodiscard]] -tle::ParallelConfig -huggingface::tgi::backends::GetParallelConfig(const size_t worldSize, const std::string workerPath) noexcept { - auto mode = tle::CommunicationMode::kLEADER; - std::optional orchestratorConfig = std::nullopt; - - if (worldSize > 1) { - SPDLOG_INFO("Detected sharded engine deployment, using orchestrator mode"); - mode = tle::CommunicationMode::kORCHESTRATOR; - orchestratorConfig = std::make_optional(true, workerPath, nullptr, true); - } else { - SPDLOG_INFO("Detected single engine deployment, using leader mode"); - } - - return tle::ParallelConfig(tle::CommunicationType::kMPI, mode, std::nullopt, std::nullopt, orchestratorConfig); -} - -[[nodiscard]] -tle::ExecutorConfig huggingface::tgi::backends::GetExecutorConfig(const json &config, const std::string &workerPath) { - tle::ExecutorConfig execConfig(/* maxBeamWidth = */ 1); - - // Retrieve the compute capabilities to enable some options at runtime - const auto computeCapabilities = huggingface::hardware::cuda::GetCudaComputeCapabilities(); - - // Single engine (TP = PP = 1) -> using leader mode (no MPI involved) - const auto worldSize = config["/pretrained_config/mapping/world_size"_json_pointer].get(); - execConfig.setParallelConfig(GetParallelConfig(worldSize, workerPath)); - - // Define some configuration variables - execConfig.setKvCacheConfig(tle::KvCacheConfig(true)); - execConfig.setEnableChunkedContext(computeCapabilities.IsPostAmpere()); - execConfig.setSchedulerConfig(tle::SchedulerConfig(tle::CapacitySchedulerPolicy::kMAX_UTILIZATION)); - return execConfig; -} - -tle::SamplingConfig huggingface::tgi::backends::GetSamplingConfig( - const uint32_t topK, - const float_t topP, - const float_t temperature, - const float_t repetition_penalty, - const float_t frequency_penalty, - const uint64_t seed) noexcept { - - return tle::SamplingConfig( - 1, // TGI only use a single beam - topK, - topP, - std::nullopt, - std::nullopt, - std::nullopt, - 
seed, - temperature, - temperature, - std::nullopt, - repetition_penalty, - std::nullopt, - frequency_penalty - ); -} - -std::optional>> -huggingface::tgi::backends::GetStopWordsFromConfig( - const std::filesystem::path &generationConfigPath) noexcept { - if (exists(generationConfigPath)) { - const auto generationConfig = json::parse(std::ifstream(generationConfigPath)); - if (const auto eosTokenIds = generationConfig["/eos_token_id"_json_pointer]; eosTokenIds.is_array()) { - SPDLOG_INFO(FMT_STRING("Found {:d} EOS tokens"), eosTokenIds.size()); - std::list> stopWords(eosTokenIds.size()); - - const auto to_single_token = [](const auto tokenIdObj) -> decltype(stopWords)::value_type { - return {tokenIdObj.template get()}; - }; - - std::transform(eosTokenIds.cbegin(), eosTokenIds.cend(), stopWords.begin(), to_single_token); - return stopWords; - } else { - SPDLOG_INFO("Invalid EOS tokens entry found (not an array)"); - } - } else { - SPDLOG_INFO("No EOS tokens found, generation_config.json doesn't exist"); - } - - return std::nullopt; -} - -huggingface::tgi::backends::TensorRtLlmBackend::TensorRtLlmBackend( - const std::filesystem::path &enginesFolder, - const std::filesystem::path &executorWorker -) : - config(json::parse(std::ifstream(enginesFolder / "config.json"))), - executor(enginesFolder, tensorrt_llm::executor::ModelType::kDECODER_ONLY, - GetExecutorConfig(config, executorWorker.string())) { - - SPDLOG_INFO(FMT_STRING("Engine (version={})"), config["/version"_json_pointer].get()); - - // Ensure we have enough GPUs on the system - const auto worldSize = config["/pretrained_config/mapping/world_size"_json_pointer].get(); - const auto numGpus = huggingface::hardware::cuda::GetNumDevices().value_or(0); - if (numGpus < worldSize) { - SPDLOG_CRITICAL(FMT_NOT_ENOUGH_GPUS, numGpus, worldSize); - // todo : raise exception to catch on rust side - } - - // Cache variables - maxNumTokens = config["/build_config/max_num_tokens"_json_pointer].get(); - - // Attempt to discover stopWords from the generation_config.json - const auto generationConfigPath = enginesFolder / "generation_config.json"; - stopWords = GetStopWordsFromConfig(generationConfigPath).value_or(std::list>()); -} - -[[nodiscard("Returned number of requests needs to be consumed")]] -size_t huggingface::tgi::backends::TensorRtLlmBackend::NumResponsesReady() const { -#ifdef NDEBUG - return executor.getNumResponsesReady(); -#else - const auto numResponses = executor.getNumResponsesReady(); - if (numResponses > 0) SPDLOG_INFO(FMT_STRING("Num responses ready: {:d}"), numResponses); - return numResponses; -#endif -} - -[[nodiscard("Returned request id needs to be provided back to gather generated tokens")]] -tle::IdType huggingface::tgi::backends::TensorRtLlmBackend::Submit( - const std::vector &tokens, - const uint32_t maxNewTokens, - const int32_t topK, - const float_t topP, - const float_t temperature, - const float_t repetitionPenalty, - const float_t frequencyPenalty, - const uint64_t seed -) { - const auto maxNewTokensChecked = std::min(maxNewTokens, static_cast(maxNumTokens - tokens.size())); -#ifndef NDEBUG - { - const auto &iterations = executor.getLatestIterationStats(); - const auto &lastIteration = iterations.front(); - - SPDLOG_DEBUG(FMT_EXECUTOR_STATS, fmt::join(tokens, ", "), lastIteration.numActiveRequests); - SPDLOG_DEBUG(FMT_SAMPLING_CONFIG, topK, topP, temperature, repetitionPenalty, frequencyPenalty, seed); - SPDLOG_DEBUG(FMT_STRING("Asking for max_new_tokens={:d}"), maxNewTokensChecked); - } -#endif - - const auto 
sampling = GetSamplingConfig(topK, topP, temperature, repetitionPenalty, frequencyPenalty, seed); - - // Build the request - auto request = tle::Request{tokens, CAST_SIZETYPE(maxNewTokensChecked), true, sampling, OUTPUT_CONFIG}; - request.setStopWords(stopWords); - - // Submit to the executor for batching - return executor.enqueueRequest(request); -} - -std::vector huggingface::tgi::backends::TensorRtLlmBackend::PullNewTokens() { - return executor.awaitResponses(); -} diff --git a/backends/trtllm/scripts/install_tensorrt.sh b/backends/trtllm/scripts/install_tensorrt.sh index 4c2dc26b6..7deb2fe8b 100755 --- a/backends/trtllm/scripts/install_tensorrt.sh +++ b/backends/trtllm/scripts/install_tensorrt.sh @@ -2,7 +2,7 @@ set -ex -TRT_VER_BASE="10.4.0" +TRT_VER_BASE="10.6.0" TRT_VER_FULL="${TRT_VER_BASE}.26" CUDA_VER="12.6" CUDNN_VER="9.5.0.50-1" diff --git a/backends/trtllm/src/ffi.cpp b/backends/trtllm/src/ffi.cpp deleted file mode 100644 index 0a92c050f..000000000 --- a/backends/trtllm/src/ffi.cpp +++ /dev/null @@ -1,89 +0,0 @@ -// -// Created by mfuntowicz on 6/30/24. -// -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include "backends/trtllm/include/ffi.h" - - -huggingface::tgi::backends::TensorRtLlmBackendImpl::TensorRtLlmBackendImpl( - const std::string_view &engineFolder, - const std::string_view &executorWorker -) : TensorRtLlmBackend(engineFolder, executorWorker) {} - - -uint64_t huggingface::tgi::backends::TensorRtLlmBackendImpl::Submit( - rust::Slice tokens, - uint32_t maxNewTokens, - int32_t topK, - float_t topP, - float_t temperature, - float_t repetition_penalty, - float_t frequency_penalty, - uint64_t seed) { - - // This will copy all the items from the initial slice - std::vector tokens_(tokens.begin(), tokens.end()); - return TensorRtLlmBackend::Submit( - std::move(tokens_), maxNewTokens, topK, topP, temperature, repetition_penalty, frequency_penalty, seed); -} - -std::unique_ptr> -huggingface::tgi::backends::TensorRtLlmBackendImpl::PullTokens() { - const auto responses = TensorRtLlmBackend::PullNewTokens(); - - auto steps = std::make_unique>(); - steps->reserve(responses.size()); - -#ifndef NDEBUG - SPDLOG_DEBUG(FMT_STRING("Pulled out {:d} new tokens"), responses->size()); -#endif - - // Transform tle::Response to GenerationStep - std::ranges::transform(responses.begin(), responses.end(), std::back_inserter(*steps), [](const tle::Response &r) { - const auto reqId = r.getRequestId(); - if (!r.hasError()) { - const auto result = r.getResult(); - return GenerationStep{ - reqId, - static_cast(result.outputTokenIds[0][0]), - result.logProbs.value()[0][0], - result.isFinal, - false, - std::string() - }; - } else { - return GenerationStep{ - reqId, - 0, - 0.0, - true, - true, - std::move(r.getErrorMsg()) - }; - } - }); - - return steps; -} - -std::unique_ptr -huggingface::tgi::backends::CreateTensorRtLlmBackend(rust::Str engineFolder, rust::Str executorWorker) { - SPDLOG_INFO("Creating TensorRT-LLM Backend"); - // Unconditionally call this to initialize and discover TRTLLM plugins - InitializeBackend(); - - const auto enginePath = std::string_view(engineFolder.begin(), engineFolder.end()); - const auto executorPath = std::string_view(executorWorker.begin(), executorWorker.end()); - return std::make_unique(std::move(enginePath), std::move(executorPath)); -} diff --git a/backends/trtllm/src/lib.rs b/backends/trtllm/src/lib.rs index edd8caff1..d6acafa1a 100644 --- a/backends/trtllm/src/lib.rs +++ b/backends/trtllm/src/lib.rs 
@@ -4,10 +4,11 @@ pub mod errors; mod looper; mod utils; -#[cxx::bridge(namespace = "huggingface::tgi::backends")] +#[cxx::bridge(namespace = "huggingface::tgi::backends::trtllm")] mod ffi { /// Struct used as shared type between rust and C++ to represent the result /// of a single decoding iteration + #[cxx_name = "generation_step_t"] #[derive(Debug, Clone)] pub struct GenerationStep { request_id: u64, @@ -19,9 +20,10 @@ mod ffi { } unsafe extern "C++" { - include!("backends/trtllm/src/ffi.cpp"); + include!("backends/trtllm/csrc/ffi.hpp"); /// Represent an instance of the underlying TensorRT-LLM backend + #[cxx_name = "tensorrt_llm_backend_t"] type TensorRtLlmBackendImpl; /// Create an instance backed behind a std::unique_ptr to manage the lifespan of the backend @@ -38,21 +40,18 @@ mod ffi { /// ``` /// /// ``` - #[rust_name = "create_tensorrt_llm_backend"] - fn CreateTensorRtLlmBackend( + fn create_backend_from_engine_folder( engine_folder: &str, executor_worker: &str, ) -> Result>; - #[rust_name = "num_responses_ready"] - fn NumResponsesReady(self: &TensorRtLlmBackendImpl) -> usize; + fn num_tokens_ready(self: &TensorRtLlmBackendImpl) -> usize; - #[rust_name = "submit"] - fn Submit( + fn submit( self: Pin<&mut TensorRtLlmBackendImpl>, tokens: &[u32], max_new_tokens: u32, - top_k: i32, + top_k: u32, top_p: f32, temperature: f32, repetition_penalty: f32, @@ -60,9 +59,10 @@ mod ffi { seed: u64, ) -> Result; - #[rust_name = "pull_tokens"] - fn PullTokens( + fn pull_tokens( self: Pin<&mut TensorRtLlmBackendImpl>, ) -> Result>>; + + fn cancel(self: Pin<&mut TensorRtLlmBackendImpl>, request_id: u64); } } diff --git a/backends/trtllm/src/looper.rs b/backends/trtllm/src/looper.rs index e26155c16..3addd95f9 100644 --- a/backends/trtllm/src/looper.rs +++ b/backends/trtllm/src/looper.rs @@ -1,14 +1,13 @@ -use std::hint; -use std::ops::Deref; -use std::path::Path; - use async_trait::async_trait; use cxx::UniquePtr; use hashbrown::HashMap; +use std::hint; +use std::ops::Deref; +use std::path::Path; use tokenizers::Tokenizer; use tokio::sync::mpsc::{unbounded_channel, UnboundedReceiver, UnboundedSender}; use tokio::sync::TryAcquireError; -use tokio::task::{spawn_blocking, JoinHandle}; +use tokio::task::spawn_blocking; use tokio::time::Instant; use tokio_stream::wrappers::UnboundedReceiverStream; use tracing::{debug, error, warn}; @@ -22,7 +21,7 @@ use text_generation_router::validation::{Chunk, ValidGenerateRequest}; use text_generation_router::{FinishReason, Token}; use crate::errors::TensorRtLlmBackendError; -use crate::ffi::{create_tensorrt_llm_backend, GenerationStep, TensorRtLlmBackendImpl}; +use crate::ffi::{create_backend_from_engine_folder, GenerationStep, TensorRtLlmBackendImpl}; use crate::utils::first_line; type InferResult = Result; @@ -30,9 +29,10 @@ type InferResult = Result; /// Wrap the requests along with the channel used to stream back to the client the decoded tokens struct GenerationContext { request: ValidGenerateRequest, + streamer: UnboundedSender>, + tokens: Vec, start: Option, queued: Instant, - streamer: UnboundedSender>, } #[derive(Debug, Copy, Clone)] @@ -58,31 +58,22 @@ impl<'step> TryFrom<&'step GenerationStep> for DecodedToken { } } -/// Wraps the decoded token with the channel used to stream back to the client the decoded tokens -struct DecodedTokenContext { - token: DecodedToken, - start: Option, - queued: Instant, - channel: UnboundedSender>, -} - fn executor_status_looper( - mut backend: UniquePtr, max_inflight_requests: usize, - mut waiting_requests: 
UnboundedReceiver, - post_processor_sender: UnboundedSender<(u64, InferResult)>, + tokenizer: Tokenizer, + mut backend: UniquePtr, + mut backlog: UnboundedReceiver, ) { // Track the tuple (request_id, stream) for each request let mut in_flights = HashMap::::with_capacity(max_inflight_requests * 2); - // TODO: Does it need a spin-loop? 'scheduler: loop { // Is there any request pending to be scheduled? - let awaiting_requests = waiting_requests.len(); + let awaiting_requests = backlog.len(); for _ in 0..awaiting_requests { // Retrieve all the requests - if let Some(mut ctx) = waiting_requests.blocking_recv() { + if let Some(ctx) = backlog.blocking_recv() { // Submit all the request to the executor and move the context to the in-flight tracker let request = &ctx.request; let generation_params = &request.parameters; @@ -93,7 +84,7 @@ fn executor_status_looper( match backend.pin_mut().submit( &input_ids.unwrap(), // This is checked beforehand in validate() stopping_params.max_new_tokens, - generation_params.top_k as i32, + generation_params.top_k, generation_params.top_p, generation_params.temperature, generation_params.repetition_penalty, @@ -103,7 +94,6 @@ fn executor_status_looper( Ok(request_id) => { // Insert the context linked to the generated request id in the tracker debug!("[in-flight] Added {}", request_id); - ctx.start = Some(Instant::now()); in_flights.insert(request_id, ctx); } Err(e) => { @@ -117,29 +107,40 @@ fn executor_status_looper( } } }; + } else { + break 'scheduler; } } - if backend.num_responses_ready() > 0 { - match backend.pin_mut().pull_tokens() { + if backend.num_tokens_ready() > 0 { + let mut backend = backend.pin_mut(); + match backend.as_mut().pull_tokens() { Ok(responses) => { // Iterate through all the decoded token for step in responses.deref() { - if let Some(ctx) = in_flights.get(&step.request_id) { - // Remove from tracked requests - let parcel = - DecodedToken::try_from(step).map(|dt| DecodedTokenContext { - token: dt, - start: ctx.start, - queued: ctx.queued, - channel: ctx.streamer.clone(), - }); + if let Some(ctx) = in_flights.get_mut(&step.request_id) { + // Update the starting timestamp if not set + // This value might not be the actual real starting time of the request + // on the executor side - Need to expose more info from the executor to + // retrieve this value + // TODO : Expose actual real starting time for a request on FFI layer + if ctx.start.is_none() { + ctx.start = Some(Instant::now()); + } - // Submit the work to p:the post_processor - let posted = post_processor_sender.send((step.request_id, parcel)); + // Try to map the generation step to a DecodedToken + let response = match DecodedToken::try_from(step) { + Ok(decoded_token) => { + post_process_decoded_token(&tokenizer, ctx, decoded_token) + } + Err(err) => Err(err) + }; - if posted.is_err() || step.is_final { - debug!("Removing {}", step.request_id); + // Attempt to send back the response to the client + if let Err(_) = ctx.streamer.send(response) { + // Client has dropped, remove from tracked requests + debug!("Client dropped - removing request {} from tracked requests", step.request_id); + backend.as_mut().cancel(step.request_id); let _ = in_flights.remove(&step.request_id); } } else { @@ -159,80 +160,48 @@ fn executor_status_looper( } } -fn post_processor_looper( - tokenizer: Tokenizer, - max_inflight_requests: usize, - mut decoded_tokens: UnboundedReceiver<(u64, InferResult)>, -) { - let mut states: HashMap> = HashMap::with_capacity(max_inflight_requests * 2); +fn 
post_process_decoded_token(tokenizer: &Tokenizer, ctx: &mut GenerationContext, decoded_token: DecodedToken) -> InferResult { + match tokenizer.decode(&[decoded_token.id], false) { + Ok(text) => { + let is_special = + tokenizer.get_added_vocabulary().is_special_token(&text); + let token = Token { + id: decoded_token.id, + text, + logprob: decoded_token.log_prob, + special: is_special, + }; - 'post_processor: loop { - if decoded_tokens.is_closed() { - warn!("Post processor IPC is closed, loop will exit now."); - break 'post_processor; - } + // Append the token to the tracked generated tokens + ctx.tokens.push(token.id); - if let Some((request_id, decoded)) = decoded_tokens.blocking_recv() { - match decoded { - Ok(ctx) => { - states - .entry(request_id) - .and_modify(|s| s.push(*&ctx.token.id)) - .or_insert_with(|| { - let mut state = Vec::with_capacity(MAX_NUM_TOKENS); - state.push(*&ctx.token.id); - state - }); - - let out = match tokenizer.decode(&[ctx.token.id], false) { - Ok(text) => { - let is_special = - tokenizer.get_added_vocabulary().is_special_token(&text); - let token = Token { - id: ctx.token.id, - text, - logprob: ctx.token.log_prob, - special: is_special, - }; - - let out = if !ctx.token.is_final { - InferStreamResponse::Intermediate { - token, - top_tokens: vec![], - } - } else { - let tokens = states.remove(&request_id).unwrap(); - let text = tokenizer.decode(&tokens, true); - let generated_text = GeneratedText { - text: text.unwrap(), - generated_tokens: tokens.len() as u32, - finish_reason: FinishReason::EndOfSequenceToken, - seed: None, - }; - - InferStreamResponse::End { - token, - top_tokens: vec![], - generated_text, - start: ctx.start.unwrap(), - queued: ctx.queued, - } - }; - - Ok(out) - } - Err(err) => Err(GenerationError(err.to_string())), - }; - - if let Err(_) = ctx.channel.send(out) { - warn!("Failed to send decoded token back to the user") - } + // Map the correct response depending on the step is final or not + let out = if !decoded_token.is_final { + InferStreamResponse::Intermediate { + token, + top_tokens: vec![], } - Err(_err) => { - todo!("what do we do?") + } else { + let text = tokenizer.decode(&ctx.tokens, true); + let generated_text = GeneratedText { + text: text.unwrap(), + generated_tokens: ctx.tokens.len() as u32, + finish_reason: FinishReason::EndOfSequenceToken, // TODO : Map FinishReason + seed: None, + }; + + InferStreamResponse::End { + token, + top_tokens: vec![], + generated_text, + start: ctx.start.unwrap(), + queued: ctx.queued, } - } + }; + + Ok(out) } + Err(err) => Err(GenerationError(err.to_string())), } } @@ -277,11 +246,8 @@ fn ensure_paths_exist, PP: AsRef>( unsafe impl Send for TensorRtLlmBackendImpl {} -pub struct TensorRtLlmBackendV2 { - executor_looper: JoinHandle<()>, - post_processor_looper: JoinHandle<()>, - executor: UnboundedSender, -} +pub struct TensorRtLlmBackendV2(UnboundedSender); + impl TensorRtLlmBackendV2 { pub fn new + Send, PP: AsRef + Send>( @@ -295,32 +261,22 @@ impl TensorRtLlmBackendV2 { // Allocate the IPC layer to communicate with the backend let (executor_sender, executor_receiver) = unbounded_channel(); - let (post_processor_sender, post_processor_receiver) = unbounded_channel(); // Create the FFI backend - let backend = create_tensorrt_llm_backend(&engine_folder, &executor_worker_path) + let backend = create_backend_from_engine_folder(&engine_folder, &executor_worker_path) .map_err(|e| TensorRtLlmBackendError::Runtime(first_line(e.what(), "Unknown error")))?; // Executor looper is responsible for scheduling 
and pulling requests state at regular interval - let executor_looper = spawn_blocking(move || { + spawn_blocking(move || { executor_status_looper( - backend, max_inflight_requests, + tokenizer, + backend, executor_receiver, - post_processor_sender, ) }); - // Post processor looper is responsible from receiving a bunch of tokens, decoding them and sending them back to the user - let post_processor_looper = spawn_blocking(move || { - post_processor_looper::<256>(tokenizer, max_inflight_requests, post_processor_receiver) - }); - - Ok(TensorRtLlmBackendV2 { - executor_looper, - post_processor_looper, - executor: executor_sender, - }) + Ok(TensorRtLlmBackendV2(executor_sender)) } fn validate(request: &ValidGenerateRequest) -> InferResult<()> { @@ -354,20 +310,21 @@ impl TensorRtLlmBackendV2 { impl Backend for TensorRtLlmBackendV2 { fn schedule( &self, - inner: ValidGenerateRequest, + request: ValidGenerateRequest, ) -> Result>, InferError> { - Self::validate(&inner)?; + Self::validate(&request)?; // Open-up the stream to send tokens let (streamer, receiver) = unbounded_channel::>(); // Send the context to the executor for scheduling let queued = Instant::now(); - match self.executor.send(GenerationContext { - request: inner, + match self.0.send(GenerationContext { + request, + streamer, + tokens: Vec::with_capacity(256), start: None, queued, - streamer, }) { Ok(_) => Ok(UnboundedReceiverStream::new(receiver)), Err(_) => Err(GenerationError( @@ -377,6 +334,6 @@ impl Backend for TensorRtLlmBackendV2 { } async fn health(&self, _: bool) -> bool { - !self.executor_looper.is_finished() & !self.post_processor_looper.is_finished() + true } } diff --git a/backends/trtllm/src/main.rs b/backends/trtllm/src/main.rs index 8ab8c533c..9c76bafa6 100644 --- a/backends/trtllm/src/main.rs +++ b/backends/trtllm/src/main.rs @@ -3,14 +3,13 @@ use std::path::{Path, PathBuf}; use clap::Parser; use hf_hub::api::tokio::{Api, ApiBuilder}; use hf_hub::{Cache, Repo, RepoType}; -use tokenizers::Tokenizer; use tracing::info; use text_generation_backends_trtllm::errors::TensorRtLlmBackendError; use text_generation_backends_trtllm::TensorRtLlmBackendV2; -use text_generation_router::server::get_base_tokenizer; use text_generation_router::usage_stats::UsageStatsLevel; -use text_generation_router::{server, HubTokenizerConfig}; +use text_generation_router::{server, HubTokenizerConfig, Tokenizer}; +use text_generation_router::server::{get_hub_model_info, legacy_tokenizer_handle, py_resolve_tokenizer}; /// App Configuration #[derive(Parser, Debug)] @@ -61,7 +60,7 @@ struct Args { #[clap(long, env, help = "Path to the TensorRT-LLM Orchestrator worker")] executor_worker: PathBuf, #[clap(default_value = "on", long, env)] - usage_stats: usage_stats::UsageStatsLevel, + usage_stats: UsageStatsLevel, #[clap(default_value = "2000000", long, env)] payload_limit: usize, } @@ -126,18 +125,18 @@ async fn get_tokenizer( // Load tokenizer and model info let ( - tokenizer_filename, - _config_filename, - tokenizer_config_filename, + config_filename, + _tokenizer_config_filename, _preprocessor_config_filename, _processor_config_filename, + _model_info ) = match api { Type::None => ( - Some(local_path.join("tokenizer.json")), Some(local_path.join("config.json")), Some(local_path.join("tokenizer_config.json")), Some(local_path.join("preprocessor_config.json")), Some(local_path.join("processor_config.json")), + None ), Type::Api(api) => { let api_repo = api.repo(Repo::with_revision( @@ -146,21 +145,24 @@ async fn get_tokenizer( 
revision.unwrap_or_else(|| "main").to_string(), )); - let tokenizer_filename = match api_repo.get("tokenizer.json").await { - Ok(tokenizer_filename) => Some(tokenizer_filename), - Err(_) => get_base_tokenizer(&api, &api_repo).await, - }; + let config_filename = api_repo.get("config.json").await.ok(); let tokenizer_config_filename = api_repo.get("tokenizer_config.json").await.ok(); let preprocessor_config_filename = api_repo.get("preprocessor_config.json").await.ok(); let processor_config_filename = api_repo.get("processor_config.json").await.ok(); + let model_info = if let Some(model_info) = get_hub_model_info(&api_repo).await { + Some(model_info) + } else { + tracing::warn!("Could not retrieve model info from the Hugging Face hub."); + None + }; ( - tokenizer_filename, config_filename, tokenizer_config_filename, preprocessor_config_filename, processor_config_filename, + model_info, ) } Type::Cache(cache) => { @@ -170,24 +172,55 @@ async fn get_tokenizer( revision.clone().unwrap_or_else(|| "main").to_string(), )); ( - repo.get("tokenizer.json"), repo.get("config.json"), repo.get("tokenizer_config.json"), repo.get("preprocessor_config.json"), repo.get("processor_config.json"), + None ) } }; // Read the JSON contents of the file as an instance of 'HubTokenizerConfig'. - let tokenizer_config: Option = if let Some(filename) = tokenizer_config_path - { - HubTokenizerConfig::from_file(filename) - } else { - tokenizer_config_filename.and_then(HubTokenizerConfig::from_file) + // let tokenizer_config: Option = if let Some(filename) = tokenizer_config_path + // { + // HubTokenizerConfig::from_file(filename) + // } else { + // tokenizer_config_filename.and_then(HubTokenizerConfig::from_file) + // }; + + // let tokenizer_config = tokenizer_config.unwrap_or_else(|| { + // tracing::warn!("Could not find tokenizer config locally and no API specified"); + // HubTokenizerConfig::default() + // }); + + let tokenizer: Tokenizer = { + use pyo3::prelude::*; + pyo3::Python::with_gil(|py| -> PyResult<()> { + py_resolve_tokenizer(py, &tokenizer_name, revision.as_deref(), false)?; + Ok(()) + }) + .inspect_err(|err| { + tracing::error!("Failed to import python tokenizer {err}"); + }) + .or_else(|err| { + let out = legacy_tokenizer_handle(config_filename.as_ref()); + out.ok_or(err) + }) + .expect("We cannot load a tokenizer"); + let filename = "out/tokenizer.json"; + if let Ok(tok) = tokenizers::Tokenizer::from_file(filename) { + Tokenizer::Rust(tok) + } else { + Tokenizer::Python { + tokenizer_name: tokenizer_name.to_string(), + revision: revision.map(|revision| revision.to_string()), + trust_remote_code: false, + } + } }; - tokenizer_filename.and_then(|filename| Tokenizer::from_file(filename).ok()) + Some(tokenizer) } #[tokio::main] @@ -258,50 +291,55 @@ async fn main() -> Result<(), TensorRtLlmBackendError> { } // Create the backend - let tokenizer = get_tokenizer( + match get_tokenizer( &tokenizer_name, tokenizer_config_path.as_deref(), revision.as_deref(), ) .await - .expect("Failed to retrieve tokenizer implementation"); + .expect("Failed to retrieve tokenizer implementation") { + Tokenizer::Python { .. 
} => { + Err(TensorRtLlmBackendError::Tokenizer("Failed to retrieve Rust based tokenizer".to_string())) + } + Tokenizer::Rust(tokenizer) => { + info!("Successfully retrieved tokenizer {}", &tokenizer_name); + let backend = TensorRtLlmBackendV2::new( + tokenizer, + model_id, + executor_worker, + max_concurrent_requests, + )?; - info!("Successfully retrieved tokenizer {}", &tokenizer_name); - let backend = TensorRtLlmBackendV2::new( - tokenizer, - model_id, - executor_worker, - max_concurrent_requests, - )?; + info!("Successfully created backend"); - info!("Successfully created backend"); + // Run server + server::run( + backend, + max_concurrent_requests, + max_best_of, + max_stop_sequences, + max_top_n_tokens, + max_input_tokens, + max_total_tokens, + validation_workers, + auth_token, + tokenizer_name, + tokenizer_config_path, + revision, + false, + hostname, + port, + cors_allow_origin, + false, + None, + None, + true, + max_client_batch_size, + usage_stats, + payload_limit, + ).await?; + Ok(()) + } + } - // Run server - server::run( - backend, - max_concurrent_requests, - max_best_of, - max_stop_sequences, - max_top_n_tokens, - max_input_tokens, - max_total_tokens, - validation_workers, - auth_token, - tokenizer_name, - tokenizer_config_path, - revision, - false, - hostname, - port, - cors_allow_origin, - false, - None, - None, - true, - max_client_batch_size, - usage_stats, - payload_limit, - ) - .await?; - Ok(()) } diff --git a/backends/trtllm/tests/infer_test.cpp b/backends/trtllm/tests/infer_test.cpp deleted file mode 100644 index 8520065a7..000000000 --- a/backends/trtllm/tests/infer_test.cpp +++ /dev/null @@ -1,14 +0,0 @@ -// -// Created by mfuntowicz on 7/2/24. -// -#include -#include -#include "../include/backend.h" - -TEST_CASE("Load TRTLLM Engine on the TGI Backend", "[trtllm][engine][load]") { - const auto engines = std::filesystem::path("/home/mfuntowicz/.cache/huggingface/assets/trtllm/0.11.0.dev2024062500/meta-llama--Meta-Llama-3-8B-Instruct/4090/engines/"); - const auto executor = std::filesystem::path("/home/mfuntowicz/Workspace/text-generation-inference/backends/trtllm/cmake-build-debug/cmake-build-debug/_deps/trtllm-src/cpp/tensorrt_llm/executor_worker/executorWorker"); - - spdlog::info("Loading config from: {}", absolute(engines).string()); - huggingface::tgi::backends::TensorRtLlmBackend backend(engines, executor); -} diff --git a/backends/trtllm/tests/test_backend.cpp b/backends/trtllm/tests/test_backend.cpp new file mode 100644 index 000000000..ae097405b --- /dev/null +++ b/backends/trtllm/tests/test_backend.cpp @@ -0,0 +1,152 @@ +// +// Created by mfuntowicz on 12/3/24. 
+// + +#include +#include +#include + +#include "backend.hpp" + + + +using namespace huggingface::tgi::backends::trtllm; + +TEST_CASE("parse generation_config.json all set", "[generation_config_t]") +{ + const json config_j = {{"temperature", 0.6}, {"top_p", 0.95}, {"eos_token_id", {1,2,3}}}; + const auto generation_config = generation_config_t(config_j); + + REQUIRE_THAT(generation_config.temperature, Catch::Matchers::WithinAbs(0.6, 1e-6)); + REQUIRE_THAT(generation_config.top_p, Catch::Matchers::WithinAbs(0.95, 1e-6)); + + // Stop words + REQUIRE_FALSE(generation_config.stop_words.empty()); + REQUIRE(generation_config.stop_words.size() == config_j["/eos_token_id"_json_pointer].size()); + + for (auto [lhs, rhs] : std::views::zip(generation_config.stop_words, std::list>{{1}, {2}, {3}})) + { + // Currently we do not support multi-tokens stop words + REQUIRE(lhs.size() == 1); + REQUIRE(rhs.size() == 1); + REQUIRE_THAT(lhs, Catch::Matchers::UnorderedEquals(rhs)); + } +} + +TEST_CASE("parse generation_config.json default", "[generation_config_t]") +{ + const json config_j = {{"eos_token_id", {1,2,3}}}; + const auto generation_config = generation_config_t(config_j); + + REQUIRE_THAT(generation_config.temperature, Catch::Matchers::WithinAbs(1.0, 1e-6)); + REQUIRE_THAT(generation_config.top_p, Catch::Matchers::WithinAbs(1.0, 1e-6)); + + REQUIRE_FALSE(generation_config.stop_words.empty()); + REQUIRE(generation_config.stop_words.size() == config_j["/eos_token_id"_json_pointer].size()); + + for (auto [lhs, rhs] : std::views::zip(generation_config.stop_words, std::list>{{1}, {2}, {3}})) + { + // Currently we do not support multi-tokens stop words + REQUIRE(lhs.size() == 1); + REQUIRE(rhs.size() == 1); + REQUIRE_THAT(lhs, Catch::Matchers::UnorderedEquals(rhs)); + } +} + +TEST_CASE("parse generation_config.json empty", "[generation_config_t]") +{ + const json config_j = {{"eos_token_id", {}}}; + const auto generation_config = generation_config_t(config_j); + + REQUIRE_THAT(generation_config.temperature, Catch::Matchers::WithinAbs(1.0, 1e-6)); + REQUIRE_THAT(generation_config.top_p, Catch::Matchers::WithinAbs(1.0, 1e-6)); + + REQUIRE(generation_config.stop_words.empty()); + + const json config_j2 = {}; + const auto generation_config2 = generation_config_t(config_j); + + REQUIRE_THAT(generation_config2.temperature, Catch::Matchers::WithinAbs(1.0, 1e-6)); + REQUIRE_THAT(generation_config2.top_p, Catch::Matchers::WithinAbs(1.0, 1e-6)); + + REQUIRE(generation_config2.stop_words.empty()); +} + +TEST_CASE("parallel_config single", "[backend_workspace_t]") +{ + // Generate temporary folder + const auto tmp_p = std::filesystem::temp_directory_path(); + const auto config_p = tmp_p / "config.json"; + const auto generation_config_p = tmp_p / "generation_config.json"; + + // Generate content + std::ofstream o_config(config_p); + o_config << R"({"pretrained_config": {"mapping": {"world_size": 2}}})"_json; + o_config.close(); + + std::ofstream o_generation_config(generation_config_p); + o_generation_config << R"({"eos_token_id": []})"_json; + o_generation_config.close(); + + const auto workspace = backend_workspace_t(tmp_p.generic_string(), tmp_p.generic_string()); + const auto parallel = workspace.parallel_config(); + REQUIRE(parallel.getCommunicationMode() == tle::CommunicationMode::kORCHESTRATOR); + REQUIRE(parallel.getCommunicationType() == tle::CommunicationType::kMPI); + + std::filesystem::remove(config_p); + std::filesystem::remove(generation_config_p); +} + +TEST_CASE("parallel_config multi", 
"[backend_workspace_t]") +{ + // Generate temporary folder + const auto tmp_p = std::filesystem::temp_directory_path(); + const auto config_p = tmp_p / "config.json"; + const auto generation_config_p = tmp_p / "generation_config.json"; + + // Generate content + std::ofstream o_config(config_p); + o_config << R"({"pretrained_config": {"mapping": {"world_size": 1}}})"_json; + o_config.close(); + + std::ofstream o_generation_config(generation_config_p); + o_generation_config << R"({"eos_token_id": []})"_json; + o_generation_config.close(); + + const auto workspace = backend_workspace_t(tmp_p.generic_string(), tmp_p.generic_string()); + const auto parallel = workspace.parallel_config(); + REQUIRE(parallel.getCommunicationMode() == tle::CommunicationMode::kLEADER); + REQUIRE(parallel.getCommunicationType() == tle::CommunicationType::kMPI); + + std::filesystem::remove(config_p); + std::filesystem::remove(generation_config_p); +} + +TEST_CASE("executor_config", "[backend_workspace_t]") +{ + +} + +TEST_CASE("sampling_params_t to tle::SamplingConfig", "[backend_t]") +{ + const sampling_params_t params = {40, 0.95, 0.9, 1.0, 0.6, 2014}; + const auto config = static_cast(params); + + REQUIRE(config.getTopK().has_value()); + REQUIRE(config.getTopK().value() == params.top_k); + + REQUIRE(config.getSeed().has_value()); + REQUIRE(config.getSeed().value() == params.seed); + + REQUIRE(config.getTopP().has_value()); + REQUIRE_THAT(*config.getTopP(), Catch::Matchers::WithinAbs(params.top_p, 1e-6f)); + + REQUIRE(config.getRepetitionPenalty().has_value()); + REQUIRE_THAT(*config.getRepetitionPenalty(), Catch::Matchers::WithinAbs(params.repetition_penalty, 1e-6f)); + + REQUIRE(config.getFrequencyPenalty().has_value()); + REQUIRE_THAT(*config.getFrequencyPenalty(), Catch::Matchers::WithinAbs(params.frequency_penalty, 1e-6f)); + + REQUIRE(config.getTemperature().has_value()); + REQUIRE_THAT(*config.getTemperature(), Catch::Matchers::WithinAbs(params.temperature, 1e-6f)); +} \ No newline at end of file diff --git a/backends/trtllm/tests/test_hardware.cpp b/backends/trtllm/tests/test_hardware.cpp new file mode 100644 index 000000000..4cb7b5620 --- /dev/null +++ b/backends/trtllm/tests/test_hardware.cpp @@ -0,0 +1,82 @@ +// +// Created by mfuntowicz on 11/16/24. 
+// + +#include +#include "../csrc/hardware.hpp" + +using namespace huggingface::tgi::hardware::cuda; + +TEST_CASE("is_at_least_") { + const static auto VOLTA_CAPABILITIES = compute_capabilities_t(7, 0); + REQUIRE(VOLTA_CAPABILITIES.is_at_least_volta()); + REQUIRE_FALSE(VOLTA_CAPABILITIES.is_at_least_turing()); + REQUIRE_FALSE(VOLTA_CAPABILITIES.is_at_least_ampere()); + REQUIRE_FALSE(VOLTA_CAPABILITIES.is_at_least_ada_lovelace()); + REQUIRE_FALSE(VOLTA_CAPABILITIES.is_at_least_hopper()); + + const static auto TURING_CAPABILITIES = compute_capabilities_t(7, 5); + REQUIRE(TURING_CAPABILITIES.is_at_least_volta()); + REQUIRE(TURING_CAPABILITIES.is_at_least_turing()); + REQUIRE_FALSE(TURING_CAPABILITIES.is_at_least_ampere()); + REQUIRE_FALSE(TURING_CAPABILITIES.is_at_least_ada_lovelace()); + REQUIRE_FALSE(TURING_CAPABILITIES.is_at_least_hopper()); + + const static auto AMPERE_CAPABILITIES = compute_capabilities_t(8, 0); + REQUIRE(AMPERE_CAPABILITIES.is_at_least_volta()); + REQUIRE(AMPERE_CAPABILITIES.is_at_least_turing()); + REQUIRE(AMPERE_CAPABILITIES.is_at_least_ampere()); + REQUIRE_FALSE(AMPERE_CAPABILITIES.is_at_least_ada_lovelace()); + REQUIRE_FALSE(AMPERE_CAPABILITIES.is_at_least_hopper()); + + const static auto ADA_LOVELACE_CAPABILITIES = compute_capabilities_t(8, 9); + REQUIRE(ADA_LOVELACE_CAPABILITIES.is_at_least_volta()); + REQUIRE(ADA_LOVELACE_CAPABILITIES.is_at_least_turing()); + REQUIRE(ADA_LOVELACE_CAPABILITIES.is_at_least_ampere()); + REQUIRE(ADA_LOVELACE_CAPABILITIES.is_at_least_ada_lovelace()); + REQUIRE_FALSE(ADA_LOVELACE_CAPABILITIES.is_at_least_hopper()); + + const static auto HOPPER_CAPABILITIES = compute_capabilities_t(9, 0); + REQUIRE(HOPPER_CAPABILITIES.is_at_least_volta()); + REQUIRE(HOPPER_CAPABILITIES.is_at_least_turing()); + REQUIRE(HOPPER_CAPABILITIES.is_at_least_ampere()); + REQUIRE(HOPPER_CAPABILITIES.is_at_least_ada_lovelace()); + REQUIRE(HOPPER_CAPABILITIES.is_at_least_hopper()); +} + +TEST_CASE("is_at_least") { + const static auto VOLTA_CAPABILITIES = compute_capabilities_t(7, 0); + REQUIRE(VOLTA_CAPABILITIES.is_at_least(VOLTA)); + REQUIRE_FALSE(VOLTA_CAPABILITIES.is_at_least(TURING)); + REQUIRE_FALSE(VOLTA_CAPABILITIES.is_at_least(AMPERE)); + REQUIRE_FALSE(VOLTA_CAPABILITIES.is_at_least(ADA_LOVELACE)); + REQUIRE_FALSE(VOLTA_CAPABILITIES.is_at_least(HOPPER)); + + const static auto TURING_CAPABILITIES = compute_capabilities_t(7, 5); + REQUIRE(TURING_CAPABILITIES.is_at_least(VOLTA)); + REQUIRE(TURING_CAPABILITIES.is_at_least(TURING)); + REQUIRE_FALSE(TURING_CAPABILITIES.is_at_least(AMPERE)); + REQUIRE_FALSE(TURING_CAPABILITIES.is_at_least(ADA_LOVELACE)); + REQUIRE_FALSE(TURING_CAPABILITIES.is_at_least(HOPPER)); + + const static auto AMPERE_CAPABILITIES = compute_capabilities_t(8, 0); + REQUIRE(AMPERE_CAPABILITIES.is_at_least(VOLTA)); + REQUIRE(AMPERE_CAPABILITIES.is_at_least(TURING)); + REQUIRE(AMPERE_CAPABILITIES.is_at_least(AMPERE)); + REQUIRE_FALSE(AMPERE_CAPABILITIES.is_at_least(ADA_LOVELACE)); + REQUIRE_FALSE(AMPERE_CAPABILITIES.is_at_least(HOPPER)); + + const static auto ADA_LOVELACE_CAPABILITIES = compute_capabilities_t(8, 9); + REQUIRE(ADA_LOVELACE_CAPABILITIES.is_at_least(VOLTA)); + REQUIRE(ADA_LOVELACE_CAPABILITIES.is_at_least(TURING)); + REQUIRE(ADA_LOVELACE_CAPABILITIES.is_at_least(AMPERE)); + REQUIRE(ADA_LOVELACE_CAPABILITIES.is_at_least(ADA_LOVELACE)); + REQUIRE_FALSE(ADA_LOVELACE_CAPABILITIES.is_at_least(HOPPER)); + + const static auto HOPPER_CAPABILITIES = compute_capabilities_t (9, 0); + REQUIRE(HOPPER_CAPABILITIES.is_at_least(VOLTA)); + 
REQUIRE(HOPPER_CAPABILITIES.is_at_least(TURING)); + REQUIRE(HOPPER_CAPABILITIES.is_at_least(AMPERE)); + REQUIRE(HOPPER_CAPABILITIES.is_at_least(ADA_LOVELACE)); + REQUIRE(HOPPER_CAPABILITIES.is_at_least(HOPPER)); +} \ No newline at end of file diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index e31a37884..4503424bb 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -17,6 +17,8 @@ title: Using TGI with Intel GPUs - local: installation title: Installation from source + - local: multi_backend_support + title: Multi-backend support - local: architecture title: Internal Architecture @@ -45,6 +47,10 @@ - local: basic_tutorials/train_medusa title: Train Medusa title: Tutorials +- sections: + - local: backends/trtllm + title: TensorRT-LLM + title: Backends - sections: - local: reference/launcher title: All TGI CLI options diff --git a/docs/source/architecture.md b/docs/source/architecture.md index 6660630d0..d3a6fa926 100644 --- a/docs/source/architecture.md +++ b/docs/source/architecture.md @@ -9,8 +9,10 @@ A high-level architecture diagram can be seen here: This diagram shows well there are these separate components: - **The router**, also named `webserver`, that receives the client requests, buffers them, creates some batches, and prepares gRPC calls to a model server. -- **The model server**, responsible of receiving the gRPC requests and to process the inference on the model. If the model is sharded across multiple accelerators (e.g.: multiple GPUs), the model server shards might be synchronized via NCCL or equivalent. - **The launcher** is a helper that will be able to launch one or several model servers (if model is sharded), and it launches the router with the compatible arguments. +- **The model server**, responsible for receiving the gRPC requests and to process the inference on the model. If the model is sharded across multiple accelerators (e.g.: multiple GPUs), the model server shards might be synchronized via NCCL or equivalent. + +Note that for other backends (eg. TRTLLM) the model server and launcher are specific to the backend. The router and the model server can be two different machines, they do not need to be deployed together. diff --git a/docs/source/backends/trtllm.md b/docs/source/backends/trtllm.md new file mode 100644 index 000000000..8eb37180c --- /dev/null +++ b/docs/source/backends/trtllm.md @@ -0,0 +1,81 @@ +# TensorRT-LLM backend + +The NVIDIA TensorRT-LLM (TRTLLM) backend is a high-performance backend for LLMs +that uses NVIDIA's TensorRT library for inference acceleration. +It makes use of specific optimizations for NVIDIA GPUs, such as custom kernels. + +To use the TRTLLM backend you need to compile `engines` for the models you want to use. +Each `engine` must be compiled on the same GPU architecture that you will use for inference. + +## Supported models + +Check the [support matrix](https://nvidia.github.io/TensorRT-LLM/reference/support-matrix.html) to see which models are +supported. + +## Compiling engines + +You can use [Optimum-NVIDIA](https://github.com/huggingface/optimum-nvidia) to compile engines for the models you +want to use. 
+ +```bash +MODEL_NAME="meta-llama/Llama-3.1-8B-Instruct" + +# Install huggingface_cli +python -m pip install huggingface-cli[hf_transfer] + +# Login to the Hugging Face Hub +huggingface-cli login + +# Create a directory to store the model +mkdir -p /tmp/models/$MODEL_NAME + +# Create a directory to store the compiled engine +mkdir -p /tmp/engines/$MODEL_NAME + +# Download the model +HF_HUB_ENABLE_HF_TRANSFER=1 huggingface-cli download --local-dir /tmp/models/$MODEL_NAME $MODEL_NAME + +# Compile the engine using Optimum-NVIDIA +docker run \ + --rm \ + -it \ + --gpus=1 \ + -v /tmp/models/$MODEL_NAME:/model \ + -v /tmp/engines/$MODEL_NAME:/engine \ + huggingface/optimum-nvidia \ + optimum-cli export trtllm \ + --tp=1 \ + --pp=1 \ + --max-batch-size=128 \ + --max-input-length 4096 \ + --max-output-length 8192 \ + --max-beams-width=1 \ + --destination /engine \ + $MODEL_NAME +``` + +Your compiled engine will be saved in the `/tmp/engines/$MODEL_NAME` directory. + +## Using the TRTLLM backend + +Run TGI-TRTLLM Docker image with the compiled engine: + +```bash +docker run \ + --gpus 1 \ + -it \ + --rm \ + -p 3000:3000 \ + -e MODEL=$MODEL_NAME \ + -e PORT=3000 \ + -e HF_TOKEN='hf_XXX' \ + -v /tmp/engines/$MODEL_NAME:/data \ + ghcr.io/huggingface/text-generation-inference:latest-trtllm \ + --executor-worker executorWorker \ + --model-id /data/$MODEL_NAME +``` + +## Development + +To develop TRTLLM backend, you can use [dev containers](https://containers.dev/) located in +`.devcontainer` directory. \ No newline at end of file diff --git a/docs/source/multi_backend_support.md b/docs/source/multi_backend_support.md new file mode 100644 index 000000000..5899e4b77 --- /dev/null +++ b/docs/source/multi_backend_support.md @@ -0,0 +1,13 @@ +# Multi-backend support + +TGI (Text Generation Inference) offers flexibility by supporting multiple backends for serving large language models (LLMs). +With multi-backend support, you can choose the backend that best suits your needs, +whether you prioritize performance, ease of use, or compatibility with specific hardware. API interaction with +TGI remains consistent across backends, allowing you to switch between them seamlessly. + +**Supported backends:** +* **TGI CUDA backend**: This high-performance backend is optimized for NVIDIA GPUs and serves as the default option + within TGI. Developed in-house, it boasts numerous optimizations and is used in production by various projects, including those by Hugging Face. +* **[TGI TRTLLM backend](./backends/trtllm)**: This backend leverages NVIDIA's TensorRT library to accelerate LLM inference. + It utilizes specialized optimizations and custom kernels for enhanced performance. + However, it requires a model-specific compilation step for each GPU architecture. 
\ No newline at end of file diff --git a/router/src/server.rs b/router/src/server.rs index 29df3d06d..aef0f8120 100644 --- a/router/src/server.rs +++ b/router/src/server.rs @@ -1593,7 +1593,7 @@ pub fn schema() -> ApiDoc { ApiDoc } -fn py_resolve_tokenizer( +pub fn py_resolve_tokenizer( py: pyo3::Python, tokenizer_name: &str, revision: Option<&str>, @@ -1619,7 +1619,7 @@ fn py_resolve_tokenizer( Ok(()) } -fn legacy_tokenizer_handle(config_filename: Option<&PathBuf>) -> Option<()> { +pub fn legacy_tokenizer_handle(config_filename: Option<&PathBuf>) -> Option<()> { // XXX Legacy case for FasterDecoding/medusa-vicuna-7b-v1.3 // and state-spaces/mamba-130m tracing::warn!("Odd tokenizer detected, falling back on legacy tokenization"); From 1708865fdc05432565c7fc33e0ba0ce99e235d1c Mon Sep 17 00:00:00 2001 From: Hugo Larcher Date: Fri, 13 Dec 2024 16:19:06 +0100 Subject: [PATCH 003/183] Feat/trtllm cancellation dev container (#2795) Add devcontainers for TRTLLM backend. --------- Co-authored-by: Morgan Funtowicz --- .devcontainer/Dockerfile.trtllm | 0 .devcontainer/Dockerfile_trtllm | 75 +++++++++++++++++++++++++++++++++ .devcontainer/devcontainer.json | 19 +++++++++ 3 files changed, 94 insertions(+) delete mode 100644 .devcontainer/Dockerfile.trtllm create mode 100644 .devcontainer/Dockerfile_trtllm diff --git a/.devcontainer/Dockerfile.trtllm b/.devcontainer/Dockerfile.trtllm deleted file mode 100644 index e69de29bb..000000000 diff --git a/.devcontainer/Dockerfile_trtllm b/.devcontainer/Dockerfile_trtllm new file mode 100644 index 000000000..21b7114ce --- /dev/null +++ b/.devcontainer/Dockerfile_trtllm @@ -0,0 +1,75 @@ +ARG CUDA_ARCH_LIST="75-real;80-real;86-real;89-real;90-real" +ARG OMPI_VERSION="4.1.7rc1" + +# Build dependencies resolver stage +FROM lukemathwalker/cargo-chef:latest AS chef +WORKDIR /usr/src/text-generation-inference/backends/trtllm + +FROM chef AS planner +COPY . . 
+RUN cargo chef prepare --recipe-path recipe.json + +# CUDA dependent dependencies resolver stage +FROM nvidia/cuda:12.6.3-cudnn-devel-ubuntu24.04 AS cuda-builder + +RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ + --mount=type=cache,target=/var/lib/apt,sharing=locked \ + apt update && apt install -y \ + build-essential \ + cmake \ + curl \ + gcc-14 \ + g++-14 \ + git \ + git-lfs \ + libssl-dev \ + libucx-dev \ + ninja-build \ + pkg-config \ + pipx \ + python3 \ + python3-dev \ + python3-setuptools \ + tar \ + wget && \ + pipx ensurepath + +ENV TGI_INSTALL_PREFIX=/usr/local/tgi +ENV TENSORRT_INSTALL_PREFIX=/usr/local/tensorrt + +# Install OpenMPI +FROM cuda-builder AS mpi-builder +ARG OMPI_VERSION + +ENV OMPI_TARBALL_FILENAME="openmpi-$OMPI_VERSION.tar.bz2" +RUN wget "https://download.open-mpi.org/release/open-mpi/v4.1/$OMPI_TARBALL_FILENAME" -P /opt/src && \ + mkdir /usr/src/mpi && \ + tar -xf "/opt/src/$OMPI_TARBALL_FILENAME" -C /usr/src/mpi --strip-components=1 && \ + cd /usr/src/mpi && \ + ./configure --prefix=/usr/local/mpi --with-cuda=/usr/local/cuda --with-slurm && \ + make -j all && \ + make install && \ + rm -rf "/opt/src/$OMPI_TARBALL_FILENAME" + +# Install TensorRT +FROM cuda-builder AS trt-builder +COPY backends/trtllm/scripts/install_tensorrt.sh /opt/install_tensorrt.sh +RUN chmod +x /opt/install_tensorrt.sh && \ + /opt/install_tensorrt.sh + +# Build Backend +FROM cuda-builder AS tgi-builder +WORKDIR /usr/src/text-generation-inference + +# Install Rust +RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | bash -s -- -y && \ + chmod -R a+w /root/.rustup && \ + chmod -R a+w /root/.cargo + +ENV PATH="/root/.cargo/bin:$PATH" +RUN cargo install cargo-chef + +COPY --from=trt-builder /usr/local/tensorrt /usr/local/tensorrt +COPY --from=mpi-builder /usr/local/mpi /usr/local/mpi + +ENV MPI_HOME=/usr/local/mpi \ No newline at end of file diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index e69de29bb..8b4eb89c8 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -0,0 +1,19 @@ +// For format details, see https://aka.ms/devcontainer.json. For config options, see the +// README at: https://github.com/devcontainers/templates/tree/main/src/cpp +{ + "name": "CUDA", + "build": { + "dockerfile": "Dockerfile_trtllm", + "context": ".." + }, + "remoteEnv": { + "PATH": "${containerEnv:PATH}:/usr/local/cuda/bin", + "LD_LIBRARY_PATH": "$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64", + "XLA_FLAGS": "--xla_gpu_cuda_data_dir=/usr/local/cuda" + }, + "customizations" : { + "jetbrains" : { + "backend" : "CLion" + } + } +} From 6f0b8c947daacee9f5c47c63d60b7b16d50d35a1 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Mon, 16 Dec 2024 10:34:50 +0100 Subject: [PATCH 004/183] New arg. (#2845) --- router/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/router/src/lib.rs b/router/src/lib.rs index d6e42edb2..84e9bc482 100644 --- a/router/src/lib.rs +++ b/router/src/lib.rs @@ -459,7 +459,7 @@ pub struct CompletionRequest { pub prompt: Prompt, /// The maximum number of tokens that can be generated in the chat completion. - #[serde(default)] + #[serde(default, alias = "max_completion_tokens")] #[schema(default = "1024", example = "32")] pub max_tokens: Option, From 11ab329883b70796b3fd417d01caf57d0d2daf73 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Mon, 16 Dec 2024 10:58:15 +0100 Subject: [PATCH 005/183] Fixing CI. 
(#2846) --- backends/trtllm/build.rs | 13 ++++++------ backends/trtllm/src/looper.rs | 26 +++++++++++------------ backends/trtllm/src/main.rs | 40 ++++++++++++++++++----------------- docs/openapi.json | 2 +- 4 files changed, 42 insertions(+), 39 deletions(-) diff --git a/backends/trtllm/build.rs b/backends/trtllm/build.rs index 0a0f6e6bf..d9c1aa15e 100644 --- a/backends/trtllm/build.rs +++ b/backends/trtllm/build.rs @@ -44,7 +44,8 @@ fn build_backend(is_debug: bool, opt_level: &str, out_dir: &PathBuf) -> (PathBuf } let mut config = cmake::Config::new("."); - config.uses_cxx11() + config + .uses_cxx11() .generator("Ninja") .profile(match is_debug { true => "Debug", @@ -57,12 +58,12 @@ fn build_backend(is_debug: bool, opt_level: &str, out_dir: &PathBuf) -> (PathBuf .define("TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST", cuda_arch_list) .define("TGI_TRTLLM_BACKEND_TRT_ROOT", tensorrt_path); - // Allow to override which Python to use ... - if let Some(python3) = option_env!("Python3_EXECUTABLE") { - config.define("Python3_EXECUTABLE", python3); - } + // Allow to override which Python to use ... + if let Some(python3) = option_env!("Python3_EXECUTABLE") { + config.define("Python3_EXECUTABLE", python3); + } - config.build(); + config.build(); // Additional transitive CMake dependencies let deps_folder = out_dir.join("build").join("_deps"); diff --git a/backends/trtllm/src/looper.rs b/backends/trtllm/src/looper.rs index 3addd95f9..969046d18 100644 --- a/backends/trtllm/src/looper.rs +++ b/backends/trtllm/src/looper.rs @@ -133,13 +133,16 @@ fn executor_status_looper( Ok(decoded_token) => { post_process_decoded_token(&tokenizer, ctx, decoded_token) } - Err(err) => Err(err) + Err(err) => Err(err), }; // Attempt to send back the response to the client if let Err(_) = ctx.streamer.send(response) { // Client has dropped, remove from tracked requests - debug!("Client dropped - removing request {} from tracked requests", step.request_id); + debug!( + "Client dropped - removing request {} from tracked requests", + step.request_id + ); backend.as_mut().cancel(step.request_id); let _ = in_flights.remove(&step.request_id); } @@ -160,11 +163,14 @@ fn executor_status_looper( } } -fn post_process_decoded_token(tokenizer: &Tokenizer, ctx: &mut GenerationContext, decoded_token: DecodedToken) -> InferResult { +fn post_process_decoded_token( + tokenizer: &Tokenizer, + ctx: &mut GenerationContext, + decoded_token: DecodedToken, +) -> InferResult { match tokenizer.decode(&[decoded_token.id], false) { Ok(text) => { - let is_special = - tokenizer.get_added_vocabulary().is_special_token(&text); + let is_special = tokenizer.get_added_vocabulary().is_special_token(&text); let token = Token { id: decoded_token.id, text, @@ -186,7 +192,7 @@ fn post_process_decoded_token(tokenizer: &Tokenizer, ctx: &mut GenerationContext let generated_text = GeneratedText { text: text.unwrap(), generated_tokens: ctx.tokens.len() as u32, - finish_reason: FinishReason::EndOfSequenceToken, // TODO : Map FinishReason + finish_reason: FinishReason::EndOfSequenceToken, // TODO : Map FinishReason seed: None, }; @@ -248,7 +254,6 @@ unsafe impl Send for TensorRtLlmBackendImpl {} pub struct TensorRtLlmBackendV2(UnboundedSender); - impl TensorRtLlmBackendV2 { pub fn new + Send, PP: AsRef + Send>( tokenizer: Tokenizer, @@ -268,12 +273,7 @@ impl TensorRtLlmBackendV2 { // Executor looper is responsible for scheduling and pulling requests state at regular interval spawn_blocking(move || { - executor_status_looper( - max_inflight_requests, - tokenizer, - 
backend, - executor_receiver, - ) + executor_status_looper(max_inflight_requests, tokenizer, backend, executor_receiver) }); Ok(TensorRtLlmBackendV2(executor_sender)) diff --git a/backends/trtllm/src/main.rs b/backends/trtllm/src/main.rs index 9c76bafa6..af299f7d0 100644 --- a/backends/trtllm/src/main.rs +++ b/backends/trtllm/src/main.rs @@ -7,9 +7,11 @@ use tracing::info; use text_generation_backends_trtllm::errors::TensorRtLlmBackendError; use text_generation_backends_trtllm::TensorRtLlmBackendV2; +use text_generation_router::server::{ + get_hub_model_info, legacy_tokenizer_handle, py_resolve_tokenizer, +}; use text_generation_router::usage_stats::UsageStatsLevel; use text_generation_router::{server, HubTokenizerConfig, Tokenizer}; -use text_generation_router::server::{get_hub_model_info, legacy_tokenizer_handle, py_resolve_tokenizer}; /// App Configuration #[derive(Parser, Debug)] @@ -129,14 +131,14 @@ async fn get_tokenizer( _tokenizer_config_filename, _preprocessor_config_filename, _processor_config_filename, - _model_info + _model_info, ) = match api { Type::None => ( Some(local_path.join("config.json")), Some(local_path.join("tokenizer_config.json")), Some(local_path.join("preprocessor_config.json")), Some(local_path.join("processor_config.json")), - None + None, ), Type::Api(api) => { let api_repo = api.repo(Repo::with_revision( @@ -145,7 +147,6 @@ async fn get_tokenizer( revision.unwrap_or_else(|| "main").to_string(), )); - let config_filename = api_repo.get("config.json").await.ok(); let tokenizer_config_filename = api_repo.get("tokenizer_config.json").await.ok(); let preprocessor_config_filename = api_repo.get("preprocessor_config.json").await.ok(); @@ -176,7 +177,7 @@ async fn get_tokenizer( repo.get("tokenizer_config.json"), repo.get("preprocessor_config.json"), repo.get("processor_config.json"), - None + None, ) } }; @@ -200,14 +201,14 @@ async fn get_tokenizer( py_resolve_tokenizer(py, &tokenizer_name, revision.as_deref(), false)?; Ok(()) }) - .inspect_err(|err| { - tracing::error!("Failed to import python tokenizer {err}"); - }) - .or_else(|err| { - let out = legacy_tokenizer_handle(config_filename.as_ref()); - out.ok_or(err) - }) - .expect("We cannot load a tokenizer"); + .inspect_err(|err| { + tracing::error!("Failed to import python tokenizer {err}"); + }) + .or_else(|err| { + let out = legacy_tokenizer_handle(config_filename.as_ref()); + out.ok_or(err) + }) + .expect("We cannot load a tokenizer"); let filename = "out/tokenizer.json"; if let Ok(tok) = tokenizers::Tokenizer::from_file(filename) { Tokenizer::Rust(tok) @@ -297,10 +298,11 @@ async fn main() -> Result<(), TensorRtLlmBackendError> { revision.as_deref(), ) .await - .expect("Failed to retrieve tokenizer implementation") { - Tokenizer::Python { .. } => { - Err(TensorRtLlmBackendError::Tokenizer("Failed to retrieve Rust based tokenizer".to_string())) - } + .expect("Failed to retrieve tokenizer implementation") + { + Tokenizer::Python { .. 
} => Err(TensorRtLlmBackendError::Tokenizer( + "Failed to retrieve Rust based tokenizer".to_string(), + )), Tokenizer::Rust(tokenizer) => { info!("Successfully retrieved tokenizer {}", &tokenizer_name); let backend = TensorRtLlmBackendV2::new( @@ -337,9 +339,9 @@ async fn main() -> Result<(), TensorRtLlmBackendError> { max_client_batch_size, usage_stats, payload_limit, - ).await?; + ) + .await?; Ok(()) } } - } diff --git a/docs/openapi.json b/docs/openapi.json index 1caf67525..48120f77e 100644 --- a/docs/openapi.json +++ b/docs/openapi.json @@ -10,7 +10,7 @@ "name": "Apache 2.0", "url": "https://www.apache.org/licenses/LICENSE-2.0" }, - "version": "3.0.1-dev0" + "version": "3.0.2-dev0" }, "paths": { "/": { From a72f339c79728c12e66629ac179d735d586ec10c Mon Sep 17 00:00:00 2001 From: drbh Date: Mon, 16 Dec 2024 16:12:34 -0500 Subject: [PATCH 006/183] fix: lint backend and doc files (#2850) --- .devcontainer/Dockerfile_trtllm | 2 +- backends/trtllm/csrc/backend.hpp | 2 +- backends/trtllm/csrc/ffi.hpp | 2 +- backends/trtllm/csrc/hardware.hpp | 2 +- backends/trtllm/tests/test_backend.cpp | 2 +- backends/trtllm/tests/test_hardware.cpp | 2 +- docs/source/backends/trtllm.md | 8 ++++---- docs/source/multi_backend_support.md | 12 ++++++------ 8 files changed, 16 insertions(+), 16 deletions(-) diff --git a/.devcontainer/Dockerfile_trtllm b/.devcontainer/Dockerfile_trtllm index 21b7114ce..239a7bf8c 100644 --- a/.devcontainer/Dockerfile_trtllm +++ b/.devcontainer/Dockerfile_trtllm @@ -72,4 +72,4 @@ RUN cargo install cargo-chef COPY --from=trt-builder /usr/local/tensorrt /usr/local/tensorrt COPY --from=mpi-builder /usr/local/mpi /usr/local/mpi -ENV MPI_HOME=/usr/local/mpi \ No newline at end of file +ENV MPI_HOME=/usr/local/mpi diff --git a/backends/trtllm/csrc/backend.hpp b/backends/trtllm/csrc/backend.hpp index f49c437a2..40b44a842 100644 --- a/backends/trtllm/csrc/backend.hpp +++ b/backends/trtllm/csrc/backend.hpp @@ -228,4 +228,4 @@ struct fmt::formatter : f } }; -#endif \ No newline at end of file +#endif diff --git a/backends/trtllm/csrc/ffi.hpp b/backends/trtllm/csrc/ffi.hpp index de2333afe..d0342d4bb 100644 --- a/backends/trtllm/csrc/ffi.hpp +++ b/backends/trtllm/csrc/ffi.hpp @@ -159,4 +159,4 @@ namespace huggingface::tgi::backends::trtllm { ); } } -#endif \ No newline at end of file +#endif diff --git a/backends/trtllm/csrc/hardware.hpp b/backends/trtllm/csrc/hardware.hpp index 8e5fa696d..abfb4afd5 100644 --- a/backends/trtllm/csrc/hardware.hpp +++ b/backends/trtllm/csrc/hardware.hpp @@ -78,4 +78,4 @@ namespace huggingface::tgi::hardware::cuda { [[nodiscard]] constexpr bool is_at_least_hopper() const { return is_at_least(HOPPER); } }; } -#endif \ No newline at end of file +#endif diff --git a/backends/trtllm/tests/test_backend.cpp b/backends/trtllm/tests/test_backend.cpp index ae097405b..14d92b754 100644 --- a/backends/trtllm/tests/test_backend.cpp +++ b/backends/trtllm/tests/test_backend.cpp @@ -149,4 +149,4 @@ TEST_CASE("sampling_params_t to tle::SamplingConfig", "[backend_t]") REQUIRE(config.getTemperature().has_value()); REQUIRE_THAT(*config.getTemperature(), Catch::Matchers::WithinAbs(params.temperature, 1e-6f)); -} \ No newline at end of file +} diff --git a/backends/trtllm/tests/test_hardware.cpp b/backends/trtllm/tests/test_hardware.cpp index 4cb7b5620..e14f1f357 100644 --- a/backends/trtllm/tests/test_hardware.cpp +++ b/backends/trtllm/tests/test_hardware.cpp @@ -79,4 +79,4 @@ TEST_CASE("is_at_least") { REQUIRE(HOPPER_CAPABILITIES.is_at_least(AMPERE)); 
REQUIRE(HOPPER_CAPABILITIES.is_at_least(ADA_LOVELACE)); REQUIRE(HOPPER_CAPABILITIES.is_at_least(HOPPER)); -} \ No newline at end of file +} diff --git a/docs/source/backends/trtllm.md b/docs/source/backends/trtllm.md index 8eb37180c..be6416b15 100644 --- a/docs/source/backends/trtllm.md +++ b/docs/source/backends/trtllm.md @@ -17,7 +17,7 @@ supported. You can use [Optimum-NVIDIA](https://github.com/huggingface/optimum-nvidia) to compile engines for the models you want to use. -```bash +```bash MODEL_NAME="meta-llama/Llama-3.1-8B-Instruct" # Install huggingface_cli @@ -32,7 +32,7 @@ mkdir -p /tmp/models/$MODEL_NAME # Create a directory to store the compiled engine mkdir -p /tmp/engines/$MODEL_NAME -# Download the model +# Download the model HF_HUB_ENABLE_HF_TRANSFER=1 huggingface-cli download --local-dir /tmp/models/$MODEL_NAME $MODEL_NAME # Compile the engine using Optimum-NVIDIA @@ -69,7 +69,7 @@ docker run \ -e MODEL=$MODEL_NAME \ -e PORT=3000 \ -e HF_TOKEN='hf_XXX' \ - -v /tmp/engines/$MODEL_NAME:/data \ + -v /tmp/engines/$MODEL_NAME:/data \ ghcr.io/huggingface/text-generation-inference:latest-trtllm \ --executor-worker executorWorker \ --model-id /data/$MODEL_NAME @@ -78,4 +78,4 @@ docker run \ ## Development To develop TRTLLM backend, you can use [dev containers](https://containers.dev/) located in -`.devcontainer` directory. \ No newline at end of file +`.devcontainer` directory. diff --git a/docs/source/multi_backend_support.md b/docs/source/multi_backend_support.md index 5899e4b77..c4df15bc2 100644 --- a/docs/source/multi_backend_support.md +++ b/docs/source/multi_backend_support.md @@ -1,13 +1,13 @@ # Multi-backend support TGI (Text Generation Inference) offers flexibility by supporting multiple backends for serving large language models (LLMs). -With multi-backend support, you can choose the backend that best suits your needs, -whether you prioritize performance, ease of use, or compatibility with specific hardware. API interaction with +With multi-backend support, you can choose the backend that best suits your needs, +whether you prioritize performance, ease of use, or compatibility with specific hardware. API interaction with TGI remains consistent across backends, allowing you to switch between them seamlessly. **Supported backends:** -* **TGI CUDA backend**: This high-performance backend is optimized for NVIDIA GPUs and serves as the default option +* **TGI CUDA backend**: This high-performance backend is optimized for NVIDIA GPUs and serves as the default option within TGI. Developed in-house, it boasts numerous optimizations and is used in production by various projects, including those by Hugging Face. -* **[TGI TRTLLM backend](./backends/trtllm)**: This backend leverages NVIDIA's TensorRT library to accelerate LLM inference. - It utilizes specialized optimizations and custom kernels for enhanced performance. - However, it requires a model-specific compilation step for each GPU architecture. \ No newline at end of file +* **[TGI TRTLLM backend](./backends/trtllm)**: This backend leverages NVIDIA's TensorRT library to accelerate LLM inference. + It utilizes specialized optimizations and custom kernels for enhanced performance. + However, it requires a model-specific compilation step for each GPU architecture. 
From 7eeefa3b5741c6a59690a80900a0d5ba30f86d31 Mon Sep 17 00:00:00 2001 From: janne-alatalo <48620812+janne-alatalo@users.noreply.github.com> Date: Tue, 17 Dec 2024 05:55:11 +0200 Subject: [PATCH 007/183] Qwen2-VL runtime error fix when prompted with multiple images (#2840) * Fix runtime error when Qwen2-VL was prompted with multiple images Fix runtime error when Qwen2-VL model is prompted with prompt with more than one image. The runtime error was: File "text-generation-inference/server/text_generation_server/models/custom_modeling/qwen2_vl.py", line 459, in get_position_ids text_pos_ids = torch.arange(text_length, device=d) RuntimeError: upper bound and larger bound inconsistent with step sign The error was caused by text_length variable going to negative value when multiple images caused multiple loops in the get_position_ids function's main loop. The error is a simple logic mistake where next_image_pos is initialized as relative offset from current_pos, but was used like it was absolute position from zero. * Fix runtime error when Qwen2-VL was prompted with multiple images Fix runtime error when Qwen2-VL model is prompted with prompt with more than one image. The runtime error was: File "text-generation-inference/server/text_generation_server/models/custom_modeling/qwen2_vl.py", line 534, in forward inputs_embeds[input_ids == self.image_token_id] = image_embeds RuntimeError: shape mismatch: value tensor of shape [512, 3584] cannot be broadcast to indexing result of shape [1024, 3584] (The error message shape numbers can be different depending on the input image resolutions) The error was caused by adding the wrong number of <|image_pad|> tokens to the tokenized input in the image_text_replacement function. The error is a simple logical mistake where the number of image pad tokens is checked from pixel_value_shape tensor's first dimension length. However, the pixel_value_shape contains patches from all of the images. Therefore the code added the total number of required image pad tokens for the whole input to each of the images locations. This resulted to extra image pad tokens to be present in the tokenized input. The fix was to check the number of required tokens from the image_grid_thw tensor. The tensor includes grid_t, grid_h, and grid_w values for each image. grid_t * grid_h * grid_w results to the total number of patches for the image [1]. The number of required image pad tokens is number_of_patches // 4. 
[1] https://github.com/huggingface/transformers/blob/31f9a289a6207be6cae746e009d8e0db523be203/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py#L311 --------- Co-authored-by: Janne Alatalo --- .../text_generation_server/models/custom_modeling/qwen2_vl.py | 4 ++-- server/text_generation_server/models/vlm_causal_lm.py | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/server/text_generation_server/models/custom_modeling/qwen2_vl.py b/server/text_generation_server/models/custom_modeling/qwen2_vl.py index ddb4e36d8..a8e1e8c15 100644 --- a/server/text_generation_server/models/custom_modeling/qwen2_vl.py +++ b/server/text_generation_server/models/custom_modeling/qwen2_vl.py @@ -450,7 +450,7 @@ class Qwen2VLForConditionalGeneration(nn.Module): width //= self.spatial_merge_size # calculate the length of the text and image tokens - text_length = next_image_pos - current_pos + text_length = next_image_pos start_idx = ( llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 ) @@ -480,7 +480,7 @@ class Qwen2VLForConditionalGeneration(nn.Module): ) llm_pos_ids_list.append(image_pos_ids) - current_pos = next_image_pos + time_steps * height * width + current_pos += next_image_pos + time_steps * height * width image_index += 1 if current_pos < batch_input_ids.size(1): diff --git a/server/text_generation_server/models/vlm_causal_lm.py b/server/text_generation_server/models/vlm_causal_lm.py index aa0fe1078..81b4369b9 100644 --- a/server/text_generation_server/models/vlm_causal_lm.py +++ b/server/text_generation_server/models/vlm_causal_lm.py @@ -68,7 +68,8 @@ def image_text_replacement(processor, image_input, config, image_id: int) -> str elif config.model_type == "paligemma": return "" * config.text_config.num_image_tokens elif config.model_type == "qwen2_vl": - num_pads = image_input.pixel_values.shape[0] // 4 + grid_t, grid_h, grid_w = image_input["image_grid_thw"][image_id] + num_pads = grid_t * grid_h * grid_w // 4 padding = "<|image_pad|>" * num_pads return f"<|vision_start|>{padding}<|vision_end|>" else: From 8f66d323d038dcac93d5f73f47cb44ab1da2ce17 Mon Sep 17 00:00:00 2001 From: Mohit Sharma Date: Wed, 18 Dec 2024 17:14:42 +0530 Subject: [PATCH 008/183] Update vllm kernels for ROCM (#2826) * (vllm) updated vllm rocm kernels * revert silu * update partition size * remove grouped_topk * (nit) remove log * update moe-kernels commit --- Dockerfile_amd | 13 +++++ server/Makefile-vllm | 2 +- .../layers/attention/kv_cache.py | 4 +- .../layers/attention/rocm.py | 57 ++++++++++++------- .../layers/layernorm.py | 37 +++++++----- .../text_generation_server/layers/linear.py | 8 +-- .../layers/moe/__init__.py | 5 +- .../layers/moe/fused_moe_rocm.py | 52 ----------------- .../layers/moe/unquantized.py | 4 +- .../text_generation_server/layers/rotary.py | 2 +- .../custom_modeling/flash_cohere_modeling.py | 2 +- .../custom_modeling/flash_dbrx_modeling.py | 4 +- .../flash_deepseek_v2_modeling.py | 6 +- .../custom_modeling/flash_gptj_modeling.py | 2 +- .../custom_modeling/flash_llama_modeling.py | 6 +- .../custom_modeling/flash_mistral_modeling.py | 6 +- .../custom_modeling/idefics_modeling.py | 2 +- 17 files changed, 95 insertions(+), 117 deletions(-) delete mode 100644 server/text_generation_server/layers/moe/fused_moe_rocm.py diff --git a/Dockerfile_amd b/Dockerfile_amd index 7638947a5..dc748f490 100644 --- a/Dockerfile_amd +++ b/Dockerfile_amd @@ -234,6 +234,7 @@ FROM kernel-builder AS vllm-builder WORKDIR /usr/src COPY server/Makefile-vllm Makefile +RUN pip install setuptools_scm 
# Build specific version of vllm RUN make build-vllm-rocm @@ -267,6 +268,15 @@ COPY server/exllamav2_kernels/ . RUN python setup.py build +FROM kernel-builder AS moe-kernels +WORKDIR /usr/src +ENV MOE_KERNELS_BRANCH=a67b35841774b2056a73806c36661134b5054edd +ENV VLLM_TARGET_DEVICE=rocm +RUN git clone https://github.com/danieldk/moe-kernels.git && \ + cd moe-kernels && \ + git checkout ${MOE_KERNELS_BRANCH} && \ + python setup.py install + FROM install_deps AS base-copy # Text Generation Inference base env @@ -289,6 +299,9 @@ COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311 # Copy build artifacts from exllamav2 kernels builder COPY --from=exllamav2-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages +# Copy build artifacts from moe kernels +COPY --from=moe-kernels /usr/src/moe-kernels/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages + # Install server COPY proto proto COPY server server diff --git a/server/Makefile-vllm b/server/Makefile-vllm index 45a7980d4..90da96d26 100644 --- a/server/Makefile-vllm +++ b/server/Makefile-vllm @@ -1,4 +1,4 @@ -commit_rocm := 4e0929e6e4fa0a3d09d358715c288020ea9dc247 +commit_rocm := de990cd12537f78f74e40b5c8ee1a62d63d734dd build-vllm-rocm: if [ ! -d 'vllm' ]; then \ diff --git a/server/text_generation_server/layers/attention/kv_cache.py b/server/text_generation_server/layers/attention/kv_cache.py index cad1d98a0..93d747324 100644 --- a/server/text_generation_server/layers/attention/kv_cache.py +++ b/server/text_generation_server/layers/attention/kv_cache.py @@ -215,7 +215,9 @@ def paged_reshape_and_cache( raise ImportError( f"Could not import vllm paged attention. Make sure your installation is correct. Complete error: {e}" ) - ops.reshape_and_cache(key, value, key_cache, value_cache, slots, "auto", 1.0) + ops.reshape_and_cache( + key, value, key_cache, value_cache, slots, "auto", 1.0, 1.0 + ) elif SYSTEM == "ipex": import intel_extension_for_pytorch as ipex diff --git a/server/text_generation_server/layers/attention/rocm.py b/server/text_generation_server/layers/attention/rocm.py index ea11c2c26..0cfac25bd 100644 --- a/server/text_generation_server/layers/attention/rocm.py +++ b/server/text_generation_server/layers/attention/rocm.py @@ -6,26 +6,42 @@ from text_generation_server.utils.import_utils import SYSTEM from text_generation_server.layers.attention import Seqlen from text_generation_server.utils.log import log_master from loguru import logger +import vllm._custom_ops as ops major, minor = torch.cuda.get_device_capability() is_sm75 = major == 7 and minor == 5 -_PARTITION_SIZE_V1V2 = 512 +_PARTITION_SIZE_V1V2 = 1024 _PARTITION_SIZE_CUSTOM = 256 +_GPU_ARCH = torch.cuda.get_device_properties("cuda").gcnArchName +_ON_MI250_MI300 = any( + arch in _GPU_ARCH for arch in ["gfx90a", "gfx940", "gfx941", "gfx942"] +) + use_triton = os.getenv("ROCM_USE_FLASH_ATTN_V2_TRITON", "").lower() in {"true", "1"} ENGINE = "triton" if use_triton else "ck" use_rocm_custom_paged_attn = os.getenv("ROCM_USE_CUSTOM_PAGED_ATTN", "1") != "0" -try: - if use_rocm_custom_paged_attn: - from vllm._custom_C import paged_attention_custom -except ImportError as e: - log_master( - logger.info, - f"Custom Paged Attention not available. 
Complete error: {e}", + + +def _use_rocm_custom_paged_attention( + qtype: torch.dtype, + head_size: int, + block_size: int, + gqa_ratio: int, + max_seq_len: int, +) -> bool: + # rocm custom page attention not support on navi (gfx1*) + return ( + use_rocm_custom_paged_attn + and _ON_MI250_MI300 + and (qtype == torch.half or qtype == torch.bfloat16) + and (head_size == 64 or head_size == 128) + and (block_size == 16 or block_size == 32) + and (gqa_ratio >= 1 and gqa_ratio <= 16) + and max_seq_len <= 131072 ) - use_rocm_custom_paged_attn = False def paged_attention( @@ -66,13 +82,8 @@ def paged_attention( num_kv_heads = kv_cache.key.shape[1] gqa_ratio = num_heads // num_kv_heads - use_custom = ( - use_rocm_custom_paged_attn - and (query.dtype == torch.half or query.dtype == torch.bfloat16) - and (head_size == 128 or head_size == 64) - and (block_size == 16 or block_size == 32) - and (gqa_ratio >= 1 and gqa_ratio <= 16) - and max_s <= 32768 + use_custom = _use_rocm_custom_paged_attention( + query.dtype, head_size, block_size, gqa_ratio, max_s ) if not use_custom: @@ -90,8 +101,6 @@ def paged_attention( # V1 to avoid the overhead of reduction. Also, if the number of # sequences or heads is large, we use V1 since there is enough work # to parallelize. - import vllm._custom_ops as ops - use_v1 = ( max_s <= 8192 and (max_num_partitions == 1 or num_seqs * num_heads > 512) @@ -103,7 +112,7 @@ def paged_attention( query, kv_cache.key, kv_cache.value, - kv_head_mapping, + num_kv_heads, softmax_scale, block_tables, input_lengths, @@ -112,6 +121,7 @@ def paged_attention( None, "auto", 1.0, + 1.0, ) else: # Run PagedAttention V2. @@ -137,7 +147,7 @@ def paged_attention( query, kv_cache.key, kv_cache.value, - kv_head_mapping, + num_kv_heads, softmax_scale, block_tables, input_lengths, @@ -146,9 +156,10 @@ def paged_attention( None, "auto", 1.0, + 1.0, ) else: - paged_attention_custom( + ops.paged_attention_rocm( out, exp_sums, max_logits, @@ -164,6 +175,10 @@ def paged_attention( max_s, None, "auto", + 1.0, + 1.0, + None, + _PARTITION_SIZE, ) return out diff --git a/server/text_generation_server/layers/layernorm.py b/server/text_generation_server/layers/layernorm.py index ce5289f93..8c7a2eb04 100644 --- a/server/text_generation_server/layers/layernorm.py +++ b/server/text_generation_server/layers/layernorm.py @@ -72,7 +72,7 @@ if SYSTEM == "cuda": return normed_hidden_states, residual elif SYSTEM == "rocm": - from vllm._C import ops + import vllm._custom_ops as ops class FastLayerNorm(nn.LayerNorm): def forward(self, hidden_states, residual=None): @@ -121,6 +121,27 @@ class FastRMSNorm(nn.Module): residual is not None, ) return out, residual if residual is not None else hidden_states + elif SYSTEM == "rocm": + # We use VLLM RMSNorm kernel that can be compiled for RoCm, instead of Flash Attention ones that can not. + if residual is not None: + ops.fused_add_rms_norm( + hidden_states, + residual, + self.weight.data, + self.variance_epsilon, + ) + return hidden_states, residual + + residual = hidden_states + + out = torch.empty_like(hidden_states) + ops.rms_norm( + out, + hidden_states, + self.weight.data, + self.variance_epsilon, + ) + return out, residual elif hidden_states.shape[-1] > 8192: if residual is not None: hidden_states += residual @@ -164,20 +185,6 @@ class FastRMSNorm(nn.Module): res = hidden_states return normed_hidden_states, res - elif SYSTEM == "rocm": - # We use VLLM RMSNorm kernel that can be compiled for RoCm, instead of Flash Attention ones that can not. 
- if residual is not None: - hidden_states += residual - residual = hidden_states - - out = torch.empty_like(hidden_states) - ops.rms_norm( - out, - hidden_states, - self.weight.data, - self.variance_epsilon, - ) - return out, residual else: raise ValueError( "Your system seem to be not supported. Please check your install or open an issue at https://github.com/huggingface/text-generation-inference/issues with a clear reproduction." diff --git a/server/text_generation_server/layers/linear.py b/server/text_generation_server/layers/linear.py index 08306d579..166c28744 100644 --- a/server/text_generation_server/layers/linear.py +++ b/server/text_generation_server/layers/linear.py @@ -11,10 +11,10 @@ if SYSTEM == "rocm": if ROCM_USE_SKINNY_GEMM: try: - from vllm import _custom_C + import vllm._custom_ops as ops except Exception as e: raise ImportError( - f"Could not load `vllm._custom_C` for ROCm skinny gemm. Full error: {e}" + f"Could not load `vllm._custom_ops` for ROCm skinny gemm. Full error: {e}" ) @@ -95,12 +95,12 @@ class FastLinearROCm(torch.nn.Module): out = torch.empty( inp_shape[0], weight.shape[0], dtype=inp.dtype, device=weight.device ) - _custom_C.wvSpltK(weight, inp, out, n, self.cu_count) + ops.wvSpltK(weight, inp, out, n, self.cu_count) elif m % 4 == 0 and n == 1 and k <= 8192: out = torch.empty( inp_shape[0], weight.shape[0], dtype=inp.dtype, device=weight.device ) - _custom_C.LLMM1(weight, inp, out, 4) + ops.LLMM1(weight, inp, out, 4) else: out = F.linear(inp, weight) diff --git a/server/text_generation_server/layers/moe/__init__.py b/server/text_generation_server/layers/moe/__init__.py index a5ae7ff4f..be40d78a8 100644 --- a/server/text_generation_server/layers/moe/__init__.py +++ b/server/text_generation_server/layers/moe/__init__.py @@ -24,10 +24,7 @@ from text_generation_server.utils.weights import ( UnquantizedWeight, ) -if SYSTEM == "rocm": - from .fused_moe_rocm import grouped_topk - from vllm.model_executor.layers.fused_moe import fused_topk -elif SYSTEM == "ipex": +if SYSTEM == "ipex": from intel_extension_for_pytorch.llm.modules import GatedMLPMOE else: from moe_kernels.fused_moe import fused_topk, grouped_topk diff --git a/server/text_generation_server/layers/moe/fused_moe_rocm.py b/server/text_generation_server/layers/moe/fused_moe_rocm.py deleted file mode 100644 index 68accb990..000000000 --- a/server/text_generation_server/layers/moe/fused_moe_rocm.py +++ /dev/null @@ -1,52 +0,0 @@ -# coding=utf-8 -# Copyright 2023, 2024 DeepSeek-AI and The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from typing import Tuple - -import torch -import torch.distributed - - -# TODO: Remove the functions once moe_kernel are built for ROCM -def grouped_topk( - hidden_states: torch.Tensor, - gating_output: torch.Tensor, - topk: int, - renormalize: bool, - num_expert_group: int = 0, - topk_group: int = 0, -) -> Tuple[torch.Tensor, torch.Tensor]: - scores = torch.softmax(gating_output, dim=-1) - num_token = scores.shape[0] - group_scores = ( - scores.view(num_token, num_expert_group, -1).max(dim=-1).values - ) # [n, n_group] - group_idx = torch.topk(group_scores, k=topk_group, dim=-1, sorted=False)[ - 1 - ] # [n, top_k_group] - group_mask = torch.zeros_like(group_scores) # [n, n_group] - group_mask.scatter_(1, group_idx, 1) # [n, n_group] - score_mask = ( - group_mask.unsqueeze(-1) - .expand(num_token, num_expert_group, scores.shape[-1] // num_expert_group) - .reshape(num_token, -1) - ) # [n, e] - tmp_scores = scores.masked_fill(~score_mask.bool(), 0.0) # [n, e] - topk_weights, topk_ids = torch.topk(tmp_scores, k=topk, dim=-1, sorted=False) - - if renormalize: - topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True) - - return topk_weights, topk_ids diff --git a/server/text_generation_server/layers/moe/unquantized.py b/server/text_generation_server/layers/moe/unquantized.py index 75af04090..3c9bcabac 100644 --- a/server/text_generation_server/layers/moe/unquantized.py +++ b/server/text_generation_server/layers/moe/unquantized.py @@ -6,9 +6,7 @@ import torch.nn as nn from text_generation_server.utils.import_utils import SYSTEM from text_generation_server.utils.weights import UnquantizedWeight, Weights -if SYSTEM == "rocm": - from vllm.model_executor.layers.fused_moe import fused_moe -elif SYSTEM == "ipex": +if SYSTEM == "ipex": from intel_extension_for_pytorch.llm.modules import GatedMLPMOE else: from moe_kernels.fused_moe import fused_moe diff --git a/server/text_generation_server/layers/rotary.py b/server/text_generation_server/layers/rotary.py index 123bbadbb..e346d0f89 100644 --- a/server/text_generation_server/layers/rotary.py +++ b/server/text_generation_server/layers/rotary.py @@ -7,7 +7,7 @@ from text_generation_server.utils.import_utils import SYSTEM if SYSTEM == "cuda": import rotary_emb elif SYSTEM == "rocm": - from vllm._C import ops + import vllm._custom_ops as ops elif SYSTEM == "ipex": import intel_extension_for_pytorch as ipex diff --git a/server/text_generation_server/models/custom_modeling/flash_cohere_modeling.py b/server/text_generation_server/models/custom_modeling/flash_cohere_modeling.py index 68719106f..ece15e942 100644 --- a/server/text_generation_server/models/custom_modeling/flash_cohere_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_cohere_modeling.py @@ -75,7 +75,7 @@ class CohereRotary(PositionRotaryEmbedding): rotary_emb.apply_rotary(k1, k2, cos, sin, k1, k2, False) elif SYSTEM == "rocm": - from vllm._C import ops + import vllm._custom_ops as ops # NOTE: On RoCm systems, we use a ROPE implementatation adapted from VLLM which launches a single kernel for both query/key, contrary to flash-attn implementation used on NVIDIA systems. 
# Compiling flash-attn rotary on RoCm, it appears hipcc is unable to unroll loops, resulting in an even slower inference compared to eager: https://github.com/pytorch/pytorch/issues/113773 diff --git a/server/text_generation_server/models/custom_modeling/flash_dbrx_modeling.py b/server/text_generation_server/models/custom_modeling/flash_dbrx_modeling.py index 2d1aa96c2..aa0327825 100644 --- a/server/text_generation_server/models/custom_modeling/flash_dbrx_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_dbrx_modeling.py @@ -23,9 +23,7 @@ from typing import Optional, List, Tuple, Any from text_generation_server.layers.attention.kv_cache import get_kv_scales from text_generation_server.utils.import_utils import SYSTEM -if SYSTEM == "rocm": - from vllm.model_executor.layers.fused_moe import fused_moe -elif SYSTEM == "ipex": +if SYSTEM == "ipex": from intel_extension_for_pytorch.llm.modules import GatedMLPMOE else: from moe_kernels.fused_moe import fused_moe diff --git a/server/text_generation_server/models/custom_modeling/flash_deepseek_v2_modeling.py b/server/text_generation_server/models/custom_modeling/flash_deepseek_v2_modeling.py index 906a83a41..2fb733cd3 100644 --- a/server/text_generation_server/models/custom_modeling/flash_deepseek_v2_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_deepseek_v2_modeling.py @@ -43,9 +43,9 @@ from text_generation_server.utils.weights import Weights if SYSTEM == "rocm": try: - from vllm import _custom_C + import vllm._custom_ops as ops except Exception as e: - raise ImportError(f"Could not load `vllm._custom_C`. Full error: {e}") + raise ImportError(f"Could not load `vllm._custom_ops`. Full error: {e}") class DeepseekV2Config(PretrainedConfig): @@ -408,7 +408,7 @@ class DeepseekV2MLP(nn.Module): dtype=hidden_states.dtype, device="cuda", ) - _custom_C.LLMM_Silu(self.gate_up_proj.linear.weight, hidden_states, out, 8) + ops.LLMM_Silu(self.gate_up_proj.linear.weight, hidden_states, out, 8) return self.down_proj(out, reduce=reduce) else: gate_up_states = self.gate_up_proj(hidden_states) diff --git a/server/text_generation_server/models/custom_modeling/flash_gptj_modeling.py b/server/text_generation_server/models/custom_modeling/flash_gptj_modeling.py index 692f8ca31..45b90679d 100644 --- a/server/text_generation_server/models/custom_modeling/flash_gptj_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_gptj_modeling.py @@ -91,7 +91,7 @@ class GPTJRotary(PositionRotaryEmbedding): rotary_emb.apply_rotary(k1, k2, cos, sin, k1, k2, False) elif SYSTEM == "rocm": - from vllm._C import ops + import vllm._custom_ops as ops # NOTE: On RoCm systems, we use a ROPE implementatation adapted from VLLM which launches a single kernel for both query/key, contrary to flash-attn implementation used on NVIDIA systems. 
# Compiling flash-attn rotary on RoCm, it appears hipcc is unable to unroll loops, resulting in an even slower inference compared to eager: https://github.com/pytorch/pytorch/issues/113773 diff --git a/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py b/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py index 2c007d156..10309006a 100644 --- a/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py @@ -64,9 +64,9 @@ if SYSTEM != "ipex": if SYSTEM == "rocm": try: - from vllm import _custom_C + import vllm._custom_ops as ops except Exception as e: - raise ImportError(f"Could not load `vllm._custom_C`. Full error: {e}") + raise ImportError(f"Could not load `vllm._custom_ops`. Full error: {e}") def load_attention(config, prefix: str, weights, layer_id): @@ -392,7 +392,7 @@ class LlamaMLP(nn.Module): dtype=hidden_states.dtype, device="cuda", ) - _custom_C.LLMM_Silu( + ops.LLMM_Silu( self.gate_up_proj.base_layer.linear.weight, hidden_states, out, 8 ) return self.down_proj(out, adapter_data) diff --git a/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py b/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py index c66c732f2..0fa172d03 100644 --- a/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py @@ -49,9 +49,9 @@ from text_generation_server.layers.layernorm import ( if SYSTEM == "rocm": try: - from vllm import _custom_C + import vllm._custom_ops as ops except Exception as e: - raise ImportError(f"Could not load `vllm._custom_C`. Full error: {e}") + raise ImportError(f"Could not load `vllm._custom_ops`. 
Full error: {e}") class MistralConfig(PretrainedConfig): @@ -318,7 +318,7 @@ class MistralMLP(nn.Module): dtype=hidden_states.dtype, device="cuda", ) - _custom_C.LLMM_Silu( + ops.LLMM_Silu( self.gate_up_proj.base_layer.linear.weight, hidden_states, out, 8 ) return self.down_proj(out, adapter_data) diff --git a/server/text_generation_server/models/custom_modeling/idefics_modeling.py b/server/text_generation_server/models/custom_modeling/idefics_modeling.py index fc6becc4b..9fc9bca63 100644 --- a/server/text_generation_server/models/custom_modeling/idefics_modeling.py +++ b/server/text_generation_server/models/custom_modeling/idefics_modeling.py @@ -52,7 +52,7 @@ from loguru import logger if SYSTEM == "cuda": import dropout_layer_norm elif SYSTEM == "rocm": - from vllm._C import ops + import vllm._custom_ops as ops else: dropout_layer_norm = None From ab5f61692096523c4766ca5d15a025e142e6b420 Mon Sep 17 00:00:00 2001 From: "Wang, Yi" Date: Thu, 19 Dec 2024 19:18:58 +0800 Subject: [PATCH 009/183] change xpu lib download link (#2852) Signed-off-by: Wang,Yi A --- Dockerfile_intel | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Dockerfile_intel b/Dockerfile_intel index e024f31a5..720d7bee0 100644 --- a/Dockerfile_intel +++ b/Dockerfile_intel @@ -97,11 +97,11 @@ ENV HF_HOME=/data \ WORKDIR /usr/src -RUN pip install https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/torch-2.5.0a0%2Bgite84e33f-cp311-cp311-linux_x86_64.whl --no-cache-dir -RUN pip install https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/torchaudio-2.5.0a0%2B56bc006-cp311-cp311-linux_x86_64.whl --no-cache-dir -RUN pip install https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/torchvision-0.20.0a0%2B8e8a208-cp311-cp311-linux_x86_64.whl --no-cache-dir -RUN pip install https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.5.10%2Bgit9d489a8-cp311-cp311-linux_x86_64.whl --no-cache-dir -RUN pip install https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/oneccl_bind_pt-2.5.0%2Bxpu-cp311-cp311-linux_x86_64.whl --no-cache-dir +RUN pip install https://intel-optimized-pytorch.s3.cn-north-1.amazonaws.com.cn/ipex_dev/xpu/torch-2.5.0a0%2Bgite84e33f-cp311-cp311-linux_x86_64.whl --no-cache-dir +RUN pip install https://intel-optimized-pytorch.s3.cn-north-1.amazonaws.com.cn/ipex_dev/xpu/torchaudio-2.5.0a0%2B56bc006-cp311-cp311-linux_x86_64.whl --no-cache-dir +RUN pip install https://intel-optimized-pytorch.s3.cn-north-1.amazonaws.com.cn/ipex_dev/xpu/torchvision-0.20.0a0%2B8e8a208-cp311-cp311-linux_x86_64.whl --no-cache-dir +RUN pip install https://intel-optimized-pytorch.s3.cn-north-1.amazonaws.com.cn/ipex_dev/xpu/intel_extension_for_pytorch-2.5.10%2Bgit9d489a8-cp311-cp311-linux_x86_64.whl --no-cache-dir +RUN pip install https://intel-optimized-pytorch.s3.cn-north-1.amazonaws.com.cn/ipex_dev/xpu/oneccl_bind_pt-2.5.0%2Bxpu-cp311-cp311-linux_x86_64.whl --no-cache-dir RUN pip install triton-xpu==3.0.0b2 --no-cache-dir From 23bc38b10d06f8cc271d086c26270976faf67cc2 Mon Sep 17 00:00:00 2001 From: drbh Date: Thu, 19 Dec 2024 16:55:17 -0500 Subject: [PATCH 010/183] fix: include add_special_tokens in kserve request (#2859) merging as this patch is already used, and fully limit to the kserve feature --- router/src/kserve.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/router/src/kserve.rs b/router/src/kserve.rs index c53fa4819..ea85eb8c9 100644 --- 
a/router/src/kserve.rs +++ b/router/src/kserve.rs @@ -205,6 +205,7 @@ pub async fn kserve_model_infer( let generate_request = GenerateRequest { inputs: str_input.to_string(), parameters: payload.parameters.clone(), + add_special_tokens: true, }; let infer = infer.clone(); let compute_type = compute_type.clone(); @@ -212,7 +213,7 @@ pub async fn kserve_model_infer( async move { generate_internal(infer, compute_type, Json(generate_request), span) .await - .map(|(_, Json(generation))| { + .map(|(_, _, Json(generation))| { let generation_as_bytes = generation.generated_text.as_bytes().to_vec(); OutputChunk { name: output.name.clone(), From d37a43e58189e5556b9ed73e067db4a8d03191ef Mon Sep 17 00:00:00 2001 From: Ruida Zeng <31152346+ruidazeng@users.noreply.github.com> Date: Thu, 9 Jan 2025 03:09:23 -0600 Subject: [PATCH 011/183] chore: fixed some typos and attribute issues in README (#2891) * chore: fixed html repeated attribute in README * chore: fix minor grammar/capitalization * chore: fixed spelling mistakes in README --- README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 6d3a9b124..31966ddbd 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@
- Making TGI deployment optimal + Making TGI deployment optimal # Text Generation Inference @@ -141,8 +141,8 @@ You have the option to utilize the `HF_TOKEN` environment variable for configuri For example, if you want to serve the gated Llama V2 model variants: 1. Go to https://huggingface.co/settings/tokens -2. Copy your cli READ token -3. Export `HF_TOKEN=` +2. Copy your CLI READ token +3. Export `HF_TOKEN=` or with Docker: @@ -157,7 +157,7 @@ docker run --gpus all --shm-size 1g -e HF_TOKEN=$token -p 8080:80 -v $volume:/da ### A note on Shared Memory (shm) [`NCCL`](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/index.html) is a communication framework used by -`PyTorch` to do distributed training/inference. `text-generation-inference` make +`PyTorch` to do distributed training/inference. `text-generation-inference` makes use of `NCCL` to enable Tensor Parallelism to dramatically speed up inference for large language models. In order to share data between the different devices of a `NCCL` group, `NCCL` might fall back to using the host memory if @@ -196,7 +196,7 @@ Detailed blogpost by Adyen on TGI inner workings: [LLM inference at scale with T You can also opt to install `text-generation-inference` locally. -First clone the repository and change directoy into it: +First clone the repository and change directory into it: ```shell git clone https://github.com/huggingface/text-generation-inference @@ -213,7 +213,7 @@ curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh conda create -n text-generation-inference python=3.11 conda activate text-generation-inference -#using pyton venv +#using python venv python3 -m venv .venv source .venv/bin/activate ``` From afb6c728d8df41d3552395b64d5743133d56db93 Mon Sep 17 00:00:00 2001 From: "Wang, Yi" Date: Thu, 9 Jan 2025 17:11:03 +0800 Subject: [PATCH 012/183] update ipex xpu to fix issue in ARC770 (#2884) * update ipex xpu to fix issue in ARC770 Signed-off-by: Wang, Yi A * add ats support Signed-off-by: Wang, Yi A --------- Signed-off-by: Wang, Yi A --- Dockerfile_intel | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Dockerfile_intel b/Dockerfile_intel index 720d7bee0..3b5e4a131 100644 --- a/Dockerfile_intel +++ b/Dockerfile_intel @@ -100,7 +100,6 @@ WORKDIR /usr/src RUN pip install https://intel-optimized-pytorch.s3.cn-north-1.amazonaws.com.cn/ipex_dev/xpu/torch-2.5.0a0%2Bgite84e33f-cp311-cp311-linux_x86_64.whl --no-cache-dir RUN pip install https://intel-optimized-pytorch.s3.cn-north-1.amazonaws.com.cn/ipex_dev/xpu/torchaudio-2.5.0a0%2B56bc006-cp311-cp311-linux_x86_64.whl --no-cache-dir RUN pip install https://intel-optimized-pytorch.s3.cn-north-1.amazonaws.com.cn/ipex_dev/xpu/torchvision-0.20.0a0%2B8e8a208-cp311-cp311-linux_x86_64.whl --no-cache-dir -RUN pip install https://intel-optimized-pytorch.s3.cn-north-1.amazonaws.com.cn/ipex_dev/xpu/intel_extension_for_pytorch-2.5.10%2Bgit9d489a8-cp311-cp311-linux_x86_64.whl --no-cache-dir RUN pip install https://intel-optimized-pytorch.s3.cn-north-1.amazonaws.com.cn/ipex_dev/xpu/oneccl_bind_pt-2.5.0%2Bxpu-cp311-cp311-linux_x86_64.whl --no-cache-dir RUN pip install triton-xpu==3.0.0b2 --no-cache-dir @@ -119,6 +118,9 @@ ENV CCL_ZE_IPC_EXCHANGE=sockets #ENV TORCH_LLM_ALLREDUCE=1 #ENV CCL_TOPO_FABRIC_VERTEX_CONNECTION_CHECK=0 +RUN git clone https://github.com/intel/intel-extension-for-pytorch && cd intel-extension-for-pytorch && git checkout 033af6f63745ac748cccdadee5c6140c7971edf6 +RUN cd intel-extension-for-pytorch && git submodule update --init --recursive && 
USE_AOT_DEVLIST='pvc,ats-m150' BUILD_SEPARATE_OPS=OFF BUILD_WITH_CPU=OFF USE_XETLA=ON python setup.py install && rm -rf /usr/src/intel-extension-for-pytorch + # Install benchmarker COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark # Install router From a9c7d2e3b6b73c49955e7d18ad09ff5b99341b9b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Thu, 9 Jan 2025 16:25:00 +0100 Subject: [PATCH 013/183] Basic flashinfer 0.2 support (#2862) * Basic flashinfer 0.2 support This change does not use any of the new features yet, but makes some small compatibility changes. * Update to flashinfer 0.2.0.post1 * flashinfer: remove `contiguous` calls * Fix flashinfer install * flashinfer: fixup kv cache dtype * Fix some annoying perturbations * More output changes --- flake.lock | 7 +- flake.nix | 2 +- ...ompressed_tensors_w8a8_int_all_params.json | 6 +- ...rs_w8a8_int_dynamic_weight_all_params.json | 36 +++--- ..._tensors_w8a8_int_dynamic_weight_load.json | 80 ++++++------ ...t_compressed_tensors_wna16_all_params.json | 2 +- .../test_flash_gemma_gptq_all_params.json | 70 +++++------ .../test_flash_starcoder2_default_params.json | 114 +++++++++--------- ...pressed_tensors_w8a8_int_dynamic_weight.py | 2 +- server/Makefile-flashinfer | 5 +- .../layers/attention/cuda.py | 6 +- .../layers/attention/flashinfer.py | 49 ++------ .../models/flash_causal_lm.py | 5 +- 13 files changed, 177 insertions(+), 207 deletions(-) diff --git a/flake.lock b/flake.lock index ec87d5692..44802e18f 100644 --- a/flake.lock +++ b/flake.lock @@ -978,15 +978,16 @@ "nixpkgs": "nixpkgs_6" }, "locked": { - "lastModified": 1732218602, - "narHash": "sha256-BElslL34KjOJCFMPkNtilOz6S/7iY7Vd72FNbRRWKDY=", + "lastModified": 1736179589, + "narHash": "sha256-/zZCSieBJncVXqOFbvbSov76g2eWAxVxEJNNA6SmQKc=", "owner": "huggingface", "repo": "text-generation-inference-nix", - "rev": "f79638ac4e420e661321261744e745a3a747e182", + "rev": "fc7ff53b2cd5c984ad1434f20c271e3b7600d1c4", "type": "github" }, "original": { "owner": "huggingface", + "ref": "flashinfer-v0.2", "repo": "text-generation-inference-nix", "type": "github" } diff --git a/flake.nix b/flake.nix index 83cedfa62..a302db3ea 100644 --- a/flake.nix +++ b/flake.nix @@ -5,7 +5,7 @@ inputs.nixpkgs.follows = "tgi-nix/nixpkgs"; }; nix-filter.url = "github:numtide/nix-filter"; - tgi-nix.url = "github:huggingface/text-generation-inference-nix"; + tgi-nix.url = "github:huggingface/text-generation-inference-nix/flashinfer-v0.2"; nixpkgs.follows = "tgi-nix/nixpkgs"; flake-utils.url = "github:numtide/flake-utils"; rust-overlay = { diff --git a/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int/test_compressed_tensors_w8a8_int_all_params.json b/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int/test_compressed_tensors_w8a8_int_all_params.json index 7d35e8f96..771708ebf 100644 --- a/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int/test_compressed_tensors_w8a8_int_all_params.json +++ b/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int/test_compressed_tensors_w8a8_int_all_params.json @@ -32,7 +32,7 @@ }, { "id": 1101, - "logprob": -1.0947266, + "logprob": -1.0136719, "special": false, "text": " also" }, @@ -56,13 +56,13 @@ }, { "id": 4009, - "logprob": -0.15563965, + "logprob": -0.21923828, "special": false, "text": " network" }, { "id": 477, - "logprob": -1.4003906, + "logprob": -1.4824219, "special": false, "text": " or" } diff --git 
a/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight_all_params.json b/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight_all_params.json index 0db48f3e0..6b3f50929 100644 --- a/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight_all_params.json +++ b/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight_all_params.json @@ -8,7 +8,7 @@ "tokens": [ { "id": 1939, - "logprob": -2.2675781, + "logprob": -2.2460938, "special": false, "text": "?\n\n" }, @@ -20,13 +20,13 @@ }, { "id": 20909, - "logprob": -0.37695312, + "logprob": -0.48608398, "special": false, "text": " Learning" }, { "id": 4102, - "logprob": -1.9316406, + "logprob": -2.265625, "special": false, "text": " " }, @@ -38,25 +38,13 @@ }, { "id": 458, - "logprob": -0.80859375, + "logprob": -0.6328125, "special": false, "text": " an" }, - { - "id": 3082, - "logprob": -1.4541016, - "special": false, - "text": " area" - }, - { - "id": 315, - "logprob": 0.0, - "special": false, - "text": " of" - }, { "id": 20443, - "logprob": -0.5136719, + "logprob": -0.1796875, "special": false, "text": " artificial" }, @@ -65,9 +53,21 @@ "logprob": 0.0, "special": false, "text": " intelligence" + }, + { + "id": 320, + "logprob": -0.37695312, + "special": false, + "text": " (" + }, + { + "id": 15469, + "logprob": 0.0, + "special": false, + "text": "AI" } ], "top_tokens": null }, - "generated_text": "What is deep learning?\n\nDeep Learning is an area of artificial intelligence" + "generated_text": "What is deep learning?\n\nDeep Learning is an artificial intelligence (AI" } diff --git a/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight_load.json b/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight_load.json index abcaf8763..1fa4e33aa 100644 --- a/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight_load.json +++ b/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight_load.json @@ -9,61 +9,61 @@ "tokens": [ { "id": 18183, - "logprob": -1.6669922, + "logprob": -1.4912109, "special": false, "text": " Deep" }, { "id": 6832, - "logprob": -0.08959961, + "logprob": -0.075683594, "special": false, "text": " learning" }, { "id": 374, - "logprob": -0.14685059, + "logprob": -0.12408447, "special": false, "text": " is" }, { "id": 264, - "logprob": -0.125, + "logprob": -0.12768555, "special": false, "text": " a" }, { "id": 25993, - "logprob": -0.81640625, + "logprob": -0.82128906, "special": false, "text": " subset" }, { "id": 315, - "logprob": -0.0013418198, + "logprob": -0.0012636185, "special": false, "text": " of" }, { "id": 5662, - "logprob": -0.16259766, + "logprob": -0.12878418, "special": false, "text": " machine" }, { "id": 6832, - "logprob": -0.0016393661, + "logprob": -0.0015888214, "special": false, "text": " learning" }, { "id": 429, - "logprob": -0.4477539, + "logprob": -0.49194336, "special": false, "text": " that" }, { "id": 5711, - "logprob": -1.2802734, + "logprob": -1.2626953, "special": false, 
"text": " uses" } @@ -82,61 +82,61 @@ "tokens": [ { "id": 18183, - "logprob": -1.6669922, + "logprob": -1.4912109, "special": false, "text": " Deep" }, { "id": 6832, - "logprob": -0.08959961, + "logprob": -0.075683594, "special": false, "text": " learning" }, { "id": 374, - "logprob": -0.14685059, + "logprob": -0.12408447, "special": false, "text": " is" }, { "id": 264, - "logprob": -0.125, + "logprob": -0.12768555, "special": false, "text": " a" }, { "id": 25993, - "logprob": -0.81640625, + "logprob": -0.82128906, "special": false, "text": " subset" }, { "id": 315, - "logprob": -0.0013418198, + "logprob": -0.0012636185, "special": false, "text": " of" }, { "id": 5662, - "logprob": -0.16259766, + "logprob": -0.12878418, "special": false, "text": " machine" }, { "id": 6832, - "logprob": -0.0016393661, + "logprob": -0.0015888214, "special": false, "text": " learning" }, { "id": 429, - "logprob": -0.4477539, + "logprob": -0.49194336, "special": false, "text": " that" }, { "id": 5711, - "logprob": -1.2802734, + "logprob": -1.2626953, "special": false, "text": " uses" } @@ -155,61 +155,61 @@ "tokens": [ { "id": 18183, - "logprob": -1.6669922, + "logprob": -1.4912109, "special": false, "text": " Deep" }, { "id": 6832, - "logprob": -0.08959961, + "logprob": -0.075683594, "special": false, "text": " learning" }, { "id": 374, - "logprob": -0.14685059, + "logprob": -0.12408447, "special": false, "text": " is" }, { "id": 264, - "logprob": -0.125, + "logprob": -0.12768555, "special": false, "text": " a" }, { "id": 25993, - "logprob": -0.81640625, + "logprob": -0.82128906, "special": false, "text": " subset" }, { "id": 315, - "logprob": -0.0013418198, + "logprob": -0.0012636185, "special": false, "text": " of" }, { "id": 5662, - "logprob": -0.16259766, + "logprob": -0.12878418, "special": false, "text": " machine" }, { "id": 6832, - "logprob": -0.0016393661, + "logprob": -0.0015888214, "special": false, "text": " learning" }, { "id": 429, - "logprob": -0.4477539, + "logprob": -0.49194336, "special": false, "text": " that" }, { "id": 5711, - "logprob": -1.2802734, + "logprob": -1.2626953, "special": false, "text": " uses" } @@ -228,61 +228,61 @@ "tokens": [ { "id": 18183, - "logprob": -1.6669922, + "logprob": -1.4912109, "special": false, "text": " Deep" }, { "id": 6832, - "logprob": -0.08959961, + "logprob": -0.075683594, "special": false, "text": " learning" }, { "id": 374, - "logprob": -0.14685059, + "logprob": -0.12408447, "special": false, "text": " is" }, { "id": 264, - "logprob": -0.125, + "logprob": -0.12768555, "special": false, "text": " a" }, { "id": 25993, - "logprob": -0.81640625, + "logprob": -0.82128906, "special": false, "text": " subset" }, { "id": 315, - "logprob": -0.0013418198, + "logprob": -0.0012636185, "special": false, "text": " of" }, { "id": 5662, - "logprob": -0.16259766, + "logprob": -0.12878418, "special": false, "text": " machine" }, { "id": 6832, - "logprob": -0.0016393661, + "logprob": -0.0015888214, "special": false, "text": " learning" }, { "id": 429, - "logprob": -0.4477539, + "logprob": -0.49194336, "special": false, "text": " that" }, { "id": 5711, - "logprob": -1.2802734, + "logprob": -1.2626953, "special": false, "text": " uses" } diff --git a/integration-tests/models/__snapshots__/test_compressed_tensors_wna16_int/test_compressed_tensors_wna16_all_params.json b/integration-tests/models/__snapshots__/test_compressed_tensors_wna16_int/test_compressed_tensors_wna16_all_params.json index 08c63e798..29709676b 100644 --- 
a/integration-tests/models/__snapshots__/test_compressed_tensors_wna16_int/test_compressed_tensors_wna16_all_params.json +++ b/integration-tests/models/__snapshots__/test_compressed_tensors_wna16_int/test_compressed_tensors_wna16_all_params.json @@ -44,7 +44,7 @@ }, { "id": 38397, - "logprob": -0.12695312, + "logprob": 0.0, "special": false, "text": " subset" }, diff --git a/integration-tests/models/__snapshots__/test_flash_gemma_gptq/test_flash_gemma_gptq_all_params.json b/integration-tests/models/__snapshots__/test_flash_gemma_gptq/test_flash_gemma_gptq_all_params.json index 6306f75e2..0f54bbe80 100644 --- a/integration-tests/models/__snapshots__/test_flash_gemma_gptq/test_flash_gemma_gptq_all_params.json +++ b/integration-tests/models/__snapshots__/test_flash_gemma_gptq/test_flash_gemma_gptq_all_params.json @@ -14,60 +14,60 @@ }, { "id": 573, - "logprob": -0.18493652, + "logprob": -0.19030762, "special": false, "text": " the" }, { "id": 16819, - "logprob": -1.4804688, + "logprob": -1.4863281, "special": false, "text": " detection" }, { "id": 576, - "logprob": -0.7011719, + "logprob": -0.7089844, + "special": false, + "text": " of" + }, + { + "id": 573, + "logprob": -2.0410156, + "special": false, + "text": " the" + }, + { + "id": 8566, + "logprob": 0.0, + "special": false, + "text": " presence" + }, + { + "id": 689, + "logprob": -0.16491699, + "special": false, + "text": " or" + }, + { + "id": 14862, + "logprob": 0.0, + "special": false, + "text": " absence" + }, + { + "id": 576, + "logprob": -0.9970703, "special": false, "text": " of" }, { "id": 671, - "logprob": -2.1738281, + "logprob": -0.5292969, "special": false, "text": " an" - }, - { - "id": 24646, - "logprob": -3.0449219, - "special": false, - "text": " RNA" - }, - { - "id": 12369, - "logprob": -0.19299316, - "special": false, - "text": " virus" - }, - { - "id": 575, - "logprob": -0.10632324, - "special": false, - "text": " in" - }, - { - "id": 6022, - "logprob": -0.98095703, - "special": false, - "text": " patients" - }, - { - "id": 1064, - "logprob": -1.3095703, - "special": false, - "text": " who" } ], "top_tokens": null }, - "generated_text": "Test request for the detection of an RNA virus in patients who" + "generated_text": "Test request for the detection of the presence or absence of an" } diff --git a/integration-tests/models/__snapshots__/test_flash_starcoder2/test_flash_starcoder2_default_params.json b/integration-tests/models/__snapshots__/test_flash_starcoder2/test_flash_starcoder2_default_params.json index 914e59c0a..6674cf50b 100644 --- a/integration-tests/models/__snapshots__/test_flash_starcoder2/test_flash_starcoder2_default_params.json +++ b/integration-tests/models/__snapshots__/test_flash_starcoder2/test_flash_starcoder2_default_params.json @@ -8,7 +8,7 @@ "tokens": [ { "id": 2284, - "logprob": -0.296875, + "logprob": -0.31323242, "special": false, "text": "():" }, @@ -38,13 +38,13 @@ }, { "id": 10914, - "logprob": -0.7734375, + "logprob": -0.7871094, "special": false, "text": " World" }, { "id": 16013, - "logprob": -0.61816406, + "logprob": -0.64746094, "special": false, "text": "!\")" }, @@ -62,7 +62,7 @@ }, { "id": 610, - "logprob": -0.4152832, + "logprob": -0.41064453, "special": false, "text": "def" }, @@ -92,7 +92,7 @@ }, { "id": 444, - "logprob": -0.21618652, + "logprob": -0.21655273, "special": false, "text": "name" }, @@ -139,28 +139,16 @@ "text": "Hello" }, { - "id": 925, - "logprob": -3.3476562, + "id": 332, + "logprob": -0.034698486, "special": false, - "text": " %" + "text": " \"" }, { - "id": 
120, + "id": 494, "logprob": 0.0, "special": false, - "text": "s" - }, - { - "id": 11571, - "logprob": -0.08892822, - "special": false, - "text": "!\"" - }, - { - "id": 925, - "logprob": 0.0, - "special": false, - "text": " %" + "text": " +" }, { "id": 655, @@ -169,10 +157,22 @@ "text": " name" }, { - "id": 46, + "id": 494, + "logprob": -0.20141602, + "special": false, + "text": " +" + }, + { + "id": 332, "logprob": 0.0, "special": false, - "text": ")" + "text": " \"" + }, + { + "id": 16013, + "logprob": 0.0, + "special": false, + "text": "!\")" }, { "id": 222, @@ -230,7 +230,7 @@ }, { "id": 400, - "logprob": -0.074279785, + "logprob": 0.0, "special": false, "text": "age" }, @@ -289,22 +289,34 @@ "text": "Hello" }, { - "id": 925, + "id": 332, "logprob": 0.0, "special": false, - "text": " %" + "text": " \"" }, { - "id": 120, + "id": 494, "logprob": 0.0, "special": false, - "text": "s" + "text": " +" }, { - "id": 49, - "logprob": -0.07891846, + "id": 655, + "logprob": 0.0, "special": false, - "text": "," + "text": " name" + }, + { + "id": 494, + "logprob": 0.0, + "special": false, + "text": " +" + }, + { + "id": 3021, + "logprob": -0.5761719, + "special": false, + "text": " \"," }, { "id": 863, @@ -319,55 +331,43 @@ "text": " are" }, { - "id": 925, + "id": 332, "logprob": 0.0, "special": false, - "text": " %" + "text": " \"" }, { - "id": 105, + "id": 494, "logprob": 0.0, "special": false, - "text": "d" + "text": " +" }, { - "id": 11339, + "id": 615, "logprob": 0.0, "special": false, - "text": " years" + "text": " str" }, { - "id": 3627, + "id": 45, "logprob": 0.0, "special": false, - "text": " old" + "text": "(" }, { - "id": 11571, + "id": 400, "logprob": 0.0, "special": false, - "text": "!\"" + "text": "age" }, { - "id": 925, + "id": 46, "logprob": 0.0, "special": false, - "text": " %" - }, - { - "id": 327, - "logprob": 0.0, - "special": false, - "text": " (" - }, - { - "id": 444, - "logprob": 0.0, - "special": false, - "text": "name" + "text": ")" } ], "top_tokens": null }, - "generated_text": "():\n print(\"Hello World!\")\n\ndef print_hello_name(name):\n print(\"Hello %s!\" % name)\n\ndef print_hello_name_age(name, age):\n print(\"Hello %s, you are %d years old!\" % (name" + "generated_text": "():\n print(\"Hello World!\")\n\ndef print_hello_name(name):\n print(\"Hello \" + name + \"!\")\n\ndef print_hello_name_age(name, age):\n print(\"Hello \" + name + \", you are \" + str(age)" } diff --git a/integration-tests/models/test_compressed_tensors_w8a8_int_dynamic_weight.py b/integration-tests/models/test_compressed_tensors_w8a8_int_dynamic_weight.py index 7cc82a4ed..a0b0416b8 100644 --- a/integration-tests/models/test_compressed_tensors_w8a8_int_dynamic_weight.py +++ b/integration-tests/models/test_compressed_tensors_w8a8_int_dynamic_weight.py @@ -64,7 +64,7 @@ async def test_compressed_tensors_w8a8_int_dynamic_weight_all_params( assert response.details.generated_tokens == 10 assert ( response.generated_text - == "What is deep learning?\n\nDeep Learning is an area of artificial intelligence" + == "What is deep learning?\n\nDeep Learning is an artificial intelligence (AI" ) assert response == response_snapshot diff --git a/server/Makefile-flashinfer b/server/Makefile-flashinfer index f0a27622a..d5f684ba5 100644 --- a/server/Makefile-flashinfer +++ b/server/Makefile-flashinfer @@ -1,2 +1,5 @@ install-flashinfer: - pip install flashinfer==0.1.6 -i https://flashinfer.ai/whl/cu124/torch2.4 + # We need fsspec as an additional dependency, but + # `pip install flashinfer` cannot resolve it. 
+ pip install fsspec + pip install flashinfer==0.2.0.post1 -i https://flashinfer.ai/whl/cu124/torch2.4 diff --git a/server/text_generation_server/layers/attention/cuda.py b/server/text_generation_server/layers/attention/cuda.py index 3038602e3..7b5af3c43 100644 --- a/server/text_generation_server/layers/attention/cuda.py +++ b/server/text_generation_server/layers/attention/cuda.py @@ -60,8 +60,7 @@ def paged_attention( from text_generation_server.layers.attention.flashinfer import decode_state return decode_state.get().forward( - # TODO: remove `contiguous` call once https://github.com/flashinfer-ai/flashinfer/pull/553 is merged. - query.contiguous(), + query, paged_kv_cache=(kv_cache.key, kv_cache.value), logits_soft_cap=softcap, sm_scale=softmax_scale, @@ -231,8 +230,7 @@ def attention( softcap = 0.0 return prefill_with_paged_kv_state.get().forward( - # TODO: remove `contiguous` call once https://github.com/flashinfer-ai/flashinfer/pull/553 is merged. - query.contiguous(), + query, causal=causal, paged_kv_cache=(kv_cache.key, kv_cache.value), logits_soft_cap=softcap, diff --git a/server/text_generation_server/layers/attention/flashinfer.py b/server/text_generation_server/layers/attention/flashinfer.py index 26a72d9be..909eea27f 100644 --- a/server/text_generation_server/layers/attention/flashinfer.py +++ b/server/text_generation_server/layers/attention/flashinfer.py @@ -50,7 +50,8 @@ def use_prefill_with_paged_kv_state( num_kv_heads: int, head_size: int, page_size: int, - dtype: torch.dtype, + kv_dtype: torch.dtype, + q_dtype: torch.dtype, window_left: int, ): """ @@ -91,9 +92,10 @@ def use_prefill_with_paged_kv_state( num_qo_heads=num_heads, num_kv_heads=num_kv_heads, head_dim=head_size, - q_data_type=dtype, + kv_data_type=kv_dtype, + q_data_type=q_dtype, page_size=page_size, - window_left=window_left, + window_left=-1 if window_left is None else window_left, ) yield finally: @@ -113,41 +115,6 @@ def create_prefill_state( ) -@contextmanager -def use_prefill_state( - *, - state: flashinfer.BatchPrefillWithRaggedKVCacheWrapper, - cu_seqlens: torch.Tensor, - num_heads: int, - num_kv_heads: int, - head_size: int, - dtype: torch.dtype, - window_left: int, -): - """ - Context manager to set the active flashinfer prefill state to the given - `state` and parameters. This state will be used by all calls to the - `attention` function while the context manager is active. 
- """ - - token = prefill_state.set(state) - try: - state.begin_forward( - qo_indptr=cu_seqlens, - kv_indptr=cu_seqlens, - num_qo_heads=num_heads, - num_kv_heads=num_kv_heads, - head_dim=head_size, - q_data_type=dtype, - window_left=window_left, - ) - yield - finally: - state.end_forward() - if token is not None: - prefill_state.reset(token) - - def create_decode_state( *, device: torch.device, @@ -205,7 +172,7 @@ def use_decode_state( head_size: int, page_size: int, kv_cache_dtype: torch.dtype, - dtype: torch.dtype, + q_dtype: torch.dtype, window_left: int, ): """ @@ -242,8 +209,8 @@ def use_decode_state( head_dim=head_size, page_size=page_size, data_type=kv_cache_dtype, - q_data_type=dtype, - window_left=window_left, + q_data_type=q_dtype, + window_left=-1 if window_left is None else window_left, ) yield finally: diff --git a/server/text_generation_server/models/flash_causal_lm.py b/server/text_generation_server/models/flash_causal_lm.py index 5d3769907..c63ca1db0 100644 --- a/server/text_generation_server/models/flash_causal_lm.py +++ b/server/text_generation_server/models/flash_causal_lm.py @@ -2480,7 +2480,8 @@ class FlashCausalLM(Model): num_kv_heads=self.num_kv_heads, head_size=self.head_size, page_size=BLOCK_SIZE, - dtype=self.dtype, + kv_dtype=self.kv_cache_dtype, + q_dtype=self.dtype, window_left=self.sliding_window, ) else: @@ -2494,6 +2495,6 @@ class FlashCausalLM(Model): head_size=self.head_size, page_size=BLOCK_SIZE, kv_cache_dtype=self.kv_cache_dtype, - dtype=self.dtype, + q_dtype=self.dtype, window_left=self.sliding_window, ) From da5ab467059dc6039a088f87178a719d6fec8c49 Mon Sep 17 00:00:00 2001 From: drbh Date: Thu, 9 Jan 2025 10:35:32 -0500 Subject: [PATCH 014/183] Improve vlm support (add idefics3 support) (#2437) * feat: expand vlm support and add image token logic and tests * fix: avoid unused perceiver config * feat: integrate image tokens into inputs embeds * feat: add simple idefics3 test * feat: update docs, image token logic and weight names * fix: improve image processing * feat: improve prefix for idefics3 * fix: bump idefics3 tests and snapshots * fix: improve text model loading * feat: consolidate changes with existing vlms and add support and test for smolvlm * fix: create new idefic3 file, simplify logic and adjust llama weight loading * fix: lint with ruff * fix: clean up idefics 3 and improve prefix handling * fix: improve typing * fix: improve prompt_split_image with ref to original impl * fix: adjust ruff lints and small refactors * fix: adjust FlashLlamaModel prefix logic --- docs/source/supported_models.md | 1 + integration-tests/conftest.py | 4 + .../test_flash_idefics3_next_simple_url.json | 67 ++ .../test_flash_smolvlm_next_simple_url.json | 61 ++ integration-tests/models/test_idefics3.py | 31 + integration-tests/models/test_smolvlm.py | 31 + router/src/config.rs | 19 + router/src/lib.rs | 1 + router/src/validation.rs | 70 ++- .../text_generation_server/models/__init__.py | 27 + .../custom_modeling/flash_llama_modeling.py | 45 +- .../models/custom_modeling/idefics3.py | 584 ++++++++++++++++++ .../models/custom_modeling/vlm.py | 2 +- .../models/flash_causal_lm.py | 2 +- .../models/vlm_causal_lm.py | 86 ++- 15 files changed, 988 insertions(+), 43 deletions(-) create mode 100644 integration-tests/models/__snapshots__/test_idefics3/test_flash_idefics3_next_simple_url.json create mode 100644 integration-tests/models/__snapshots__/test_smolvlm/test_flash_smolvlm_next_simple_url.json create mode 100644 integration-tests/models/test_idefics3.py create mode 
100644 integration-tests/models/test_smolvlm.py create mode 100644 server/text_generation_server/models/custom_modeling/idefics3.py diff --git a/docs/source/supported_models.md b/docs/source/supported_models.md index 0f39ff28e..5ac903516 100644 --- a/docs/source/supported_models.md +++ b/docs/source/supported_models.md @@ -5,6 +5,7 @@ Text Generation Inference enables serving optimized models. The following sectio - [Deepseek V2](https://huggingface.co/deepseek-ai/DeepSeek-V2) - [Idefics 2](https://huggingface.co/HuggingFaceM4/idefics2-8b) (Multimodal) +- [Idefics 3](https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3) (Multimodal) - [Llava Next (1.6)](https://huggingface.co/llava-hf/llava-v1.6-vicuna-13b-hf) (Multimodal) - [Llama](https://huggingface.co/collections/meta-llama/llama-31-669fc079a0c406a149a5738f) - [Phi 3](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct) diff --git a/integration-tests/conftest.py b/integration-tests/conftest.py index c9c477665..c702ae709 100644 --- a/integration-tests/conftest.py +++ b/integration-tests/conftest.py @@ -354,6 +354,7 @@ def launcher(event_loop): kv_cache_dtype: Optional[str] = None, revision: Optional[str] = None, max_input_length: Optional[int] = None, + max_input_tokens: Optional[int] = None, max_batch_prefill_tokens: Optional[int] = None, max_total_tokens: Optional[int] = None, lora_adapters: Optional[List[str]] = None, @@ -402,6 +403,9 @@ def launcher(event_loop): if max_input_length: args.append("--max-input-length") args.append(str(max_input_length)) + if max_input_tokens: + args.append("--max-input-tokens") + args.append(str(max_input_tokens)) if max_batch_prefill_tokens: args.append("--max-batch-prefill-tokens") args.append(str(max_batch_prefill_tokens)) diff --git a/integration-tests/models/__snapshots__/test_idefics3/test_flash_idefics3_next_simple_url.json b/integration-tests/models/__snapshots__/test_idefics3/test_flash_idefics3_next_simple_url.json new file mode 100644 index 000000000..6bf2b93a2 --- /dev/null +++ b/integration-tests/models/__snapshots__/test_idefics3/test_flash_idefics3_next_simple_url.json @@ -0,0 +1,67 @@ +{ + "details": { + "best_of_sequences": null, + "finish_reason": "eos_token", + "generated_tokens": 9, + "prefill": [], + "seed": null, + "tokens": [ + { + "id": 2684, + "logprob": -0.24902344, + "special": false, + "text": " There" + }, + { + "id": 374, + "logprob": -0.0703125, + "special": false, + "text": " is" + }, + { + "id": 264, + "logprob": -0.23535156, + "special": false, + "text": " a" + }, + { + "id": 35372, + "logprob": -0.125, + "special": false, + "text": " statue" + }, + { + "id": 304, + "logprob": -0.30273438, + "special": false, + "text": " in" + }, + { + "id": 279, + "logprob": -0.20507812, + "special": false, + "text": " the" + }, + { + "id": 2217, + "logprob": -0.076171875, + "special": false, + "text": " image" + }, + { + "id": 13, + "logprob": -0.053710938, + "special": false, + "text": "." + }, + { + "id": 128258, + "logprob": -0.011352539, + "special": true, + "text": "" + } + ], + "top_tokens": null + }, + "generated_text": " There is a statue in the image." 
+} diff --git a/integration-tests/models/__snapshots__/test_smolvlm/test_flash_smolvlm_next_simple_url.json b/integration-tests/models/__snapshots__/test_smolvlm/test_flash_smolvlm_next_simple_url.json new file mode 100644 index 000000000..17a69d0d4 --- /dev/null +++ b/integration-tests/models/__snapshots__/test_smolvlm/test_flash_smolvlm_next_simple_url.json @@ -0,0 +1,61 @@ +{ + "details": { + "best_of_sequences": null, + "finish_reason": "eos_token", + "generated_tokens": 8, + "prefill": [], + "seed": null, + "tokens": [ + { + "id": 330, + "logprob": -0.118652344, + "special": false, + "text": " A" + }, + { + "id": 11426, + "logprob": -0.28320312, + "special": false, + "text": " bee" + }, + { + "id": 335, + "logprob": -0.95703125, + "special": false, + "text": " on" + }, + { + "id": 253, + "logprob": -0.06982422, + "special": false, + "text": " a" + }, + { + "id": 11986, + "logprob": -0.49414062, + "special": false, + "text": " pink" + }, + { + "id": 8525, + "logprob": -0.07763672, + "special": false, + "text": " flower" + }, + { + "id": 30, + "logprob": -1.0703125, + "special": false, + "text": "." + }, + { + "id": 49154, + "logprob": -0.092285156, + "special": true, + "text": "" + } + ], + "top_tokens": null + }, + "generated_text": " A bee on a pink flower." +} diff --git a/integration-tests/models/test_idefics3.py b/integration-tests/models/test_idefics3.py new file mode 100644 index 000000000..80be2350f --- /dev/null +++ b/integration-tests/models/test_idefics3.py @@ -0,0 +1,31 @@ +import pytest + + +@pytest.fixture(scope="module") +def flash_idefics3_next_handle(launcher): + with launcher("HuggingFaceM4/Idefics3-8B-Llama3") as handle: + yield handle + + +@pytest.fixture(scope="module") +async def flash_idefics3_next(flash_idefics3_next_handle): + await flash_idefics3_next_handle.health(300) + return flash_idefics3_next_handle.client + + +@pytest.mark.asyncio +@pytest.mark.private +async def test_flash_idefics3_next_simple_url(flash_idefics3_next, response_snapshot): + ny_skyline = "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg" + query = "What is in this image?" + response = await flash_idefics3_next.generate( + f"<|begin_of_text|><|begin_of_text|>User:![]({ny_skyline}){query}\nAssistant:", + max_new_tokens=10, + seed=1337, + ) + print(response) + assert ( + response.generated_text == " There is a statue in the image." + ), f"{repr(response.generated_text)}" + assert response.details.generated_tokens == 9 + assert response == response_snapshot diff --git a/integration-tests/models/test_smolvlm.py b/integration-tests/models/test_smolvlm.py new file mode 100644 index 000000000..cd105d84c --- /dev/null +++ b/integration-tests/models/test_smolvlm.py @@ -0,0 +1,31 @@ +import pytest + + +@pytest.fixture(scope="module") +def flash_smolvlm_next_handle(launcher): + with launcher("HuggingFaceTB/SmolVLM-Instruct") as handle: + yield handle + + +@pytest.fixture(scope="module") +async def flash_smolvlm_next(flash_smolvlm_next_handle): + await flash_smolvlm_next_handle.health(300) + return flash_smolvlm_next_handle.client + + +@pytest.mark.asyncio +@pytest.mark.private +async def test_flash_smolvlm_next_simple_url(flash_smolvlm_next, response_snapshot): + ny_skyline = "https://huggingface.co/spaces/merve/chameleon-7b/resolve/main/bee.jpg" + query = "What is in this image?" 
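+    # Note (descriptive comment, not part of the original test): the `![](url)`
+    # markdown below is matched by the router's image regex (see prepare_input
+    # in router/src/validation.rs later in this patch) and replaced with the
+    # model's image placeholder tokens before tokenization.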
+ response = await flash_smolvlm_next.generate( + f"<|begin_of_text|><|begin_of_text|>User:![]({ny_skyline}){query}\nAssistant:", + max_new_tokens=10, + seed=1337, + ) + print(response) + assert ( + response.generated_text == " A bee on a pink flower." + ), f"{repr(response.generated_text)}" + assert response.details.generated_tokens == 8 + assert response == response_snapshot diff --git a/router/src/config.rs b/router/src/config.rs index 5d07a293e..4d5fcfa06 100644 --- a/router/src/config.rs +++ b/router/src/config.rs @@ -110,6 +110,24 @@ pub struct ClipVisionModel { patch_size: usize, } +#[derive(Clone, Debug, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub struct Idefics3 {} + +impl Idefics3 { + pub fn get_max_longest_edge(&self) -> usize { + 364 + } + + pub fn get_number_of_features(&self) -> usize { + 169 + } + + pub fn get_max_longest_edge_for_image_resize(&self) -> usize { + 1456 + } +} + #[derive(Clone, Debug, Serialize, Deserialize)] #[serde(rename_all = "snake_case")] pub struct Idefics2 {} @@ -178,6 +196,7 @@ pub enum Config { Idefics, Mllama, Idefics2(Idefics2), + Idefics3(Idefics3), Ssm, GptBigcode, Granite, diff --git a/router/src/lib.rs b/router/src/lib.rs index 84e9bc482..21c452413 100644 --- a/router/src/lib.rs +++ b/router/src/lib.rs @@ -170,6 +170,7 @@ impl TokenizerConfigToken { #[serde(tag = "processor_class")] pub enum HubPreprocessorConfig { Idefics2Processor(Idefics2Preprocessor), + Idefics3Processor(Idefics2Preprocessor), } impl HubPreprocessorConfig { diff --git a/router/src/validation.rs b/router/src/validation.rs index 8137ac58d..6d5b06bd3 100644 --- a/router/src/validation.rs +++ b/router/src/validation.rs @@ -614,6 +614,73 @@ fn image_tokens( image_string } + Idefics3(config) => { + const FAKE: &str = ""; + const IMAGE: &str = ""; + const GLOBAL_IMG: &str = ""; + + let max_longest_edge_for_image_resize = config.get_max_longest_edge_for_image_resize(); + + // resize image if it is larger than max_longest_edge_for_image_resize keeping aspect ratio + let (height, width) = if height > max_longest_edge_for_image_resize + || width > max_longest_edge_for_image_resize + { + let aspect_ratio = height as f32 / width as f32; + if height > width { + ( + max_longest_edge_for_image_resize, + (max_longest_edge_for_image_resize as f32 / aspect_ratio) as usize, + ) + } else { + ( + (max_longest_edge_for_image_resize as f32 * aspect_ratio) as usize, + max_longest_edge_for_image_resize, + ) + } + } else { + (height, width) + }; + + let image_seq_len = config.get_number_of_features(); + let max_edge = config.get_max_longest_edge(); + + let (image_rows, image_cols) = if height > max_edge || width > max_edge { + ( + (height as f32 / max_edge as f32).ceil() as usize, + (width as f32 / max_edge as f32).ceil() as usize, + ) + } else { + (0, 0) + }; + + let mut image_string = String::new(); + + if image_rows == 0 && image_cols == 0 { + // Single image case + image_string.push_str(FAKE); + image_string.push_str(GLOBAL_IMG); + image_string.push_str(&IMAGE.repeat(image_seq_len)); + image_string.push_str(FAKE); + } else { + // Split image case + for n_h in 0..image_rows { + for n_w in 0..image_cols { + image_string.push_str(FAKE); + image_string.push_str(&format!("", n_h + 1, n_w + 1)); + image_string.push_str(&IMAGE.repeat(image_seq_len)); + } + image_string.push('\n'); + } + + image_string.push('\n'); + image_string.push_str(FAKE); + image_string.push_str(GLOBAL_IMG); + image_string.push_str(&IMAGE.repeat(image_seq_len)); + image_string.push_str(FAKE); + } + + 
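+            // Every 364x364 tile above, plus the trailing global image,
+            // contributes `image_seq_len` (169) IMAGE placeholders, so a
+            // split image expands to (image_rows * image_cols + 1) * 169
+            // placeholder tokens.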
image_string + } Paligemma(config) => "".repeat(config.get_number_of_features(height, width)), LlavaNext(config) => "".repeat(config.get_number_of_features(height, width)), Qwen2Vl(config) => format!( @@ -647,7 +714,8 @@ fn prepare_input( static RE: Lazy = Lazy::new(|| Regex::new(r"!\[\]\([^\)]*\)").unwrap()); let (tokenizer_query, input_chunks) = match config { Some( - config @ (Idefics | Mllama | Idefics2(_) | Paligemma(_) | LlavaNext(_) | Qwen2Vl(_)), + config @ (Idefics | Mllama | Idefics2(_) | Idefics3(_) | Paligemma(_) | LlavaNext(_) + | Qwen2Vl(_)), ) => { let mut input_chunks = Vec::new(); let mut tokenizer_query = String::with_capacity(inputs.len()); diff --git a/server/text_generation_server/models/__init__.py b/server/text_generation_server/models/__init__.py index fcc796086..beefeb016 100644 --- a/server/text_generation_server/models/__init__.py +++ b/server/text_generation_server/models/__init__.py @@ -152,6 +152,9 @@ try: from text_generation_server.models.custom_modeling.idefics2 import ( Idefics2ForConditionalGeneration, ) + from text_generation_server.models.custom_modeling.idefics3 import ( + Idefics3ForConditionalGeneration, + ) from text_generation_server.models.custom_modeling.qwen2_vl import ( Qwen2VLForConditionalGeneration, ) @@ -188,6 +191,12 @@ class ModelType(enum.Enum): "url": "https://huggingface.co/HuggingFaceM4/idefics2-8b", "multimodal": True, } + IDEFICS3 = { + "type": "idefics3", + "name": "Idefics 3", + "url": "https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3", + "multimodal": True, + } LLAVA_NEXT = { "type": "llava_next", "name": "Llava Next (1.6)", @@ -1253,6 +1262,24 @@ def get_model( ) else: raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Idefics")) + if model_type == IDEFICS3: + if FLASH_ATTENTION: + return VlmCausalLM( + model_id=model_id, + model_class=Idefics3ForConditionalGeneration, + revision=revision, + quantize=quantize, + speculator=speculator, + dtype=dtype, + default_dtype=torch.bfloat16, + trust_remote_code=trust_remote_code, + lora_adapter_ids=lora_adapter_ids, + # XXX: Extremely important to cap resolution in order to limit + # VRAM usage. 
+ processor_kwargs={"size": {"longest_edge": 1456}}, + ) + else: + raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Idefics")) if model_type == PALIGEMMA: if FLASH_ATTENTION: return VlmCausalLM( diff --git a/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py b/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py index 10309006a..28db42fea 100644 --- a/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py @@ -515,9 +515,7 @@ class FlashLlamaModel(torch.nn.Module): self.layers.append( FlashLlamaLayer( index=0, - prefix=( - "model.layers.0" if not prefix else f"{prefix}.model.layers.0" - ), + prefix=f"{prefix}.layers.0", config=config, weights=weights, ) @@ -533,11 +531,7 @@ class FlashLlamaModel(torch.nn.Module): self.layers.append( FlashLlamaCrossLayer( index=layer_id, - prefix=( - f"model.layers.{layer_id}" - if not prefix - else f"{prefix}.model.layers.{layer_id}" - ), + prefix=(f"{prefix}.layers.{layer_id}"), config=config, weights=weights, ) @@ -546,11 +540,7 @@ class FlashLlamaModel(torch.nn.Module): self.layers.append( FlashLlamaLayer( index=layer_id, - prefix=( - f"model.layers.{layer_id}" - if not prefix - else f"{prefix}.model.layers.{layer_id}" - ), + prefix=(f"{prefix}.layers.{layer_id}"), config=config, weights=weights, ) @@ -561,18 +551,14 @@ class FlashLlamaModel(torch.nn.Module): self.layers.append( FlashLlamaLayer( index=last_layer_id, - prefix=( - f"model.layers.{last_layer_id}" - if not prefix - else f"{prefix}.model.layers.{last_layer_id}" - ), + prefix=(f"{prefix}.layers.{last_layer_id}"), config=config, weights=weights, ) ) self.norm = FastRMSNorm.load( - prefix="model.norm" if not prefix else f"{prefix}.model.norm", + prefix=f"{prefix}.norm", weights=weights, eps=config.rms_norm_eps, ) @@ -629,19 +615,24 @@ class FlashLlamaModel(torch.nn.Module): class FlashLlamaForCausalLM(torch.nn.Module): - def __init__(self, prefix: str, config, weights): + def __init__(self, prefix: str, config, weights, name=None): + if name is None: + name = "model" super().__init__() - with no_fp8(weights): self.embed_tokens = TensorParallelEmbedding( prefix=( - "model.embed_tokens" + f"{name}.embed_tokens" if not prefix - else f"{prefix}.model.embed_tokens" + else f"{prefix}.{name}.embed_tokens" ), weights=weights, ) - self.model = FlashLlamaModel(prefix, config, weights) + self.model = FlashLlamaModel( + prefix=name if not prefix else f"{prefix}.{name}", + config=config, + weights=weights, + ) if config.tie_word_embeddings: suffix = "model.embed_tokens" else: @@ -652,11 +643,13 @@ class FlashLlamaForCausalLM(torch.nn.Module): if embedding_multiplier is not None: self.embed_tokens.weight.data *= embedding_multiplier + prefix = "lm_head" if not prefix or name != "model" else f"{prefix}.{suffix}" + with no_fp8(weights): self.lm_head = SpeculativeHead.load( config, - prefix=suffix if not prefix else f"{prefix}.{suffix}", - weights=weights, + prefix, + weights, ) # Used in Granite diff --git a/server/text_generation_server/models/custom_modeling/idefics3.py b/server/text_generation_server/models/custom_modeling/idefics3.py new file mode 100644 index 000000000..580398cb3 --- /dev/null +++ b/server/text_generation_server/models/custom_modeling/idefics3.py @@ -0,0 +1,584 @@ +# coding=utf-8 +# Copyright 2024 the HuggingFace Inc. team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch Idefics3 model.""" + +from typing import List, Optional, Tuple + +import torch +import torch.utils.checkpoint +from torch import nn + +from transformers.activations import ACT2FN +from text_generation_server.models.custom_modeling.vlm import ( + load_text_model, +) +from text_generation_server.layers.attention import Seqlen +from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask + +from text_generation_server.layers import ( + TensorParallelColumnLinear, + TensorParallelEmbedding, + TensorParallelRowLinear, +) +from text_generation_server.utils.weights import DefaultWeightsLoader, UnquantizedWeight + + +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand( + batch, num_key_value_heads, n_rep, slen, head_dim + ) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +class Idefics3VisionEmbeddings(nn.Module): + """ + This is a modified version of `siglip.modelign_siglip.SiglipVisionEmbeddings` to enable images of variable + resolution. + + The modifications are adapted from [Patch n' Pack: NaViT, a Vision Transformer for any Aspect Ratio and Resolution](https://arxiv.org/abs/2307.06304) + which allows treating images in their native aspect ratio and without the need to resize them to the same + fixed size. In particular, we start from the original pre-trained SigLIP model + (which uses images of fixed-size square images) and adapt it by training on images of variable resolutions. 
+ """ + + def __init__(self, prefix, config, weights): + super().__init__() + self.embed_dim = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + + self.patch_embedding = nn.Conv2d( + in_channels=config.num_channels, + out_channels=self.embed_dim, + kernel_size=self.patch_size, + stride=self.patch_size, + padding="valid", + ) + self.patch_embedding.weight = nn.Parameter( + weights.get_tensor(f"{prefix}.patch_embedding.weight"), requires_grad=False + ) + self.patch_embedding.bias = nn.Parameter( + weights.get_tensor(f"{prefix}.patch_embedding.bias"), requires_grad=False + ) + + self.num_patches_per_side = self.image_size // self.patch_size + self.num_patches = self.num_patches_per_side**2 + self.num_positions = self.num_patches + self.position_embedding = TensorParallelEmbedding( + prefix=f"{prefix}.position_embedding", weights=weights + ) + + def forward( + self, pixel_values: torch.FloatTensor, patch_attention_mask: torch.BoolTensor + ) -> torch.Tensor: + batch_size, _, max_im_h, max_im_w = pixel_values.shape + + patch_embeds = self.patch_embedding(pixel_values) + embeddings = patch_embeds.flatten(2).transpose(1, 2) + + max_nb_patches_h, max_nb_patches_w = ( + max_im_h // self.patch_size, + max_im_w // self.patch_size, + ) + boundaries = torch.arange( + 1 / self.num_patches_per_side, 1.0, 1 / self.num_patches_per_side + ) + position_ids = torch.full( + size=(batch_size, max_nb_patches_h * max_nb_patches_w), fill_value=0 + ) + + for batch_idx, p_attn_mask in enumerate(patch_attention_mask): + nb_patches_h = p_attn_mask[:, 0].sum() + nb_patches_w = p_attn_mask[0].sum() + + fractional_coords_h = torch.arange(0, 1 - 1e-6, 1 / nb_patches_h) + fractional_coords_w = torch.arange(0, 1 - 1e-6, 1 / nb_patches_w) + + bucket_coords_h = torch.bucketize( + fractional_coords_h, boundaries, right=True + ) + bucket_coords_w = torch.bucketize( + fractional_coords_w, boundaries, right=True + ) + + pos_ids = ( + bucket_coords_h[:, None] * self.num_patches_per_side + bucket_coords_w + ).flatten() + position_ids[batch_idx][p_attn_mask.view(-1).cpu()] = pos_ids + + position_ids = position_ids.to(self.position_embedding.weight.device) + embeddings = embeddings + self.position_embedding(position_ids) + return embeddings + + +class Idefics3VisionAttention(nn.Module): + def __init__(self, prefix, config, weights): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_size = self.embed_dim // self.num_heads + if self.head_size * self.num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {self.num_heads})." 
+ ) + self.scale = self.head_size**-0.5 + self.dropout = config.attention_dropout + + self.num_heads = self.num_heads // weights.process_group.size() + self.embed_dim = self.embed_dim // weights.process_group.size() + + self.qkv = TensorParallelColumnLinear.load_multi( + config, + prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"], + dim=0, + weights=weights, + bias=True, + ) + self.out_proj = TensorParallelRowLinear.load( + config=config, prefix=f"{prefix}.out_proj", weights=weights, bias=True + ) + self.is_causal = False + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + batch_size, q_len, _ = hidden_states.size() + + qkv = self.qkv(hidden_states) + query_states, key_states, value_states = qkv.split( + [ + self.head_size * self.num_heads, + self.head_size * self.num_heads, + self.head_size * self.num_heads, + ], + dim=2, + ) + + query_states = query_states.view( + batch_size, q_len, self.num_heads, self.head_size + ).transpose(1, 2) + key_states = key_states.view( + batch_size, q_len, self.num_heads, self.head_size + ).transpose(1, 2) + value_states = value_states.view( + batch_size, q_len, self.num_heads, self.head_size + ).transpose(1, 2) + + k_v_seq_len = key_states.shape[-2] + attn_weights = ( + torch.matmul(query_states, key_states.transpose(2, 3)) * self.scale + ) + + if attn_weights.size() != (batch_size, self.num_heads, q_len, k_v_seq_len): + raise ValueError( + f"Attention weights should be of size {(batch_size, self.num_heads, q_len, k_v_seq_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (batch_size, 1, q_len, k_v_seq_len): + raise ValueError( + f"Attention mask should be of size {(batch_size, 1, q_len, k_v_seq_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights + attention_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax( + attn_weights, dim=-1, dtype=torch.float32 + ).to(query_states.dtype) + attn_weights = nn.functional.dropout( + attn_weights, p=self.dropout, training=self.training + ) + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (batch_size, self.num_heads, q_len, self.head_size): + raise ValueError( + f"`attn_output` should be of size {(batch_size, self.num_heads, q_len, self.head_size)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(batch_size, q_len, self.embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output + + +class Idefics3VisionMLP(nn.Module): + def __init__(self, prefix, config, weights): + super().__init__() + self.config = config + self.activation_fn = ACT2FN[config.hidden_act] + self.fc1 = TensorParallelColumnLinear.load( + prefix=f"{prefix}.fc1", config=config, weights=weights, bias=True + ) + self.fc2 = TensorParallelRowLinear.load( + prefix=f"{prefix}.fc2", config=config, weights=weights, bias=True + ) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states = self.fc2(hidden_states) + return hidden_states + + +class Idefics3EncoderLayer(nn.Module): + def __init__(self, prefix, config, weights): + super().__init__() + self.embed_dim = config.hidden_size + self.self_attn = Idefics3VisionAttention( + prefix=f"{prefix}.self_attn", config=config, weights=weights + ) + self.layer_norm1 = 
nn.LayerNorm.load( + prefix=f"{prefix}.layer_norm1", eps=config.layer_norm_eps, weights=weights + ) + self.layer_norm2 = nn.LayerNorm.load( + prefix=f"{prefix}.layer_norm2", eps=config.layer_norm_eps, weights=weights + ) + self.mlp = Idefics3VisionMLP( + prefix=f"{prefix}.mlp", config=config, weights=weights + ) + + # Copied from transformers.models.siglip.modeling_siglip.SiglipEncoderLayer.forward + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + ) -> torch.Tensor: + residual = hidden_states + + hidden_states = self.layer_norm1(hidden_states) + hidden_states = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + ) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.layer_norm2(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + return hidden_states + + +class Idefics3Encoder(nn.Module): + def __init__(self, prefix, config, weights): + super().__init__() + self.config = config + self.layers = nn.ModuleList( + [ + Idefics3EncoderLayer( + prefix=f"{prefix}.layers.{i}", config=config, weights=weights + ) + for i in range(config.num_hidden_layers) + ] + ) + + # Ignore copy + def forward( + self, + inputs_embeds, + attention_mask: Optional[torch.Tensor] = None, + ): + hidden_states = inputs_embeds + for encoder_layer in self.layers: + hidden_states = encoder_layer( + hidden_states, + attention_mask, + ) + return hidden_states + + +class Idefics3VisionTransformer(nn.Module): + def __init__(self, prefix, config, weights): + super().__init__() + self.config = config + self.embeddings = Idefics3VisionEmbeddings( + prefix=f"{prefix}.embeddings", config=config, weights=weights + ) + self.encoder = Idefics3Encoder( + prefix=f"{prefix}.encoder", config=config, weights=weights + ) + self.post_layernorm = nn.LayerNorm.load( + prefix=f"{prefix}.post_layernorm", + weights=weights, + eps=config.layer_norm_eps, + ) + + def forward( + self, + pixel_values, + patch_attention_mask: Optional[torch.BoolTensor] = None, + ): + batch_size = pixel_values.size(0) + if patch_attention_mask is None: + patch_size = self.config.patch_size + patch_attention_mask = torch.ones( + ( + batch_size, + pixel_values.size(2) // patch_size, + pixel_values.size(3) // patch_size, + ) + ) + patch_attention_mask = patch_attention_mask.to( + dtype=torch.bool, device=pixel_values.device + ) + + hidden_states = self.embeddings( + pixel_values=pixel_values, patch_attention_mask=patch_attention_mask + ) + + patch_attention_mask = patch_attention_mask.view(batch_size, -1) + # The call to `_upad_input` in `_flash_attention_forward` is expensive + # So when the `patch_attention_mask` is full of 1s (i.e. 
attending to the whole sequence), + # avoiding passing the attention_mask, which is equivalent to attending to the full sequence + if not torch.any(~patch_attention_mask): + patch_attention_mask = None + else: + patch_attention_mask = _prepare_4d_attention_mask( + patch_attention_mask, hidden_states.dtype + ) + + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + attention_mask=patch_attention_mask, + ) + + last_hidden_state = encoder_outputs + last_hidden_state = self.post_layernorm(last_hidden_state) + + return last_hidden_state + + +class Idefics3SimpleMLP(nn.Module): + def __init__(self, prefix, config, weights): + super().__init__() + input_size = config.vision_config.hidden_size * (config.scale_factor**2) + output_size = config.text_config.hidden_size + proj = nn.Parameter( + weights.get_tensor(f"{prefix}.modality_projection.proj.weight"), + requires_grad=False, + ).to(weights.dtype) + self.proj = nn.Linear(input_size, output_size, bias=False) + self.proj.weight = proj + + def forward(self, x): + return self.proj(x) + + +class Idefics3Connector(nn.Module): + def __init__(self, prefix, config, weights): + super().__init__() + self.modality_projection = Idefics3SimpleMLP(prefix, config, weights) + self.scale_factor = config.scale_factor + + def pixel_shuffle(self, x, scale_factor=2): + bsz, seq, embed_dim = x.size() + height = width = int(seq**0.5) + x = x.view(bsz, height, width, embed_dim) + x = x.view(bsz, height, int(width / scale_factor), embed_dim * scale_factor) + x = x.permute(0, 2, 1, 3) + x = x.reshape( + bsz, + int(width / scale_factor), + int(height / scale_factor), + embed_dim * (scale_factor**2), + ) + x = x.permute(0, 2, 1, 3) + x = x.reshape(bsz, int(seq / (scale_factor**2)), embed_dim * (scale_factor**2)) + return x + + def forward(self, image_hidden_states): + image_hidden_states = self.pixel_shuffle(image_hidden_states, self.scale_factor) + image_hidden_states = self.modality_projection(image_hidden_states) + return image_hidden_states + + +class Idefics3ForConditionalGeneration(nn.Module): + def __init__(self, prefix, config, weights): + super().__init__() + config.vision_config.quantize = None + config.vision_config.speculator = config.speculator + config.text_config.quantize = config.quantize + config.text_config.speculator = config.speculator + # set tie_word_embeddings to True to load `.embed_tokens.weight` instead of `.lm_head.weight` + # since Idefics3 uses the `embed_tokens` for the final prediction + # config.text_config.tie_word_embeddings = True + + vision_config = config.vision_config + self.text_model = load_text_model( + prefix="model" if not prefix else f"{prefix}.model", + config=config.text_config, + weights=weights, + name="text_model", + ) + self.dtype = weights.dtype + + # The vision and connector models are not quantized. 
+ with weights.use_loader(DefaultWeightsLoader(UnquantizedWeight)): + self.vision_model = Idefics3VisionTransformer( + prefix=( + f"{prefix}.model.vision_model" if prefix else "model.vision_model" + ), + config=vision_config, + weights=weights, + ) + + config.quantize = None + self.connector = Idefics3Connector( + prefix=f"{prefix}.model.connector" if prefix else "model.connector", + config=config, + weights=weights, + ) + + self.config = config + self.image_token_id = config.image_token_id + self.pad_token_id = ( + config.pad_token_id if config.pad_token_id is not None else -1 + ) + + def _merge_input_ids_with_image_features( + self, + input_ids: torch.Tensor, + inputs_embeds: torch.Tensor, + image_features: torch.Tensor, + ): + """In place merges in vision_embeddings with inputs_embeds.""" + # mask = input_ids == self.config.image_token_index + mask = input_ids == self.config.image_token_id + # Let's pray we have enabled enough slots ! + inputs_embeds[mask] = image_features.view(-1, image_features.shape[-1]) + return inputs_embeds + + def forward( + self, + input_ids: torch.Tensor, + position_ids: torch.Tensor, + cu_seqlen_prefill: Optional[torch.Tensor], + kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], + block_tables: torch.Tensor, + slots: torch.Tensor, + seqlen: Seqlen, + max_s: int, + prefill_cache_indices: Optional[torch.Tensor], + lm_head_indices: Optional[torch.Tensor] = None, + pixel_values: torch.FloatTensor = None, + pixel_attention_mask: Optional[torch.BoolTensor] = None, + # Unused here + image_sizes: Optional[torch.Tensor] = None, + adapter_data: Optional[torch.Tensor] = None, + image_grid_thw: Optional[torch.LongTensor] = None, + video_grid_thw: Optional[torch.LongTensor] = None, + cross_attention_states: Optional[torch.Tensor] = None, + image_indices=None, + ): + inputs_embeds = self.text_model.embed_tokens(input_ids) + if pixel_values is not None: + batch_size, num_images, num_channels, height, width = pixel_values.shape + all_states = [] + all_pixel_values = pixel_values + all_pixel_mask = pixel_attention_mask + for i in range(batch_size): + pixel_values = all_pixel_values.to( + dtype=self.dtype + ) # fp16 compatibility + pixel_values = pixel_values[i : i + 1] + pixel_values = pixel_values.view(num_images, *pixel_values.shape[2:]) + + # Remove padding images - padding images are full 0. 
+ nb_values_per_image = pixel_values.shape[1:].numel() + real_images_inds = (pixel_values == 0.0).sum( + dim=(-1, -2, -3) + ) != nb_values_per_image + pixel_values = pixel_values[real_images_inds].contiguous() + # Handle the vision attention mask + if pixel_attention_mask is None: + pixel_attention_mask = torch.ones( + size=( + pixel_values.size(0), + pixel_values.size(2), + pixel_values.size(3), + ), + dtype=torch.bool, + device=pixel_values.device, + ) + else: + # Remove padding images from the mask/pP p + pixel_attention_mask = all_pixel_mask[i : i + 1] + pixel_attention_mask = pixel_attention_mask.view( + 1 * num_images, *pixel_attention_mask.shape[2:] + ) + pixel_attention_mask = pixel_attention_mask[ + real_images_inds + ].contiguous() + + patch_size = self.config.vision_config.patch_size + patches_subgrid = pixel_attention_mask.unfold( + dimension=1, size=patch_size, step=patch_size + ) + patches_subgrid = patches_subgrid.unfold( + dimension=2, size=patch_size, step=patch_size + ) + patch_attention_mask = (patches_subgrid.sum(dim=(-1, -2)) > 0).bool() + + # Get sequence from the vision encoder + image_hidden_states = self.vision_model( + pixel_values=pixel_values, + patch_attention_mask=patch_attention_mask, + ) + + # Modality projection & resampling + image_hidden_states = self.connector( + image_hidden_states, + ) + + all_states.append(image_hidden_states) + image_hidden_states = torch.stack(all_states, dim=0) + + inputs_embeds = self._merge_input_ids_with_image_features( + input_ids, inputs_embeds, image_hidden_states + ) + + hidden_states = self.text_model.model( + inputs_embeds=inputs_embeds, + position_ids=position_ids, + cu_seqlen_prefill=cu_seqlen_prefill, + kv_cache=kv_cache, + block_tables=block_tables, + slots=slots, + seqlen=seqlen, + max_s=max_s, + true_max_s=max_s, + prefill_cache_indices=None, + adapter_data=adapter_data, + ) + if lm_head_indices is not None: + hidden_states = hidden_states[lm_head_indices] + logits, speculative_logits = self.text_model.lm_head(hidden_states) + return logits, speculative_logits diff --git a/server/text_generation_server/models/custom_modeling/vlm.py b/server/text_generation_server/models/custom_modeling/vlm.py index 82e409a67..94b8522d4 100644 --- a/server/text_generation_server/models/custom_modeling/vlm.py +++ b/server/text_generation_server/models/custom_modeling/vlm.py @@ -4,7 +4,7 @@ def load_text_model(prefix, config, weights, name=None): FlashLlamaForCausalLM, ) - return FlashLlamaForCausalLM(prefix, config, weights) + return FlashLlamaForCausalLM(prefix, config, weights, name=name) elif config.model_type == "mistral": from text_generation_server.models.custom_modeling.flash_mistral_modeling import ( FlashMistralForCausalLM, diff --git a/server/text_generation_server/models/flash_causal_lm.py b/server/text_generation_server/models/flash_causal_lm.py index c63ca1db0..19fda9f1c 100644 --- a/server/text_generation_server/models/flash_causal_lm.py +++ b/server/text_generation_server/models/flash_causal_lm.py @@ -1288,7 +1288,7 @@ class FlashCausalLM(Model): weights_loader=weights_loader, ) - prefix = "" + prefix = None model = model_class(prefix, config, weights) torch.distributed.barrier(group=self.process_group) diff --git a/server/text_generation_server/models/vlm_causal_lm.py b/server/text_generation_server/models/vlm_causal_lm.py index 81b4369b9..db78341d1 100644 --- a/server/text_generation_server/models/vlm_causal_lm.py +++ b/server/text_generation_server/models/vlm_causal_lm.py @@ -13,6 +13,7 @@ from 
text_generation_server.models.flash_causal_lm import ( FlashCausalLM, ) from text_generation_server.models.globals import PREFIX_CACHING, ATTENTION +from loguru import logger from text_generation_server.utils.log import log_master from transformers import AutoProcessor from text_generation_server.layers.attention import Seqlen @@ -23,6 +24,40 @@ tracer = trace.get_tracer(__name__) IDEFICS2_FAKE_TOKEN = "" IDEFICS2_IMAGE_TOKEN = "" +IDEFICS3_IMAGE_TOKEN = "" +IDEFICS3_FAKE_IMAGE_TOKEN = "" +IDEFICS3_GLOBAL_IMG_TOKEN = "" + + +# copied from: https://github.com/huggingface/transformers/blob/02ed609285c2448b3b54c31e362f2c389fa952ab/src/transformers/models/idefics3/processing_idefics3.py#L44-L60 +def _prompt_split_image( + *, + image_seq_len: int, + image_rows: int, + image_cols: int, + fake_token_around_image: str, + image_token: str, + global_img_token: str, +): + """Prompt with expanded image tokens for when the image is split into patches.""" + text_split_images = "" + for n_h in range(image_rows): + for n_w in range(image_cols): + text_split_images += ( + f"{fake_token_around_image}" + + f"" + + f"{image_token}" * image_seq_len + ) + text_split_images += "\n" + + text_split_images += ( + f"\n{fake_token_around_image}" + + f"{global_img_token}" + + f"{image_token}" * image_seq_len + + f"{fake_token_around_image}" + ) + return text_split_images + def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size): """ @@ -54,10 +89,26 @@ def image_text_replacement(processor, image_input, config, image_id: int) -> str if processor.image_processor.do_image_splitting: image_str *= 5 return image_str + if config.model_type == "idefics3": + # TODO: implement this in a more general way + n_rows = image_input["rows"][0][image_id] + n_cols = image_input["cols"][0][image_id] + image_seq_len = int( + ((config.vision_config.image_size // config.vision_config.patch_size) ** 2) + / (config.scale_factor**2) + ) + image_str = _prompt_split_image( + image_seq_len=image_seq_len, + image_rows=n_rows, + image_cols=n_cols, + fake_token_around_image=IDEFICS3_FAKE_IMAGE_TOKEN, + image_token=IDEFICS3_IMAGE_TOKEN, + global_img_token=IDEFICS3_GLOBAL_IMG_TOKEN, + ) + return image_str elif config.model_type == "llava_next": height, width = image_input["image_sizes"][image_id] num_features = get_number_of_features(height, width, config) - from loguru import logger log_master( logger.info, @@ -194,12 +245,21 @@ class VlmCausalLMBatch(FlashCausalLMBatch): raise RuntimeError(f"Invalid chunk type {chunk_type}") if images: - image_inputs = processor.image_processor(images, return_tensors="pt") + kwargs = {} + if ( + hasattr(processor, "image_processor_class") + and processor.image_processor_class == "Idefics3ImageProcessor" + ): + kwargs["return_row_col_info"] = True + + image_inputs = processor.image_processor( + images, return_tensors="pt", **kwargs + ) else: image_inputs = None - batch_inputs = [] - max_truncation = 0 + batch_tokenized_inputs = [] + max_length = 0 image_id = 0 for r in requests: full_text = "" @@ -214,16 +274,14 @@ class VlmCausalLMBatch(FlashCausalLMBatch): image_id += 1 full_text = image_text_replacement_fixup(config, full_text) - - batch_inputs.append(full_text) - max_truncation = max(max_truncation, r.truncate) - - batch_tokenized_inputs = tokenizer( - batch_inputs, - truncation=True, - max_length=max_truncation, - add_special_tokens=not config.model_type == "paligemma", - )["input_ids"] + input_ids = tokenizer( + full_text, + truncation=True, + max_length=r.truncate, + 
add_special_tokens=r.add_special_tokens, + )["input_ids"] + max_length = max(max_length, len(input_ids)) + batch_tokenized_inputs.append(input_ids) return batch_tokenized_inputs, image_inputs From 4f7e00f4ce855935116218d3ef1d074dad7c0f98 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Fri, 10 Jan 2025 12:43:44 +0100 Subject: [PATCH 015/183] Update to marlin-kernels 0.3.7 (#2882) This fixes a race condition. See: https://github.com/vllm-project/vllm/pull/11493 --- flake.lock | 7 +++---- flake.nix | 2 +- server/poetry.lock | 28 ++++++++++++++-------------- server/pyproject.toml | 8 ++++---- 4 files changed, 22 insertions(+), 23 deletions(-) diff --git a/flake.lock b/flake.lock index 44802e18f..d304e3ccc 100644 --- a/flake.lock +++ b/flake.lock @@ -978,16 +978,15 @@ "nixpkgs": "nixpkgs_6" }, "locked": { - "lastModified": 1736179589, - "narHash": "sha256-/zZCSieBJncVXqOFbvbSov76g2eWAxVxEJNNA6SmQKc=", + "lastModified": 1736436388, + "narHash": "sha256-CIyxVPpM9RrSwthNT/4DQ10YPk/uwzP7AeE83kBNsrE=", "owner": "huggingface", "repo": "text-generation-inference-nix", - "rev": "fc7ff53b2cd5c984ad1434f20c271e3b7600d1c4", + "rev": "5103c3fb1f9ad1fd33b6e09ff05e957884b112d5", "type": "github" }, "original": { "owner": "huggingface", - "ref": "flashinfer-v0.2", "repo": "text-generation-inference-nix", "type": "github" } diff --git a/flake.nix b/flake.nix index a302db3ea..83cedfa62 100644 --- a/flake.nix +++ b/flake.nix @@ -5,7 +5,7 @@ inputs.nixpkgs.follows = "tgi-nix/nixpkgs"; }; nix-filter.url = "github:numtide/nix-filter"; - tgi-nix.url = "github:huggingface/text-generation-inference-nix/flashinfer-v0.2"; + tgi-nix.url = "github:huggingface/text-generation-inference-nix"; nixpkgs.follows = "tgi-nix/nixpkgs"; flake-utils.url = "github:numtide/flake-utils"; rust-overlay = { diff --git a/server/poetry.lock b/server/poetry.lock index 7cf440dd3..69133015e 100644 --- a/server/poetry.lock +++ b/server/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.5 and should not be changed by hand. 
[[package]] name = "accelerate" @@ -1289,12 +1289,12 @@ files = [ [[package]] name = "marlin-kernels" -version = "0.3.6" +version = "0.3.7" description = "Marlin quantization kernels" optional = true python-versions = ">=3.7" files = [ - {file = "marlin_kernels-0.3.6+cu123torch2.4-cp310-cp310-linux_x86_64.whl", hash = "sha256:afedaa9a15e8991442bc8c81f62833fbf5c1556ae9d7a5a9e13b747ce97beef9"}, + {file = "marlin_kernels-0.3.7+cu123torch2.4-cp310-cp310-linux_x86_64.whl", hash = "sha256:bb416d14623dc0ad0eeb2835446c37a41f994555f1baec8701de6d4c1fc17ec8"}, ] [package.dependencies] @@ -1302,16 +1302,16 @@ torch = "*" [package.source] type = "url" -url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.6/marlin_kernels-0.3.6+cu123torch2.4-cp310-cp310-linux_x86_64.whl" +url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp310-cp310-linux_x86_64.whl" [[package]] name = "marlin-kernels" -version = "0.3.6" +version = "0.3.7" description = "Marlin quantization kernels" optional = true python-versions = ">=3.7" files = [ - {file = "marlin_kernels-0.3.6+cu123torch2.4-cp311-cp311-linux_x86_64.whl", hash = "sha256:c0c05621d5e87144415d8a6e439072bd844d5f3cb55e4c4c69eabdc4c94610f4"}, + {file = "marlin_kernels-0.3.7+cu123torch2.4-cp311-cp311-linux_x86_64.whl", hash = "sha256:a89bb61d718002d4432158641bce95c6fd68f9ee1a7d5402dd283903397f3185"}, ] [package.dependencies] @@ -1319,16 +1319,16 @@ torch = "*" [package.source] type = "url" -url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.6/marlin_kernels-0.3.6+cu123torch2.4-cp311-cp311-linux_x86_64.whl" +url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp311-cp311-linux_x86_64.whl" [[package]] name = "marlin-kernels" -version = "0.3.6" +version = "0.3.7" description = "Marlin quantization kernels" optional = true python-versions = ">=3.7" files = [ - {file = "marlin_kernels-0.3.6+cu123torch2.4-cp312-cp312-linux_x86_64.whl", hash = "sha256:3be4662c8d25a3cdb1793dafe0e2e76dd600913a69a468e2c68d1fed4e149255"}, + {file = "marlin_kernels-0.3.7+cu123torch2.4-cp312-cp312-linux_x86_64.whl", hash = "sha256:ed938d196fc5e9cce9fc44cd2b889d5adc5ca7475c8a23858f1474d29e38bdbf"}, ] [package.dependencies] @@ -1336,16 +1336,16 @@ torch = "*" [package.source] type = "url" -url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.6/marlin_kernels-0.3.6+cu123torch2.4-cp312-cp312-linux_x86_64.whl" +url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp312-cp312-linux_x86_64.whl" [[package]] name = "marlin-kernels" -version = "0.3.6" +version = "0.3.7" description = "Marlin quantization kernels" optional = true python-versions = ">=3.7" files = [ - {file = "marlin_kernels-0.3.6+cu123torch2.4-cp39-cp39-linux_x86_64.whl", hash = "sha256:89eac9d46bc084a256b538afda6053683eb7e505db0e0d4f6dbeca32368caac6"}, + {file = "marlin_kernels-0.3.7+cu123torch2.4-cp39-cp39-linux_x86_64.whl", hash = "sha256:113c54f68565ad476ca12366b4de92131fa3e9ddb16cbe8ad63272972a15ac28"}, ] [package.dependencies] @@ -1353,7 +1353,7 @@ torch = "*" [package.source] type = "url" -url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.6/marlin_kernels-0.3.6+cu123torch2.4-cp39-cp39-linux_x86_64.whl" +url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp39-cp39-linux_x86_64.whl" [[package]] name = "mdurl" @@ -4097,4 +4097,4 
@@ torch = ["torch"] [metadata] lock-version = "2.0" python-versions = ">=3.9,<3.13" -content-hash = "c7fdcff2b752cd3beb3995c1ecd15f0f4d9b4e117048b06ab991c6d0e0c86ff3" +content-hash = "25f96d5dea777bfa7a959f863e35d2e05e1a6172d0dd45193dbe25ac2f32cc25" diff --git a/server/pyproject.toml b/server/pyproject.toml index 0d56e9c73..bc74a05ab 100644 --- a/server/pyproject.toml +++ b/server/pyproject.toml @@ -48,10 +48,10 @@ attention-kernels = [ { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp312-cp312-linux_x86_64.whl", python = "~3.12", optional = true }, ] marlin-kernels = [ - { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.6/marlin_kernels-0.3.6+cu123torch2.4-cp39-cp39-linux_x86_64.whl", python = "~3.9", optional = true }, - { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.6/marlin_kernels-0.3.6+cu123torch2.4-cp310-cp310-linux_x86_64.whl", python = "~3.10", optional = true }, - { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.6/marlin_kernels-0.3.6+cu123torch2.4-cp311-cp311-linux_x86_64.whl", python = "~3.11", optional = true }, - { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.6/marlin_kernels-0.3.6+cu123torch2.4-cp312-cp312-linux_x86_64.whl", python = "~3.12", optional = true }, + { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp39-cp39-linux_x86_64.whl", python = "~3.9", optional = true }, + { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp310-cp310-linux_x86_64.whl", python = "~3.10", optional = true }, + { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp311-cp311-linux_x86_64.whl", python = "~3.11", optional = true }, + { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp312-cp312-linux_x86_64.whl", python = "~3.12", optional = true }, ] moe-kernels = [ { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp39-cp39-linux_x86_64.whl", python = "~3.9", optional = true }, From 01067f8ba8bc1d378bb0fb3cd7ef6fe43fba86c0 Mon Sep 17 00:00:00 2001 From: Dmitry Dygalo Date: Fri, 10 Jan 2025 06:01:54 -0800 Subject: [PATCH 016/183] chore: Update jsonschema to 0.28.0 (#2870) * chore: Update jsonschema to 0.28.0 Signed-off-by: Dmitry Dygalo * chore: Enable blocking feature for reqwest Signed-off-by: Dmitry Dygalo --------- Signed-off-by: Dmitry Dygalo --- Cargo.lock | 214 ++++++++++++++++++++++++++++++++------- backends/v2/Cargo.toml | 2 +- backends/v3/Cargo.toml | 2 +- router/Cargo.toml | 4 +- router/src/validation.rs | 5 +- 5 files changed, 180 insertions(+), 47 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 74ae6e162..e63d15407 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -456,18 +456,18 @@ dependencies = [ [[package]] name = "bit-set" -version = "0.5.3" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0700ddab506f33b20a03b13996eccd309a48e5ff77d0d95926aa0210fb4e95f1" +checksum = "08807e080ed7f9d5433fa9b275196cfc35414f66a0c79d864dc51a0d825231a3" dependencies = [ "bit-vec", ] [[package]] name = "bit-vec" -version = "0.6.3" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"349f9b6a179ed607305526ca489b34ad0a41aed5f7980fa90eb03160b69598fb" +checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7" [[package]] name = "bit_field" @@ -502,6 +502,12 @@ dependencies = [ "generic-array", ] +[[package]] +name = "borrow-or-share" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3eeab4423108c5d7c744f4d234de88d18d636100093ae04caf4825134b9c3a32" + [[package]] name = "built" version = "0.7.5" @@ -1139,6 +1145,15 @@ version = "1.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0" +[[package]] +name = "email_address" +version = "0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e079f19b08ca6239f47f8ba8509c11cf3ea30095831f7fed61441475edd8c449" +dependencies = [ + "serde", +] + [[package]] name = "encode_unicode" version = "0.3.6" @@ -1196,12 +1211,13 @@ dependencies = [ [[package]] name = "fancy-regex" -version = "0.11.0" +version = "0.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b95f7c0680e4142284cf8b22c14a476e87d61b004a3a0861872b32ef7ead40a2" +checksum = "6e24cb5a94bcae1e5408b0effca5cd7172ea3c5755049c5f3af4cd283a165298" dependencies = [ "bit-set", - "regex", + "regex-automata 0.4.9", + "regex-syntax 0.8.5", ] [[package]] @@ -1247,6 +1263,17 @@ version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "28a80e3145d8ad11ba0995949bbcf48b9df2be62772b3d351ef017dff6ecb853" +[[package]] +name = "fluent-uri" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1918b65d96df47d3591bed19c5cca17e3fa5d0707318e4b5ef2eae01764df7e5" +dependencies = [ + "borrow-or-share", + "ref-cast", + "serde", +] + [[package]] name = "fnv" version = "1.0.7" @@ -1285,9 +1312,9 @@ dependencies = [ [[package]] name = "fraction" -version = "0.13.1" +version = "0.15.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3027ae1df8d41b4bed2241c8fdad4acc1e7af60c8e17743534b545e77182d678" +checksum = "0f158e3ff0a1b334408dc9fb811cd99b446986f4d8b741bb08f9df1604085ae7" dependencies = [ "lazy_static", "num", @@ -1414,10 +1441,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7" dependencies = [ "cfg-if", - "js-sys", "libc", "wasi", - "wasm-bindgen", ] [[package]] @@ -1573,7 +1598,7 @@ dependencies = [ "native-tls", "num_cpus", "rand", - "reqwest", + "reqwest 0.11.27", "serde", "serde_json", "thiserror", @@ -2051,15 +2076,6 @@ version = "1.70.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" -[[package]] -name = "iso8601" -version = "0.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "924e5d73ea28f59011fec52a0d12185d496a9b075d360657aed2a5707f701153" -dependencies = [ - "nom", -] - [[package]] name = "itertools" version = "0.10.5" @@ -2128,32 +2144,27 @@ dependencies = [ [[package]] name = "jsonschema" -version = "0.17.1" +version = "0.28.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a071f4f7efc9a9118dfb627a0a94ef247986e1ab8606a4c806ae2b3aa3b6978" +checksum = "74d8eb539cdb4222da29bb658cc9881aa2477b33fb1a74c5c31450395fc1a4b2" dependencies = [ "ahash", - "anyhow", - "base64 0.21.7", + "base64 0.22.1", "bytecount", - "clap 
4.5.21", + "email_address", "fancy-regex", "fraction", - "getrandom", - "iso8601", + "idna", "itoa", - "memchr", "num-cmp", "once_cell", - "parking_lot", "percent-encoding", - "regex", - "reqwest", + "referencing", + "regex-syntax 0.8.5", + "reqwest 0.12.9", "serde", "serde_json", - "time", - "url", - "uuid", + "uuid-simd", ] [[package]] @@ -2984,6 +2995,12 @@ dependencies = [ "serde_json", ] +[[package]] +name = "outref" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4030760ffd992bef45b0ae3f10ce1aba99e33464c90d14dd7c039884963ddc7a" + [[package]] name = "overload" version = "0.1.1" @@ -3557,6 +3574,39 @@ dependencies = [ "thiserror", ] +[[package]] +name = "ref-cast" +version = "1.0.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ccf0a6f84d5f1d581da8b41b47ec8600871962f2a528115b542b362d4b744931" +dependencies = [ + "ref-cast-impl", +] + +[[package]] +name = "ref-cast-impl" +version = "1.0.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bcc303e793d3734489387d205e9b186fac9c6cfacedd98cbb2e8a5943595f3e6" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.89", +] + +[[package]] +name = "referencing" +version = "0.28.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "093a875008827c0ae15c746189966e162faa05bf347719d06302c548ac63630f" +dependencies = [ + "ahash", + "fluent-uri", + "once_cell", + "percent-encoding", + "serde_json", +] + [[package]] name = "regex" version = "1.11.1" @@ -3641,6 +3691,42 @@ dependencies = [ "winreg", ] +[[package]] +name = "reqwest" +version = "0.12.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a77c62af46e79de0a562e1a9849205ffcb7fc1238876e9bd743357570e04046f" +dependencies = [ + "base64 0.22.1", + "bytes", + "futures-channel", + "futures-core", + "futures-util", + "http 1.1.0", + "http-body 1.0.1", + "http-body-util", + "hyper 1.5.1", + "hyper-util", + "ipnet", + "js-sys", + "log", + "mime", + "once_cell", + "percent-encoding", + "pin-project-lite", + "serde", + "serde_json", + "serde_urlencoded", + "sync_wrapper 1.0.2", + "tokio", + "tower-service", + "url", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", + "windows-registry", +] + [[package]] name = "rgb" version = "0.8.50" @@ -4220,6 +4306,9 @@ name = "sync_wrapper" version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0bf256ce5efdfa370213c1dabab5935a12e49f2c58d15e9eac2870d3b4f27263" +dependencies = [ + "futures-core", +] [[package]] name = "synstructure" @@ -4404,7 +4493,7 @@ dependencies = [ "once_cell", "pyo3", "regex", - "reqwest", + "reqwest 0.11.27", "serde", "serde_json", "thiserror", @@ -4445,7 +4534,7 @@ dependencies = [ "pyo3", "rand", "regex", - "reqwest", + "reqwest 0.11.27", "serde", "serde_json", "sysinfo", @@ -4493,7 +4582,7 @@ dependencies = [ "prost-build", "rand", "regex", - "reqwest", + "reqwest 0.11.27", "serde", "serde_json", "slotmap", @@ -4544,7 +4633,7 @@ dependencies = [ "prost-build", "rand", "regex", - "reqwest", + "reqwest 0.11.27", "serde", "serde_json", "slotmap", @@ -5298,6 +5387,17 @@ dependencies = [ "syn 2.0.89", ] +[[package]] +name = "uuid-simd" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "23b082222b4f6619906941c17eb2297fff4c2fb96cb60164170522942a200bd8" +dependencies = [ + "outref", + "uuid", + "vsimd", +] + [[package]] name = "v_frame" version = "0.3.8" @@ -5349,6 +5449,12 @@ version = 
"0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" +[[package]] +name = "vsimd" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c3082ca00d5a5ef149bb8b555a72ae84c9c59f7250f013ac822ac2e49b19c64" + [[package]] name = "walkdir" version = "2.5.0" @@ -5558,6 +5664,36 @@ dependencies = [ "windows-targets 0.52.6", ] +[[package]] +name = "windows-registry" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e400001bb720a623c1c69032f8e3e4cf09984deec740f007dd2b03ec864804b0" +dependencies = [ + "windows-result", + "windows-strings", + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-result" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d1043d8214f791817bab27572aaa8af63732e11bf84aa21a45a78d6c317ae0e" +dependencies = [ + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-strings" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4cd9b125c486025df0eabcb585e62173c6c9eddcec5d117d3b6e8c30e2ee4d10" +dependencies = [ + "windows-result", + "windows-targets 0.52.6", +] + [[package]] name = "windows-sys" version = "0.45.0" diff --git a/backends/v2/Cargo.toml b/backends/v2/Cargo.toml index 4d32474e7..0decf41ad 100644 --- a/backends/v2/Cargo.toml +++ b/backends/v2/Cargo.toml @@ -23,7 +23,7 @@ clap = { version = "4.4.5", features = ["derive", "env"] } grpc-metadata = { path = "../grpc-metadata" } futures = "0.3.28" hf-hub = { workspace = true } -jsonschema = { version = "0.17.1", features = ["draft202012"] } +jsonschema = { version = "0.28.0" } metrics = { workspace = true } metrics-exporter-prometheus = { workspace = true } nohash-hasher = "0.2.0" diff --git a/backends/v3/Cargo.toml b/backends/v3/Cargo.toml index 69dad072f..996290ed3 100644 --- a/backends/v3/Cargo.toml +++ b/backends/v3/Cargo.toml @@ -23,7 +23,7 @@ clap = { version = "4.4.5", features = ["derive", "env"] } grpc-metadata = { path = "../grpc-metadata" } futures = "0.3.28" hf-hub = { workspace = true } -jsonschema = { version = "0.17.1", features = ["draft202012"] } +jsonschema = { version = "0.28.0" } metrics = { workspace = true } metrics-exporter-prometheus = { workspace = true } nohash-hasher = "0.2.0" diff --git a/router/Cargo.toml b/router/Cargo.toml index 9258fe035..2e621dfc6 100644 --- a/router/Cargo.toml +++ b/router/Cargo.toml @@ -17,7 +17,7 @@ clap = { version = "4.4.5", features = ["derive", "env"] } futures = "0.3.28" hf-hub = { workspace = true } itertools = "0.10" -jsonschema = { version = "0.17.1", features = ["draft202012"] } +jsonschema = { version = "0.28.0" } metrics = { workspace = true } metrics-exporter-prometheus = { workspace = true } nohash-hasher = "0.2.0" @@ -25,7 +25,7 @@ opentelemetry = { version = "0.20.0", features = ["rt-tokio"] } opentelemetry-otlp = "0.13.0" outlines-core = { git = "https://github.com/dottxt-ai/outlines-core.git", rev = "ba10c619fc9bf3c487e43f49bdecb95a24bb465c" } rand = "0.8.5" -reqwest = { version = "0.11.20", features = [] } +reqwest = { version = "0.11.20", features = ["blocking"] } serde = "1.0.188" serde_json = "1.0.107" thiserror = "1.0.48" diff --git a/router/src/validation.rs b/router/src/validation.rs index 6d5b06bd3..cdf954de2 100644 --- a/router/src/validation.rs +++ b/router/src/validation.rs @@ -7,7 +7,6 @@ use crate::{ use crate::{PyTokenizer, Tokenizer}; use 
base64::{engine::general_purpose::STANDARD, Engine}; use image::{ImageFormat, ImageReader}; -use jsonschema::{Draft, JSONSchema}; use outlines_core::json_schema::to_regex as json_schema_to_regex; use rand::{thread_rng, Rng}; use serde_json::Value; @@ -355,9 +354,7 @@ impl Validation { }?; // Check if the json is a valid JSONSchema - JSONSchema::options() - .with_draft(Draft::Draft202012) - .compile(&json) + jsonschema::draft202012::meta::validate(&json) .map_err(|e| ValidationError::InvalidGrammar(e.to_string()))?; // The schema can be valid but lack properties. From 83624a07be8894dcdb9d35c8519201b23ccb07c9 Mon Sep 17 00:00:00 2001 From: lazariv Date: Fri, 10 Jan 2025 16:12:02 +0100 Subject: [PATCH 017/183] Add possible variants for A100 and H100 GPUs for auto-detecting flops (#2837) * Update main.rs with A100 and H100 variants * Add another variant "nvidia-h100-nvl" * Update main.rs Add nvidia-a100-sxm4-40gb --- launcher/src/main.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/launcher/src/main.rs b/launcher/src/main.rs index fb6ba2b25..c092d7451 100644 --- a/launcher/src/main.rs +++ b/launcher/src/main.rs @@ -1652,7 +1652,11 @@ impl From<&str> for Gpu { "nvidia-l40s" => Gpu::L40S, "nvidia-a10g" => Gpu::A10G, "nvidia-h100-80gb-hbm3" => Gpu::H100, + "nvidia-h100-nvl" => Gpu::H100, + "nvidia-h100" => Gpu::H100, "nvidia-a100-sxm4-80gb" => Gpu::A100, + "nvidia-a100-sxm4-40gb" => Gpu::A100, + "nvidia-a100-80gb-pcie" => Gpu::A100, "nvidia-a100" => Gpu::A100, card => Gpu::Unknown(card.to_string()), } From 2e22164d4a3430c51792b041debd04a35f2ceccb Mon Sep 17 00:00:00 2001 From: Nicholas Broad Date: Mon, 13 Jan 2025 02:09:35 -0800 Subject: [PATCH 018/183] Update using_guidance.md (#2901) deletes one copy of a sentence that repeated twice --- docs/source/basic_tutorials/using_guidance.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/docs/source/basic_tutorials/using_guidance.md b/docs/source/basic_tutorials/using_guidance.md index 2d55c9528..e389fbbc2 100644 --- a/docs/source/basic_tutorials/using_guidance.md +++ b/docs/source/basic_tutorials/using_guidance.md @@ -187,8 +187,6 @@ In addition to the grammar parameter, we've also introduced a set of tools and f Tools are a set of user defined functions that can be used in tandem with the chat functionality to enhance the LLM's capabilities. Functions, similar to grammar are defined as JSON schema and can be passed as part of the parameters to the Messages API. -Functions, similar to grammar are defined as JSON schema and can be passed as part of the parameters to the Messages API. - ```json curl localhost:3000/v1/chat/completions \ -X POST \ From 1660154ae656e18244261df6244c4daf15e5a38a Mon Sep 17 00:00:00 2001 From: "Wang, Yi" Date: Mon, 13 Jan 2025 18:11:31 +0800 Subject: [PATCH 019/183] fix crash in torch2.6 if TP=1 (#2885) error like "ValueError: Expecting a ProcessGroup, but got a . 
rank=0" Signed-off-by: Wang, Yi A --- server/text_generation_server/utils/dist.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/server/text_generation_server/utils/dist.py b/server/text_generation_server/utils/dist.py index 82aeba6ce..1b766ddf7 100644 --- a/server/text_generation_server/utils/dist.py +++ b/server/text_generation_server/utils/dist.py @@ -1,6 +1,6 @@ import os import torch - +from torch.distributed import ProcessGroup from datetime import timedelta from loguru import logger from text_generation_server.utils.import_utils import SYSTEM @@ -18,10 +18,11 @@ class FakeBarrier: pass -class FakeGroup: +class FakeGroup(ProcessGroup): def __init__(self, rank, size): self._rank = rank self._size = size + super().__init__(rank, size) def allreduce(self, *args, **kwargs): return FakeBarrier() From 880ab9c2f344fe6b97e35dbf5bfbafc0a8736cb9 Mon Sep 17 00:00:00 2001 From: Mohit Sharma Date: Mon, 13 Jan 2025 15:42:35 +0530 Subject: [PATCH 020/183] Add Flash decoding kernel ROCm (#2855) * (vllm) updated vllm rocm kernels * revert silu * update partition size * remove grouped_topk * (nit) remove log * add flash decoding --- server/Makefile-flash-att-v2 | 2 +- .../layers/attention/rocm.py | 48 +++++++++++++++++-- .../models/flash_causal_lm.py | 14 ++++-- 3 files changed, 53 insertions(+), 11 deletions(-) diff --git a/server/Makefile-flash-att-v2 b/server/Makefile-flash-att-v2 index a9cdf7822..9a946d97f 100644 --- a/server/Makefile-flash-att-v2 +++ b/server/Makefile-flash-att-v2 @@ -1,5 +1,5 @@ flash_att_v2_commit_cuda := v2.6.1 -flash_att_v2_commit_rocm := 2092111b9f975b3347c652ff7fabd431130256c4 +flash_att_v2_commit_rocm := 47bd46e0204a95762ae48712fd1a3978827c77fd build-flash-attention-v2-cuda: pip install -U packaging wheel diff --git a/server/text_generation_server/layers/attention/rocm.py b/server/text_generation_server/layers/attention/rocm.py index 0cfac25bd..69a245ad5 100644 --- a/server/text_generation_server/layers/attention/rocm.py +++ b/server/text_generation_server/layers/attention/rocm.py @@ -5,6 +5,10 @@ from text_generation_server.layers.attention.kv_cache import KVCache, KVScales from text_generation_server.utils.import_utils import SYSTEM from text_generation_server.layers.attention import Seqlen from text_generation_server.utils.log import log_master +from text_generation_server.models.globals import ( + ATTENTION, + BLOCK_SIZE, +) from loguru import logger import vllm._custom_ops as ops @@ -73,11 +77,44 @@ def paged_attention( # limitations under the License. 
# + if ATTENTION == "flashdecoding": + max_q = 1 + max_k = max_s + import flash_attn_2_cuda + + if softcap is None: + softcap = 0.0 + out = flash_attn_2_cuda.varlen_fwd( + query, + kv_cache.key, + kv_cache.value, + None, + seqlen.cu_seqlen_q, + seqlen.cu_seqlen_k, + None, # pad_k + None, + block_tables, + None, + max_q, + max_k, + 0.0, # dropout + softmax_scale, + False, # zero_tensors + True, # causal + -1, # Window_left + -1, # Window right + softcap, + False, # return softmax + None, # generator + ) + return out[0] + if softcap is not None: raise RuntimeError("Paged attention doesn't support softcapping") # value_cache => [num_blocks, num_heads, head_size, block_size] - block_size = kv_cache.value.shape[3] + # block_size = kv_cache.value.shape[3] + block_size = BLOCK_SIZE num_seqs, num_heads, head_size = query.shape num_kv_heads = kv_cache.key.shape[1] @@ -247,14 +284,15 @@ def attention( # We do not need to check window_size_left (not supported) here, so it is already checked ahead of time at model load. return flash_attn_2_cuda.varlen_fwd( query, - key, - value, + # flashdecoding: pass the KV caches, paged: pass the KV. + kv_cache.key if ATTENTION == "flashdecoding" else key, + kv_cache.value if ATTENTION == "flashdecoding" else value, out, seqlen.cu_seqlen_q, - seqlen.cu_seqlen_q, - None, + seqlen.cu_seqlen_k, None, None, + block_tables if ATTENTION == "flashdecoding" else None, None, seqlen.max_q, seqlen.max_k, diff --git a/server/text_generation_server/models/flash_causal_lm.py b/server/text_generation_server/models/flash_causal_lm.py index 19fda9f1c..03fc61472 100644 --- a/server/text_generation_server/models/flash_causal_lm.py +++ b/server/text_generation_server/models/flash_causal_lm.py @@ -1663,7 +1663,7 @@ class FlashCausalLM(Model): for seqlen in tuning_sequences: log_master(logger.info, f"Warming up TunableOp for seqlen={seqlen}") - self.tunableop_warmup(seqlen) + self.tunableop_warmup(seqlen, max_total_tokens) torch.cuda.tunable.write_file(tunableop_filepath) if os.environ.get("PYTORCH_TUNABLEOP_TUNING_AFTER_WARMUP") != "1": torch.cuda.tunable.tuning_enable(False) @@ -1710,7 +1710,7 @@ class FlashCausalLM(Model): assert max_total_tokens is not None return int(num_blocks * BLOCK_SIZE), max_input_tokens, max_total_tokens - def tunableop_warmup(self, seqlen: int): + def tunableop_warmup(self, seqlen: int, max_bt: int): input_ids = torch.zeros(seqlen, dtype=torch.int64, device=self.device) position_ids = torch.zeros(seqlen, dtype=torch.int32, device=self.device) slots = torch.arange(seqlen, dtype=torch.int64, device=self.device) @@ -1724,11 +1724,15 @@ class FlashCausalLM(Model): [0, seqlen], device=self.device, dtype=torch.int32 ) max_s = seqlen + + block_tables = torch.arange( + max_bt, dtype=torch.int32, device=self.device + ).repeat(seqlen) + block_tables = block_tables.reshape((seqlen, max_bt)) + seqlen = Seqlen( input_lengths=input_lengths, cache_lengths=cache_lengths_tensor, - cu_seqlen_q=cu_seqlen_prefill, - max_q=1, max_k=seqlen, ) @@ -1738,7 +1742,7 @@ class FlashCausalLM(Model): position_ids=position_ids, cu_seqlen_prefill=cu_seqlen_prefill, kv_cache=self.kv_cache, - block_tables=None, + block_tables=block_tables, seqlen=seqlen, slots=slots, max_s=max_s, From e07acc7f68c6271ce675010ec607cca9449635c0 Mon Sep 17 00:00:00 2001 From: Mohit Sharma Date: Wed, 15 Jan 2025 11:38:58 +0530 Subject: [PATCH 021/183] Enable FP8 Per-Tensor Scales and Integrate Marlin/MoE Kernels Repo for ROCm (#2825) * (feat) convert tscales to tensorwise * (fix) fp8 scaling for cuda * (kernel) 
add marlin-kernels * add moe-kernels * fix moe kernel comit * fix scaling * nm changes --- Dockerfile_amd | 12 ++ .../layers/attention/rocm.py | 6 +- .../layers/compressed_tensors/w8an_fp.py | 40 ++++-- server/text_generation_server/layers/fp8.py | 125 +++++++++++++----- 4 files changed, 134 insertions(+), 49 deletions(-) diff --git a/Dockerfile_amd b/Dockerfile_amd index dc748f490..1f34ffa30 100644 --- a/Dockerfile_amd +++ b/Dockerfile_amd @@ -268,6 +268,15 @@ COPY server/exllamav2_kernels/ . RUN python setup.py build +FROM kernel-builder AS marlin-kernels +WORKDIR /usr/src +ENV MARLIN_KERNELS_BRANCH=v0.3.6 +ENV VLLM_TARGET_DEVICE=rocm +RUN git clone https://github.com/danieldk/marlin-kernels.git && \ + cd marlin-kernels && \ + git checkout ${MARLIN_KERNELS_BRANCH} && \ + python setup.py install + FROM kernel-builder AS moe-kernels WORKDIR /usr/src ENV MOE_KERNELS_BRANCH=a67b35841774b2056a73806c36661134b5054edd @@ -299,6 +308,9 @@ COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311 # Copy build artifacts from exllamav2 kernels builder COPY --from=exllamav2-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages +# Copy build artifacts from marlin kernels +COPY --from=marlin-kernels /usr/src/marlin-kernels/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages + # Copy build artifacts from moe kernels COPY --from=moe-kernels /usr/src/moe-kernels/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages diff --git a/server/text_generation_server/layers/attention/rocm.py b/server/text_generation_server/layers/attention/rocm.py index 69a245ad5..b94b737dc 100644 --- a/server/text_generation_server/layers/attention/rocm.py +++ b/server/text_generation_server/layers/attention/rocm.py @@ -163,17 +163,17 @@ def paged_attention( else: # Run PagedAttention V2. 
assert _PARTITION_SIZE % block_size == 0 - tmp_output = torch.empty( + tmp_output = torch.zeros( size=(num_seqs, num_heads, max_num_partitions, head_size), dtype=out.dtype, device=out.device, ) - exp_sums = torch.empty( + exp_sums = torch.zeros( size=(num_seqs, num_heads, max_num_partitions), dtype=torch.float32, device=out.device, ) - max_logits = torch.empty_like(exp_sums) + max_logits = torch.zeros_like(exp_sums) if not use_custom: ops.paged_attention_v2( diff --git a/server/text_generation_server/layers/compressed_tensors/w8an_fp.py b/server/text_generation_server/layers/compressed_tensors/w8an_fp.py index e63c52123..15bdce08e 100644 --- a/server/text_generation_server/layers/compressed_tensors/w8an_fp.py +++ b/server/text_generation_server/layers/compressed_tensors/w8an_fp.py @@ -3,8 +3,14 @@ from typing import List, Optional, Union import torch from compressed_tensors.quantization import QuantizationArgs, QuantizationType -from text_generation_server.layers.fp8 import Fp8Weight, _load_scalar_or_matrix_scale +from text_generation_server.layers.fp8 import ( + Fp8Weight, + _load_scalar_or_matrix_scale, + requantize_with_max_scale, + normalize_e4m3fn_to_e4m3fnuz, +) from text_generation_server.utils.weights import Weights, WeightsLoader +from text_generation_server.utils.import_utils import SYSTEM class W8ANFpLoader(WeightsLoader): @@ -47,11 +53,10 @@ class W8ANFpLoader(WeightsLoader): weight_scale = None if self.load_weight_scale: - weight_scale = ( - weights.get_tensor(f"{prefix}.weight_scale", to_dtype=False) - .reshape(-1) - .expand(w.shape[0]) - ) + weight_scale = weights.get_tensor(f"{prefix}.weight_scale", to_dtype=False) + + if SYSTEM == "cuda": + weight_scale = weight_scale.reshape(-1).expand(w.shape[0]) input_scale = None if self.load_input_scale: @@ -87,7 +92,8 @@ class W8ANFpLoader(WeightsLoader): block_sizes=block_sizes, to_dtype=False, ) - weight_scale = weight_scale.reshape(-1).expand(w.shape[0]) + if SYSTEM == "cuda": + weight_scale = weight_scale.reshape(-1).expand(w.shape[0]) input_scale = None if self.load_input_scale: @@ -141,6 +147,17 @@ class W8ANFpLoader(WeightsLoader): else None ) + if self.load_weight_scale or SYSTEM == "rocm": + w, weight_scale, input_scale = normalize_e4m3fn_to_e4m3fnuz( + w, weight_scale, input_scale + ) + + if weight_scale.numel() == len(prefixes): + logical_widths = [x[0] for x in shapes] + w, weight_scale = requantize_with_max_scale( + w, weight_scale.to(weights.device), logical_widths, weights.dtype + ) + return Fp8Weight( weight=w, weight_scale=weight_scale, @@ -153,11 +170,10 @@ class W8ANFpLoader(WeightsLoader): w = weights.get_sharded(f"{prefix}.weight", dim=1) weight_scale = None if self.load_weight_scale: - weight_scale = ( - weights.get_tensor(f"{prefix}.weight_scale", to_dtype=False) - .reshape(-1) - .expand(w.shape[0]) - ) + weight_scale = weights.get_tensor(f"{prefix}.weight_scale", to_dtype=False) + + if SYSTEM == "cuda": + weight_scale = weight_scale.reshape(-1).expand(w.shape[0]) input_scale = None if self.load_input_scale: diff --git a/server/text_generation_server/layers/fp8.py b/server/text_generation_server/layers/fp8.py index 1e5c8b3d6..4e83ec9d0 100644 --- a/server/text_generation_server/layers/fp8.py +++ b/server/text_generation_server/layers/fp8.py @@ -19,6 +19,9 @@ try: except ImportError: marlin_kernels = None +quant_dtype: torch.dtype = ( + torch.float8_e4m3fnuz if SYSTEM == "rocm" else torch.float8_e4m3fn +) if SYSTEM == "cuda" and marlin_kernels is not None: major, minor = torch.cuda.get_device_capability() @@ 
-60,25 +63,58 @@ def normalize_e4m3fn_to_e4m3fnuz( weight_scale: torch.Tensor, input_scale: Optional[torch.Tensor] = None, ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: - assert weight.dtype == torch.float8_e4m3fn - # The bits pattern 10000000(-128) represents zero in e4m3fn - # but NaN in e4m3fnuz. So here we set it to 0. - # https://onnx.ai/onnx/technical/float8.html - weight_as_int8 = weight.view(torch.int8) - ROCM_FP8_NAN_AS_INT = -128 - weight_as_int8[weight_as_int8 == ROCM_FP8_NAN_AS_INT] = 0 - weight = weight_as_int8.view(torch.float8_e4m3fnuz) + if weight.dtype == torch.float8_e4m3fn: + # The bits pattern 10000000(-128) represents zero in e4m3fn + # but NaN in e4m3fnuz. So here we set it to 0. + # https://onnx.ai/onnx/technical/float8.html + weight_as_int8 = weight.view(torch.int8) + ROCM_FP8_NAN_AS_INT = -128 + weight_as_int8[weight_as_int8 == ROCM_FP8_NAN_AS_INT] = 0 + weight = weight_as_int8.view(torch.float8_e4m3fnuz) - # For the same bits representation, e4m3fnuz value is half of - # the e4m3fn value, so we should double the scaling factor to - # get the same dequantized value. - # https://onnx.ai/onnx/technical/float8.html - weight_scale = weight_scale * 2.0 - if input_scale is not None: - input_scale = input_scale * 2.0 + # For the same bits representation, e4m3fnuz value is half of + # the e4m3fn value, so we should double the scaling factor to + # get the same dequantized value. + # https://onnx.ai/onnx/technical/float8.html + weight_scale = weight_scale * 2.0 + if input_scale is not None: + input_scale = input_scale * 2.0 return weight, weight_scale, input_scale +def per_tensor_dequantize( + tensor: torch.Tensor, + inv_scale: Union[float, torch.Tensor], + dtype: torch.dtype = torch.float16, +) -> torch.Tensor: + fake_qweight = tensor.to(dtype) + dq_weight = fake_qweight * inv_scale + return dq_weight + + +def requantize_with_max_scale( + weight: torch.Tensor, + weight_scale: torch.Tensor, + logical_widths: int, + dtype: torch.dtype, +) -> Tuple[torch.Tensor, torch.Tensor]: + # Max scale to be used for requanitzation. + max_w_scale = weight_scale.max().float() + + start = 0 + for idx, logical_width in enumerate(logical_widths): + end = start + logical_width + weight_dq = per_tensor_dequantize( + weight[start:end, :], weight_scale[idx], dtype + ) + weight[start:end, :], max_w_scale_normalized = fp8_quantize( + weight_dq, max_w_scale + ) + start = end + + return weight, max_w_scale_normalized + + def fp8_quantize( weight: torch.Tensor, scale: Optional[torch.Tensor] = None, @@ -96,7 +132,7 @@ def fp8_quantize( shape = weight.shape qweight, scale = marlin_kernels.scaled_fp8_quant( weight.reshape(-1, shape[-1]), - dtype=qdtype, + dtype=quant_dtype, scale=scale, scale_ub=scale_upper_bound, # TODO: don't do this when we have to use the Torch kernel. @@ -116,6 +152,8 @@ def fp8_quantize( qweight = (weight * scale).clamp(min=finfo.min, max=finfo.max) scale = scale.float().reciprocal() else: + if SYSTEM == "rocm": + scale = scale / 2.0 # Use reciprocal to avoid more expensive division. 
qweight = (weight * scale.reciprocal()).clamp(min=finfo.min, max=finfo.max) @@ -141,17 +179,18 @@ class HybridFP8UnquantLoader(WeightsLoader): if w.dtype == torch.float8_e4m3fn: # FP8 branch - scale = ( - weights.get_tensor(f"{prefix}.weight_scale", to_dtype=False) - .reshape(-1) - .expand(w.shape[0]) - ) + scale = weights.get_tensor(f"{prefix}.weight_scale", to_dtype=False) + + if SYSTEM == "cuda": + scale.reshape(-1).expand(w.shape[0]) input_scale = None if weights.has_tensor(f"{prefix}.input_scale"): - input_scale = weights.get_tensor( - f"{prefix}.input_scale", to_dtype=False - ).reshape(-1) + input_scale = ( + weights.get_tensor(f"{prefix}.input_scale", to_dtype=False) + .reshape(-1) + .max() + ) return Fp8Weight( weight=w, @@ -178,6 +217,7 @@ class HybridFP8UnquantLoader(WeightsLoader): if w.dtype == torch.float8_e4m3fn: # FP8 branch scale = weights.get_tensor(f"{prefix}.weight_scale", to_dtype=False) + if scale.numel() > 1: scale = weights.get_packed_sharded( f"{prefix}.weight_scale", @@ -185,7 +225,8 @@ class HybridFP8UnquantLoader(WeightsLoader): block_sizes=block_sizes, to_dtype=False, ) - scale = scale.reshape(-1).expand(w.shape[0]) + if SYSTEM == "cuda": + scale = scale.reshape(-1).expand(w.shape[0]) input_scale = None if weights.has_tensor(f"{prefix}.input_scale"): @@ -243,6 +284,17 @@ class HybridFP8UnquantLoader(WeightsLoader): else None ) + if SYSTEM == "rocm": + w, scale, input_scale = normalize_e4m3fn_to_e4m3fnuz( + w, scale, input_scale + ) + + if scale.numel() == len(prefixes): + logical_widths = [x[0] for x in shapes] + w, scale = requantize_with_max_scale( + w, scale.to(weights.device), logical_widths, weights.dtype + ) + return Fp8Weight( weight=w, weight_scale=scale, @@ -259,16 +311,18 @@ class HybridFP8UnquantLoader(WeightsLoader): w = weights.get_sharded(f"{prefix}.weight", dim=1) # FP8 branch if w.dtype == torch.float8_e4m3fn: - scale = ( - weights.get_tensor(f"{prefix}.weight_scale", to_dtype=False) - .reshape(-1) - .expand(w.shape[0]) - ) + scale = weights.get_tensor(f"{prefix}.weight_scale", to_dtype=False) + + if SYSTEM == "cuda": + scale = scale.reshape(-1).expand(w.shape[0]) + input_scale = None if weights.has_tensor(f"{prefix}.input_scale"): - input_scale = weights.get_tensor( - f"{prefix}.input_scale", to_dtype=False - ).reshape(-1) + input_scale = ( + weights.get_tensor(f"{prefix}.input_scale", to_dtype=False) + .reshape(-1) + .max() + ) return Fp8Weight( weight=w, @@ -326,7 +380,7 @@ class Fp8Linear(torch.nn.Module): if CUTLASS_FP8_AVAILABLE: log_once(logger.info, "Using cutlass w8a8 kernels") if SYSTEM == "rocm" and qweight.dtype == torch.float8_e4m3fn: - qweight, scale, _ = normalize_e4m3fn_to_e4m3fnuz( + qweight, scale, input_scale = normalize_e4m3fn_to_e4m3fnuz( weight=qweight, weight_scale=scale ) @@ -443,6 +497,9 @@ class Fp8Linear(torch.nn.Module): def _load_scalar_or_matrix_scale(weights: Weights, prefix: str, shape: torch.Size): scale = weights.get_tensor(prefix, to_dtype=False) + if scale.numel() > 1: scale = weights.get_sharded(prefix, dim=0, to_dtype=False) + elif SYSTEM == "rocm": + return scale.reshape(-1) return scale.reshape(-1).expand(shape[0]) From cc8b9650bdae3125c70d979c19e465080b32789c Mon Sep 17 00:00:00 2001 From: "Wang, Yi" Date: Wed, 15 Jan 2025 22:56:52 +0800 Subject: [PATCH 022/183] Baichuan2-13B does not have max_position_embeddings in config (#2903) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Baichuan2-13B does not have max_position_embeddings in config see 
https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat/blob/main/config.json Signed-off-by: Wang, Yi A * Update server/text_generation_server/models/flash_causal_lm.py Co-authored-by: Daniël de Kok * fmt Signed-off-by: Wang, Yi A --------- Signed-off-by: Wang, Yi A Co-authored-by: Daniël de Kok --- server/text_generation_server/models/flash_causal_lm.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/server/text_generation_server/models/flash_causal_lm.py b/server/text_generation_server/models/flash_causal_lm.py index 03fc61472..d097c54fc 100644 --- a/server/text_generation_server/models/flash_causal_lm.py +++ b/server/text_generation_server/models/flash_causal_lm.py @@ -1595,7 +1595,9 @@ class FlashCausalLM(Model): if max_total_tokens is None: if get_support_chunking(): model_max_length = self.tokenizer.model_max_length - max_position_embeddings = self.config.max_position_embeddings + max_position_embeddings = getattr( + self.config, "max_position_embeddings", model_max_length + ) max_total_tokens = min( num_blocks * BLOCK_SIZE, model_max_length, max_position_embeddings ) From 3c7ae48f7feb33bd379e101290c908cbf094ceee Mon Sep 17 00:00:00 2001 From: Guspan Tanadi <36249910+guspan-tanadi@users.noreply.github.com> Date: Wed, 15 Jan 2025 22:05:54 +0700 Subject: [PATCH 023/183] docs(conceptual/speculation): available links Train Medusa (#2863) --- docs/source/conceptual/speculation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/conceptual/speculation.md b/docs/source/conceptual/speculation.md index 45618ae3f..74e010c8a 100644 --- a/docs/source/conceptual/speculation.md +++ b/docs/source/conceptual/speculation.md @@ -27,7 +27,7 @@ You can check a few existing fine-tunes for popular models: - [text-generation-inference/Mistral-7B-Instruct-v0.2-medusa](https://huggingface.co/text-generation-inference/Mistral-7B-Instruct-v0.2-medusa) -In order to create your own medusa heads for your own finetune, you should check own the original medusa repo. [../basic_tutorials/train_medusa.md](../basic_tutorials/train_medusa.md) +In order to create your own medusa heads for your own finetune, you should check own the original medusa repo. Read for more in [Train Medusa](../basic_tutorials/train_medusa#training). In order to use medusa models in TGI, simply point to a medusa enabled model, and everything will load automatically. 
From dc9b8e981425e08ca9be7dcb23412449cdb3d8f8 Mon Sep 17 00:00:00 2001 From: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com> Date: Wed, 15 Jan 2025 16:07:10 +0100 Subject: [PATCH 024/183] Fix `docker run` in `README.md` (#2861) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Fix `docker run` in `README.md` * Add line-break in `docker run` for readability Co-authored-by: Daniël de Kok * Add line-break in `docker run` for readability Co-authored-by: Daniël de Kok --------- Co-authored-by: Daniël de Kok --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 31966ddbd..9842a2a71 100644 --- a/README.md +++ b/README.md @@ -84,7 +84,7 @@ model=HuggingFaceH4/zephyr-7b-beta volume=$PWD/data docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data \ -3.0.0 ghcr.io/huggingface/text-generation-inference:3.0.0 --model-id $model + ghcr.io/huggingface/text-generation-inference:3.0.0 --model-id $model ``` And then you can make requests like @@ -151,7 +151,8 @@ model=meta-llama/Meta-Llama-3.1-8B-Instruct volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run token= -docker run --gpus all --shm-size 1g -e HF_TOKEN=$token -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.0.0 --model-id $model +docker run --gpus all --shm-size 1g -e HF_TOKEN=$token -p 8080:80 -v $volume:/data \ + ghcr.io/huggingface/text-generation-inference:3.0.0 --model-id $model ``` ### A note on Shared Memory (shm) From 46994b34fb5231b00328f19ed9c6f784b9d61db1 Mon Sep 17 00:00:00 2001 From: Baptiste Colle <32412211+baptistecolle@users.noreply.github.com> Date: Wed, 15 Jan 2025 16:26:11 +0100 Subject: [PATCH 025/183] :memo: add guide on using TPU with TGI in the docs (#2907) --- docs/source/_toctree.yml | 2 ++ docs/source/installation_tpu.md | 3 +++ 2 files changed, 5 insertions(+) create mode 100644 docs/source/installation_tpu.md diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index 4503424bb..ab85682b7 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -13,6 +13,8 @@ title: Using TGI with Intel Gaudi - local: installation_inferentia title: Using TGI with AWS Inferentia + - local: installation_tpu + title: Using TGI with Google TPU - local: installation_intel title: Using TGI with Intel GPUs - local: installation diff --git a/docs/source/installation_tpu.md b/docs/source/installation_tpu.md new file mode 100644 index 000000000..208ebce3c --- /dev/null +++ b/docs/source/installation_tpu.md @@ -0,0 +1,3 @@ +# Using TGI with Google TPU + +Check out this [guide](https://huggingface.co/docs/optimum-tpu) on how to serve models with TGI on TPUs. From 203cade2448dcb05c53f67ec3f2236254c735c8c Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Wed, 15 Jan 2025 17:04:03 +0100 Subject: [PATCH 026/183] Upgrading our rustc version. (#2908) * Upgrading our rustc version. * Fixing the rust tests to proper version. * Clippy everything. 
--- .github/workflows/tests.yaml | 2 +- Dockerfile | 2 +- Dockerfile_amd | 2 +- Dockerfile_intel | 2 +- Dockerfile_trtllm | 2 +- backends/grpc-metadata/src/lib.rs | 2 +- backends/v2/src/queue.rs | 11 ++++----- backends/v3/src/block_allocator.rs | 4 ++-- backends/v3/src/queue.rs | 3 +-- backends/v3/src/radix.rs | 2 +- flake.lock | 12 +++++----- launcher/src/main.rs | 36 +++++++++++++----------------- router/src/lib.rs | 2 +- router/src/validation.rs | 15 +++++-------- rust-toolchain.toml | 2 +- 15 files changed, 43 insertions(+), 56 deletions(-) diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index 4eeca3348..6bcf7d962 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -31,7 +31,7 @@ jobs: with: # Released on: 02 May, 2024 # https://releases.rs/docs/1.78.0/ - toolchain: 1.80.0 + toolchain: 1.84.0 override: true components: rustfmt, clippy - name: Install Protoc diff --git a/Dockerfile b/Dockerfile index 0c08d48f6..0f2ae6cc8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,5 @@ # Rust builder -FROM lukemathwalker/cargo-chef:latest-rust-1.80.1 AS chef +FROM lukemathwalker/cargo-chef:latest-rust-1.84.0 AS chef WORKDIR /usr/src ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse diff --git a/Dockerfile_amd b/Dockerfile_amd index 1f34ffa30..92acff5aa 100644 --- a/Dockerfile_amd +++ b/Dockerfile_amd @@ -1,5 +1,5 @@ # Rust builder -FROM lukemathwalker/cargo-chef:latest-rust-1.80.1 AS chef +FROM lukemathwalker/cargo-chef:latest-rust-1.84.0 AS chef WORKDIR /usr/src ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse diff --git a/Dockerfile_intel b/Dockerfile_intel index 3b5e4a131..2b41fd8bc 100644 --- a/Dockerfile_intel +++ b/Dockerfile_intel @@ -1,6 +1,6 @@ ARG PLATFORM=xpu -FROM lukemathwalker/cargo-chef:latest-rust-1.80.1 AS chef +FROM lukemathwalker/cargo-chef:latest-rust-1.84.0 AS chef WORKDIR /usr/src ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse diff --git a/Dockerfile_trtllm b/Dockerfile_trtllm index b4523ea59..e6a16eccd 100644 --- a/Dockerfile_trtllm +++ b/Dockerfile_trtllm @@ -2,7 +2,7 @@ ARG CUDA_ARCH_LIST="75-real;80-real;86-real;89-real;90-real" ARG OMPI_VERSION="4.1.7rc1" # Build dependencies resolver stage -FROM lukemathwalker/cargo-chef:latest AS chef +FROM lukemathwalker/cargo-chef:latest-rust-1.84.0 AS chef WORKDIR /usr/src/text-generation-inference/backends/trtllm FROM chef AS planner diff --git a/backends/grpc-metadata/src/lib.rs b/backends/grpc-metadata/src/lib.rs index 3068a61c3..822b03072 100644 --- a/backends/grpc-metadata/src/lib.rs +++ b/backends/grpc-metadata/src/lib.rs @@ -8,7 +8,7 @@ use tracing_opentelemetry::OpenTelemetrySpanExt; /// Inject context in the metadata of a gRPC request. struct MetadataInjector<'a>(pub &'a mut tonic::metadata::MetadataMap); -impl<'a> Injector for MetadataInjector<'a> { +impl Injector for MetadataInjector<'_> { /// Set a key and value in the MetadataMap. 
Does nothing if the key or value are not valid inputs fn set(&mut self, key: &str, value: String) { if let Ok(key) = tonic::metadata::MetadataKey::from_bytes(key.as_bytes()) { diff --git a/backends/v2/src/queue.rs b/backends/v2/src/queue.rs index 61a3eebc9..c9a9335dd 100644 --- a/backends/v2/src/queue.rs +++ b/backends/v2/src/queue.rs @@ -213,8 +213,7 @@ impl State { } // Pad prefill_token_budget to be a multiple of block size - let prefill_token_budget = - ((prefill_token_budget + self.block_size - 1) / self.block_size) * self.block_size; + let prefill_token_budget = prefill_token_budget.div_ceil(self.block_size) * self.block_size; // Create span for this batch to add context to inference calls let next_batch_span = info_span!(parent: None, "batch", batch_size = tracing::field::Empty); @@ -245,9 +244,8 @@ impl State { prefill_tokens = (batch_requests.len() + 1) as u32 * max_input_length } else { // pad to block size - prefill_tokens += ((entry.request.input_length + self.block_size - 1) - / self.block_size) - * self.block_size; + prefill_tokens += + entry.request.input_length.div_ceil(self.block_size) * self.block_size; } if self.requires_padding { @@ -262,8 +260,7 @@ impl State { }; // pad to block size - decode_tokens += - ((max_new_tokens + self.block_size - 1) / self.block_size) * self.block_size; + decode_tokens += max_new_tokens.div_ceil(self.block_size) * self.block_size; } if prefill_tokens > prefill_token_budget diff --git a/backends/v3/src/block_allocator.rs b/backends/v3/src/block_allocator.rs index 4fea172b6..e7f3d85a9 100644 --- a/backends/v3/src/block_allocator.rs +++ b/backends/v3/src/block_allocator.rs @@ -165,13 +165,13 @@ impl Allocator for SimpleAllocator { let (tokens, repeats) = match self.window_size { None => (tokens, 1), Some(window_size) => { - let repeats = (tokens + window_size - 1) / window_size; + let repeats = tokens.div_ceil(window_size); let tokens = core::cmp::min(tokens, window_size); (tokens, repeats as usize) } }; // Pad to a multiple of block size - let required_blocks = (tokens + self.block_size - 1) / self.block_size; + let required_blocks = tokens.div_ceil(self.block_size); (required_blocks, repeats) }; diff --git a/backends/v3/src/queue.rs b/backends/v3/src/queue.rs index dd27806f9..249eebf76 100644 --- a/backends/v3/src/queue.rs +++ b/backends/v3/src/queue.rs @@ -257,8 +257,7 @@ impl State { } // Pad prefill_token_budget to be a multiple of block size - let prefill_token_budget = - ((prefill_token_budget + self.block_size - 1) / self.block_size) * self.block_size; + let prefill_token_budget = prefill_token_budget.div_ceil(self.block_size) * self.block_size; // Create span for this batch to add context to inference calls let next_batch_span = info_span!(parent: None, "batch", batch_size = tracing::field::Empty); diff --git a/backends/v3/src/radix.rs b/backends/v3/src/radix.rs index 8a5448911..532ec6ddc 100644 --- a/backends/v3/src/radix.rs +++ b/backends/v3/src/radix.rs @@ -103,7 +103,7 @@ impl Allocator for RadixAllocator { let prefix_len = blocks.len() * self.block_size as usize; let suffix_len = tokens - prefix_len as u32; - let suffix_blocks = (suffix_len + self.block_size - 1) / self.block_size; + let suffix_blocks = suffix_len.div_ceil(self.block_size); tracing::info!("Prefix {prefix_len} - Suffix {suffix_len}"); diff --git a/flake.lock b/flake.lock index d304e3ccc..23e76b8be 100644 --- a/flake.lock +++ b/flake.lock @@ -108,11 +108,11 @@ "pre-commit-hooks": "pre-commit-hooks_3" }, "locked": { - "lastModified": 1732039290, - "narHash": 
"sha256-LQKY7bShf2H9kJouxa9ZspfdrulnZF9o4kLTqGqCDYM=", + "lastModified": 1734429562, + "narHash": "sha256-V2XNs3Ir8WXNHdocfzkR/fu0FzkZ9uTDJkVecxJrGmQ=", "owner": "nix-community", "repo": "crate2nix", - "rev": "9ff208ce7f5a482272b1bcefbe363c772d7ff914", + "rev": "8537c2d7cb623679aaeff62c4c4c43a91566ab09", "type": "github" }, "original": { @@ -853,11 +853,11 @@ ] }, "locked": { - "lastModified": 1732242723, - "narHash": "sha256-NWI8csIK0ujFlFuEXKnoc+7hWoCiEtINK9r48LUUMeU=", + "lastModified": 1736907983, + "narHash": "sha256-fw55wVwpJW36Md2HZBKuxX3YHGeqsGsspPLtCMVr1Y8=", "owner": "oxalica", "repo": "rust-overlay", - "rev": "a229311fcb45b88a95fdfa5cecd8349c809a272a", + "rev": "eaa365c911441e07e387ff6acc596619fc50b156", "type": "github" }, "original": { diff --git a/launcher/src/main.rs b/launcher/src/main.rs index c092d7451..7df9abda8 100644 --- a/launcher/src/main.rs +++ b/launcher/src/main.rs @@ -5,7 +5,6 @@ use hf_hub::{ }; use nix::sys::signal::{self, Signal}; use nix::unistd::Pid; -use regex::Regex; use serde::Deserialize; use std::env; use std::ffi::OsString; @@ -2176,26 +2175,21 @@ fn main() -> Result<(), LauncherError> { } // capture adapter_id, path, revision in format of adapter_id=path@revision - let re = Regex::new(r"^([^=@]+)(?:=([^@]+))?(?:@(.+))?$").unwrap(); - if let Some(caps) = re.captures(adapter) { - let adapter_id = caps.get(1).map_or("", |m| m.as_str()); - let revision = caps.get(3).map(|m| m.as_str()); - - download_convert_model( - adapter_id, - revision, - args.trust_remote_code, - args.huggingface_hub_cache.as_deref(), - args.weights_cache_override.as_deref(), - running.clone(), - false, // avoid merging lora adapters if using multi-lora - )?; - } else { - return Err(LauncherError::ArgumentValidation(format!( - "Invalid LoRA adapter format: {}", - adapter - ))); - } + // path is disabled beforehand. 
+ let mut splits = adapter.split("@"); + let adapter_id = splits.next().ok_or_else(|| { + LauncherError::ArgumentValidation("Missing adapter id".to_string()) + })?; + let revision = splits.next(); + download_convert_model( + adapter_id, + revision, + args.trust_remote_code, + args.huggingface_hub_cache.as_deref(), + args.weights_cache_override.as_deref(), + running.clone(), + false, // avoid merging lora adapters if using multi-lora + )?; } } diff --git a/router/src/lib.rs b/router/src/lib.rs index 21c452413..dbd36827f 100644 --- a/router/src/lib.rs +++ b/router/src/lib.rs @@ -79,7 +79,7 @@ impl TokenizerTrait for tokenizers::Tokenizer { } } -impl<'a> TokenizerTrait for PyTokenizer<'a> { +impl TokenizerTrait for PyTokenizer<'_> { fn encode_trait( &self, query: String, diff --git a/router/src/validation.rs b/router/src/validation.rs index cdf954de2..7ac05b21d 100644 --- a/router/src/validation.rs +++ b/router/src/validation.rs @@ -1229,12 +1229,11 @@ mod tests { assert!( chunks == vec![ - Chunk::Text("test".to_string()).into(), + Chunk::Text("test".to_string()), Chunk::Image(Image { data: pixel_data.clone(), mimetype: "image/gif".to_string() }) - .into() ], "Failed to process images", ); @@ -1289,17 +1288,15 @@ mod tests { assert!( chunks == vec![ - Chunk::Text("test".to_string()).into(), + Chunk::Text("test".to_string()), + Chunk::Image(Image { + data: pixel_data.clone(), + mimetype: "image/gif".to_string() + }), Chunk::Image(Image { data: pixel_data.clone(), mimetype: "image/gif".to_string() }) - .into(), - Chunk::Image(Image { - data: pixel_data.clone(), - mimetype: "image/gif".to_string() - }) - .into() ], "Failed to process images", ); diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 12d58532c..25959e0e0 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,5 +1,5 @@ [toolchain] # Released on: June 13, 2024 # https://releases.rs/docs/1.79.0/ -channel = "1.80.1" +channel = "1.84.0" components = ["rustfmt", "clippy"] From 1470aec9d9f871767169610deb2778e9eb8f9358 Mon Sep 17 00:00:00 2001 From: Baptiste Colle <32412211+baptistecolle@users.noreply.github.com> Date: Wed, 15 Jan 2025 18:32:07 +0100 Subject: [PATCH 027/183] Fix typo in TPU docs (#2911) docs(tpu): fix typo --- docs/source/_toctree.yml | 2 +- docs/source/installation_tpu.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index ab85682b7..8fcba516b 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -14,7 +14,7 @@ - local: installation_inferentia title: Using TGI with AWS Inferentia - local: installation_tpu - title: Using TGI with Google TPU + title: Using TGI with Google TPUs - local: installation_intel title: Using TGI with Intel GPUs - local: installation diff --git a/docs/source/installation_tpu.md b/docs/source/installation_tpu.md index 208ebce3c..559e83aa7 100644 --- a/docs/source/installation_tpu.md +++ b/docs/source/installation_tpu.md @@ -1,3 +1,3 @@ -# Using TGI with Google TPU +# Using TGI with Google TPUs Check out this [guide](https://huggingface.co/docs/optimum-tpu) on how to serve models with TGI on TPUs. From 120bd3e3bbc0190c8f47f2a41b82a768ece8e8bd Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Wed, 15 Jan 2025 19:20:44 +0100 Subject: [PATCH 028/183] Removing the github runner. 
(#2912) --- .github/workflows/build.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index c0199a66a..c43d8eb92 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -67,7 +67,8 @@ jobs: export label_extension="-rocm" export docker_devices="/dev/kfd,/dev/dri" export docker_volume="/mnt" - export runs_on="amd-gpu-runners" + # This runner was deactivated. + export runs_on="ubuntu-latest" export platform="" export extra_pytest="-k test_flash_gemma_gptq_load" ;; From 922cc38fbcf689e77da4bfc0d6cbd7aae7e4327f Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Wed, 15 Jan 2025 20:07:21 +0100 Subject: [PATCH 029/183] Upgrading bitsandbytes. (#2910) * Upgrading bitsandbytes. Co-Authored-By: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> * Tighter lock. --------- Co-authored-by: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> --- launcher/src/main.rs | 9 +-------- server/poetry.lock | 13 +++++++------ server/pyproject.toml | 2 +- 3 files changed, 9 insertions(+), 15 deletions(-) diff --git a/launcher/src/main.rs b/launcher/src/main.rs index 7df9abda8..18badeafc 100644 --- a/launcher/src/main.rs +++ b/launcher/src/main.rs @@ -2078,14 +2078,7 @@ fn main() -> Result<(), LauncherError> { let cuda_graphs = match (&args.cuda_graphs, &quantize) { (Some(cuda_graphs), _) => cuda_graphs.iter().cloned().filter(|&c| c > 0).collect(), #[allow(deprecated)] - ( - None, - Some( - Quantization::Bitsandbytes - | Quantization::BitsandbytesNf4 - | Quantization::BitsandbytesFp4, - ), - ) => { + (None, Some(Quantization::Bitsandbytes)) => { tracing::warn!("Bitsandbytes doesn't work with cuda graphs, deactivating them"); vec![] } diff --git a/server/poetry.lock b/server/poetry.lock index 69133015e..93db8dc97 100644 --- a/server/poetry.lock +++ b/server/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.5 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand. [[package]] name = "accelerate" @@ -290,22 +290,23 @@ tests-mypy = ["mypy (>=1.11.1)", "pytest-mypy-plugins"] [[package]] name = "bitsandbytes" -version = "0.43.3" +version = "0.45.0" description = "k-bit optimizers and matrix multiplication routines." 
optional = true python-versions = "*" files = [ - {file = "bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl", hash = "sha256:cc99507c352be0715098b2c7577b690dd158972dc4ea10c7495bac104c7c79f0"}, - {file = "bitsandbytes-0.43.3-py3-none-win_amd64.whl", hash = "sha256:257f6552f2144748a84e6c44e1f7a98f3da888f675ed74e18fd7f7eb13c6cafa"}, + {file = "bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl", hash = "sha256:0f0323de1ff1fdf8383e79bdad1283516a4c05a6fd2b44a363bf4e059422305b"}, + {file = "bitsandbytes-0.45.0-py3-none-win_amd64.whl", hash = "sha256:ebbf96e0ecb466716a65ecdeaef3fa1983575447b9ab66b74e5211892507c6ff"}, ] [package.dependencies] numpy = "*" torch = "*" +typing_extensions = ">=4.8.0" [package.extras] benchmark = ["matplotlib", "pandas"] -test = ["scipy"] +test = ["lion_pytorch", "scipy"] [[package]] name = "certifi" @@ -4097,4 +4098,4 @@ torch = ["torch"] [metadata] lock-version = "2.0" python-versions = ">=3.9,<3.13" -content-hash = "25f96d5dea777bfa7a959f863e35d2e05e1a6172d0dd45193dbe25ac2f32cc25" +content-hash = "0ead8472620eeef6f9ff81f70bcb48403f9c831b6914245efa5e249724d80d0b" diff --git a/server/pyproject.toml b/server/pyproject.toml index bc74a05ab..0386ae55c 100644 --- a/server/pyproject.toml +++ b/server/pyproject.toml @@ -16,7 +16,7 @@ grpcio-reflection = "^1.51.1" grpc-interceptor = "^0.15.4" typer = "^0.12.5" accelerate = {version = "^1.1.0", optional = true} -bitsandbytes = { version = "^0.43.0", optional = true } +bitsandbytes = { version = "^0.45.0", optional = true } safetensors = "^0.4.5" loguru = "^0.7.2" opentelemetry-api = "^1.27.0" From 5f78ec32a5e8c4bf16286702e223ab15ed50b1f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Thu, 16 Jan 2025 13:44:32 +0100 Subject: [PATCH 030/183] Do not convert weight scale to e4m3fnuz on CUDA (#2917) --- .../text_generation_server/layers/compressed_tensors/w8an_fp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/text_generation_server/layers/compressed_tensors/w8an_fp.py b/server/text_generation_server/layers/compressed_tensors/w8an_fp.py index 15bdce08e..ebcc06d68 100644 --- a/server/text_generation_server/layers/compressed_tensors/w8an_fp.py +++ b/server/text_generation_server/layers/compressed_tensors/w8an_fp.py @@ -147,7 +147,7 @@ class W8ANFpLoader(WeightsLoader): else None ) - if self.load_weight_scale or SYSTEM == "rocm": + if self.load_weight_scale and SYSTEM == "rocm": w, weight_scale, input_scale = normalize_e4m3fn_to_e4m3fnuz( w, weight_scale, input_scale ) From 82f6ea1b714183f184a3ee8d4aa4fa7d59ac87ab Mon Sep 17 00:00:00 2001 From: drbh Date: Thu, 16 Jan 2025 16:23:55 -0500 Subject: [PATCH 031/183] feat: improve star coder to support multi lora layers (#2883) * feat: improve star coder to support multi lora layers * feat: improve weight that support adapters and add tests for starcoder with lora * fix: bump snapshot for added tests * fix: rerun pre commit lints * fix: bump adapter test for added later names --- .../test_flash_starcoder2.json | 73 ++++ .../test_flash_starcoder2_default_params.json | 373 ++++++++++++++++++ .../test_flash_starcoder2_load.json | 294 ++++++++++++++ ...flash_starcoder2_with_hugcode_adapter.json | 71 ++++ .../models/test_flash_starcoder2_lora.py | 79 ++++ server/tests/utils/test_adapter.py | 6 + .../text_generation_server/adapters/lora.py | 11 + .../text_generation_server/models/__init__.py | 3 + .../flash_starcoder2_modeling.py | 118 ++++-- .../text_generation_server/utils/adapter.py | 6 + 10 files changed, 1009 insertions(+), 25 
deletions(-) create mode 100644 integration-tests/models/__snapshots__/test_flash_starcoder2_lora/test_flash_starcoder2.json create mode 100644 integration-tests/models/__snapshots__/test_flash_starcoder2_lora/test_flash_starcoder2_default_params.json create mode 100644 integration-tests/models/__snapshots__/test_flash_starcoder2_lora/test_flash_starcoder2_load.json create mode 100644 integration-tests/models/__snapshots__/test_flash_starcoder2_lora/test_flash_starcoder2_with_hugcode_adapter.json create mode 100644 integration-tests/models/test_flash_starcoder2_lora.py diff --git a/integration-tests/models/__snapshots__/test_flash_starcoder2_lora/test_flash_starcoder2.json b/integration-tests/models/__snapshots__/test_flash_starcoder2_lora/test_flash_starcoder2.json new file mode 100644 index 000000000..1bc1e0fde --- /dev/null +++ b/integration-tests/models/__snapshots__/test_flash_starcoder2_lora/test_flash_starcoder2.json @@ -0,0 +1,73 @@ +{ + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [], + "seed": null, + "tokens": [ + { + "id": 2284, + "logprob": -0.9355469, + "special": false, + "text": "():" + }, + { + "id": 303, + "logprob": -0.40795898, + "special": false, + "text": "\n " + }, + { + "id": 1489, + "logprob": -0.27954102, + "special": false, + "text": " print" + }, + { + "id": 459, + "logprob": -0.6142578, + "special": false, + "text": "(\"" + }, + { + "id": 8302, + "logprob": -0.68310547, + "special": false, + "text": "Hello" + }, + { + "id": 10914, + "logprob": -1.4599609, + "special": false, + "text": " World" + }, + { + "id": 16013, + "logprob": -0.80126953, + "special": false, + "text": "!\")" + }, + { + "id": 222, + "logprob": -0.625, + "special": false, + "text": "\n" + }, + { + "id": 222, + "logprob": -0.23242188, + "special": false, + "text": "\n" + }, + { + "id": 610, + "logprob": -1.2294922, + "special": false, + "text": "def" + } + ], + "top_tokens": null + }, + "generated_text": "():\n print(\"Hello World!\")\n\ndef" +} diff --git a/integration-tests/models/__snapshots__/test_flash_starcoder2_lora/test_flash_starcoder2_default_params.json b/integration-tests/models/__snapshots__/test_flash_starcoder2_lora/test_flash_starcoder2_default_params.json new file mode 100644 index 000000000..ce3831b0e --- /dev/null +++ b/integration-tests/models/__snapshots__/test_flash_starcoder2_lora/test_flash_starcoder2_default_params.json @@ -0,0 +1,373 @@ +{ + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 60, + "prefill": [], + "seed": 0, + "tokens": [ + { + "id": 222, + "logprob": 0.0, + "special": false, + "text": "\n" + }, + { + "id": 222, + "logprob": 0.0, + "special": false, + "text": "\n" + }, + { + "id": 40, + "logprob": -0.7944336, + "special": false, + "text": "#" + }, + { + "id": 494, + "logprob": 0.0, + "special": false, + "text": " +" + }, + { + "id": 447, + "logprob": -0.1796875, + "special": false, + "text": " [" + }, + { + "id": 9009, + "logprob": 0.0, + "special": false, + "text": "markdown" + }, + { + "id": 98, + "logprob": 0.0, + "special": false, + "text": "]" + }, + { + "id": 37402, + "logprob": 0.0, + "special": false, + "text": " slideshow" + }, + { + "id": 8492, + "logprob": 0.0, + "special": false, + "text": "={\"" + }, + { + "id": 7277, + "logprob": 0.0, + "special": false, + "text": "slide" + }, + { + "id": 100, + "logprob": 0.0, + "special": false, + "text": "_" + }, + { + "id": 700, + "logprob": 0.0, + "special": false, + "text": "type" + }, + { + 
"id": 582, + "logprob": 0.0, + "special": false, + "text": "\":" + }, + { + "id": 332, + "logprob": 0.0, + "special": false, + "text": " \"" + }, + { + "id": 7277, + "logprob": -0.06994629, + "special": false, + "text": "slide" + }, + { + "id": 3667, + "logprob": 0.0, + "special": false, + "text": "\"}" + }, + { + "id": 222, + "logprob": 0.0, + "special": false, + "text": "\n" + }, + { + "id": 40, + "logprob": 0.0, + "special": false, + "text": "#" + }, + { + "id": 607, + "logprob": -0.8261719, + "special": false, + "text": " #" + }, + { + "id": 244, + "logprob": -1.8574219, + "special": false, + "text": " " + }, + { + "id": 55, + "logprob": -1.4541016, + "special": false, + "text": "2" + }, + { + "id": 51, + "logprob": 0.0, + "special": false, + "text": "." + }, + { + "id": 6208, + "logprob": -0.9794922, + "special": false, + "text": " What" + }, + { + "id": 458, + "logprob": 0.0, + "special": false, + "text": " is" + }, + { + "id": 341, + "logprob": 0.0, + "special": false, + "text": " the" + }, + { + "id": 10609, + "logprob": -0.69189453, + "special": false, + "text": " difference" + }, + { + "id": 3761, + "logprob": 0.0, + "special": false, + "text": " between" + }, + { + "id": 331, + "logprob": 0.0, + "special": false, + "text": " a" + }, + { + "id": 1168, + "logprob": -0.27172852, + "special": false, + "text": " list" + }, + { + "id": 480, + "logprob": 0.0, + "special": false, + "text": " and" + }, + { + "id": 331, + "logprob": 0.0, + "special": false, + "text": " a" + }, + { + "id": 8871, + "logprob": 0.0, + "special": false, + "text": " tuple" + }, + { + "id": 68, + "logprob": 0.0, + "special": false, + "text": "?" + }, + { + "id": 222, + "logprob": 0.0, + "special": false, + "text": "\n" + }, + { + "id": 40, + "logprob": -1.3359375, + "special": false, + "text": "#" + }, + { + "id": 222, + "logprob": 0.0, + "special": false, + "text": "\n" + }, + { + "id": 40, + "logprob": 0.0, + "special": false, + "text": "#" + }, + { + "id": 449, + "logprob": -0.03164673, + "special": false, + "text": " -" + }, + { + "id": 418, + "logprob": -1.0947266, + "special": false, + "text": " A" + }, + { + "id": 1168, + "logprob": 0.0, + "special": false, + "text": " list" + }, + { + "id": 458, + "logprob": 0.0, + "special": false, + "text": " is" + }, + { + "id": 331, + "logprob": -0.3305664, + "special": false, + "text": " a" + }, + { + "id": 14792, + "logprob": 0.0, + "special": false, + "text": " mutable" + }, + { + "id": 6645, + "logprob": -0.40478516, + "special": false, + "text": " sequence" + }, + { + "id": 451, + "logprob": 0.0, + "special": false, + "text": " of" + }, + { + "id": 4725, + "logprob": -0.50390625, + "special": false, + "text": " elements" + }, + { + "id": 49, + "logprob": -2.1269531, + "special": false, + "text": "," + }, + { + "id": 2236, + "logprob": -0.1427002, + "special": false, + "text": " while" + }, + { + "id": 331, + "logprob": 0.0, + "special": false, + "text": " a" + }, + { + "id": 8871, + "logprob": 0.0, + "special": false, + "text": " tuple" + }, + { + "id": 458, + "logprob": 0.0, + "special": false, + "text": " is" + }, + { + "id": 619, + "logprob": 0.0, + "special": false, + "text": " an" + }, + { + "id": 26079, + "logprob": 0.0, + "special": false, + "text": " immutable" + }, + { + "id": 6645, + "logprob": 0.0, + "special": false, + "text": " sequence" + }, + { + "id": 451, + "logprob": 0.0, + "special": false, + "text": " of" + }, + { + "id": 4725, + "logprob": 0.0, + "special": false, + "text": " elements" + }, + { + "id": 51, + "logprob": 0.0, + "special": 
false, + "text": "." + }, + { + "id": 222, + "logprob": 0.0, + "special": false, + "text": "\n" + }, + { + "id": 40, + "logprob": 0.0, + "special": false, + "text": "#" + }, + { + "id": 449, + "logprob": 0.0, + "special": false, + "text": " -" + } + ], + "top_tokens": null + }, + "generated_text": "\n\n# + [markdown] slideshow={\"slide_type\": \"slide\"}\n# # 2. What is the difference between a list and a tuple?\n#\n# - A list is a mutable sequence of elements, while a tuple is an immutable sequence of elements.\n# -" +} diff --git a/integration-tests/models/__snapshots__/test_flash_starcoder2_lora/test_flash_starcoder2_load.json b/integration-tests/models/__snapshots__/test_flash_starcoder2_lora/test_flash_starcoder2_load.json new file mode 100644 index 000000000..bf9e3010d --- /dev/null +++ b/integration-tests/models/__snapshots__/test_flash_starcoder2_lora/test_flash_starcoder2_load.json @@ -0,0 +1,294 @@ +[ + { + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [], + "seed": null, + "tokens": [ + { + "id": 222, + "logprob": -1.9091797, + "special": false, + "text": "\n" + }, + { + "id": 222, + "logprob": -1.0478516, + "special": false, + "text": "\n" + }, + { + "id": 40, + "logprob": -3.015625, + "special": false, + "text": "#" + }, + { + "id": 494, + "logprob": -1.4228516, + "special": false, + "text": " +" + }, + { + "id": 447, + "logprob": -1.1025391, + "special": false, + "text": " [" + }, + { + "id": 9009, + "logprob": -0.0008444786, + "special": false, + "text": "markdown" + }, + { + "id": 98, + "logprob": -8.8095665e-05, + "special": false, + "text": "]" + }, + { + "id": 37402, + "logprob": -0.5810547, + "special": false, + "text": " slideshow" + }, + { + "id": 8492, + "logprob": -0.00022864342, + "special": false, + "text": "={\"" + }, + { + "id": 7277, + "logprob": -0.00030994415, + "special": false, + "text": "slide" + } + ], + "top_tokens": null + }, + "generated_text": "\n\n# + [markdown] slideshow={\"slide" + }, + { + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [], + "seed": null, + "tokens": [ + { + "id": 222, + "logprob": -1.9091797, + "special": false, + "text": "\n" + }, + { + "id": 222, + "logprob": -1.0478516, + "special": false, + "text": "\n" + }, + { + "id": 40, + "logprob": -3.015625, + "special": false, + "text": "#" + }, + { + "id": 494, + "logprob": -1.4228516, + "special": false, + "text": " +" + }, + { + "id": 447, + "logprob": -1.1025391, + "special": false, + "text": " [" + }, + { + "id": 9009, + "logprob": -0.0008444786, + "special": false, + "text": "markdown" + }, + { + "id": 98, + "logprob": -8.8095665e-05, + "special": false, + "text": "]" + }, + { + "id": 37402, + "logprob": -0.5810547, + "special": false, + "text": " slideshow" + }, + { + "id": 8492, + "logprob": -0.00022864342, + "special": false, + "text": "={\"" + }, + { + "id": 7277, + "logprob": -0.00030994415, + "special": false, + "text": "slide" + } + ], + "top_tokens": null + }, + "generated_text": "\n\n# + [markdown] slideshow={\"slide" + }, + { + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [], + "seed": null, + "tokens": [ + { + "id": 222, + "logprob": -1.9091797, + "special": false, + "text": "\n" + }, + { + "id": 222, + "logprob": -1.0478516, + "special": false, + "text": "\n" + }, + { + "id": 40, + "logprob": -3.015625, + "special": false, + "text": "#" + }, + { + "id": 494, + "logprob": 
-1.4228516, + "special": false, + "text": " +" + }, + { + "id": 447, + "logprob": -1.1025391, + "special": false, + "text": " [" + }, + { + "id": 9009, + "logprob": -0.0008444786, + "special": false, + "text": "markdown" + }, + { + "id": 98, + "logprob": -8.8095665e-05, + "special": false, + "text": "]" + }, + { + "id": 37402, + "logprob": -0.5810547, + "special": false, + "text": " slideshow" + }, + { + "id": 8492, + "logprob": -0.00022864342, + "special": false, + "text": "={\"" + }, + { + "id": 7277, + "logprob": -0.00030994415, + "special": false, + "text": "slide" + } + ], + "top_tokens": null + }, + "generated_text": "\n\n# + [markdown] slideshow={\"slide" + }, + { + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [], + "seed": null, + "tokens": [ + { + "id": 222, + "logprob": -1.9091797, + "special": false, + "text": "\n" + }, + { + "id": 222, + "logprob": -1.0478516, + "special": false, + "text": "\n" + }, + { + "id": 40, + "logprob": -3.015625, + "special": false, + "text": "#" + }, + { + "id": 494, + "logprob": -1.4228516, + "special": false, + "text": " +" + }, + { + "id": 447, + "logprob": -1.1025391, + "special": false, + "text": " [" + }, + { + "id": 9009, + "logprob": -0.0008444786, + "special": false, + "text": "markdown" + }, + { + "id": 98, + "logprob": -8.8095665e-05, + "special": false, + "text": "]" + }, + { + "id": 37402, + "logprob": -0.5810547, + "special": false, + "text": " slideshow" + }, + { + "id": 8492, + "logprob": -0.00022864342, + "special": false, + "text": "={\"" + }, + { + "id": 7277, + "logprob": -0.00030994415, + "special": false, + "text": "slide" + } + ], + "top_tokens": null + }, + "generated_text": "\n\n# + [markdown] slideshow={\"slide" + } +] diff --git a/integration-tests/models/__snapshots__/test_flash_starcoder2_lora/test_flash_starcoder2_with_hugcode_adapter.json b/integration-tests/models/__snapshots__/test_flash_starcoder2_lora/test_flash_starcoder2_with_hugcode_adapter.json new file mode 100644 index 000000000..de76dd50e --- /dev/null +++ b/integration-tests/models/__snapshots__/test_flash_starcoder2_lora/test_flash_starcoder2_with_hugcode_adapter.json @@ -0,0 +1,71 @@ +{ + "details": { + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [], + "seed": null, + "tokens": [ + { + "id": 100, + "logprob": -0.9824219, + "special": false, + "text": "_" + }, + { + "id": 5879, + "logprob": -0.3017578, + "special": false, + "text": "world" + }, + { + "id": 2284, + "logprob": -0.68652344, + "special": false, + "text": "():" + }, + { + "id": 303, + "logprob": -0.27734375, + "special": false, + "text": "\n " + }, + { + "id": 1489, + "logprob": -0.4482422, + "special": false, + "text": " print" + }, + { + "id": 459, + "logprob": -0.54248047, + "special": false, + "text": "(\"" + }, + { + "id": 8302, + "logprob": -0.4296875, + "special": false, + "text": "Hello" + }, + { + "id": 10914, + "logprob": -0.8544922, + "special": false, + "text": " World" + }, + { + "id": 16013, + "logprob": -0.7573242, + "special": false, + "text": "!\")" + }, + { + "id": 222, + "logprob": -0.81347656, + "special": false, + "text": "\n" + } + ] + }, + "generated_text": "_world():\n print(\"Hello World!\")\n" +} diff --git a/integration-tests/models/test_flash_starcoder2_lora.py b/integration-tests/models/test_flash_starcoder2_lora.py new file mode 100644 index 000000000..6480f6699 --- /dev/null +++ b/integration-tests/models/test_flash_starcoder2_lora.py @@ -0,0 +1,79 @@ +import pytest +import 
requests + + +@pytest.fixture(scope="module") +def flash_starcoder2_handle(launcher): + with launcher( + "bigcode/starcoder2-3b", lora_adapters=["smangrul/starcoder-3b-hugcoder"] + ) as handle: + yield handle + + +@pytest.fixture(scope="module") +async def flash_starcoder2(flash_starcoder2_handle): + await flash_starcoder2_handle.health(300) + return flash_starcoder2_handle.client + + +@pytest.mark.asyncio +async def test_flash_starcoder2(flash_starcoder2, response_snapshot): + response = await flash_starcoder2.generate( + "def print_hello", max_new_tokens=10, decoder_input_details=True + ) + + assert response.details.generated_tokens == 10 + assert response == response_snapshot + + +@pytest.mark.asyncio +async def test_flash_starcoder2_default_params(flash_starcoder2, response_snapshot): + response = await flash_starcoder2.generate( + "who are you?", + max_new_tokens=60, + temperature=0.2, + top_p=0.95, + decoder_input_details=True, + seed=0, + ) + + assert response.details.generated_tokens == 60 + assert response == response_snapshot + + +@pytest.mark.asyncio +async def test_flash_starcoder2_load( + flash_starcoder2, generate_load, response_snapshot +): + responses = await generate_load( + flash_starcoder2, "who are you?", max_new_tokens=10, n=4 + ) + + assert len(responses) == 4 + assert all([r.generated_text == responses[0].generated_text for r in responses]) + + assert responses == response_snapshot + + +@pytest.mark.asyncio +async def test_flash_starcoder2_with_hugcode_adapter( + flash_starcoder2, response_snapshot +): + response = requests.post( + f"{flash_starcoder2.base_url}/generate", + headers=flash_starcoder2.headers, + json={ + "inputs": "def print_hello", + "parameters": { + "max_new_tokens": 10, + "adapter_id": "smangrul/starcoder-3b-hugcoder", + "details": True, + }, + }, + ) + + assert response.status_code == 200 + data = response.json() + assert data["generated_text"] == '_world():\n print("Hello World!")\n' + + assert data == response_snapshot diff --git a/server/tests/utils/test_adapter.py b/server/tests/utils/test_adapter.py index a27c10551..ab0312e4c 100644 --- a/server/tests/utils/test_adapter.py +++ b/server/tests/utils/test_adapter.py @@ -94,6 +94,8 @@ def test_get_mlp_weights_with_gate_up_proj(): # assert the result expected = { + (3, "c_fc"): ("model.layers.3.mlp.c_fc", mock_layer.mlp.c_fc), + (3, "c_proj"): ("model.layers.3.mlp.c_proj", mock_layer.mlp.c_proj), (3, "gate_proj"): ("model.layers.3.mlp.gate_proj", mock_layer.mlp.gate_up_proj), (3, "up_proj"): ("model.layers.3.mlp.up_proj", mock_layer.mlp.gate_up_proj), (3, "down_proj"): ("model.layers.3.mlp.down_proj", mock_layer.mlp.down_proj), @@ -188,6 +190,8 @@ def test_get_mlp_weights_llama_compatibility(): result = get_mlp_weights(3, mock_layer) expected = { + (3, "c_fc"): ("model.layers.3.mlp.c_fc", mock_layer.mlp.c_fc), + (3, "c_proj"): ("model.layers.3.mlp.c_proj", mock_layer.mlp.c_proj), (3, "gate_proj"): ("model.layers.3.mlp.gate_proj", mock_layer.mlp.gate_up_proj), (3, "up_proj"): ("model.layers.3.mlp.up_proj", mock_layer.mlp.gate_up_proj), (3, "down_proj"): ("model.layers.3.mlp.down_proj", mock_layer.mlp.down_proj), @@ -240,6 +244,8 @@ def test_get_mlp_weights_gemma_compatibility(): result = get_mlp_weights(3, mock_layer) expected = { + (3, "c_fc"): ("model.layers.3.mlp.c_fc", mock_layer.mlp.c_fc), + (3, "c_proj"): ("model.layers.3.mlp.c_proj", mock_layer.mlp.c_proj), (3, "gate_proj"): ("model.layers.3.mlp.gate_proj", mock_layer.mlp.gate_proj), (3, "up_proj"): ("model.layers.3.mlp.up_proj", 
mock_layer.mlp.up_proj), (3, "down_proj"): ("model.layers.3.mlp.down_proj", mock_layer.mlp.down_proj), diff --git a/server/text_generation_server/adapters/lora.py b/server/text_generation_server/adapters/lora.py index f1edd9a07..cdcfe91b1 100644 --- a/server/text_generation_server/adapters/lora.py +++ b/server/text_generation_server/adapters/lora.py @@ -6,9 +6,11 @@ from collections import defaultdict from dataclasses import dataclass from typing import Dict, List, Optional, Set, Tuple, Type, Union +from loguru import logger import torch from peft import LoraConfig as _LoraConfig from torch.distributed import ProcessGroup +from text_generation_server.utils.log import log_master from text_generation_server.adapters.config import AdapterConfig, ModuleMap @@ -203,8 +205,17 @@ class LoraWeights(AdapterWeights): lora_a_list = [None] * nlayers lora_b_list = [None] * nlayers + # import ipdb; ipdb.set_trace() for layer_id in range(nlayers): key = (layer_id, layer_type) + if key not in target_to_layer: + # There is no layer of this type in the model + log_master( + logger.warning, + f"Key specified in lora weights but not found in base model: {key}", + ) + return None + weight_name, layer = target_to_layer[key] base_weight = layer.base_layer.linear.weight base_device = base_weight.device diff --git a/server/text_generation_server/models/__init__.py b/server/text_generation_server/models/__init__.py index beefeb016..e2d24643e 100644 --- a/server/text_generation_server/models/__init__.py +++ b/server/text_generation_server/models/__init__.py @@ -1449,6 +1449,9 @@ def get_model_with_lora_adapters( "up_proj", "down_proj", "qkv_proj", + # add c_* layers used in starcoder2 + "c_proj", + "c_fc", ] for layer_name in adapter_layers: diff --git a/server/text_generation_server/models/custom_modeling/flash_starcoder2_modeling.py b/server/text_generation_server/models/custom_modeling/flash_starcoder2_modeling.py index c793982d8..5e090369b 100644 --- a/server/text_generation_server/models/custom_modeling/flash_starcoder2_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_starcoder2_modeling.py @@ -32,6 +32,8 @@ from text_generation_server.layers.attention import ( Seqlen, ) from text_generation_server.layers import ( + TensorParallelMultiAdapterLinear, + TensorParallelAdapterRowLinear, TensorParallelRowLinear, TensorParallelColumnLinear, TensorParallelEmbedding, @@ -109,17 +111,31 @@ class Starcoder2Config(PretrainedConfig): ) -def load_attention(config, prefix, weights): +def load_attention(config, prefix, weights, layer_id): + prefixes = [f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"] + head_size = config.hidden_size // config.num_attention_heads + sizes = [ + head_size * config.num_attention_heads, + head_size * config.num_key_value_heads, + head_size * config.num_key_value_heads, + ] if config.num_attention_heads != config.num_key_value_heads: - return _load_gqa(config, prefix, weights) + base_layer = _load_gqa(config, prefix, weights) else: - return TensorParallelColumnLinear.load_multi( + base_layer = TensorParallelColumnLinear.load_multi( config, - prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"], + prefixes=prefixes, dim=0, weights=weights, bias=config.use_bias, ) + return TensorParallelMultiAdapterLinear.load( + base_layer=base_layer, + layer_id=layer_id, + layer_names=prefixes, + sizes=sizes, + process_group=weights.process_group, + ) def _load_gqa(config, prefix: str, weights): @@ -157,6 +173,7 @@ def _load_gqa(config, prefix: str, weights): 
class Starcoder2Attention(torch.nn.Module): def __init__( self, + index: int, prefix: str, config, weights, @@ -188,15 +205,23 @@ class Starcoder2Attention(torch.nn.Module): config.num_key_value_heads // weights.process_group.size() ) - self.query_key_value = load_attention(config, prefix, weights) + self.query_key_value = load_attention(config, prefix, weights, index) self.kv_scales = get_kv_scales(weights, f"{prefix}") - self.o_proj = TensorParallelRowLinear.load( + o_proj = TensorParallelRowLinear.load( config, prefix=f"{prefix}.o_proj", weights=weights, - bias=config.use_bias, + bias=getattr(config, "use_bias", False), ) + + self.o_proj = TensorParallelAdapterRowLinear.load( + o_proj, + index, + "o_proj", + process_group=weights.process_group, + ) + self.num_groups = self.num_heads // self.num_key_value_heads self.kv_head_mapping = torch.arange( 0, self.num_key_value_heads, dtype=torch.int32, device=weights.device @@ -214,8 +239,9 @@ class Starcoder2Attention(torch.nn.Module): seqlen, max_s, prefill_cache_indices, + adapter_data, ): - qkv = self.query_key_value(hidden_states) + qkv = self.query_key_value(hidden_states, adapter_data) query, kv = qkv.split( [ self.head_size * self.num_heads, @@ -267,11 +293,13 @@ class Starcoder2Attention(torch.nn.Module): kv_scales=self.kv_scales, ) - return self.o_proj(attn_output.view(-1, self.num_heads * self.head_size)) + return self.o_proj( + attn_output.view(-1, self.num_heads * self.head_size), adapter_data + ) class Starcoder2MLP(nn.Module): - def __init__(self, prefix, config, weights): + def __init__(self, prefix, config, weights, index): super().__init__() act = config.hidden_act self.act = ( @@ -285,27 +313,42 @@ class Starcoder2MLP(nn.Module): ) ) # Fuse gate and up proj - self.c_fc = TensorParallelColumnLinear.load( + c_fc = TensorParallelColumnLinear.load( config, prefix=f"{prefix}.c_fc", weights=weights, bias=config.use_bias, ) - self.c_proj = TensorParallelRowLinear.load( + c_proj = TensorParallelRowLinear.load( config, prefix=f"{prefix}.c_proj", weights=weights, bias=config.use_bias, ) - def forward(self, hidden_states): - hidden_states = self.c_fc(hidden_states) + self.c_fc = TensorParallelMultiAdapterLinear.load( + c_fc, + layer_id=index, + layer_names=[f"{prefix}.c_fc"], + sizes=[config.intermediate_size, config.intermediate_size], + process_group=weights.process_group, + ) + + self.c_proj = TensorParallelAdapterRowLinear.load( + c_proj, + index, + "c_proj", + process_group=weights.process_group, + ) + + def forward(self, hidden_states, adapter_data): + hidden_states = self.c_fc(hidden_states, adapter_data) hidden_states = self.act(hidden_states) - return self.c_proj(hidden_states) + return self.c_proj(hidden_states, adapter_data) class Starcoder2GatedMLP(nn.Module): - def __init__(self, prefix, config, weights): + def __init__(self, index, prefix, config, weights): super().__init__() act = config.hidden_act self.act = ( @@ -319,27 +362,47 @@ class Starcoder2GatedMLP(nn.Module): ) ) # Fuse gate and up proj - self.gate_up_proj = TensorParallelColumnLinear.load_multi( + prefixes = [f"{prefix}.gate_proj", f"{prefix}.up_proj"] + sizes = [ + config.intermediate_size, + config.intermediate_size, + ] + gate_up_proj = TensorParallelColumnLinear.load_multi( config, - prefixes=[f"{prefix}.gate_proj", f"{prefix}.up_proj"], + prefixes=prefixes, weights=weights, dim=0, bias=config.use_bias, ) - self.down_proj = TensorParallelRowLinear.load( + self.gate_up_proj = TensorParallelMultiAdapterLinear.load( + gate_up_proj, + index, + 
layer_names=prefixes, + sizes=sizes, + process_group=weights.process_group, + ) + down_proj = TensorParallelRowLinear.load( config, prefix=f"{prefix}.down_proj", weights=weights, bias=config.use_bias, ) + self.down_proj = TensorParallelAdapterRowLinear.load( + down_proj, + index, + "down_proj", + process_group=weights.process_group, + ) self.intermediate_size = ( config.intermediate_size // weights.process_group.size() ) - def forward(self, hidden_states): - gate_up_states = self.gate_up_proj(hidden_states) + def forward(self, hidden_states, adapter_data): + gate_up_states = self.gate_up_proj(hidden_states, adapter_data) gate_up_states = gate_up_states.view(-1, 2, self.intermediate_size) - return self.down_proj(self.act(gate_up_states[:, 0]) * gate_up_states[:, 1]) + return self.down_proj( + self.act(gate_up_states[:, 0]) * gate_up_states[:, 1], adapter_data + ) STARCODER2_NORMALIZATION_CLASSES = { @@ -358,11 +421,11 @@ class Starcoder2Layer(nn.Module): super().__init__() prefix = f"model.layers.{layer_id}" self.self_attn = Starcoder2Attention( - prefix=f"{prefix}.self_attn", config=config, weights=weights + prefix=f"{prefix}.self_attn", config=config, weights=weights, index=layer_id ) self.mlp = STARCODER2_MLP_CLASSES[config.mlp_type]( - prefix=f"{prefix}.mlp", config=config, weights=weights + prefix=f"{prefix}.mlp", config=config, weights=weights, index=layer_id ) self.input_layernorm = STARCODER2_NORMALIZATION_CLASSES[config.norm_type].load( @@ -389,6 +452,7 @@ class Starcoder2Layer(nn.Module): seqlen, max_s, prefill_cache_indices, + adapter_data, ): normed_hidden_states, res = self.input_layernorm(hidden_states, residual) @@ -404,6 +468,7 @@ class Starcoder2Layer(nn.Module): seqlen, max_s, prefill_cache_indices, + adapter_data, ) # faster post attention rms norm @@ -411,7 +476,7 @@ class Starcoder2Layer(nn.Module): attn_output, res ) - mlp_output = self.mlp(normed_attn_res_output) + mlp_output = self.mlp(normed_attn_res_output, adapter_data) return mlp_output, attn_res @@ -458,6 +523,7 @@ class Starcoder2Model(torch.nn.Module): max_s: int, true_max_s: int, prefill_cache_indices: Optional[torch.Tensor], + adapter_data, ) -> torch.Tensor: hidden_states = self.embed_tokens(input_ids) @@ -481,6 +547,7 @@ class Starcoder2Model(torch.nn.Module): seqlen, max_s, prefill_cache_indices, + adapter_data, ) hidden_states, _ = self.norm(hidden_states, residual) @@ -552,6 +619,7 @@ class FlashStarcoder2ForCausalLM(torch.nn.Module): max_s, true_max_s, prefill_cache_indices, + adapter_data, ) if lm_head_indices is not None: hidden_states = hidden_states[lm_head_indices] diff --git a/server/text_generation_server/utils/adapter.py b/server/text_generation_server/utils/adapter.py index 09254b68a..50abfafd5 100644 --- a/server/text_generation_server/utils/adapter.py +++ b/server/text_generation_server/utils/adapter.py @@ -281,6 +281,12 @@ def get_mlp_weights(i, layer): if hasattr(mlp, "up_proj"): weights[(i, "up_proj")] = (f"model.layers.{i}.mlp.up_proj", mlp.up_proj) + if hasattr(mlp, "c_fc"): + weights[(i, "c_fc")] = (f"model.layers.{i}.mlp.c_fc", mlp.c_fc) + + if hasattr(mlp, "c_proj"): + weights[(i, "c_proj")] = (f"model.layers.{i}.mlp.c_proj", mlp.c_proj) + if hasattr(mlp, "down_proj"): weights[(i, "down_proj")] = ( f"model.layers.{i}.mlp.down_proj", From 885144166f8e73d34e61d49147518f2755458d46 Mon Sep 17 00:00:00 2001 From: "Wang, Yi" Date: Fri, 17 Jan 2025 19:04:57 +0800 Subject: [PATCH 032/183] Flash decoding kernel adding and prefill-chunking and prefix caching enabling in intel cpu/xpu (#2815) * 
flash decoding Signed-off-by: Wang, Yi A * enable xpu flashdecoding Signed-off-by: Wang, Yi A * set flashdecoding blocksize as 64 Signed-off-by: Wang, Yi A * enable flashdecoding, prefill chunking and prefix caching Signed-off-by: Wang, Yi A * add flashdecoding-ipex Signed-off-by: Wang, Yi A --------- Signed-off-by: Wang, Yi A --- Dockerfile_intel | 6 +- launcher/src/main.rs | 4 +- .../layers/attention/ipex.py | 98 +++++++++++++------ .../layers/attention/kv_cache.py | 11 ++- .../text_generation_server/models/globals.py | 11 ++- server/text_generation_server/models/model.py | 7 +- 6 files changed, 97 insertions(+), 40 deletions(-) diff --git a/Dockerfile_intel b/Dockerfile_intel index 2b41fd8bc..d6556bead 100644 --- a/Dockerfile_intel +++ b/Dockerfile_intel @@ -224,9 +224,9 @@ COPY --from=builder /usr/src/target/release-opt/text-generation-router /usr/loca COPY --from=builder /usr/src/target/release-opt/text-generation-launcher /usr/local/bin/text-generation-launcher FROM ${PLATFORM} AS final -ENV ATTENTION=paged -ENV PREFIX_CACHING=0 -ENV PREFILL_CHUNKING=0 +ENV ATTENTION=flashdecoding-ipex +ENV PREFIX_CACHING=1 +ENV PREFILL_CHUNKING=1 ENV CUDA_GRAPHS=0 ENTRYPOINT ["text-generation-launcher"] CMD ["--json-output"] diff --git a/launcher/src/main.rs b/launcher/src/main.rs index 18badeafc..394cc1e64 100644 --- a/launcher/src/main.rs +++ b/launcher/src/main.rs @@ -143,7 +143,9 @@ fn resolve_attention(config: &Option, lora_adapters: &Option) -> } } - let fallback_attention = if matches!(compute_capability, Some((major, _)) if major < 8) { + let fallback_attention = if compute_capability.is_none() + || matches!(compute_capability, Some((major, _)) if major < 8) + { "paged" } else { "flashdecoding" diff --git a/server/text_generation_server/layers/attention/ipex.py b/server/text_generation_server/layers/attention/ipex.py index 677f3f564..54422308f 100644 --- a/server/text_generation_server/layers/attention/ipex.py +++ b/server/text_generation_server/layers/attention/ipex.py @@ -1,9 +1,12 @@ import intel_extension_for_pytorch as ipex import torch from text_generation_server.layers.attention.kv_cache import KVCache, KVScales -from text_generation_server.models.flash_causal_lm import BLOCK_SIZE from text_generation_server.layers.attention import Seqlen from typing import Optional +from text_generation_server.models.globals import ( + ATTENTION, + BLOCK_SIZE, +) SUPPORTS_WINDOWING = False @@ -28,22 +31,38 @@ def attention( out = torch.empty_like(query) # We do not need to check window_size_left (not supported) here, so it is already checked ahead of time at model load. 
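A minimal sketch (a Python rendition of the Rust rule changed in launcher/src/main.rs above; the function name is illustrative only) of the new fallback: an undetected compute capability now selects paged attention instead of flashdecoding:

def resolve_fallback_attention(compute_capability):
    # compute_capability is a (major, minor) tuple on CUDA GPUs, or None when it
    # cannot be detected (e.g. CPU/XPU hosts); both cases now fall back to "paged".
    if compute_capability is None or compute_capability[0] < 8:
        return "paged"
    return "flashdecoding"

assert resolve_fallback_attention(None) == "paged"        # before this patch, None fell through to "flashdecoding"
assert resolve_fallback_attention((7, 5)) == "paged"      # pre-Ampere GPUs keep the old behaviour
assert resolve_fallback_attention((8, 0)) == "flashdecoding"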
- ipex.llm.functional.varlen_attention( - query.contiguous() if query.device.type == "xpu" else query, - key.contiguous() if key.device.type == "xpu" else key, - value.contiguous() if value.device.type == "xpu" else value, - out, - seqlen.cu_seqlen_q, - seqlen.cu_seqlen_q, - seqlen.max_q, - seqlen.max_q, - 0.0, - softmax_scale, - False, - causal, - False, - None, - ) + if ATTENTION == "flashdecoding-ipex": + ipex.llm.modules.PagedAttention.flash_attn_varlen_func( + out, + query.contiguous() if query.device.type == "xpu" else query, + kv_cache.key, + kv_cache.value, + seqlen.cu_seqlen_q, + seqlen.cu_seqlen_k, + seqlen.max_q, + seqlen.max_k, + softmax_scale, + causal, + block_tables, + None, + ) + else: + ipex.llm.functional.varlen_attention( + query.contiguous() if query.device.type == "xpu" else query, + key.contiguous() if key.device.type == "xpu" else key, + value.contiguous() if value.device.type == "xpu" else value, + out, + seqlen.cu_seqlen_q, + seqlen.cu_seqlen_q, + seqlen.max_q, + seqlen.max_q, + 0.0, + softmax_scale, + False, + causal, + False, + None, + ) return out @@ -64,20 +83,37 @@ def paged_attention( raise NotImplementedError("softcap is not available in IPEX") out = torch.empty_like(query) - input_lengths = seqlen.input_lengths + seqlen.cache_lengths - ipex.llm.modules.PagedAttention.single_query_cached_kv_attention( - out, - query, - kv_cache.key, - kv_cache.value, - kv_head_mapping, - softmax_scale, - block_tables, - input_lengths, - BLOCK_SIZE, - max_s, - None, - ) + + if ATTENTION == "flashdecoding-ipex": + ipex.llm.modules.PagedAttention.flash_attn_varlen_func( + out, + query.contiguous() if query.device.type == "xpu" else query, + kv_cache.key, + kv_cache.value, + seqlen.cu_seqlen_q, + seqlen.cu_seqlen_k, + seqlen.max_q, + seqlen.max_k, + softmax_scale, + True, + block_tables, + None, + ) + else: + input_lengths = seqlen.input_lengths + seqlen.cache_lengths + ipex.llm.modules.PagedAttention.single_query_cached_kv_attention( + out, + query, + kv_cache.key, + kv_cache.value, + kv_head_mapping, + softmax_scale, + block_tables, + input_lengths, + BLOCK_SIZE, + max_s, + None, + ) return out diff --git a/server/text_generation_server/layers/attention/kv_cache.py b/server/text_generation_server/layers/attention/kv_cache.py index 93d747324..003086018 100644 --- a/server/text_generation_server/layers/attention/kv_cache.py +++ b/server/text_generation_server/layers/attention/kv_cache.py @@ -66,7 +66,9 @@ class KVCache: else: x = BLOCK_SIZE // element_size - if ATTENTION in {"flashdecoding", "flashinfer"}: + if ATTENTION in {"flashdecoding", "flashinfer"} or ( + ATTENTION == "flashdecoding-ipex" and device.type == "xpu" + ): self.kv_cache = ( torch.empty( (num_blocks, BLOCK_SIZE, num_heads, head_size), @@ -80,6 +82,7 @@ class KVCache: ), ) elif SYSTEM == "ipex" and device == torch.device("cpu"): + # ipex cpu flashdecoding kernel and paged attention kernel share same layout self.kv_cache = ( torch.empty( (num_blocks, num_heads, BLOCK_SIZE, head_size), @@ -187,6 +190,12 @@ class KVCache: shape = key_cache.shape key_cache.view(-1, shape[-2], shape[-1])[slots] = key value_cache.view(-1, shape[-2], shape[-1])[slots] = value + elif ATTENTION == "flashdecoding-ipex" and key.device.type == "xpu": + import intel_extension_for_pytorch as ipex + + ipex.llm.modules.PagedAttention.reshape_and_cache_flash( + key, value, key_cache, value_cache, slots + ) else: paged_reshape_and_cache(key, value, key_cache, value_cache, slots) diff --git a/server/text_generation_server/models/globals.py 
b/server/text_generation_server/models/globals.py index ce8791411..889de028d 100644 --- a/server/text_generation_server/models/globals.py +++ b/server/text_generation_server/models/globals.py @@ -14,13 +14,17 @@ PREFIX_CACHING = os.environ["PREFIX_CACHING"].lower() in { } PREFILL_CHUNKING = os.getenv("PREFILL_CHUNKING", "1").lower() in {"1", "true"} log_master(logger.info, f"Using prefix caching = {PREFIX_CACHING}") -_expected = {"paged", "flashdecoding", "flashinfer"} +_expected = {"paged", "flashdecoding", "flashdecoding-ipex", "flashinfer"} assert ( ATTENTION in _expected ), f"Attention is not valid {ATTENTION}, expected {_expected}" log_master(logger.info, f"Using Attention = {ATTENTION}") -if PREFIX_CACHING and ATTENTION not in {"flashinfer", "flashdecoding"}: +if PREFIX_CACHING and ATTENTION not in { + "flashinfer", + "flashdecoding", + "flashdecoding-ipex", +}: raise RuntimeError("Prefix caching is only supported with flashinfer") MEM_POOL = torch.cuda.graph_pool_handle() if torch.cuda.is_available() else None @@ -28,12 +32,15 @@ TGI_WIGGLE_ROOM = float(os.getenv("TGI_WIGGLE_ROOM", "0.95")) assert TGI_WIGGLE_ROOM > 0 assert TGI_WIGGLE_ROOM < 1 + # This is overridden by the cli BLOCK_SIZE: int if ATTENTION == "flashdecoding": BLOCK_SIZE = 256 elif ATTENTION == "flashinfer": BLOCK_SIZE = 1 +elif ATTENTION == "flashdecoding-ipex": + BLOCK_SIZE = 64 else: BLOCK_SIZE = 16 diff --git a/server/text_generation_server/models/model.py b/server/text_generation_server/models/model.py index 805fd7710..af4d1f082 100644 --- a/server/text_generation_server/models/model.py +++ b/server/text_generation_server/models/model.py @@ -79,10 +79,13 @@ class Model(ABC): "Prefill chunking will be turned off", ) support_chunking = False - if ATTENTION not in ["flashinfer", "flashdecoding"] and support_chunking: + if ( + ATTENTION not in ["flashinfer", "flashdecoding", "flashdecoding-ipex"] + and support_chunking + ): log_master( logger.warning, - "Prefill chunking is only supported with `flashinfer` or `flashdecoding` attention types.", + "Prefill chunking is only supported with `flashinfer` or `flashdecoding` or `flashdecoding-ipex` attention types.", ) support_chunking = False From d61f14f271dbe363737d57516cb28b934bad3fe5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Fri, 17 Jan 2025 12:12:11 +0100 Subject: [PATCH 033/183] nix: update to PyTorch 2.5.1 (#2921) --- flake.lock | 7 ++++--- flake.nix | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/flake.lock b/flake.lock index 23e76b8be..a7c3beb51 100644 --- a/flake.lock +++ b/flake.lock @@ -978,15 +978,16 @@ "nixpkgs": "nixpkgs_6" }, "locked": { - "lastModified": 1736436388, - "narHash": "sha256-CIyxVPpM9RrSwthNT/4DQ10YPk/uwzP7AeE83kBNsrE=", + "lastModified": 1737042758, + "narHash": "sha256-yW8IIcNU08IXxFJEv9lqnmx+4qcT6IfTvDqgw8NO1nM=", "owner": "huggingface", "repo": "text-generation-inference-nix", - "rev": "5103c3fb1f9ad1fd33b6e09ff05e957884b112d5", + "rev": "12137321ce83850131ef5307e846c4e304366e3a", "type": "github" }, "original": { "owner": "huggingface", + "ref": "pytorch-3.5.1", "repo": "text-generation-inference-nix", "type": "github" } diff --git a/flake.nix b/flake.nix index 83cedfa62..eb55f5126 100644 --- a/flake.nix +++ b/flake.nix @@ -5,7 +5,7 @@ inputs.nixpkgs.follows = "tgi-nix/nixpkgs"; }; nix-filter.url = "github:numtide/nix-filter"; - tgi-nix.url = "github:huggingface/text-generation-inference-nix"; + tgi-nix.url = "github:huggingface/text-generation-inference-nix/pytorch-3.5.1"; nixpkgs.follows 
= "tgi-nix/nixpkgs"; flake-utils.url = "github:numtide/flake-utils"; rust-overlay = { From de19e7e84445c44ccd605628c2bd5c36b32d89dd Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Fri, 17 Jan 2025 12:32:00 +0100 Subject: [PATCH 034/183] Moving to `uv` instead of `poetry`. (#2919) * Moving to `uv` instead of `poetry`. More in the standard, faster, seemingly better lockfile. * Creating venv if not created. * Create the venv. * Fix ? * Fixing the test by activating the environment ? * Install system ? * Add the cli entry point. * docker install on system * Monkeying this... * `--system` is redundant. * Trying to force-include this pb folder. * TRying to check that pb is imported correctly. * Editable install necessary ? * Non editable? * Editable it is. --- .github/workflows/tests.yaml | 6 +- Dockerfile | 10 +- Dockerfile_amd | 7 +- Dockerfile_intel | 10 +- flake.lock | 6 +- server/Makefile | 21 +- server/poetry.lock | 4101 ---------------------------------- server/pyproject.toml | 157 +- server/uv.lock | 3426 ++++++++++++++++++++++++++++ 9 files changed, 3536 insertions(+), 4208 deletions(-) delete mode 100644 server/poetry.lock create mode 100644 server/uv.lock diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index 6bcf7d962..cbb3392a0 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -44,10 +44,14 @@ jobs: run: | sudo apt update sudo apt install python3.11-dev -y + pip install -U pip uv + uv venv + source ./.venv/bin/activate make install-cpu - name: Run server tests run: | - pip install pytest + source ./.venv/bin/activate + uv pip install pytest export HF_TOKEN=${{ secrets.HF_TOKEN }} pytest -s -vv server/tests - name: Pre-commit checks diff --git a/Dockerfile b/Dockerfile index 0f2ae6cc8..18b1d5b0a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -224,17 +224,19 @@ COPY --from=mamba-builder /usr/src/causal-conv1d/build/lib.linux-x86_64-cpython- COPY --from=flashinfer-builder /opt/conda/lib/python3.11/site-packages/flashinfer/ /opt/conda/lib/python3.11/site-packages/flashinfer/ # Install flash-attention dependencies -RUN pip install einops --no-cache-dir +# RUN pip install einops --no-cache-dir # Install server COPY proto proto COPY server server COPY server/Makefile server/Makefile +ENV UV_SYSTEM_PYTHON=1 RUN cd server && \ make gen-server && \ - pip install -r requirements_cuda.txt && \ - pip install ".[attention, bnb, accelerate, compressed-tensors, marlin, moe, quantize, peft, outlines]" --no-cache-dir && \ - pip install nvidia-nccl-cu12==2.22.3 + python -c "from text_generation_server.pb import generate_pb2" && \ + pip install -U pip uv && \ + uv pip install -e ".[attention, bnb, accelerate, compressed-tensors, marlin, moe, quantize, peft, outlines]" --no-cache-dir && \ + uv pip install nvidia-nccl-cu12==2.22.3 ENV LD_PRELOAD=/opt/conda/lib/python3.11/site-packages/nvidia/nccl/lib/libnccl.so.2 # Required to find libpython within the rust binaries diff --git a/Dockerfile_amd b/Dockerfile_amd index 92acff5aa..293cac2de 100644 --- a/Dockerfile_amd +++ b/Dockerfile_amd @@ -104,7 +104,7 @@ RUN case ${TARGETPLATFORM} in \ /opt/conda/bin/conda clean -ya # Install flash-attention, torch dependencies -RUN python3 -m pip install --upgrade pip && pip install numpy einops ninja joblib msgpack cmake --no-cache-dir && rm -rf /var/lib/apt/lists/* +RUN python3 -m pip install --upgrade pip uv && pip install numpy einops ninja joblib msgpack cmake --no-cache-dir && rm -rf /var/lib/apt/lists/* RUN conda install mkl=2021 ENV 
LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib/:/opt/conda/lib/python3.11/site-packages/torch/lib:/opt/conda/lib/ @@ -318,10 +318,11 @@ COPY --from=moe-kernels /usr/src/moe-kernels/build/lib.linux-x86_64-cpython-311 COPY proto proto COPY server server COPY server/Makefile server/Makefile +ENV UV_SYSTEM_PYTHON=1 RUN cd server && \ make gen-server && \ - pip install -r requirements_rocm.txt && \ - pip install ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir + pip install -U pip uv && \ + uv pip install -e ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir # Install benchmarker COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark diff --git a/Dockerfile_intel b/Dockerfile_intel index d6556bead..5edd8951e 100644 --- a/Dockerfile_intel +++ b/Dockerfile_intel @@ -108,10 +108,11 @@ RUN pip install triton-xpu==3.0.0b2 --no-cache-dir COPY proto proto COPY server server COPY server/Makefile server/Makefile +ENV UV_SYSTEM_PYTHON=1 RUN cd server && \ make gen-server && \ - pip install -r requirements_intel.txt && \ - pip install ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir + pip install -U pip uv && \ + uv pip install -e ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/intel/oneapi/pti/0.9/lib:/opt/conda/lib ENV CCL_ZE_IPC_EXCHANGE=sockets @@ -211,10 +212,11 @@ ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/conda/lib/" COPY proto proto COPY server server COPY server/Makefile server/Makefile +ENV UV_SYSTEM_PYTHON=1 RUN cd server && \ make gen-server && \ - pip install -r requirements_intel.txt && \ - pip install ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir + pip install -U pip uv && \ + uv pip install -e ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir # Install benchmarker COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark diff --git a/flake.lock b/flake.lock index a7c3beb51..772de6ebe 100644 --- a/flake.lock +++ b/flake.lock @@ -853,11 +853,11 @@ ] }, "locked": { - "lastModified": 1736907983, - "narHash": "sha256-fw55wVwpJW36Md2HZBKuxX3YHGeqsGsspPLtCMVr1Y8=", + "lastModified": 1736994333, + "narHash": "sha256-v4Jrok5yXsZ6dwj2+2uo5cSyUi9fBTurHqHvNHLT1XA=", "owner": "oxalica", "repo": "rust-overlay", - "rev": "eaa365c911441e07e387ff6acc596619fc50b156", + "rev": "848db855cb9e88785996e961951659570fc58814", "type": "github" }, "original": { diff --git a/server/Makefile b/server/Makefile index b5677db8e..252e355d8 100644 --- a/server/Makefile +++ b/server/Makefile @@ -9,11 +9,14 @@ include Makefile-exllamav2 include Makefile-flashinfer unit-tests: + pip install -U pip uv + uv pip install -e ".[dev]" pytest -s -vv -m "not private" tests gen-server: # Compile protos - pip install grpcio-tools==1.62.2 mypy-protobuf==3.6.0 'types-protobuf' --no-cache-dir + pip install -U pip uv + uv pip install ".[gen]" mkdir text_generation_server/pb || true python -m grpc_tools.protoc -I../proto/v3 --python_out=text_generation_server/pb \ --grpc_python_out=text_generation_server/pb --mypy_out=text_generation_server/pb ../proto/v3/generate.proto @@ -21,24 +24,14 @@ gen-server: touch text_generation_server/pb/__init__.py install-server: gen-server - pip install pip --upgrade - pip install -r requirements_cuda.txt - pip install -e ".[accelerate, compressed-tensors, quantize, peft, outlines]" + uv pip install -e ".[accelerate, compressed-tensors, quantize, 
peft, outlines]" install: install-cuda echo "Installed server" install-cuda: install-server install-flash-attention-v2-cuda install-flash-attention - pip install -e ".[attention,bnb,marlin,moe]" - pip install nvidia-nccl-cu12==2.22.3 + uv pip install -e ".[attention,bnb,marlin,moe]" + uv pip install nvidia-nccl-cu12==2.22.3 install-rocm: install-server install-flash-attention-v2-rocm install-vllm-rocm - -run-dev: - SAFETENSORS_FAST_GPU=1 python -m torch.distributed.run --nproc_per_node=2 text_generation_server/cli.py serve bigscience/bloom-560m --sharded - -export-requirements: - poetry export -o requirements_cuda.txt --without-hashes - poetry export -o requirements_rocm.txt --without-hashes - poetry export -o requirements_intel.txt --without-hashes diff --git a/server/poetry.lock b/server/poetry.lock deleted file mode 100644 index 93db8dc97..000000000 --- a/server/poetry.lock +++ /dev/null @@ -1,4101 +0,0 @@ -# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand. - -[[package]] -name = "accelerate" -version = "1.1.1" -description = "Accelerate" -optional = true -python-versions = ">=3.9.0" -files = [ - {file = "accelerate-1.1.1-py3-none-any.whl", hash = "sha256:61edd81762131b8d4bede008643fa1e1f3bf59bec710ebda9771443e24feae02"}, - {file = "accelerate-1.1.1.tar.gz", hash = "sha256:0d39dfac557052bc735eb2703a0e87742879e1e40b88af8a2f9a93233d4cd7db"}, -] - -[package.dependencies] -huggingface-hub = ">=0.21.0" -numpy = ">=1.17,<3.0.0" -packaging = ">=20.0" -psutil = "*" -pyyaml = "*" -safetensors = ">=0.4.3" -torch = ">=1.10.0" - -[package.extras] -deepspeed = ["deepspeed"] -dev = ["bitsandbytes", "black (>=23.1,<24.0)", "datasets", "diffusers", "evaluate", "hf-doc-builder (>=0.3.0)", "parameterized", "pytest (>=7.2.0,<=8.0.0)", "pytest-subtests", "pytest-xdist", "rich", "ruff (>=0.6.4,<0.7.0)", "scikit-learn", "scipy", "timm", "torchdata (>=0.8.0)", "torchpippy (>=0.2.0)", "tqdm", "transformers"] -quality = ["black (>=23.1,<24.0)", "hf-doc-builder (>=0.3.0)", "ruff (>=0.6.4,<0.7.0)"] -rich = ["rich"] -sagemaker = ["sagemaker"] -test-dev = ["bitsandbytes", "datasets", "diffusers", "evaluate", "scikit-learn", "scipy", "timm", "torchdata (>=0.8.0)", "torchpippy (>=0.2.0)", "tqdm", "transformers"] -test-prod = ["parameterized", "pytest (>=7.2.0,<=8.0.0)", "pytest-subtests", "pytest-xdist"] -test-trackers = ["comet-ml", "dvclive", "tensorboard", "wandb"] -testing = ["bitsandbytes", "datasets", "diffusers", "evaluate", "parameterized", "pytest (>=7.2.0,<=8.0.0)", "pytest-subtests", "pytest-xdist", "scikit-learn", "scipy", "timm", "torchdata (>=0.8.0)", "torchpippy (>=0.2.0)", "tqdm", "transformers"] - -[[package]] -name = "aiohappyeyeballs" -version = "2.4.3" -description = "Happy Eyeballs for asyncio" -optional = true -python-versions = ">=3.8" -files = [ - {file = "aiohappyeyeballs-2.4.3-py3-none-any.whl", hash = "sha256:8a7a83727b2756f394ab2895ea0765a0a8c475e3c71e98d43d76f22b4b435572"}, - {file = "aiohappyeyeballs-2.4.3.tar.gz", hash = "sha256:75cf88a15106a5002a8eb1dab212525c00d1f4c0fa96e551c9fbe6f09a621586"}, -] - -[[package]] -name = "aiohttp" -version = "3.10.10" -description = "Async http client/server framework (asyncio)" -optional = true -python-versions = ">=3.8" -files = [ - {file = "aiohttp-3.10.10-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:be7443669ae9c016b71f402e43208e13ddf00912f47f623ee5994e12fc7d4b3f"}, - {file = "aiohttp-3.10.10-cp310-cp310-macosx_10_9_x86_64.whl", hash = 
"sha256:7b06b7843929e41a94ea09eb1ce3927865387e3e23ebe108e0d0d09b08d25be9"}, - {file = "aiohttp-3.10.10-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:333cf6cf8e65f6a1e06e9eb3e643a0c515bb850d470902274239fea02033e9a8"}, - {file = "aiohttp-3.10.10-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:274cfa632350225ce3fdeb318c23b4a10ec25c0e2c880eff951a3842cf358ac1"}, - {file = "aiohttp-3.10.10-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d9e5e4a85bdb56d224f412d9c98ae4cbd032cc4f3161818f692cd81766eee65a"}, - {file = "aiohttp-3.10.10-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2b606353da03edcc71130b52388d25f9a30a126e04caef1fd637e31683033abd"}, - {file = "aiohttp-3.10.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ab5a5a0c7a7991d90446a198689c0535be89bbd6b410a1f9a66688f0880ec026"}, - {file = "aiohttp-3.10.10-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:578a4b875af3e0daaf1ac6fa983d93e0bbfec3ead753b6d6f33d467100cdc67b"}, - {file = "aiohttp-3.10.10-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:8105fd8a890df77b76dd3054cddf01a879fc13e8af576805d667e0fa0224c35d"}, - {file = "aiohttp-3.10.10-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:3bcd391d083f636c06a68715e69467963d1f9600f85ef556ea82e9ef25f043f7"}, - {file = "aiohttp-3.10.10-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:fbc6264158392bad9df19537e872d476f7c57adf718944cc1e4495cbabf38e2a"}, - {file = "aiohttp-3.10.10-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:e48d5021a84d341bcaf95c8460b152cfbad770d28e5fe14a768988c461b821bc"}, - {file = "aiohttp-3.10.10-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:2609e9ab08474702cc67b7702dbb8a80e392c54613ebe80db7e8dbdb79837c68"}, - {file = "aiohttp-3.10.10-cp310-cp310-win32.whl", hash = "sha256:84afcdea18eda514c25bc68b9af2a2b1adea7c08899175a51fe7c4fb6d551257"}, - {file = "aiohttp-3.10.10-cp310-cp310-win_amd64.whl", hash = "sha256:9c72109213eb9d3874f7ac8c0c5fa90e072d678e117d9061c06e30c85b4cf0e6"}, - {file = "aiohttp-3.10.10-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:c30a0eafc89d28e7f959281b58198a9fa5e99405f716c0289b7892ca345fe45f"}, - {file = "aiohttp-3.10.10-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:258c5dd01afc10015866114e210fb7365f0d02d9d059c3c3415382ab633fcbcb"}, - {file = "aiohttp-3.10.10-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:15ecd889a709b0080f02721255b3f80bb261c2293d3c748151274dfea93ac871"}, - {file = "aiohttp-3.10.10-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f3935f82f6f4a3820270842e90456ebad3af15810cf65932bd24da4463bc0a4c"}, - {file = "aiohttp-3.10.10-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:413251f6fcf552a33c981c4709a6bba37b12710982fec8e558ae944bfb2abd38"}, - {file = "aiohttp-3.10.10-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d1720b4f14c78a3089562b8875b53e36b51c97c51adc53325a69b79b4b48ebcb"}, - {file = "aiohttp-3.10.10-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:679abe5d3858b33c2cf74faec299fda60ea9de62916e8b67e625d65bf069a3b7"}, - {file = "aiohttp-3.10.10-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:79019094f87c9fb44f8d769e41dbb664d6e8fcfd62f665ccce36762deaa0e911"}, - {file = "aiohttp-3.10.10-cp311-cp311-musllinux_1_2_aarch64.whl", hash = 
"sha256:fe2fb38c2ed905a2582948e2de560675e9dfbee94c6d5ccdb1301c6d0a5bf092"}, - {file = "aiohttp-3.10.10-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:a3f00003de6eba42d6e94fabb4125600d6e484846dbf90ea8e48a800430cc142"}, - {file = "aiohttp-3.10.10-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:1bbb122c557a16fafc10354b9d99ebf2f2808a660d78202f10ba9d50786384b9"}, - {file = "aiohttp-3.10.10-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:30ca7c3b94708a9d7ae76ff281b2f47d8eaf2579cd05971b5dc681db8caac6e1"}, - {file = "aiohttp-3.10.10-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:df9270660711670e68803107d55c2b5949c2e0f2e4896da176e1ecfc068b974a"}, - {file = "aiohttp-3.10.10-cp311-cp311-win32.whl", hash = "sha256:aafc8ee9b742ce75044ae9a4d3e60e3d918d15a4c2e08a6c3c3e38fa59b92d94"}, - {file = "aiohttp-3.10.10-cp311-cp311-win_amd64.whl", hash = "sha256:362f641f9071e5f3ee6f8e7d37d5ed0d95aae656adf4ef578313ee585b585959"}, - {file = "aiohttp-3.10.10-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:9294bbb581f92770e6ed5c19559e1e99255e4ca604a22c5c6397b2f9dd3ee42c"}, - {file = "aiohttp-3.10.10-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:a8fa23fe62c436ccf23ff930149c047f060c7126eae3ccea005f0483f27b2e28"}, - {file = "aiohttp-3.10.10-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:5c6a5b8c7926ba5d8545c7dd22961a107526562da31a7a32fa2456baf040939f"}, - {file = "aiohttp-3.10.10-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:007ec22fbc573e5eb2fb7dec4198ef8f6bf2fe4ce20020798b2eb5d0abda6138"}, - {file = "aiohttp-3.10.10-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9627cc1a10c8c409b5822a92d57a77f383b554463d1884008e051c32ab1b3742"}, - {file = "aiohttp-3.10.10-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:50edbcad60d8f0e3eccc68da67f37268b5144ecc34d59f27a02f9611c1d4eec7"}, - {file = "aiohttp-3.10.10-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a45d85cf20b5e0d0aa5a8dca27cce8eddef3292bc29d72dcad1641f4ed50aa16"}, - {file = "aiohttp-3.10.10-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0b00807e2605f16e1e198f33a53ce3c4523114059b0c09c337209ae55e3823a8"}, - {file = "aiohttp-3.10.10-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:f2d4324a98062be0525d16f768a03e0bbb3b9fe301ceee99611dc9a7953124e6"}, - {file = "aiohttp-3.10.10-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:438cd072f75bb6612f2aca29f8bd7cdf6e35e8f160bc312e49fbecab77c99e3a"}, - {file = "aiohttp-3.10.10-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:baa42524a82f75303f714108fea528ccacf0386af429b69fff141ffef1c534f9"}, - {file = "aiohttp-3.10.10-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:a7d8d14fe962153fc681f6366bdec33d4356f98a3e3567782aac1b6e0e40109a"}, - {file = "aiohttp-3.10.10-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:c1277cd707c465cd09572a774559a3cc7c7a28802eb3a2a9472588f062097205"}, - {file = "aiohttp-3.10.10-cp312-cp312-win32.whl", hash = "sha256:59bb3c54aa420521dc4ce3cc2c3fe2ad82adf7b09403fa1f48ae45c0cbde6628"}, - {file = "aiohttp-3.10.10-cp312-cp312-win_amd64.whl", hash = "sha256:0e1b370d8007c4ae31ee6db7f9a2fe801a42b146cec80a86766e7ad5c4a259cf"}, - {file = "aiohttp-3.10.10-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:ad7593bb24b2ab09e65e8a1d385606f0f47c65b5a2ae6c551db67d6653e78c28"}, - {file = "aiohttp-3.10.10-cp313-cp313-macosx_10_13_x86_64.whl", hash = 
"sha256:1eb89d3d29adaf533588f209768a9c02e44e4baf832b08118749c5fad191781d"}, - {file = "aiohttp-3.10.10-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:3fe407bf93533a6fa82dece0e74dbcaaf5d684e5a51862887f9eaebe6372cd79"}, - {file = "aiohttp-3.10.10-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:50aed5155f819873d23520919e16703fc8925e509abbb1a1491b0087d1cd969e"}, - {file = "aiohttp-3.10.10-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4f05e9727ce409358baa615dbeb9b969db94324a79b5a5cea45d39bdb01d82e6"}, - {file = "aiohttp-3.10.10-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3dffb610a30d643983aeb185ce134f97f290f8935f0abccdd32c77bed9388b42"}, - {file = "aiohttp-3.10.10-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aa6658732517ddabe22c9036479eabce6036655ba87a0224c612e1ae6af2087e"}, - {file = "aiohttp-3.10.10-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:741a46d58677d8c733175d7e5aa618d277cd9d880301a380fd296975a9cdd7bc"}, - {file = "aiohttp-3.10.10-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:e00e3505cd80440f6c98c6d69269dcc2a119f86ad0a9fd70bccc59504bebd68a"}, - {file = "aiohttp-3.10.10-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:ffe595f10566f8276b76dc3a11ae4bb7eba1aac8ddd75811736a15b0d5311414"}, - {file = "aiohttp-3.10.10-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:bdfcf6443637c148c4e1a20c48c566aa694fa5e288d34b20fcdc58507882fed3"}, - {file = "aiohttp-3.10.10-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:d183cf9c797a5291e8301790ed6d053480ed94070637bfaad914dd38b0981f67"}, - {file = "aiohttp-3.10.10-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:77abf6665ae54000b98b3c742bc6ea1d1fb31c394bcabf8b5d2c1ac3ebfe7f3b"}, - {file = "aiohttp-3.10.10-cp313-cp313-win32.whl", hash = "sha256:4470c73c12cd9109db8277287d11f9dd98f77fc54155fc71a7738a83ffcc8ea8"}, - {file = "aiohttp-3.10.10-cp313-cp313-win_amd64.whl", hash = "sha256:486f7aabfa292719a2753c016cc3a8f8172965cabb3ea2e7f7436c7f5a22a151"}, - {file = "aiohttp-3.10.10-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:1b66ccafef7336a1e1f0e389901f60c1d920102315a56df85e49552308fc0486"}, - {file = "aiohttp-3.10.10-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:acd48d5b80ee80f9432a165c0ac8cbf9253eaddb6113269a5e18699b33958dbb"}, - {file = "aiohttp-3.10.10-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:3455522392fb15ff549d92fbf4b73b559d5e43dc522588f7eb3e54c3f38beee7"}, - {file = "aiohttp-3.10.10-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:45c3b868724137f713a38376fef8120c166d1eadd50da1855c112fe97954aed8"}, - {file = "aiohttp-3.10.10-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:da1dee8948d2137bb51fbb8a53cce6b1bcc86003c6b42565f008438b806cccd8"}, - {file = "aiohttp-3.10.10-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c5ce2ce7c997e1971b7184ee37deb6ea9922ef5163c6ee5aa3c274b05f9e12fa"}, - {file = "aiohttp-3.10.10-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:28529e08fde6f12eba8677f5a8608500ed33c086f974de68cc65ab218713a59d"}, - {file = "aiohttp-3.10.10-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f7db54c7914cc99d901d93a34704833568d86c20925b2762f9fa779f9cd2e70f"}, - {file = "aiohttp-3.10.10-cp38-cp38-musllinux_1_2_aarch64.whl", hash = 
"sha256:03a42ac7895406220124c88911ebee31ba8b2d24c98507f4a8bf826b2937c7f2"}, - {file = "aiohttp-3.10.10-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:7e338c0523d024fad378b376a79faff37fafb3c001872a618cde1d322400a572"}, - {file = "aiohttp-3.10.10-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:038f514fe39e235e9fef6717fbf944057bfa24f9b3db9ee551a7ecf584b5b480"}, - {file = "aiohttp-3.10.10-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:64f6c17757251e2b8d885d728b6433d9d970573586a78b78ba8929b0f41d045a"}, - {file = "aiohttp-3.10.10-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:93429602396f3383a797a2a70e5f1de5df8e35535d7806c9f91df06f297e109b"}, - {file = "aiohttp-3.10.10-cp38-cp38-win32.whl", hash = "sha256:c823bc3971c44ab93e611ab1a46b1eafeae474c0c844aff4b7474287b75fe49c"}, - {file = "aiohttp-3.10.10-cp38-cp38-win_amd64.whl", hash = "sha256:54ca74df1be3c7ca1cf7f4c971c79c2daf48d9aa65dea1a662ae18926f5bc8ce"}, - {file = "aiohttp-3.10.10-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:01948b1d570f83ee7bbf5a60ea2375a89dfb09fd419170e7f5af029510033d24"}, - {file = "aiohttp-3.10.10-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9fc1500fd2a952c5c8e3b29aaf7e3cc6e27e9cfc0a8819b3bce48cc1b849e4cc"}, - {file = "aiohttp-3.10.10-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:f614ab0c76397661b90b6851a030004dac502e48260ea10f2441abd2207fbcc7"}, - {file = "aiohttp-3.10.10-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:00819de9e45d42584bed046314c40ea7e9aea95411b38971082cad449392b08c"}, - {file = "aiohttp-3.10.10-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:05646ebe6b94cc93407b3bf34b9eb26c20722384d068eb7339de802154d61bc5"}, - {file = "aiohttp-3.10.10-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:998f3bd3cfc95e9424a6acd7840cbdd39e45bc09ef87533c006f94ac47296090"}, - {file = "aiohttp-3.10.10-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d9010c31cd6fa59438da4e58a7f19e4753f7f264300cd152e7f90d4602449762"}, - {file = "aiohttp-3.10.10-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7ea7ffc6d6d6f8a11e6f40091a1040995cdff02cfc9ba4c2f30a516cb2633554"}, - {file = "aiohttp-3.10.10-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:ef9c33cc5cbca35808f6c74be11eb7f5f6b14d2311be84a15b594bd3e58b5527"}, - {file = "aiohttp-3.10.10-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:ce0cdc074d540265bfeb31336e678b4e37316849d13b308607efa527e981f5c2"}, - {file = "aiohttp-3.10.10-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:597a079284b7ee65ee102bc3a6ea226a37d2b96d0418cc9047490f231dc09fe8"}, - {file = "aiohttp-3.10.10-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:7789050d9e5d0c309c706953e5e8876e38662d57d45f936902e176d19f1c58ab"}, - {file = "aiohttp-3.10.10-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:e7f8b04d83483577fd9200461b057c9f14ced334dcb053090cea1da9c8321a91"}, - {file = "aiohttp-3.10.10-cp39-cp39-win32.whl", hash = "sha256:c02a30b904282777d872266b87b20ed8cc0d1501855e27f831320f471d54d983"}, - {file = "aiohttp-3.10.10-cp39-cp39-win_amd64.whl", hash = "sha256:edfe3341033a6b53a5c522c802deb2079eee5cbfbb0af032a55064bd65c73a23"}, - {file = "aiohttp-3.10.10.tar.gz", hash = "sha256:0631dd7c9f0822cc61c88586ca76d5b5ada26538097d0f1df510b082bad3411a"}, -] - -[package.dependencies] -aiohappyeyeballs = ">=2.3.0" -aiosignal = ">=1.1.2" -async-timeout = {version = ">=4.0,<5.0", markers = "python_version < \"3.11\""} -attrs = ">=17.3.0" 
-frozenlist = ">=1.1.1" -multidict = ">=4.5,<7.0" -yarl = ">=1.12.0,<2.0" - -[package.extras] -speedups = ["Brotli", "aiodns (>=3.2.0)", "brotlicffi"] - -[[package]] -name = "aiosignal" -version = "1.3.1" -description = "aiosignal: a list of registered asynchronous callbacks" -optional = true -python-versions = ">=3.7" -files = [ - {file = "aiosignal-1.3.1-py3-none-any.whl", hash = "sha256:f8376fb07dd1e86a584e4fcdec80b36b7f81aac666ebc724e2c090300dd83b17"}, - {file = "aiosignal-1.3.1.tar.gz", hash = "sha256:54cd96e15e1649b75d6c87526a6ff0b6c1b0dd3459f43d9ca11d48c339b68cfc"}, -] - -[package.dependencies] -frozenlist = ">=1.1.0" - -[[package]] -name = "airportsdata" -version = "20241001" -description = "Extensive database of location and timezone data for nearly every airport and landing strip in the world." -optional = true -python-versions = ">=3.9" -files = [ - {file = "airportsdata-20241001-py3-none-any.whl", hash = "sha256:67d71cf2c5378cc17ff66b62b1e11aa2444043949c894543ac8fd8dafce192fd"}, - {file = "airportsdata-20241001.tar.gz", hash = "sha256:fa0bd143b4f4be3557cb892fa0612ef210fd91a92bd720b4d8221de576a4fa00"}, -] - -[[package]] -name = "annotated-types" -version = "0.7.0" -description = "Reusable constraint types to use with typing.Annotated" -optional = true -python-versions = ">=3.8" -files = [ - {file = "annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53"}, - {file = "annotated_types-0.7.0.tar.gz", hash = "sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89"}, -] - -[[package]] -name = "async-timeout" -version = "4.0.3" -description = "Timeout context manager for asyncio programs" -optional = true -python-versions = ">=3.7" -files = [ - {file = "async-timeout-4.0.3.tar.gz", hash = "sha256:4640d96be84d82d02ed59ea2b7105a0f7b33abe8703703cd0ab0bf87c427522f"}, - {file = "async_timeout-4.0.3-py3-none-any.whl", hash = "sha256:7405140ff1230c310e51dc27b3145b9092d659ce68ff733fb0cefe3ee42be028"}, -] - -[[package]] -name = "attention-kernels" -version = "0.1.1" -description = "Attention kernels" -optional = true -python-versions = ">=3.8" -files = [ - {file = "attention_kernels-0.1.1+cu123torch2.4-cp310-cp310-linux_x86_64.whl", hash = "sha256:812851d4ce0f54ca764ff3815a731b15f0cb110115d0aa2d0997cd7794d808bb"}, -] - -[package.dependencies] -torch = "*" - -[package.source] -type = "url" -url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp310-cp310-linux_x86_64.whl" - -[[package]] -name = "attention-kernels" -version = "0.1.1" -description = "Attention kernels" -optional = true -python-versions = ">=3.8" -files = [ - {file = "attention_kernels-0.1.1+cu123torch2.4-cp311-cp311-linux_x86_64.whl", hash = "sha256:614c402621b11dd1f5741a016b9fd27cb6a68814471f2048bc05206923516268"}, -] - -[package.dependencies] -torch = "*" - -[package.source] -type = "url" -url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp311-cp311-linux_x86_64.whl" - -[[package]] -name = "attention-kernels" -version = "0.1.1" -description = "Attention kernels" -optional = true -python-versions = ">=3.8" -files = [ - {file = "attention_kernels-0.1.1+cu123torch2.4-cp312-cp312-linux_x86_64.whl", hash = "sha256:6b2ca7c98997431d5f6c4af7553dce6b1bff8dfdec374c97c6ffba71325a02b7"}, -] - -[package.dependencies] -torch = "*" - -[package.source] -type = "url" -url = 
"https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp312-cp312-linux_x86_64.whl" - -[[package]] -name = "attention-kernels" -version = "0.1.1" -description = "Attention kernels" -optional = true -python-versions = ">=3.8" -files = [ - {file = "attention_kernels-0.1.1+cu123torch2.4-cp39-cp39-linux_x86_64.whl", hash = "sha256:a56710c5626e461d6f628ae14b74ffc89833578ebd59c3c0c47f5d6f07461fbf"}, -] - -[package.dependencies] -torch = "*" - -[package.source] -type = "url" -url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp39-cp39-linux_x86_64.whl" - -[[package]] -name = "attrs" -version = "24.2.0" -description = "Classes Without Boilerplate" -optional = true -python-versions = ">=3.7" -files = [ - {file = "attrs-24.2.0-py3-none-any.whl", hash = "sha256:81921eb96de3191c8258c199618104dd27ac608d9366f5e35d011eae1867ede2"}, - {file = "attrs-24.2.0.tar.gz", hash = "sha256:5cfb1b9148b5b086569baec03f20d7b6bf3bcacc9a42bebf87ffaaca362f6346"}, -] - -[package.extras] -benchmark = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-codspeed", "pytest-mypy-plugins", "pytest-xdist[psutil]"] -cov = ["cloudpickle", "coverage[toml] (>=5.3)", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] -dev = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pre-commit", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] -docs = ["cogapp", "furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib-towncrier", "towncrier (<24.7)"] -tests = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] -tests-mypy = ["mypy (>=1.11.1)", "pytest-mypy-plugins"] - -[[package]] -name = "bitsandbytes" -version = "0.45.0" -description = "k-bit optimizers and matrix multiplication routines." -optional = true -python-versions = "*" -files = [ - {file = "bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl", hash = "sha256:0f0323de1ff1fdf8383e79bdad1283516a4c05a6fd2b44a363bf4e059422305b"}, - {file = "bitsandbytes-0.45.0-py3-none-win_amd64.whl", hash = "sha256:ebbf96e0ecb466716a65ecdeaef3fa1983575447b9ab66b74e5211892507c6ff"}, -] - -[package.dependencies] -numpy = "*" -torch = "*" -typing_extensions = ">=4.8.0" - -[package.extras] -benchmark = ["matplotlib", "pandas"] -test = ["lion_pytorch", "scipy"] - -[[package]] -name = "certifi" -version = "2024.8.30" -description = "Python package for providing Mozilla's CA Bundle." -optional = false -python-versions = ">=3.6" -files = [ - {file = "certifi-2024.8.30-py3-none-any.whl", hash = "sha256:922820b53db7a7257ffbda3f597266d435245903d80737e34f8a45ff3e3230d8"}, - {file = "certifi-2024.8.30.tar.gz", hash = "sha256:bec941d2aa8195e248a60b31ff9f0558284cf01a52591ceda73ea9afffd69fd9"}, -] - -[[package]] -name = "charset-normalizer" -version = "3.4.0" -description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." 
-optional = false -python-versions = ">=3.7.0" -files = [ - {file = "charset_normalizer-3.4.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:4f9fc98dad6c2eaa32fc3af1417d95b5e3d08aff968df0cd320066def971f9a6"}, - {file = "charset_normalizer-3.4.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0de7b687289d3c1b3e8660d0741874abe7888100efe14bd0f9fd7141bcbda92b"}, - {file = "charset_normalizer-3.4.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:5ed2e36c3e9b4f21dd9422f6893dec0abf2cca553af509b10cd630f878d3eb99"}, - {file = "charset_normalizer-3.4.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:40d3ff7fc90b98c637bda91c89d51264a3dcf210cade3a2c6f838c7268d7a4ca"}, - {file = "charset_normalizer-3.4.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1110e22af8ca26b90bd6364fe4c763329b0ebf1ee213ba32b68c73de5752323d"}, - {file = "charset_normalizer-3.4.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:86f4e8cca779080f66ff4f191a685ced73d2f72d50216f7112185dc02b90b9b7"}, - {file = "charset_normalizer-3.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7f683ddc7eedd742e2889d2bfb96d69573fde1d92fcb811979cdb7165bb9c7d3"}, - {file = "charset_normalizer-3.4.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:27623ba66c183eca01bf9ff833875b459cad267aeeb044477fedac35e19ba907"}, - {file = "charset_normalizer-3.4.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:f606a1881d2663630ea5b8ce2efe2111740df4b687bd78b34a8131baa007f79b"}, - {file = "charset_normalizer-3.4.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:0b309d1747110feb25d7ed6b01afdec269c647d382c857ef4663bbe6ad95a912"}, - {file = "charset_normalizer-3.4.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:136815f06a3ae311fae551c3df1f998a1ebd01ddd424aa5603a4336997629e95"}, - {file = "charset_normalizer-3.4.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:14215b71a762336254351b00ec720a8e85cada43b987da5a042e4ce3e82bd68e"}, - {file = "charset_normalizer-3.4.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:79983512b108e4a164b9c8d34de3992f76d48cadc9554c9e60b43f308988aabe"}, - {file = "charset_normalizer-3.4.0-cp310-cp310-win32.whl", hash = "sha256:c94057af19bc953643a33581844649a7fdab902624d2eb739738a30e2b3e60fc"}, - {file = "charset_normalizer-3.4.0-cp310-cp310-win_amd64.whl", hash = "sha256:55f56e2ebd4e3bc50442fbc0888c9d8c94e4e06a933804e2af3e89e2f9c1c749"}, - {file = "charset_normalizer-3.4.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:0d99dd8ff461990f12d6e42c7347fd9ab2532fb70e9621ba520f9e8637161d7c"}, - {file = "charset_normalizer-3.4.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c57516e58fd17d03ebe67e181a4e4e2ccab1168f8c2976c6a334d4f819fe5944"}, - {file = "charset_normalizer-3.4.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:6dba5d19c4dfab08e58d5b36304b3f92f3bd5d42c1a3fa37b5ba5cdf6dfcbcee"}, - {file = "charset_normalizer-3.4.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bf4475b82be41b07cc5e5ff94810e6a01f276e37c2d55571e3fe175e467a1a1c"}, - {file = "charset_normalizer-3.4.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ce031db0408e487fd2775d745ce30a7cd2923667cf3b69d48d219f1d8f5ddeb6"}, - {file = "charset_normalizer-3.4.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8ff4e7cdfdb1ab5698e675ca622e72d58a6fa2a8aa58195de0c0061288e6e3ea"}, 
- {file = "charset_normalizer-3.4.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3710a9751938947e6327ea9f3ea6332a09bf0ba0c09cae9cb1f250bd1f1549bc"}, - {file = "charset_normalizer-3.4.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:82357d85de703176b5587dbe6ade8ff67f9f69a41c0733cf2425378b49954de5"}, - {file = "charset_normalizer-3.4.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:47334db71978b23ebcf3c0f9f5ee98b8d65992b65c9c4f2d34c2eaf5bcaf0594"}, - {file = "charset_normalizer-3.4.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:8ce7fd6767a1cc5a92a639b391891bf1c268b03ec7e021c7d6d902285259685c"}, - {file = "charset_normalizer-3.4.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:f1a2f519ae173b5b6a2c9d5fa3116ce16e48b3462c8b96dfdded11055e3d6365"}, - {file = "charset_normalizer-3.4.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:63bc5c4ae26e4bc6be6469943b8253c0fd4e4186c43ad46e713ea61a0ba49129"}, - {file = "charset_normalizer-3.4.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:bcb4f8ea87d03bc51ad04add8ceaf9b0f085ac045ab4d74e73bbc2dc033f0236"}, - {file = "charset_normalizer-3.4.0-cp311-cp311-win32.whl", hash = "sha256:9ae4ef0b3f6b41bad6366fb0ea4fc1d7ed051528e113a60fa2a65a9abb5b1d99"}, - {file = "charset_normalizer-3.4.0-cp311-cp311-win_amd64.whl", hash = "sha256:cee4373f4d3ad28f1ab6290684d8e2ebdb9e7a1b74fdc39e4c211995f77bec27"}, - {file = "charset_normalizer-3.4.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:0713f3adb9d03d49d365b70b84775d0a0d18e4ab08d12bc46baa6132ba78aaf6"}, - {file = "charset_normalizer-3.4.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:de7376c29d95d6719048c194a9cf1a1b0393fbe8488a22008610b0361d834ecf"}, - {file = "charset_normalizer-3.4.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:4a51b48f42d9358460b78725283f04bddaf44a9358197b889657deba38f329db"}, - {file = "charset_normalizer-3.4.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b295729485b06c1a0683af02a9e42d2caa9db04a373dc38a6a58cdd1e8abddf1"}, - {file = "charset_normalizer-3.4.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ee803480535c44e7f5ad00788526da7d85525cfefaf8acf8ab9a310000be4b03"}, - {file = "charset_normalizer-3.4.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3d59d125ffbd6d552765510e3f31ed75ebac2c7470c7274195b9161a32350284"}, - {file = "charset_normalizer-3.4.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8cda06946eac330cbe6598f77bb54e690b4ca93f593dee1568ad22b04f347c15"}, - {file = "charset_normalizer-3.4.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:07afec21bbbbf8a5cc3651aa96b980afe2526e7f048fdfb7f1014d84acc8b6d8"}, - {file = "charset_normalizer-3.4.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6b40e8d38afe634559e398cc32b1472f376a4099c75fe6299ae607e404c033b2"}, - {file = "charset_normalizer-3.4.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:b8dcd239c743aa2f9c22ce674a145e0a25cb1566c495928440a181ca1ccf6719"}, - {file = "charset_normalizer-3.4.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:84450ba661fb96e9fd67629b93d2941c871ca86fc38d835d19d4225ff946a631"}, - {file = "charset_normalizer-3.4.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:44aeb140295a2f0659e113b31cfe92c9061622cadbc9e2a2f7b8ef6b1e29ef4b"}, - {file = 
"charset_normalizer-3.4.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:1db4e7fefefd0f548d73e2e2e041f9df5c59e178b4c72fbac4cc6f535cfb1565"}, - {file = "charset_normalizer-3.4.0-cp312-cp312-win32.whl", hash = "sha256:5726cf76c982532c1863fb64d8c6dd0e4c90b6ece9feb06c9f202417a31f7dd7"}, - {file = "charset_normalizer-3.4.0-cp312-cp312-win_amd64.whl", hash = "sha256:b197e7094f232959f8f20541ead1d9862ac5ebea1d58e9849c1bf979255dfac9"}, - {file = "charset_normalizer-3.4.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:dd4eda173a9fcccb5f2e2bd2a9f423d180194b1bf17cf59e3269899235b2a114"}, - {file = "charset_normalizer-3.4.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e9e3c4c9e1ed40ea53acf11e2a386383c3304212c965773704e4603d589343ed"}, - {file = "charset_normalizer-3.4.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:92a7e36b000bf022ef3dbb9c46bfe2d52c047d5e3f3343f43204263c5addc250"}, - {file = "charset_normalizer-3.4.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:54b6a92d009cbe2fb11054ba694bc9e284dad30a26757b1e372a1fdddaf21920"}, - {file = "charset_normalizer-3.4.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1ffd9493de4c922f2a38c2bf62b831dcec90ac673ed1ca182fe11b4d8e9f2a64"}, - {file = "charset_normalizer-3.4.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:35c404d74c2926d0287fbd63ed5d27eb911eb9e4a3bb2c6d294f3cfd4a9e0c23"}, - {file = "charset_normalizer-3.4.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4796efc4faf6b53a18e3d46343535caed491776a22af773f366534056c4e1fbc"}, - {file = "charset_normalizer-3.4.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e7fdd52961feb4c96507aa649550ec2a0d527c086d284749b2f582f2d40a2e0d"}, - {file = "charset_normalizer-3.4.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:92db3c28b5b2a273346bebb24857fda45601aef6ae1c011c0a997106581e8a88"}, - {file = "charset_normalizer-3.4.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:ab973df98fc99ab39080bfb0eb3a925181454d7c3ac8a1e695fddfae696d9e90"}, - {file = "charset_normalizer-3.4.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:4b67fdab07fdd3c10bb21edab3cbfe8cf5696f453afce75d815d9d7223fbe88b"}, - {file = "charset_normalizer-3.4.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:aa41e526a5d4a9dfcfbab0716c7e8a1b215abd3f3df5a45cf18a12721d31cb5d"}, - {file = "charset_normalizer-3.4.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ffc519621dce0c767e96b9c53f09c5d215578e10b02c285809f76509a3931482"}, - {file = "charset_normalizer-3.4.0-cp313-cp313-win32.whl", hash = "sha256:f19c1585933c82098c2a520f8ec1227f20e339e33aca8fa6f956f6691b784e67"}, - {file = "charset_normalizer-3.4.0-cp313-cp313-win_amd64.whl", hash = "sha256:707b82d19e65c9bd28b81dde95249b07bf9f5b90ebe1ef17d9b57473f8a64b7b"}, - {file = "charset_normalizer-3.4.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:dbe03226baf438ac4fda9e2d0715022fd579cb641c4cf639fa40d53b2fe6f3e2"}, - {file = "charset_normalizer-3.4.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dd9a8bd8900e65504a305bf8ae6fa9fbc66de94178c420791d0293702fce2df7"}, - {file = "charset_normalizer-3.4.0-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b8831399554b92b72af5932cdbbd4ddc55c55f631bb13ff8fe4e6536a06c5c51"}, - {file = "charset_normalizer-3.4.0-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:a14969b8691f7998e74663b77b4c36c0337cb1df552da83d5c9004a93afdb574"}, - {file = "charset_normalizer-3.4.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dcaf7c1524c0542ee2fc82cc8ec337f7a9f7edee2532421ab200d2b920fc97cf"}, - {file = "charset_normalizer-3.4.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:425c5f215d0eecee9a56cdb703203dda90423247421bf0d67125add85d0c4455"}, - {file = "charset_normalizer-3.4.0-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:d5b054862739d276e09928de37c79ddeec42a6e1bfc55863be96a36ba22926f6"}, - {file = "charset_normalizer-3.4.0-cp37-cp37m-musllinux_1_2_i686.whl", hash = "sha256:f3e73a4255342d4eb26ef6df01e3962e73aa29baa3124a8e824c5d3364a65748"}, - {file = "charset_normalizer-3.4.0-cp37-cp37m-musllinux_1_2_ppc64le.whl", hash = "sha256:2f6c34da58ea9c1a9515621f4d9ac379871a8f21168ba1b5e09d74250de5ad62"}, - {file = "charset_normalizer-3.4.0-cp37-cp37m-musllinux_1_2_s390x.whl", hash = "sha256:f09cb5a7bbe1ecae6e87901a2eb23e0256bb524a79ccc53eb0b7629fbe7677c4"}, - {file = "charset_normalizer-3.4.0-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:0099d79bdfcf5c1f0c2c72f91516702ebf8b0b8ddd8905f97a8aecf49712c621"}, - {file = "charset_normalizer-3.4.0-cp37-cp37m-win32.whl", hash = "sha256:9c98230f5042f4945f957d006edccc2af1e03ed5e37ce7c373f00a5a4daa6149"}, - {file = "charset_normalizer-3.4.0-cp37-cp37m-win_amd64.whl", hash = "sha256:62f60aebecfc7f4b82e3f639a7d1433a20ec32824db2199a11ad4f5e146ef5ee"}, - {file = "charset_normalizer-3.4.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:af73657b7a68211996527dbfeffbb0864e043d270580c5aef06dc4b659a4b578"}, - {file = "charset_normalizer-3.4.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:cab5d0b79d987c67f3b9e9c53f54a61360422a5a0bc075f43cab5621d530c3b6"}, - {file = "charset_normalizer-3.4.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:9289fd5dddcf57bab41d044f1756550f9e7cf0c8e373b8cdf0ce8773dc4bd417"}, - {file = "charset_normalizer-3.4.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6b493a043635eb376e50eedf7818f2f322eabbaa974e948bd8bdd29eb7ef2a51"}, - {file = "charset_normalizer-3.4.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9fa2566ca27d67c86569e8c85297aaf413ffab85a8960500f12ea34ff98e4c41"}, - {file = "charset_normalizer-3.4.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a8e538f46104c815be19c975572d74afb53f29650ea2025bbfaef359d2de2f7f"}, - {file = "charset_normalizer-3.4.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6fd30dc99682dc2c603c2b315bded2799019cea829f8bf57dc6b61efde6611c8"}, - {file = "charset_normalizer-3.4.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2006769bd1640bdf4d5641c69a3d63b71b81445473cac5ded39740a226fa88ab"}, - {file = "charset_normalizer-3.4.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:dc15e99b2d8a656f8e666854404f1ba54765871104e50c8e9813af8a7db07f12"}, - {file = "charset_normalizer-3.4.0-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:ab2e5bef076f5a235c3774b4f4028a680432cded7cad37bba0fd90d64b187d19"}, - {file = "charset_normalizer-3.4.0-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:4ec9dd88a5b71abfc74e9df5ebe7921c35cbb3b641181a531ca65cdb5e8e4dea"}, - {file = "charset_normalizer-3.4.0-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:43193c5cda5d612f247172016c4bb71251c784d7a4d9314677186a838ad34858"}, - 
{file = "charset_normalizer-3.4.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:aa693779a8b50cd97570e5a0f343538a8dbd3e496fa5dcb87e29406ad0299654"}, - {file = "charset_normalizer-3.4.0-cp38-cp38-win32.whl", hash = "sha256:7706f5850360ac01d80c89bcef1640683cc12ed87f42579dab6c5d3ed6888613"}, - {file = "charset_normalizer-3.4.0-cp38-cp38-win_amd64.whl", hash = "sha256:c3e446d253bd88f6377260d07c895816ebf33ffffd56c1c792b13bff9c3e1ade"}, - {file = "charset_normalizer-3.4.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:980b4f289d1d90ca5efcf07958d3eb38ed9c0b7676bf2831a54d4f66f9c27dfa"}, - {file = "charset_normalizer-3.4.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:f28f891ccd15c514a0981f3b9db9aa23d62fe1a99997512b0491d2ed323d229a"}, - {file = "charset_normalizer-3.4.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a8aacce6e2e1edcb6ac625fb0f8c3a9570ccc7bfba1f63419b3769ccf6a00ed0"}, - {file = "charset_normalizer-3.4.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bd7af3717683bea4c87acd8c0d3d5b44d56120b26fd3f8a692bdd2d5260c620a"}, - {file = "charset_normalizer-3.4.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5ff2ed8194587faf56555927b3aa10e6fb69d931e33953943bc4f837dfee2242"}, - {file = "charset_normalizer-3.4.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e91f541a85298cf35433bf66f3fab2a4a2cff05c127eeca4af174f6d497f0d4b"}, - {file = "charset_normalizer-3.4.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:309a7de0a0ff3040acaebb35ec45d18db4b28232f21998851cfa709eeff49d62"}, - {file = "charset_normalizer-3.4.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:285e96d9d53422efc0d7a17c60e59f37fbf3dfa942073f666db4ac71e8d726d0"}, - {file = "charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:5d447056e2ca60382d460a604b6302d8db69476fd2015c81e7c35417cfabe4cd"}, - {file = "charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:20587d20f557fe189b7947d8e7ec5afa110ccf72a3128d61a2a387c3313f46be"}, - {file = "charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:130272c698667a982a5d0e626851ceff662565379baf0ff2cc58067b81d4f11d"}, - {file = "charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:ab22fbd9765e6954bc0bcff24c25ff71dcbfdb185fcdaca49e81bac68fe724d3"}, - {file = "charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:7782afc9b6b42200f7362858f9e73b1f8316afb276d316336c0ec3bd73312742"}, - {file = "charset_normalizer-3.4.0-cp39-cp39-win32.whl", hash = "sha256:2de62e8801ddfff069cd5c504ce3bc9672b23266597d4e4f50eda28846c322f2"}, - {file = "charset_normalizer-3.4.0-cp39-cp39-win_amd64.whl", hash = "sha256:95c3c157765b031331dd4db3c775e58deaee050a3042fcad72cbc4189d7c8dca"}, - {file = "charset_normalizer-3.4.0-py3-none-any.whl", hash = "sha256:fe9f97feb71aa9896b81973a7bbada8c49501dc73e58a10fcef6663af95e5079"}, - {file = "charset_normalizer-3.4.0.tar.gz", hash = "sha256:223217c3d4f82c3ac5e29032b3f1c2eb0fb591b72161f86d93f5719079dae93e"}, -] - -[[package]] -name = "click" -version = "8.1.7" -description = "Composable command line interface toolkit" -optional = false -python-versions = ">=3.7" -files = [ - {file = "click-8.1.7-py3-none-any.whl", hash = "sha256:ae74fb96c20a0277a1d615f1e4d73c8414f5a98db8b799a7931d1582f3390c28"}, - {file = "click-8.1.7.tar.gz", hash = 
"sha256:ca9853ad459e787e2192211578cc907e7594e294c7ccc834310722b41b9ca6de"}, -] - -[package.dependencies] -colorama = {version = "*", markers = "platform_system == \"Windows\""} - -[[package]] -name = "cloudpickle" -version = "3.1.0" -description = "Pickler class to extend the standard pickle.Pickler functionality" -optional = true -python-versions = ">=3.8" -files = [ - {file = "cloudpickle-3.1.0-py3-none-any.whl", hash = "sha256:fe11acda67f61aaaec473e3afe030feb131d78a43461b718185363384f1ba12e"}, - {file = "cloudpickle-3.1.0.tar.gz", hash = "sha256:81a929b6e3c7335c863c771d673d105f02efdb89dfaba0c90495d1c64796601b"}, -] - -[[package]] -name = "colorama" -version = "0.4.6" -description = "Cross-platform colored terminal text." -optional = false -python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" -files = [ - {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, - {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, -] - -[[package]] -name = "compressed-tensors" -version = "0.7.1" -description = "Library for utilization of compressed safetensors of neural network models" -optional = true -python-versions = "*" -files = [ - {file = "compressed-tensors-0.7.1.tar.gz", hash = "sha256:3c7865ebfe4ea76ae94d7c674bcf93aedd2064571f682c09a377a219d5ebb3a0"}, - {file = "compressed_tensors-0.7.1-py3-none-any.whl", hash = "sha256:22d11558a70f655ae647db9c8e9fb14a5e9d6983ca5aec3f267518625fd6dd0e"}, -] - -[package.dependencies] -pydantic = ">=2.0" -torch = ">=1.7.0" -transformers = "*" - -[package.extras] -accelerate = ["accelerate"] -dev = ["black (==22.12.0)", "flake8 (>=3.8.3)", "isort (==5.8.0)", "nbconvert (>=7.16.3)", "pytest (>=6.0.0)", "wheel (>=0.36.2)"] - -[[package]] -name = "datasets" -version = "2.21.0" -description = "HuggingFace community-driven open-source library of datasets" -optional = true -python-versions = ">=3.8.0" -files = [ - {file = "datasets-2.21.0-py3-none-any.whl", hash = "sha256:25e4e097110ce28824b746a107727ada94024cba11db8bc588d468414692b65a"}, - {file = "datasets-2.21.0.tar.gz", hash = "sha256:998f85a8460f1bd982e5bd058f8a0808eef424249e3df1e8cdd594ccd0dc8ba2"}, -] - -[package.dependencies] -aiohttp = "*" -dill = ">=0.3.0,<0.3.9" -filelock = "*" -fsspec = {version = ">=2023.1.0,<=2024.6.1", extras = ["http"]} -huggingface-hub = ">=0.21.2" -multiprocess = "*" -numpy = ">=1.17" -packaging = "*" -pandas = "*" -pyarrow = ">=15.0.0" -pyyaml = ">=5.1" -requests = ">=2.32.2" -tqdm = ">=4.66.3" -xxhash = "*" - -[package.extras] -apache-beam = ["apache-beam (>=2.26.0)"] -audio = ["librosa", "soundfile (>=0.12.1)", "soxr (>=0.4.0)"] -benchmarks = ["tensorflow (==2.12.0)", "torch (==2.0.1)", "transformers (==4.30.1)"] -dev = ["Pillow (>=9.4.0)", "absl-py", "decorator", "elasticsearch (<8.0.0)", "faiss-cpu (>=1.8.0.post1)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "ruff (>=0.3.0)", "s3fs", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tensorflow (>=2.16.0)", "tensorflow (>=2.6.0)", "tensorflow (>=2.6.0)", "tiktoken", "torch", "torch (>=2.0.0)", "transformers", "transformers (>=4.42.0)", "typing-extensions (>=4.6.1)", "zstandard"] -docs = ["s3fs", "tensorflow (>=2.6.0)", "torch", 
"transformers"] -jax = ["jax (>=0.3.14)", "jaxlib (>=0.3.14)"] -metrics-tests = ["Werkzeug (>=1.0.1)", "accelerate", "bert-score (>=0.3.6)", "jiwer", "langdetect", "mauve-text", "nltk (<3.8.2)", "requests-file (>=1.5.1)", "rouge-score", "sacrebleu", "sacremoses", "scikit-learn", "scipy", "sentencepiece", "seqeval", "six (>=1.15.0,<1.16.0)", "spacy (>=3.0.0)", "texttable (>=1.6.3)", "tldextract", "tldextract (>=3.1.0)", "toml (>=0.10.1)", "typer (<0.5.0)"] -quality = ["ruff (>=0.3.0)"] -s3 = ["s3fs"] -tensorflow = ["tensorflow (>=2.6.0)"] -tensorflow-gpu = ["tensorflow (>=2.6.0)"] -tests = ["Pillow (>=9.4.0)", "absl-py", "decorator", "elasticsearch (<8.0.0)", "faiss-cpu (>=1.8.0.post1)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tensorflow (>=2.16.0)", "tensorflow (>=2.6.0)", "tiktoken", "torch (>=2.0.0)", "transformers (>=4.42.0)", "typing-extensions (>=4.6.1)", "zstandard"] -tests-numpy2 = ["Pillow (>=9.4.0)", "absl-py", "decorator", "elasticsearch (<8.0.0)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tiktoken", "torch (>=2.0.0)", "typing-extensions (>=4.6.1)", "zstandard"] -torch = ["torch"] -vision = ["Pillow (>=9.4.0)"] - -[[package]] -name = "deprecated" -version = "1.2.14" -description = "Python @deprecated decorator to deprecate old python classes, functions or methods." -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" -files = [ - {file = "Deprecated-1.2.14-py2.py3-none-any.whl", hash = "sha256:6fac8b097794a90302bdbb17b9b815e732d3c4720583ff1b198499d78470466c"}, - {file = "Deprecated-1.2.14.tar.gz", hash = "sha256:e5323eb936458dccc2582dc6f9c322c852a775a27065ff2b0c4970b9d53d01b3"}, -] - -[package.dependencies] -wrapt = ">=1.10,<2" - -[package.extras] -dev = ["PyTest", "PyTest-Cov", "bump2version (<1)", "sphinx (<2)", "tox"] - -[[package]] -name = "dill" -version = "0.3.8" -description = "serialize all of Python" -optional = true -python-versions = ">=3.8" -files = [ - {file = "dill-0.3.8-py3-none-any.whl", hash = "sha256:c36ca9ffb54365bdd2f8eb3eff7d2a21237f8452b57ace88b1ac615b7e815bd7"}, - {file = "dill-0.3.8.tar.gz", hash = "sha256:3ebe3c479ad625c4553aca177444d89b486b1d84982eeacded644afc0cf797ca"}, -] - -[package.extras] -graph = ["objgraph (>=1.7.2)"] -profile = ["gprof2dot (>=2022.7.29)"] - -[[package]] -name = "diskcache" -version = "5.6.3" -description = "Disk Cache -- Disk and file backed persistent cache." 
-optional = true -python-versions = ">=3" -files = [ - {file = "diskcache-5.6.3-py3-none-any.whl", hash = "sha256:5e31b2d5fbad117cc363ebaf6b689474db18a1f6438bc82358b024abd4c2ca19"}, - {file = "diskcache-5.6.3.tar.gz", hash = "sha256:2c3a3fa2743d8535d832ec61c2054a1641f41775aa7c556758a109941e33e4fc"}, -] - -[[package]] -name = "einops" -version = "0.8.0" -description = "A new flavour of deep learning operations" -optional = false -python-versions = ">=3.8" -files = [ - {file = "einops-0.8.0-py3-none-any.whl", hash = "sha256:9572fb63046264a862693b0a87088af3bdc8c068fde03de63453cbbde245465f"}, - {file = "einops-0.8.0.tar.gz", hash = "sha256:63486517fed345712a8385c100cb279108d9d47e6ae59099b07657e983deae85"}, -] - -[[package]] -name = "exceptiongroup" -version = "1.2.2" -description = "Backport of PEP 654 (exception groups)" -optional = false -python-versions = ">=3.7" -files = [ - {file = "exceptiongroup-1.2.2-py3-none-any.whl", hash = "sha256:3111b9d131c238bec2f8f516e123e14ba243563fb135d3fe885990585aa7795b"}, - {file = "exceptiongroup-1.2.2.tar.gz", hash = "sha256:47c2edf7c6738fafb49fd34290706d1a1a2f4d1c6df275526b62cbb4aa5393cc"}, -] - -[package.extras] -test = ["pytest (>=6)"] - -[[package]] -name = "filelock" -version = "3.16.1" -description = "A platform independent file lock." -optional = false -python-versions = ">=3.8" -files = [ - {file = "filelock-3.16.1-py3-none-any.whl", hash = "sha256:2082e5703d51fbf98ea75855d9d5527e33d8ff23099bec374a134febee6946b0"}, - {file = "filelock-3.16.1.tar.gz", hash = "sha256:c249fbfcd5db47e5e2d6d62198e565475ee65e4831e2561c8e313fa7eb961435"}, -] - -[package.extras] -docs = ["furo (>=2024.8.6)", "sphinx (>=8.0.2)", "sphinx-autodoc-typehints (>=2.4.1)"] -testing = ["covdefaults (>=2.3)", "coverage (>=7.6.1)", "diff-cover (>=9.2)", "pytest (>=8.3.3)", "pytest-asyncio (>=0.24)", "pytest-cov (>=5)", "pytest-mock (>=3.14)", "pytest-timeout (>=2.3.1)", "virtualenv (>=20.26.4)"] -typing = ["typing-extensions (>=4.12.2)"] - -[[package]] -name = "frozenlist" -version = "1.5.0" -description = "A list-like structure which implements collections.abc.MutableSequence" -optional = true -python-versions = ">=3.8" -files = [ - {file = "frozenlist-1.5.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:5b6a66c18b5b9dd261ca98dffcb826a525334b2f29e7caa54e182255c5f6a65a"}, - {file = "frozenlist-1.5.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d1b3eb7b05ea246510b43a7e53ed1653e55c2121019a97e60cad7efb881a97bb"}, - {file = "frozenlist-1.5.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:15538c0cbf0e4fa11d1e3a71f823524b0c46299aed6e10ebb4c2089abd8c3bec"}, - {file = "frozenlist-1.5.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e79225373c317ff1e35f210dd5f1344ff31066ba8067c307ab60254cd3a78ad5"}, - {file = "frozenlist-1.5.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9272fa73ca71266702c4c3e2d4a28553ea03418e591e377a03b8e3659d94fa76"}, - {file = "frozenlist-1.5.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:498524025a5b8ba81695761d78c8dd7382ac0b052f34e66939c42df860b8ff17"}, - {file = "frozenlist-1.5.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:92b5278ed9d50fe610185ecd23c55d8b307d75ca18e94c0e7de328089ac5dcba"}, - {file = "frozenlist-1.5.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:7f3c8c1dacd037df16e85227bac13cca58c30da836c6f936ba1df0c05d046d8d"}, - {file = "frozenlist-1.5.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:f2ac49a9bedb996086057b75bf93538240538c6d9b38e57c82d51f75a73409d2"}, - {file = "frozenlist-1.5.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:e66cc454f97053b79c2ab09c17fbe3c825ea6b4de20baf1be28919460dd7877f"}, - {file = "frozenlist-1.5.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:5a3ba5f9a0dfed20337d3e966dc359784c9f96503674c2faf015f7fe8e96798c"}, - {file = "frozenlist-1.5.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:6321899477db90bdeb9299ac3627a6a53c7399c8cd58d25da094007402b039ab"}, - {file = "frozenlist-1.5.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:76e4753701248476e6286f2ef492af900ea67d9706a0155335a40ea21bf3b2f5"}, - {file = "frozenlist-1.5.0-cp310-cp310-win32.whl", hash = "sha256:977701c081c0241d0955c9586ffdd9ce44f7a7795df39b9151cd9a6fd0ce4cfb"}, - {file = "frozenlist-1.5.0-cp310-cp310-win_amd64.whl", hash = "sha256:189f03b53e64144f90990d29a27ec4f7997d91ed3d01b51fa39d2dbe77540fd4"}, - {file = "frozenlist-1.5.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:fd74520371c3c4175142d02a976aee0b4cb4a7cc912a60586ffd8d5929979b30"}, - {file = "frozenlist-1.5.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:2f3f7a0fbc219fb4455264cae4d9f01ad41ae6ee8524500f381de64ffaa077d5"}, - {file = "frozenlist-1.5.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f47c9c9028f55a04ac254346e92977bf0f166c483c74b4232bee19a6697e4778"}, - {file = "frozenlist-1.5.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0996c66760924da6e88922756d99b47512a71cfd45215f3570bf1e0b694c206a"}, - {file = "frozenlist-1.5.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a2fe128eb4edeabe11896cb6af88fca5346059f6c8d807e3b910069f39157869"}, - {file = "frozenlist-1.5.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1a8ea951bbb6cacd492e3948b8da8c502a3f814f5d20935aae74b5df2b19cf3d"}, - {file = "frozenlist-1.5.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:de537c11e4aa01d37db0d403b57bd6f0546e71a82347a97c6a9f0dcc532b3a45"}, - {file = "frozenlist-1.5.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9c2623347b933fcb9095841f1cc5d4ff0b278addd743e0e966cb3d460278840d"}, - {file = "frozenlist-1.5.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:cee6798eaf8b1416ef6909b06f7dc04b60755206bddc599f52232606e18179d3"}, - {file = "frozenlist-1.5.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:f5f9da7f5dbc00a604fe74aa02ae7c98bcede8a3b8b9666f9f86fc13993bc71a"}, - {file = "frozenlist-1.5.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:90646abbc7a5d5c7c19461d2e3eeb76eb0b204919e6ece342feb6032c9325ae9"}, - {file = "frozenlist-1.5.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:bdac3c7d9b705d253b2ce370fde941836a5f8b3c5c2b8fd70940a3ea3af7f4f2"}, - {file = "frozenlist-1.5.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:03d33c2ddbc1816237a67f66336616416e2bbb6beb306e5f890f2eb22b959cdf"}, - {file = "frozenlist-1.5.0-cp311-cp311-win32.whl", hash = "sha256:237f6b23ee0f44066219dae14c70ae38a63f0440ce6750f868ee08775073f942"}, - {file = "frozenlist-1.5.0-cp311-cp311-win_amd64.whl", hash = "sha256:0cc974cc93d32c42e7b0f6cf242a6bd941c57c61b618e78b6c0a96cb72788c1d"}, - {file = 
"frozenlist-1.5.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:31115ba75889723431aa9a4e77d5f398f5cf976eea3bdf61749731f62d4a4a21"}, - {file = "frozenlist-1.5.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:7437601c4d89d070eac8323f121fcf25f88674627505334654fd027b091db09d"}, - {file = "frozenlist-1.5.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:7948140d9f8ece1745be806f2bfdf390127cf1a763b925c4a805c603df5e697e"}, - {file = "frozenlist-1.5.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:feeb64bc9bcc6b45c6311c9e9b99406660a9c05ca8a5b30d14a78555088b0b3a"}, - {file = "frozenlist-1.5.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:683173d371daad49cffb8309779e886e59c2f369430ad28fe715f66d08d4ab1a"}, - {file = "frozenlist-1.5.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7d57d8f702221405a9d9b40f9da8ac2e4a1a8b5285aac6100f3393675f0a85ee"}, - {file = "frozenlist-1.5.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:30c72000fbcc35b129cb09956836c7d7abf78ab5416595e4857d1cae8d6251a6"}, - {file = "frozenlist-1.5.0-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:000a77d6034fbad9b6bb880f7ec073027908f1b40254b5d6f26210d2dab1240e"}, - {file = "frozenlist-1.5.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:5d7f5a50342475962eb18b740f3beecc685a15b52c91f7d975257e13e029eca9"}, - {file = "frozenlist-1.5.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:87f724d055eb4785d9be84e9ebf0f24e392ddfad00b3fe036e43f489fafc9039"}, - {file = "frozenlist-1.5.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:6e9080bb2fb195a046e5177f10d9d82b8a204c0736a97a153c2466127de87784"}, - {file = "frozenlist-1.5.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:9b93d7aaa36c966fa42efcaf716e6b3900438632a626fb09c049f6a2f09fc631"}, - {file = "frozenlist-1.5.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:52ef692a4bc60a6dd57f507429636c2af8b6046db8b31b18dac02cbc8f507f7f"}, - {file = "frozenlist-1.5.0-cp312-cp312-win32.whl", hash = "sha256:29d94c256679247b33a3dc96cce0f93cbc69c23bf75ff715919332fdbb6a32b8"}, - {file = "frozenlist-1.5.0-cp312-cp312-win_amd64.whl", hash = "sha256:8969190d709e7c48ea386db202d708eb94bdb29207a1f269bab1196ce0dcca1f"}, - {file = "frozenlist-1.5.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:7a1a048f9215c90973402e26c01d1cff8a209e1f1b53f72b95c13db61b00f953"}, - {file = "frozenlist-1.5.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:dd47a5181ce5fcb463b5d9e17ecfdb02b678cca31280639255ce9d0e5aa67af0"}, - {file = "frozenlist-1.5.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:1431d60b36d15cda188ea222033eec8e0eab488f39a272461f2e6d9e1a8e63c2"}, - {file = "frozenlist-1.5.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6482a5851f5d72767fbd0e507e80737f9c8646ae7fd303def99bfe813f76cf7f"}, - {file = "frozenlist-1.5.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:44c49271a937625619e862baacbd037a7ef86dd1ee215afc298a417ff3270608"}, - {file = "frozenlist-1.5.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:12f78f98c2f1c2429d42e6a485f433722b0061d5c0b0139efa64f396efb5886b"}, - {file = "frozenlist-1.5.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:ce3aa154c452d2467487765e3adc730a8c153af77ad84096bc19ce19a2400840"}, - {file = "frozenlist-1.5.0-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9b7dc0c4338e6b8b091e8faf0db3168a37101943e687f373dce00959583f7439"}, - {file = "frozenlist-1.5.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:45e0896250900b5aa25180f9aec243e84e92ac84bd4a74d9ad4138ef3f5c97de"}, - {file = "frozenlist-1.5.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:561eb1c9579d495fddb6da8959fd2a1fca2c6d060d4113f5844b433fc02f2641"}, - {file = "frozenlist-1.5.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:df6e2f325bfee1f49f81aaac97d2aa757c7646534a06f8f577ce184afe2f0a9e"}, - {file = "frozenlist-1.5.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:140228863501b44b809fb39ec56b5d4071f4d0aa6d216c19cbb08b8c5a7eadb9"}, - {file = "frozenlist-1.5.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:7707a25d6a77f5d27ea7dc7d1fc608aa0a478193823f88511ef5e6b8a48f9d03"}, - {file = "frozenlist-1.5.0-cp313-cp313-win32.whl", hash = "sha256:31a9ac2b38ab9b5a8933b693db4939764ad3f299fcaa931a3e605bc3460e693c"}, - {file = "frozenlist-1.5.0-cp313-cp313-win_amd64.whl", hash = "sha256:11aabdd62b8b9c4b84081a3c246506d1cddd2dd93ff0ad53ede5defec7886b28"}, - {file = "frozenlist-1.5.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:dd94994fc91a6177bfaafd7d9fd951bc8689b0a98168aa26b5f543868548d3ca"}, - {file = "frozenlist-1.5.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:2d0da8bbec082bf6bf18345b180958775363588678f64998c2b7609e34719b10"}, - {file = "frozenlist-1.5.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:73f2e31ea8dd7df61a359b731716018c2be196e5bb3b74ddba107f694fbd7604"}, - {file = "frozenlist-1.5.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:828afae9f17e6de596825cf4228ff28fbdf6065974e5ac1410cecc22f699d2b3"}, - {file = "frozenlist-1.5.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f1577515d35ed5649d52ab4319db757bb881ce3b2b796d7283e6634d99ace307"}, - {file = "frozenlist-1.5.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2150cc6305a2c2ab33299453e2968611dacb970d2283a14955923062c8d00b10"}, - {file = "frozenlist-1.5.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a72b7a6e3cd2725eff67cd64c8f13335ee18fc3c7befc05aed043d24c7b9ccb9"}, - {file = "frozenlist-1.5.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c16d2fa63e0800723139137d667e1056bee1a1cf7965153d2d104b62855e9b99"}, - {file = "frozenlist-1.5.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:17dcc32fc7bda7ce5875435003220a457bcfa34ab7924a49a1c19f55b6ee185c"}, - {file = "frozenlist-1.5.0-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:97160e245ea33d8609cd2b8fd997c850b56db147a304a262abc2b3be021a9171"}, - {file = "frozenlist-1.5.0-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:f1e6540b7fa044eee0bb5111ada694cf3dc15f2b0347ca125ee9ca984d5e9e6e"}, - {file = "frozenlist-1.5.0-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:91d6c171862df0a6c61479d9724f22efb6109111017c87567cfeb7b5d1449fdf"}, - {file = "frozenlist-1.5.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:c1fac3e2ace2eb1052e9f7c7db480818371134410e1f5c55d65e8f3ac6d1407e"}, - {file = "frozenlist-1.5.0-cp38-cp38-win32.whl", hash = "sha256:b97f7b575ab4a8af9b7bc1d2ef7f29d3afee2226bd03ca3875c16451ad5a7723"}, - {file = 
"frozenlist-1.5.0-cp38-cp38-win_amd64.whl", hash = "sha256:374ca2dabdccad8e2a76d40b1d037f5bd16824933bf7bcea3e59c891fd4a0923"}, - {file = "frozenlist-1.5.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:9bbcdfaf4af7ce002694a4e10a0159d5a8d20056a12b05b45cea944a4953f972"}, - {file = "frozenlist-1.5.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:1893f948bf6681733aaccf36c5232c231e3b5166d607c5fa77773611df6dc336"}, - {file = "frozenlist-1.5.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:2b5e23253bb709ef57a8e95e6ae48daa9ac5f265637529e4ce6b003a37b2621f"}, - {file = "frozenlist-1.5.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0f253985bb515ecd89629db13cb58d702035ecd8cfbca7d7a7e29a0e6d39af5f"}, - {file = "frozenlist-1.5.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:04a5c6babd5e8fb7d3c871dc8b321166b80e41b637c31a995ed844a6139942b6"}, - {file = "frozenlist-1.5.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a9fe0f1c29ba24ba6ff6abf688cb0b7cf1efab6b6aa6adc55441773c252f7411"}, - {file = "frozenlist-1.5.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:226d72559fa19babe2ccd920273e767c96a49b9d3d38badd7c91a0fdeda8ea08"}, - {file = "frozenlist-1.5.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:15b731db116ab3aedec558573c1a5eec78822b32292fe4f2f0345b7f697745c2"}, - {file = "frozenlist-1.5.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:366d8f93e3edfe5a918c874702f78faac300209a4d5bf38352b2c1bdc07a766d"}, - {file = "frozenlist-1.5.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:1b96af8c582b94d381a1c1f51ffaedeb77c821c690ea5f01da3d70a487dd0a9b"}, - {file = "frozenlist-1.5.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:c03eff4a41bd4e38415cbed054bbaff4a075b093e2394b6915dca34a40d1e38b"}, - {file = "frozenlist-1.5.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:50cf5e7ee9b98f22bdecbabf3800ae78ddcc26e4a435515fc72d97903e8488e0"}, - {file = "frozenlist-1.5.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:1e76bfbc72353269c44e0bc2cfe171900fbf7f722ad74c9a7b638052afe6a00c"}, - {file = "frozenlist-1.5.0-cp39-cp39-win32.whl", hash = "sha256:666534d15ba8f0fda3f53969117383d5dc021266b3c1a42c9ec4855e4b58b9d3"}, - {file = "frozenlist-1.5.0-cp39-cp39-win_amd64.whl", hash = "sha256:5c28f4b5dbef8a0d8aad0d4de24d1e9e981728628afaf4ea0792f5d0939372f0"}, - {file = "frozenlist-1.5.0-py3-none-any.whl", hash = "sha256:d994863bba198a4a518b467bb971c56e1db3f180a25c6cf7bb1949c267f748c3"}, - {file = "frozenlist-1.5.0.tar.gz", hash = "sha256:81d5af29e61b9c8348e876d442253723928dce6433e0e76cd925cd83f1b4b817"}, -] - -[[package]] -name = "fsspec" -version = "2024.6.1" -description = "File-system specification" -optional = false -python-versions = ">=3.8" -files = [ - {file = "fsspec-2024.6.1-py3-none-any.whl", hash = "sha256:3cb443f8bcd2efb31295a5b9fdb02aee81d8452c80d28f97a6d0959e6cee101e"}, - {file = "fsspec-2024.6.1.tar.gz", hash = "sha256:fad7d7e209dd4c1208e3bbfda706620e0da5142bebbd9c384afb95b07e798e49"}, -] - -[package.dependencies] -aiohttp = {version = "<4.0.0a0 || >4.0.0a0,<4.0.0a1 || >4.0.0a1", optional = true, markers = "extra == \"http\""} - -[package.extras] -abfs = ["adlfs"] -adl = ["adlfs"] -arrow = ["pyarrow (>=1)"] -dask = ["dask", "distributed"] -dev = ["pre-commit", "ruff"] -doc = ["numpydoc", "sphinx", "sphinx-design", "sphinx-rtd-theme", "yarl"] -dropbox = ["dropbox", 
"dropboxdrivefs", "requests"] -full = ["adlfs", "aiohttp (!=4.0.0a0,!=4.0.0a1)", "dask", "distributed", "dropbox", "dropboxdrivefs", "fusepy", "gcsfs", "libarchive-c", "ocifs", "panel", "paramiko", "pyarrow (>=1)", "pygit2", "requests", "s3fs", "smbprotocol", "tqdm"] -fuse = ["fusepy"] -gcs = ["gcsfs"] -git = ["pygit2"] -github = ["requests"] -gs = ["gcsfs"] -gui = ["panel"] -hdfs = ["pyarrow (>=1)"] -http = ["aiohttp (!=4.0.0a0,!=4.0.0a1)"] -libarchive = ["libarchive-c"] -oci = ["ocifs"] -s3 = ["s3fs"] -sftp = ["paramiko"] -smb = ["smbprotocol"] -ssh = ["paramiko"] -test = ["aiohttp (!=4.0.0a0,!=4.0.0a1)", "numpy", "pytest", "pytest-asyncio (!=0.22.0)", "pytest-benchmark", "pytest-cov", "pytest-mock", "pytest-recording", "pytest-rerunfailures", "requests"] -test-downstream = ["aiobotocore (>=2.5.4,<3.0.0)", "dask-expr", "dask[dataframe,test]", "moto[server] (>4,<5)", "pytest-timeout", "xarray"] -test-full = ["adlfs", "aiohttp (!=4.0.0a0,!=4.0.0a1)", "cloudpickle", "dask", "distributed", "dropbox", "dropboxdrivefs", "fastparquet", "fusepy", "gcsfs", "jinja2", "kerchunk", "libarchive-c", "lz4", "notebook", "numpy", "ocifs", "pandas", "panel", "paramiko", "pyarrow", "pyarrow (>=1)", "pyftpdlib", "pygit2", "pytest", "pytest-asyncio (!=0.22.0)", "pytest-benchmark", "pytest-cov", "pytest-mock", "pytest-recording", "pytest-rerunfailures", "python-snappy", "requests", "smbprotocol", "tqdm", "urllib3", "zarr", "zstandard"] -tqdm = ["tqdm"] - -[[package]] -name = "googleapis-common-protos" -version = "1.65.0" -description = "Common protobufs used in Google APIs" -optional = false -python-versions = ">=3.7" -files = [ - {file = "googleapis_common_protos-1.65.0-py2.py3-none-any.whl", hash = "sha256:2972e6c496f435b92590fd54045060867f3fe9be2c82ab148fc8885035479a63"}, - {file = "googleapis_common_protos-1.65.0.tar.gz", hash = "sha256:334a29d07cddc3aa01dee4988f9afd9b2916ee2ff49d6b757155dc0d197852c0"}, -] - -[package.dependencies] -protobuf = ">=3.20.2,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4.21.4 || >4.21.4,<4.21.5 || >4.21.5,<6.0.0.dev0" - -[package.extras] -grpc = ["grpcio (>=1.44.0,<2.0.0.dev0)"] - -[[package]] -name = "grpc-interceptor" -version = "0.15.4" -description = "Simplifies gRPC interceptors" -optional = false -python-versions = ">=3.7,<4.0" -files = [ - {file = "grpc-interceptor-0.15.4.tar.gz", hash = "sha256:1f45c0bcb58b6f332f37c637632247c9b02bc6af0fdceb7ba7ce8d2ebbfb0926"}, - {file = "grpc_interceptor-0.15.4-py3-none-any.whl", hash = "sha256:0035f33228693ed3767ee49d937bac424318db173fef4d2d0170b3215f254d9d"}, -] - -[package.dependencies] -grpcio = ">=1.49.1,<2.0.0" - -[package.extras] -testing = ["protobuf (>=4.21.9)"] - -[[package]] -name = "grpcio" -version = "1.68.0" -description = "HTTP/2-based RPC framework" -optional = false -python-versions = ">=3.8" -files = [ - {file = "grpcio-1.68.0-cp310-cp310-linux_armv7l.whl", hash = "sha256:619b5d0f29f4f5351440e9343224c3e19912c21aeda44e0c49d0d147a8d01544"}, - {file = "grpcio-1.68.0-cp310-cp310-macosx_12_0_universal2.whl", hash = "sha256:a59f5822f9459bed098ffbceb2713abbf7c6fd13f2b9243461da5c338d0cd6c3"}, - {file = "grpcio-1.68.0-cp310-cp310-manylinux_2_17_aarch64.whl", hash = "sha256:c03d89df516128febc5a7e760d675b478ba25802447624edf7aa13b1e7b11e2a"}, - {file = "grpcio-1.68.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:44bcbebb24363d587472089b89e2ea0ab2e2b4df0e4856ba4c0b087c82412121"}, - {file = "grpcio-1.68.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:79f81b7fbfb136247b70465bd836fa1733043fdee539cd6031cb499e9608a110"}, - {file = "grpcio-1.68.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:88fb2925789cfe6daa20900260ef0a1d0a61283dfb2d2fffe6194396a354c618"}, - {file = "grpcio-1.68.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:99f06232b5c9138593ae6f2e355054318717d32a9c09cdc5a2885540835067a1"}, - {file = "grpcio-1.68.0-cp310-cp310-win32.whl", hash = "sha256:a6213d2f7a22c3c30a479fb5e249b6b7e648e17f364598ff64d08a5136fe488b"}, - {file = "grpcio-1.68.0-cp310-cp310-win_amd64.whl", hash = "sha256:15327ab81131ef9b94cb9f45b5bd98803a179c7c61205c8c0ac9aff9d6c4e82a"}, - {file = "grpcio-1.68.0-cp311-cp311-linux_armv7l.whl", hash = "sha256:3b2b559beb2d433129441783e5f42e3be40a9e1a89ec906efabf26591c5cd415"}, - {file = "grpcio-1.68.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:e46541de8425a4d6829ac6c5d9b16c03c292105fe9ebf78cb1c31e8d242f9155"}, - {file = "grpcio-1.68.0-cp311-cp311-manylinux_2_17_aarch64.whl", hash = "sha256:c1245651f3c9ea92a2db4f95d37b7597db6b246d5892bca6ee8c0e90d76fb73c"}, - {file = "grpcio-1.68.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4f1931c7aa85be0fa6cea6af388e576f3bf6baee9e5d481c586980c774debcb4"}, - {file = "grpcio-1.68.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8b0ff09c81e3aded7a183bc6473639b46b6caa9c1901d6f5e2cba24b95e59e30"}, - {file = "grpcio-1.68.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:8c73f9fbbaee1a132487e31585aa83987ddf626426d703ebcb9a528cf231c9b1"}, - {file = "grpcio-1.68.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:6b2f98165ea2790ea159393a2246b56f580d24d7da0d0342c18a085299c40a75"}, - {file = "grpcio-1.68.0-cp311-cp311-win32.whl", hash = "sha256:e1e7ed311afb351ff0d0e583a66fcb39675be112d61e7cfd6c8269884a98afbc"}, - {file = "grpcio-1.68.0-cp311-cp311-win_amd64.whl", hash = "sha256:e0d2f68eaa0a755edd9a47d40e50dba6df2bceda66960dee1218da81a2834d27"}, - {file = "grpcio-1.68.0-cp312-cp312-linux_armv7l.whl", hash = "sha256:8af6137cc4ae8e421690d276e7627cfc726d4293f6607acf9ea7260bd8fc3d7d"}, - {file = "grpcio-1.68.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:4028b8e9a3bff6f377698587d642e24bd221810c06579a18420a17688e421af7"}, - {file = "grpcio-1.68.0-cp312-cp312-manylinux_2_17_aarch64.whl", hash = "sha256:f60fa2adf281fd73ae3a50677572521edca34ba373a45b457b5ebe87c2d01e1d"}, - {file = "grpcio-1.68.0-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e18589e747c1e70b60fab6767ff99b2d0c359ea1db8a2cb524477f93cdbedf5b"}, - {file = "grpcio-1.68.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e0d30f3fee9372796f54d3100b31ee70972eaadcc87314be369360248a3dcffe"}, - {file = "grpcio-1.68.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:7e0a3e72c0e9a1acab77bef14a73a416630b7fd2cbd893c0a873edc47c42c8cd"}, - {file = "grpcio-1.68.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:a831dcc343440969aaa812004685ed322cdb526cd197112d0db303b0da1e8659"}, - {file = "grpcio-1.68.0-cp312-cp312-win32.whl", hash = "sha256:5a180328e92b9a0050958ced34dddcb86fec5a8b332f5a229e353dafc16cd332"}, - {file = "grpcio-1.68.0-cp312-cp312-win_amd64.whl", hash = "sha256:2bddd04a790b69f7a7385f6a112f46ea0b34c4746f361ebafe9ca0be567c78e9"}, - {file = "grpcio-1.68.0-cp313-cp313-linux_armv7l.whl", hash = "sha256:fc05759ffbd7875e0ff2bd877be1438dfe97c9312bbc558c8284a9afa1d0f40e"}, - {file = "grpcio-1.68.0-cp313-cp313-macosx_10_13_universal2.whl", hash = 
"sha256:15fa1fe25d365a13bc6d52fcac0e3ee1f9baebdde2c9b3b2425f8a4979fccea1"}, - {file = "grpcio-1.68.0-cp313-cp313-manylinux_2_17_aarch64.whl", hash = "sha256:32a9cb4686eb2e89d97022ecb9e1606d132f85c444354c17a7dbde4a455e4a3b"}, - {file = "grpcio-1.68.0-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dba037ff8d284c8e7ea9a510c8ae0f5b016004f13c3648f72411c464b67ff2fb"}, - {file = "grpcio-1.68.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0efbbd849867e0e569af09e165363ade75cf84f5229b2698d53cf22c7a4f9e21"}, - {file = "grpcio-1.68.0-cp313-cp313-musllinux_1_1_i686.whl", hash = "sha256:4e300e6978df0b65cc2d100c54e097c10dfc7018b9bd890bbbf08022d47f766d"}, - {file = "grpcio-1.68.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:6f9c7ad1a23e1047f827385f4713b5b8c6c7d325705be1dd3e31fb00dcb2f665"}, - {file = "grpcio-1.68.0-cp313-cp313-win32.whl", hash = "sha256:3ac7f10850fd0487fcce169c3c55509101c3bde2a3b454869639df2176b60a03"}, - {file = "grpcio-1.68.0-cp313-cp313-win_amd64.whl", hash = "sha256:afbf45a62ba85a720491bfe9b2642f8761ff348006f5ef67e4622621f116b04a"}, - {file = "grpcio-1.68.0-cp38-cp38-linux_armv7l.whl", hash = "sha256:f8f695d9576ce836eab27ba7401c60acaf9ef6cf2f70dfe5462055ba3df02cc3"}, - {file = "grpcio-1.68.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:9fe1b141cda52f2ca73e17d2d3c6a9f3f3a0c255c216b50ce616e9dca7e3441d"}, - {file = "grpcio-1.68.0-cp38-cp38-manylinux_2_17_aarch64.whl", hash = "sha256:4df81d78fd1646bf94ced4fb4cd0a7fe2e91608089c522ef17bc7db26e64effd"}, - {file = "grpcio-1.68.0-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:46a2d74d4dd8993151c6cd585594c082abe74112c8e4175ddda4106f2ceb022f"}, - {file = "grpcio-1.68.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a17278d977746472698460c63abf333e1d806bd41f2224f90dbe9460101c9796"}, - {file = "grpcio-1.68.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:15377bce516b1c861c35e18eaa1c280692bf563264836cece693c0f169b48829"}, - {file = "grpcio-1.68.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:cc5f0a4f5904b8c25729a0498886b797feb817d1fd3812554ffa39551112c161"}, - {file = "grpcio-1.68.0-cp38-cp38-win32.whl", hash = "sha256:def1a60a111d24376e4b753db39705adbe9483ef4ca4761f825639d884d5da78"}, - {file = "grpcio-1.68.0-cp38-cp38-win_amd64.whl", hash = "sha256:55d3b52fd41ec5772a953612db4e70ae741a6d6ed640c4c89a64f017a1ac02b5"}, - {file = "grpcio-1.68.0-cp39-cp39-linux_armv7l.whl", hash = "sha256:0d230852ba97654453d290e98d6aa61cb48fa5fafb474fb4c4298d8721809354"}, - {file = "grpcio-1.68.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:50992f214264e207e07222703c17d9cfdcc2c46ed5a1ea86843d440148ebbe10"}, - {file = "grpcio-1.68.0-cp39-cp39-manylinux_2_17_aarch64.whl", hash = "sha256:14331e5c27ed3545360464a139ed279aa09db088f6e9502e95ad4bfa852bb116"}, - {file = "grpcio-1.68.0-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f84890b205692ea813653ece4ac9afa2139eae136e419231b0eec7c39fdbe4c2"}, - {file = "grpcio-1.68.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b0cf343c6f4f6aa44863e13ec9ddfe299e0be68f87d68e777328bff785897b05"}, - {file = "grpcio-1.68.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:fd2c2d47969daa0e27eadaf15c13b5e92605c5e5953d23c06d0b5239a2f176d3"}, - {file = "grpcio-1.68.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:18668e36e7f4045820f069997834e94e8275910b1f03e078a6020bd464cb2363"}, - {file = "grpcio-1.68.0-cp39-cp39-win32.whl", hash = 
"sha256:2af76ab7c427aaa26aa9187c3e3c42f38d3771f91a20f99657d992afada2294a"}, - {file = "grpcio-1.68.0-cp39-cp39-win_amd64.whl", hash = "sha256:e694b5928b7b33ca2d3b4d5f9bf8b5888906f181daff6b406f4938f3a997a490"}, - {file = "grpcio-1.68.0.tar.gz", hash = "sha256:7e7483d39b4a4fddb9906671e9ea21aaad4f031cdfc349fec76bdfa1e404543a"}, -] - -[package.extras] -protobuf = ["grpcio-tools (>=1.68.0)"] - -[[package]] -name = "grpcio-reflection" -version = "1.62.3" -description = "Standard Protobuf Reflection Service for gRPC" -optional = false -python-versions = ">=3.6" -files = [ - {file = "grpcio-reflection-1.62.3.tar.gz", hash = "sha256:cb84682933c400bddf94dd94f928d1c6570f500b6dd255973d4bfb495b82585f"}, - {file = "grpcio_reflection-1.62.3-py3-none-any.whl", hash = "sha256:a48ef37df81a3bada78261fc92ef382f061112f989d1312398b945cc69838b9c"}, -] - -[package.dependencies] -grpcio = ">=1.62.3" -protobuf = ">=4.21.6" - -[[package]] -name = "grpcio-status" -version = "1.62.3" -description = "Status proto mapping for gRPC" -optional = false -python-versions = ">=3.6" -files = [ - {file = "grpcio-status-1.62.3.tar.gz", hash = "sha256:289bdd7b2459794a12cf95dc0cb727bd4a1742c37bd823f760236c937e53a485"}, - {file = "grpcio_status-1.62.3-py3-none-any.whl", hash = "sha256:f9049b762ba8de6b1086789d8315846e094edac2c50beaf462338b301a8fd4b8"}, -] - -[package.dependencies] -googleapis-common-protos = ">=1.5.5" -grpcio = ">=1.62.3" -protobuf = ">=4.21.6" - -[[package]] -name = "grpcio-tools" -version = "1.62.3" -description = "Protobuf code generator for gRPC" -optional = false -python-versions = ">=3.7" -files = [ - {file = "grpcio-tools-1.62.3.tar.gz", hash = "sha256:7c7136015c3d62c3eef493efabaf9e3380e3e66d24ee8e94c01cb71377f57833"}, - {file = "grpcio_tools-1.62.3-cp310-cp310-macosx_12_0_universal2.whl", hash = "sha256:2f968b049c2849540751ec2100ab05e8086c24bead769ca734fdab58698408c1"}, - {file = "grpcio_tools-1.62.3-cp310-cp310-manylinux_2_17_aarch64.whl", hash = "sha256:0a8c0c4724ae9c2181b7dbc9b186df46e4f62cb18dc184e46d06c0ebeccf569e"}, - {file = "grpcio_tools-1.62.3-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5782883a27d3fae8c425b29a9d3dcf5f47d992848a1b76970da3b5a28d424b26"}, - {file = "grpcio_tools-1.62.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f3d812daffd0c2d2794756bd45a353f89e55dc8f91eb2fc840c51b9f6be62667"}, - {file = "grpcio_tools-1.62.3-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:b47d0dda1bdb0a0ba7a9a6de88e5a1ed61f07fad613964879954961e36d49193"}, - {file = "grpcio_tools-1.62.3-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:ca246dffeca0498be9b4e1ee169b62e64694b0f92e6d0be2573e65522f39eea9"}, - {file = "grpcio_tools-1.62.3-cp310-cp310-win32.whl", hash = "sha256:6a56d344b0bab30bf342a67e33d386b0b3c4e65868ffe93c341c51e1a8853ca5"}, - {file = "grpcio_tools-1.62.3-cp310-cp310-win_amd64.whl", hash = "sha256:710fecf6a171dcbfa263a0a3e7070e0df65ba73158d4c539cec50978f11dad5d"}, - {file = "grpcio_tools-1.62.3-cp311-cp311-macosx_10_10_universal2.whl", hash = "sha256:703f46e0012af83a36082b5f30341113474ed0d91e36640da713355cd0ea5d23"}, - {file = "grpcio_tools-1.62.3-cp311-cp311-manylinux_2_17_aarch64.whl", hash = "sha256:7cc83023acd8bc72cf74c2edbe85b52098501d5b74d8377bfa06f3e929803492"}, - {file = "grpcio_tools-1.62.3-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7ff7d58a45b75df67d25f8f144936a3e44aabd91afec833ee06826bd02b7fbe7"}, - {file = "grpcio_tools-1.62.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:7f2483ea232bd72d98a6dc6d7aefd97e5bc80b15cd909b9e356d6f3e326b6e43"}, - {file = "grpcio_tools-1.62.3-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:962c84b4da0f3b14b3cdb10bc3837ebc5f136b67d919aea8d7bb3fd3df39528a"}, - {file = "grpcio_tools-1.62.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:8ad0473af5544f89fc5a1ece8676dd03bdf160fb3230f967e05d0f4bf89620e3"}, - {file = "grpcio_tools-1.62.3-cp311-cp311-win32.whl", hash = "sha256:db3bc9fa39afc5e4e2767da4459df82b095ef0cab2f257707be06c44a1c2c3e5"}, - {file = "grpcio_tools-1.62.3-cp311-cp311-win_amd64.whl", hash = "sha256:e0898d412a434e768a0c7e365acabe13ff1558b767e400936e26b5b6ed1ee51f"}, - {file = "grpcio_tools-1.62.3-cp312-cp312-macosx_10_10_universal2.whl", hash = "sha256:d102b9b21c4e1e40af9a2ab3c6d41afba6bd29c0aa50ca013bf85c99cdc44ac5"}, - {file = "grpcio_tools-1.62.3-cp312-cp312-manylinux_2_17_aarch64.whl", hash = "sha256:0a52cc9444df978438b8d2332c0ca99000521895229934a59f94f37ed896b133"}, - {file = "grpcio_tools-1.62.3-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:141d028bf5762d4a97f981c501da873589df3f7e02f4c1260e1921e565b376fa"}, - {file = "grpcio_tools-1.62.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:47a5c093ab256dec5714a7a345f8cc89315cb57c298b276fa244f37a0ba507f0"}, - {file = "grpcio_tools-1.62.3-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:f6831fdec2b853c9daa3358535c55eed3694325889aa714070528cf8f92d7d6d"}, - {file = "grpcio_tools-1.62.3-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:e02d7c1a02e3814c94ba0cfe43d93e872c758bd8fd5c2797f894d0c49b4a1dfc"}, - {file = "grpcio_tools-1.62.3-cp312-cp312-win32.whl", hash = "sha256:b881fd9505a84457e9f7e99362eeedd86497b659030cf57c6f0070df6d9c2b9b"}, - {file = "grpcio_tools-1.62.3-cp312-cp312-win_amd64.whl", hash = "sha256:11c625eebefd1fd40a228fc8bae385e448c7e32a6ae134e43cf13bbc23f902b7"}, - {file = "grpcio_tools-1.62.3-cp37-cp37m-macosx_10_10_universal2.whl", hash = "sha256:ec6fbded0c61afe6f84e3c2a43e6d656791d95747d6d28b73eff1af64108c434"}, - {file = "grpcio_tools-1.62.3-cp37-cp37m-manylinux_2_17_aarch64.whl", hash = "sha256:bfda6ee8990997a9df95c5606f3096dae65f09af7ca03a1e9ca28f088caca5cf"}, - {file = "grpcio_tools-1.62.3-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b77f9f9cee87cd798f0fe26b7024344d1b03a7cd2d2cba7035f8433b13986325"}, - {file = "grpcio_tools-1.62.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2e02d3b96f2d0e4bab9ceaa30f37d4f75571e40c6272e95364bff3125a64d184"}, - {file = "grpcio_tools-1.62.3-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:1da38070738da53556a4b35ab67c1b9884a5dd48fa2f243db35dc14079ea3d0c"}, - {file = "grpcio_tools-1.62.3-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:ace43b26d88a58dcff16c20d23ff72b04d0a415f64d2820f4ff06b1166f50557"}, - {file = "grpcio_tools-1.62.3-cp37-cp37m-win_amd64.whl", hash = "sha256:350a80485e302daaa95d335a931f97b693e170e02d43767ab06552c708808950"}, - {file = "grpcio_tools-1.62.3-cp38-cp38-macosx_10_10_universal2.whl", hash = "sha256:c3a1ac9d394f8e229eb28eec2e04b9a6f5433fa19c9d32f1cb6066e3c5114a1d"}, - {file = "grpcio_tools-1.62.3-cp38-cp38-manylinux_2_17_aarch64.whl", hash = "sha256:11f363570dea661dde99e04a51bd108a5807b5df32a6f8bdf4860e34e94a4dbf"}, - {file = "grpcio_tools-1.62.3-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dc9ad9950119d8ae27634e68b7663cc8d340ae535a0f80d85a55e56a6973ab1f"}, - {file = 
"grpcio_tools-1.62.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8c5d22b252dcef11dd1e0fbbe5bbfb9b4ae048e8880d33338215e8ccbdb03edc"}, - {file = "grpcio_tools-1.62.3-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:27cd9ef5c5d68d5ed104b6dcb96fe9c66b82050e546c9e255716903c3d8f0373"}, - {file = "grpcio_tools-1.62.3-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:f4b1615adf67bd8bb71f3464146a6f9949972d06d21a4f5e87e73f6464d97f57"}, - {file = "grpcio_tools-1.62.3-cp38-cp38-win32.whl", hash = "sha256:e18e15287c31baf574fcdf8251fb7f997d64e96c6ecf467906e576da0a079af6"}, - {file = "grpcio_tools-1.62.3-cp38-cp38-win_amd64.whl", hash = "sha256:6c3064610826f50bd69410c63101954676edc703e03f9e8f978a135f1aaf97c1"}, - {file = "grpcio_tools-1.62.3-cp39-cp39-macosx_10_10_universal2.whl", hash = "sha256:8e62cc7164b0b7c5128e637e394eb2ef3db0e61fc798e80c301de3b2379203ed"}, - {file = "grpcio_tools-1.62.3-cp39-cp39-manylinux_2_17_aarch64.whl", hash = "sha256:c8ad5cce554e2fcaf8842dee5d9462583b601a3a78f8b76a153c38c963f58c10"}, - {file = "grpcio_tools-1.62.3-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ec279dcf3518201fc592c65002754f58a6b542798cd7f3ecd4af086422f33f29"}, - {file = "grpcio_tools-1.62.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1c989246c2aebc13253f08be32538a4039a64e12d9c18f6d662d7aee641dc8b5"}, - {file = "grpcio_tools-1.62.3-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:ca4f5eeadbb57cf03317d6a2857823239a63a59cc935f5bd6cf6e8b7af7a7ecc"}, - {file = "grpcio_tools-1.62.3-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:0cb3a3436ac119cbd37a7d3331d9bdf85dad21a6ac233a3411dff716dcbf401e"}, - {file = "grpcio_tools-1.62.3-cp39-cp39-win32.whl", hash = "sha256:3eae6ea76d62fcac091e1f15c2dcedf1dc3f114f8df1a972a8a0745e89f4cf61"}, - {file = "grpcio_tools-1.62.3-cp39-cp39-win_amd64.whl", hash = "sha256:eec73a005443061f4759b71a056f745e3b000dc0dc125c9f20560232dfbcbd14"}, -] - -[package.dependencies] -grpcio = ">=1.62.3" -protobuf = ">=4.21.6,<5.0dev" -setuptools = "*" - -[[package]] -name = "hf-transfer" -version = "0.1.8" -description = "Speed up file transfers with the Hugging Face Hub." 
-optional = false -python-versions = ">=3.7" -files = [ - {file = "hf_transfer-0.1.8-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:70858f9e94286738ed300484a45beb5cfee6a7ddac4c5886f9c6fce7823ac5ab"}, - {file = "hf_transfer-0.1.8-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:38adc73f0a8526319d90f7cc5dc2d5e4bb66f487a513d94b98aa6725be732e4a"}, - {file = "hf_transfer-0.1.8-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:44d2f0c08198d8d899fe9d66e86aee2dd844bd7ce33888f261373fcec81d2a54"}, - {file = "hf_transfer-0.1.8-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:1de2a4ef36f9e60b3d3bec00193c0aafd75771709f2ca51b9b162373f5af3d32"}, - {file = "hf_transfer-0.1.8-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e319269e3606a5ff2979296841766649ac73598a4a8eee2a968f86c8071fea5a"}, - {file = "hf_transfer-0.1.8-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0f6026cf3be6a53ea42f92172f60c1c0675baaa9073f865e671b661dde5fd157"}, - {file = "hf_transfer-0.1.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f865c33ada5bd3650c2b46e59979f2d7755c3f517f8d0facc78576a0c7d26406"}, - {file = "hf_transfer-0.1.8-cp310-none-win32.whl", hash = "sha256:2054730e8d8ed21917c64be7199e06424b2bd08df1c43a72766afaed7992f2d3"}, - {file = "hf_transfer-0.1.8-cp310-none-win_amd64.whl", hash = "sha256:2b4f1a9446ba31170b5b1eca4e916504d18378a6b5fe959896bdac8a736a5ecb"}, - {file = "hf_transfer-0.1.8-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:e27c15fcc5869ad7e52bbc0bdec6106b288d1c463f8d2da92f28615a3b181361"}, - {file = "hf_transfer-0.1.8-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:871a0032d011ebc6409a73a8406b98b84ff2cd3ed7d9e1af8cdf4d660b9fab9b"}, - {file = "hf_transfer-0.1.8-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:686fa756e1e0214bb6327d33c66732c52274d94a8460beb50604ad988b391cf6"}, - {file = "hf_transfer-0.1.8-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:36a03b1b2911b0cf15b1b9d971a34b32dadcc4f2fd979aaff5979d6ce4017c34"}, - {file = "hf_transfer-0.1.8-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:079db90c81f41f4cf3227dfaaa855a9b8e9aef45bc7c2be29ce7232cd83ff881"}, - {file = "hf_transfer-0.1.8-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ac08a4524127fdd14c234d4bcbe49d1c498acf5335c781714823179bcc8dc039"}, - {file = "hf_transfer-0.1.8-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:837432e73cb17274a6782b6216e8ce058aa325a475dc44a5a6a753d48b86d18a"}, - {file = "hf_transfer-0.1.8-cp311-none-win32.whl", hash = "sha256:b180f9823dde35aba9bc0f1d0c04ac8a873baebd3732a7ffe4f11940abc7df0d"}, - {file = "hf_transfer-0.1.8-cp311-none-win_amd64.whl", hash = "sha256:37907d2135cebcf8b6d419bb575148d89c224f16b69357f027bd29d0e85c6529"}, - {file = "hf_transfer-0.1.8-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:baf948f4f493949309cbe60529620b9b0aef854a22b6e526753364acc57c09b6"}, - {file = "hf_transfer-0.1.8-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:0bce5c8bdefa478c5d5eaa646cc4ce1df5cfe764d98572ad0c6b8773e98d49f6"}, - {file = "hf_transfer-0.1.8-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:54d6f8a1a86128d651a3799e1267c343d60f81f2c565d7c5416eb8e674e4cf0e"}, - {file = "hf_transfer-0.1.8-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = 
"sha256:f79fd1b0c2ed93efb4c5f684118d7a762ecdd218e170df8208c4e13d3dcd4959"}, - {file = "hf_transfer-0.1.8-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:414df35692670683bf5623498ef9d88a8df5d77e9516515da6e2b34d1054c11f"}, - {file = "hf_transfer-0.1.8-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3c9798d5f951f66b96d40a7a53910260cb5874fda56cf5944dddb7c571f37ec3"}, - {file = "hf_transfer-0.1.8-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:060c661691f85a61392e57579c80eb64b5ee277434e81fb582f605c1c8ff05d5"}, - {file = "hf_transfer-0.1.8-cp312-none-win32.whl", hash = "sha256:f7840e32379820c3e1571a480238e05ea043e970c99d2e999578004a2eb17788"}, - {file = "hf_transfer-0.1.8-cp312-none-win_amd64.whl", hash = "sha256:9a3204ec423cc5e659872e8179f8704ad9ce2abb1e6a991f8838aedf1dc07830"}, - {file = "hf_transfer-0.1.8-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:09949e86ad63ee139e463fd0dfaf401515ae70445854199f61d545514c65f744"}, - {file = "hf_transfer-0.1.8-cp37-cp37m-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:bf1a74552845b93ea972e6e7131ef54e56056aa54137e93a40faf3fbcb2442ff"}, - {file = "hf_transfer-0.1.8-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:959bcb3afb4ee6f2a07031a947dba98ec0b64c001bc914fbd8fc32e13a287162"}, - {file = "hf_transfer-0.1.8-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e01eecdb8162bd61dab9090fbd9f8034dd8b5755ef727a21ca8a057f80cb91ee"}, - {file = "hf_transfer-0.1.8-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:50650a38e9d31f5ad8f010e4598bf304ecd99c17162e7d93f67e031571b864ee"}, - {file = "hf_transfer-0.1.8-cp37-none-win32.whl", hash = "sha256:e29b9d1d378138f2f4eae0e93ca94af3b5d45f4532eef69f1ab97fe06f9c9d9e"}, - {file = "hf_transfer-0.1.8-cp37-none-win_amd64.whl", hash = "sha256:cfd6cef43ae883103117a371f8ebae4e7f9637bc6fb480f1be5568e2fe22a8a7"}, - {file = "hf_transfer-0.1.8-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:92a68f7a0043cca8a0de4decc760dca177530944cbab502afac503bd1b2fa01a"}, - {file = "hf_transfer-0.1.8-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e3138e408179f80a5480598e32f8e1abb564915cbde4d3bc8da52811c75dc3ea"}, - {file = "hf_transfer-0.1.8-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4544d148930ad34442d43b8fa911c8479c04a95b858b1d1f91e0b7da77082fad"}, - {file = "hf_transfer-0.1.8-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a851794b9f029965664f8c3002c957fccf21685e9397ceb4f9f19c986dee8ad3"}, - {file = "hf_transfer-0.1.8-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:791aaf87c5319ac83edb6ab2994b3db19924c49d6ff667dd3d8a610b455ff70a"}, - {file = "hf_transfer-0.1.8-cp38-none-win32.whl", hash = "sha256:8f71e5d35d3a3160dcca12fdcc8119033aeacaa6a32838a7ad9f9cb1008bbe58"}, - {file = "hf_transfer-0.1.8-cp38-none-win_amd64.whl", hash = "sha256:543287b4ceb1e25501580b99690f7f0df9d3631d29306f37cbd97e918c732944"}, - {file = "hf_transfer-0.1.8-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:7ce02a18bd0bb2343e707ac85b68c946bc37623ee24150c69158f6b2b2c7a98f"}, - {file = "hf_transfer-0.1.8-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:64d7f8dbd64ba183ed1df75d47c84e075ff666ceaa335bff1de16b09eaac5b80"}, - {file = "hf_transfer-0.1.8-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:1e7858694e11419ae27e542fb8fc0d0e54d46ff7768fe73bc359d70b8f5aa578"}, - {file = "hf_transfer-0.1.8-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:bed116cd9d1edfa32c0136d7cb8e5f1afd2b32df43c49085d428f108fc8e1c8f"}, - {file = "hf_transfer-0.1.8-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6e385d0da9c6b3472ab29285d2d46c9f9903205b8d108f88a82f3f85aafae0ab"}, - {file = "hf_transfer-0.1.8-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:98f75fa4b86ef15433cd907807ac77d1fb39d7e7b790bfd39c7ae9c385bf0200"}, - {file = "hf_transfer-0.1.8-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d1a63ad947d2901425ac0a3ed70c3696dfde27fadb0482ed763bdd5cc946b278"}, - {file = "hf_transfer-0.1.8-cp39-none-win32.whl", hash = "sha256:3e74096915813ae842ea6a5bdf10c0fef960aa51a35a560955b3e61cdfe3db57"}, - {file = "hf_transfer-0.1.8-cp39-none-win_amd64.whl", hash = "sha256:05ea16307bf4a5eb097cbc6e5057e4eb5e080a138af23ef639fd38857723c288"}, - {file = "hf_transfer-0.1.8-pp310-pypy310_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:928ff036c3e98e10dcfbdb4fcdfc4592d37a5cc8e365a7ba8dfd4337e849d675"}, - {file = "hf_transfer-0.1.8-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d49ba3ce67035f460ae1924fe2feafec155cb535eec7f31ed5109c19064cd294"}, - {file = "hf_transfer-0.1.8-pp39-pypy39_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b01f5872c62cfee3ec9ca5c738818296f69f8adf84b4d8d15f2a5601d9dda339"}, - {file = "hf_transfer-0.1.8-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:659d4212d50847a5165666bf43d67727679b4f694ef9c413613cc27093136527"}, - {file = "hf_transfer-0.1.8.tar.gz", hash = "sha256:26d229468152e7a3ec12664cac86b8c2800695fd85f9c9a96677a775cc04f0b3"}, -] - -[[package]] -name = "huggingface-hub" -version = "0.23.5" -description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" -optional = false -python-versions = ">=3.8.0" -files = [ - {file = "huggingface_hub-0.23.5-py3-none-any.whl", hash = "sha256:d7a7d337615e11a45cc14a0ce5a605db6b038dc24af42866f731684825226e90"}, - {file = "huggingface_hub-0.23.5.tar.gz", hash = "sha256:67a9caba79b71235be3752852ca27da86bd54311d2424ca8afdb8dda056edf98"}, -] - -[package.dependencies] -filelock = "*" -fsspec = ">=2023.5.0" -packaging = ">=20.9" -pyyaml = ">=5.1" -requests = "*" -tqdm = ">=4.42.1" -typing-extensions = ">=3.7.4.3" - -[package.extras] -all = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "fastapi", "gradio", "jedi", "minijinja (>=1.0)", "mypy (==1.5.1)", "numpy", "pytest", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "ruff (>=0.3.0)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)", "urllib3 (<2.0)"] -cli = ["InquirerPy (==0.3.4)"] -dev = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "fastapi", "gradio", "jedi", "minijinja (>=1.0)", "mypy (==1.5.1)", "numpy", "pytest", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "ruff (>=0.3.0)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)", "urllib3 (<2.0)"] -fastai = ["fastai (>=2.4)", "fastcore (>=1.3.27)", "toml"] -hf-transfer = ["hf-transfer (>=0.1.4)"] 
-inference = ["aiohttp", "minijinja (>=1.0)"] -quality = ["mypy (==1.5.1)", "ruff (>=0.3.0)"] -tensorflow = ["graphviz", "pydot", "tensorflow"] -tensorflow-testing = ["keras (<3.0)", "tensorflow"] -testing = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "fastapi", "gradio", "jedi", "minijinja (>=1.0)", "numpy", "pytest", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "soundfile", "urllib3 (<2.0)"] -torch = ["safetensors", "torch"] -typing = ["types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)"] - -[[package]] -name = "idna" -version = "3.10" -description = "Internationalized Domain Names in Applications (IDNA)" -optional = false -python-versions = ">=3.6" -files = [ - {file = "idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3"}, - {file = "idna-3.10.tar.gz", hash = "sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9"}, -] - -[package.extras] -all = ["flake8 (>=7.1.1)", "mypy (>=1.11.2)", "pytest (>=8.3.2)", "ruff (>=0.6.2)"] - -[[package]] -name = "importlib-metadata" -version = "7.1.0" -description = "Read metadata from Python packages" -optional = false -python-versions = ">=3.8" -files = [ - {file = "importlib_metadata-7.1.0-py3-none-any.whl", hash = "sha256:30962b96c0c223483ed6cc7280e7f0199feb01a0e40cfae4d4450fc6fab1f570"}, - {file = "importlib_metadata-7.1.0.tar.gz", hash = "sha256:b78938b926ee8d5f020fc4772d487045805a55ddbad2ecf21c6d60938dc7fcd2"}, -] - -[package.dependencies] -zipp = ">=0.5" - -[package.extras] -docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] -perf = ["ipython"] -testing = ["flufl.flake8", "importlib-resources (>=1.3)", "jaraco.test (>=5.4)", "packaging", "pyfakefs", "pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy", "pytest-perf (>=0.9.2)", "pytest-ruff (>=0.2.1)"] - -[[package]] -name = "iniconfig" -version = "2.0.0" -description = "brain-dead simple config-ini parsing" -optional = false -python-versions = ">=3.7" -files = [ - {file = "iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374"}, - {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, -] - -[[package]] -name = "interegular" -version = "0.3.3" -description = "a regex intersection checker" -optional = true -python-versions = ">=3.7" -files = [ - {file = "interegular-0.3.3-py37-none-any.whl", hash = "sha256:b0c07007d48c89d6d19f7204972d369b2a77222722e126b6aa63aa721dc3b19c"}, - {file = "interegular-0.3.3.tar.gz", hash = "sha256:d9b697b21b34884711399ba0f0376914b81899ce670032486d0d048344a76600"}, -] - -[[package]] -name = "jinja2" -version = "3.1.4" -description = "A very fast and expressive template engine." 
-optional = true -python-versions = ">=3.7" -files = [ - {file = "jinja2-3.1.4-py3-none-any.whl", hash = "sha256:bc5dd2abb727a5319567b7a813e6a2e7318c39f4f487cfe6c89c6f9c7d25197d"}, - {file = "jinja2-3.1.4.tar.gz", hash = "sha256:4a3aee7acbbe7303aede8e9648d13b8bf88a429282aa6122a993f0ac800cb369"}, -] - -[package.dependencies] -MarkupSafe = ">=2.0" - -[package.extras] -i18n = ["Babel (>=2.7)"] - -[[package]] -name = "jsonschema" -version = "4.23.0" -description = "An implementation of JSON Schema validation for Python" -optional = true -python-versions = ">=3.8" -files = [ - {file = "jsonschema-4.23.0-py3-none-any.whl", hash = "sha256:fbadb6f8b144a8f8cf9f0b89ba94501d143e50411a1278633f56a7acf7fd5566"}, - {file = "jsonschema-4.23.0.tar.gz", hash = "sha256:d71497fef26351a33265337fa77ffeb82423f3ea21283cd9467bb03999266bc4"}, -] - -[package.dependencies] -attrs = ">=22.2.0" -jsonschema-specifications = ">=2023.03.6" -referencing = ">=0.28.4" -rpds-py = ">=0.7.1" - -[package.extras] -format = ["fqdn", "idna", "isoduration", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3987", "uri-template", "webcolors (>=1.11)"] -format-nongpl = ["fqdn", "idna", "isoduration", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3986-validator (>0.1.0)", "uri-template", "webcolors (>=24.6.0)"] - -[[package]] -name = "jsonschema-specifications" -version = "2024.10.1" -description = "The JSON Schema meta-schemas and vocabularies, exposed as a Registry" -optional = true -python-versions = ">=3.9" -files = [ - {file = "jsonschema_specifications-2024.10.1-py3-none-any.whl", hash = "sha256:a09a0680616357d9a0ecf05c12ad234479f549239d0f5b55f3deea67475da9bf"}, - {file = "jsonschema_specifications-2024.10.1.tar.gz", hash = "sha256:0f38b83639958ce1152d02a7f062902c41c8fd20d558b0c34344292d417ae272"}, -] - -[package.dependencies] -referencing = ">=0.31.0" - -[[package]] -name = "lark" -version = "1.2.2" -description = "a modern parsing library" -optional = true -python-versions = ">=3.8" -files = [ - {file = "lark-1.2.2-py3-none-any.whl", hash = "sha256:c2276486b02f0f1b90be155f2c8ba4a8e194d42775786db622faccd652d8e80c"}, - {file = "lark-1.2.2.tar.gz", hash = "sha256:ca807d0162cd16cef15a8feecb862d7319e7a09bdb13aef927968e45040fed80"}, -] - -[package.extras] -atomic-cache = ["atomicwrites"] -interegular = ["interegular (>=0.3.1,<0.4.0)"] -nearley = ["js2py"] -regex = ["regex"] - -[[package]] -name = "loguru" -version = "0.7.2" -description = "Python logging made (stupidly) simple" -optional = false -python-versions = ">=3.5" -files = [ - {file = "loguru-0.7.2-py3-none-any.whl", hash = "sha256:003d71e3d3ed35f0f8984898359d65b79e5b21943f78af86aa5491210429b8eb"}, - {file = "loguru-0.7.2.tar.gz", hash = "sha256:e671a53522515f34fd406340ee968cb9ecafbc4b36c679da03c18fd8d0bd51ac"}, -] - -[package.dependencies] -colorama = {version = ">=0.3.4", markers = "sys_platform == \"win32\""} -win32-setctime = {version = ">=1.0.0", markers = "sys_platform == \"win32\""} - -[package.extras] -dev = ["Sphinx (==7.2.5)", "colorama (==0.4.5)", "colorama (==0.4.6)", "exceptiongroup (==1.1.3)", "freezegun (==1.1.0)", "freezegun (==1.2.2)", "mypy (==v0.910)", "mypy (==v0.971)", "mypy (==v1.4.1)", "mypy (==v1.5.1)", "pre-commit (==3.4.0)", "pytest (==6.1.2)", "pytest (==7.4.0)", "pytest-cov (==2.12.1)", "pytest-cov (==4.1.0)", "pytest-mypy-plugins (==1.9.3)", "pytest-mypy-plugins (==3.0.0)", "sphinx-autobuild (==2021.3.14)", "sphinx-rtd-theme (==1.3.0)", "tox (==3.27.1)", "tox (==4.11.0)"] - -[[package]] -name = "markdown-it-py" -version = "3.0.0" -description = 
"Python port of markdown-it. Markdown parsing, done right!" -optional = false -python-versions = ">=3.8" -files = [ - {file = "markdown-it-py-3.0.0.tar.gz", hash = "sha256:e3f60a94fa066dc52ec76661e37c851cb232d92f9886b15cb560aaada2df8feb"}, - {file = "markdown_it_py-3.0.0-py3-none-any.whl", hash = "sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1"}, -] - -[package.dependencies] -mdurl = ">=0.1,<1.0" - -[package.extras] -benchmarking = ["psutil", "pytest", "pytest-benchmark"] -code-style = ["pre-commit (>=3.0,<4.0)"] -compare = ["commonmark (>=0.9,<1.0)", "markdown (>=3.4,<4.0)", "mistletoe (>=1.0,<2.0)", "mistune (>=2.0,<3.0)", "panflute (>=2.3,<3.0)"] -linkify = ["linkify-it-py (>=1,<3)"] -plugins = ["mdit-py-plugins"] -profiling = ["gprof2dot"] -rtd = ["jupyter_sphinx", "mdit-py-plugins", "myst-parser", "pyyaml", "sphinx", "sphinx-copybutton", "sphinx-design", "sphinx_book_theme"] -testing = ["coverage", "pytest", "pytest-cov", "pytest-regressions"] - -[[package]] -name = "markupsafe" -version = "3.0.2" -description = "Safely add untrusted strings to HTML/XML markup." -optional = true -python-versions = ">=3.9" -files = [ - {file = "MarkupSafe-3.0.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:7e94c425039cde14257288fd61dcfb01963e658efbc0ff54f5306b06054700f8"}, - {file = "MarkupSafe-3.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9e2d922824181480953426608b81967de705c3cef4d1af983af849d7bd619158"}, - {file = "MarkupSafe-3.0.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:38a9ef736c01fccdd6600705b09dc574584b89bea478200c5fbf112a6b0d5579"}, - {file = "MarkupSafe-3.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bbcb445fa71794da8f178f0f6d66789a28d7319071af7a496d4d507ed566270d"}, - {file = "MarkupSafe-3.0.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:57cb5a3cf367aeb1d316576250f65edec5bb3be939e9247ae594b4bcbc317dfb"}, - {file = "MarkupSafe-3.0.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:3809ede931876f5b2ec92eef964286840ed3540dadf803dd570c3b7e13141a3b"}, - {file = "MarkupSafe-3.0.2-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:e07c3764494e3776c602c1e78e298937c3315ccc9043ead7e685b7f2b8d47b3c"}, - {file = "MarkupSafe-3.0.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:b424c77b206d63d500bcb69fa55ed8d0e6a3774056bdc4839fc9298a7edca171"}, - {file = "MarkupSafe-3.0.2-cp310-cp310-win32.whl", hash = "sha256:fcabf5ff6eea076f859677f5f0b6b5c1a51e70a376b0579e0eadef8db48c6b50"}, - {file = "MarkupSafe-3.0.2-cp310-cp310-win_amd64.whl", hash = "sha256:6af100e168aa82a50e186c82875a5893c5597a0c1ccdb0d8b40240b1f28b969a"}, - {file = "MarkupSafe-3.0.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:9025b4018f3a1314059769c7bf15441064b2207cb3f065e6ea1e7359cb46db9d"}, - {file = "MarkupSafe-3.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:93335ca3812df2f366e80509ae119189886b0f3c2b81325d39efdb84a1e2ae93"}, - {file = "MarkupSafe-3.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2cb8438c3cbb25e220c2ab33bb226559e7afb3baec11c4f218ffa7308603c832"}, - {file = "MarkupSafe-3.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a123e330ef0853c6e822384873bef7507557d8e4a082961e1defa947aa59ba84"}, - {file = "MarkupSafe-3.0.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:1e084f686b92e5b83186b07e8a17fc09e38fff551f3602b249881fec658d3eca"}, - {file = "MarkupSafe-3.0.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:d8213e09c917a951de9d09ecee036d5c7d36cb6cb7dbaece4c71a60d79fb9798"}, - {file = "MarkupSafe-3.0.2-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:5b02fb34468b6aaa40dfc198d813a641e3a63b98c2b05a16b9f80b7ec314185e"}, - {file = "MarkupSafe-3.0.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:0bff5e0ae4ef2e1ae4fdf2dfd5b76c75e5c2fa4132d05fc1b0dabcd20c7e28c4"}, - {file = "MarkupSafe-3.0.2-cp311-cp311-win32.whl", hash = "sha256:6c89876f41da747c8d3677a2b540fb32ef5715f97b66eeb0c6b66f5e3ef6f59d"}, - {file = "MarkupSafe-3.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:70a87b411535ccad5ef2f1df5136506a10775d267e197e4cf531ced10537bd6b"}, - {file = "MarkupSafe-3.0.2-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:9778bd8ab0a994ebf6f84c2b949e65736d5575320a17ae8984a77fab08db94cf"}, - {file = "MarkupSafe-3.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:846ade7b71e3536c4e56b386c2a47adf5741d2d8b94ec9dc3e92e5e1ee1e2225"}, - {file = "MarkupSafe-3.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1c99d261bd2d5f6b59325c92c73df481e05e57f19837bdca8413b9eac4bd8028"}, - {file = "MarkupSafe-3.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e17c96c14e19278594aa4841ec148115f9c7615a47382ecb6b82bd8fea3ab0c8"}, - {file = "MarkupSafe-3.0.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:88416bd1e65dcea10bc7569faacb2c20ce071dd1f87539ca2ab364bf6231393c"}, - {file = "MarkupSafe-3.0.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:2181e67807fc2fa785d0592dc2d6206c019b9502410671cc905d132a92866557"}, - {file = "MarkupSafe-3.0.2-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:52305740fe773d09cffb16f8ed0427942901f00adedac82ec8b67752f58a1b22"}, - {file = "MarkupSafe-3.0.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:ad10d3ded218f1039f11a75f8091880239651b52e9bb592ca27de44eed242a48"}, - {file = "MarkupSafe-3.0.2-cp312-cp312-win32.whl", hash = "sha256:0f4ca02bea9a23221c0182836703cbf8930c5e9454bacce27e767509fa286a30"}, - {file = "MarkupSafe-3.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:8e06879fc22a25ca47312fbe7c8264eb0b662f6db27cb2d3bbbc74b1df4b9b87"}, - {file = "MarkupSafe-3.0.2-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:ba9527cdd4c926ed0760bc301f6728ef34d841f405abf9d4f959c478421e4efd"}, - {file = "MarkupSafe-3.0.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f8b3d067f2e40fe93e1ccdd6b2e1d16c43140e76f02fb1319a05cf2b79d99430"}, - {file = "MarkupSafe-3.0.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:569511d3b58c8791ab4c2e1285575265991e6d8f8700c7be0e88f86cb0672094"}, - {file = "MarkupSafe-3.0.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:15ab75ef81add55874e7ab7055e9c397312385bd9ced94920f2802310c930396"}, - {file = "MarkupSafe-3.0.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f3818cb119498c0678015754eba762e0d61e5b52d34c8b13d770f0719f7b1d79"}, - {file = "MarkupSafe-3.0.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:cdb82a876c47801bb54a690c5ae105a46b392ac6099881cdfb9f6e95e4014c6a"}, - {file = "MarkupSafe-3.0.2-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:cabc348d87e913db6ab4aa100f01b08f481097838bdddf7c7a84b7575b7309ca"}, - {file = 
"MarkupSafe-3.0.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:444dcda765c8a838eaae23112db52f1efaf750daddb2d9ca300bcae1039adc5c"}, - {file = "MarkupSafe-3.0.2-cp313-cp313-win32.whl", hash = "sha256:bcf3e58998965654fdaff38e58584d8937aa3096ab5354d493c77d1fdd66d7a1"}, - {file = "MarkupSafe-3.0.2-cp313-cp313-win_amd64.whl", hash = "sha256:e6a2a455bd412959b57a172ce6328d2dd1f01cb2135efda2e4576e8a23fa3b0f"}, - {file = "MarkupSafe-3.0.2-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:b5a6b3ada725cea8a5e634536b1b01c30bcdcd7f9c6fff4151548d5bf6b3a36c"}, - {file = "MarkupSafe-3.0.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:a904af0a6162c73e3edcb969eeeb53a63ceeb5d8cf642fade7d39e7963a22ddb"}, - {file = "MarkupSafe-3.0.2-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4aa4e5faecf353ed117801a068ebab7b7e09ffb6e1d5e412dc852e0da018126c"}, - {file = "MarkupSafe-3.0.2-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c0ef13eaeee5b615fb07c9a7dadb38eac06a0608b41570d8ade51c56539e509d"}, - {file = "MarkupSafe-3.0.2-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d16a81a06776313e817c951135cf7340a3e91e8c1ff2fac444cfd75fffa04afe"}, - {file = "MarkupSafe-3.0.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:6381026f158fdb7c72a168278597a5e3a5222e83ea18f543112b2662a9b699c5"}, - {file = "MarkupSafe-3.0.2-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:3d79d162e7be8f996986c064d1c7c817f6df3a77fe3d6859f6f9e7be4b8c213a"}, - {file = "MarkupSafe-3.0.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:131a3c7689c85f5ad20f9f6fb1b866f402c445b220c19fe4308c0b147ccd2ad9"}, - {file = "MarkupSafe-3.0.2-cp313-cp313t-win32.whl", hash = "sha256:ba8062ed2cf21c07a9e295d5b8a2a5ce678b913b45fdf68c32d95d6c1291e0b6"}, - {file = "MarkupSafe-3.0.2-cp313-cp313t-win_amd64.whl", hash = "sha256:e444a31f8db13eb18ada366ab3cf45fd4b31e4db1236a4448f68778c1d1a5a2f"}, - {file = "MarkupSafe-3.0.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:eaa0a10b7f72326f1372a713e73c3f739b524b3af41feb43e4921cb529f5929a"}, - {file = "MarkupSafe-3.0.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:48032821bbdf20f5799ff537c7ac3d1fba0ba032cfc06194faffa8cda8b560ff"}, - {file = "MarkupSafe-3.0.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1a9d3f5f0901fdec14d8d2f66ef7d035f2157240a433441719ac9a3fba440b13"}, - {file = "MarkupSafe-3.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:88b49a3b9ff31e19998750c38e030fc7bb937398b1f78cfa599aaef92d693144"}, - {file = "MarkupSafe-3.0.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cfad01eed2c2e0c01fd0ecd2ef42c492f7f93902e39a42fc9ee1692961443a29"}, - {file = "MarkupSafe-3.0.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:1225beacc926f536dc82e45f8a4d68502949dc67eea90eab715dea3a21c1b5f0"}, - {file = "MarkupSafe-3.0.2-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:3169b1eefae027567d1ce6ee7cae382c57fe26e82775f460f0b2778beaad66c0"}, - {file = "MarkupSafe-3.0.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:eb7972a85c54febfb25b5c4b4f3af4dcc731994c7da0d8a0b4a6eb0640e1d178"}, - {file = "MarkupSafe-3.0.2-cp39-cp39-win32.whl", hash = "sha256:8c4e8c3ce11e1f92f6536ff07154f9d49677ebaaafc32db9db4620bc11ed480f"}, - {file = "MarkupSafe-3.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:6e296a513ca3d94054c2c881cc913116e90fd030ad1c656b3869762b754f5f8a"}, - {file 
= "markupsafe-3.0.2.tar.gz", hash = "sha256:ee55d3edf80167e48ea11a923c7386f4669df67d7994554387f84e7d8b0a2bf0"}, -] - -[[package]] -name = "marlin-kernels" -version = "0.3.7" -description = "Marlin quantization kernels" -optional = true -python-versions = ">=3.7" -files = [ - {file = "marlin_kernels-0.3.7+cu123torch2.4-cp310-cp310-linux_x86_64.whl", hash = "sha256:bb416d14623dc0ad0eeb2835446c37a41f994555f1baec8701de6d4c1fc17ec8"}, -] - -[package.dependencies] -torch = "*" - -[package.source] -type = "url" -url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp310-cp310-linux_x86_64.whl" - -[[package]] -name = "marlin-kernels" -version = "0.3.7" -description = "Marlin quantization kernels" -optional = true -python-versions = ">=3.7" -files = [ - {file = "marlin_kernels-0.3.7+cu123torch2.4-cp311-cp311-linux_x86_64.whl", hash = "sha256:a89bb61d718002d4432158641bce95c6fd68f9ee1a7d5402dd283903397f3185"}, -] - -[package.dependencies] -torch = "*" - -[package.source] -type = "url" -url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp311-cp311-linux_x86_64.whl" - -[[package]] -name = "marlin-kernels" -version = "0.3.7" -description = "Marlin quantization kernels" -optional = true -python-versions = ">=3.7" -files = [ - {file = "marlin_kernels-0.3.7+cu123torch2.4-cp312-cp312-linux_x86_64.whl", hash = "sha256:ed938d196fc5e9cce9fc44cd2b889d5adc5ca7475c8a23858f1474d29e38bdbf"}, -] - -[package.dependencies] -torch = "*" - -[package.source] -type = "url" -url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp312-cp312-linux_x86_64.whl" - -[[package]] -name = "marlin-kernels" -version = "0.3.7" -description = "Marlin quantization kernels" -optional = true -python-versions = ">=3.7" -files = [ - {file = "marlin_kernels-0.3.7+cu123torch2.4-cp39-cp39-linux_x86_64.whl", hash = "sha256:113c54f68565ad476ca12366b4de92131fa3e9ddb16cbe8ad63272972a15ac28"}, -] - -[package.dependencies] -torch = "*" - -[package.source] -type = "url" -url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp39-cp39-linux_x86_64.whl" - -[[package]] -name = "mdurl" -version = "0.1.2" -description = "Markdown URL utilities" -optional = false -python-versions = ">=3.7" -files = [ - {file = "mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8"}, - {file = "mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba"}, -] - -[[package]] -name = "moe-kernels" -version = "0.7.0" -description = "MoE kernels" -optional = true -python-versions = ">=3.7" -files = [ - {file = "moe_kernels-0.7.0+cu123torch2.4-cp310-cp310-linux_x86_64.whl", hash = "sha256:f8c126395f11522881c6bf1f6120e3670822006a84e2ff74af561c22445746b3"}, -] - -[package.dependencies] -nvidia-ml-py = "*" -torch = "*" -triton = "*" - -[package.source] -type = "url" -url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp310-cp310-linux_x86_64.whl" - -[[package]] -name = "moe-kernels" -version = "0.7.0" -description = "MoE kernels" -optional = true -python-versions = ">=3.7" -files = [ - {file = "moe_kernels-0.7.0+cu123torch2.4-cp311-cp311-linux_x86_64.whl", hash = "sha256:2afff8346251f01d5d90bab738e3dfaa6b14a414a9c88205d396ab2bae87983a"}, -] - -[package.dependencies] -nvidia-ml-py = "*" -torch = "*" -triton = 
"*" - -[package.source] -type = "url" -url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp311-cp311-linux_x86_64.whl" - -[[package]] -name = "moe-kernels" -version = "0.7.0" -description = "MoE kernels" -optional = true -python-versions = ">=3.7" -files = [ - {file = "moe_kernels-0.7.0+cu123torch2.4-cp312-cp312-linux_x86_64.whl", hash = "sha256:b1a29e33d3b7d85e2b4f8bd47db28211096d1f645e0868d5a1f3666ebb9bd9e3"}, -] - -[package.dependencies] -nvidia-ml-py = "*" -torch = "*" -triton = "*" - -[package.source] -type = "url" -url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp312-cp312-linux_x86_64.whl" - -[[package]] -name = "moe-kernels" -version = "0.7.0" -description = "MoE kernels" -optional = true -python-versions = ">=3.7" -files = [ - {file = "moe_kernels-0.7.0+cu123torch2.4-cp39-cp39-linux_x86_64.whl", hash = "sha256:9573611174cda9f6fafa1816521e38582fd2903b321bbaf78f83cf6e3189ac7d"}, -] - -[package.dependencies] -nvidia-ml-py = "*" -torch = "*" -triton = "*" - -[package.source] -type = "url" -url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp39-cp39-linux_x86_64.whl" - -[[package]] -name = "mpmath" -version = "1.3.0" -description = "Python library for arbitrary-precision floating-point arithmetic" -optional = true -python-versions = "*" -files = [ - {file = "mpmath-1.3.0-py3-none-any.whl", hash = "sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c"}, - {file = "mpmath-1.3.0.tar.gz", hash = "sha256:7a28eb2a9774d00c7bc92411c19a89209d5da7c4c9a9e227be8330a23a25b91f"}, -] - -[package.extras] -develop = ["codecov", "pycodestyle", "pytest (>=4.6)", "pytest-cov", "wheel"] -docs = ["sphinx"] -gmpy = ["gmpy2 (>=2.1.0a4)"] -tests = ["pytest (>=4.6)"] - -[[package]] -name = "multidict" -version = "6.1.0" -description = "multidict implementation" -optional = true -python-versions = ">=3.8" -files = [ - {file = "multidict-6.1.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:3380252550e372e8511d49481bd836264c009adb826b23fefcc5dd3c69692f60"}, - {file = "multidict-6.1.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:99f826cbf970077383d7de805c0681799491cb939c25450b9b5b3ced03ca99f1"}, - {file = "multidict-6.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:a114d03b938376557927ab23f1e950827c3b893ccb94b62fd95d430fd0e5cf53"}, - {file = "multidict-6.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b1c416351ee6271b2f49b56ad7f308072f6f44b37118d69c2cad94f3fa8a40d5"}, - {file = "multidict-6.1.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6b5d83030255983181005e6cfbac1617ce9746b219bc2aad52201ad121226581"}, - {file = "multidict-6.1.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3e97b5e938051226dc025ec80980c285b053ffb1e25a3db2a3aa3bc046bf7f56"}, - {file = "multidict-6.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d618649d4e70ac6efcbba75be98b26ef5078faad23592f9b51ca492953012429"}, - {file = "multidict-6.1.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:10524ebd769727ac77ef2278390fb0068d83f3acb7773792a5080f2b0abf7748"}, - {file = "multidict-6.1.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:ff3827aef427c89a25cc96ded1759271a93603aba9fb977a6d264648ebf989db"}, - {file = "multidict-6.1.0-cp310-cp310-musllinux_1_2_i686.whl", 
hash = "sha256:06809f4f0f7ab7ea2cabf9caca7d79c22c0758b58a71f9d32943ae13c7ace056"}, - {file = "multidict-6.1.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:f179dee3b863ab1c59580ff60f9d99f632f34ccb38bf67a33ec6b3ecadd0fd76"}, - {file = "multidict-6.1.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:aaed8b0562be4a0876ee3b6946f6869b7bcdb571a5d1496683505944e268b160"}, - {file = "multidict-6.1.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:3c8b88a2ccf5493b6c8da9076fb151ba106960a2df90c2633f342f120751a9e7"}, - {file = "multidict-6.1.0-cp310-cp310-win32.whl", hash = "sha256:4a9cb68166a34117d6646c0023c7b759bf197bee5ad4272f420a0141d7eb03a0"}, - {file = "multidict-6.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:20b9b5fbe0b88d0bdef2012ef7dee867f874b72528cf1d08f1d59b0e3850129d"}, - {file = "multidict-6.1.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:3efe2c2cb5763f2f1b275ad2bf7a287d3f7ebbef35648a9726e3b69284a4f3d6"}, - {file = "multidict-6.1.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c7053d3b0353a8b9de430a4f4b4268ac9a4fb3481af37dfe49825bf45ca24156"}, - {file = "multidict-6.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:27e5fc84ccef8dfaabb09d82b7d179c7cf1a3fbc8a966f8274fcb4ab2eb4cadb"}, - {file = "multidict-6.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0e2b90b43e696f25c62656389d32236e049568b39320e2735d51f08fd362761b"}, - {file = "multidict-6.1.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d83a047959d38a7ff552ff94be767b7fd79b831ad1cd9920662db05fec24fe72"}, - {file = "multidict-6.1.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d1a9dd711d0877a1ece3d2e4fea11a8e75741ca21954c919406b44e7cf971304"}, - {file = "multidict-6.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ec2abea24d98246b94913b76a125e855eb5c434f7c46546046372fe60f666351"}, - {file = "multidict-6.1.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4867cafcbc6585e4b678876c489b9273b13e9fff9f6d6d66add5e15d11d926cb"}, - {file = "multidict-6.1.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:5b48204e8d955c47c55b72779802b219a39acc3ee3d0116d5080c388970b76e3"}, - {file = "multidict-6.1.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:d8fff389528cad1618fb4b26b95550327495462cd745d879a8c7c2115248e399"}, - {file = "multidict-6.1.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:a7a9541cd308eed5e30318430a9c74d2132e9a8cb46b901326272d780bf2d423"}, - {file = "multidict-6.1.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:da1758c76f50c39a2efd5e9859ce7d776317eb1dd34317c8152ac9251fc574a3"}, - {file = "multidict-6.1.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:c943a53e9186688b45b323602298ab727d8865d8c9ee0b17f8d62d14b56f0753"}, - {file = "multidict-6.1.0-cp311-cp311-win32.whl", hash = "sha256:90f8717cb649eea3504091e640a1b8568faad18bd4b9fcd692853a04475a4b80"}, - {file = "multidict-6.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:82176036e65644a6cc5bd619f65f6f19781e8ec2e5330f51aa9ada7504cc1926"}, - {file = "multidict-6.1.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:b04772ed465fa3cc947db808fa306d79b43e896beb677a56fb2347ca1a49c1fa"}, - {file = "multidict-6.1.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:6180c0ae073bddeb5a97a38c03f30c233e0a4d39cd86166251617d1bbd0af436"}, - {file = "multidict-6.1.0-cp312-cp312-macosx_11_0_arm64.whl", hash = 
"sha256:071120490b47aa997cca00666923a83f02c7fbb44f71cf7f136df753f7fa8761"}, - {file = "multidict-6.1.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:50b3a2710631848991d0bf7de077502e8994c804bb805aeb2925a981de58ec2e"}, - {file = "multidict-6.1.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b58c621844d55e71c1b7f7c498ce5aa6985d743a1a59034c57a905b3f153c1ef"}, - {file = "multidict-6.1.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:55b6d90641869892caa9ca42ff913f7ff1c5ece06474fbd32fb2cf6834726c95"}, - {file = "multidict-6.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4b820514bfc0b98a30e3d85462084779900347e4d49267f747ff54060cc33925"}, - {file = "multidict-6.1.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:10a9b09aba0c5b48c53761b7c720aaaf7cf236d5fe394cd399c7ba662d5f9966"}, - {file = "multidict-6.1.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:1e16bf3e5fc9f44632affb159d30a437bfe286ce9e02754759be5536b169b305"}, - {file = "multidict-6.1.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:76f364861c3bfc98cbbcbd402d83454ed9e01a5224bb3a28bf70002a230f73e2"}, - {file = "multidict-6.1.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:820c661588bd01a0aa62a1283f20d2be4281b086f80dad9e955e690c75fb54a2"}, - {file = "multidict-6.1.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:0e5f362e895bc5b9e67fe6e4ded2492d8124bdf817827f33c5b46c2fe3ffaca6"}, - {file = "multidict-6.1.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:3ec660d19bbc671e3a6443325f07263be452c453ac9e512f5eb935e7d4ac28b3"}, - {file = "multidict-6.1.0-cp312-cp312-win32.whl", hash = "sha256:58130ecf8f7b8112cdb841486404f1282b9c86ccb30d3519faf301b2e5659133"}, - {file = "multidict-6.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:188215fc0aafb8e03341995e7c4797860181562380f81ed0a87ff455b70bf1f1"}, - {file = "multidict-6.1.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:d569388c381b24671589335a3be6e1d45546c2988c2ebe30fdcada8457a31008"}, - {file = "multidict-6.1.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:052e10d2d37810b99cc170b785945421141bf7bb7d2f8799d431e7db229c385f"}, - {file = "multidict-6.1.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f90c822a402cb865e396a504f9fc8173ef34212a342d92e362ca498cad308e28"}, - {file = "multidict-6.1.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b225d95519a5bf73860323e633a664b0d85ad3d5bede6d30d95b35d4dfe8805b"}, - {file = "multidict-6.1.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:23bfd518810af7de1116313ebd9092cb9aa629beb12f6ed631ad53356ed6b86c"}, - {file = "multidict-6.1.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5c09fcfdccdd0b57867577b719c69e347a436b86cd83747f179dbf0cc0d4c1f3"}, - {file = "multidict-6.1.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bf6bea52ec97e95560af5ae576bdac3aa3aae0b6758c6efa115236d9e07dae44"}, - {file = "multidict-6.1.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:57feec87371dbb3520da6192213c7d6fc892d5589a93db548331954de8248fd2"}, - {file = "multidict-6.1.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:0c3f390dc53279cbc8ba976e5f8035eab997829066756d811616b652b00a23a3"}, - {file = "multidict-6.1.0-cp313-cp313-musllinux_1_2_i686.whl", hash = 
"sha256:59bfeae4b25ec05b34f1956eaa1cb38032282cd4dfabc5056d0a1ec4d696d3aa"}, - {file = "multidict-6.1.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:b2f59caeaf7632cc633b5cf6fc449372b83bbdf0da4ae04d5be36118e46cc0aa"}, - {file = "multidict-6.1.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:37bb93b2178e02b7b618893990941900fd25b6b9ac0fa49931a40aecdf083fe4"}, - {file = "multidict-6.1.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4e9f48f58c2c523d5a06faea47866cd35b32655c46b443f163d08c6d0ddb17d6"}, - {file = "multidict-6.1.0-cp313-cp313-win32.whl", hash = "sha256:3a37ffb35399029b45c6cc33640a92bef403c9fd388acce75cdc88f58bd19a81"}, - {file = "multidict-6.1.0-cp313-cp313-win_amd64.whl", hash = "sha256:e9aa71e15d9d9beaad2c6b9319edcdc0a49a43ef5c0a4c8265ca9ee7d6c67774"}, - {file = "multidict-6.1.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:db7457bac39421addd0c8449933ac32d8042aae84a14911a757ae6ca3eef1392"}, - {file = "multidict-6.1.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:d094ddec350a2fb899fec68d8353c78233debde9b7d8b4beeafa70825f1c281a"}, - {file = "multidict-6.1.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:5845c1fd4866bb5dd3125d89b90e57ed3138241540897de748cdf19de8a2fca2"}, - {file = "multidict-6.1.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9079dfc6a70abe341f521f78405b8949f96db48da98aeb43f9907f342f627cdc"}, - {file = "multidict-6.1.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3914f5aaa0f36d5d60e8ece6a308ee1c9784cd75ec8151062614657a114c4478"}, - {file = "multidict-6.1.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c08be4f460903e5a9d0f76818db3250f12e9c344e79314d1d570fc69d7f4eae4"}, - {file = "multidict-6.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d093be959277cb7dee84b801eb1af388b6ad3ca6a6b6bf1ed7585895789d027d"}, - {file = "multidict-6.1.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3702ea6872c5a2a4eeefa6ffd36b042e9773f05b1f37ae3ef7264b1163c2dcf6"}, - {file = "multidict-6.1.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:2090f6a85cafc5b2db085124d752757c9d251548cedabe9bd31afe6363e0aff2"}, - {file = "multidict-6.1.0-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:f67f217af4b1ff66c68a87318012de788dd95fcfeb24cc889011f4e1c7454dfd"}, - {file = "multidict-6.1.0-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:189f652a87e876098bbc67b4da1049afb5f5dfbaa310dd67c594b01c10388db6"}, - {file = "multidict-6.1.0-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:6bb5992037f7a9eff7991ebe4273ea7f51f1c1c511e6a2ce511d0e7bdb754492"}, - {file = "multidict-6.1.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:ac10f4c2b9e770c4e393876e35a7046879d195cd123b4f116d299d442b335bcd"}, - {file = "multidict-6.1.0-cp38-cp38-win32.whl", hash = "sha256:e27bbb6d14416713a8bd7aaa1313c0fc8d44ee48d74497a0ff4c3a1b6ccb5167"}, - {file = "multidict-6.1.0-cp38-cp38-win_amd64.whl", hash = "sha256:22f3105d4fb15c8f57ff3959a58fcab6ce36814486500cd7485651230ad4d4ef"}, - {file = "multidict-6.1.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:4e18b656c5e844539d506a0a06432274d7bd52a7487e6828c63a63d69185626c"}, - {file = "multidict-6.1.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:a185f876e69897a6f3325c3f19f26a297fa058c5e456bfcff8015e9a27e83ae1"}, - {file = "multidict-6.1.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ab7c4ceb38d91570a650dba194e1ca87c2b543488fe9309b4212694174fd539c"}, - 
{file = "multidict-6.1.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e617fb6b0b6953fffd762669610c1c4ffd05632c138d61ac7e14ad187870669c"}, - {file = "multidict-6.1.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:16e5f4bf4e603eb1fdd5d8180f1a25f30056f22e55ce51fb3d6ad4ab29f7d96f"}, - {file = "multidict-6.1.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f4c035da3f544b1882bac24115f3e2e8760f10a0107614fc9839fd232200b875"}, - {file = "multidict-6.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:957cf8e4b6e123a9eea554fa7ebc85674674b713551de587eb318a2df3e00255"}, - {file = "multidict-6.1.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:483a6aea59cb89904e1ceabd2b47368b5600fb7de78a6e4a2c2987b2d256cf30"}, - {file = "multidict-6.1.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:87701f25a2352e5bf7454caa64757642734da9f6b11384c1f9d1a8e699758057"}, - {file = "multidict-6.1.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:682b987361e5fd7a139ed565e30d81fd81e9629acc7d925a205366877d8c8657"}, - {file = "multidict-6.1.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:ce2186a7df133a9c895dea3331ddc5ddad42cdd0d1ea2f0a51e5d161e4762f28"}, - {file = "multidict-6.1.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:9f636b730f7e8cb19feb87094949ba54ee5357440b9658b2a32a5ce4bce53972"}, - {file = "multidict-6.1.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:73eae06aa53af2ea5270cc066dcaf02cc60d2994bbb2c4ef5764949257d10f43"}, - {file = "multidict-6.1.0-cp39-cp39-win32.whl", hash = "sha256:1ca0083e80e791cffc6efce7660ad24af66c8d4079d2a750b29001b53ff59ada"}, - {file = "multidict-6.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:aa466da5b15ccea564bdab9c89175c762bc12825f4659c11227f515cee76fa4a"}, - {file = "multidict-6.1.0-py3-none-any.whl", hash = "sha256:48e171e52d1c4d33888e529b999e5900356b9ae588c2f09a52dcefb158b27506"}, - {file = "multidict-6.1.0.tar.gz", hash = "sha256:22ae2ebf9b0c69d206c003e2f6a914ea33f0a932d4aa16f236afc049d9958f4a"}, -] - -[package.dependencies] -typing-extensions = {version = ">=4.1.0", markers = "python_version < \"3.11\""} - -[[package]] -name = "multiprocess" -version = "0.70.16" -description = "better multiprocessing and multithreading in Python" -optional = true -python-versions = ">=3.8" -files = [ - {file = "multiprocess-0.70.16-pp310-pypy310_pp73-macosx_10_13_x86_64.whl", hash = "sha256:476887be10e2f59ff183c006af746cb6f1fd0eadcfd4ef49e605cbe2659920ee"}, - {file = "multiprocess-0.70.16-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:d951bed82c8f73929ac82c61f01a7b5ce8f3e5ef40f5b52553b4f547ce2b08ec"}, - {file = "multiprocess-0.70.16-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:37b55f71c07e2d741374998c043b9520b626a8dddc8b3129222ca4f1a06ef67a"}, - {file = "multiprocess-0.70.16-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:ba8c31889abf4511c7308a8c52bb4a30b9d590e7f58523302ba00237702ca054"}, - {file = "multiprocess-0.70.16-pp39-pypy39_pp73-macosx_10_13_x86_64.whl", hash = "sha256:0dfd078c306e08d46d7a8d06fb120313d87aa43af60d66da43ffff40b44d2f41"}, - {file = "multiprocess-0.70.16-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:e7b9d0f307cd9bd50851afaac0dba2cb6c44449efff697df7c7645f7d3f2be3a"}, - {file = "multiprocess-0.70.16-py310-none-any.whl", hash = "sha256:c4a9944c67bd49f823687463660a2d6daae94c289adff97e0f9d696ba6371d02"}, - {file = 
"multiprocess-0.70.16-py311-none-any.whl", hash = "sha256:af4cabb0dac72abfb1e794fa7855c325fd2b55a10a44628a3c1ad3311c04127a"}, - {file = "multiprocess-0.70.16-py312-none-any.whl", hash = "sha256:fc0544c531920dde3b00c29863377f87e1632601092ea2daca74e4beb40faa2e"}, - {file = "multiprocess-0.70.16-py38-none-any.whl", hash = "sha256:a71d82033454891091a226dfc319d0cfa8019a4e888ef9ca910372a446de4435"}, - {file = "multiprocess-0.70.16-py39-none-any.whl", hash = "sha256:a0bafd3ae1b732eac64be2e72038231c1ba97724b60b09400d68f229fcc2fbf3"}, - {file = "multiprocess-0.70.16.tar.gz", hash = "sha256:161af703d4652a0e1410be6abccecde4a7ddffd19341be0a7011b94aeb171ac1"}, -] - -[package.dependencies] -dill = ">=0.3.8" - -[[package]] -name = "nest-asyncio" -version = "1.6.0" -description = "Patch asyncio to allow nested event loops" -optional = true -python-versions = ">=3.5" -files = [ - {file = "nest_asyncio-1.6.0-py3-none-any.whl", hash = "sha256:87af6efd6b5e897c81050477ef65c62e2b2f35d51703cae01aff2905b1852e1c"}, - {file = "nest_asyncio-1.6.0.tar.gz", hash = "sha256:6f172d5449aca15afd6c646851f4e31e02c598d553a667e38cafa997cfec55fe"}, -] - -[[package]] -name = "networkx" -version = "3.2.1" -description = "Python package for creating and manipulating graphs and networks" -optional = true -python-versions = ">=3.9" -files = [ - {file = "networkx-3.2.1-py3-none-any.whl", hash = "sha256:f18c69adc97877c42332c170849c96cefa91881c99a7cb3e95b7c659ebdc1ec2"}, - {file = "networkx-3.2.1.tar.gz", hash = "sha256:9f1bb5cf3409bf324e0a722c20bdb4c20ee39bf1c30ce8ae499c8502b0b5e0c6"}, -] - -[package.extras] -default = ["matplotlib (>=3.5)", "numpy (>=1.22)", "pandas (>=1.4)", "scipy (>=1.9,!=1.11.0,!=1.11.1)"] -developer = ["changelist (==0.4)", "mypy (>=1.1)", "pre-commit (>=3.2)", "rtoml"] -doc = ["nb2plots (>=0.7)", "nbconvert (<7.9)", "numpydoc (>=1.6)", "pillow (>=9.4)", "pydata-sphinx-theme (>=0.14)", "sphinx (>=7)", "sphinx-gallery (>=0.14)", "texext (>=0.6.7)"] -extra = ["lxml (>=4.6)", "pydot (>=1.4.2)", "pygraphviz (>=1.11)", "sympy (>=1.10)"] -test = ["pytest (>=7.2)", "pytest-cov (>=4.0)"] - -[[package]] -name = "numpy" -version = "1.26.4" -description = "Fundamental package for array computing in Python" -optional = false -python-versions = ">=3.9" -files = [ - {file = "numpy-1.26.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:9ff0f4f29c51e2803569d7a51c2304de5554655a60c5d776e35b4a41413830d0"}, - {file = "numpy-1.26.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2e4ee3380d6de9c9ec04745830fd9e2eccb3e6cf790d39d7b98ffd19b0dd754a"}, - {file = "numpy-1.26.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d209d8969599b27ad20994c8e41936ee0964e6da07478d6c35016bc386b66ad4"}, - {file = "numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ffa75af20b44f8dba823498024771d5ac50620e6915abac414251bd971b4529f"}, - {file = "numpy-1.26.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:62b8e4b1e28009ef2846b4c7852046736bab361f7aeadeb6a5b89ebec3c7055a"}, - {file = "numpy-1.26.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a4abb4f9001ad2858e7ac189089c42178fcce737e4169dc61321660f1a96c7d2"}, - {file = "numpy-1.26.4-cp310-cp310-win32.whl", hash = "sha256:bfe25acf8b437eb2a8b2d49d443800a5f18508cd811fea3181723922a8a82b07"}, - {file = "numpy-1.26.4-cp310-cp310-win_amd64.whl", hash = "sha256:b97fe8060236edf3662adfc2c633f56a08ae30560c56310562cb4f95500022d5"}, - {file = "numpy-1.26.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = 
"sha256:4c66707fabe114439db9068ee468c26bbdf909cac0fb58686a42a24de1760c71"}, - {file = "numpy-1.26.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:edd8b5fe47dab091176d21bb6de568acdd906d1887a4584a15a9a96a1dca06ef"}, - {file = "numpy-1.26.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7ab55401287bfec946ced39700c053796e7cc0e3acbef09993a9ad2adba6ca6e"}, - {file = "numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:666dbfb6ec68962c033a450943ded891bed2d54e6755e35e5835d63f4f6931d5"}, - {file = "numpy-1.26.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:96ff0b2ad353d8f990b63294c8986f1ec3cb19d749234014f4e7eb0112ceba5a"}, - {file = "numpy-1.26.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:60dedbb91afcbfdc9bc0b1f3f402804070deed7392c23eb7a7f07fa857868e8a"}, - {file = "numpy-1.26.4-cp311-cp311-win32.whl", hash = "sha256:1af303d6b2210eb850fcf03064d364652b7120803a0b872f5211f5234b399f20"}, - {file = "numpy-1.26.4-cp311-cp311-win_amd64.whl", hash = "sha256:cd25bcecc4974d09257ffcd1f098ee778f7834c3ad767fe5db785be9a4aa9cb2"}, - {file = "numpy-1.26.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b3ce300f3644fb06443ee2222c2201dd3a89ea6040541412b8fa189341847218"}, - {file = "numpy-1.26.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:03a8c78d01d9781b28a6989f6fa1bb2c4f2d51201cf99d3dd875df6fbd96b23b"}, - {file = "numpy-1.26.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9fad7dcb1aac3c7f0584a5a8133e3a43eeb2fe127f47e3632d43d677c66c102b"}, - {file = "numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:675d61ffbfa78604709862923189bad94014bef562cc35cf61d3a07bba02a7ed"}, - {file = "numpy-1.26.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:ab47dbe5cc8210f55aa58e4805fe224dac469cde56b9f731a4c098b91917159a"}, - {file = "numpy-1.26.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:1dda2e7b4ec9dd512f84935c5f126c8bd8b9f2fc001e9f54af255e8c5f16b0e0"}, - {file = "numpy-1.26.4-cp312-cp312-win32.whl", hash = "sha256:50193e430acfc1346175fcbdaa28ffec49947a06918b7b92130744e81e640110"}, - {file = "numpy-1.26.4-cp312-cp312-win_amd64.whl", hash = "sha256:08beddf13648eb95f8d867350f6a018a4be2e5ad54c8d8caed89ebca558b2818"}, - {file = "numpy-1.26.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7349ab0fa0c429c82442a27a9673fc802ffdb7c7775fad780226cb234965e53c"}, - {file = "numpy-1.26.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:52b8b60467cd7dd1e9ed082188b4e6bb35aa5cdd01777621a1658910745b90be"}, - {file = "numpy-1.26.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d5241e0a80d808d70546c697135da2c613f30e28251ff8307eb72ba696945764"}, - {file = "numpy-1.26.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f870204a840a60da0b12273ef34f7051e98c3b5961b61b0c2c1be6dfd64fbcd3"}, - {file = "numpy-1.26.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:679b0076f67ecc0138fd2ede3a8fd196dddc2ad3254069bcb9faf9a79b1cebcd"}, - {file = "numpy-1.26.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:47711010ad8555514b434df65f7d7b076bb8261df1ca9bb78f53d3b2db02e95c"}, - {file = "numpy-1.26.4-cp39-cp39-win32.whl", hash = "sha256:a354325ee03388678242a4d7ebcd08b5c727033fcff3b2f536aea978e15ee9e6"}, - {file = "numpy-1.26.4-cp39-cp39-win_amd64.whl", hash = "sha256:3373d5d70a5fe74a2c1bb6d2cfd9609ecf686d47a2d7b1d37a8f3b6bf6003aea"}, - {file = "numpy-1.26.4-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = 
"sha256:afedb719a9dcfc7eaf2287b839d8198e06dcd4cb5d276a3df279231138e83d30"}, - {file = "numpy-1.26.4-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95a7476c59002f2f6c590b9b7b998306fba6a5aa646b1e22ddfeaf8f78c3a29c"}, - {file = "numpy-1.26.4-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:7e50d0a0cc3189f9cb0aeb3a6a6af18c16f59f004b866cd2be1c14b36134a4a0"}, - {file = "numpy-1.26.4.tar.gz", hash = "sha256:2a02aba9ed12e4ac4eb3ea9421c420301a0c6460d9830d74a9df87efa4912010"}, -] - -[[package]] -name = "nvidia-cublas-cu12" -version = "12.4.5.8" -description = "CUBLAS native runtime libraries" -optional = true -python-versions = ">=3" -files = [ - {file = "nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_aarch64.whl", hash = "sha256:0f8aa1706812e00b9f19dfe0cdb3999b092ccb8ca168c0db5b8ea712456fd9b3"}, - {file = "nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl", hash = "sha256:2fc8da60df463fdefa81e323eef2e36489e1c94335b5358bcb38360adf75ac9b"}, - {file = "nvidia_cublas_cu12-12.4.5.8-py3-none-win_amd64.whl", hash = "sha256:5a796786da89203a0657eda402bcdcec6180254a8ac22d72213abc42069522dc"}, -] - -[[package]] -name = "nvidia-cuda-cupti-cu12" -version = "12.4.127" -description = "CUDA profiling tools runtime libs." -optional = true -python-versions = ">=3" -files = [ - {file = "nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:79279b35cf6f91da114182a5ce1864997fd52294a87a16179ce275773799458a"}, - {file = "nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:9dec60f5ac126f7bb551c055072b69d85392b13311fcc1bcda2202d172df30fb"}, - {file = "nvidia_cuda_cupti_cu12-12.4.127-py3-none-win_amd64.whl", hash = "sha256:5688d203301ab051449a2b1cb6690fbe90d2b372f411521c86018b950f3d7922"}, -] - -[[package]] -name = "nvidia-cuda-nvrtc-cu12" -version = "12.4.127" -description = "NVRTC native runtime libraries" -optional = true -python-versions = ">=3" -files = [ - {file = "nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:0eedf14185e04b76aa05b1fea04133e59f465b6f960c0cbf4e37c3cb6b0ea198"}, - {file = "nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:a178759ebb095827bd30ef56598ec182b85547f1508941a3d560eb7ea1fbf338"}, - {file = "nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-win_amd64.whl", hash = "sha256:a961b2f1d5f17b14867c619ceb99ef6fcec12e46612711bcec78eb05068a60ec"}, -] - -[[package]] -name = "nvidia-cuda-runtime-cu12" -version = "12.4.127" -description = "CUDA Runtime native Libraries" -optional = true -python-versions = ">=3" -files = [ - {file = "nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:961fe0e2e716a2a1d967aab7caee97512f71767f852f67432d572e36cb3a11f3"}, - {file = "nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:64403288fa2136ee8e467cdc9c9427e0434110899d07c779f25b5c068934faa5"}, - {file = "nvidia_cuda_runtime_cu12-12.4.127-py3-none-win_amd64.whl", hash = "sha256:09c2e35f48359752dfa822c09918211844a3d93c100a715d79b59591130c5e1e"}, -] - -[[package]] -name = "nvidia-cudnn-cu12" -version = "9.1.0.70" -description = "cuDNN runtime libraries" -optional = true -python-versions = ">=3" -files = [ - {file = "nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl", hash = "sha256:165764f44ef8c61fcdfdfdbe769d687e06374059fbb388b6c89ecb0e28793a6f"}, - {file = "nvidia_cudnn_cu12-9.1.0.70-py3-none-win_amd64.whl", hash = 
"sha256:6278562929433d68365a07a4a1546c237ba2849852c0d4b2262a486e805b977a"}, -] - -[package.dependencies] -nvidia-cublas-cu12 = "*" - -[[package]] -name = "nvidia-cufft-cu12" -version = "11.2.1.3" -description = "CUFFT native runtime libraries" -optional = true -python-versions = ">=3" -files = [ - {file = "nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_aarch64.whl", hash = "sha256:5dad8008fc7f92f5ddfa2101430917ce2ffacd86824914c82e28990ad7f00399"}, - {file = "nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl", hash = "sha256:f083fc24912aa410be21fa16d157fed2055dab1cc4b6934a0e03cba69eb242b9"}, - {file = "nvidia_cufft_cu12-11.2.1.3-py3-none-win_amd64.whl", hash = "sha256:d802f4954291101186078ccbe22fc285a902136f974d369540fd4a5333d1440b"}, -] - -[package.dependencies] -nvidia-nvjitlink-cu12 = "*" - -[[package]] -name = "nvidia-curand-cu12" -version = "10.3.5.147" -description = "CURAND native runtime libraries" -optional = true -python-versions = ">=3" -files = [ - {file = "nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_aarch64.whl", hash = "sha256:1f173f09e3e3c76ab084aba0de819c49e56614feae5c12f69883f4ae9bb5fad9"}, - {file = "nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl", hash = "sha256:a88f583d4e0bb643c49743469964103aa59f7f708d862c3ddb0fc07f851e3b8b"}, - {file = "nvidia_curand_cu12-10.3.5.147-py3-none-win_amd64.whl", hash = "sha256:f307cc191f96efe9e8f05a87096abc20d08845a841889ef78cb06924437f6771"}, -] - -[[package]] -name = "nvidia-cusolver-cu12" -version = "11.6.1.9" -description = "CUDA solver native runtime libraries" -optional = true -python-versions = ">=3" -files = [ - {file = "nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_aarch64.whl", hash = "sha256:d338f155f174f90724bbde3758b7ac375a70ce8e706d70b018dd3375545fc84e"}, - {file = "nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl", hash = "sha256:19e33fa442bcfd085b3086c4ebf7e8debc07cfe01e11513cc6d332fd918ac260"}, - {file = "nvidia_cusolver_cu12-11.6.1.9-py3-none-win_amd64.whl", hash = "sha256:e77314c9d7b694fcebc84f58989f3aa4fb4cb442f12ca1a9bde50f5e8f6d1b9c"}, -] - -[package.dependencies] -nvidia-cublas-cu12 = "*" -nvidia-cusparse-cu12 = "*" -nvidia-nvjitlink-cu12 = "*" - -[[package]] -name = "nvidia-cusparse-cu12" -version = "12.3.1.170" -description = "CUSPARSE native runtime libraries" -optional = true -python-versions = ">=3" -files = [ - {file = "nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_aarch64.whl", hash = "sha256:9d32f62896231ebe0480efd8a7f702e143c98cfaa0e8a76df3386c1ba2b54df3"}, - {file = "nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl", hash = "sha256:ea4f11a2904e2a8dc4b1833cc1b5181cde564edd0d5cd33e3c168eff2d1863f1"}, - {file = "nvidia_cusparse_cu12-12.3.1.170-py3-none-win_amd64.whl", hash = "sha256:9bc90fb087bc7b4c15641521f31c0371e9a612fc2ba12c338d3ae032e6b6797f"}, -] - -[package.dependencies] -nvidia-nvjitlink-cu12 = "*" - -[[package]] -name = "nvidia-ml-py" -version = "12.560.30" -description = "Python Bindings for the NVIDIA Management Library" -optional = true -python-versions = "*" -files = [ - {file = "nvidia-ml-py-12.560.30.tar.gz", hash = "sha256:f0254dc7400647680a072ee02509bfd46102b60bdfeca321576d4d4817e7fe97"}, - {file = "nvidia_ml_py-12.560.30-py3-none-any.whl", hash = "sha256:fea371c94d63e38a611c17bbb85fe400e9c8ddb9e8684a9cd0e47786a4bc3c73"}, -] - -[[package]] -name = "nvidia-nccl-cu12" -version = "2.21.5" -description = "NVIDIA Collective Communication Library (NCCL) Runtime" -optional = true -python-versions = ">=3" -files = [ 
- {file = "nvidia_nccl_cu12-2.21.5-py3-none-manylinux2014_x86_64.whl", hash = "sha256:8579076d30a8c24988834445f8d633c697d42397e92ffc3f63fa26766d25e0a0"}, -] - -[[package]] -name = "nvidia-nvjitlink-cu12" -version = "12.4.127" -description = "Nvidia JIT LTO Library" -optional = true -python-versions = ">=3" -files = [ - {file = "nvidia_nvjitlink_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:4abe7fef64914ccfa909bc2ba39739670ecc9e820c83ccc7a6ed414122599b83"}, - {file = "nvidia_nvjitlink_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:06b3b9b25bf3f8af351d664978ca26a16d2c5127dbd53c0497e28d1fb9611d57"}, - {file = "nvidia_nvjitlink_cu12-12.4.127-py3-none-win_amd64.whl", hash = "sha256:fd9020c501d27d135f983c6d3e244b197a7ccad769e34df53a42e276b0e25fa1"}, -] - -[[package]] -name = "nvidia-nvtx-cu12" -version = "12.4.127" -description = "NVIDIA Tools Extension" -optional = true -python-versions = ">=3" -files = [ - {file = "nvidia_nvtx_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:7959ad635db13edf4fc65c06a6e9f9e55fc2f92596db928d169c0bb031e88ef3"}, - {file = "nvidia_nvtx_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:781e950d9b9f60d8241ccea575b32f5105a5baf4c2351cab5256a24869f12a1a"}, - {file = "nvidia_nvtx_cu12-12.4.127-py3-none-win_amd64.whl", hash = "sha256:641dccaaa1139f3ffb0d3164b4b84f9d253397e38246a4f2f36728b48566d485"}, -] - -[[package]] -name = "opentelemetry-api" -version = "1.27.0" -description = "OpenTelemetry Python API" -optional = false -python-versions = ">=3.8" -files = [ - {file = "opentelemetry_api-1.27.0-py3-none-any.whl", hash = "sha256:953d5871815e7c30c81b56d910c707588000fff7a3ca1c73e6531911d53065e7"}, - {file = "opentelemetry_api-1.27.0.tar.gz", hash = "sha256:ed673583eaa5f81b5ce5e86ef7cdaf622f88ef65f0b9aab40b843dcae5bef342"}, -] - -[package.dependencies] -deprecated = ">=1.2.6" -importlib-metadata = ">=6.0,<=8.4.0" - -[[package]] -name = "opentelemetry-exporter-otlp" -version = "1.27.0" -description = "OpenTelemetry Collector Exporters" -optional = false -python-versions = ">=3.8" -files = [ - {file = "opentelemetry_exporter_otlp-1.27.0-py3-none-any.whl", hash = "sha256:7688791cbdd951d71eb6445951d1cfbb7b6b2d7ee5948fac805d404802931145"}, - {file = "opentelemetry_exporter_otlp-1.27.0.tar.gz", hash = "sha256:4a599459e623868cc95d933c301199c2367e530f089750e115599fccd67cb2a1"}, -] - -[package.dependencies] -opentelemetry-exporter-otlp-proto-grpc = "1.27.0" -opentelemetry-exporter-otlp-proto-http = "1.27.0" - -[[package]] -name = "opentelemetry-exporter-otlp-proto-common" -version = "1.27.0" -description = "OpenTelemetry Protobuf encoding" -optional = false -python-versions = ">=3.8" -files = [ - {file = "opentelemetry_exporter_otlp_proto_common-1.27.0-py3-none-any.whl", hash = "sha256:675db7fffcb60946f3a5c43e17d1168a3307a94a930ecf8d2ea1f286f3d4f79a"}, - {file = "opentelemetry_exporter_otlp_proto_common-1.27.0.tar.gz", hash = "sha256:159d27cf49f359e3798c4c3eb8da6ef4020e292571bd8c5604a2a573231dd5c8"}, -] - -[package.dependencies] -opentelemetry-proto = "1.27.0" - -[[package]] -name = "opentelemetry-exporter-otlp-proto-grpc" -version = "1.27.0" -description = "OpenTelemetry Collector Protobuf over gRPC Exporter" -optional = false -python-versions = ">=3.8" -files = [ - {file = "opentelemetry_exporter_otlp_proto_grpc-1.27.0-py3-none-any.whl", hash = "sha256:56b5bbd5d61aab05e300d9d62a6b3c134827bbd28d0b12f2649c2da368006c9e"}, - {file = "opentelemetry_exporter_otlp_proto_grpc-1.27.0.tar.gz", hash = 
"sha256:af6f72f76bcf425dfb5ad11c1a6d6eca2863b91e63575f89bb7b4b55099d968f"}, -] - -[package.dependencies] -deprecated = ">=1.2.6" -googleapis-common-protos = ">=1.52,<2.0" -grpcio = ">=1.0.0,<2.0.0" -opentelemetry-api = ">=1.15,<2.0" -opentelemetry-exporter-otlp-proto-common = "1.27.0" -opentelemetry-proto = "1.27.0" -opentelemetry-sdk = ">=1.27.0,<1.28.0" - -[[package]] -name = "opentelemetry-exporter-otlp-proto-http" -version = "1.27.0" -description = "OpenTelemetry Collector Protobuf over HTTP Exporter" -optional = false -python-versions = ">=3.8" -files = [ - {file = "opentelemetry_exporter_otlp_proto_http-1.27.0-py3-none-any.whl", hash = "sha256:688027575c9da42e179a69fe17e2d1eba9b14d81de8d13553a21d3114f3b4d75"}, - {file = "opentelemetry_exporter_otlp_proto_http-1.27.0.tar.gz", hash = "sha256:2103479092d8eb18f61f3fbff084f67cc7f2d4a7d37e75304b8b56c1d09ebef5"}, -] - -[package.dependencies] -deprecated = ">=1.2.6" -googleapis-common-protos = ">=1.52,<2.0" -opentelemetry-api = ">=1.15,<2.0" -opentelemetry-exporter-otlp-proto-common = "1.27.0" -opentelemetry-proto = "1.27.0" -opentelemetry-sdk = ">=1.27.0,<1.28.0" -requests = ">=2.7,<3.0" - -[[package]] -name = "opentelemetry-instrumentation" -version = "0.48b0" -description = "Instrumentation Tools & Auto Instrumentation for OpenTelemetry Python" -optional = false -python-versions = ">=3.8" -files = [ - {file = "opentelemetry_instrumentation-0.48b0-py3-none-any.whl", hash = "sha256:a69750dc4ba6a5c3eb67986a337185a25b739966d80479befe37b546fc870b44"}, - {file = "opentelemetry_instrumentation-0.48b0.tar.gz", hash = "sha256:94929685d906380743a71c3970f76b5f07476eea1834abd5dd9d17abfe23cc35"}, -] - -[package.dependencies] -opentelemetry-api = ">=1.4,<2.0" -setuptools = ">=16.0" -wrapt = ">=1.0.0,<2.0.0" - -[[package]] -name = "opentelemetry-instrumentation-grpc" -version = "0.48b0" -description = "OpenTelemetry gRPC instrumentation" -optional = false -python-versions = ">=3.8" -files = [ - {file = "opentelemetry_instrumentation_grpc-0.48b0-py3-none-any.whl", hash = "sha256:50eb68fec49ceb1bbb0a06e5a2456bf6a5d7d56c7cecd152199f566f02030995"}, - {file = "opentelemetry_instrumentation_grpc-0.48b0.tar.gz", hash = "sha256:b95c11056dc384a926c2a16b994d7caa2fcf73abe0fe8b4db3007d5c9cf0be81"}, -] - -[package.dependencies] -opentelemetry-api = ">=1.12,<2.0" -opentelemetry-instrumentation = "0.48b0" -opentelemetry-semantic-conventions = "0.48b0" -wrapt = ">=1.0.0,<2.0.0" - -[package.extras] -instruments = ["grpcio (>=1.27,<2.0)"] - -[[package]] -name = "opentelemetry-proto" -version = "1.27.0" -description = "OpenTelemetry Python Proto" -optional = false -python-versions = ">=3.8" -files = [ - {file = "opentelemetry_proto-1.27.0-py3-none-any.whl", hash = "sha256:b133873de5581a50063e1e4b29cdcf0c5e253a8c2d8dc1229add20a4c3830ace"}, - {file = "opentelemetry_proto-1.27.0.tar.gz", hash = "sha256:33c9345d91dafd8a74fc3d7576c5a38f18b7fdf8d02983ac67485386132aedd6"}, -] - -[package.dependencies] -protobuf = ">=3.19,<5.0" - -[[package]] -name = "opentelemetry-sdk" -version = "1.27.0" -description = "OpenTelemetry Python SDK" -optional = false -python-versions = ">=3.8" -files = [ - {file = "opentelemetry_sdk-1.27.0-py3-none-any.whl", hash = "sha256:365f5e32f920faf0fd9e14fdfd92c086e317eaa5f860edba9cdc17a380d9197d"}, - {file = "opentelemetry_sdk-1.27.0.tar.gz", hash = "sha256:d525017dea0ccce9ba4e0245100ec46ecdc043f2d7b8315d56b19aff0904fa6f"}, -] - -[package.dependencies] -opentelemetry-api = "1.27.0" -opentelemetry-semantic-conventions = "0.48b0" -typing-extensions = 
">=3.7.4" - -[[package]] -name = "opentelemetry-semantic-conventions" -version = "0.48b0" -description = "OpenTelemetry Semantic Conventions" -optional = false -python-versions = ">=3.8" -files = [ - {file = "opentelemetry_semantic_conventions-0.48b0-py3-none-any.whl", hash = "sha256:a0de9f45c413a8669788a38569c7e0a11ce6ce97861a628cca785deecdc32a1f"}, - {file = "opentelemetry_semantic_conventions-0.48b0.tar.gz", hash = "sha256:12d74983783b6878162208be57c9effcb89dc88691c64992d70bb89dc00daa1a"}, -] - -[package.dependencies] -deprecated = ">=1.2.6" -opentelemetry-api = "1.27.0" - -[[package]] -name = "outlines" -version = "0.1.5" -description = "Probabilistic Generative Model Programming" -optional = true -python-versions = ">=3.9" -files = [ - {file = "outlines-0.1.5-py3-none-any.whl", hash = "sha256:072443e6d3064c8b705ea2956d52bc7947a7c7310a17c07b9ef2133aec64bece"}, - {file = "outlines-0.1.5.tar.gz", hash = "sha256:f0e0efc382d70abe6d5782283fd7fb4eca8380ae3f554df3bce10998452b2115"}, -] - -[package.dependencies] -airportsdata = "*" -cloudpickle = "*" -datasets = "*" -diskcache = "*" -interegular = "*" -jinja2 = "*" -jsonschema = "*" -lark = "*" -nest_asyncio = "*" -numpy = "<2.0.0" -outlines_core = "0.1.17" -pycountry = "*" -pydantic = ">=2.0" -referencing = "*" -requests = "*" -torch = "*" -tqdm = "*" -typing_extensions = "*" - -[package.extras] -serve = ["fastapi", "pydantic (>=2.0)", "uvicorn", "vllm (>=0.3.0)"] -test = ["accelerate", "beartype (<0.16.0)", "coverage[toml] (>=5.1)", "diff-cover", "exllamav2", "huggingface_hub", "llama-cpp-python", "mlx-lm", "openai (>=1.0.0)", "pillow", "pre-commit", "pytest", "pytest-benchmark", "pytest-cov", "pytest-mock", "responses", "transformers", "vllm"] - -[[package]] -name = "outlines-core" -version = "0.1.17" -description = "Structured Text Generation in Rust" -optional = true -python-versions = ">=3.8" -files = [ - {file = "outlines_core-0.1.17-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:8406ff855d0b6a46a90df51b1250345de917ed688a393f0f6471ad7ea0360522"}, - {file = "outlines_core-0.1.17-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8602b6ffc10a0e1bd3e7f41669bdf155a2dfe8bb1bca93deab36354e7f5c1d5a"}, - {file = "outlines_core-0.1.17-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d381e7ff0be6797dbbb6a5d6b3c60dd8fb862bf829ccdd45de86b8886e5e9a00"}, - {file = "outlines_core-0.1.17-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:661b4f5ba19119eaa98ffbbcb466b6fa34bae20353e6c61f155571f0a1937b67"}, - {file = "outlines_core-0.1.17-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:84ca47e609daec4c855ac1eb23697764d34efe58824d7a1b191b4adfa9248d1a"}, - {file = "outlines_core-0.1.17-cp310-cp310-win32.whl", hash = "sha256:8477aa967d37a0257046b997c06e2229a981269e25b71738cf90b8dd68a939ce"}, - {file = "outlines_core-0.1.17-cp310-cp310-win_amd64.whl", hash = "sha256:b6ffc681a3f6511dabbf1ec9a121f65a57a17358c7c757fdc7a019835385ba62"}, - {file = "outlines_core-0.1.17-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4ca85ef745f512c288b5eb9911414c1fa78dd83c160d1a3e8b0ef6d1d8c17129"}, - {file = "outlines_core-0.1.17-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:543234e0679bb933ce5eea8b20b952b0d3a41ec0e428c47910517812c54a8626"}, - {file = "outlines_core-0.1.17-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1da1a47e9eae006b7ac75d92a1efed85056aa6e13c4fd0580dc6c72f9c090c31"}, - {file = 
"outlines_core-0.1.17-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:46a0c4db4cda55d1e4c14fefc88ee89351078d3a9d9a73bcceb1ef22cb8767de"}, - {file = "outlines_core-0.1.17-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:cc3fa10a7673c64afcabc1ce60c14af99c64d2faca6be3d6593ccc7d93b86683"}, - {file = "outlines_core-0.1.17-cp311-cp311-win32.whl", hash = "sha256:47978a58cfd54fda9e9f7080644f1a91c3ea1ac7a19661dd904505463191f4ac"}, - {file = "outlines_core-0.1.17-cp311-cp311-win_amd64.whl", hash = "sha256:0e2d3ac5b549931743b413659eb702fdb2251350c595c9057f1c219f88ae23bd"}, - {file = "outlines_core-0.1.17-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:1056bc12c62d5f85ca1768b087d73fcebecd4b5613b1af699a46d9fc92c28f26"}, - {file = "outlines_core-0.1.17-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:c7df3872c9db77f4f6086ca4315a2cd6d60a50ac1d96be3e6f6445ef5d0ff29b"}, - {file = "outlines_core-0.1.17-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e38896880ef1681d94ad270fbfe831ea6ccfe46a4e7a70c9e47d35b20e4ebd5d"}, - {file = "outlines_core-0.1.17-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8a64c62a9ee73c368e5f058a3ca59834b87259820fe2a0639d2fa02fed7bad01"}, - {file = "outlines_core-0.1.17-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:1ba904fdef09fc52fcddca0d7ae24729be35d2f2bdde787a6ec628408c487677"}, - {file = "outlines_core-0.1.17-cp312-cp312-win32.whl", hash = "sha256:5624dfa129ddf348596040e2122b2fc1d2cfb20b2d0cc2868df8c101b5781327"}, - {file = "outlines_core-0.1.17-cp312-cp312-win_amd64.whl", hash = "sha256:f930e9c0bf5939047a76aee23cf984a83aaf859aedd988ae5249134ff8a2577b"}, - {file = "outlines_core-0.1.17-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:5c70efcdfff7d5017cb8635789e43050e616a8144dab6ab854420cda3ae5b967"}, - {file = "outlines_core-0.1.17-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:bab703fb11aad79c58f1ea158caae2489c7ff9b1898ec25d802662502ee0844a"}, - {file = "outlines_core-0.1.17-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9a0973b8bfd3432627d0fa76d1c1d2f5c1ab3b838a4559c69c10b80b1ace52b8"}, - {file = "outlines_core-0.1.17-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:de2a3082af9995c45552c2bb42db02abf60328bc2d5371d0a5924a7453410145"}, - {file = "outlines_core-0.1.17-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:47adcad14d4d2a3093ef7af2b8ae4c750192ba3947f7d3e85b8e11cb118fc7fd"}, - {file = "outlines_core-0.1.17-cp313-cp313-win32.whl", hash = "sha256:88b83d497d272a3244a598ef029c572cc5a354fb48d115a94f2bb39db47fc2b0"}, - {file = "outlines_core-0.1.17-cp313-cp313-win_amd64.whl", hash = "sha256:a7ab7a4534172e949814609d3152c70a305244445bdfba51204d99e6c658ab64"}, - {file = "outlines_core-0.1.17-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:35e4aeb7e05d4afe5c8c8b9e90993406d91d86ef86bb7f629947adf903345165"}, - {file = "outlines_core-0.1.17-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:26a235d304e3ca84deda377cf51efb47f13f9abaa21c0ff9930037f7a87a5b68"}, - {file = "outlines_core-0.1.17-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9d8350947367caa0f99dfa99cfe4cf19c517ca0b0ddeb78bbb2a34fd0e3f9b42"}, - {file = "outlines_core-0.1.17-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:bdda0cb1525853cb85a10cb73facf18b6b44e5dcdf385068eea2a814230704f5"}, - {file = "outlines_core-0.1.17-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:7cdf6f8cc5b5ba90d059ad5d351251a63cf06b3d6d62e6ab3ba49079c344d472"}, - {file = "outlines_core-0.1.17-cp38-cp38-win32.whl", hash = "sha256:b6c2f1d3df66b592fc889bf90c871bbb878fdc03355a4a4092ffe086fa962e8a"}, - {file = "outlines_core-0.1.17-cp38-cp38-win_amd64.whl", hash = "sha256:f66e576ffa0f07857289dd35cf524225fec04a75c66b8d99d8dfda0f3d763d8a"}, - {file = "outlines_core-0.1.17-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:1a83e52437954f1fc39fbc3d78182890284b495580b9246c4e9ca0c1b61fa54a"}, - {file = "outlines_core-0.1.17-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:624ae80f4bf3faa16b9654df0fb9bc5aec38e537f07b0d64c99a4358ae597ab5"}, - {file = "outlines_core-0.1.17-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9cca7897d65400481a763b88a889fae41032d2340325c76dc5af9458d474e1b2"}, - {file = "outlines_core-0.1.17-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7157749b0105aa1c5c50974ab6d2badde8660176a3ec5eacedee4fb2137f94e7"}, - {file = "outlines_core-0.1.17-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:6f146d76d73e3f43604752b6e4324d38cfdbe9197ffe661482d0e1d7368fb06a"}, - {file = "outlines_core-0.1.17-cp39-cp39-win32.whl", hash = "sha256:466d3817fc50268d82084efde5524a345f884339d9fe21773afd47e1a65e2671"}, - {file = "outlines_core-0.1.17-cp39-cp39-win_amd64.whl", hash = "sha256:b4d7db5ce2a88b9399e7c492e8a34da740d3dc86f0ddf80538903c31d05a12d4"}, - {file = "outlines_core-0.1.17.tar.gz", hash = "sha256:0925a12ab1e8818c6f1896feefdb4d4d8c922e6461515062d1bd3e60459b0281"}, -] - -[package.dependencies] -interegular = "*" -jsonschema = "*" - -[package.extras] -test = ["accelerate", "asv", "beartype (<0.16.0)", "coverage[toml] (>=5.1)", "datasets", "diff-cover", "huggingface-hub", "numpy", "pillow", "pre-commit", "pydantic", "pytest", "pytest-benchmark", "pytest-cov", "pytest-mock", "scipy", "setuptools-rust", "torch", "transformers"] - -[[package]] -name = "packaging" -version = "24.1" -description = "Core utilities for Python packages" -optional = false -python-versions = ">=3.8" -files = [ - {file = "packaging-24.1-py3-none-any.whl", hash = "sha256:5b8f2217dbdbd2f7f384c41c628544e6d52f2d0f53c6d0c3ea61aa5d1d7ff124"}, - {file = "packaging-24.1.tar.gz", hash = "sha256:026ed72c8ed3fcce5bf8950572258698927fd1dbda10a5e981cdf0ac37f4f002"}, -] - -[[package]] -name = "pandas" -version = "2.2.3" -description = "Powerful data structures for data analysis, time series, and statistics" -optional = true -python-versions = ">=3.9" -files = [ - {file = "pandas-2.2.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1948ddde24197a0f7add2bdc4ca83bf2b1ef84a1bc8ccffd95eda17fd836ecb5"}, - {file = "pandas-2.2.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:381175499d3802cde0eabbaf6324cce0c4f5d52ca6f8c377c29ad442f50f6348"}, - {file = "pandas-2.2.3-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d9c45366def9a3dd85a6454c0e7908f2b3b8e9c138f5dc38fed7ce720d8453ed"}, - {file = "pandas-2.2.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:86976a1c5b25ae3f8ccae3a5306e443569ee3c3faf444dfd0f41cda24667ad57"}, - {file = "pandas-2.2.3-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:b8661b0238a69d7aafe156b7fa86c44b881387509653fdf857bebc5e4008ad42"}, - {file = "pandas-2.2.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = 
"sha256:37e0aced3e8f539eccf2e099f65cdb9c8aa85109b0be6e93e2baff94264bdc6f"}, - {file = "pandas-2.2.3-cp310-cp310-win_amd64.whl", hash = "sha256:56534ce0746a58afaf7942ba4863e0ef81c9c50d3f0ae93e9497d6a41a057645"}, - {file = "pandas-2.2.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:66108071e1b935240e74525006034333f98bcdb87ea116de573a6a0dccb6c039"}, - {file = "pandas-2.2.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7c2875855b0ff77b2a64a0365e24455d9990730d6431b9e0ee18ad8acee13dbd"}, - {file = "pandas-2.2.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cd8d0c3be0515c12fed0bdbae072551c8b54b7192c7b1fda0ba56059a0179698"}, - {file = "pandas-2.2.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c124333816c3a9b03fbeef3a9f230ba9a737e9e5bb4060aa2107a86cc0a497fc"}, - {file = "pandas-2.2.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:63cc132e40a2e084cf01adf0775b15ac515ba905d7dcca47e9a251819c575ef3"}, - {file = "pandas-2.2.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:29401dbfa9ad77319367d36940cd8a0b3a11aba16063e39632d98b0e931ddf32"}, - {file = "pandas-2.2.3-cp311-cp311-win_amd64.whl", hash = "sha256:3fc6873a41186404dad67245896a6e440baacc92f5b716ccd1bc9ed2995ab2c5"}, - {file = "pandas-2.2.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b1d432e8d08679a40e2a6d8b2f9770a5c21793a6f9f47fdd52c5ce1948a5a8a9"}, - {file = "pandas-2.2.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:a5a1595fe639f5988ba6a8e5bc9649af3baf26df3998a0abe56c02609392e0a4"}, - {file = "pandas-2.2.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:5de54125a92bb4d1c051c0659e6fcb75256bf799a732a87184e5ea503965bce3"}, - {file = "pandas-2.2.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fffb8ae78d8af97f849404f21411c95062db1496aeb3e56f146f0355c9989319"}, - {file = "pandas-2.2.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6dfcb5ee8d4d50c06a51c2fffa6cff6272098ad6540aed1a76d15fb9318194d8"}, - {file = "pandas-2.2.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:062309c1b9ea12a50e8ce661145c6aab431b1e99530d3cd60640e255778bd43a"}, - {file = "pandas-2.2.3-cp312-cp312-win_amd64.whl", hash = "sha256:59ef3764d0fe818125a5097d2ae867ca3fa64df032331b7e0917cf5d7bf66b13"}, - {file = "pandas-2.2.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:f00d1345d84d8c86a63e476bb4955e46458b304b9575dcf71102b5c705320015"}, - {file = "pandas-2.2.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:3508d914817e153ad359d7e069d752cdd736a247c322d932eb89e6bc84217f28"}, - {file = "pandas-2.2.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:22a9d949bfc9a502d320aa04e5d02feab689d61da4e7764b62c30b991c42c5f0"}, - {file = "pandas-2.2.3-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f3a255b2c19987fbbe62a9dfd6cff7ff2aa9ccab3fc75218fd4b7530f01efa24"}, - {file = "pandas-2.2.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:800250ecdadb6d9c78eae4990da62743b857b470883fa27f652db8bdde7f6659"}, - {file = "pandas-2.2.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:6374c452ff3ec675a8f46fd9ab25c4ad0ba590b71cf0656f8b6daa5202bca3fb"}, - {file = "pandas-2.2.3-cp313-cp313-win_amd64.whl", hash = "sha256:61c5ad4043f791b61dd4752191d9f07f0ae412515d59ba8f005832a532f8736d"}, - {file = "pandas-2.2.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:3b71f27954685ee685317063bf13c7709a7ba74fc996b84fc6821c59b0f06468"}, - {file = 
"pandas-2.2.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:38cf8125c40dae9d5acc10fa66af8ea6fdf760b2714ee482ca691fc66e6fcb18"}, - {file = "pandas-2.2.3-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ba96630bc17c875161df3818780af30e43be9b166ce51c9a18c1feae342906c2"}, - {file = "pandas-2.2.3-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1db71525a1538b30142094edb9adc10be3f3e176748cd7acc2240c2f2e5aa3a4"}, - {file = "pandas-2.2.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:15c0e1e02e93116177d29ff83e8b1619c93ddc9c49083f237d4312337a61165d"}, - {file = "pandas-2.2.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:ad5b65698ab28ed8d7f18790a0dc58005c7629f227be9ecc1072aa74c0c1d43a"}, - {file = "pandas-2.2.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:bc6b93f9b966093cb0fd62ff1a7e4c09e6d546ad7c1de191767baffc57628f39"}, - {file = "pandas-2.2.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:5dbca4c1acd72e8eeef4753eeca07de9b1db4f398669d5994086f788a5d7cc30"}, - {file = "pandas-2.2.3-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:8cd6d7cc958a3910f934ea8dbdf17b2364827bb4dafc38ce6eef6bb3d65ff09c"}, - {file = "pandas-2.2.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:99df71520d25fade9db7c1076ac94eb994f4d2673ef2aa2e86ee039b6746d20c"}, - {file = "pandas-2.2.3-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:31d0ced62d4ea3e231a9f228366919a5ea0b07440d9d4dac345376fd8e1477ea"}, - {file = "pandas-2.2.3-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:7eee9e7cea6adf3e3d24e304ac6b8300646e2a5d1cd3a3c2abed9101b0846761"}, - {file = "pandas-2.2.3-cp39-cp39-win_amd64.whl", hash = "sha256:4850ba03528b6dd51d6c5d273c46f183f39a9baf3f0143e566b89450965b105e"}, - {file = "pandas-2.2.3.tar.gz", hash = "sha256:4f18ba62b61d7e192368b84517265a99b4d7ee8912f8708660fb4a366cc82667"}, -] - -[package.dependencies] -numpy = [ - {version = ">=1.22.4", markers = "python_version < \"3.11\""}, - {version = ">=1.23.2", markers = "python_version == \"3.11\""}, - {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, -] -python-dateutil = ">=2.8.2" -pytz = ">=2020.1" -tzdata = ">=2022.7" - -[package.extras] -all = ["PyQt5 (>=5.15.9)", "SQLAlchemy (>=2.0.0)", "adbc-driver-postgresql (>=0.8.0)", "adbc-driver-sqlite (>=0.8.0)", "beautifulsoup4 (>=4.11.2)", "bottleneck (>=1.3.6)", "dataframe-api-compat (>=0.1.7)", "fastparquet (>=2022.12.0)", "fsspec (>=2022.11.0)", "gcsfs (>=2022.11.0)", "html5lib (>=1.1)", "hypothesis (>=6.46.1)", "jinja2 (>=3.1.2)", "lxml (>=4.9.2)", "matplotlib (>=3.6.3)", "numba (>=0.56.4)", "numexpr (>=2.8.4)", "odfpy (>=1.4.1)", "openpyxl (>=3.1.0)", "pandas-gbq (>=0.19.0)", "psycopg2 (>=2.9.6)", "pyarrow (>=10.0.1)", "pymysql (>=1.0.2)", "pyreadstat (>=1.2.0)", "pytest (>=7.3.2)", "pytest-xdist (>=2.2.0)", "python-calamine (>=0.1.7)", "pyxlsb (>=1.0.10)", "qtpy (>=2.3.0)", "s3fs (>=2022.11.0)", "scipy (>=1.10.0)", "tables (>=3.8.0)", "tabulate (>=0.9.0)", "xarray (>=2022.12.0)", "xlrd (>=2.0.1)", "xlsxwriter (>=3.0.5)", "zstandard (>=0.19.0)"] -aws = ["s3fs (>=2022.11.0)"] -clipboard = ["PyQt5 (>=5.15.9)", "qtpy (>=2.3.0)"] -compression = ["zstandard (>=0.19.0)"] -computation = ["scipy (>=1.10.0)", "xarray (>=2022.12.0)"] -consortium-standard = ["dataframe-api-compat (>=0.1.7)"] -excel = ["odfpy (>=1.4.1)", "openpyxl (>=3.1.0)", "python-calamine (>=0.1.7)", "pyxlsb (>=1.0.10)", "xlrd (>=2.0.1)", "xlsxwriter (>=3.0.5)"] -feather = ["pyarrow (>=10.0.1)"] -fss = 
["fsspec (>=2022.11.0)"] -gcp = ["gcsfs (>=2022.11.0)", "pandas-gbq (>=0.19.0)"] -hdf5 = ["tables (>=3.8.0)"] -html = ["beautifulsoup4 (>=4.11.2)", "html5lib (>=1.1)", "lxml (>=4.9.2)"] -mysql = ["SQLAlchemy (>=2.0.0)", "pymysql (>=1.0.2)"] -output-formatting = ["jinja2 (>=3.1.2)", "tabulate (>=0.9.0)"] -parquet = ["pyarrow (>=10.0.1)"] -performance = ["bottleneck (>=1.3.6)", "numba (>=0.56.4)", "numexpr (>=2.8.4)"] -plot = ["matplotlib (>=3.6.3)"] -postgresql = ["SQLAlchemy (>=2.0.0)", "adbc-driver-postgresql (>=0.8.0)", "psycopg2 (>=2.9.6)"] -pyarrow = ["pyarrow (>=10.0.1)"] -spss = ["pyreadstat (>=1.2.0)"] -sql-other = ["SQLAlchemy (>=2.0.0)", "adbc-driver-postgresql (>=0.8.0)", "adbc-driver-sqlite (>=0.8.0)"] -test = ["hypothesis (>=6.46.1)", "pytest (>=7.3.2)", "pytest-xdist (>=2.2.0)"] -xml = ["lxml (>=4.9.2)"] - -[[package]] -name = "peft" -version = "0.13.2" -description = "Parameter-Efficient Fine-Tuning (PEFT)" -optional = true -python-versions = ">=3.8.0" -files = [ - {file = "peft-0.13.2-py3-none-any.whl", hash = "sha256:d4e0951ec78eac11c45a051801c569913436888c578d48e5ce86996b715bc6ef"}, - {file = "peft-0.13.2.tar.gz", hash = "sha256:0e0cbd40ebdf5fe4ea79f255880d02f96712d18899509369a2cc5768ad46d672"}, -] - -[package.dependencies] -accelerate = ">=0.21.0" -huggingface-hub = ">=0.17.0" -numpy = ">=1.17" -packaging = ">=20.0" -psutil = "*" -pyyaml = "*" -safetensors = "*" -torch = ">=1.13.0" -tqdm = "*" -transformers = "*" - -[package.extras] -dev = ["black", "hf-doc-builder", "ruff (>=0.6.1,<0.7.0)"] -docs-specific = ["black", "hf-doc-builder"] -quality = ["black", "hf-doc-builder", "ruff (>=0.6.1,<0.7.0)"] -test = ["black", "datasets", "diffusers (<0.21.0)", "hf-doc-builder", "parameterized", "pytest", "pytest-cov", "pytest-xdist", "ruff (>=0.6.1,<0.7.0)", "scipy"] - -[[package]] -name = "pillow" -version = "11.0.0" -description = "Python Imaging Library (Fork)" -optional = false -python-versions = ">=3.9" -files = [ - {file = "pillow-11.0.0-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:6619654954dc4936fcff82db8eb6401d3159ec6be81e33c6000dfd76ae189947"}, - {file = "pillow-11.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:b3c5ac4bed7519088103d9450a1107f76308ecf91d6dabc8a33a2fcfb18d0fba"}, - {file = "pillow-11.0.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a65149d8ada1055029fcb665452b2814fe7d7082fcb0c5bed6db851cb69b2086"}, - {file = "pillow-11.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:88a58d8ac0cc0e7f3a014509f0455248a76629ca9b604eca7dc5927cc593c5e9"}, - {file = "pillow-11.0.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:c26845094b1af3c91852745ae78e3ea47abf3dbcd1cf962f16b9a5fbe3ee8488"}, - {file = "pillow-11.0.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:1a61b54f87ab5786b8479f81c4b11f4d61702830354520837f8cc791ebba0f5f"}, - {file = "pillow-11.0.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:674629ff60030d144b7bca2b8330225a9b11c482ed408813924619c6f302fdbb"}, - {file = "pillow-11.0.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:598b4e238f13276e0008299bd2482003f48158e2b11826862b1eb2ad7c768b97"}, - {file = "pillow-11.0.0-cp310-cp310-win32.whl", hash = "sha256:9a0f748eaa434a41fccf8e1ee7a3eed68af1b690e75328fd7a60af123c193b50"}, - {file = "pillow-11.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:a5629742881bcbc1f42e840af185fd4d83a5edeb96475a575f4da50d6ede337c"}, - {file = "pillow-11.0.0-cp310-cp310-win_arm64.whl", hash = 
"sha256:ee217c198f2e41f184f3869f3e485557296d505b5195c513b2bfe0062dc537f1"}, - {file = "pillow-11.0.0-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:1c1d72714f429a521d8d2d018badc42414c3077eb187a59579f28e4270b4b0fc"}, - {file = "pillow-11.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:499c3a1b0d6fc8213519e193796eb1a86a1be4b1877d678b30f83fd979811d1a"}, - {file = "pillow-11.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c8b2351c85d855293a299038e1f89db92a2f35e8d2f783489c6f0b2b5f3fe8a3"}, - {file = "pillow-11.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6f4dba50cfa56f910241eb7f883c20f1e7b1d8f7d91c750cd0b318bad443f4d5"}, - {file = "pillow-11.0.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:5ddbfd761ee00c12ee1be86c9c0683ecf5bb14c9772ddbd782085779a63dd55b"}, - {file = "pillow-11.0.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:45c566eb10b8967d71bf1ab8e4a525e5a93519e29ea071459ce517f6b903d7fa"}, - {file = "pillow-11.0.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:b4fd7bd29610a83a8c9b564d457cf5bd92b4e11e79a4ee4716a63c959699b306"}, - {file = "pillow-11.0.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:cb929ca942d0ec4fac404cbf520ee6cac37bf35be479b970c4ffadf2b6a1cad9"}, - {file = "pillow-11.0.0-cp311-cp311-win32.whl", hash = "sha256:006bcdd307cc47ba43e924099a038cbf9591062e6c50e570819743f5607404f5"}, - {file = "pillow-11.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:52a2d8323a465f84faaba5236567d212c3668f2ab53e1c74c15583cf507a0291"}, - {file = "pillow-11.0.0-cp311-cp311-win_arm64.whl", hash = "sha256:16095692a253047fe3ec028e951fa4221a1f3ed3d80c397e83541a3037ff67c9"}, - {file = "pillow-11.0.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:d2c0a187a92a1cb5ef2c8ed5412dd8d4334272617f532d4ad4de31e0495bd923"}, - {file = "pillow-11.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:084a07ef0821cfe4858fe86652fffac8e187b6ae677e9906e192aafcc1b69903"}, - {file = "pillow-11.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8069c5179902dcdce0be9bfc8235347fdbac249d23bd90514b7a47a72d9fecf4"}, - {file = "pillow-11.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f02541ef64077f22bf4924f225c0fd1248c168f86e4b7abdedd87d6ebaceab0f"}, - {file = "pillow-11.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:fcb4621042ac4b7865c179bb972ed0da0218a076dc1820ffc48b1d74c1e37fe9"}, - {file = "pillow-11.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:00177a63030d612148e659b55ba99527803288cea7c75fb05766ab7981a8c1b7"}, - {file = "pillow-11.0.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8853a3bf12afddfdf15f57c4b02d7ded92c7a75a5d7331d19f4f9572a89c17e6"}, - {file = "pillow-11.0.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:3107c66e43bda25359d5ef446f59c497de2b5ed4c7fdba0894f8d6cf3822dafc"}, - {file = "pillow-11.0.0-cp312-cp312-win32.whl", hash = "sha256:86510e3f5eca0ab87429dd77fafc04693195eec7fd6a137c389c3eeb4cfb77c6"}, - {file = "pillow-11.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:8ec4a89295cd6cd4d1058a5e6aec6bf51e0eaaf9714774e1bfac7cfc9051db47"}, - {file = "pillow-11.0.0-cp312-cp312-win_arm64.whl", hash = "sha256:27a7860107500d813fcd203b4ea19b04babe79448268403172782754870dac25"}, - {file = "pillow-11.0.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:bcd1fb5bb7b07f64c15618c89efcc2cfa3e95f0e3bcdbaf4642509de1942a699"}, - {file = "pillow-11.0.0-cp313-cp313-macosx_11_0_arm64.whl", hash = 
"sha256:0e038b0745997c7dcaae350d35859c9715c71e92ffb7e0f4a8e8a16732150f38"}, - {file = "pillow-11.0.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0ae08bd8ffc41aebf578c2af2f9d8749d91f448b3bfd41d7d9ff573d74f2a6b2"}, - {file = "pillow-11.0.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d69bfd8ec3219ae71bcde1f942b728903cad25fafe3100ba2258b973bd2bc1b2"}, - {file = "pillow-11.0.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:61b887f9ddba63ddf62fd02a3ba7add935d053b6dd7d58998c630e6dbade8527"}, - {file = "pillow-11.0.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:c6a660307ca9d4867caa8d9ca2c2658ab685de83792d1876274991adec7b93fa"}, - {file = "pillow-11.0.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:73e3a0200cdda995c7e43dd47436c1548f87a30bb27fb871f352a22ab8dcf45f"}, - {file = "pillow-11.0.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:fba162b8872d30fea8c52b258a542c5dfd7b235fb5cb352240c8d63b414013eb"}, - {file = "pillow-11.0.0-cp313-cp313-win32.whl", hash = "sha256:f1b82c27e89fffc6da125d5eb0ca6e68017faf5efc078128cfaa42cf5cb38798"}, - {file = "pillow-11.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:8ba470552b48e5835f1d23ecb936bb7f71d206f9dfeee64245f30c3270b994de"}, - {file = "pillow-11.0.0-cp313-cp313-win_arm64.whl", hash = "sha256:846e193e103b41e984ac921b335df59195356ce3f71dcfd155aa79c603873b84"}, - {file = "pillow-11.0.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:4ad70c4214f67d7466bea6a08061eba35c01b1b89eaa098040a35272a8efb22b"}, - {file = "pillow-11.0.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:6ec0d5af64f2e3d64a165f490d96368bb5dea8b8f9ad04487f9ab60dc4bb6003"}, - {file = "pillow-11.0.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c809a70e43c7977c4a42aefd62f0131823ebf7dd73556fa5d5950f5b354087e2"}, - {file = "pillow-11.0.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:4b60c9520f7207aaf2e1d94de026682fc227806c6e1f55bba7606d1c94dd623a"}, - {file = "pillow-11.0.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:1e2688958a840c822279fda0086fec1fdab2f95bf2b717b66871c4ad9859d7e8"}, - {file = "pillow-11.0.0-cp313-cp313t-win32.whl", hash = "sha256:607bbe123c74e272e381a8d1957083a9463401f7bd01287f50521ecb05a313f8"}, - {file = "pillow-11.0.0-cp313-cp313t-win_amd64.whl", hash = "sha256:5c39ed17edea3bc69c743a8dd3e9853b7509625c2462532e62baa0732163a904"}, - {file = "pillow-11.0.0-cp313-cp313t-win_arm64.whl", hash = "sha256:75acbbeb05b86bc53cbe7b7e6fe00fbcf82ad7c684b3ad82e3d711da9ba287d3"}, - {file = "pillow-11.0.0-cp39-cp39-macosx_10_10_x86_64.whl", hash = "sha256:2e46773dc9f35a1dd28bd6981332fd7f27bec001a918a72a79b4133cf5291dba"}, - {file = "pillow-11.0.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:2679d2258b7f1192b378e2893a8a0a0ca472234d4c2c0e6bdd3380e8dfa21b6a"}, - {file = "pillow-11.0.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:eda2616eb2313cbb3eebbe51f19362eb434b18e3bb599466a1ffa76a033fb916"}, - {file = "pillow-11.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:20ec184af98a121fb2da42642dea8a29ec80fc3efbaefb86d8fdd2606619045d"}, - {file = "pillow-11.0.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:8594f42df584e5b4bb9281799698403f7af489fba84c34d53d1c4bfb71b7c4e7"}, - {file = "pillow-11.0.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:c12b5ae868897c7338519c03049a806af85b9b8c237b7d675b8c5e089e4a618e"}, - {file = 
"pillow-11.0.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:70fbbdacd1d271b77b7721fe3cdd2d537bbbd75d29e6300c672ec6bb38d9672f"}, - {file = "pillow-11.0.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:5178952973e588b3f1360868847334e9e3bf49d19e169bbbdfaf8398002419ae"}, - {file = "pillow-11.0.0-cp39-cp39-win32.whl", hash = "sha256:8c676b587da5673d3c75bd67dd2a8cdfeb282ca38a30f37950511766b26858c4"}, - {file = "pillow-11.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:94f3e1780abb45062287b4614a5bc0874519c86a777d4a7ad34978e86428b8dd"}, - {file = "pillow-11.0.0-cp39-cp39-win_arm64.whl", hash = "sha256:290f2cc809f9da7d6d622550bbf4c1e57518212da51b6a30fe8e0a270a5b78bd"}, - {file = "pillow-11.0.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:1187739620f2b365de756ce086fdb3604573337cc28a0d3ac4a01ab6b2d2a6d2"}, - {file = "pillow-11.0.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:fbbcb7b57dc9c794843e3d1258c0fbf0f48656d46ffe9e09b63bbd6e8cd5d0a2"}, - {file = "pillow-11.0.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5d203af30149ae339ad1b4f710d9844ed8796e97fda23ffbc4cc472968a47d0b"}, - {file = "pillow-11.0.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:21a0d3b115009ebb8ac3d2ebec5c2982cc693da935f4ab7bb5c8ebe2f47d36f2"}, - {file = "pillow-11.0.0-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:73853108f56df97baf2bb8b522f3578221e56f646ba345a372c78326710d3830"}, - {file = "pillow-11.0.0-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:e58876c91f97b0952eb766123bfef372792ab3f4e3e1f1a2267834c2ab131734"}, - {file = "pillow-11.0.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:224aaa38177597bb179f3ec87eeefcce8e4f85e608025e9cfac60de237ba6316"}, - {file = "pillow-11.0.0-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:5bd2d3bdb846d757055910f0a59792d33b555800813c3b39ada1829c372ccb06"}, - {file = "pillow-11.0.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:375b8dd15a1f5d2feafff536d47e22f69625c1aa92f12b339ec0b2ca40263273"}, - {file = "pillow-11.0.0-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:daffdf51ee5db69a82dd127eabecce20729e21f7a3680cf7cbb23f0829189790"}, - {file = "pillow-11.0.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:7326a1787e3c7b0429659e0a944725e1b03eeaa10edd945a86dead1913383944"}, - {file = "pillow-11.0.0.tar.gz", hash = "sha256:72bacbaf24ac003fea9bff9837d1eedb6088758d41e100c1552930151f677739"}, -] - -[package.extras] -docs = ["furo", "olefile", "sphinx (>=8.1)", "sphinx-copybutton", "sphinx-inline-tabs", "sphinxext-opengraph"] -fpx = ["olefile"] -mic = ["olefile"] -tests = ["check-manifest", "coverage", "defusedxml", "markdown2", "olefile", "packaging", "pyroma", "pytest", "pytest-cov", "pytest-timeout"] -typing = ["typing-extensions"] -xmp = ["defusedxml"] - -[[package]] -name = "pluggy" -version = "1.5.0" -description = "plugin and hook calling mechanisms for python" -optional = false -python-versions = ">=3.8" -files = [ - {file = "pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669"}, - {file = "pluggy-1.5.0.tar.gz", hash = "sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1"}, -] - -[package.extras] -dev = ["pre-commit", "tox"] -testing = ["pytest", "pytest-benchmark"] - -[[package]] -name = "prometheus-client" -version = "0.20.0" -description = "Python client for the Prometheus monitoring system." 
-optional = false -python-versions = ">=3.8" -files = [ - {file = "prometheus_client-0.20.0-py3-none-any.whl", hash = "sha256:cde524a85bce83ca359cc837f28b8c0db5cac7aa653a588fd7e84ba061c329e7"}, - {file = "prometheus_client-0.20.0.tar.gz", hash = "sha256:287629d00b147a32dcb2be0b9df905da599b2d82f80377083ec8463309a4bb89"}, -] - -[package.extras] -twisted = ["twisted"] - -[[package]] -name = "propcache" -version = "0.2.0" -description = "Accelerated property cache" -optional = true -python-versions = ">=3.8" -files = [ - {file = "propcache-0.2.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:c5869b8fd70b81835a6f187c5fdbe67917a04d7e52b6e7cc4e5fe39d55c39d58"}, - {file = "propcache-0.2.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:952e0d9d07609d9c5be361f33b0d6d650cd2bae393aabb11d9b719364521984b"}, - {file = "propcache-0.2.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:33ac8f098df0585c0b53009f039dfd913b38c1d2edafed0cedcc0c32a05aa110"}, - {file = "propcache-0.2.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:97e48e8875e6c13909c800fa344cd54cc4b2b0db1d5f911f840458a500fde2c2"}, - {file = "propcache-0.2.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:388f3217649d6d59292b722d940d4d2e1e6a7003259eb835724092a1cca0203a"}, - {file = "propcache-0.2.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f571aea50ba5623c308aa146eb650eebf7dbe0fd8c5d946e28343cb3b5aad577"}, - {file = "propcache-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3dfafb44f7bb35c0c06eda6b2ab4bfd58f02729e7c4045e179f9a861b07c9850"}, - {file = "propcache-0.2.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a3ebe9a75be7ab0b7da2464a77bb27febcb4fab46a34f9288f39d74833db7f61"}, - {file = "propcache-0.2.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:d2f0d0f976985f85dfb5f3d685697ef769faa6b71993b46b295cdbbd6be8cc37"}, - {file = "propcache-0.2.0-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:a3dc1a4b165283bd865e8f8cb5f0c64c05001e0718ed06250d8cac9bec115b48"}, - {file = "propcache-0.2.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:9e0f07b42d2a50c7dd2d8675d50f7343d998c64008f1da5fef888396b7f84630"}, - {file = "propcache-0.2.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:e63e3e1e0271f374ed489ff5ee73d4b6e7c60710e1f76af5f0e1a6117cd26394"}, - {file = "propcache-0.2.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:56bb5c98f058a41bb58eead194b4db8c05b088c93d94d5161728515bd52b052b"}, - {file = "propcache-0.2.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:7665f04d0c7f26ff8bb534e1c65068409bf4687aa2534faf7104d7182debb336"}, - {file = "propcache-0.2.0-cp310-cp310-win32.whl", hash = "sha256:7cf18abf9764746b9c8704774d8b06714bcb0a63641518a3a89c7f85cc02c2ad"}, - {file = "propcache-0.2.0-cp310-cp310-win_amd64.whl", hash = "sha256:cfac69017ef97db2438efb854edf24f5a29fd09a536ff3a992b75990720cdc99"}, - {file = "propcache-0.2.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:63f13bf09cc3336eb04a837490b8f332e0db41da66995c9fd1ba04552e516354"}, - {file = "propcache-0.2.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:608cce1da6f2672a56b24a015b42db4ac612ee709f3d29f27a00c943d9e851de"}, - {file = "propcache-0.2.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:466c219deee4536fbc83c08d09115249db301550625c7fef1c5563a584c9bc87"}, - {file = "propcache-0.2.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", 
hash = "sha256:fc2db02409338bf36590aa985a461b2c96fce91f8e7e0f14c50c5fcc4f229016"}, - {file = "propcache-0.2.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a6ed8db0a556343d566a5c124ee483ae113acc9a557a807d439bcecc44e7dfbb"}, - {file = "propcache-0.2.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:91997d9cb4a325b60d4e3f20967f8eb08dfcb32b22554d5ef78e6fd1dda743a2"}, - {file = "propcache-0.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4c7dde9e533c0a49d802b4f3f218fa9ad0a1ce21f2c2eb80d5216565202acab4"}, - {file = "propcache-0.2.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ffcad6c564fe6b9b8916c1aefbb37a362deebf9394bd2974e9d84232e3e08504"}, - {file = "propcache-0.2.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:97a58a28bcf63284e8b4d7b460cbee1edaab24634e82059c7b8c09e65284f178"}, - {file = "propcache-0.2.0-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:945db8ee295d3af9dbdbb698cce9bbc5c59b5c3fe328bbc4387f59a8a35f998d"}, - {file = "propcache-0.2.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:39e104da444a34830751715f45ef9fc537475ba21b7f1f5b0f4d71a3b60d7fe2"}, - {file = "propcache-0.2.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:c5ecca8f9bab618340c8e848d340baf68bcd8ad90a8ecd7a4524a81c1764b3db"}, - {file = "propcache-0.2.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:c436130cc779806bdf5d5fae0d848713105472b8566b75ff70048c47d3961c5b"}, - {file = "propcache-0.2.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:191db28dc6dcd29d1a3e063c3be0b40688ed76434622c53a284e5427565bbd9b"}, - {file = "propcache-0.2.0-cp311-cp311-win32.whl", hash = "sha256:5f2564ec89058ee7c7989a7b719115bdfe2a2fb8e7a4543b8d1c0cc4cf6478c1"}, - {file = "propcache-0.2.0-cp311-cp311-win_amd64.whl", hash = "sha256:6e2e54267980349b723cff366d1e29b138b9a60fa376664a157a342689553f71"}, - {file = "propcache-0.2.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:2ee7606193fb267be4b2e3b32714f2d58cad27217638db98a60f9efb5efeccc2"}, - {file = "propcache-0.2.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:91ee8fc02ca52e24bcb77b234f22afc03288e1dafbb1f88fe24db308910c4ac7"}, - {file = "propcache-0.2.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2e900bad2a8456d00a113cad8c13343f3b1f327534e3589acc2219729237a2e8"}, - {file = "propcache-0.2.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f52a68c21363c45297aca15561812d542f8fc683c85201df0bebe209e349f793"}, - {file = "propcache-0.2.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1e41d67757ff4fbc8ef2af99b338bfb955010444b92929e9e55a6d4dcc3c4f09"}, - {file = "propcache-0.2.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a64e32f8bd94c105cc27f42d3b658902b5bcc947ece3c8fe7bc1b05982f60e89"}, - {file = "propcache-0.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:55346705687dbd7ef0d77883ab4f6fabc48232f587925bdaf95219bae072491e"}, - {file = "propcache-0.2.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:00181262b17e517df2cd85656fcd6b4e70946fe62cd625b9d74ac9977b64d8d9"}, - {file = "propcache-0.2.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6994984550eaf25dd7fc7bd1b700ff45c894149341725bb4edc67f0ffa94efa4"}, - {file = "propcache-0.2.0-cp312-cp312-musllinux_1_2_armv7l.whl", hash = 
"sha256:56295eb1e5f3aecd516d91b00cfd8bf3a13991de5a479df9e27dd569ea23959c"}, - {file = "propcache-0.2.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:439e76255daa0f8151d3cb325f6dd4a3e93043e6403e6491813bcaaaa8733887"}, - {file = "propcache-0.2.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:f6475a1b2ecb310c98c28d271a30df74f9dd436ee46d09236a6b750a7599ce57"}, - {file = "propcache-0.2.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:3444cdba6628accf384e349014084b1cacd866fbb88433cd9d279d90a54e0b23"}, - {file = "propcache-0.2.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:4a9d9b4d0a9b38d1c391bb4ad24aa65f306c6f01b512e10a8a34a2dc5675d348"}, - {file = "propcache-0.2.0-cp312-cp312-win32.whl", hash = "sha256:69d3a98eebae99a420d4b28756c8ce6ea5a29291baf2dc9ff9414b42676f61d5"}, - {file = "propcache-0.2.0-cp312-cp312-win_amd64.whl", hash = "sha256:ad9c9b99b05f163109466638bd30ada1722abb01bbb85c739c50b6dc11f92dc3"}, - {file = "propcache-0.2.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:ecddc221a077a8132cf7c747d5352a15ed763b674c0448d811f408bf803d9ad7"}, - {file = "propcache-0.2.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:0e53cb83fdd61cbd67202735e6a6687a7b491c8742dfc39c9e01e80354956763"}, - {file = "propcache-0.2.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:92fe151145a990c22cbccf9ae15cae8ae9eddabfc949a219c9f667877e40853d"}, - {file = "propcache-0.2.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d6a21ef516d36909931a2967621eecb256018aeb11fc48656e3257e73e2e247a"}, - {file = "propcache-0.2.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3f88a4095e913f98988f5b338c1d4d5d07dbb0b6bad19892fd447484e483ba6b"}, - {file = "propcache-0.2.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5a5b3bb545ead161be780ee85a2b54fdf7092815995661947812dde94a40f6fb"}, - {file = "propcache-0.2.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:67aeb72e0f482709991aa91345a831d0b707d16b0257e8ef88a2ad246a7280bf"}, - {file = "propcache-0.2.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3c997f8c44ec9b9b0bcbf2d422cc00a1d9b9c681f56efa6ca149a941e5560da2"}, - {file = "propcache-0.2.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:2a66df3d4992bc1d725b9aa803e8c5a66c010c65c741ad901e260ece77f58d2f"}, - {file = "propcache-0.2.0-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:3ebbcf2a07621f29638799828b8d8668c421bfb94c6cb04269130d8de4fb7136"}, - {file = "propcache-0.2.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:1235c01ddaa80da8235741e80815ce381c5267f96cc49b1477fdcf8c047ef325"}, - {file = "propcache-0.2.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:3947483a381259c06921612550867b37d22e1df6d6d7e8361264b6d037595f44"}, - {file = "propcache-0.2.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:d5bed7f9805cc29c780f3aee05de3262ee7ce1f47083cfe9f77471e9d6777e83"}, - {file = "propcache-0.2.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:e4a91d44379f45f5e540971d41e4626dacd7f01004826a18cb048e7da7e96544"}, - {file = "propcache-0.2.0-cp313-cp313-win32.whl", hash = "sha256:f902804113e032e2cdf8c71015651c97af6418363bea8d78dc0911d56c335032"}, - {file = "propcache-0.2.0-cp313-cp313-win_amd64.whl", hash = "sha256:8f188cfcc64fb1266f4684206c9de0e80f54622c3f22a910cbd200478aeae61e"}, - {file = "propcache-0.2.0-cp38-cp38-macosx_10_9_universal2.whl", hash = 
"sha256:53d1bd3f979ed529f0805dd35ddaca330f80a9a6d90bc0121d2ff398f8ed8861"}, - {file = "propcache-0.2.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:83928404adf8fb3d26793665633ea79b7361efa0287dfbd372a7e74311d51ee6"}, - {file = "propcache-0.2.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:77a86c261679ea5f3896ec060be9dc8e365788248cc1e049632a1be682442063"}, - {file = "propcache-0.2.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:218db2a3c297a3768c11a34812e63b3ac1c3234c3a086def9c0fee50d35add1f"}, - {file = "propcache-0.2.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7735e82e3498c27bcb2d17cb65d62c14f1100b71723b68362872bca7d0913d90"}, - {file = "propcache-0.2.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:20a617c776f520c3875cf4511e0d1db847a076d720714ae35ffe0df3e440be68"}, - {file = "propcache-0.2.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:67b69535c870670c9f9b14a75d28baa32221d06f6b6fa6f77a0a13c5a7b0a5b9"}, - {file = "propcache-0.2.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4569158070180c3855e9c0791c56be3ceeb192defa2cdf6a3f39e54319e56b89"}, - {file = "propcache-0.2.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:db47514ffdbd91ccdc7e6f8407aac4ee94cc871b15b577c1c324236b013ddd04"}, - {file = "propcache-0.2.0-cp38-cp38-musllinux_1_2_armv7l.whl", hash = "sha256:2a60ad3e2553a74168d275a0ef35e8c0a965448ffbc3b300ab3a5bb9956c2162"}, - {file = "propcache-0.2.0-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:662dd62358bdeaca0aee5761de8727cfd6861432e3bb828dc2a693aa0471a563"}, - {file = "propcache-0.2.0-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:25a1f88b471b3bc911d18b935ecb7115dff3a192b6fef46f0bfaf71ff4f12418"}, - {file = "propcache-0.2.0-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:f60f0ac7005b9f5a6091009b09a419ace1610e163fa5deaba5ce3484341840e7"}, - {file = "propcache-0.2.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:74acd6e291f885678631b7ebc85d2d4aec458dd849b8c841b57ef04047833bed"}, - {file = "propcache-0.2.0-cp38-cp38-win32.whl", hash = "sha256:d9b6ddac6408194e934002a69bcaadbc88c10b5f38fb9307779d1c629181815d"}, - {file = "propcache-0.2.0-cp38-cp38-win_amd64.whl", hash = "sha256:676135dcf3262c9c5081cc8f19ad55c8a64e3f7282a21266d05544450bffc3a5"}, - {file = "propcache-0.2.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:25c8d773a62ce0451b020c7b29a35cfbc05de8b291163a7a0f3b7904f27253e6"}, - {file = "propcache-0.2.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:375a12d7556d462dc64d70475a9ee5982465fbb3d2b364f16b86ba9135793638"}, - {file = "propcache-0.2.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:1ec43d76b9677637a89d6ab86e1fef70d739217fefa208c65352ecf0282be957"}, - {file = "propcache-0.2.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f45eec587dafd4b2d41ac189c2156461ebd0c1082d2fe7013571598abb8505d1"}, - {file = "propcache-0.2.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bc092ba439d91df90aea38168e11f75c655880c12782facf5cf9c00f3d42b562"}, - {file = "propcache-0.2.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fa1076244f54bb76e65e22cb6910365779d5c3d71d1f18b275f1dfc7b0d71b4d"}, - {file = "propcache-0.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:682a7c79a2fbf40f5dbb1eb6bfe2cd865376deeac65acf9beb607505dced9e12"}, - {file = 
"propcache-0.2.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8e40876731f99b6f3c897b66b803c9e1c07a989b366c6b5b475fafd1f7ba3fb8"}, - {file = "propcache-0.2.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:363ea8cd3c5cb6679f1c2f5f1f9669587361c062e4899fce56758efa928728f8"}, - {file = "propcache-0.2.0-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:140fbf08ab3588b3468932974a9331aff43c0ab8a2ec2c608b6d7d1756dbb6cb"}, - {file = "propcache-0.2.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:e70fac33e8b4ac63dfc4c956fd7d85a0b1139adcfc0d964ce288b7c527537fea"}, - {file = "propcache-0.2.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:b33d7a286c0dc1a15f5fc864cc48ae92a846df287ceac2dd499926c3801054a6"}, - {file = "propcache-0.2.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:f6d5749fdd33d90e34c2efb174c7e236829147a2713334d708746e94c4bde40d"}, - {file = "propcache-0.2.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:22aa8f2272d81d9317ff5756bb108021a056805ce63dd3630e27d042c8092798"}, - {file = "propcache-0.2.0-cp39-cp39-win32.whl", hash = "sha256:73e4b40ea0eda421b115248d7e79b59214411109a5bc47d0d48e4c73e3b8fcf9"}, - {file = "propcache-0.2.0-cp39-cp39-win_amd64.whl", hash = "sha256:9517d5e9e0731957468c29dbfd0f976736a0e55afaea843726e887f36fe017df"}, - {file = "propcache-0.2.0-py3-none-any.whl", hash = "sha256:2ccc28197af5313706511fab3a8b66dcd6da067a1331372c82ea1cb74285e036"}, - {file = "propcache-0.2.0.tar.gz", hash = "sha256:df81779732feb9d01e5d513fad0122efb3d53bbc75f61b2a4f29a020bc985e70"}, -] - -[[package]] -name = "protobuf" -version = "4.25.5" -description = "" -optional = false -python-versions = ">=3.8" -files = [ - {file = "protobuf-4.25.5-cp310-abi3-win32.whl", hash = "sha256:5e61fd921603f58d2f5acb2806a929b4675f8874ff5f330b7d6f7e2e784bbcd8"}, - {file = "protobuf-4.25.5-cp310-abi3-win_amd64.whl", hash = "sha256:4be0571adcbe712b282a330c6e89eae24281344429ae95c6d85e79e84780f5ea"}, - {file = "protobuf-4.25.5-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:b2fde3d805354df675ea4c7c6338c1aecd254dfc9925e88c6d31a2bcb97eb173"}, - {file = "protobuf-4.25.5-cp37-abi3-manylinux2014_aarch64.whl", hash = "sha256:919ad92d9b0310070f8356c24b855c98df2b8bd207ebc1c0c6fcc9ab1e007f3d"}, - {file = "protobuf-4.25.5-cp37-abi3-manylinux2014_x86_64.whl", hash = "sha256:fe14e16c22be926d3abfcb500e60cab068baf10b542b8c858fa27e098123e331"}, - {file = "protobuf-4.25.5-cp38-cp38-win32.whl", hash = "sha256:98d8d8aa50de6a2747efd9cceba361c9034050ecce3e09136f90de37ddba66e1"}, - {file = "protobuf-4.25.5-cp38-cp38-win_amd64.whl", hash = "sha256:b0234dd5a03049e4ddd94b93400b67803c823cfc405689688f59b34e0742381a"}, - {file = "protobuf-4.25.5-cp39-cp39-win32.whl", hash = "sha256:abe32aad8561aa7cc94fc7ba4fdef646e576983edb94a73381b03c53728a626f"}, - {file = "protobuf-4.25.5-cp39-cp39-win_amd64.whl", hash = "sha256:7a183f592dc80aa7c8da7ad9e55091c4ffc9497b3054452d629bb85fa27c2a45"}, - {file = "protobuf-4.25.5-py3-none-any.whl", hash = "sha256:0aebecb809cae990f8129ada5ca273d9d670b76d9bfc9b1809f0a9c02b7dbf41"}, - {file = "protobuf-4.25.5.tar.gz", hash = "sha256:7f8249476b4a9473645db7f8ab42b02fe1488cbe5fb72fddd445e0665afd8584"}, -] - -[[package]] -name = "psutil" -version = "6.1.0" -description = "Cross-platform lib for process and system monitoring in Python." 
-optional = true -python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7" -files = [ - {file = "psutil-6.1.0-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:ff34df86226c0227c52f38b919213157588a678d049688eded74c76c8ba4a5d0"}, - {file = "psutil-6.1.0-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:c0e0c00aa18ca2d3b2b991643b799a15fc8f0563d2ebb6040f64ce8dc027b942"}, - {file = "psutil-6.1.0-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:000d1d1ebd634b4efb383f4034437384e44a6d455260aaee2eca1e9c1b55f047"}, - {file = "psutil-6.1.0-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:5cd2bcdc75b452ba2e10f0e8ecc0b57b827dd5d7aaffbc6821b2a9a242823a76"}, - {file = "psutil-6.1.0-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:045f00a43c737f960d273a83973b2511430d61f283a44c96bf13a6e829ba8fdc"}, - {file = "psutil-6.1.0-cp27-none-win32.whl", hash = "sha256:9118f27452b70bb1d9ab3198c1f626c2499384935aaf55388211ad982611407e"}, - {file = "psutil-6.1.0-cp27-none-win_amd64.whl", hash = "sha256:a8506f6119cff7015678e2bce904a4da21025cc70ad283a53b099e7620061d85"}, - {file = "psutil-6.1.0-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:6e2dcd475ce8b80522e51d923d10c7871e45f20918e027ab682f94f1c6351688"}, - {file = "psutil-6.1.0-cp36-abi3-macosx_11_0_arm64.whl", hash = "sha256:0895b8414afafc526712c498bd9de2b063deaac4021a3b3c34566283464aff8e"}, - {file = "psutil-6.1.0-cp36-abi3-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9dcbfce5d89f1d1f2546a2090f4fcf87c7f669d1d90aacb7d7582addece9fb38"}, - {file = "psutil-6.1.0-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:498c6979f9c6637ebc3a73b3f87f9eb1ec24e1ce53a7c5173b8508981614a90b"}, - {file = "psutil-6.1.0-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d905186d647b16755a800e7263d43df08b790d709d575105d419f8b6ef65423a"}, - {file = "psutil-6.1.0-cp36-cp36m-win32.whl", hash = "sha256:6d3fbbc8d23fcdcb500d2c9f94e07b1342df8ed71b948a2649b5cb060a7c94ca"}, - {file = "psutil-6.1.0-cp36-cp36m-win_amd64.whl", hash = "sha256:1209036fbd0421afde505a4879dee3b2fd7b1e14fee81c0069807adcbbcca747"}, - {file = "psutil-6.1.0-cp37-abi3-win32.whl", hash = "sha256:1ad45a1f5d0b608253b11508f80940985d1d0c8f6111b5cb637533a0e6ddc13e"}, - {file = "psutil-6.1.0-cp37-abi3-win_amd64.whl", hash = "sha256:a8fb3752b491d246034fa4d279ff076501588ce8cbcdbb62c32fd7a377d996be"}, - {file = "psutil-6.1.0.tar.gz", hash = "sha256:353815f59a7f64cdaca1c0307ee13558a0512f6db064e92fe833784f08539c7a"}, -] - -[package.extras] -dev = ["black", "check-manifest", "coverage", "packaging", "pylint", "pyperf", "pypinfo", "pytest-cov", "requests", "rstcheck", "ruff", "sphinx", "sphinx_rtd_theme", "toml-sort", "twine", "virtualenv", "wheel"] -test = ["pytest", "pytest-xdist", "setuptools"] - -[[package]] -name = "py-cpuinfo" -version = "9.0.0" -description = "Get CPU info with pure Python" -optional = false -python-versions = "*" -files = [ - {file = "py-cpuinfo-9.0.0.tar.gz", hash = "sha256:3cdbbf3fac90dc6f118bfd64384f309edeadd902d7c8fb17f02ffa1fc3f49690"}, - {file = "py_cpuinfo-9.0.0-py3-none-any.whl", hash = "sha256:859625bc251f64e21f077d099d4162689c762b5d6a4c3c97553d56241c9674d5"}, -] - -[[package]] -name = "pyarrow" -version = "17.0.0" -description = "Python library for Apache Arrow" -optional = true -python-versions = ">=3.8" -files = [ - {file = "pyarrow-17.0.0-cp310-cp310-macosx_10_15_x86_64.whl", hash = 
"sha256:a5c8b238d47e48812ee577ee20c9a2779e6a5904f1708ae240f53ecbee7c9f07"}, - {file = "pyarrow-17.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:db023dc4c6cae1015de9e198d41250688383c3f9af8f565370ab2b4cb5f62655"}, - {file = "pyarrow-17.0.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:da1e060b3876faa11cee287839f9cc7cdc00649f475714b8680a05fd9071d545"}, - {file = "pyarrow-17.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:75c06d4624c0ad6674364bb46ef38c3132768139ddec1c56582dbac54f2663e2"}, - {file = "pyarrow-17.0.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:fa3c246cc58cb5a4a5cb407a18f193354ea47dd0648194e6265bd24177982fe8"}, - {file = "pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:f7ae2de664e0b158d1607699a16a488de3d008ba99b3a7aa5de1cbc13574d047"}, - {file = "pyarrow-17.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:5984f416552eea15fd9cee03da53542bf4cddaef5afecefb9aa8d1010c335087"}, - {file = "pyarrow-17.0.0-cp311-cp311-macosx_10_15_x86_64.whl", hash = "sha256:1c8856e2ef09eb87ecf937104aacfa0708f22dfeb039c363ec99735190ffb977"}, - {file = "pyarrow-17.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2e19f569567efcbbd42084e87f948778eb371d308e137a0f97afe19bb860ccb3"}, - {file = "pyarrow-17.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6b244dc8e08a23b3e352899a006a26ae7b4d0da7bb636872fa8f5884e70acf15"}, - {file = "pyarrow-17.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b72e87fe3e1db343995562f7fff8aee354b55ee83d13afba65400c178ab2597"}, - {file = "pyarrow-17.0.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:dc5c31c37409dfbc5d014047817cb4ccd8c1ea25d19576acf1a001fe07f5b420"}, - {file = "pyarrow-17.0.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:e3343cb1e88bc2ea605986d4b94948716edc7a8d14afd4e2c097232f729758b4"}, - {file = "pyarrow-17.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:a27532c38f3de9eb3e90ecab63dfda948a8ca859a66e3a47f5f42d1e403c4d03"}, - {file = "pyarrow-17.0.0-cp312-cp312-macosx_10_15_x86_64.whl", hash = "sha256:9b8a823cea605221e61f34859dcc03207e52e409ccf6354634143e23af7c8d22"}, - {file = "pyarrow-17.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f1e70de6cb5790a50b01d2b686d54aaf73da01266850b05e3af2a1bc89e16053"}, - {file = "pyarrow-17.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0071ce35788c6f9077ff9ecba4858108eebe2ea5a3f7cf2cf55ebc1dbc6ee24a"}, - {file = "pyarrow-17.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:757074882f844411fcca735e39aae74248a1531367a7c80799b4266390ae51cc"}, - {file = "pyarrow-17.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:9ba11c4f16976e89146781a83833df7f82077cdab7dc6232c897789343f7891a"}, - {file = "pyarrow-17.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:b0c6ac301093b42d34410b187bba560b17c0330f64907bfa4f7f7f2444b0cf9b"}, - {file = "pyarrow-17.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:392bc9feabc647338e6c89267635e111d71edad5fcffba204425a7c8d13610d7"}, - {file = "pyarrow-17.0.0-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:af5ff82a04b2171415f1410cff7ebb79861afc5dae50be73ce06d6e870615204"}, - {file = "pyarrow-17.0.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:edca18eaca89cd6382dfbcff3dd2d87633433043650c07375d095cd3517561d8"}, - {file = "pyarrow-17.0.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:7c7916bff914ac5d4a8fe25b7a25e432ff921e72f6f2b7547d1e325c1ad9d155"}, - {file = "pyarrow-17.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f553ca691b9e94b202ff741bdd40f6ccb70cdd5fbf65c187af132f1317de6145"}, - {file = "pyarrow-17.0.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:0cdb0e627c86c373205a2f94a510ac4376fdc523f8bb36beab2e7f204416163c"}, - {file = "pyarrow-17.0.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:d7d192305d9d8bc9082d10f361fc70a73590a4c65cf31c3e6926cd72b76bc35c"}, - {file = "pyarrow-17.0.0-cp38-cp38-win_amd64.whl", hash = "sha256:02dae06ce212d8b3244dd3e7d12d9c4d3046945a5933d28026598e9dbbda1fca"}, - {file = "pyarrow-17.0.0-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:13d7a460b412f31e4c0efa1148e1d29bdf18ad1411eb6757d38f8fbdcc8645fb"}, - {file = "pyarrow-17.0.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9b564a51fbccfab5a04a80453e5ac6c9954a9c5ef2890d1bcf63741909c3f8df"}, - {file = "pyarrow-17.0.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:32503827abbc5aadedfa235f5ece8c4f8f8b0a3cf01066bc8d29de7539532687"}, - {file = "pyarrow-17.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a155acc7f154b9ffcc85497509bcd0d43efb80d6f733b0dc3bb14e281f131c8b"}, - {file = "pyarrow-17.0.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:dec8d129254d0188a49f8a1fc99e0560dc1b85f60af729f47de4046015f9b0a5"}, - {file = "pyarrow-17.0.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:a48ddf5c3c6a6c505904545c25a4ae13646ae1f8ba703c4df4a1bfe4f4006bda"}, - {file = "pyarrow-17.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:42bf93249a083aca230ba7e2786c5f673507fa97bbd9725a1e2754715151a204"}, - {file = "pyarrow-17.0.0.tar.gz", hash = "sha256:4beca9521ed2c0921c1023e68d097d0299b62c362639ea315572a58f3f50fd28"}, -] - -[package.dependencies] -numpy = ">=1.16.6" - -[package.extras] -test = ["cffi", "hypothesis", "pandas", "pytest", "pytz"] - -[[package]] -name = "pycountry" -version = "24.6.1" -description = "ISO country, subdivision, language, currency and script definitions and their translations" -optional = true -python-versions = ">=3.8" -files = [ - {file = "pycountry-24.6.1-py3-none-any.whl", hash = "sha256:f1a4fb391cd7214f8eefd39556d740adcc233c778a27f8942c8dca351d6ce06f"}, - {file = "pycountry-24.6.1.tar.gz", hash = "sha256:b61b3faccea67f87d10c1f2b0fc0be714409e8fcdcc1315613174f6466c10221"}, -] - -[[package]] -name = "pydantic" -version = "2.9.2" -description = "Data validation using Python type hints" -optional = true -python-versions = ">=3.8" -files = [ - {file = "pydantic-2.9.2-py3-none-any.whl", hash = "sha256:f048cec7b26778210e28a0459867920654d48e5e62db0958433636cde4254f12"}, - {file = "pydantic-2.9.2.tar.gz", hash = "sha256:d155cef71265d1e9807ed1c32b4c8deec042a44a50a4188b25ac67ecd81a9c0f"}, -] - -[package.dependencies] -annotated-types = ">=0.6.0" -pydantic-core = "2.23.4" -typing-extensions = {version = ">=4.6.1", markers = "python_version < \"3.13\""} - -[package.extras] -email = ["email-validator (>=2.0.0)"] -timezone = ["tzdata"] - -[[package]] -name = "pydantic-core" -version = "2.23.4" -description = "Core functionality for Pydantic validation and serialization" -optional = true -python-versions = ">=3.8" -files = [ - {file = "pydantic_core-2.23.4-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:b10bd51f823d891193d4717448fab065733958bdb6a6b351967bd349d48d5c9b"}, - {file = "pydantic_core-2.23.4-cp310-cp310-macosx_11_0_arm64.whl", hash = 
"sha256:4fc714bdbfb534f94034efaa6eadd74e5b93c8fa6315565a222f7b6f42ca1166"}, - {file = "pydantic_core-2.23.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:63e46b3169866bd62849936de036f901a9356e36376079b05efa83caeaa02ceb"}, - {file = "pydantic_core-2.23.4-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ed1a53de42fbe34853ba90513cea21673481cd81ed1be739f7f2efb931b24916"}, - {file = "pydantic_core-2.23.4-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:cfdd16ab5e59fc31b5e906d1a3f666571abc367598e3e02c83403acabc092e07"}, - {file = "pydantic_core-2.23.4-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:255a8ef062cbf6674450e668482456abac99a5583bbafb73f9ad469540a3a232"}, - {file = "pydantic_core-2.23.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4a7cd62e831afe623fbb7aabbb4fe583212115b3ef38a9f6b71869ba644624a2"}, - {file = "pydantic_core-2.23.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f09e2ff1f17c2b51f2bc76d1cc33da96298f0a036a137f5440ab3ec5360b624f"}, - {file = "pydantic_core-2.23.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:e38e63e6f3d1cec5a27e0afe90a085af8b6806ee208b33030e65b6516353f1a3"}, - {file = "pydantic_core-2.23.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:0dbd8dbed2085ed23b5c04afa29d8fd2771674223135dc9bc937f3c09284d071"}, - {file = "pydantic_core-2.23.4-cp310-none-win32.whl", hash = "sha256:6531b7ca5f951d663c339002e91aaebda765ec7d61b7d1e3991051906ddde119"}, - {file = "pydantic_core-2.23.4-cp310-none-win_amd64.whl", hash = "sha256:7c9129eb40958b3d4500fa2467e6a83356b3b61bfff1b414c7361d9220f9ae8f"}, - {file = "pydantic_core-2.23.4-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:77733e3892bb0a7fa797826361ce8a9184d25c8dffaec60b7ffe928153680ba8"}, - {file = "pydantic_core-2.23.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1b84d168f6c48fabd1f2027a3d1bdfe62f92cade1fb273a5d68e621da0e44e6d"}, - {file = "pydantic_core-2.23.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:df49e7a0861a8c36d089c1ed57d308623d60416dab2647a4a17fe050ba85de0e"}, - {file = "pydantic_core-2.23.4-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ff02b6d461a6de369f07ec15e465a88895f3223eb75073ffea56b84d9331f607"}, - {file = "pydantic_core-2.23.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:996a38a83508c54c78a5f41456b0103c30508fed9abcad0a59b876d7398f25fd"}, - {file = "pydantic_core-2.23.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d97683ddee4723ae8c95d1eddac7c192e8c552da0c73a925a89fa8649bf13eea"}, - {file = "pydantic_core-2.23.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:216f9b2d7713eb98cb83c80b9c794de1f6b7e3145eef40400c62e86cee5f4e1e"}, - {file = "pydantic_core-2.23.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6f783e0ec4803c787bcea93e13e9932edab72068f68ecffdf86a99fd5918878b"}, - {file = "pydantic_core-2.23.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:d0776dea117cf5272382634bd2a5c1b6eb16767c223c6a5317cd3e2a757c61a0"}, - {file = "pydantic_core-2.23.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:d5f7a395a8cf1621939692dba2a6b6a830efa6b3cee787d82c7de1ad2930de64"}, - {file = "pydantic_core-2.23.4-cp311-none-win32.whl", hash = "sha256:74b9127ffea03643e998e0c5ad9bd3811d3dac8c676e47db17b0ee7c3c3bf35f"}, - {file = 
"pydantic_core-2.23.4-cp311-none-win_amd64.whl", hash = "sha256:98d134c954828488b153d88ba1f34e14259284f256180ce659e8d83e9c05eaa3"}, - {file = "pydantic_core-2.23.4-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:f3e0da4ebaef65158d4dfd7d3678aad692f7666877df0002b8a522cdf088f231"}, - {file = "pydantic_core-2.23.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f69a8e0b033b747bb3e36a44e7732f0c99f7edd5cea723d45bc0d6e95377ffee"}, - {file = "pydantic_core-2.23.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:723314c1d51722ab28bfcd5240d858512ffd3116449c557a1336cbe3919beb87"}, - {file = "pydantic_core-2.23.4-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:bb2802e667b7051a1bebbfe93684841cc9351004e2badbd6411bf357ab8d5ac8"}, - {file = "pydantic_core-2.23.4-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d18ca8148bebe1b0a382a27a8ee60350091a6ddaf475fa05ef50dc35b5df6327"}, - {file = "pydantic_core-2.23.4-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:33e3d65a85a2a4a0dc3b092b938a4062b1a05f3a9abde65ea93b233bca0e03f2"}, - {file = "pydantic_core-2.23.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:128585782e5bfa515c590ccee4b727fb76925dd04a98864182b22e89a4e6ed36"}, - {file = "pydantic_core-2.23.4-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:68665f4c17edcceecc112dfed5dbe6f92261fb9d6054b47d01bf6371a6196126"}, - {file = "pydantic_core-2.23.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:20152074317d9bed6b7a95ade3b7d6054845d70584216160860425f4fbd5ee9e"}, - {file = "pydantic_core-2.23.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:9261d3ce84fa1d38ed649c3638feefeae23d32ba9182963e465d58d62203bd24"}, - {file = "pydantic_core-2.23.4-cp312-none-win32.whl", hash = "sha256:4ba762ed58e8d68657fc1281e9bb72e1c3e79cc5d464be146e260c541ec12d84"}, - {file = "pydantic_core-2.23.4-cp312-none-win_amd64.whl", hash = "sha256:97df63000f4fea395b2824da80e169731088656d1818a11b95f3b173747b6cd9"}, - {file = "pydantic_core-2.23.4-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:7530e201d10d7d14abce4fb54cfe5b94a0aefc87da539d0346a484ead376c3cc"}, - {file = "pydantic_core-2.23.4-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:df933278128ea1cd77772673c73954e53a1c95a4fdf41eef97c2b779271bd0bd"}, - {file = "pydantic_core-2.23.4-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0cb3da3fd1b6a5d0279a01877713dbda118a2a4fc6f0d821a57da2e464793f05"}, - {file = "pydantic_core-2.23.4-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:42c6dcb030aefb668a2b7009c85b27f90e51e6a3b4d5c9bc4c57631292015b0d"}, - {file = "pydantic_core-2.23.4-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:696dd8d674d6ce621ab9d45b205df149399e4bb9aa34102c970b721554828510"}, - {file = "pydantic_core-2.23.4-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2971bb5ffe72cc0f555c13e19b23c85b654dd2a8f7ab493c262071377bfce9f6"}, - {file = "pydantic_core-2.23.4-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8394d940e5d400d04cad4f75c0598665cbb81aecefaca82ca85bd28264af7f9b"}, - {file = "pydantic_core-2.23.4-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:0dff76e0602ca7d4cdaacc1ac4c005e0ce0dcfe095d5b5259163a80d3a10d327"}, - {file = "pydantic_core-2.23.4-cp313-cp313-musllinux_1_1_aarch64.whl", hash = 
"sha256:7d32706badfe136888bdea71c0def994644e09fff0bfe47441deaed8e96fdbc6"}, - {file = "pydantic_core-2.23.4-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:ed541d70698978a20eb63d8c5d72f2cc6d7079d9d90f6b50bad07826f1320f5f"}, - {file = "pydantic_core-2.23.4-cp313-none-win32.whl", hash = "sha256:3d5639516376dce1940ea36edf408c554475369f5da2abd45d44621cb616f769"}, - {file = "pydantic_core-2.23.4-cp313-none-win_amd64.whl", hash = "sha256:5a1504ad17ba4210df3a045132a7baeeba5a200e930f57512ee02909fc5c4cb5"}, - {file = "pydantic_core-2.23.4-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:d4488a93b071c04dc20f5cecc3631fc78b9789dd72483ba15d423b5b3689b555"}, - {file = "pydantic_core-2.23.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:81965a16b675b35e1d09dd14df53f190f9129c0202356ed44ab2728b1c905658"}, - {file = "pydantic_core-2.23.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4ffa2ebd4c8530079140dd2d7f794a9d9a73cbb8e9d59ffe24c63436efa8f271"}, - {file = "pydantic_core-2.23.4-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:61817945f2fe7d166e75fbfb28004034b48e44878177fc54d81688e7b85a3665"}, - {file = "pydantic_core-2.23.4-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:29d2c342c4bc01b88402d60189f3df065fb0dda3654744d5a165a5288a657368"}, - {file = "pydantic_core-2.23.4-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5e11661ce0fd30a6790e8bcdf263b9ec5988e95e63cf901972107efc49218b13"}, - {file = "pydantic_core-2.23.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9d18368b137c6295db49ce7218b1a9ba15c5bc254c96d7c9f9e924a9bc7825ad"}, - {file = "pydantic_core-2.23.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:ec4e55f79b1c4ffb2eecd8a0cfba9955a2588497d96851f4c8f99aa4a1d39b12"}, - {file = "pydantic_core-2.23.4-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:374a5e5049eda9e0a44c696c7ade3ff355f06b1fe0bb945ea3cac2bc336478a2"}, - {file = "pydantic_core-2.23.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:5c364564d17da23db1106787675fc7af45f2f7b58b4173bfdd105564e132e6fb"}, - {file = "pydantic_core-2.23.4-cp38-none-win32.whl", hash = "sha256:d7a80d21d613eec45e3d41eb22f8f94ddc758a6c4720842dc74c0581f54993d6"}, - {file = "pydantic_core-2.23.4-cp38-none-win_amd64.whl", hash = "sha256:5f5ff8d839f4566a474a969508fe1c5e59c31c80d9e140566f9a37bba7b8d556"}, - {file = "pydantic_core-2.23.4-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:a4fa4fc04dff799089689f4fd502ce7d59de529fc2f40a2c8836886c03e0175a"}, - {file = "pydantic_core-2.23.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:0a7df63886be5e270da67e0966cf4afbae86069501d35c8c1b3b6c168f42cb36"}, - {file = "pydantic_core-2.23.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dcedcd19a557e182628afa1d553c3895a9f825b936415d0dbd3cd0bbcfd29b4b"}, - {file = "pydantic_core-2.23.4-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5f54b118ce5de9ac21c363d9b3caa6c800341e8c47a508787e5868c6b79c9323"}, - {file = "pydantic_core-2.23.4-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:86d2f57d3e1379a9525c5ab067b27dbb8a0642fb5d454e17a9ac434f9ce523e3"}, - {file = "pydantic_core-2.23.4-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:de6d1d1b9e5101508cb37ab0d972357cac5235f5c6533d1071964c47139257df"}, - {file = "pydantic_core-2.23.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:1278e0d324f6908e872730c9102b0112477a7f7cf88b308e4fc36ce1bdb6d58c"}, - {file = "pydantic_core-2.23.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:9a6b5099eeec78827553827f4c6b8615978bb4b6a88e5d9b93eddf8bb6790f55"}, - {file = "pydantic_core-2.23.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:e55541f756f9b3ee346b840103f32779c695a19826a4c442b7954550a0972040"}, - {file = "pydantic_core-2.23.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:a5c7ba8ffb6d6f8f2ab08743be203654bb1aaa8c9dcb09f82ddd34eadb695605"}, - {file = "pydantic_core-2.23.4-cp39-none-win32.whl", hash = "sha256:37b0fe330e4a58d3c58b24d91d1eb102aeec675a3db4c292ec3928ecd892a9a6"}, - {file = "pydantic_core-2.23.4-cp39-none-win_amd64.whl", hash = "sha256:1498bec4c05c9c787bde9125cfdcc63a41004ff167f495063191b863399b1a29"}, - {file = "pydantic_core-2.23.4-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:f455ee30a9d61d3e1a15abd5068827773d6e4dc513e795f380cdd59932c782d5"}, - {file = "pydantic_core-2.23.4-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:1e90d2e3bd2c3863d48525d297cd143fe541be8bbf6f579504b9712cb6b643ec"}, - {file = "pydantic_core-2.23.4-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2e203fdf807ac7e12ab59ca2bfcabb38c7cf0b33c41efeb00f8e5da1d86af480"}, - {file = "pydantic_core-2.23.4-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e08277a400de01bc72436a0ccd02bdf596631411f592ad985dcee21445bd0068"}, - {file = "pydantic_core-2.23.4-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f220b0eea5965dec25480b6333c788fb72ce5f9129e8759ef876a1d805d00801"}, - {file = "pydantic_core-2.23.4-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:d06b0c8da4f16d1d1e352134427cb194a0a6e19ad5db9161bf32b2113409e728"}, - {file = "pydantic_core-2.23.4-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:ba1a0996f6c2773bd83e63f18914c1de3c9dd26d55f4ac302a7efe93fb8e7433"}, - {file = "pydantic_core-2.23.4-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:9a5bce9d23aac8f0cf0836ecfc033896aa8443b501c58d0602dbfd5bd5b37753"}, - {file = "pydantic_core-2.23.4-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:78ddaaa81421a29574a682b3179d4cf9e6d405a09b99d93ddcf7e5239c742e21"}, - {file = "pydantic_core-2.23.4-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:883a91b5dd7d26492ff2f04f40fbb652de40fcc0afe07e8129e8ae779c2110eb"}, - {file = "pydantic_core-2.23.4-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:88ad334a15b32a791ea935af224b9de1bf99bcd62fabf745d5f3442199d86d59"}, - {file = "pydantic_core-2.23.4-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:233710f069d251feb12a56da21e14cca67994eab08362207785cf8c598e74577"}, - {file = "pydantic_core-2.23.4-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:19442362866a753485ba5e4be408964644dd6a09123d9416c54cd49171f50744"}, - {file = "pydantic_core-2.23.4-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:624e278a7d29b6445e4e813af92af37820fafb6dcc55c012c834f9e26f9aaaef"}, - {file = "pydantic_core-2.23.4-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:f5ef8f42bec47f21d07668a043f077d507e5bf4e668d5c6dfe6aaba89de1a5b8"}, - {file = "pydantic_core-2.23.4-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:aea443fffa9fbe3af1a9ba721a87f926fe548d32cab71d188a6ede77d0ff244e"}, - {file = "pydantic_core-2.23.4.tar.gz", hash = 
"sha256:2584f7cf844ac4d970fba483a717dbe10c1c1c96a969bf65d61ffe94df1b2863"}, -] - -[package.dependencies] -typing-extensions = ">=4.6.0,<4.7.0 || >4.7.0" - -[[package]] -name = "pygments" -version = "2.18.0" -description = "Pygments is a syntax highlighting package written in Python." -optional = false -python-versions = ">=3.8" -files = [ - {file = "pygments-2.18.0-py3-none-any.whl", hash = "sha256:b8e6aca0523f3ab76fee51799c488e38782ac06eafcf95e7ba832985c8e7b13a"}, - {file = "pygments-2.18.0.tar.gz", hash = "sha256:786ff802f32e91311bff3889f6e9a86e81505fe99f2735bb6d60ae0c5004f199"}, -] - -[package.extras] -windows-terminal = ["colorama (>=0.4.6)"] - -[[package]] -name = "pytest" -version = "7.4.4" -description = "pytest: simple powerful testing with Python" -optional = false -python-versions = ">=3.7" -files = [ - {file = "pytest-7.4.4-py3-none-any.whl", hash = "sha256:b090cdf5ed60bf4c45261be03239c2c1c22df034fbffe691abe93cd80cea01d8"}, - {file = "pytest-7.4.4.tar.gz", hash = "sha256:2cf0005922c6ace4a3e2ec8b4080eb0d9753fdc93107415332f50ce9e7994280"}, -] - -[package.dependencies] -colorama = {version = "*", markers = "sys_platform == \"win32\""} -exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""} -iniconfig = "*" -packaging = "*" -pluggy = ">=0.12,<2.0" -tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} - -[package.extras] -testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] - -[[package]] -name = "python-dateutil" -version = "2.9.0.post0" -description = "Extensions to the standard Python datetime module" -optional = true -python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" -files = [ - {file = "python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3"}, - {file = "python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427"}, -] - -[package.dependencies] -six = ">=1.5" - -[[package]] -name = "pytz" -version = "2024.2" -description = "World timezone definitions, modern and historical" -optional = true -python-versions = "*" -files = [ - {file = "pytz-2024.2-py2.py3-none-any.whl", hash = "sha256:31c7c1817eb7fae7ca4b8c7ee50c72f93aa2dd863de768e1ef4245d426aa0725"}, - {file = "pytz-2024.2.tar.gz", hash = "sha256:2aa355083c50a0f93fa581709deac0c9ad65cca8a9e9beac660adcbd493c798a"}, -] - -[[package]] -name = "pyyaml" -version = "6.0.2" -description = "YAML parser and emitter for Python" -optional = false -python-versions = ">=3.8" -files = [ - {file = "PyYAML-6.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0a9a2848a5b7feac301353437eb7d5957887edbf81d56e903999a75a3d743086"}, - {file = "PyYAML-6.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:29717114e51c84ddfba879543fb232a6ed60086602313ca38cce623c1d62cfbf"}, - {file = "PyYAML-6.0.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8824b5a04a04a047e72eea5cec3bc266db09e35de6bdfe34c9436ac5ee27d237"}, - {file = "PyYAML-6.0.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7c36280e6fb8385e520936c3cb3b8042851904eba0e58d277dca80a5cfed590b"}, - {file = "PyYAML-6.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ec031d5d2feb36d1d1a24380e4db6d43695f3748343d99434e6f5f9156aaa2ed"}, - {file = "PyYAML-6.0.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = 
"sha256:936d68689298c36b53b29f23c6dbb74de12b4ac12ca6cfe0e047bedceea56180"}, - {file = "PyYAML-6.0.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:23502f431948090f597378482b4812b0caae32c22213aecf3b55325e049a6c68"}, - {file = "PyYAML-6.0.2-cp310-cp310-win32.whl", hash = "sha256:2e99c6826ffa974fe6e27cdb5ed0021786b03fc98e5ee3c5bfe1fd5015f42b99"}, - {file = "PyYAML-6.0.2-cp310-cp310-win_amd64.whl", hash = "sha256:a4d3091415f010369ae4ed1fc6b79def9416358877534caf6a0fdd2146c87a3e"}, - {file = "PyYAML-6.0.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:cc1c1159b3d456576af7a3e4d1ba7e6924cb39de8f67111c735f6fc832082774"}, - {file = "PyYAML-6.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1e2120ef853f59c7419231f3bf4e7021f1b936f6ebd222406c3b60212205d2ee"}, - {file = "PyYAML-6.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5d225db5a45f21e78dd9358e58a98702a0302f2659a3c6cd320564b75b86f47c"}, - {file = "PyYAML-6.0.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5ac9328ec4831237bec75defaf839f7d4564be1e6b25ac710bd1a96321cc8317"}, - {file = "PyYAML-6.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ad2a3decf9aaba3d29c8f537ac4b243e36bef957511b4766cb0057d32b0be85"}, - {file = "PyYAML-6.0.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:ff3824dc5261f50c9b0dfb3be22b4567a6f938ccce4587b38952d85fd9e9afe4"}, - {file = "PyYAML-6.0.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:797b4f722ffa07cc8d62053e4cff1486fa6dc094105d13fea7b1de7d8bf71c9e"}, - {file = "PyYAML-6.0.2-cp311-cp311-win32.whl", hash = "sha256:11d8f3dd2b9c1207dcaf2ee0bbbfd5991f571186ec9cc78427ba5bd32afae4b5"}, - {file = "PyYAML-6.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:e10ce637b18caea04431ce14fabcf5c64a1c61ec9c56b071a4b7ca131ca52d44"}, - {file = "PyYAML-6.0.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:c70c95198c015b85feafc136515252a261a84561b7b1d51e3384e0655ddf25ab"}, - {file = "PyYAML-6.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ce826d6ef20b1bc864f0a68340c8b3287705cae2f8b4b1d932177dcc76721725"}, - {file = "PyYAML-6.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1f71ea527786de97d1a0cc0eacd1defc0985dcf6b3f17bb77dcfc8c34bec4dc5"}, - {file = "PyYAML-6.0.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9b22676e8097e9e22e36d6b7bda33190d0d400f345f23d4065d48f4ca7ae0425"}, - {file = "PyYAML-6.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:80bab7bfc629882493af4aa31a4cfa43a4c57c83813253626916b8c7ada83476"}, - {file = "PyYAML-6.0.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:0833f8694549e586547b576dcfaba4a6b55b9e96098b36cdc7ebefe667dfed48"}, - {file = "PyYAML-6.0.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8b9c7197f7cb2738065c481a0461e50ad02f18c78cd75775628afb4d7137fb3b"}, - {file = "PyYAML-6.0.2-cp312-cp312-win32.whl", hash = "sha256:ef6107725bd54b262d6dedcc2af448a266975032bc85ef0172c5f059da6325b4"}, - {file = "PyYAML-6.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:7e7401d0de89a9a855c839bc697c079a4af81cf878373abd7dc625847d25cbd8"}, - {file = "PyYAML-6.0.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:efdca5630322a10774e8e98e1af481aad470dd62c3170801852d752aa7a783ba"}, - {file = "PyYAML-6.0.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:50187695423ffe49e2deacb8cd10510bc361faac997de9efef88badc3bb9e2d1"}, - {file = 
"PyYAML-6.0.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0ffe8360bab4910ef1b9e87fb812d8bc0a308b0d0eef8c8f44e0254ab3b07133"}, - {file = "PyYAML-6.0.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:17e311b6c678207928d649faa7cb0d7b4c26a0ba73d41e99c4fff6b6c3276484"}, - {file = "PyYAML-6.0.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:70b189594dbe54f75ab3a1acec5f1e3faa7e8cf2f1e08d9b561cb41b845f69d5"}, - {file = "PyYAML-6.0.2-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:41e4e3953a79407c794916fa277a82531dd93aad34e29c2a514c2c0c5fe971cc"}, - {file = "PyYAML-6.0.2-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:68ccc6023a3400877818152ad9a1033e3db8625d899c72eacb5a668902e4d652"}, - {file = "PyYAML-6.0.2-cp313-cp313-win32.whl", hash = "sha256:bc2fa7c6b47d6bc618dd7fb02ef6fdedb1090ec036abab80d4681424b84c1183"}, - {file = "PyYAML-6.0.2-cp313-cp313-win_amd64.whl", hash = "sha256:8388ee1976c416731879ac16da0aff3f63b286ffdd57cdeb95f3f2e085687563"}, - {file = "PyYAML-6.0.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:24471b829b3bf607e04e88d79542a9d48bb037c2267d7927a874e6c205ca7e9a"}, - {file = "PyYAML-6.0.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d7fded462629cfa4b685c5416b949ebad6cec74af5e2d42905d41e257e0869f5"}, - {file = "PyYAML-6.0.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d84a1718ee396f54f3a086ea0a66d8e552b2ab2017ef8b420e92edbc841c352d"}, - {file = "PyYAML-6.0.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9056c1ecd25795207ad294bcf39f2db3d845767be0ea6e6a34d856f006006083"}, - {file = "PyYAML-6.0.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:82d09873e40955485746739bcb8b4586983670466c23382c19cffecbf1fd8706"}, - {file = "PyYAML-6.0.2-cp38-cp38-win32.whl", hash = "sha256:43fa96a3ca0d6b1812e01ced1044a003533c47f6ee8aca31724f78e93ccc089a"}, - {file = "PyYAML-6.0.2-cp38-cp38-win_amd64.whl", hash = "sha256:01179a4a8559ab5de078078f37e5c1a30d76bb88519906844fd7bdea1b7729ff"}, - {file = "PyYAML-6.0.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:688ba32a1cffef67fd2e9398a2efebaea461578b0923624778664cc1c914db5d"}, - {file = "PyYAML-6.0.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a8786accb172bd8afb8be14490a16625cbc387036876ab6ba70912730faf8e1f"}, - {file = "PyYAML-6.0.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d8e03406cac8513435335dbab54c0d385e4a49e4945d2909a581c83647ca0290"}, - {file = "PyYAML-6.0.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f753120cb8181e736c57ef7636e83f31b9c0d1722c516f7e86cf15b7aa57ff12"}, - {file = "PyYAML-6.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3b1fdb9dc17f5a7677423d508ab4f243a726dea51fa5e70992e59a7411c89d19"}, - {file = "PyYAML-6.0.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:0b69e4ce7a131fe56b7e4d770c67429700908fc0752af059838b1cfb41960e4e"}, - {file = "PyYAML-6.0.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:a9f8c2e67970f13b16084e04f134610fd1d374bf477b17ec1599185cf611d725"}, - {file = "PyYAML-6.0.2-cp39-cp39-win32.whl", hash = "sha256:6395c297d42274772abc367baaa79683958044e5d3835486c16da75d2a694631"}, - {file = "PyYAML-6.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:39693e1f8320ae4f43943590b49779ffb98acb81f788220ea932a6b6c51004d8"}, - {file = "pyyaml-6.0.2.tar.gz", hash = "sha256:d584d9ec91ad65861cc08d42e834324ef890a082e591037abe114850ff7bbc3e"}, -] - 
-[[package]] -name = "referencing" -version = "0.35.1" -description = "JSON Referencing + Python" -optional = true -python-versions = ">=3.8" -files = [ - {file = "referencing-0.35.1-py3-none-any.whl", hash = "sha256:eda6d3234d62814d1c64e305c1331c9a3a6132da475ab6382eaa997b21ee75de"}, - {file = "referencing-0.35.1.tar.gz", hash = "sha256:25b42124a6c8b632a425174f24087783efb348a6f1e0008e63cd4466fedf703c"}, -] - -[package.dependencies] -attrs = ">=22.2.0" -rpds-py = ">=0.7.0" - -[[package]] -name = "regex" -version = "2024.9.11" -description = "Alternative regular expression module, to replace re." -optional = false -python-versions = ">=3.8" -files = [ - {file = "regex-2024.9.11-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:1494fa8725c285a81d01dc8c06b55287a1ee5e0e382d8413adc0a9197aac6408"}, - {file = "regex-2024.9.11-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0e12c481ad92d129c78f13a2a3662317e46ee7ef96c94fd332e1c29131875b7d"}, - {file = "regex-2024.9.11-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:16e13a7929791ac1216afde26f712802e3df7bf0360b32e4914dca3ab8baeea5"}, - {file = "regex-2024.9.11-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:46989629904bad940bbec2106528140a218b4a36bb3042d8406980be1941429c"}, - {file = "regex-2024.9.11-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a906ed5e47a0ce5f04b2c981af1c9acf9e8696066900bf03b9d7879a6f679fc8"}, - {file = "regex-2024.9.11-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e9a091b0550b3b0207784a7d6d0f1a00d1d1c8a11699c1a4d93db3fbefc3ad35"}, - {file = "regex-2024.9.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5ddcd9a179c0a6fa8add279a4444015acddcd7f232a49071ae57fa6e278f1f71"}, - {file = "regex-2024.9.11-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6b41e1adc61fa347662b09398e31ad446afadff932a24807d3ceb955ed865cc8"}, - {file = "regex-2024.9.11-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:ced479f601cd2f8ca1fd7b23925a7e0ad512a56d6e9476f79b8f381d9d37090a"}, - {file = "regex-2024.9.11-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:635a1d96665f84b292e401c3d62775851aedc31d4f8784117b3c68c4fcd4118d"}, - {file = "regex-2024.9.11-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:c0256beda696edcf7d97ef16b2a33a8e5a875affd6fa6567b54f7c577b30a137"}, - {file = "regex-2024.9.11-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:3ce4f1185db3fbde8ed8aa223fc9620f276c58de8b0d4f8cc86fd1360829edb6"}, - {file = "regex-2024.9.11-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:09d77559e80dcc9d24570da3745ab859a9cf91953062e4ab126ba9d5993688ca"}, - {file = "regex-2024.9.11-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:7a22ccefd4db3f12b526eccb129390942fe874a3a9fdbdd24cf55773a1faab1a"}, - {file = "regex-2024.9.11-cp310-cp310-win32.whl", hash = "sha256:f745ec09bc1b0bd15cfc73df6fa4f726dcc26bb16c23a03f9e3367d357eeedd0"}, - {file = "regex-2024.9.11-cp310-cp310-win_amd64.whl", hash = "sha256:01c2acb51f8a7d6494c8c5eafe3d8e06d76563d8a8a4643b37e9b2dd8a2ff623"}, - {file = "regex-2024.9.11-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:2cce2449e5927a0bf084d346da6cd5eb016b2beca10d0013ab50e3c226ffc0df"}, - {file = "regex-2024.9.11-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:3b37fa423beefa44919e009745ccbf353d8c981516e807995b2bd11c2c77d268"}, - {file = 
"regex-2024.9.11-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:64ce2799bd75039b480cc0360907c4fb2f50022f030bf9e7a8705b636e408fad"}, - {file = "regex-2024.9.11-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a4cc92bb6db56ab0c1cbd17294e14f5e9224f0cc6521167ef388332604e92679"}, - {file = "regex-2024.9.11-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d05ac6fa06959c4172eccd99a222e1fbf17b5670c4d596cb1e5cde99600674c4"}, - {file = "regex-2024.9.11-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:040562757795eeea356394a7fb13076ad4f99d3c62ab0f8bdfb21f99a1f85664"}, - {file = "regex-2024.9.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6113c008a7780792efc80f9dfe10ba0cd043cbf8dc9a76ef757850f51b4edc50"}, - {file = "regex-2024.9.11-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8e5fb5f77c8745a60105403a774fe2c1759b71d3e7b4ca237a5e67ad066c7199"}, - {file = "regex-2024.9.11-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:54d9ff35d4515debf14bc27f1e3b38bfc453eff3220f5bce159642fa762fe5d4"}, - {file = "regex-2024.9.11-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:df5cbb1fbc74a8305b6065d4ade43b993be03dbe0f8b30032cced0d7740994bd"}, - {file = "regex-2024.9.11-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:7fb89ee5d106e4a7a51bce305ac4efb981536301895f7bdcf93ec92ae0d91c7f"}, - {file = "regex-2024.9.11-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:a738b937d512b30bf75995c0159c0ddf9eec0775c9d72ac0202076c72f24aa96"}, - {file = "regex-2024.9.11-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:e28f9faeb14b6f23ac55bfbbfd3643f5c7c18ede093977f1df249f73fd22c7b1"}, - {file = "regex-2024.9.11-cp311-cp311-win32.whl", hash = "sha256:18e707ce6c92d7282dfce370cd205098384b8ee21544e7cb29b8aab955b66fa9"}, - {file = "regex-2024.9.11-cp311-cp311-win_amd64.whl", hash = "sha256:313ea15e5ff2a8cbbad96ccef6be638393041b0a7863183c2d31e0c6116688cf"}, - {file = "regex-2024.9.11-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:b0d0a6c64fcc4ef9c69bd5b3b3626cc3776520a1637d8abaa62b9edc147a58f7"}, - {file = "regex-2024.9.11-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:49b0e06786ea663f933f3710a51e9385ce0cba0ea56b67107fd841a55d56a231"}, - {file = "regex-2024.9.11-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:5b513b6997a0b2f10e4fd3a1313568e373926e8c252bd76c960f96fd039cd28d"}, - {file = "regex-2024.9.11-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ee439691d8c23e76f9802c42a95cfeebf9d47cf4ffd06f18489122dbb0a7ad64"}, - {file = "regex-2024.9.11-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a8f877c89719d759e52783f7fe6e1c67121076b87b40542966c02de5503ace42"}, - {file = "regex-2024.9.11-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:23b30c62d0f16827f2ae9f2bb87619bc4fba2044911e2e6c2eb1af0161cdb766"}, - {file = "regex-2024.9.11-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:85ab7824093d8f10d44330fe1e6493f756f252d145323dd17ab6b48733ff6c0a"}, - {file = "regex-2024.9.11-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8dee5b4810a89447151999428fe096977346cf2f29f4d5e29609d2e19e0199c9"}, - {file = "regex-2024.9.11-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:98eeee2f2e63edae2181c886d7911ce502e1292794f4c5ee71e60e23e8d26b5d"}, - {file = 
"regex-2024.9.11-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:57fdd2e0b2694ce6fc2e5ccf189789c3e2962916fb38779d3e3521ff8fe7a822"}, - {file = "regex-2024.9.11-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:d552c78411f60b1fdaafd117a1fca2f02e562e309223b9d44b7de8be451ec5e0"}, - {file = "regex-2024.9.11-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:a0b2b80321c2ed3fcf0385ec9e51a12253c50f146fddb2abbb10f033fe3d049a"}, - {file = "regex-2024.9.11-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:18406efb2f5a0e57e3a5881cd9354c1512d3bb4f5c45d96d110a66114d84d23a"}, - {file = "regex-2024.9.11-cp312-cp312-win32.whl", hash = "sha256:e464b467f1588e2c42d26814231edecbcfe77f5ac414d92cbf4e7b55b2c2a776"}, - {file = "regex-2024.9.11-cp312-cp312-win_amd64.whl", hash = "sha256:9e8719792ca63c6b8340380352c24dcb8cd7ec49dae36e963742a275dfae6009"}, - {file = "regex-2024.9.11-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:c157bb447303070f256e084668b702073db99bbb61d44f85d811025fcf38f784"}, - {file = "regex-2024.9.11-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:4db21ece84dfeefc5d8a3863f101995de646c6cb0536952c321a2650aa202c36"}, - {file = "regex-2024.9.11-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:220e92a30b426daf23bb67a7962900ed4613589bab80382be09b48896d211e92"}, - {file = "regex-2024.9.11-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:eb1ae19e64c14c7ec1995f40bd932448713d3c73509e82d8cd7744dc00e29e86"}, - {file = "regex-2024.9.11-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f47cd43a5bfa48f86925fe26fbdd0a488ff15b62468abb5d2a1e092a4fb10e85"}, - {file = "regex-2024.9.11-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9d4a76b96f398697fe01117093613166e6aa8195d63f1b4ec3f21ab637632963"}, - {file = "regex-2024.9.11-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0ea51dcc0835eea2ea31d66456210a4e01a076d820e9039b04ae8d17ac11dee6"}, - {file = "regex-2024.9.11-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b7aaa315101c6567a9a45d2839322c51c8d6e81f67683d529512f5bcfb99c802"}, - {file = "regex-2024.9.11-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:c57d08ad67aba97af57a7263c2d9006d5c404d721c5f7542f077f109ec2a4a29"}, - {file = "regex-2024.9.11-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:f8404bf61298bb6f8224bb9176c1424548ee1181130818fcd2cbffddc768bed8"}, - {file = "regex-2024.9.11-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:dd4490a33eb909ef5078ab20f5f000087afa2a4daa27b4c072ccb3cb3050ad84"}, - {file = "regex-2024.9.11-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:eee9130eaad130649fd73e5cd92f60e55708952260ede70da64de420cdcad554"}, - {file = "regex-2024.9.11-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:6a2644a93da36c784e546de579ec1806bfd2763ef47babc1b03d765fe560c9f8"}, - {file = "regex-2024.9.11-cp313-cp313-win32.whl", hash = "sha256:e997fd30430c57138adc06bba4c7c2968fb13d101e57dd5bb9355bf8ce3fa7e8"}, - {file = "regex-2024.9.11-cp313-cp313-win_amd64.whl", hash = "sha256:042c55879cfeb21a8adacc84ea347721d3d83a159da6acdf1116859e2427c43f"}, - {file = "regex-2024.9.11-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:35f4a6f96aa6cb3f2f7247027b07b15a374f0d5b912c0001418d1d55024d5cb4"}, - {file = "regex-2024.9.11-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:55b96e7ce3a69a8449a66984c268062fbaa0d8ae437b285428e12797baefce7e"}, - {file = 
"regex-2024.9.11-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:cb130fccd1a37ed894824b8c046321540263013da72745d755f2d35114b81a60"}, - {file = "regex-2024.9.11-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:323c1f04be6b2968944d730e5c2091c8c89767903ecaa135203eec4565ed2b2b"}, - {file = "regex-2024.9.11-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:be1c8ed48c4c4065ecb19d882a0ce1afe0745dfad8ce48c49586b90a55f02366"}, - {file = "regex-2024.9.11-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b5b029322e6e7b94fff16cd120ab35a253236a5f99a79fb04fda7ae71ca20ae8"}, - {file = "regex-2024.9.11-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f6fff13ef6b5f29221d6904aa816c34701462956aa72a77f1f151a8ec4f56aeb"}, - {file = "regex-2024.9.11-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:587d4af3979376652010e400accc30404e6c16b7df574048ab1f581af82065e4"}, - {file = "regex-2024.9.11-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:079400a8269544b955ffa9e31f186f01d96829110a3bf79dc338e9910f794fca"}, - {file = "regex-2024.9.11-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:f9268774428ec173654985ce55fc6caf4c6d11ade0f6f914d48ef4719eb05ebb"}, - {file = "regex-2024.9.11-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:23f9985c8784e544d53fc2930fc1ac1a7319f5d5332d228437acc9f418f2f168"}, - {file = "regex-2024.9.11-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:ae2941333154baff9838e88aa71c1d84f4438189ecc6021a12c7573728b5838e"}, - {file = "regex-2024.9.11-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:e93f1c331ca8e86fe877a48ad64e77882c0c4da0097f2212873a69bbfea95d0c"}, - {file = "regex-2024.9.11-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:846bc79ee753acf93aef4184c040d709940c9d001029ceb7b7a52747b80ed2dd"}, - {file = "regex-2024.9.11-cp38-cp38-win32.whl", hash = "sha256:c94bb0a9f1db10a1d16c00880bdebd5f9faf267273b8f5bd1878126e0fbde771"}, - {file = "regex-2024.9.11-cp38-cp38-win_amd64.whl", hash = "sha256:2b08fce89fbd45664d3df6ad93e554b6c16933ffa9d55cb7e01182baaf971508"}, - {file = "regex-2024.9.11-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:07f45f287469039ffc2c53caf6803cd506eb5f5f637f1d4acb37a738f71dd066"}, - {file = "regex-2024.9.11-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:4838e24ee015101d9f901988001038f7f0d90dc0c3b115541a1365fb439add62"}, - {file = "regex-2024.9.11-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:6edd623bae6a737f10ce853ea076f56f507fd7726bee96a41ee3d68d347e4d16"}, - {file = "regex-2024.9.11-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c69ada171c2d0e97a4b5aa78fbb835e0ffbb6b13fc5da968c09811346564f0d3"}, - {file = "regex-2024.9.11-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:02087ea0a03b4af1ed6ebab2c54d7118127fee8d71b26398e8e4b05b78963199"}, - {file = "regex-2024.9.11-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:69dee6a020693d12a3cf892aba4808fe168d2a4cef368eb9bf74f5398bfd4ee8"}, - {file = "regex-2024.9.11-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:297f54910247508e6e5cae669f2bc308985c60540a4edd1c77203ef19bfa63ca"}, - {file = "regex-2024.9.11-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ecea58b43a67b1b79805f1a0255730edaf5191ecef84dbc4cc85eb30bc8b63b9"}, - {file = 
"regex-2024.9.11-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:eab4bb380f15e189d1313195b062a6aa908f5bd687a0ceccd47c8211e9cf0d4a"}, - {file = "regex-2024.9.11-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:0cbff728659ce4bbf4c30b2a1be040faafaa9eca6ecde40aaff86f7889f4ab39"}, - {file = "regex-2024.9.11-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:54c4a097b8bc5bb0dfc83ae498061d53ad7b5762e00f4adaa23bee22b012e6ba"}, - {file = "regex-2024.9.11-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:73d6d2f64f4d894c96626a75578b0bf7d9e56dcda8c3d037a2118fdfe9b1c664"}, - {file = "regex-2024.9.11-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:e53b5fbab5d675aec9f0c501274c467c0f9a5d23696cfc94247e1fb56501ed89"}, - {file = "regex-2024.9.11-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:0ffbcf9221e04502fc35e54d1ce9567541979c3fdfb93d2c554f0ca583a19b35"}, - {file = "regex-2024.9.11-cp39-cp39-win32.whl", hash = "sha256:e4c22e1ac1f1ec1e09f72e6c44d8f2244173db7eb9629cc3a346a8d7ccc31142"}, - {file = "regex-2024.9.11-cp39-cp39-win_amd64.whl", hash = "sha256:faa3c142464efec496967359ca99696c896c591c56c53506bac1ad465f66e919"}, - {file = "regex-2024.9.11.tar.gz", hash = "sha256:6c188c307e8433bcb63dc1915022deb553b4203a70722fc542c363bf120a01fd"}, -] - -[[package]] -name = "requests" -version = "2.32.3" -description = "Python HTTP for Humans." -optional = false -python-versions = ">=3.8" -files = [ - {file = "requests-2.32.3-py3-none-any.whl", hash = "sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6"}, - {file = "requests-2.32.3.tar.gz", hash = "sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760"}, -] - -[package.dependencies] -certifi = ">=2017.4.17" -charset-normalizer = ">=2,<4" -idna = ">=2.5,<4" -urllib3 = ">=1.21.1,<3" - -[package.extras] -socks = ["PySocks (>=1.5.6,!=1.5.7)"] -use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] - -[[package]] -name = "rich" -version = "13.9.4" -description = "Render rich text, tables, progress bars, syntax highlighting, markdown and more to the terminal" -optional = false -python-versions = ">=3.8.0" -files = [ - {file = "rich-13.9.4-py3-none-any.whl", hash = "sha256:6049d5e6ec054bf2779ab3358186963bac2ea89175919d699e378b99738c2a90"}, - {file = "rich-13.9.4.tar.gz", hash = "sha256:439594978a49a09530cff7ebc4b5c7103ef57baf48d5ea3184f21d9a2befa098"}, -] - -[package.dependencies] -markdown-it-py = ">=2.2.0" -pygments = ">=2.13.0,<3.0.0" -typing-extensions = {version = ">=4.0.0,<5.0", markers = "python_version < \"3.11\""} - -[package.extras] -jupyter = ["ipywidgets (>=7.5.1,<9)"] - -[[package]] -name = "rpds-py" -version = "0.20.0" -description = "Python bindings to Rust's persistent data structures (rpds)" -optional = true -python-versions = ">=3.8" -files = [ - {file = "rpds_py-0.20.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:3ad0fda1635f8439cde85c700f964b23ed5fc2d28016b32b9ee5fe30da5c84e2"}, - {file = "rpds_py-0.20.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9bb4a0d90fdb03437c109a17eade42dfbf6190408f29b2744114d11586611d6f"}, - {file = "rpds_py-0.20.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c6377e647bbfd0a0b159fe557f2c6c602c159fc752fa316572f012fc0bf67150"}, - {file = "rpds_py-0.20.0-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:eb851b7df9dda52dc1415ebee12362047ce771fc36914586b2e9fcbd7d293b3e"}, - {file = 
"rpds_py-0.20.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1e0f80b739e5a8f54837be5d5c924483996b603d5502bfff79bf33da06164ee2"}, - {file = "rpds_py-0.20.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5a8c94dad2e45324fc74dce25e1645d4d14df9a4e54a30fa0ae8bad9a63928e3"}, - {file = "rpds_py-0.20.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f8e604fe73ba048c06085beaf51147eaec7df856824bfe7b98657cf436623daf"}, - {file = "rpds_py-0.20.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:df3de6b7726b52966edf29663e57306b23ef775faf0ac01a3e9f4012a24a4140"}, - {file = "rpds_py-0.20.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:cf258ede5bc22a45c8e726b29835b9303c285ab46fc7c3a4cc770736b5304c9f"}, - {file = "rpds_py-0.20.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:55fea87029cded5df854ca7e192ec7bdb7ecd1d9a3f63d5c4eb09148acf4a7ce"}, - {file = "rpds_py-0.20.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:ae94bd0b2f02c28e199e9bc51485d0c5601f58780636185660f86bf80c89af94"}, - {file = "rpds_py-0.20.0-cp310-none-win32.whl", hash = "sha256:28527c685f237c05445efec62426d285e47a58fb05ba0090a4340b73ecda6dee"}, - {file = "rpds_py-0.20.0-cp310-none-win_amd64.whl", hash = "sha256:238a2d5b1cad28cdc6ed15faf93a998336eb041c4e440dd7f902528b8891b399"}, - {file = "rpds_py-0.20.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:ac2f4f7a98934c2ed6505aead07b979e6f999389f16b714448fb39bbaa86a489"}, - {file = "rpds_py-0.20.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:220002c1b846db9afd83371d08d239fdc865e8f8c5795bbaec20916a76db3318"}, - {file = "rpds_py-0.20.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8d7919548df3f25374a1f5d01fbcd38dacab338ef5f33e044744b5c36729c8db"}, - {file = "rpds_py-0.20.0-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:758406267907b3781beee0f0edfe4a179fbd97c0be2e9b1154d7f0a1279cf8e5"}, - {file = "rpds_py-0.20.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3d61339e9f84a3f0767b1995adfb171a0d00a1185192718a17af6e124728e0f5"}, - {file = "rpds_py-0.20.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1259c7b3705ac0a0bd38197565a5d603218591d3f6cee6e614e380b6ba61c6f6"}, - {file = "rpds_py-0.20.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5c1dc0f53856b9cc9a0ccca0a7cc61d3d20a7088201c0937f3f4048c1718a209"}, - {file = "rpds_py-0.20.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:7e60cb630f674a31f0368ed32b2a6b4331b8350d67de53c0359992444b116dd3"}, - {file = "rpds_py-0.20.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:dbe982f38565bb50cb7fb061ebf762c2f254ca3d8c20d4006878766e84266272"}, - {file = "rpds_py-0.20.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:514b3293b64187172bc77c8fb0cdae26981618021053b30d8371c3a902d4d5ad"}, - {file = "rpds_py-0.20.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:d0a26ffe9d4dd35e4dfdd1e71f46401cff0181c75ac174711ccff0459135fa58"}, - {file = "rpds_py-0.20.0-cp311-none-win32.whl", hash = "sha256:89c19a494bf3ad08c1da49445cc5d13d8fefc265f48ee7e7556839acdacf69d0"}, - {file = "rpds_py-0.20.0-cp311-none-win_amd64.whl", hash = "sha256:c638144ce971df84650d3ed0096e2ae7af8e62ecbbb7b201c8935c370df00a2c"}, - {file = "rpds_py-0.20.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:a84ab91cbe7aab97f7446652d0ed37d35b68a465aeef8fc41932a9d7eee2c1a6"}, - {file = 
"rpds_py-0.20.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:56e27147a5a4c2c21633ff8475d185734c0e4befd1c989b5b95a5d0db699b21b"}, - {file = "rpds_py-0.20.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2580b0c34583b85efec8c5c5ec9edf2dfe817330cc882ee972ae650e7b5ef739"}, - {file = "rpds_py-0.20.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:b80d4a7900cf6b66bb9cee5c352b2d708e29e5a37fe9bf784fa97fc11504bf6c"}, - {file = "rpds_py-0.20.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:50eccbf054e62a7b2209b28dc7a22d6254860209d6753e6b78cfaeb0075d7bee"}, - {file = "rpds_py-0.20.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:49a8063ea4296b3a7e81a5dfb8f7b2d73f0b1c20c2af401fb0cdf22e14711a96"}, - {file = "rpds_py-0.20.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ea438162a9fcbee3ecf36c23e6c68237479f89f962f82dae83dc15feeceb37e4"}, - {file = "rpds_py-0.20.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:18d7585c463087bddcfa74c2ba267339f14f2515158ac4db30b1f9cbdb62c8ef"}, - {file = "rpds_py-0.20.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:d4c7d1a051eeb39f5c9547e82ea27cbcc28338482242e3e0b7768033cb083821"}, - {file = "rpds_py-0.20.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:e4df1e3b3bec320790f699890d41c59d250f6beda159ea3c44c3f5bac1976940"}, - {file = "rpds_py-0.20.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:2cf126d33a91ee6eedc7f3197b53e87a2acdac63602c0f03a02dd69e4b138174"}, - {file = "rpds_py-0.20.0-cp312-none-win32.whl", hash = "sha256:8bc7690f7caee50b04a79bf017a8d020c1f48c2a1077ffe172abec59870f1139"}, - {file = "rpds_py-0.20.0-cp312-none-win_amd64.whl", hash = "sha256:0e13e6952ef264c40587d510ad676a988df19adea20444c2b295e536457bc585"}, - {file = "rpds_py-0.20.0-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:aa9a0521aeca7d4941499a73ad7d4f8ffa3d1affc50b9ea11d992cd7eff18a29"}, - {file = "rpds_py-0.20.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:4a1f1d51eccb7e6c32ae89243cb352389228ea62f89cd80823ea7dd1b98e0b91"}, - {file = "rpds_py-0.20.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8a86a9b96070674fc88b6f9f71a97d2c1d3e5165574615d1f9168ecba4cecb24"}, - {file = "rpds_py-0.20.0-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:6c8ef2ebf76df43f5750b46851ed1cdf8f109d7787ca40035fe19fbdc1acc5a7"}, - {file = "rpds_py-0.20.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b74b25f024b421d5859d156750ea9a65651793d51b76a2e9238c05c9d5f203a9"}, - {file = "rpds_py-0.20.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:57eb94a8c16ab08fef6404301c38318e2c5a32216bf5de453e2714c964c125c8"}, - {file = "rpds_py-0.20.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e1940dae14e715e2e02dfd5b0f64a52e8374a517a1e531ad9412319dc3ac7879"}, - {file = "rpds_py-0.20.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d20277fd62e1b992a50c43f13fbe13277a31f8c9f70d59759c88f644d66c619f"}, - {file = "rpds_py-0.20.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:06db23d43f26478303e954c34c75182356ca9aa7797d22c5345b16871ab9c45c"}, - {file = "rpds_py-0.20.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:b2a5db5397d82fa847e4c624b0c98fe59d2d9b7cf0ce6de09e4d2e80f8f5b3f2"}, - {file = "rpds_py-0.20.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = 
"sha256:5a35df9f5548fd79cb2f52d27182108c3e6641a4feb0f39067911bf2adaa3e57"}, - {file = "rpds_py-0.20.0-cp313-none-win32.whl", hash = "sha256:fd2d84f40633bc475ef2d5490b9c19543fbf18596dcb1b291e3a12ea5d722f7a"}, - {file = "rpds_py-0.20.0-cp313-none-win_amd64.whl", hash = "sha256:9bc2d153989e3216b0559251b0c260cfd168ec78b1fac33dd485750a228db5a2"}, - {file = "rpds_py-0.20.0-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:f2fbf7db2012d4876fb0d66b5b9ba6591197b0f165db8d99371d976546472a24"}, - {file = "rpds_py-0.20.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:1e5f3cd7397c8f86c8cc72d5a791071431c108edd79872cdd96e00abd8497d29"}, - {file = "rpds_py-0.20.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ce9845054c13696f7af7f2b353e6b4f676dab1b4b215d7fe5e05c6f8bb06f965"}, - {file = "rpds_py-0.20.0-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c3e130fd0ec56cb76eb49ef52faead8ff09d13f4527e9b0c400307ff72b408e1"}, - {file = "rpds_py-0.20.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4b16aa0107ecb512b568244ef461f27697164d9a68d8b35090e9b0c1c8b27752"}, - {file = "rpds_py-0.20.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:aa7f429242aae2947246587d2964fad750b79e8c233a2367f71b554e9447949c"}, - {file = "rpds_py-0.20.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:af0fc424a5842a11e28956e69395fbbeab2c97c42253169d87e90aac2886d751"}, - {file = "rpds_py-0.20.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b8c00a3b1e70c1d3891f0db1b05292747f0dbcfb49c43f9244d04c70fbc40eb8"}, - {file = "rpds_py-0.20.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:40ce74fc86ee4645d0a225498d091d8bc61f39b709ebef8204cb8b5a464d3c0e"}, - {file = "rpds_py-0.20.0-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:4fe84294c7019456e56d93e8ababdad5a329cd25975be749c3f5f558abb48253"}, - {file = "rpds_py-0.20.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:338ca4539aad4ce70a656e5187a3a31c5204f261aef9f6ab50e50bcdffaf050a"}, - {file = "rpds_py-0.20.0-cp38-none-win32.whl", hash = "sha256:54b43a2b07db18314669092bb2de584524d1ef414588780261e31e85846c26a5"}, - {file = "rpds_py-0.20.0-cp38-none-win_amd64.whl", hash = "sha256:a1862d2d7ce1674cffa6d186d53ca95c6e17ed2b06b3f4c476173565c862d232"}, - {file = "rpds_py-0.20.0-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:3fde368e9140312b6e8b6c09fb9f8c8c2f00999d1823403ae90cc00480221b22"}, - {file = "rpds_py-0.20.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9824fb430c9cf9af743cf7aaf6707bf14323fb51ee74425c380f4c846ea70789"}, - {file = "rpds_py-0.20.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:11ef6ce74616342888b69878d45e9f779b95d4bd48b382a229fe624a409b72c5"}, - {file = "rpds_py-0.20.0-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c52d3f2f82b763a24ef52f5d24358553e8403ce05f893b5347098014f2d9eff2"}, - {file = "rpds_py-0.20.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9d35cef91e59ebbeaa45214861874bc6f19eb35de96db73e467a8358d701a96c"}, - {file = "rpds_py-0.20.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d72278a30111e5b5525c1dd96120d9e958464316f55adb030433ea905866f4de"}, - {file = "rpds_py-0.20.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b4c29cbbba378759ac5786730d1c3cb4ec6f8ababf5c42a9ce303dc4b3d08cda"}, - {file = "rpds_py-0.20.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = 
"sha256:6632f2d04f15d1bd6fe0eedd3b86d9061b836ddca4c03d5cf5c7e9e6b7c14580"}, - {file = "rpds_py-0.20.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:d0b67d87bb45ed1cd020e8fbf2307d449b68abc45402fe1a4ac9e46c3c8b192b"}, - {file = "rpds_py-0.20.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:ec31a99ca63bf3cd7f1a5ac9fe95c5e2d060d3c768a09bc1d16e235840861420"}, - {file = "rpds_py-0.20.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:22e6c9976e38f4d8c4a63bd8a8edac5307dffd3ee7e6026d97f3cc3a2dc02a0b"}, - {file = "rpds_py-0.20.0-cp39-none-win32.whl", hash = "sha256:569b3ea770c2717b730b61998b6c54996adee3cef69fc28d444f3e7920313cf7"}, - {file = "rpds_py-0.20.0-cp39-none-win_amd64.whl", hash = "sha256:e6900ecdd50ce0facf703f7a00df12374b74bbc8ad9fe0f6559947fb20f82364"}, - {file = "rpds_py-0.20.0-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:617c7357272c67696fd052811e352ac54ed1d9b49ab370261a80d3b6ce385045"}, - {file = "rpds_py-0.20.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:9426133526f69fcaba6e42146b4e12d6bc6c839b8b555097020e2b78ce908dcc"}, - {file = "rpds_py-0.20.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:deb62214c42a261cb3eb04d474f7155279c1a8a8c30ac89b7dcb1721d92c3c02"}, - {file = "rpds_py-0.20.0-pp310-pypy310_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:fcaeb7b57f1a1e071ebd748984359fef83ecb026325b9d4ca847c95bc7311c92"}, - {file = "rpds_py-0.20.0-pp310-pypy310_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d454b8749b4bd70dd0a79f428731ee263fa6995f83ccb8bada706e8d1d3ff89d"}, - {file = "rpds_py-0.20.0-pp310-pypy310_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d807dc2051abe041b6649681dce568f8e10668e3c1c6543ebae58f2d7e617855"}, - {file = "rpds_py-0.20.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c3c20f0ddeb6e29126d45f89206b8291352b8c5b44384e78a6499d68b52ae511"}, - {file = "rpds_py-0.20.0-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b7f19250ceef892adf27f0399b9e5afad019288e9be756d6919cb58892129f51"}, - {file = "rpds_py-0.20.0-pp310-pypy310_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:4f1ed4749a08379555cebf4650453f14452eaa9c43d0a95c49db50c18b7da075"}, - {file = "rpds_py-0.20.0-pp310-pypy310_pp73-musllinux_1_2_i686.whl", hash = "sha256:dcedf0b42bcb4cfff4101d7771a10532415a6106062f005ab97d1d0ab5681c60"}, - {file = "rpds_py-0.20.0-pp310-pypy310_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:39ed0d010457a78f54090fafb5d108501b5aa5604cc22408fc1c0c77eac14344"}, - {file = "rpds_py-0.20.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:bb273176be34a746bdac0b0d7e4e2c467323d13640b736c4c477881a3220a989"}, - {file = "rpds_py-0.20.0-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:f918a1a130a6dfe1d7fe0f105064141342e7dd1611f2e6a21cd2f5c8cb1cfb3e"}, - {file = "rpds_py-0.20.0-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:f60012a73aa396be721558caa3a6fd49b3dd0033d1675c6d59c4502e870fcf0c"}, - {file = "rpds_py-0.20.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3d2b1ad682a3dfda2a4e8ad8572f3100f95fad98cb99faf37ff0ddfe9cbf9d03"}, - {file = "rpds_py-0.20.0-pp39-pypy39_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:614fdafe9f5f19c63ea02817fa4861c606a59a604a77c8cdef5aa01d28b97921"}, - {file = "rpds_py-0.20.0-pp39-pypy39_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:fa518bcd7600c584bf42e6617ee8132869e877db2f76bcdc281ec6a4113a53ab"}, - {file = "rpds_py-0.20.0-pp39-pypy39_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f0475242f447cc6cb8a9dd486d68b2ef7fbee84427124c232bff5f63b1fe11e5"}, - {file = "rpds_py-0.20.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f90a4cd061914a60bd51c68bcb4357086991bd0bb93d8aa66a6da7701370708f"}, - {file = "rpds_py-0.20.0-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:def7400461c3a3f26e49078302e1c1b38f6752342c77e3cf72ce91ca69fb1bc1"}, - {file = "rpds_py-0.20.0-pp39-pypy39_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:65794e4048ee837494aea3c21a28ad5fc080994dfba5b036cf84de37f7ad5074"}, - {file = "rpds_py-0.20.0-pp39-pypy39_pp73-musllinux_1_2_i686.whl", hash = "sha256:faefcc78f53a88f3076b7f8be0a8f8d35133a3ecf7f3770895c25f8813460f08"}, - {file = "rpds_py-0.20.0-pp39-pypy39_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:5b4f105deeffa28bbcdff6c49b34e74903139afa690e35d2d9e3c2c2fba18cec"}, - {file = "rpds_py-0.20.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:fdfc3a892927458d98f3d55428ae46b921d1f7543b89382fdb483f5640daaec8"}, - {file = "rpds_py-0.20.0.tar.gz", hash = "sha256:d72a210824facfdaf8768cf2d7ca25a042c30320b3020de2fa04640920d4e121"}, -] - -[[package]] -name = "safetensors" -version = "0.4.5" -description = "" -optional = false -python-versions = ">=3.7" -files = [ - {file = "safetensors-0.4.5-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:a63eaccd22243c67e4f2b1c3e258b257effc4acd78f3b9d397edc8cf8f1298a7"}, - {file = "safetensors-0.4.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:23fc9b4ec7b602915cbb4ec1a7c1ad96d2743c322f20ab709e2c35d1b66dad27"}, - {file = "safetensors-0.4.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6885016f34bef80ea1085b7e99b3c1f92cb1be78a49839203060f67b40aee761"}, - {file = "safetensors-0.4.5-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:133620f443450429322f238fda74d512c4008621227fccf2f8cf4a76206fea7c"}, - {file = "safetensors-0.4.5-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4fb3e0609ec12d2a77e882f07cced530b8262027f64b75d399f1504ffec0ba56"}, - {file = "safetensors-0.4.5-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d0f1dd769f064adc33831f5e97ad07babbd728427f98e3e1db6902e369122737"}, - {file = "safetensors-0.4.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c6d156bdb26732feada84f9388a9f135528c1ef5b05fae153da365ad4319c4c5"}, - {file = "safetensors-0.4.5-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:9e347d77e2c77eb7624400ccd09bed69d35c0332f417ce8c048d404a096c593b"}, - {file = "safetensors-0.4.5-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:9f556eea3aec1d3d955403159fe2123ddd68e880f83954ee9b4a3f2e15e716b6"}, - {file = "safetensors-0.4.5-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:9483f42be3b6bc8ff77dd67302de8ae411c4db39f7224dec66b0eb95822e4163"}, - {file = "safetensors-0.4.5-cp310-none-win32.whl", hash = "sha256:7389129c03fadd1ccc37fd1ebbc773f2b031483b04700923c3511d2a939252cc"}, - {file = "safetensors-0.4.5-cp310-none-win_amd64.whl", hash = "sha256:e98ef5524f8b6620c8cdef97220c0b6a5c1cef69852fcd2f174bb96c2bb316b1"}, - {file = "safetensors-0.4.5-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:21f848d7aebd5954f92538552d6d75f7c1b4500f51664078b5b49720d180e47c"}, - {file = 
"safetensors-0.4.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:bb07000b19d41e35eecef9a454f31a8b4718a185293f0d0b1c4b61d6e4487971"}, - {file = "safetensors-0.4.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:09dedf7c2fda934ee68143202acff6e9e8eb0ddeeb4cfc24182bef999efa9f42"}, - {file = "safetensors-0.4.5-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:59b77e4b7a708988d84f26de3ebead61ef1659c73dcbc9946c18f3b1786d2688"}, - {file = "safetensors-0.4.5-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5d3bc83e14d67adc2e9387e511097f254bd1b43c3020440e708858c684cbac68"}, - {file = "safetensors-0.4.5-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:39371fc551c1072976073ab258c3119395294cf49cdc1f8476794627de3130df"}, - {file = "safetensors-0.4.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a6c19feda32b931cae0acd42748a670bdf56bee6476a046af20181ad3fee4090"}, - {file = "safetensors-0.4.5-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a659467495de201e2f282063808a41170448c78bada1e62707b07a27b05e6943"}, - {file = "safetensors-0.4.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:bad5e4b2476949bcd638a89f71b6916fa9a5cae5c1ae7eede337aca2100435c0"}, - {file = "safetensors-0.4.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:a3a315a6d0054bc6889a17f5668a73f94f7fe55121ff59e0a199e3519c08565f"}, - {file = "safetensors-0.4.5-cp311-none-win32.whl", hash = "sha256:a01e232e6d3d5cf8b1667bc3b657a77bdab73f0743c26c1d3c5dd7ce86bd3a92"}, - {file = "safetensors-0.4.5-cp311-none-win_amd64.whl", hash = "sha256:cbd39cae1ad3e3ef6f63a6f07296b080c951f24cec60188378e43d3713000c04"}, - {file = "safetensors-0.4.5-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:473300314e026bd1043cef391bb16a8689453363381561b8a3e443870937cc1e"}, - {file = "safetensors-0.4.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:801183a0f76dc647f51a2d9141ad341f9665602a7899a693207a82fb102cc53e"}, - {file = "safetensors-0.4.5-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1524b54246e422ad6fb6aea1ac71edeeb77666efa67230e1faf6999df9b2e27f"}, - {file = "safetensors-0.4.5-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:b3139098e3e8b2ad7afbca96d30ad29157b50c90861084e69fcb80dec7430461"}, - {file = "safetensors-0.4.5-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:65573dc35be9059770808e276b017256fa30058802c29e1038eb1c00028502ea"}, - {file = "safetensors-0.4.5-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fd33da8e9407559f8779c82a0448e2133737f922d71f884da27184549416bfed"}, - {file = "safetensors-0.4.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3685ce7ed036f916316b567152482b7e959dc754fcc4a8342333d222e05f407c"}, - {file = "safetensors-0.4.5-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:dde2bf390d25f67908278d6f5d59e46211ef98e44108727084d4637ee70ab4f1"}, - {file = "safetensors-0.4.5-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:7469d70d3de970b1698d47c11ebbf296a308702cbaae7fcb993944751cf985f4"}, - {file = "safetensors-0.4.5-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:3a6ba28118636a130ccbb968bc33d4684c48678695dba2590169d5ab03a45646"}, - {file = "safetensors-0.4.5-cp312-none-win32.whl", hash = "sha256:c859c7ed90b0047f58ee27751c8e56951452ed36a67afee1b0a87847d065eec6"}, - {file = 
"safetensors-0.4.5-cp312-none-win_amd64.whl", hash = "sha256:b5a8810ad6a6f933fff6c276eae92c1da217b39b4d8b1bc1c0b8af2d270dc532"}, - {file = "safetensors-0.4.5-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:25e5f8e2e92a74f05b4ca55686234c32aac19927903792b30ee6d7bd5653d54e"}, - {file = "safetensors-0.4.5-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:81efb124b58af39fcd684254c645e35692fea81c51627259cdf6d67ff4458916"}, - {file = "safetensors-0.4.5-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:585f1703a518b437f5103aa9cf70e9bd437cb78eea9c51024329e4fb8a3e3679"}, - {file = "safetensors-0.4.5-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:4b99fbf72e3faf0b2f5f16e5e3458b93b7d0a83984fe8d5364c60aa169f2da89"}, - {file = "safetensors-0.4.5-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b17b299ca9966ca983ecda1c0791a3f07f9ca6ab5ded8ef3d283fff45f6bcd5f"}, - {file = "safetensors-0.4.5-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:76ded72f69209c9780fdb23ea89e56d35c54ae6abcdec67ccb22af8e696e449a"}, - {file = "safetensors-0.4.5-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2783956926303dcfeb1de91a4d1204cd4089ab441e622e7caee0642281109db3"}, - {file = "safetensors-0.4.5-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d94581aab8c6b204def4d7320f07534d6ee34cd4855688004a4354e63b639a35"}, - {file = "safetensors-0.4.5-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:67e1e7cb8678bb1b37ac48ec0df04faf689e2f4e9e81e566b5c63d9f23748523"}, - {file = "safetensors-0.4.5-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:dbd280b07e6054ea68b0cb4b16ad9703e7d63cd6890f577cb98acc5354780142"}, - {file = "safetensors-0.4.5-cp37-cp37m-macosx_10_12_x86_64.whl", hash = "sha256:77d9b228da8374c7262046a36c1f656ba32a93df6cc51cd4453af932011e77f1"}, - {file = "safetensors-0.4.5-cp37-cp37m-macosx_11_0_arm64.whl", hash = "sha256:500cac01d50b301ab7bb192353317035011c5ceeef0fca652f9f43c000bb7f8d"}, - {file = "safetensors-0.4.5-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:75331c0c746f03158ded32465b7d0b0e24c5a22121743662a2393439c43a45cf"}, - {file = "safetensors-0.4.5-cp37-cp37m-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:670e95fe34e0d591d0529e5e59fd9d3d72bc77b1444fcaa14dccda4f36b5a38b"}, - {file = "safetensors-0.4.5-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:098923e2574ff237c517d6e840acada8e5b311cb1fa226019105ed82e9c3b62f"}, - {file = "safetensors-0.4.5-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:13ca0902d2648775089fa6a0c8fc9e6390c5f8ee576517d33f9261656f851e3f"}, - {file = "safetensors-0.4.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5f0032bedc869c56f8d26259fe39cd21c5199cd57f2228d817a0e23e8370af25"}, - {file = "safetensors-0.4.5-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f4b15f51b4f8f2a512341d9ce3475cacc19c5fdfc5db1f0e19449e75f95c7dc8"}, - {file = "safetensors-0.4.5-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:f6594d130d0ad933d885c6a7b75c5183cb0e8450f799b80a39eae2b8508955eb"}, - {file = "safetensors-0.4.5-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:60c828a27e852ded2c85fc0f87bf1ec20e464c5cd4d56ff0e0711855cc2e17f8"}, - {file = "safetensors-0.4.5-cp37-none-win32.whl", hash = "sha256:6d3de65718b86c3eeaa8b73a9c3d123f9307a96bbd7be9698e21e76a56443af5"}, - {file = 
"safetensors-0.4.5-cp37-none-win_amd64.whl", hash = "sha256:5a2d68a523a4cefd791156a4174189a4114cf0bf9c50ceb89f261600f3b2b81a"}, - {file = "safetensors-0.4.5-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:e7a97058f96340850da0601a3309f3d29d6191b0702b2da201e54c6e3e44ccf0"}, - {file = "safetensors-0.4.5-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:63bfd425e25f5c733f572e2246e08a1c38bd6f2e027d3f7c87e2e43f228d1345"}, - {file = "safetensors-0.4.5-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f3664ac565d0e809b0b929dae7ccd74e4d3273cd0c6d1220c6430035befb678e"}, - {file = "safetensors-0.4.5-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:313514b0b9b73ff4ddfb4edd71860696dbe3c1c9dc4d5cc13dbd74da283d2cbf"}, - {file = "safetensors-0.4.5-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:31fa33ee326f750a2f2134a6174773c281d9a266ccd000bd4686d8021f1f3dac"}, - {file = "safetensors-0.4.5-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:09566792588d77b68abe53754c9f1308fadd35c9f87be939e22c623eaacbed6b"}, - {file = "safetensors-0.4.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:309aaec9b66cbf07ad3a2e5cb8a03205663324fea024ba391594423d0f00d9fe"}, - {file = "safetensors-0.4.5-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:53946c5813b8f9e26103c5efff4a931cc45d874f45229edd68557ffb35ffb9f8"}, - {file = "safetensors-0.4.5-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:868f9df9e99ad1e7f38c52194063a982bc88fedc7d05096f4f8160403aaf4bd6"}, - {file = "safetensors-0.4.5-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:9cc9449bd0b0bc538bd5e268221f0c5590bc5c14c1934a6ae359d44410dc68c4"}, - {file = "safetensors-0.4.5-cp38-none-win32.whl", hash = "sha256:83c4f13a9e687335c3928f615cd63a37e3f8ef072a3f2a0599fa09f863fb06a2"}, - {file = "safetensors-0.4.5-cp38-none-win_amd64.whl", hash = "sha256:b98d40a2ffa560653f6274e15b27b3544e8e3713a44627ce268f419f35c49478"}, - {file = "safetensors-0.4.5-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:cf727bb1281d66699bef5683b04d98c894a2803442c490a8d45cd365abfbdeb2"}, - {file = "safetensors-0.4.5-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:96f1d038c827cdc552d97e71f522e1049fef0542be575421f7684756a748e457"}, - {file = "safetensors-0.4.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:139fbee92570ecea774e6344fee908907db79646d00b12c535f66bc78bd5ea2c"}, - {file = "safetensors-0.4.5-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c36302c1c69eebb383775a89645a32b9d266878fab619819ce660309d6176c9b"}, - {file = "safetensors-0.4.5-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d641f5b8149ea98deb5ffcf604d764aad1de38a8285f86771ce1abf8e74c4891"}, - {file = "safetensors-0.4.5-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b4db6a61d968de73722b858038c616a1bebd4a86abe2688e46ca0cc2d17558f2"}, - {file = "safetensors-0.4.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b75a616e02f21b6f1d5785b20cecbab5e2bd3f6358a90e8925b813d557666ec1"}, - {file = "safetensors-0.4.5-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:788ee7d04cc0e0e7f944c52ff05f52a4415b312f5efd2ee66389fb7685ee030c"}, - {file = "safetensors-0.4.5-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:87bc42bd04fd9ca31396d3ca0433db0be1411b6b53ac5a32b7845a85d01ffc2e"}, - {file = "safetensors-0.4.5-cp39-cp39-musllinux_1_1_x86_64.whl", hash = 
"sha256:4037676c86365a721a8c9510323a51861d703b399b78a6b4486a54a65a975fca"}, - {file = "safetensors-0.4.5-cp39-none-win32.whl", hash = "sha256:1500418454529d0ed5c1564bda376c4ddff43f30fce9517d9bee7bcce5a8ef50"}, - {file = "safetensors-0.4.5-cp39-none-win_amd64.whl", hash = "sha256:9d1a94b9d793ed8fe35ab6d5cea28d540a46559bafc6aae98f30ee0867000cab"}, - {file = "safetensors-0.4.5-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:fdadf66b5a22ceb645d5435a0be7a0292ce59648ca1d46b352f13cff3ea80410"}, - {file = "safetensors-0.4.5-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:d42ffd4c2259f31832cb17ff866c111684c87bd930892a1ba53fed28370c918c"}, - {file = "safetensors-0.4.5-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dd8a1f6d2063a92cd04145c7fd9e31a1c7d85fbec20113a14b487563fdbc0597"}, - {file = "safetensors-0.4.5-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:951d2fcf1817f4fb0ef0b48f6696688a4e852a95922a042b3f96aaa67eedc920"}, - {file = "safetensors-0.4.5-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6ac85d9a8c1af0e3132371d9f2d134695a06a96993c2e2f0bbe25debb9e3f67a"}, - {file = "safetensors-0.4.5-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:e3cec4a29eb7fe8da0b1c7988bc3828183080439dd559f720414450de076fcab"}, - {file = "safetensors-0.4.5-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:21742b391b859e67b26c0b2ac37f52c9c0944a879a25ad2f9f9f3cd61e7fda8f"}, - {file = "safetensors-0.4.5-pp37-pypy37_pp73-macosx_10_12_x86_64.whl", hash = "sha256:c7db3006a4915151ce1913652e907cdede299b974641a83fbc092102ac41b644"}, - {file = "safetensors-0.4.5-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f68bf99ea970960a237f416ea394e266e0361895753df06e3e06e6ea7907d98b"}, - {file = "safetensors-0.4.5-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8158938cf3324172df024da511839d373c40fbfaa83e9abf467174b2910d7b4c"}, - {file = "safetensors-0.4.5-pp37-pypy37_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:540ce6c4bf6b58cb0fd93fa5f143bc0ee341c93bb4f9287ccd92cf898cc1b0dd"}, - {file = "safetensors-0.4.5-pp37-pypy37_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:bfeaa1a699c6b9ed514bd15e6a91e74738b71125a9292159e3d6b7f0a53d2cde"}, - {file = "safetensors-0.4.5-pp37-pypy37_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:01c8f00da537af711979e1b42a69a8ec9e1d7112f208e0e9b8a35d2c381085ef"}, - {file = "safetensors-0.4.5-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:a0dd565f83b30f2ca79b5d35748d0d99dd4b3454f80e03dfb41f0038e3bdf180"}, - {file = "safetensors-0.4.5-pp38-pypy38_pp73-macosx_11_0_arm64.whl", hash = "sha256:023b6e5facda76989f4cba95a861b7e656b87e225f61811065d5c501f78cdb3f"}, - {file = "safetensors-0.4.5-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9633b663393d5796f0b60249549371e392b75a0b955c07e9c6f8708a87fc841f"}, - {file = "safetensors-0.4.5-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:78dd8adfb48716233c45f676d6e48534d34b4bceb50162c13d1f0bdf6f78590a"}, - {file = "safetensors-0.4.5-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:8e8deb16c4321d61ae72533b8451ec4a9af8656d1c61ff81aa49f966406e4b68"}, - {file = "safetensors-0.4.5-pp38-pypy38_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:52452fa5999dc50c4decaf0c53aa28371f7f1e0fe5c2dd9129059fbe1e1599c7"}, - {file = 
"safetensors-0.4.5-pp38-pypy38_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:d5f23198821e227cfc52d50fa989813513db381255c6d100927b012f0cfec63d"}, - {file = "safetensors-0.4.5-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:f4beb84b6073b1247a773141a6331117e35d07134b3bb0383003f39971d414bb"}, - {file = "safetensors-0.4.5-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:68814d599d25ed2fdd045ed54d370d1d03cf35e02dce56de44c651f828fb9b7b"}, - {file = "safetensors-0.4.5-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f0b6453c54c57c1781292c46593f8a37254b8b99004c68d6c3ce229688931a22"}, - {file = "safetensors-0.4.5-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:adaa9c6dead67e2dd90d634f89131e43162012479d86e25618e821a03d1eb1dc"}, - {file = "safetensors-0.4.5-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:73e7d408e9012cd17511b382b43547850969c7979efc2bc353f317abaf23c84c"}, - {file = "safetensors-0.4.5-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:775409ce0fcc58b10773fdb4221ed1eb007de10fe7adbdf8f5e8a56096b6f0bc"}, - {file = "safetensors-0.4.5-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:834001bed193e4440c4a3950a31059523ee5090605c907c66808664c932b549c"}, - {file = "safetensors-0.4.5.tar.gz", hash = "sha256:d73de19682deabb02524b3d5d1f8b3aaba94c72f1bbfc7911b9b9d5d391c0310"}, -] - -[package.extras] -all = ["safetensors[jax]", "safetensors[numpy]", "safetensors[paddlepaddle]", "safetensors[pinned-tf]", "safetensors[quality]", "safetensors[testing]", "safetensors[torch]"] -dev = ["safetensors[all]"] -jax = ["flax (>=0.6.3)", "jax (>=0.3.25)", "jaxlib (>=0.3.25)", "safetensors[numpy]"] -mlx = ["mlx (>=0.0.9)"] -numpy = ["numpy (>=1.21.6)"] -paddlepaddle = ["paddlepaddle (>=2.4.1)", "safetensors[numpy]"] -pinned-tf = ["safetensors[numpy]", "tensorflow (==2.11.0)"] -quality = ["black (==22.3)", "click (==8.0.4)", "flake8 (>=3.8.3)", "isort (>=5.5.4)"] -tensorflow = ["safetensors[numpy]", "tensorflow (>=2.11.0)"] -testing = ["h5py (>=3.7.0)", "huggingface-hub (>=0.12.1)", "hypothesis (>=6.70.2)", "pytest (>=7.2.0)", "pytest-benchmark (>=4.0.0)", "safetensors[numpy]", "setuptools-rust (>=1.5.2)"] -torch = ["safetensors[numpy]", "torch (>=1.10)"] - -[[package]] -name = "scipy" -version = "1.13.1" -description = "Fundamental algorithms for scientific computing in Python" -optional = false -python-versions = ">=3.9" -files = [ - {file = "scipy-1.13.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:20335853b85e9a49ff7572ab453794298bcf0354d8068c5f6775a0eabf350aca"}, - {file = "scipy-1.13.1-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:d605e9c23906d1994f55ace80e0125c587f96c020037ea6aa98d01b4bd2e222f"}, - {file = "scipy-1.13.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cfa31f1def5c819b19ecc3a8b52d28ffdcc7ed52bb20c9a7589669dd3c250989"}, - {file = "scipy-1.13.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f26264b282b9da0952a024ae34710c2aff7d27480ee91a2e82b7b7073c24722f"}, - {file = "scipy-1.13.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:eccfa1906eacc02de42d70ef4aecea45415f5be17e72b61bafcfd329bdc52e94"}, - {file = "scipy-1.13.1-cp310-cp310-win_amd64.whl", hash = "sha256:2831f0dc9c5ea9edd6e51e6e769b655f08ec6db6e2e10f86ef39bd32eb11da54"}, - {file = "scipy-1.13.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:27e52b09c0d3a1d5b63e1105f24177e544a222b43611aaf5bc44d4a0979e32f9"}, - {file = 
"scipy-1.13.1-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:54f430b00f0133e2224c3ba42b805bfd0086fe488835effa33fa291561932326"}, - {file = "scipy-1.13.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e89369d27f9e7b0884ae559a3a956e77c02114cc60a6058b4e5011572eea9299"}, - {file = "scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a78b4b3345f1b6f68a763c6e25c0c9a23a9fd0f39f5f3d200efe8feda560a5fa"}, - {file = "scipy-1.13.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:45484bee6d65633752c490404513b9ef02475b4284c4cfab0ef946def50b3f59"}, - {file = "scipy-1.13.1-cp311-cp311-win_amd64.whl", hash = "sha256:5713f62f781eebd8d597eb3f88b8bf9274e79eeabf63afb4a737abc6c84ad37b"}, - {file = "scipy-1.13.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:5d72782f39716b2b3509cd7c33cdc08c96f2f4d2b06d51e52fb45a19ca0c86a1"}, - {file = "scipy-1.13.1-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:017367484ce5498445aade74b1d5ab377acdc65e27095155e448c88497755a5d"}, - {file = "scipy-1.13.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:949ae67db5fa78a86e8fa644b9a6b07252f449dcf74247108c50e1d20d2b4627"}, - {file = "scipy-1.13.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:de3ade0e53bc1f21358aa74ff4830235d716211d7d077e340c7349bc3542e884"}, - {file = "scipy-1.13.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:2ac65fb503dad64218c228e2dc2d0a0193f7904747db43014645ae139c8fad16"}, - {file = "scipy-1.13.1-cp312-cp312-win_amd64.whl", hash = "sha256:cdd7dacfb95fea358916410ec61bbc20440f7860333aee6d882bb8046264e949"}, - {file = "scipy-1.13.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:436bbb42a94a8aeef855d755ce5a465479c721e9d684de76bf61a62e7c2b81d5"}, - {file = "scipy-1.13.1-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:8335549ebbca860c52bf3d02f80784e91a004b71b059e3eea9678ba994796a24"}, - {file = "scipy-1.13.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d533654b7d221a6a97304ab63c41c96473ff04459e404b83275b60aa8f4b7004"}, - {file = "scipy-1.13.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:637e98dcf185ba7f8e663e122ebf908c4702420477ae52a04f9908707456ba4d"}, - {file = "scipy-1.13.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:a014c2b3697bde71724244f63de2476925596c24285c7a637364761f8710891c"}, - {file = "scipy-1.13.1-cp39-cp39-win_amd64.whl", hash = "sha256:392e4ec766654852c25ebad4f64e4e584cf19820b980bc04960bca0b0cd6eaa2"}, - {file = "scipy-1.13.1.tar.gz", hash = "sha256:095a87a0312b08dfd6a6155cbbd310a8c51800fc931b8c0b84003014b874ed3c"}, -] - -[package.dependencies] -numpy = ">=1.22.4,<2.3" - -[package.extras] -dev = ["cython-lint (>=0.12.2)", "doit (>=0.36.0)", "mypy", "pycodestyle", "pydevtool", "rich-click", "ruff", "types-psutil", "typing_extensions"] -doc = ["jupyterlite-pyodide-kernel", "jupyterlite-sphinx (>=0.12.0)", "jupytext", "matplotlib (>=3.5)", "myst-nb", "numpydoc", "pooch", "pydata-sphinx-theme (>=0.15.2)", "sphinx (>=5.0.0)", "sphinx-design (>=0.4.0)"] -test = ["array-api-strict", "asv", "gmpy2", "hypothesis (>=6.30)", "mpmath", "pooch", "pytest", "pytest-cov", "pytest-timeout", "pytest-xdist", "scikit-umfpack", "threadpoolctl"] - -[[package]] -name = "sentencepiece" -version = "0.2.0" -description = "SentencePiece python wrapper" -optional = false -python-versions = "*" -files = [ - {file = "sentencepiece-0.2.0-cp310-cp310-macosx_10_9_universal2.whl", hash = 
"sha256:188779e1298a1c8b8253c7d3ad729cb0a9891e5cef5e5d07ce4592c54869e227"}, - {file = "sentencepiece-0.2.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:bed9cf85b296fa2b76fc2547b9cbb691a523864cebaee86304c43a7b4cb1b452"}, - {file = "sentencepiece-0.2.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d7b67e724bead13f18db6e1d10b6bbdc454af574d70efbb36f27d90387be1ca3"}, - {file = "sentencepiece-0.2.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2fde4b08cfe237be4484c6c7c2e2c75fb862cfeab6bd5449ce4caeafd97b767a"}, - {file = "sentencepiece-0.2.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4c378492056202d1c48a4979650981635fd97875a00eabb1f00c6a236b013b5e"}, - {file = "sentencepiece-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1380ce6540a368de2ef6d7e6ba14ba8f3258df650d39ba7d833b79ee68a52040"}, - {file = "sentencepiece-0.2.0-cp310-cp310-win32.whl", hash = "sha256:a1151d6a6dd4b43e552394aed0edfe9292820272f0194bd56c7c1660a0c06c3d"}, - {file = "sentencepiece-0.2.0-cp310-cp310-win_amd64.whl", hash = "sha256:d490142b0521ef22bc1085f061d922a2a6666175bb6b42e588ff95c0db6819b2"}, - {file = "sentencepiece-0.2.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:17982700c4f6dbb55fa3594f3d7e5dd1c8659a274af3738e33c987d2a27c9d5c"}, - {file = "sentencepiece-0.2.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:7c867012c0e8bcd5bdad0f791609101cb5c66acb303ab3270218d6debc68a65e"}, - {file = "sentencepiece-0.2.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7fd6071249c74f779c5b27183295b9202f8dedb68034e716784364443879eaa6"}, - {file = "sentencepiece-0.2.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:27f90c55a65013cbb8f4d7aab0599bf925cde4adc67ae43a0d323677b5a1c6cb"}, - {file = "sentencepiece-0.2.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b293734059ef656dcd65be62ff771507bea8fed0a711b6733976e1ed3add4553"}, - {file = "sentencepiece-0.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e58b47f933aca74c6a60a79dcb21d5b9e47416256c795c2d58d55cec27f9551d"}, - {file = "sentencepiece-0.2.0-cp311-cp311-win32.whl", hash = "sha256:c581258cf346b327c62c4f1cebd32691826306f6a41d8c4bec43b010dee08e75"}, - {file = "sentencepiece-0.2.0-cp311-cp311-win_amd64.whl", hash = "sha256:0993dbc665f4113017892f1b87c3904a44d0640eda510abcacdfb07f74286d36"}, - {file = "sentencepiece-0.2.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:ea5f536e32ea8ec96086ee00d7a4a131ce583a1b18d130711707c10e69601cb2"}, - {file = "sentencepiece-0.2.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:d0cb51f53b6aae3c36bafe41e86167c71af8370a039f542c43b0cce5ef24a68c"}, - {file = "sentencepiece-0.2.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:3212121805afc58d8b00ab4e7dd1f8f76c203ddb9dc94aa4079618a31cf5da0f"}, - {file = "sentencepiece-0.2.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2a3149e3066c2a75e0d68a43eb632d7ae728c7925b517f4c05c40f6f7280ce08"}, - {file = "sentencepiece-0.2.0-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:632f3594d3e7ac8b367bca204cb3fd05a01d5b21455acd097ea4c0e30e2f63d7"}, - {file = "sentencepiece-0.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f295105c6bdbb05bd5e1b0cafbd78ff95036f5d3641e7949455a3f4e5e7c3109"}, - {file = "sentencepiece-0.2.0-cp312-cp312-win32.whl", hash = 
"sha256:fb89f811e5efd18bab141afc3fea3de141c3f69f3fe9e898f710ae7fe3aab251"}, - {file = "sentencepiece-0.2.0-cp312-cp312-win_amd64.whl", hash = "sha256:7a673a72aab81fef5ebe755c6e0cc60087d1f3a4700835d40537183c1703a45f"}, - {file = "sentencepiece-0.2.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:4547683f330289ec4f093027bfeb87f9ef023b2eb6f879fdc4a8187c7e0ffb90"}, - {file = "sentencepiece-0.2.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7cd6175f7eaec7142d2bf6f6597ce7db4c9ac89acf93fcdb17410c3a8b781eeb"}, - {file = "sentencepiece-0.2.0-cp36-cp36m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:859ba1acde782609a0910a26a60e16c191a82bf39b5621107552c0cd79fad00f"}, - {file = "sentencepiece-0.2.0-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bcbbef6cc277f8f18f36959e305f10b1c620442d75addc79c21d7073ae581b50"}, - {file = "sentencepiece-0.2.0-cp36-cp36m-win32.whl", hash = "sha256:536b934e244829e3fe6c4f198652cd82da48adb9aa145c9f00889542726dee3d"}, - {file = "sentencepiece-0.2.0-cp36-cp36m-win_amd64.whl", hash = "sha256:0a91aaa3c769b52440df56fafda683b3aa48e3f2169cf7ee5b8c8454a7f3ae9b"}, - {file = "sentencepiece-0.2.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:787e480ca4c1d08c9985a7eb1eae4345c107729c99e9b5a9a00f2575fc7d4b4b"}, - {file = "sentencepiece-0.2.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f4d158189eb2ecffea3a51edf6d25e110b3678ec47f1a40f2d541eafbd8f6250"}, - {file = "sentencepiece-0.2.0-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d1e5ca43013e8935f25457a4fca47e315780172c3e821b4b13a890668911c792"}, - {file = "sentencepiece-0.2.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7140d9e5a74a0908493bb4a13f1f16a401297bd755ada4c707e842fbf6f0f5bf"}, - {file = "sentencepiece-0.2.0-cp37-cp37m-win32.whl", hash = "sha256:6cf333625234f247ab357b0bd9836638405ea9082e1543d5b8408f014979dcbf"}, - {file = "sentencepiece-0.2.0-cp37-cp37m-win_amd64.whl", hash = "sha256:ff88712338b01031910e8e61e7239aff3ce8869ee31a47df63cb38aadd591bea"}, - {file = "sentencepiece-0.2.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:20813a68d4c221b1849c62c30e1281ea81687894d894b8d4a0f4677d9311e0f5"}, - {file = "sentencepiece-0.2.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:926ef920ae2e8182db31d3f5d081ada57804e3e1d3a8c4ef8b117f9d9fb5a945"}, - {file = "sentencepiece-0.2.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:89f65f69636b7e9c015b79dff9c9985a9bc7d19ded6f79ef9f1ec920fdd73ecf"}, - {file = "sentencepiece-0.2.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0f67eae0dbe6f2d7d6ba50a354623d787c99965f068b81e145d53240198021b0"}, - {file = "sentencepiece-0.2.0-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:98501e075f35dd1a1d5a20f65be26839fcb1938752ec61539af008a5aa6f510b"}, - {file = "sentencepiece-0.2.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e3d1d2cc4882e8d6a1adf9d5927d7716f80617fc693385661caff21888972269"}, - {file = "sentencepiece-0.2.0-cp38-cp38-win32.whl", hash = "sha256:b99a308a2e5e569031ab164b74e6fab0b6f37dfb493c32f7816225f4d411a6dd"}, - {file = "sentencepiece-0.2.0-cp38-cp38-win_amd64.whl", hash = "sha256:cdb701eec783d3ec86b7cd4c763adad8eaf6b46db37ee1c36e5e6c44b3fe1b5f"}, - {file = "sentencepiece-0.2.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:1e0f9c4d0a6b0af59b613175f019916e28ade076e21242fd5be24340d8a2f64a"}, - {file = 
"sentencepiece-0.2.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:298f21cc1366eb60311aedba3169d30f885c363ddbf44214b0a587d2908141ad"}, - {file = "sentencepiece-0.2.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3f1ec95aa1e5dab11f37ac7eff190493fd87770f7a8b81ebc9dd768d1a3c8704"}, - {file = "sentencepiece-0.2.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7b06b70af54daa4b4904cbb90b4eb6d35c9f3252fdc86c9c32d5afd4d30118d8"}, - {file = "sentencepiece-0.2.0-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:22e37bac44dd6603388cb598c64ff7a76e41ca774646f21c23aadfbf5a2228ab"}, - {file = "sentencepiece-0.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0461324897735512a32d222e3d886e24ad6a499761952b6bda2a9ee6e4313ea5"}, - {file = "sentencepiece-0.2.0-cp39-cp39-win32.whl", hash = "sha256:38aed822fb76435fa1f12185f10465a94ab9e51d5e8a9159e9a540ce926f0ffd"}, - {file = "sentencepiece-0.2.0-cp39-cp39-win_amd64.whl", hash = "sha256:d8cf876516548b5a1d6ac4745d8b554f5c07891d55da557925e5c13ff0b4e6ad"}, - {file = "sentencepiece-0.2.0.tar.gz", hash = "sha256:a52c19171daaf2e697dc6cbe67684e0fa341b1248966f6aebb541de654d15843"}, -] - -[[package]] -name = "setuptools" -version = "75.2.0" -description = "Easily download, build, install, upgrade, and uninstall Python packages" -optional = false -python-versions = ">=3.8" -files = [ - {file = "setuptools-75.2.0-py3-none-any.whl", hash = "sha256:a7fcb66f68b4d9e8e66b42f9876150a3371558f98fa32222ffaa5bced76406f8"}, - {file = "setuptools-75.2.0.tar.gz", hash = "sha256:753bb6ebf1f465a1912e19ed1d41f403a79173a9acf66a42e7e6aec45c3c16ec"}, -] - -[package.extras] -check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1)", "ruff (>=0.5.2)"] -core = ["importlib-metadata (>=6)", "importlib-resources (>=5.10.2)", "jaraco.collections", "jaraco.functools", "jaraco.text (>=3.7)", "more-itertools", "more-itertools (>=8.8)", "packaging", "packaging (>=24)", "platformdirs (>=2.6.2)", "tomli (>=2.0.1)", "wheel (>=0.43.0)"] -cover = ["pytest-cov"] -doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "pyproject-hooks (!=1.1)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier", "towncrier (<24.7)"] -enabler = ["pytest-enabler (>=2.2)"] -test = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "jaraco.test", "packaging (>=23.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.*)", "pytest-home (>=0.5)", "pytest-perf", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel (>=0.44.0)"] -type = ["importlib-metadata (>=7.0.2)", "jaraco.develop (>=7.21)", "mypy (==1.11.*)", "pytest-mypy"] - -[[package]] -name = "shellingham" -version = "1.5.4" -description = "Tool to Detect Surrounding Shell" -optional = false -python-versions = ">=3.7" -files = [ - {file = "shellingham-1.5.4-py2.py3-none-any.whl", hash = "sha256:7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686"}, - {file = "shellingham-1.5.4.tar.gz", hash = "sha256:8dbca0739d487e5bd35ab3ca4b36e11c4078f3a234bfce294b0a0291363404de"}, -] - -[[package]] -name = "six" -version = "1.16.0" -description = "Python 2 and 3 compatibility utilities" -optional = true -python-versions = ">=2.7, !=3.0.*, 
!=3.1.*, !=3.2.*" -files = [ - {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, - {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, -] - -[[package]] -name = "sympy" -version = "1.13.1" -description = "Computer algebra system (CAS) in Python" -optional = true -python-versions = ">=3.8" -files = [ - {file = "sympy-1.13.1-py3-none-any.whl", hash = "sha256:db36cdc64bf61b9b24578b6f7bab1ecdd2452cf008f34faa33776680c26d66f8"}, - {file = "sympy-1.13.1.tar.gz", hash = "sha256:9cebf7e04ff162015ce31c9c6c9144daa34a93bd082f54fd8f12deca4f47515f"}, -] - -[package.dependencies] -mpmath = ">=1.1.0,<1.4" - -[package.extras] -dev = ["hypothesis (>=6.70.0)", "pytest (>=7.1.0)"] - -[[package]] -name = "texttable" -version = "1.7.0" -description = "module to create simple ASCII tables" -optional = true -python-versions = "*" -files = [ - {file = "texttable-1.7.0-py2.py3-none-any.whl", hash = "sha256:72227d592c82b3d7f672731ae73e4d1f88cd8e2ef5b075a7a7f01a23a3743917"}, - {file = "texttable-1.7.0.tar.gz", hash = "sha256:2d2068fb55115807d3ac77a4ca68fa48803e84ebb0ee2340f858107a36522638"}, -] - -[[package]] -name = "tokenizers" -version = "0.20.3" -description = "" -optional = false -python-versions = ">=3.7" -files = [ - {file = "tokenizers-0.20.3-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:31ccab28dbb1a9fe539787210b0026e22debeab1662970f61c2d921f7557f7e4"}, - {file = "tokenizers-0.20.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c6361191f762bda98c773da418cf511cbaa0cb8d0a1196f16f8c0119bde68ff8"}, - {file = "tokenizers-0.20.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f128d5da1202b78fa0a10d8d938610472487da01b57098d48f7e944384362514"}, - {file = "tokenizers-0.20.3-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:79c4121a2e9433ad7ef0769b9ca1f7dd7fa4c0cd501763d0a030afcbc6384481"}, - {file = "tokenizers-0.20.3-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b7850fde24197fe5cd6556e2fdba53a6d3bae67c531ea33a3d7c420b90904141"}, - {file = "tokenizers-0.20.3-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b357970c095dc134978a68c67d845a1e3803ab7c4fbb39195bde914e7e13cf8b"}, - {file = "tokenizers-0.20.3-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a333d878c4970b72d6c07848b90c05f6b045cf9273fc2bc04a27211721ad6118"}, - {file = "tokenizers-0.20.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1fd9fee817f655a8f50049f685e224828abfadd436b8ff67979fc1d054b435f1"}, - {file = "tokenizers-0.20.3-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:9e7816808b402129393a435ea2a509679b41246175d6e5e9f25b8692bfaa272b"}, - {file = "tokenizers-0.20.3-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:ba96367db9d8a730d3a1d5996b4b7babb846c3994b8ef14008cd8660f55db59d"}, - {file = "tokenizers-0.20.3-cp310-none-win32.whl", hash = "sha256:ee31ba9d7df6a98619426283e80c6359f167e2e9882d9ce1b0254937dbd32f3f"}, - {file = "tokenizers-0.20.3-cp310-none-win_amd64.whl", hash = "sha256:a845c08fdad554fe0871d1255df85772f91236e5fd6b9287ef8b64f5807dbd0c"}, - {file = "tokenizers-0.20.3-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:585b51e06ca1f4839ce7759941e66766d7b060dccfdc57c4ca1e5b9a33013a90"}, - {file = "tokenizers-0.20.3-cp311-cp311-macosx_11_0_arm64.whl", hash = 
"sha256:61cbf11954f3b481d08723ebd048ba4b11e582986f9be74d2c3bdd9293a4538d"}, - {file = "tokenizers-0.20.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ef820880d5e4e8484e2fa54ff8d297bb32519eaa7815694dc835ace9130a3eea"}, - {file = "tokenizers-0.20.3-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:67ef4dcb8841a4988cd00dd288fb95dfc8e22ed021f01f37348fd51c2b055ba9"}, - {file = "tokenizers-0.20.3-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ff1ef8bd47a02b0dc191688ccb4da53600df5d4c9a05a4b68e1e3de4823e78eb"}, - {file = "tokenizers-0.20.3-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:444d188186eab3148baf0615b522461b41b1f0cd58cd57b862ec94b6ac9780f1"}, - {file = "tokenizers-0.20.3-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:37c04c032c1442740b2c2d925f1857885c07619224a533123ac7ea71ca5713da"}, - {file = "tokenizers-0.20.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:453c7769d22231960ee0e883d1005c93c68015025a5e4ae56275406d94a3c907"}, - {file = "tokenizers-0.20.3-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:4bb31f7b2847e439766aaa9cc7bccf7ac7088052deccdb2275c952d96f691c6a"}, - {file = "tokenizers-0.20.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:843729bf0f991b29655a069a2ff58a4c24375a553c70955e15e37a90dd4e045c"}, - {file = "tokenizers-0.20.3-cp311-none-win32.whl", hash = "sha256:efcce3a927b1e20ca694ba13f7a68c59b0bd859ef71e441db68ee42cf20c2442"}, - {file = "tokenizers-0.20.3-cp311-none-win_amd64.whl", hash = "sha256:88301aa0801f225725b6df5dea3d77c80365ff2362ca7e252583f2b4809c4cc0"}, - {file = "tokenizers-0.20.3-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:49d12a32e190fad0e79e5bdb788d05da2f20d8e006b13a70859ac47fecf6ab2f"}, - {file = "tokenizers-0.20.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:282848cacfb9c06d5e51489f38ec5aa0b3cd1e247a023061945f71f41d949d73"}, - {file = "tokenizers-0.20.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:abe4e08c7d0cd6154c795deb5bf81d2122f36daf075e0c12a8b050d824ef0a64"}, - {file = "tokenizers-0.20.3-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ca94fc1b73b3883c98f0c88c77700b13d55b49f1071dfd57df2b06f3ff7afd64"}, - {file = "tokenizers-0.20.3-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ef279c7e239f95c8bdd6ff319d9870f30f0d24915b04895f55b1adcf96d6c60d"}, - {file = "tokenizers-0.20.3-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:16384073973f6ccbde9852157a4fdfe632bb65208139c9d0c0bd0176a71fd67f"}, - {file = "tokenizers-0.20.3-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:312d522caeb8a1a42ebdec87118d99b22667782b67898a76c963c058a7e41d4f"}, - {file = "tokenizers-0.20.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f2b7cb962564785a83dafbba0144ecb7f579f1d57d8c406cdaa7f32fe32f18ad"}, - {file = "tokenizers-0.20.3-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:124c5882ebb88dadae1fc788a582299fcd3a8bd84fc3e260b9918cf28b8751f5"}, - {file = "tokenizers-0.20.3-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:2b6e54e71f84c4202111a489879005cb14b92616a87417f6c102c833af961ea2"}, - {file = "tokenizers-0.20.3-cp312-none-win32.whl", hash = "sha256:83d9bfbe9af86f2d9df4833c22e94d94750f1d0cd9bfb22a7bb90a86f61cdb1c"}, - {file = "tokenizers-0.20.3-cp312-none-win_amd64.whl", hash = 
"sha256:44def74cee574d609a36e17c8914311d1b5dbcfe37c55fd29369d42591b91cf2"}, - {file = "tokenizers-0.20.3-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:e0b630e0b536ef0e3c8b42c685c1bc93bd19e98c0f1543db52911f8ede42cf84"}, - {file = "tokenizers-0.20.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:a02d160d2b19bcbfdf28bd9a4bf11be4cb97d0499c000d95d4c4b1a4312740b6"}, - {file = "tokenizers-0.20.3-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0e3d80d89b068bc30034034b5319218c7c0a91b00af19679833f55f3becb6945"}, - {file = "tokenizers-0.20.3-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:174a54910bed1b089226512b4458ea60d6d6fd93060254734d3bc3540953c51c"}, - {file = "tokenizers-0.20.3-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:098b8a632b8656aa5802c46689462c5c48f02510f24029d71c208ec2c822e771"}, - {file = "tokenizers-0.20.3-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:78c8c143e3ae41e718588281eb3e212c2b31623c9d6d40410ec464d7d6221fb5"}, - {file = "tokenizers-0.20.3-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2b26b0aadb18cd8701077362ba359a06683662d5cafe3e8e8aba10eb05c037f1"}, - {file = "tokenizers-0.20.3-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:07d7851a72717321022f3774e84aa9d595a041d643fafa2e87fbc9b18711dac0"}, - {file = "tokenizers-0.20.3-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:bd44e48a430ada902c6266a8245f5036c4fe744fcb51f699999fbe82aa438797"}, - {file = "tokenizers-0.20.3-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:a4c186bb006ccbe1f5cc4e0380d1ce7806f5955c244074fd96abc55e27b77f01"}, - {file = "tokenizers-0.20.3-cp313-none-win32.whl", hash = "sha256:6e19e0f1d854d6ab7ea0c743d06e764d1d9a546932be0a67f33087645f00fe13"}, - {file = "tokenizers-0.20.3-cp313-none-win_amd64.whl", hash = "sha256:d50ede425c7e60966a9680d41b58b3a0950afa1bb570488e2972fa61662c4273"}, - {file = "tokenizers-0.20.3-cp37-cp37m-macosx_10_12_x86_64.whl", hash = "sha256:9adda1ff5fb9dcdf899ceca672a4e2ce9e797adb512a6467305ca3d8bfcfbdd0"}, - {file = "tokenizers-0.20.3-cp37-cp37m-macosx_11_0_arm64.whl", hash = "sha256:6dde2cae6004ba7a3badff4a11911cae03ebf23e97eebfc0e71fef2530e5074f"}, - {file = "tokenizers-0.20.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c4a7fd678b35614fca708579eb95b7587a5e8a6d328171bd2488fd9f27d82be4"}, - {file = "tokenizers-0.20.3-cp37-cp37m-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:1b80e3c7283a01a356bd2210f53d1a4a5d32b269c2024389ed0173137708d50e"}, - {file = "tokenizers-0.20.3-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a8cc0e8176b762973758a77f0d9c4467d310e33165fb74173418ca3734944da4"}, - {file = "tokenizers-0.20.3-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d5634b2e2f5f3d2b4439d2d74066e22eb4b1f04f3fea05cb2a3c12d89b5a3bcd"}, - {file = "tokenizers-0.20.3-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b4ba635165bc1ea46f2da8e5d80b5f70f6ec42161e38d96dbef33bb39df73964"}, - {file = "tokenizers-0.20.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:18e4c7c64172e7789bd8b07aa3087ea87c4c4de7e90937a2aa036b5d92332536"}, - {file = "tokenizers-0.20.3-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:1f74909ef7675c26d4095a817ec3393d67f3158ca4836c233212e5613ef640c4"}, - {file = "tokenizers-0.20.3-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = 
"sha256:0e9b81321a1e05b16487d312b4264984513f8b4a7556229cafac6e88c2036b09"}, - {file = "tokenizers-0.20.3-cp37-none-win32.whl", hash = "sha256:ab48184cd58b4a03022a2ec75b54c9f600ffea9a733612c02325ed636f353729"}, - {file = "tokenizers-0.20.3-cp37-none-win_amd64.whl", hash = "sha256:60ac483cebee1c12c71878523e768df02fa17e4c54412966cb3ac862c91b36c1"}, - {file = "tokenizers-0.20.3-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:3229ef103c89583d10b9378afa5d601b91e6337530a0988e17ca8d635329a996"}, - {file = "tokenizers-0.20.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:6ac52cc24bad3de865c7e65b1c4e7b70d00938a8ae09a92a453b8f676e714ad5"}, - {file = "tokenizers-0.20.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:04627b7b502fa6a2a005e1bd446fa4247d89abcb1afaa1b81eb90e21aba9a60f"}, - {file = "tokenizers-0.20.3-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c27ceb887f0e81a3c377eb4605dca7a95a81262761c0fba308d627b2abb98f2b"}, - {file = "tokenizers-0.20.3-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:65ab780194da4e1fcf5670523a2f377c4838ebf5249efe41fa1eddd2a84fb49d"}, - {file = "tokenizers-0.20.3-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:98d343134f47159e81f7f242264b0eb222e6b802f37173c8d7d7b64d5c9d1388"}, - {file = "tokenizers-0.20.3-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f2475bb004ab2009d29aff13b5047bfdb3d4b474f0aa9d4faa13a7f34dbbbb43"}, - {file = "tokenizers-0.20.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7b6583a65c01db1197c1eb36857ceba8ec329d53afadd268b42a6b04f4965724"}, - {file = "tokenizers-0.20.3-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:62d00ba208358c037eeab7bfc00a905adc67b2d31b68ab40ed09d75881e114ea"}, - {file = "tokenizers-0.20.3-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:0fc7a39e5bedc817bda395a798dfe2d9c5f7c71153c90d381b5135a0328d9520"}, - {file = "tokenizers-0.20.3-cp38-none-win32.whl", hash = "sha256:84d40ee0f8550d64d3ea92dd7d24a8557a9172165bdb986c9fb2503b4fe4e3b6"}, - {file = "tokenizers-0.20.3-cp38-none-win_amd64.whl", hash = "sha256:205a45246ed7f1718cf3785cff88450ba603352412aaf220ace026384aa3f1c0"}, - {file = "tokenizers-0.20.3-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:93e37f0269a11dc3b1a953f1fca9707f0929ebf8b4063c591c71a0664219988e"}, - {file = "tokenizers-0.20.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:f4cb0c614b0135e781de96c2af87e73da0389ac1458e2a97562ed26e29490d8d"}, - {file = "tokenizers-0.20.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7eb2fb1c432f5746b22f8a7f09fc18c4156cb0031c77f53cb19379d82d43297a"}, - {file = "tokenizers-0.20.3-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:bfa8d029bb156181b006643309d6b673615a24e4ed24cf03aa191d599b996f51"}, - {file = "tokenizers-0.20.3-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6f90549622de3bf476ad9f1dd6f3f952ec3ed6ab8615ae88ef060d0c5bfad55d"}, - {file = "tokenizers-0.20.3-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a1d469c74eebf5c43fd61cd9b030e271d17198edd7bd45392e03a3c091d7d6d4"}, - {file = "tokenizers-0.20.3-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:bee8f53b2594749f4460d53253bae55d718f04e9b633efa0f5df8938bd98e4f0"}, - {file = "tokenizers-0.20.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:938441babf3e5720e4459e306ef2809fb267680df9d1ff2873458b22aef60248"}, 
- {file = "tokenizers-0.20.3-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:7310ab23d7b0caebecc0e8be11a1146f320f5f07284000f6ea54793e83de1b75"}, - {file = "tokenizers-0.20.3-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:16121eb030a2b13094cfec936b0c12e8b4063c5f839591ea7d0212336d8f9921"}, - {file = "tokenizers-0.20.3-cp39-none-win32.whl", hash = "sha256:401cc21ef642ee235985d747f65e18f639464d377c70836c9003df208d582064"}, - {file = "tokenizers-0.20.3-cp39-none-win_amd64.whl", hash = "sha256:7498f3ea7746133335a6adb67a77cf77227a8b82c8483f644a2e5f86fea42b8d"}, - {file = "tokenizers-0.20.3-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:e919f2e3e68bb51dc31de4fcbbeff3bdf9c1cad489044c75e2b982a91059bd3c"}, - {file = "tokenizers-0.20.3-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:b8e9608f2773996cc272156e305bd79066163a66b0390fe21750aff62df1ac07"}, - {file = "tokenizers-0.20.3-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:39270a7050deaf50f7caff4c532c01b3c48f6608d42b3eacdebdc6795478c8df"}, - {file = "tokenizers-0.20.3-pp310-pypy310_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e005466632b1c5d2d2120f6de8aa768cc9d36cd1ab7d51d0c27a114c91a1e6ee"}, - {file = "tokenizers-0.20.3-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a07962340b36189b6c8feda552ea1bfeee6cf067ff922a1d7760662c2ee229e5"}, - {file = "tokenizers-0.20.3-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:55046ad3dd5f2b3c67501fcc8c9cbe3e901d8355f08a3b745e9b57894855f85b"}, - {file = "tokenizers-0.20.3-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:efcf0eb939988b627558aaf2b9dc3e56d759cad2e0cfa04fcab378e4b48fc4fd"}, - {file = "tokenizers-0.20.3-pp37-pypy37_pp73-macosx_10_12_x86_64.whl", hash = "sha256:f3558a7ae6a6d38a77dfce12172a1e2e1bf3e8871e744a1861cd7591ea9ebe24"}, - {file = "tokenizers-0.20.3-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4d53029fe44bc70c3ff14ef512460a0cf583495a0f8e2f4b70e26eb9438e38a9"}, - {file = "tokenizers-0.20.3-pp37-pypy37_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:57a2a56397b2bec5a629b516b23f0f8a3e4f978c7488d4a299980f8375954b85"}, - {file = "tokenizers-0.20.3-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b1e5bfaae740ef9ece000f8a07e78ac0e2b085c5ce9648f8593ddf0243c9f76d"}, - {file = "tokenizers-0.20.3-pp37-pypy37_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:fbaf3ea28fedfb2283da60e710aff25492e795a7397cad8a50f1e079b65a5a70"}, - {file = "tokenizers-0.20.3-pp37-pypy37_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:c47c037116310dc976eb96b008e41b9cfaba002ed8005848d4d632ee0b7ba9ae"}, - {file = "tokenizers-0.20.3-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:c31751f0721f58f5e19bb27c1acc259aeff860d8629c4e1a900b26a1979ada8e"}, - {file = "tokenizers-0.20.3-pp38-pypy38_pp73-macosx_11_0_arm64.whl", hash = "sha256:c697cbd3be7a79ea250ea5f380d6f12e534c543cfb137d5c734966b3ee4f34cc"}, - {file = "tokenizers-0.20.3-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b48971b88ef9130bf35b41b35fd857c3c4dae4a9cd7990ebc7fc03e59cc92438"}, - {file = "tokenizers-0.20.3-pp38-pypy38_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4e615de179bbe060ab33773f0d98a8a8572b5883dd7dac66c1de8c056c7e748c"}, - {file = "tokenizers-0.20.3-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:da1ec842035ed9999c62e45fbe0ff14b7e8a7e02bb97688cc6313cf65e5cd755"}, - {file = "tokenizers-0.20.3-pp38-pypy38_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:6ee4954c1dd23aadc27958dad759006e71659d497dcb0ef0c7c87ea992c16ebd"}, - {file = "tokenizers-0.20.3-pp38-pypy38_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:3eda46ca402751ec82553a321bf35a617b76bbed7586e768c02ccacbdda94d6d"}, - {file = "tokenizers-0.20.3-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:de082392a85eb0055cc055c535bff2f0cc15d7a000bdc36fbf601a0f3cf8507a"}, - {file = "tokenizers-0.20.3-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:c3db46cc0647bfd88263afdb739b92017a02a87ee30945cb3e86c7e25c7c9917"}, - {file = "tokenizers-0.20.3-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a292392f24ab9abac5cfa8197e5a6208f2e43723420217e1ceba0b4ec77816ac"}, - {file = "tokenizers-0.20.3-pp39-pypy39_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8dcd91f4e60f62b20d83a87a84fe062035a1e3ff49a8c2bbdeb2d441c8e311f4"}, - {file = "tokenizers-0.20.3-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:900991a2b8ee35961b1095db7e265342e0e42a84c1a594823d5ee9f8fb791958"}, - {file = "tokenizers-0.20.3-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:5a8d8261ca2133d4f98aa9627c748189502b3787537ba3d7e2beb4f7cfc5d627"}, - {file = "tokenizers-0.20.3-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:c4fd4d71e6deb6ddf99d8d0eab87d1d16f635898906e631914a9bae8ae9f2cfb"}, - {file = "tokenizers-0.20.3.tar.gz", hash = "sha256:2278b34c5d0dd78e087e1ca7f9b1dcbf129d80211afa645f214bd6e051037539"}, -] - -[package.dependencies] -huggingface-hub = ">=0.16.4,<1.0" - -[package.extras] -dev = ["tokenizers[testing]"] -docs = ["setuptools-rust", "sphinx", "sphinx-rtd-theme"] -testing = ["black (==22.3)", "datasets", "numpy", "pytest", "requests", "ruff"] - -[[package]] -name = "tomli" -version = "2.0.2" -description = "A lil' TOML parser" -optional = false -python-versions = ">=3.8" -files = [ - {file = "tomli-2.0.2-py3-none-any.whl", hash = "sha256:2ebe24485c53d303f690b0ec092806a085f07af5a5aa1464f3931eec36caaa38"}, - {file = "tomli-2.0.2.tar.gz", hash = "sha256:d46d457a85337051c36524bc5349dd91b1877838e2979ac5ced3e710ed8a60ed"}, -] - -[[package]] -name = "torch" -version = "2.5.1" -description = "Tensors and Dynamic neural networks in Python with strong GPU acceleration" -optional = true -python-versions = ">=3.8.0" -files = [ - {file = "torch-2.5.1-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:71328e1bbe39d213b8721678f9dcac30dfc452a46d586f1d514a6aa0a99d4744"}, - {file = "torch-2.5.1-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:34bfa1a852e5714cbfa17f27c49d8ce35e1b7af5608c4bc6e81392c352dbc601"}, - {file = "torch-2.5.1-cp310-cp310-win_amd64.whl", hash = "sha256:32a037bd98a241df6c93e4c789b683335da76a2ac142c0973675b715102dc5fa"}, - {file = "torch-2.5.1-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:23d062bf70776a3d04dbe74db950db2a5245e1ba4f27208a87f0d743b0d06e86"}, - {file = "torch-2.5.1-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:de5b7d6740c4b636ef4db92be922f0edc425b65ed78c5076c43c42d362a45457"}, - {file = "torch-2.5.1-cp311-cp311-manylinux2014_aarch64.whl", hash = "sha256:340ce0432cad0d37f5a31be666896e16788f1adf8ad7be481196b503dad675b9"}, - {file = "torch-2.5.1-cp311-cp311-win_amd64.whl", hash = "sha256:603c52d2fe06433c18b747d25f5c333f9c1d58615620578c326d66f258686f9a"}, - {file = "torch-2.5.1-cp311-none-macosx_11_0_arm64.whl", hash = 
"sha256:31f8c39660962f9ae4eeec995e3049b5492eb7360dd4f07377658ef4d728fa4c"}, - {file = "torch-2.5.1-cp312-cp312-manylinux1_x86_64.whl", hash = "sha256:ed231a4b3a5952177fafb661213d690a72caaad97d5824dd4fc17ab9e15cec03"}, - {file = "torch-2.5.1-cp312-cp312-manylinux2014_aarch64.whl", hash = "sha256:3f4b7f10a247e0dcd7ea97dc2d3bfbfc90302ed36d7f3952b0008d0df264e697"}, - {file = "torch-2.5.1-cp312-cp312-win_amd64.whl", hash = "sha256:73e58e78f7d220917c5dbfad1a40e09df9929d3b95d25e57d9f8558f84c9a11c"}, - {file = "torch-2.5.1-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:8c712df61101964eb11910a846514011f0b6f5920c55dbf567bff8a34163d5b1"}, - {file = "torch-2.5.1-cp313-cp313-manylinux1_x86_64.whl", hash = "sha256:9b61edf3b4f6e3b0e0adda8b3960266b9009d02b37555971f4d1c8f7a05afed7"}, - {file = "torch-2.5.1-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:1f3b7fb3cf7ab97fae52161423f81be8c6b8afac8d9760823fd623994581e1a3"}, - {file = "torch-2.5.1-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:7974e3dce28b5a21fb554b73e1bc9072c25dde873fa00d54280861e7a009d7dc"}, - {file = "torch-2.5.1-cp39-cp39-win_amd64.whl", hash = "sha256:46c817d3ea33696ad3b9df5e774dba2257e9a4cd3c4a3afbf92f6bb13ac5ce2d"}, - {file = "torch-2.5.1-cp39-none-macosx_11_0_arm64.whl", hash = "sha256:8046768b7f6d35b85d101b4b38cba8aa2f3cd51952bc4c06a49580f2ce682291"}, -] - -[package.dependencies] -filelock = "*" -fsspec = "*" -jinja2 = "*" -networkx = "*" -nvidia-cublas-cu12 = {version = "12.4.5.8", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} -nvidia-cuda-cupti-cu12 = {version = "12.4.127", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} -nvidia-cuda-nvrtc-cu12 = {version = "12.4.127", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} -nvidia-cuda-runtime-cu12 = {version = "12.4.127", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} -nvidia-cudnn-cu12 = {version = "9.1.0.70", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} -nvidia-cufft-cu12 = {version = "11.2.1.3", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} -nvidia-curand-cu12 = {version = "10.3.5.147", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} -nvidia-cusolver-cu12 = {version = "11.6.1.9", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} -nvidia-cusparse-cu12 = {version = "12.3.1.170", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} -nvidia-nccl-cu12 = {version = "2.21.5", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} -nvidia-nvjitlink-cu12 = {version = "12.4.127", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} -nvidia-nvtx-cu12 = {version = "12.4.127", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} -setuptools = {version = "*", markers = "python_version >= \"3.12\""} -sympy = {version = "1.13.1", markers = "python_version >= \"3.9\""} -triton = {version = "3.1.0", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and python_version < \"3.13\""} -typing-extensions = ">=4.8.0" - -[package.extras] -opt-einsum = ["opt-einsum (>=3.3)"] -optree = ["optree (>=0.12.0)"] - -[[package]] -name = "tqdm" -version = "4.66.5" -description = "Fast, Extensible Progress Meter" -optional = false -python-versions = ">=3.7" -files = [ - {file = "tqdm-4.66.5-py3-none-any.whl", hash = 
"sha256:90279a3770753eafc9194a0364852159802111925aa30eb3f9d85b0e805ac7cd"}, - {file = "tqdm-4.66.5.tar.gz", hash = "sha256:e1020aef2e5096702d8a025ac7d16b1577279c9d63f8375b63083e9a5f0fcbad"}, -] - -[package.dependencies] -colorama = {version = "*", markers = "platform_system == \"Windows\""} - -[package.extras] -dev = ["pytest (>=6)", "pytest-cov", "pytest-timeout", "pytest-xdist"] -notebook = ["ipywidgets (>=6)"] -slack = ["slack-sdk"] -telegram = ["requests"] - -[[package]] -name = "transformers" -version = "4.46.3" -description = "State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow" -optional = false -python-versions = ">=3.8.0" -files = [ - {file = "transformers-4.46.3-py3-none-any.whl", hash = "sha256:a12ef6f52841fd190a3e5602145b542d03507222f2c64ebb7ee92e8788093aef"}, - {file = "transformers-4.46.3.tar.gz", hash = "sha256:8ee4b3ae943fe33e82afff8e837f4b052058b07ca9be3cb5b729ed31295f72cc"}, -] - -[package.dependencies] -filelock = "*" -huggingface-hub = ">=0.23.2,<1.0" -numpy = ">=1.17" -packaging = ">=20.0" -pyyaml = ">=5.1" -regex = "!=2019.12.17" -requests = "*" -safetensors = ">=0.4.1" -tokenizers = ">=0.20,<0.21" -tqdm = ">=4.27" - -[package.extras] -accelerate = ["accelerate (>=0.26.0)"] -agents = ["Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "datasets (!=2.5.0)", "diffusers", "opencv-python", "sentencepiece (>=0.1.91,!=0.1.92)", "torch"] -all = ["Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "av (==9.2.0)", "codecarbon (==1.2.0)", "flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "librosa", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "phonemizer", "protobuf", "pyctcdecode (>=0.4.0)", "ray[tune] (>=2.7.0)", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timm (<=0.9.16)", "tokenizers (>=0.20,<0.21)", "torch", "torchaudio", "torchvision"] -audio = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"] -benchmark = ["optimum-benchmark (>=0.3.0)"] -codecarbon = ["codecarbon (==1.2.0)"] -deepspeed = ["accelerate (>=0.26.0)", "deepspeed (>=0.9.3)"] -deepspeed-testing = ["GitPython (<3.1.19)", "accelerate (>=0.26.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "deepspeed (>=0.9.3)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "nltk (<=3.8.1)", "optuna", "parameterized", "protobuf", "psutil", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "timeout-decorator"] -dev = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "av (==9.2.0)", "beautifulsoup4", "codecarbon (==1.2.0)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "flax (>=0.4.1,<=0.7.0)", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "libcst", "librosa", "nltk (<=3.8.1)", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rjieba", "rouge-score 
(!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "timm (<=0.9.16)", "tokenizers (>=0.20,<0.21)", "torch", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"] -dev-tensorflow = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "isort (>=5.5.4)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "libcst", "librosa", "nltk (<=3.8.1)", "onnxconverter-common", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "tokenizers (>=0.20,<0.21)", "urllib3 (<2.0.0)"] -dev-torch = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "beautifulsoup4", "codecarbon (==1.2.0)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "kenlm", "libcst", "librosa", "nltk (<=3.8.1)", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "timeout-decorator", "timm (<=0.9.16)", "tokenizers (>=0.20,<0.21)", "torch", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"] -flax = ["flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "optax (>=0.0.8,<=0.1.4)", "scipy (<1.13.0)"] -flax-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"] -ftfy = ["ftfy"] -integrations = ["optuna", "ray[tune] (>=2.7.0)", "sigopt"] -ja = ["fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "rhoknp (>=1.1.0,<1.3.1)", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)"] -modelcreation = ["cookiecutter (==1.7.3)"] -natten = ["natten (>=0.14.6,<0.15.0)"] -onnx = ["onnxconverter-common", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "tf2onnx"] -onnxruntime = ["onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)"] -optuna = ["optuna"] -quality = ["GitPython (<3.1.19)", "datasets (!=2.5.0)", "isort (>=5.5.4)", "libcst", "rich", "ruff (==0.5.1)", "urllib3 (<2.0.0)"] -ray = ["ray[tune] (>=2.7.0)"] -retrieval = ["datasets (!=2.5.0)", "faiss-cpu"] -ruff = ["ruff (==0.5.1)"] -sagemaker = ["sagemaker (>=2.31.0)"] -sentencepiece = ["protobuf", "sentencepiece (>=0.1.91,!=0.1.92)"] -serving = ["fastapi", "pydantic", "starlette", 
"uvicorn"] -sigopt = ["sigopt"] -sklearn = ["scikit-learn"] -speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)", "torchaudio"] -testing = ["GitPython (<3.1.19)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "nltk (<=3.8.1)", "parameterized", "psutil", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "timeout-decorator"] -tf = ["keras-nlp (>=0.3.1,<0.14.0)", "onnxconverter-common", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx"] -tf-cpu = ["keras (>2.9,<2.16)", "keras-nlp (>=0.3.1,<0.14.0)", "onnxconverter-common", "tensorflow-cpu (>2.9,<2.16)", "tensorflow-probability (<0.24)", "tensorflow-text (<2.16)", "tf2onnx"] -tf-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"] -tiktoken = ["blobfile", "tiktoken"] -timm = ["timm (<=0.9.16)"] -tokenizers = ["tokenizers (>=0.20,<0.21)"] -torch = ["accelerate (>=0.26.0)", "torch"] -torch-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)", "torchaudio"] -torch-vision = ["Pillow (>=10.0.1,<=15.0)", "torchvision"] -torchhub = ["filelock", "huggingface-hub (>=0.23.2,<1.0)", "importlib-metadata", "numpy (>=1.17)", "packaging (>=20.0)", "protobuf", "regex (!=2019.12.17)", "requests", "sentencepiece (>=0.1.91,!=0.1.92)", "tokenizers (>=0.20,<0.21)", "torch", "tqdm (>=4.27)"] -video = ["av (==9.2.0)"] -vision = ["Pillow (>=10.0.1,<=15.0)"] - -[[package]] -name = "triton" -version = "3.1.0" -description = "A language and compiler for custom Deep Learning operations" -optional = true -python-versions = "*" -files = [ - {file = "triton-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6b0dd10a925263abbe9fa37dcde67a5e9b2383fc269fdf59f5657cac38c5d1d8"}, - {file = "triton-3.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0f34f6e7885d1bf0eaaf7ba875a5f0ce6f3c13ba98f9503651c1e6dc6757ed5c"}, - {file = "triton-3.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c8182f42fd8080a7d39d666814fa36c5e30cc00ea7eeeb1a2983dbb4c99a0fdc"}, - {file = "triton-3.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6dadaca7fc24de34e180271b5cf864c16755702e9f63a16f62df714a8099126a"}, - {file = "triton-3.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aafa9a20cd0d9fee523cd4504aa7131807a864cd77dcf6efe7e981f18b8c6c11"}, -] - -[package.dependencies] -filelock = "*" - -[package.extras] -build = ["cmake (>=3.20)", "lit"] -tests = ["autopep8", "flake8", "isort", "llnl-hatchet", "numpy", "pytest", "scipy (>=1.7.1)"] -tutorials = ["matplotlib", "pandas", "tabulate"] - -[[package]] -name = "typer" -version = "0.12.5" -description = "Typer, build great CLIs. Easy to code. Based on Python type hints." 
-optional = false -python-versions = ">=3.7" -files = [ - {file = "typer-0.12.5-py3-none-any.whl", hash = "sha256:62fe4e471711b147e3365034133904df3e235698399bc4de2b36c8579298d52b"}, - {file = "typer-0.12.5.tar.gz", hash = "sha256:f592f089bedcc8ec1b974125d64851029c3b1af145f04aca64d69410f0c9b722"}, -] - -[package.dependencies] -click = ">=8.0.0" -rich = ">=10.11.0" -shellingham = ">=1.3.0" -typing-extensions = ">=3.7.4.3" - -[[package]] -name = "typing-extensions" -version = "4.12.2" -description = "Backported and Experimental Type Hints for Python 3.8+" -optional = false -python-versions = ">=3.8" -files = [ - {file = "typing_extensions-4.12.2-py3-none-any.whl", hash = "sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d"}, - {file = "typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8"}, -] - -[[package]] -name = "tzdata" -version = "2024.2" -description = "Provider of IANA time zone data" -optional = true -python-versions = ">=2" -files = [ - {file = "tzdata-2024.2-py2.py3-none-any.whl", hash = "sha256:a48093786cdcde33cad18c2555e8532f34422074448fbc874186f0abd79565cd"}, - {file = "tzdata-2024.2.tar.gz", hash = "sha256:7d85cc416e9382e69095b7bdf4afd9e3880418a2413feec7069d533d6b4e31cc"}, -] - -[[package]] -name = "urllib3" -version = "2.2.3" -description = "HTTP library with thread-safe connection pooling, file post, and more." -optional = false -python-versions = ">=3.8" -files = [ - {file = "urllib3-2.2.3-py3-none-any.whl", hash = "sha256:ca899ca043dcb1bafa3e262d73aa25c465bfb49e0bd9dd5d59f1d0acba2f8fac"}, - {file = "urllib3-2.2.3.tar.gz", hash = "sha256:e7d814a81dad81e6caf2ec9fdedb284ecc9c73076b62654547cc64ccdcae26e9"}, -] - -[package.extras] -brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)"] -h2 = ["h2 (>=4,<5)"] -socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"] -zstd = ["zstandard (>=0.18.0)"] - -[[package]] -name = "win32-setctime" -version = "1.1.0" -description = "A small Python utility to set file creation time on Windows" -optional = false -python-versions = ">=3.5" -files = [ - {file = "win32_setctime-1.1.0-py3-none-any.whl", hash = "sha256:231db239e959c2fe7eb1d7dc129f11172354f98361c4fa2d6d2d7e278baa8aad"}, - {file = "win32_setctime-1.1.0.tar.gz", hash = "sha256:15cf5750465118d6929ae4de4eb46e8edae9a5634350c01ba582df868e932cb2"}, -] - -[package.extras] -dev = ["black (>=19.3b0)", "pytest (>=4.6.2)"] - -[[package]] -name = "wrapt" -version = "1.16.0" -description = "Module for decorators, wrappers and monkey patching." 
-optional = false -python-versions = ">=3.6" -files = [ - {file = "wrapt-1.16.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ffa565331890b90056c01db69c0fe634a776f8019c143a5ae265f9c6bc4bd6d4"}, - {file = "wrapt-1.16.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:e4fdb9275308292e880dcbeb12546df7f3e0f96c6b41197e0cf37d2826359020"}, - {file = "wrapt-1.16.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bb2dee3874a500de01c93d5c71415fcaef1d858370d405824783e7a8ef5db440"}, - {file = "wrapt-1.16.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2a88e6010048489cda82b1326889ec075a8c856c2e6a256072b28eaee3ccf487"}, - {file = "wrapt-1.16.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ac83a914ebaf589b69f7d0a1277602ff494e21f4c2f743313414378f8f50a4cf"}, - {file = "wrapt-1.16.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:73aa7d98215d39b8455f103de64391cb79dfcad601701a3aa0dddacf74911d72"}, - {file = "wrapt-1.16.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:807cc8543a477ab7422f1120a217054f958a66ef7314f76dd9e77d3f02cdccd0"}, - {file = "wrapt-1.16.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:bf5703fdeb350e36885f2875d853ce13172ae281c56e509f4e6eca049bdfb136"}, - {file = "wrapt-1.16.0-cp310-cp310-win32.whl", hash = "sha256:f6b2d0c6703c988d334f297aa5df18c45e97b0af3679bb75059e0e0bd8b1069d"}, - {file = "wrapt-1.16.0-cp310-cp310-win_amd64.whl", hash = "sha256:decbfa2f618fa8ed81c95ee18a387ff973143c656ef800c9f24fb7e9c16054e2"}, - {file = "wrapt-1.16.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:1a5db485fe2de4403f13fafdc231b0dbae5eca4359232d2efc79025527375b09"}, - {file = "wrapt-1.16.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:75ea7d0ee2a15733684badb16de6794894ed9c55aa5e9903260922f0482e687d"}, - {file = "wrapt-1.16.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a452f9ca3e3267cd4d0fcf2edd0d035b1934ac2bd7e0e57ac91ad6b95c0c6389"}, - {file = "wrapt-1.16.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:43aa59eadec7890d9958748db829df269f0368521ba6dc68cc172d5d03ed8060"}, - {file = "wrapt-1.16.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:72554a23c78a8e7aa02abbd699d129eead8b147a23c56e08d08dfc29cfdddca1"}, - {file = "wrapt-1.16.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:d2efee35b4b0a347e0d99d28e884dfd82797852d62fcd7ebdeee26f3ceb72cf3"}, - {file = "wrapt-1.16.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:6dcfcffe73710be01d90cae08c3e548d90932d37b39ef83969ae135d36ef3956"}, - {file = "wrapt-1.16.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:eb6e651000a19c96f452c85132811d25e9264d836951022d6e81df2fff38337d"}, - {file = "wrapt-1.16.0-cp311-cp311-win32.whl", hash = "sha256:66027d667efe95cc4fa945af59f92c5a02c6f5bb6012bff9e60542c74c75c362"}, - {file = "wrapt-1.16.0-cp311-cp311-win_amd64.whl", hash = "sha256:aefbc4cb0a54f91af643660a0a150ce2c090d3652cf4052a5397fb2de549cd89"}, - {file = "wrapt-1.16.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:5eb404d89131ec9b4f748fa5cfb5346802e5ee8836f57d516576e61f304f3b7b"}, - {file = "wrapt-1.16.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:9090c9e676d5236a6948330e83cb89969f433b1943a558968f659ead07cb3b36"}, - {file = 
"wrapt-1.16.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:94265b00870aa407bd0cbcfd536f17ecde43b94fb8d228560a1e9d3041462d73"}, - {file = "wrapt-1.16.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f2058f813d4f2b5e3a9eb2eb3faf8f1d99b81c3e51aeda4b168406443e8ba809"}, - {file = "wrapt-1.16.0-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:98b5e1f498a8ca1858a1cdbffb023bfd954da4e3fa2c0cb5853d40014557248b"}, - {file = "wrapt-1.16.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:14d7dc606219cdd7405133c713f2c218d4252f2a469003f8c46bb92d5d095d81"}, - {file = "wrapt-1.16.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:49aac49dc4782cb04f58986e81ea0b4768e4ff197b57324dcbd7699c5dfb40b9"}, - {file = "wrapt-1.16.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:418abb18146475c310d7a6dc71143d6f7adec5b004ac9ce08dc7a34e2babdc5c"}, - {file = "wrapt-1.16.0-cp312-cp312-win32.whl", hash = "sha256:685f568fa5e627e93f3b52fda002c7ed2fa1800b50ce51f6ed1d572d8ab3e7fc"}, - {file = "wrapt-1.16.0-cp312-cp312-win_amd64.whl", hash = "sha256:dcdba5c86e368442528f7060039eda390cc4091bfd1dca41e8046af7c910dda8"}, - {file = "wrapt-1.16.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:d462f28826f4657968ae51d2181a074dfe03c200d6131690b7d65d55b0f360f8"}, - {file = "wrapt-1.16.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a33a747400b94b6d6b8a165e4480264a64a78c8a4c734b62136062e9a248dd39"}, - {file = "wrapt-1.16.0-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b3646eefa23daeba62643a58aac816945cadc0afaf21800a1421eeba5f6cfb9c"}, - {file = "wrapt-1.16.0-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ebf019be5c09d400cf7b024aa52b1f3aeebeff51550d007e92c3c1c4afc2a40"}, - {file = "wrapt-1.16.0-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:0d2691979e93d06a95a26257adb7bfd0c93818e89b1406f5a28f36e0d8c1e1fc"}, - {file = "wrapt-1.16.0-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:1acd723ee2a8826f3d53910255643e33673e1d11db84ce5880675954183ec47e"}, - {file = "wrapt-1.16.0-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:bc57efac2da352a51cc4658878a68d2b1b67dbe9d33c36cb826ca449d80a8465"}, - {file = "wrapt-1.16.0-cp36-cp36m-win32.whl", hash = "sha256:da4813f751142436b075ed7aa012a8778aa43a99f7b36afe9b742d3ed8bdc95e"}, - {file = "wrapt-1.16.0-cp36-cp36m-win_amd64.whl", hash = "sha256:6f6eac2360f2d543cc875a0e5efd413b6cbd483cb3ad7ebf888884a6e0d2e966"}, - {file = "wrapt-1.16.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:a0ea261ce52b5952bf669684a251a66df239ec6d441ccb59ec7afa882265d593"}, - {file = "wrapt-1.16.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7bd2d7ff69a2cac767fbf7a2b206add2e9a210e57947dd7ce03e25d03d2de292"}, - {file = "wrapt-1.16.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9159485323798c8dc530a224bd3ffcf76659319ccc7bbd52e01e73bd0241a0c5"}, - {file = "wrapt-1.16.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a86373cf37cd7764f2201b76496aba58a52e76dedfaa698ef9e9688bfd9e41cf"}, - {file = "wrapt-1.16.0-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:73870c364c11f03ed072dda68ff7aea6d2a3a5c3fe250d917a429c7432e15228"}, - {file = 
"wrapt-1.16.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:b935ae30c6e7400022b50f8d359c03ed233d45b725cfdd299462f41ee5ffba6f"}, - {file = "wrapt-1.16.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:db98ad84a55eb09b3c32a96c576476777e87c520a34e2519d3e59c44710c002c"}, - {file = "wrapt-1.16.0-cp37-cp37m-win32.whl", hash = "sha256:9153ed35fc5e4fa3b2fe97bddaa7cbec0ed22412b85bcdaf54aeba92ea37428c"}, - {file = "wrapt-1.16.0-cp37-cp37m-win_amd64.whl", hash = "sha256:66dfbaa7cfa3eb707bbfcd46dab2bc6207b005cbc9caa2199bcbc81d95071a00"}, - {file = "wrapt-1.16.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1dd50a2696ff89f57bd8847647a1c363b687d3d796dc30d4dd4a9d1689a706f0"}, - {file = "wrapt-1.16.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:44a2754372e32ab315734c6c73b24351d06e77ffff6ae27d2ecf14cf3d229202"}, - {file = "wrapt-1.16.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8e9723528b9f787dc59168369e42ae1c3b0d3fadb2f1a71de14531d321ee05b0"}, - {file = "wrapt-1.16.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dbed418ba5c3dce92619656802cc5355cb679e58d0d89b50f116e4a9d5a9603e"}, - {file = "wrapt-1.16.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:941988b89b4fd6b41c3f0bfb20e92bd23746579736b7343283297c4c8cbae68f"}, - {file = "wrapt-1.16.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:6a42cd0cfa8ffc1915aef79cb4284f6383d8a3e9dcca70c445dcfdd639d51267"}, - {file = "wrapt-1.16.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:1ca9b6085e4f866bd584fb135a041bfc32cab916e69f714a7d1d397f8c4891ca"}, - {file = "wrapt-1.16.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:d5e49454f19ef621089e204f862388d29e6e8d8b162efce05208913dde5b9ad6"}, - {file = "wrapt-1.16.0-cp38-cp38-win32.whl", hash = "sha256:c31f72b1b6624c9d863fc095da460802f43a7c6868c5dda140f51da24fd47d7b"}, - {file = "wrapt-1.16.0-cp38-cp38-win_amd64.whl", hash = "sha256:490b0ee15c1a55be9c1bd8609b8cecd60e325f0575fc98f50058eae366e01f41"}, - {file = "wrapt-1.16.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9b201ae332c3637a42f02d1045e1d0cccfdc41f1f2f801dafbaa7e9b4797bfc2"}, - {file = "wrapt-1.16.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:2076fad65c6736184e77d7d4729b63a6d1ae0b70da4868adeec40989858eb3fb"}, - {file = "wrapt-1.16.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c5cd603b575ebceca7da5a3a251e69561bec509e0b46e4993e1cac402b7247b8"}, - {file = "wrapt-1.16.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b47cfad9e9bbbed2339081f4e346c93ecd7ab504299403320bf85f7f85c7d46c"}, - {file = "wrapt-1.16.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f8212564d49c50eb4565e502814f694e240c55551a5f1bc841d4fcaabb0a9b8a"}, - {file = "wrapt-1.16.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:5f15814a33e42b04e3de432e573aa557f9f0f56458745c2074952f564c50e664"}, - {file = "wrapt-1.16.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:db2e408d983b0e61e238cf579c09ef7020560441906ca990fe8412153e3b291f"}, - {file = "wrapt-1.16.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:edfad1d29c73f9b863ebe7082ae9321374ccb10879eeabc84ba3b69f2579d537"}, - {file = "wrapt-1.16.0-cp39-cp39-win32.whl", hash = "sha256:ed867c42c268f876097248e05b6117a65bcd1e63b779e916fe2e33cd6fd0d3c3"}, - {file = "wrapt-1.16.0-cp39-cp39-win_amd64.whl", 
hash = "sha256:eb1b046be06b0fce7249f1d025cd359b4b80fc1c3e24ad9eca33e0dcdb2e4a35"}, - {file = "wrapt-1.16.0-py3-none-any.whl", hash = "sha256:6906c4100a8fcbf2fa735f6059214bb13b97f75b1a61777fcf6432121ef12ef1"}, - {file = "wrapt-1.16.0.tar.gz", hash = "sha256:5f370f952971e7d17c7d1ead40e49f32345a7f7a5373571ef44d800d06b1899d"}, -] - -[[package]] -name = "xxhash" -version = "3.5.0" -description = "Python binding for xxHash" -optional = true -python-versions = ">=3.7" -files = [ - {file = "xxhash-3.5.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ece616532c499ee9afbb83078b1b952beffef121d989841f7f4b3dc5ac0fd212"}, - {file = "xxhash-3.5.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3171f693dbc2cef6477054a665dc255d996646b4023fe56cb4db80e26f4cc520"}, - {file = "xxhash-3.5.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7c5d3e570ef46adaf93fc81b44aca6002b5a4d8ca11bd0580c07eac537f36680"}, - {file = "xxhash-3.5.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7cb29a034301e2982df8b1fe6328a84f4b676106a13e9135a0d7e0c3e9f806da"}, - {file = "xxhash-3.5.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5d0d307d27099bb0cbeea7260eb39ed4fdb99c5542e21e94bb6fd29e49c57a23"}, - {file = "xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c0342aafd421795d740e514bc9858ebddfc705a75a8c5046ac56d85fe97bf196"}, - {file = "xxhash-3.5.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3dbbd9892c5ebffeca1ed620cf0ade13eb55a0d8c84e0751a6653adc6ac40d0c"}, - {file = "xxhash-3.5.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:4cc2d67fdb4d057730c75a64c5923abfa17775ae234a71b0200346bfb0a7f482"}, - {file = "xxhash-3.5.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:ec28adb204b759306a3d64358a5e5c07d7b1dd0ccbce04aa76cb9377b7b70296"}, - {file = "xxhash-3.5.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:1328f6d8cca2b86acb14104e381225a3d7b42c92c4b86ceae814e5c400dbb415"}, - {file = "xxhash-3.5.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:8d47ebd9f5d9607fd039c1fbf4994e3b071ea23eff42f4ecef246ab2b7334198"}, - {file = "xxhash-3.5.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:b96d559e0fcddd3343c510a0fe2b127fbff16bf346dd76280b82292567523442"}, - {file = "xxhash-3.5.0-cp310-cp310-win32.whl", hash = "sha256:61c722ed8d49ac9bc26c7071eeaa1f6ff24053d553146d5df031802deffd03da"}, - {file = "xxhash-3.5.0-cp310-cp310-win_amd64.whl", hash = "sha256:9bed5144c6923cc902cd14bb8963f2d5e034def4486ab0bbe1f58f03f042f9a9"}, - {file = "xxhash-3.5.0-cp310-cp310-win_arm64.whl", hash = "sha256:893074d651cf25c1cc14e3bea4fceefd67f2921b1bb8e40fcfeba56820de80c6"}, - {file = "xxhash-3.5.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:02c2e816896dc6f85922ced60097bcf6f008dedfc5073dcba32f9c8dd786f3c1"}, - {file = "xxhash-3.5.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:6027dcd885e21581e46d3c7f682cfb2b870942feeed58a21c29583512c3f09f8"}, - {file = "xxhash-3.5.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1308fa542bbdbf2fa85e9e66b1077eea3a88bef38ee8a06270b4298a7a62a166"}, - {file = "xxhash-3.5.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c28b2fdcee797e1c1961cd3bcd3d545cab22ad202c846235197935e1df2f8ef7"}, - {file = "xxhash-3.5.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:924361811732ddad75ff23e90efd9ccfda4f664132feecb90895bade6a1b4623"}, - {file = "xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:89997aa1c4b6a5b1e5b588979d1da048a3c6f15e55c11d117a56b75c84531f5a"}, - {file = "xxhash-3.5.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:685c4f4e8c59837de103344eb1c8a3851f670309eb5c361f746805c5471b8c88"}, - {file = "xxhash-3.5.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:dbd2ecfbfee70bc1a4acb7461fa6af7748ec2ab08ac0fa298f281c51518f982c"}, - {file = "xxhash-3.5.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:25b5a51dc3dfb20a10833c8eee25903fd2e14059e9afcd329c9da20609a307b2"}, - {file = "xxhash-3.5.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:a8fb786fb754ef6ff8c120cb96629fb518f8eb5a61a16aac3a979a9dbd40a084"}, - {file = "xxhash-3.5.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:a905ad00ad1e1c34fe4e9d7c1d949ab09c6fa90c919860c1534ff479f40fd12d"}, - {file = "xxhash-3.5.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:963be41bcd49f53af6d795f65c0da9b4cc518c0dd9c47145c98f61cb464f4839"}, - {file = "xxhash-3.5.0-cp311-cp311-win32.whl", hash = "sha256:109b436096d0a2dd039c355fa3414160ec4d843dfecc64a14077332a00aeb7da"}, - {file = "xxhash-3.5.0-cp311-cp311-win_amd64.whl", hash = "sha256:b702f806693201ad6c0a05ddbbe4c8f359626d0b3305f766077d51388a6bac58"}, - {file = "xxhash-3.5.0-cp311-cp311-win_arm64.whl", hash = "sha256:c4dcb4120d0cc3cc448624147dba64e9021b278c63e34a38789b688fd0da9bf3"}, - {file = "xxhash-3.5.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:14470ace8bd3b5d51318782cd94e6f94431974f16cb3b8dc15d52f3b69df8e00"}, - {file = "xxhash-3.5.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:59aa1203de1cb96dbeab595ded0ad0c0056bb2245ae11fac11c0ceea861382b9"}, - {file = "xxhash-3.5.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:08424f6648526076e28fae6ea2806c0a7d504b9ef05ae61d196d571e5c879c84"}, - {file = "xxhash-3.5.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:61a1ff00674879725b194695e17f23d3248998b843eb5e933007ca743310f793"}, - {file = "xxhash-3.5.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f2f2c61bee5844d41c3eb015ac652a0229e901074951ae48581d58bfb2ba01be"}, - {file = "xxhash-3.5.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9d32a592cac88d18cc09a89172e1c32d7f2a6e516c3dfde1b9adb90ab5df54a6"}, - {file = "xxhash-3.5.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:70dabf941dede727cca579e8c205e61121afc9b28516752fd65724be1355cc90"}, - {file = "xxhash-3.5.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:e5d0ddaca65ecca9c10dcf01730165fd858533d0be84c75c327487c37a906a27"}, - {file = "xxhash-3.5.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:3e5b5e16c5a480fe5f59f56c30abdeba09ffd75da8d13f6b9b6fd224d0b4d0a2"}, - {file = "xxhash-3.5.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:149b7914451eb154b3dfaa721315117ea1dac2cc55a01bfbd4df7c68c5dd683d"}, - {file = "xxhash-3.5.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:eade977f5c96c677035ff39c56ac74d851b1cca7d607ab3d8f23c6b859379cab"}, - {file = "xxhash-3.5.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:fa9f547bd98f5553d03160967866a71056a60960be00356a15ecc44efb40ba8e"}, - {file = "xxhash-3.5.0-cp312-cp312-win32.whl", hash = 
"sha256:f7b58d1fd3551b8c80a971199543379be1cee3d0d409e1f6d8b01c1a2eebf1f8"}, - {file = "xxhash-3.5.0-cp312-cp312-win_amd64.whl", hash = "sha256:fa0cafd3a2af231b4e113fba24a65d7922af91aeb23774a8b78228e6cd785e3e"}, - {file = "xxhash-3.5.0-cp312-cp312-win_arm64.whl", hash = "sha256:586886c7e89cb9828bcd8a5686b12e161368e0064d040e225e72607b43858ba2"}, - {file = "xxhash-3.5.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:37889a0d13b0b7d739cfc128b1c902f04e32de17b33d74b637ad42f1c55101f6"}, - {file = "xxhash-3.5.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:97a662338797c660178e682f3bc180277b9569a59abfb5925e8620fba00b9fc5"}, - {file = "xxhash-3.5.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7f85e0108d51092bdda90672476c7d909c04ada6923c14ff9d913c4f7dc8a3bc"}, - {file = "xxhash-3.5.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:cd2fd827b0ba763ac919440042302315c564fdb797294d86e8cdd4578e3bc7f3"}, - {file = "xxhash-3.5.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:82085c2abec437abebf457c1d12fccb30cc8b3774a0814872511f0f0562c768c"}, - {file = "xxhash-3.5.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:07fda5de378626e502b42b311b049848c2ef38784d0d67b6f30bb5008642f8eb"}, - {file = "xxhash-3.5.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c279f0d2b34ef15f922b77966640ade58b4ccdfef1c4d94b20f2a364617a493f"}, - {file = "xxhash-3.5.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:89e66ceed67b213dec5a773e2f7a9e8c58f64daeb38c7859d8815d2c89f39ad7"}, - {file = "xxhash-3.5.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:bcd51708a633410737111e998ceb3b45d3dbc98c0931f743d9bb0a209033a326"}, - {file = "xxhash-3.5.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:3ff2c0a34eae7df88c868be53a8dd56fbdf592109e21d4bfa092a27b0bf4a7bf"}, - {file = "xxhash-3.5.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:4e28503dccc7d32e0b9817aa0cbfc1f45f563b2c995b7a66c4c8a0d232e840c7"}, - {file = "xxhash-3.5.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:a6c50017518329ed65a9e4829154626f008916d36295b6a3ba336e2458824c8c"}, - {file = "xxhash-3.5.0-cp313-cp313-win32.whl", hash = "sha256:53a068fe70301ec30d868ece566ac90d873e3bb059cf83c32e76012c889b8637"}, - {file = "xxhash-3.5.0-cp313-cp313-win_amd64.whl", hash = "sha256:80babcc30e7a1a484eab952d76a4f4673ff601f54d5142c26826502740e70b43"}, - {file = "xxhash-3.5.0-cp313-cp313-win_arm64.whl", hash = "sha256:4811336f1ce11cac89dcbd18f3a25c527c16311709a89313c3acaf771def2d4b"}, - {file = "xxhash-3.5.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:6e5f70f6dca1d3b09bccb7daf4e087075ff776e3da9ac870f86ca316736bb4aa"}, - {file = "xxhash-3.5.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2e76e83efc7b443052dd1e585a76201e40b3411fe3da7af4fe434ec51b2f163b"}, - {file = "xxhash-3.5.0-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:33eac61d0796ca0591f94548dcfe37bb193671e0c9bcf065789b5792f2eda644"}, - {file = "xxhash-3.5.0-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0ec70a89be933ea49222fafc3999987d7899fc676f688dd12252509434636622"}, - {file = "xxhash-3.5.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd86b8e7f703ec6ff4f351cfdb9f428955859537125904aa8c963604f2e9d3e7"}, - {file = 
"xxhash-3.5.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0adfbd36003d9f86c8c97110039f7539b379f28656a04097e7434d3eaf9aa131"}, - {file = "xxhash-3.5.0-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:63107013578c8a730419adc05608756c3fa640bdc6abe806c3123a49fb829f43"}, - {file = "xxhash-3.5.0-cp37-cp37m-musllinux_1_2_i686.whl", hash = "sha256:683b94dbd1ca67557850b86423318a2e323511648f9f3f7b1840408a02b9a48c"}, - {file = "xxhash-3.5.0-cp37-cp37m-musllinux_1_2_ppc64le.whl", hash = "sha256:5d2a01dcce81789cf4b12d478b5464632204f4c834dc2d064902ee27d2d1f0ee"}, - {file = "xxhash-3.5.0-cp37-cp37m-musllinux_1_2_s390x.whl", hash = "sha256:a9d360a792cbcce2fe7b66b8d51274ec297c53cbc423401480e53b26161a290d"}, - {file = "xxhash-3.5.0-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:f0b48edbebea1b7421a9c687c304f7b44d0677c46498a046079d445454504737"}, - {file = "xxhash-3.5.0-cp37-cp37m-win32.whl", hash = "sha256:7ccb800c9418e438b44b060a32adeb8393764da7441eb52aa2aa195448935306"}, - {file = "xxhash-3.5.0-cp37-cp37m-win_amd64.whl", hash = "sha256:c3bc7bf8cb8806f8d1c9bf149c18708cb1c406520097d6b0a73977460ea03602"}, - {file = "xxhash-3.5.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:74752ecaa544657d88b1d1c94ae68031e364a4d47005a90288f3bab3da3c970f"}, - {file = "xxhash-3.5.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:dee1316133c9b463aa81aca676bc506d3f80d8f65aeb0bba2b78d0b30c51d7bd"}, - {file = "xxhash-3.5.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:602d339548d35a8579c6b013339fb34aee2df9b4e105f985443d2860e4d7ffaa"}, - {file = "xxhash-3.5.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:695735deeddfb35da1677dbc16a083445360e37ff46d8ac5c6fcd64917ff9ade"}, - {file = "xxhash-3.5.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1030a39ba01b0c519b1a82f80e8802630d16ab95dc3f2b2386a0b5c8ed5cbb10"}, - {file = "xxhash-3.5.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a5bc08f33c4966f4eb6590d6ff3ceae76151ad744576b5fc6c4ba8edd459fdec"}, - {file = "xxhash-3.5.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:160e0c19ee500482ddfb5d5570a0415f565d8ae2b3fd69c5dcfce8a58107b1c3"}, - {file = "xxhash-3.5.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:f1abffa122452481a61c3551ab3c89d72238e279e517705b8b03847b1d93d738"}, - {file = "xxhash-3.5.0-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:d5e9db7ef3ecbfc0b4733579cea45713a76852b002cf605420b12ef3ef1ec148"}, - {file = "xxhash-3.5.0-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:23241ff6423378a731d84864bf923a41649dc67b144debd1077f02e6249a0d54"}, - {file = "xxhash-3.5.0-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:82b833d5563fefd6fceafb1aed2f3f3ebe19f84760fdd289f8b926731c2e6e91"}, - {file = "xxhash-3.5.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:0a80ad0ffd78bef9509eee27b4a29e56f5414b87fb01a888353e3d5bda7038bd"}, - {file = "xxhash-3.5.0-cp38-cp38-win32.whl", hash = "sha256:50ac2184ffb1b999e11e27c7e3e70cc1139047e7ebc1aa95ed12f4269abe98d4"}, - {file = "xxhash-3.5.0-cp38-cp38-win_amd64.whl", hash = "sha256:392f52ebbb932db566973693de48f15ce787cabd15cf6334e855ed22ea0be5b3"}, - {file = "xxhash-3.5.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:bfc8cdd7f33d57f0468b0614ae634cc38ab9202c6957a60e31d285a71ebe0301"}, - {file = "xxhash-3.5.0-cp39-cp39-macosx_11_0_arm64.whl", hash = 
"sha256:e0c48b6300cd0b0106bf49169c3e0536408dfbeb1ccb53180068a18b03c662ab"}, - {file = "xxhash-3.5.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fe1a92cfbaa0a1253e339ccec42dbe6db262615e52df591b68726ab10338003f"}, - {file = "xxhash-3.5.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:33513d6cc3ed3b559134fb307aae9bdd94d7e7c02907b37896a6c45ff9ce51bd"}, - {file = "xxhash-3.5.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:eefc37f6138f522e771ac6db71a6d4838ec7933939676f3753eafd7d3f4c40bc"}, - {file = "xxhash-3.5.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a606c8070ada8aa2a88e181773fa1ef17ba65ce5dd168b9d08038e2a61b33754"}, - {file = "xxhash-3.5.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:42eca420c8fa072cc1dd62597635d140e78e384a79bb4944f825fbef8bfeeef6"}, - {file = "xxhash-3.5.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:604253b2143e13218ff1ef0b59ce67f18b8bd1c4205d2ffda22b09b426386898"}, - {file = "xxhash-3.5.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:6e93a5ad22f434d7876665444a97e713a8f60b5b1a3521e8df11b98309bff833"}, - {file = "xxhash-3.5.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:7a46e1d6d2817ba8024de44c4fd79913a90e5f7265434cef97026215b7d30df6"}, - {file = "xxhash-3.5.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:30eb2efe6503c379b7ab99c81ba4a779748e3830241f032ab46bd182bf5873af"}, - {file = "xxhash-3.5.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:c8aa771ff2c13dd9cda8166d685d7333d389fae30a4d2bb39d63ab5775de8606"}, - {file = "xxhash-3.5.0-cp39-cp39-win32.whl", hash = "sha256:5ed9ebc46f24cf91034544b26b131241b699edbfc99ec5e7f8f3d02d6eb7fba4"}, - {file = "xxhash-3.5.0-cp39-cp39-win_amd64.whl", hash = "sha256:220f3f896c6b8d0316f63f16c077d52c412619e475f9372333474ee15133a558"}, - {file = "xxhash-3.5.0-cp39-cp39-win_arm64.whl", hash = "sha256:a7b1d8315d9b5e9f89eb2933b73afae6ec9597a258d52190944437158b49d38e"}, - {file = "xxhash-3.5.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:2014c5b3ff15e64feecb6b713af12093f75b7926049e26a580e94dcad3c73d8c"}, - {file = "xxhash-3.5.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fab81ef75003eda96239a23eda4e4543cedc22e34c373edcaf744e721a163986"}, - {file = "xxhash-3.5.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4e2febf914ace002132aa09169cc572e0d8959d0f305f93d5828c4836f9bc5a6"}, - {file = "xxhash-3.5.0-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5d3a10609c51da2a1c0ea0293fc3968ca0a18bd73838455b5bca3069d7f8e32b"}, - {file = "xxhash-3.5.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:5a74f23335b9689b66eb6dbe2a931a88fcd7a4c2cc4b1cb0edba8ce381c7a1da"}, - {file = "xxhash-3.5.0-pp37-pypy37_pp73-macosx_10_9_x86_64.whl", hash = "sha256:2b4154c00eb22e4d543f472cfca430e7962a0f1d0f3778334f2e08a7ba59363c"}, - {file = "xxhash-3.5.0-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d30bbc1644f726b825b3278764240f449d75f1a8bdda892e641d4a688b1494ae"}, - {file = "xxhash-3.5.0-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6fa0b72f2423e2aa53077e54a61c28e181d23effeaafd73fcb9c494e60930c8e"}, - {file = "xxhash-3.5.0-pp37-pypy37_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:13de2b76c1835399b2e419a296d5b38dc4855385d9e96916299170085ef72f57"}, - {file = "xxhash-3.5.0-pp37-pypy37_pp73-win_amd64.whl", hash = "sha256:0691bfcc4f9c656bcb96cc5db94b4d75980b9d5589f2e59de790091028580837"}, - {file = "xxhash-3.5.0-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:297595fe6138d4da2c8ce9e72a04d73e58725bb60f3a19048bc96ab2ff31c692"}, - {file = "xxhash-3.5.0-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cc1276d369452040cbb943300dc8abeedab14245ea44056a2943183822513a18"}, - {file = "xxhash-3.5.0-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2061188a1ba352fc699c82bff722f4baacb4b4b8b2f0c745d2001e56d0dfb514"}, - {file = "xxhash-3.5.0-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:38c384c434021e4f62b8d9ba0bc9467e14d394893077e2c66d826243025e1f81"}, - {file = "xxhash-3.5.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:e6a4dd644d72ab316b580a1c120b375890e4c52ec392d4aef3c63361ec4d77d1"}, - {file = "xxhash-3.5.0-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:531af8845aaadcadf951b7e0c1345c6b9c68a990eeb74ff9acd8501a0ad6a1c9"}, - {file = "xxhash-3.5.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7ce379bcaa9fcc00f19affa7773084dd09f5b59947b3fb47a1ceb0179f91aaa1"}, - {file = "xxhash-3.5.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fd1b2281d01723f076df3c8188f43f2472248a6b63118b036e641243656b1b0f"}, - {file = "xxhash-3.5.0-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9c770750cc80e8694492244bca7251385188bc5597b6a39d98a9f30e8da984e0"}, - {file = "xxhash-3.5.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:b150b8467852e1bd844387459aa6fbe11d7f38b56e901f9f3b3e6aba0d660240"}, - {file = "xxhash-3.5.0.tar.gz", hash = "sha256:84f2caddf951c9cbf8dc2e22a89d4ccf5d86391ac6418fe81e3c67d0cf60b45f"}, -] - -[[package]] -name = "yarl" -version = "1.16.0" -description = "Yet another URL library" -optional = true -python-versions = ">=3.9" -files = [ - {file = "yarl-1.16.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:32468f41242d72b87ab793a86d92f885355bcf35b3355aa650bfa846a5c60058"}, - {file = "yarl-1.16.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:234f3a3032b505b90e65b5bc6652c2329ea7ea8855d8de61e1642b74b4ee65d2"}, - {file = "yarl-1.16.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8a0296040e5cddf074c7f5af4a60f3fc42c0237440df7bcf5183be5f6c802ed5"}, - {file = "yarl-1.16.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:de6c14dd7c7c0badba48157474ea1f03ebee991530ba742d381b28d4f314d6f3"}, - {file = "yarl-1.16.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b140e532fe0266003c936d017c1ac301e72ee4a3fd51784574c05f53718a55d8"}, - {file = "yarl-1.16.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:019f5d58093402aa8f6661e60fd82a28746ad6d156f6c5336a70a39bd7b162b9"}, - {file = "yarl-1.16.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8c42998fd1cbeb53cd985bff0e4bc25fbe55fd6eb3a545a724c1012d69d5ec84"}, - {file = "yarl-1.16.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7c7c30fb38c300fe8140df30a046a01769105e4cf4282567a29b5cdb635b66c4"}, - {file = "yarl-1.16.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = 
"sha256:e49e0fd86c295e743fd5be69b8b0712f70a686bc79a16e5268386c2defacaade"}, - {file = "yarl-1.16.0-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:b9ca7b9147eb1365c8bab03c003baa1300599575effad765e0b07dd3501ea9af"}, - {file = "yarl-1.16.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:27e11db3f1e6a51081a981509f75617b09810529de508a181319193d320bc5c7"}, - {file = "yarl-1.16.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:8994c42f4ca25df5380ddf59f315c518c81df6a68fed5bb0c159c6cb6b92f120"}, - {file = "yarl-1.16.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:542fa8e09a581bcdcbb30607c7224beff3fdfb598c798ccd28a8184ffc18b7eb"}, - {file = "yarl-1.16.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:2bd6a51010c7284d191b79d3b56e51a87d8e1c03b0902362945f15c3d50ed46b"}, - {file = "yarl-1.16.0-cp310-cp310-win32.whl", hash = "sha256:178ccb856e265174a79f59721031060f885aca428983e75c06f78aa24b91d929"}, - {file = "yarl-1.16.0-cp310-cp310-win_amd64.whl", hash = "sha256:fe8bba2545427418efc1929c5c42852bdb4143eb8d0a46b09de88d1fe99258e7"}, - {file = "yarl-1.16.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:d8643975a0080f361639787415a038bfc32d29208a4bf6b783ab3075a20b1ef3"}, - {file = "yarl-1.16.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:676d96bafc8c2d0039cea0cd3fd44cee7aa88b8185551a2bb93354668e8315c2"}, - {file = "yarl-1.16.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d9525f03269e64310416dbe6c68d3b23e5d34aaa8f47193a1c45ac568cecbc49"}, - {file = "yarl-1.16.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8b37d5ec034e668b22cf0ce1074d6c21fd2a08b90d11b1b73139b750a8b0dd97"}, - {file = "yarl-1.16.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4f32c4cb7386b41936894685f6e093c8dfaf0960124d91fe0ec29fe439e201d0"}, - {file = "yarl-1.16.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5b8e265a0545637492a7e12fd7038370d66c9375a61d88c5567d0e044ded9202"}, - {file = "yarl-1.16.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:789a3423f28a5fff46fbd04e339863c169ece97c827b44de16e1a7a42bc915d2"}, - {file = "yarl-1.16.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f1d1f45e3e8d37c804dca99ab3cf4ab3ed2e7a62cd82542924b14c0a4f46d243"}, - {file = "yarl-1.16.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:621280719c4c5dad4c1391160a9b88925bb8b0ff6a7d5af3224643024871675f"}, - {file = "yarl-1.16.0-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:ed097b26f18a1f5ff05f661dc36528c5f6735ba4ce8c9645e83b064665131349"}, - {file = "yarl-1.16.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:2f1fe2b2e3ee418862f5ebc0c0083c97f6f6625781382f828f6d4e9b614eba9b"}, - {file = "yarl-1.16.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:87dd10bc0618991c66cee0cc65fa74a45f4ecb13bceec3c62d78ad2e42b27a16"}, - {file = "yarl-1.16.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:4199db024b58a8abb2cfcedac7b1292c3ad421684571aeb622a02f242280e8d6"}, - {file = "yarl-1.16.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:99a9dcd4b71dd5f5f949737ab3f356cfc058c709b4f49833aeffedc2652dac56"}, - {file = "yarl-1.16.0-cp311-cp311-win32.whl", hash = "sha256:a9394c65ae0ed95679717d391c862dece9afacd8fa311683fc8b4362ce8a410c"}, - {file = "yarl-1.16.0-cp311-cp311-win_amd64.whl", hash = "sha256:5b9101f528ae0f8f65ac9d64dda2bb0627de8a50344b2f582779f32fda747c1d"}, - {file = 
"yarl-1.16.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:4ffb7c129707dd76ced0a4a4128ff452cecf0b0e929f2668ea05a371d9e5c104"}, - {file = "yarl-1.16.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:1a5e9d8ce1185723419c487758d81ac2bde693711947032cce600ca7c9cda7d6"}, - {file = "yarl-1.16.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:d743e3118b2640cef7768ea955378c3536482d95550222f908f392167fe62059"}, - {file = "yarl-1.16.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:26768342f256e6e3c37533bf9433f5f15f3e59e3c14b2409098291b3efaceacb"}, - {file = "yarl-1.16.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d1b0796168b953bca6600c5f97f5ed407479889a36ad7d17183366260f29a6b9"}, - {file = "yarl-1.16.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:858728086914f3a407aa7979cab743bbda1fe2bdf39ffcd991469a370dd7414d"}, - {file = "yarl-1.16.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5570e6d47bcb03215baf4c9ad7bf7c013e56285d9d35013541f9ac2b372593e7"}, - {file = "yarl-1.16.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:66ea8311422a7ba1fc79b4c42c2baa10566469fe5a78500d4e7754d6e6db8724"}, - {file = "yarl-1.16.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:649bddcedee692ee8a9b7b6e38582cb4062dc4253de9711568e5620d8707c2a3"}, - {file = "yarl-1.16.0-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:3a91654adb7643cb21b46f04244c5a315a440dcad63213033826549fa2435f71"}, - {file = "yarl-1.16.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:b439cae82034ade094526a8f692b9a2b5ee936452de5e4c5f0f6c48df23f8604"}, - {file = "yarl-1.16.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:571f781ae8ac463ce30bacebfaef2c6581543776d5970b2372fbe31d7bf31a07"}, - {file = "yarl-1.16.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:aa7943f04f36d6cafc0cf53ea89824ac2c37acbdb4b316a654176ab8ffd0f968"}, - {file = "yarl-1.16.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:1a5cf32539373ff39d97723e39a9283a7277cbf1224f7aef0c56c9598b6486c3"}, - {file = "yarl-1.16.0-cp312-cp312-win32.whl", hash = "sha256:a5b6c09b9b4253d6a208b0f4a2f9206e511ec68dce9198e0fbec4f160137aa67"}, - {file = "yarl-1.16.0-cp312-cp312-win_amd64.whl", hash = "sha256:1208ca14eed2fda324042adf8d6c0adf4a31522fa95e0929027cd487875f0240"}, - {file = "yarl-1.16.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:a5ace0177520bd4caa99295a9b6fb831d0e9a57d8e0501a22ffaa61b4c024283"}, - {file = "yarl-1.16.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:7118bdb5e3ed81acaa2095cba7ec02a0fe74b52a16ab9f9ac8e28e53ee299732"}, - {file = "yarl-1.16.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:38fec8a2a94c58bd47c9a50a45d321ab2285ad133adefbbadf3012c054b7e656"}, - {file = "yarl-1.16.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8791d66d81ee45866a7bb15a517b01a2bcf583a18ebf5d72a84e6064c417e64b"}, - {file = "yarl-1.16.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1cf936ba67bc6c734f3aa1c01391da74ab7fc046a9f8bbfa230b8393b90cf472"}, - {file = "yarl-1.16.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d1aab176dd55b59f77a63b27cffaca67d29987d91a5b615cbead41331e6b7428"}, - {file = "yarl-1.16.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:995d0759004c08abd5d1b81300a91d18c8577c6389300bed1c7c11675105a44d"}, - {file = 
"yarl-1.16.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1bc22e00edeb068f71967ab99081e9406cd56dbed864fc3a8259442999d71552"}, - {file = "yarl-1.16.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:35b4f7842154176523e0a63c9b871168c69b98065d05a4f637fce342a6a2693a"}, - {file = "yarl-1.16.0-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:7ace71c4b7a0c41f317ae24be62bb61e9d80838d38acb20e70697c625e71f120"}, - {file = "yarl-1.16.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:8f639e3f5795a6568aa4f7d2ac6057c757dcd187593679f035adbf12b892bb00"}, - {file = "yarl-1.16.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:e8be3aff14f0120ad049121322b107f8a759be76a6a62138322d4c8a337a9e2c"}, - {file = "yarl-1.16.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:122d8e7986043d0549e9eb23c7fd23be078be4b70c9eb42a20052b3d3149c6f2"}, - {file = "yarl-1.16.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:0fd9c227990f609c165f56b46107d0bc34553fe0387818c42c02f77974402c36"}, - {file = "yarl-1.16.0-cp313-cp313-win32.whl", hash = "sha256:595ca5e943baed31d56b33b34736461a371c6ea0038d3baec399949dd628560b"}, - {file = "yarl-1.16.0-cp313-cp313-win_amd64.whl", hash = "sha256:921b81b8d78f0e60242fb3db615ea3f368827a76af095d5a69f1c3366db3f596"}, - {file = "yarl-1.16.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:ab2b2ac232110a1fdb0d3ffcd087783edd3d4a6ced432a1bf75caf7b7be70916"}, - {file = "yarl-1.16.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7f8713717a09acbfee7c47bfc5777e685539fefdd34fa72faf504c8be2f3df4e"}, - {file = "yarl-1.16.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:cdcffe1dbcb4477d2b4202f63cd972d5baa155ff5a3d9e35801c46a415b7f71a"}, - {file = "yarl-1.16.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9a91217208306d82357c67daeef5162a41a28c8352dab7e16daa82e3718852a7"}, - {file = "yarl-1.16.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3ab3ed42c78275477ea8e917491365e9a9b69bb615cb46169020bd0aa5e2d6d3"}, - {file = "yarl-1.16.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:707ae579ccb3262dfaef093e202b4c3fb23c3810e8df544b1111bd2401fd7b09"}, - {file = "yarl-1.16.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ad7a852d1cd0b8d8b37fc9d7f8581152add917a98cfe2ea6e241878795f917ae"}, - {file = "yarl-1.16.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d3f1cc3d3d4dc574bebc9b387f6875e228ace5748a7c24f49d8f01ac1bc6c31b"}, - {file = "yarl-1.16.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:5ff96da263740779b0893d02b718293cc03400c3a208fc8d8cd79d9b0993e532"}, - {file = "yarl-1.16.0-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:3d375a19ba2bfe320b6d873f3fb165313b002cef8b7cc0a368ad8b8a57453837"}, - {file = "yarl-1.16.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:62c7da0ad93a07da048b500514ca47b759459ec41924143e2ddb5d7e20fd3db5"}, - {file = "yarl-1.16.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:147b0fcd0ee33b4b5f6edfea80452d80e419e51b9a3f7a96ce98eaee145c1581"}, - {file = "yarl-1.16.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:504e1fe1cc4f170195320eb033d2b0ccf5c6114ce5bf2f617535c01699479bca"}, - {file = "yarl-1.16.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:bdcf667a5dec12a48f669e485d70c54189f0639c2157b538a4cffd24a853624f"}, - {file = "yarl-1.16.0-cp39-cp39-win32.whl", hash = 
"sha256:e9951afe6557c75a71045148890052cb942689ee4c9ec29f5436240e1fcc73b7"}, - {file = "yarl-1.16.0-cp39-cp39-win_amd64.whl", hash = "sha256:7d7aaa8ff95d0840e289423e7dc35696c2b058d635f945bf05b5cd633146b027"}, - {file = "yarl-1.16.0-py3-none-any.whl", hash = "sha256:e6980a558d8461230c457218bd6c92dfc1d10205548215c2c21d79dc8d0a96f3"}, - {file = "yarl-1.16.0.tar.gz", hash = "sha256:b6f687ced5510a9a2474bbae96a4352e5ace5fa34dc44a217b0537fec1db00b4"}, -] - -[package.dependencies] -idna = ">=2.0" -multidict = ">=4.0" -propcache = ">=0.2.0" - -[[package]] -name = "zipp" -version = "3.20.2" -description = "Backport of pathlib-compatible object wrapper for zip files" -optional = false -python-versions = ">=3.8" -files = [ - {file = "zipp-3.20.2-py3-none-any.whl", hash = "sha256:a817ac80d6cf4b23bf7f2828b7cabf326f15a001bea8b1f9b49631780ba28350"}, - {file = "zipp-3.20.2.tar.gz", hash = "sha256:bc9eb26f4506fda01b81bcde0ca78103b6e62f991b381fec825435c836edbc29"}, -] - -[package.extras] -check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1)"] -cover = ["pytest-cov"] -doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] -enabler = ["pytest-enabler (>=2.2)"] -test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more-itertools", "pytest (>=6,!=8.1.*)", "pytest-ignore-flaky"] -type = ["pytest-mypy"] - -[extras] -accelerate = ["accelerate"] -attention = ["attention-kernels", "attention-kernels", "attention-kernels", "attention-kernels"] -bnb = ["bitsandbytes"] -compressed-tensors = ["compressed-tensors"] -marlin = ["marlin-kernels", "marlin-kernels", "marlin-kernels", "marlin-kernels"] -moe = ["moe-kernels", "moe-kernels", "moe-kernels", "moe-kernels"] -outlines = ["outlines"] -peft = ["peft"] -quantize = ["accelerate", "datasets", "texttable"] -torch = ["torch"] - -[metadata] -lock-version = "2.0" -python-versions = ">=3.9,<3.13" -content-hash = "0ead8472620eeef6f9ff81f70bcb48403f9c831b6914245efa5e249724d80d0b" diff --git a/server/pyproject.toml b/server/pyproject.toml index 0386ae55c..6d0b8dfb2 100644 --- a/server/pyproject.toml +++ b/server/pyproject.toml @@ -1,96 +1,97 @@ -[tool.poetry] +[project] name = "text-generation-server" version = "2.0.5-dev0" description = "Text Generation Inference Python gRPC Server" -authors = ["Olivier Dehaene "] +readme = "README.md" +requires-python = ">=3.9" +authors = [ + {name = "Olivier Dehaene", email = "olivier@huggingface.co"}, + {name = "Nicolas Patry", email = "nicolas@huggingface.co"}, +] +dependencies = [ + "einops>=0.8.0", + "grpc-interceptor>=0.15.4", + "grpcio>=1.67.0", + "grpcio-reflection>=1.67.0", + "grpcio-status>=1.67.0", + "hf-transfer>=0.1.8", + "loguru>=0.7.3", + "numpy>=1.26,<3", + "opentelemetry-api>=1.27.0", + "opentelemetry-exporter-otlp>=1.27.0", + "opentelemetry-instrumentation-grpc>=0.50b0", + "pillow>=11.1.0", + "prometheus-client>=0.21.0", + "protobuf>=5.28.3", + "py-cpuinfo>=9.0.0", + "rich>=13.8.1", + "safetensors>=0.4.5", + "scipy>=1.13.1", + "sentencepiece>=0.2.0", + "tokenizers>=0.20.3", + "typer>=0.15.1", +] -[tool.poetry.scripts] -text-generation-server = 'text_generation_server.cli:app' +[project.scripts] +text-generation-server = "text_generation_server.cli:app" -[tool.poetry.dependencies] -python = ">=3.9,<3.13" -protobuf = ">=4.25.3,<6" -grpcio = "^1.51.1" -grpcio-status = "^1.51.1" -grpcio-reflection = "^1.51.1" -grpc-interceptor = "^0.15.4" -typer = "^0.12.5" -accelerate = {version = "^1.1.0", optional = true} 
-bitsandbytes = { version = "^0.45.0", optional = true } -safetensors = "^0.4.5" -loguru = "^0.7.2" -opentelemetry-api = "^1.27.0" -opentelemetry-exporter-otlp = "^1.27.0" -opentelemetry-instrumentation-grpc = "^0.48b0" -hf-transfer = "^0.1.2" -sentencepiece = "^0.2.0" -tokenizers = "^0.20.3" -huggingface-hub = "^0.23" -transformers = "^4.46.2" -einops = "^0.8.0" -texttable = { version = "^1.6.7", optional = true } -datasets = {version = "^2.21.0", optional = true} -peft = {version = "^0.13.2", optional = true} -torch = {version = "^2.4.1", optional = true} -scipy = "^1.13.1" -pillow = "^11.0.0" -outlines= {version = "^0.1.3", optional = true} -prometheus-client = ">=0.20.0,<0.22" -py-cpuinfo = "^9.0.0" -compressed-tensors = {version = "^0.7.1", optional = true} -# Remove later, temporary workaround for outlines. -numpy = "^1.26.4" +[project.optional-dependencies] +accelerate = [ + "accelerate>=1.2.1,<2", +] +bnb = [ + "bitsandbytes>=0.45.0", +] +compressed-tensors = [ + "compressed-tensors>=0.9.0", +] +peft = [ + "peft>=0.14.0", +] +outlines = [ + "outlines>=0.1.13", +] +dev = [ + "grpcio-tools>=1.51.1,<2.0", + "pytest>=7.3.0,<8" +] +quantize = [ + "texttable>=1.6.7,<2", + "datasets>=2.21,<3", +] +moe = [ "moe-kernels" ] +attention = [ "attention-kernels" ] +marlin = [ "marlin-kernels" ] +gen = [ + "grpcio-tools>=1.69.0", + "mypy-protobuf>=3.6.0", +] +[tool.uv.sources] attention-kernels = [ - { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp39-cp39-linux_x86_64.whl", python = "~3.9", optional = true }, - { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp310-cp310-linux_x86_64.whl", python = "~3.10", optional = true }, - { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp311-cp311-linux_x86_64.whl", python = "~3.11", optional = true }, - { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp312-cp312-linux_x86_64.whl", python = "~3.12", optional = true }, + { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp39-cp39-linux_x86_64.whl", marker = "python_version == '3.9'" }, + { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp310-cp310-linux_x86_64.whl", marker = "python_version == '3.10'" }, + { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp311-cp311-linux_x86_64.whl", marker = "python_version == '3.11'" }, + { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp312-cp312-linux_x86_64.whl", marker = "python_version == '3.12'" }, ] marlin-kernels = [ - { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp39-cp39-linux_x86_64.whl", python = "~3.9", optional = true }, - { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp310-cp310-linux_x86_64.whl", python = "~3.10", optional = true }, - { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp311-cp311-linux_x86_64.whl", python = "~3.11", optional = true }, - { url = 
"https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp312-cp312-linux_x86_64.whl", python = "~3.12", optional = true }, + { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp39-cp39-linux_x86_64.whl", marker = "python_version == '3.9'" }, + { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp310-cp310-linux_x86_64.whl", marker = "python_version == '3.10'" }, + { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp311-cp311-linux_x86_64.whl", marker = "python_version == '3.11'" }, + { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp312-cp312-linux_x86_64.whl", marker = "python_version == '3.12'" }, ] moe-kernels = [ - { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp39-cp39-linux_x86_64.whl", python = "~3.9", optional = true }, - { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp310-cp310-linux_x86_64.whl", python = "~3.10", optional = true }, - { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp311-cp311-linux_x86_64.whl", python = "~3.11", optional = true }, - { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp312-cp312-linux_x86_64.whl", python = "~3.12", optional = true }, + { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp39-cp39-linux_x86_64.whl", marker = "python_version == '3.9'" }, + { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp310-cp310-linux_x86_64.whl", marker = "python_version == '3.10'" }, + { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp311-cp311-linux_x86_64.whl", marker = "python_version == '3.11'" }, + { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp312-cp312-linux_x86_64.whl", marker = "python_version == '3.12'" }, ] -rich = "^13.8.1" - -[tool.poetry.extras] -torch = ["torch"] -accelerate = ["accelerate"] -attention = ["attention-kernels"] -bnb = ["bitsandbytes"] -compressed-tensors = ["compressed-tensors"] -marlin = ["marlin-kernels"] -moe = ["moe-kernels"] -peft = ["peft"] -quantize = ["texttable", "datasets", "accelerate"] -outlines = ["outlines"] - -[tool.poetry.group.dev.dependencies] -grpcio-tools = "^1.51.1" -pytest = "^7.3.0" - - -[[tool.poetry.source]] -name = "pytorch-gpu-src" -url = "https://download.pytorch.org/whl/cu121" -priority = "explicit" [tool.pytest.ini_options] markers = ["private: marks tests as requiring an admin hf token (deselect with '-m \"not private\"')"] -[build-system] -requires = [ - "poetry-core>=1.0.0", -] -build-backend = "poetry.core.masonry.api" - [tool.isort] profile = "black" + +[tool.setuptools.packages.find] +include = ["text_generation_server*"] diff --git a/server/uv.lock b/server/uv.lock new file mode 100644 index 000000000..5e2c9d1ca --- /dev/null +++ b/server/uv.lock @@ -0,0 +1,3426 @@ +version = 1 +requires-python = ">=3.9" +resolution-markers = [ + "python_full_version >= '3.13'", + "python_full_version == '3.12.*'", + "python_full_version == '3.11.*'", + 
"python_full_version == '3.10.*'", + "python_full_version < '3.10'", +] + +[[package]] +name = "accelerate" +version = "1.2.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "huggingface-hub" }, + { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, + { name = "numpy", version = "2.2.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "packaging" }, + { name = "psutil" }, + { name = "pyyaml" }, + { name = "safetensors" }, + { name = "torch" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/42/09/7947691b7d44bfc739da4a44cc47d6a6d75e6fe9adf047c5234d7cb6be64/accelerate-1.2.1.tar.gz", hash = "sha256:03e161fc69d495daf2b9b5c8d5b43d06e2145520c04727b5bda56d49f1a43ab5", size = 341652 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c2/60/a585c806d6c0ec5f8149d44eb202714792802f484e6e2b1bf96b23bd2b00/accelerate-1.2.1-py3-none-any.whl", hash = "sha256:be1cbb958cf837e7cdfbde46b812964b1b8ae94c9c7d94d921540beafcee8ddf", size = 336355 }, +] + +[[package]] +name = "aiohappyeyeballs" +version = "2.4.4" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/7f/55/e4373e888fdacb15563ef6fa9fa8c8252476ea071e96fb46defac9f18bf2/aiohappyeyeballs-2.4.4.tar.gz", hash = "sha256:5fdd7d87889c63183afc18ce9271f9b0a7d32c2303e394468dd45d514a757745", size = 21977 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b9/74/fbb6559de3607b3300b9be3cc64e97548d55678e44623db17820dbd20002/aiohappyeyeballs-2.4.4-py3-none-any.whl", hash = "sha256:a980909d50efcd44795c4afeca523296716d50cd756ddca6af8c65b996e27de8", size = 14756 }, +] + +[[package]] +name = "aiohttp" +version = "3.11.11" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "aiohappyeyeballs" }, + { name = "aiosignal" }, + { name = "async-timeout", marker = "python_full_version < '3.11'" }, + { name = "attrs" }, + { name = "frozenlist" }, + { name = "multidict" }, + { name = "propcache" }, + { name = "yarl" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/fe/ed/f26db39d29cd3cb2f5a3374304c713fe5ab5a0e4c8ee25a0c45cc6adf844/aiohttp-3.11.11.tar.gz", hash = "sha256:bb49c7f1e6ebf3821a42d81d494f538107610c3a705987f53068546b0e90303e", size = 7669618 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/75/7d/ff2e314b8f9e0b1df833e2d4778eaf23eae6b8cc8f922495d110ddcbf9e1/aiohttp-3.11.11-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:a60804bff28662cbcf340a4d61598891f12eea3a66af48ecfdc975ceec21e3c8", size = 708550 }, + { url = "https://files.pythonhosted.org/packages/09/b8/aeb4975d5bba233d6f246941f5957a5ad4e3def8b0855a72742e391925f2/aiohttp-3.11.11-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:4b4fa1cb5f270fb3eab079536b764ad740bb749ce69a94d4ec30ceee1b5940d5", size = 468430 }, + { url = "https://files.pythonhosted.org/packages/9c/5b/5b620279b3df46e597008b09fa1e10027a39467387c2332657288e25811a/aiohttp-3.11.11-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:731468f555656767cda219ab42e033355fe48c85fbe3ba83a349631541715ba2", size = 455593 }, + { url = "https://files.pythonhosted.org/packages/d8/75/0cdf014b816867d86c0bc26f3d3e3f194198dbf33037890beed629cd4f8f/aiohttp-3.11.11-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cb23d8bb86282b342481cad4370ea0853a39e4a32a0042bb52ca6bdde132df43", size = 1584635 }, + { url = 
"https://files.pythonhosted.org/packages/df/2f/95b8f4e4dfeb57c1d9ad9fa911ede35a0249d75aa339edd2c2270dc539da/aiohttp-3.11.11-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f047569d655f81cb70ea5be942ee5d4421b6219c3f05d131f64088c73bb0917f", size = 1632363 }, + { url = "https://files.pythonhosted.org/packages/39/cb/70cf69ea7c50f5b0021a84f4c59c3622b2b3b81695f48a2f0e42ef7eba6e/aiohttp-3.11.11-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dd7659baae9ccf94ae5fe8bfaa2c7bc2e94d24611528395ce88d009107e00c6d", size = 1668315 }, + { url = "https://files.pythonhosted.org/packages/2f/cc/3a3fc7a290eabc59839a7e15289cd48f33dd9337d06e301064e1e7fb26c5/aiohttp-3.11.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:af01e42ad87ae24932138f154105e88da13ce7d202a6de93fafdafb2883a00ef", size = 1589546 }, + { url = "https://files.pythonhosted.org/packages/15/b4/0f7b0ed41ac6000e283e7332f0f608d734b675a8509763ca78e93714cfb0/aiohttp-3.11.11-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5854be2f3e5a729800bac57a8d76af464e160f19676ab6aea74bde18ad19d438", size = 1544581 }, + { url = "https://files.pythonhosted.org/packages/58/b9/4d06470fd85c687b6b0e31935ef73dde6e31767c9576d617309a2206556f/aiohttp-3.11.11-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:6526e5fb4e14f4bbf30411216780c9967c20c5a55f2f51d3abd6de68320cc2f3", size = 1529256 }, + { url = "https://files.pythonhosted.org/packages/61/a2/6958b1b880fc017fd35f5dfb2c26a9a50c755b75fd9ae001dc2236a4fb79/aiohttp-3.11.11-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:85992ee30a31835fc482468637b3e5bd085fa8fe9392ba0bdcbdc1ef5e9e3c55", size = 1536592 }, + { url = "https://files.pythonhosted.org/packages/0f/dd/b974012a9551fd654f5bb95a6dd3f03d6e6472a17e1a8216dd42e9638d6c/aiohttp-3.11.11-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:88a12ad8ccf325a8a5ed80e6d7c3bdc247d66175afedbe104ee2aaca72960d8e", size = 1607446 }, + { url = "https://files.pythonhosted.org/packages/e0/d3/6c98fd87e638e51f074a3f2061e81fcb92123bcaf1439ac1b4a896446e40/aiohttp-3.11.11-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:0a6d3fbf2232e3a08c41eca81ae4f1dff3d8f1a30bae415ebe0af2d2458b8a33", size = 1628809 }, + { url = "https://files.pythonhosted.org/packages/a8/2e/86e6f85cbca02be042c268c3d93e7f35977a0e127de56e319bdd1569eaa8/aiohttp-3.11.11-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:84a585799c58b795573c7fa9b84c455adf3e1d72f19a2bf498b54a95ae0d194c", size = 1564291 }, + { url = "https://files.pythonhosted.org/packages/0b/8d/1f4ef3503b767717f65e1f5178b0173ab03cba1a19997ebf7b052161189f/aiohttp-3.11.11-cp310-cp310-win32.whl", hash = "sha256:bfde76a8f430cf5c5584553adf9926534352251d379dcb266ad2b93c54a29745", size = 416601 }, + { url = "https://files.pythonhosted.org/packages/ad/86/81cb83691b5ace3d9aa148dc42bacc3450d749fc88c5ec1973573c1c1779/aiohttp-3.11.11-cp310-cp310-win_amd64.whl", hash = "sha256:0fd82b8e9c383af11d2b26f27a478640b6b83d669440c0a71481f7c865a51da9", size = 442007 }, + { url = "https://files.pythonhosted.org/packages/34/ae/e8806a9f054e15f1d18b04db75c23ec38ec954a10c0a68d3bd275d7e8be3/aiohttp-3.11.11-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:ba74ec819177af1ef7f59063c6d35a214a8fde6f987f7661f4f0eecc468a8f76", size = 708624 }, + { url = "https://files.pythonhosted.org/packages/c7/e0/313ef1a333fb4d58d0c55a6acb3cd772f5d7756604b455181049e222c020/aiohttp-3.11.11-cp311-cp311-macosx_10_9_x86_64.whl", hash = 
"sha256:4af57160800b7a815f3fe0eba9b46bf28aafc195555f1824555fa2cfab6c1538", size = 468507 }, + { url = "https://files.pythonhosted.org/packages/a9/60/03455476bf1f467e5b4a32a465c450548b2ce724eec39d69f737191f936a/aiohttp-3.11.11-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ffa336210cf9cd8ed117011085817d00abe4c08f99968deef0013ea283547204", size = 455571 }, + { url = "https://files.pythonhosted.org/packages/be/f9/469588603bd75bf02c8ffb8c8a0d4b217eed446b49d4a767684685aa33fd/aiohttp-3.11.11-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:81b8fe282183e4a3c7a1b72f5ade1094ed1c6345a8f153506d114af5bf8accd9", size = 1685694 }, + { url = "https://files.pythonhosted.org/packages/88/b9/1b7fa43faf6c8616fa94c568dc1309ffee2b6b68b04ac268e5d64b738688/aiohttp-3.11.11-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3af41686ccec6a0f2bdc66686dc0f403c41ac2089f80e2214a0f82d001052c03", size = 1743660 }, + { url = "https://files.pythonhosted.org/packages/2a/8b/0248d19dbb16b67222e75f6aecedd014656225733157e5afaf6a6a07e2e8/aiohttp-3.11.11-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:70d1f9dde0e5dd9e292a6d4d00058737052b01f3532f69c0c65818dac26dc287", size = 1785421 }, + { url = "https://files.pythonhosted.org/packages/c4/11/f478e071815a46ca0a5ae974651ff0c7a35898c55063305a896e58aa1247/aiohttp-3.11.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:249cc6912405917344192b9f9ea5cd5b139d49e0d2f5c7f70bdfaf6b4dbf3a2e", size = 1675145 }, + { url = "https://files.pythonhosted.org/packages/26/5d/284d182fecbb5075ae10153ff7374f57314c93a8681666600e3a9e09c505/aiohttp-3.11.11-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0eb98d90b6690827dcc84c246811feeb4e1eea683c0eac6caed7549be9c84665", size = 1619804 }, + { url = "https://files.pythonhosted.org/packages/1b/78/980064c2ad685c64ce0e8aeeb7ef1e53f43c5b005edcd7d32e60809c4992/aiohttp-3.11.11-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:ec82bf1fda6cecce7f7b915f9196601a1bd1a3079796b76d16ae4cce6d0ef89b", size = 1654007 }, + { url = "https://files.pythonhosted.org/packages/21/8d/9e658d63b1438ad42b96f94da227f2e2c1d5c6001c9e8ffcc0bfb22e9105/aiohttp-3.11.11-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:9fd46ce0845cfe28f108888b3ab17abff84ff695e01e73657eec3f96d72eef34", size = 1650022 }, + { url = "https://files.pythonhosted.org/packages/85/fd/a032bf7f2755c2df4f87f9effa34ccc1ef5cea465377dbaeef93bb56bbd6/aiohttp-3.11.11-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:bd176afcf8f5d2aed50c3647d4925d0db0579d96f75a31e77cbaf67d8a87742d", size = 1732899 }, + { url = "https://files.pythonhosted.org/packages/c5/0c/c2b85fde167dd440c7ba50af2aac20b5a5666392b174df54c00f888c5a75/aiohttp-3.11.11-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:ec2aa89305006fba9ffb98970db6c8221541be7bee4c1d027421d6f6df7d1ce2", size = 1755142 }, + { url = "https://files.pythonhosted.org/packages/bc/78/91ae1a3b3b3bed8b893c5d69c07023e151b1c95d79544ad04cf68f596c2f/aiohttp-3.11.11-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:92cde43018a2e17d48bb09c79e4d4cb0e236de5063ce897a5e40ac7cb4878773", size = 1692736 }, + { url = "https://files.pythonhosted.org/packages/77/89/a7ef9c4b4cdb546fcc650ca7f7395aaffbd267f0e1f648a436bec33c9b95/aiohttp-3.11.11-cp311-cp311-win32.whl", hash = "sha256:aba807f9569455cba566882c8938f1a549f205ee43c27b126e5450dc9f83cc62", size = 416418 }, + { url = 
"https://files.pythonhosted.org/packages/fc/db/2192489a8a51b52e06627506f8ac8df69ee221de88ab9bdea77aa793aa6a/aiohttp-3.11.11-cp311-cp311-win_amd64.whl", hash = "sha256:ae545f31489548c87b0cced5755cfe5a5308d00407000e72c4fa30b19c3220ac", size = 442509 }, + { url = "https://files.pythonhosted.org/packages/69/cf/4bda538c502f9738d6b95ada11603c05ec260807246e15e869fc3ec5de97/aiohttp-3.11.11-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:e595c591a48bbc295ebf47cb91aebf9bd32f3ff76749ecf282ea7f9f6bb73886", size = 704666 }, + { url = "https://files.pythonhosted.org/packages/46/7b/87fcef2cad2fad420ca77bef981e815df6904047d0a1bd6aeded1b0d1d66/aiohttp-3.11.11-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:3ea1b59dc06396b0b424740a10a0a63974c725b1c64736ff788a3689d36c02d2", size = 464057 }, + { url = "https://files.pythonhosted.org/packages/5a/a6/789e1f17a1b6f4a38939fbc39d29e1d960d5f89f73d0629a939410171bc0/aiohttp-3.11.11-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8811f3f098a78ffa16e0ea36dffd577eb031aea797cbdba81be039a4169e242c", size = 455996 }, + { url = "https://files.pythonhosted.org/packages/b7/dd/485061fbfef33165ce7320db36e530cd7116ee1098e9c3774d15a732b3fd/aiohttp-3.11.11-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bd7227b87a355ce1f4bf83bfae4399b1f5bb42e0259cb9405824bd03d2f4336a", size = 1682367 }, + { url = "https://files.pythonhosted.org/packages/e9/d7/9ec5b3ea9ae215c311d88b2093e8da17e67b8856673e4166c994e117ee3e/aiohttp-3.11.11-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d40f9da8cabbf295d3a9dae1295c69975b86d941bc20f0a087f0477fa0a66231", size = 1736989 }, + { url = "https://files.pythonhosted.org/packages/d6/fb/ea94927f7bfe1d86178c9d3e0a8c54f651a0a655214cce930b3c679b8f64/aiohttp-3.11.11-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ffb3dc385f6bb1568aa974fe65da84723210e5d9707e360e9ecb51f59406cd2e", size = 1793265 }, + { url = "https://files.pythonhosted.org/packages/40/7f/6de218084f9b653026bd7063cd8045123a7ba90c25176465f266976d8c82/aiohttp-3.11.11-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a8f5f7515f3552d899c61202d99dcb17d6e3b0de777900405611cd747cecd1b8", size = 1691841 }, + { url = "https://files.pythonhosted.org/packages/77/e2/992f43d87831cbddb6b09c57ab55499332f60ad6fdbf438ff4419c2925fc/aiohttp-3.11.11-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3499c7ffbfd9c6a3d8d6a2b01c26639da7e43d47c7b4f788016226b1e711caa8", size = 1619317 }, + { url = "https://files.pythonhosted.org/packages/96/74/879b23cdd816db4133325a201287c95bef4ce669acde37f8f1b8669e1755/aiohttp-3.11.11-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8e2bf8029dbf0810c7bfbc3e594b51c4cc9101fbffb583a3923aea184724203c", size = 1641416 }, + { url = "https://files.pythonhosted.org/packages/30/98/b123f6b15d87c54e58fd7ae3558ff594f898d7f30a90899718f3215ad328/aiohttp-3.11.11-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:b6212a60e5c482ef90f2d788835387070a88d52cf6241d3916733c9176d39eab", size = 1646514 }, + { url = "https://files.pythonhosted.org/packages/d7/38/257fda3dc99d6978ab943141d5165ec74fd4b4164baa15e9c66fa21da86b/aiohttp-3.11.11-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:d119fafe7b634dbfa25a8c597718e69a930e4847f0b88e172744be24515140da", size = 1702095 }, + { url = 
"https://files.pythonhosted.org/packages/0c/f4/ddab089053f9fb96654df5505c0a69bde093214b3c3454f6bfdb1845f558/aiohttp-3.11.11-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:6fba278063559acc730abf49845d0e9a9e1ba74f85f0ee6efd5803f08b285853", size = 1734611 }, + { url = "https://files.pythonhosted.org/packages/c3/d6/f30b2bc520c38c8aa4657ed953186e535ae84abe55c08d0f70acd72ff577/aiohttp-3.11.11-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:92fc484e34b733704ad77210c7957679c5c3877bd1e6b6d74b185e9320cc716e", size = 1694576 }, + { url = "https://files.pythonhosted.org/packages/bc/97/b0a88c3f4c6d0020b34045ee6d954058abc870814f6e310c4c9b74254116/aiohttp-3.11.11-cp312-cp312-win32.whl", hash = "sha256:9f5b3c1ed63c8fa937a920b6c1bec78b74ee09593b3f5b979ab2ae5ef60d7600", size = 411363 }, + { url = "https://files.pythonhosted.org/packages/7f/23/cc36d9c398980acaeeb443100f0216f50a7cfe20c67a9fd0a2f1a5a846de/aiohttp-3.11.11-cp312-cp312-win_amd64.whl", hash = "sha256:1e69966ea6ef0c14ee53ef7a3d68b564cc408121ea56c0caa2dc918c1b2f553d", size = 437666 }, + { url = "https://files.pythonhosted.org/packages/49/d1/d8af164f400bad432b63e1ac857d74a09311a8334b0481f2f64b158b50eb/aiohttp-3.11.11-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:541d823548ab69d13d23730a06f97460f4238ad2e5ed966aaf850d7c369782d9", size = 697982 }, + { url = "https://files.pythonhosted.org/packages/92/d1/faad3bf9fa4bfd26b95c69fc2e98937d52b1ff44f7e28131855a98d23a17/aiohttp-3.11.11-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:929f3ed33743a49ab127c58c3e0a827de0664bfcda566108989a14068f820194", size = 460662 }, + { url = "https://files.pythonhosted.org/packages/db/61/0d71cc66d63909dabc4590f74eba71f91873a77ea52424401c2498d47536/aiohttp-3.11.11-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:0882c2820fd0132240edbb4a51eb8ceb6eef8181db9ad5291ab3332e0d71df5f", size = 452950 }, + { url = "https://files.pythonhosted.org/packages/07/db/6d04bc7fd92784900704e16b745484ef45b77bd04e25f58f6febaadf7983/aiohttp-3.11.11-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b63de12e44935d5aca7ed7ed98a255a11e5cb47f83a9fded7a5e41c40277d104", size = 1665178 }, + { url = "https://files.pythonhosted.org/packages/54/5c/e95ade9ae29f375411884d9fd98e50535bf9fe316c9feb0f30cd2ac8f508/aiohttp-3.11.11-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:aa54f8ef31d23c506910c21163f22b124facb573bff73930735cf9fe38bf7dff", size = 1717939 }, + { url = "https://files.pythonhosted.org/packages/6f/1c/1e7d5c5daea9e409ed70f7986001b8c9e3a49a50b28404498d30860edab6/aiohttp-3.11.11-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a344d5dc18074e3872777b62f5f7d584ae4344cd6006c17ba12103759d407af3", size = 1775125 }, + { url = "https://files.pythonhosted.org/packages/5d/66/890987e44f7d2f33a130e37e01a164168e6aff06fce15217b6eaf14df4f6/aiohttp-3.11.11-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b7fb429ab1aafa1f48578eb315ca45bd46e9c37de11fe45c7f5f4138091e2f1", size = 1677176 }, + { url = "https://files.pythonhosted.org/packages/8f/dc/e2ba57d7a52df6cdf1072fd5fa9c6301a68e1cd67415f189805d3eeb031d/aiohttp-3.11.11-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c341c7d868750e31961d6d8e60ff040fb9d3d3a46d77fd85e1ab8e76c3e9a5c4", size = 1603192 }, + { url = "https://files.pythonhosted.org/packages/6c/9e/8d08a57de79ca3a358da449405555e668f2c8871a7777ecd2f0e3912c272/aiohttp-3.11.11-cp313-cp313-musllinux_1_2_aarch64.whl", hash 
= "sha256:ed9ee95614a71e87f1a70bc81603f6c6760128b140bc4030abe6abaa988f1c3d", size = 1618296 }, + { url = "https://files.pythonhosted.org/packages/56/51/89822e3ec72db352c32e7fc1c690370e24e231837d9abd056490f3a49886/aiohttp-3.11.11-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:de8d38f1c2810fa2a4f1d995a2e9c70bb8737b18da04ac2afbf3971f65781d87", size = 1616524 }, + { url = "https://files.pythonhosted.org/packages/2c/fa/e2e6d9398f462ffaa095e84717c1732916a57f1814502929ed67dd7568ef/aiohttp-3.11.11-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:a9b7371665d4f00deb8f32208c7c5e652059b0fda41cf6dbcac6114a041f1cc2", size = 1685471 }, + { url = "https://files.pythonhosted.org/packages/ae/5f/6bb976e619ca28a052e2c0ca7b0251ccd893f93d7c24a96abea38e332bf6/aiohttp-3.11.11-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:620598717fce1b3bd14dd09947ea53e1ad510317c85dda2c9c65b622edc96b12", size = 1715312 }, + { url = "https://files.pythonhosted.org/packages/79/c1/756a7e65aa087c7fac724d6c4c038f2faaa2a42fe56dbc1dd62a33ca7213/aiohttp-3.11.11-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:bf8d9bfee991d8acc72d060d53860f356e07a50f0e0d09a8dfedea1c554dd0d5", size = 1672783 }, + { url = "https://files.pythonhosted.org/packages/73/ba/a6190ebb02176c7f75e6308da31f5d49f6477b651a3dcfaaaca865a298e2/aiohttp-3.11.11-cp313-cp313-win32.whl", hash = "sha256:9d73ee3725b7a737ad86c2eac5c57a4a97793d9f442599bea5ec67ac9f4bdc3d", size = 410229 }, + { url = "https://files.pythonhosted.org/packages/b8/62/c9fa5bafe03186a0e4699150a7fed9b1e73240996d0d2f0e5f70f3fdf471/aiohttp-3.11.11-cp313-cp313-win_amd64.whl", hash = "sha256:c7a06301c2fb096bdb0bd25fe2011531c1453b9f2c163c8031600ec73af1cc99", size = 436081 }, + { url = "https://files.pythonhosted.org/packages/9f/37/326ee86b7640be6ca4493c8121cb9a4386e07cf1e5757ce6b7fa854d0a5f/aiohttp-3.11.11-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:3e23419d832d969f659c208557de4a123e30a10d26e1e14b73431d3c13444c2e", size = 709424 }, + { url = "https://files.pythonhosted.org/packages/9c/c5/a88ec2160b06c22e57e483a1f78f99f005fcd4e7d6855a2d3d6510881b65/aiohttp-3.11.11-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:21fef42317cf02e05d3b09c028712e1d73a9606f02467fd803f7c1f39cc59add", size = 468907 }, + { url = "https://files.pythonhosted.org/packages/b2/f0/02f03f818e91996161cce200241b631bb2b4a87e61acddb5b974e254a288/aiohttp-3.11.11-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:1f21bb8d0235fc10c09ce1d11ffbd40fc50d3f08a89e4cf3a0c503dc2562247a", size = 455981 }, + { url = "https://files.pythonhosted.org/packages/0e/17/c8be12436ec19915f67b1ab8240d4105aba0f7e0894a1f0d8939c3e79c70/aiohttp-3.11.11-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1642eceeaa5ab6c9b6dfeaaa626ae314d808188ab23ae196a34c9d97efb68350", size = 1587395 }, + { url = "https://files.pythonhosted.org/packages/43/c0/f4db1ac30ebe855b2fefd6fa98767862d88ac54ab08a6ad07d619146270c/aiohttp-3.11.11-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2170816e34e10f2fd120f603e951630f8a112e1be3b60963a1f159f5699059a6", size = 1636243 }, + { url = "https://files.pythonhosted.org/packages/ea/a7/9acf20e9a09b0d38b5b55691410500d051a9f4194692cac22b0d0fc92ad9/aiohttp-3.11.11-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8be8508d110d93061197fd2d6a74f7401f73b6d12f8822bbcd6d74f2b55d71b1", size = 1672323 }, + { url = 
"https://files.pythonhosted.org/packages/f7/5b/a27e8fe1a3b0e245ca80863eefd83fc00136752d27d2cf1afa0130a76f34/aiohttp-3.11.11-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4eed954b161e6b9b65f6be446ed448ed3921763cc432053ceb606f89d793927e", size = 1589521 }, + { url = "https://files.pythonhosted.org/packages/25/50/8bccd08004e15906791b46f0a908a8e7f5e0c5882b17da96d1933bd34ac0/aiohttp-3.11.11-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d6c9af134da4bc9b3bd3e6a70072509f295d10ee60c697826225b60b9959acdd", size = 1544059 }, + { url = "https://files.pythonhosted.org/packages/84/5a/42250b37b06ee0cb7a03dd1630243b1d739ca3edb5abd8b18f479a539900/aiohttp-3.11.11-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:44167fc6a763d534a6908bdb2592269b4bf30a03239bcb1654781adf5e49caf1", size = 1530217 }, + { url = "https://files.pythonhosted.org/packages/18/08/eb334da86cd2cdbd0621bb7039255b19ca74ce8b05e8fb61850e2589938c/aiohttp-3.11.11-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:479b8c6ebd12aedfe64563b85920525d05d394b85f166b7873c8bde6da612f9c", size = 1536081 }, + { url = "https://files.pythonhosted.org/packages/1a/a9/9d59958084d5bad7e77a44841013bd59768cda94f9f744769461b66038fc/aiohttp-3.11.11-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:10b4ff0ad793d98605958089fabfa350e8e62bd5d40aa65cdc69d6785859f94e", size = 1606918 }, + { url = "https://files.pythonhosted.org/packages/4f/e7/27feb1cff17dcddb7a5b703199106196718d622a3aa70f80a386d15361d7/aiohttp-3.11.11-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:b540bd67cfb54e6f0865ceccd9979687210d7ed1a1cc8c01f8e67e2f1e883d28", size = 1629101 }, + { url = "https://files.pythonhosted.org/packages/e8/29/49debcd858b997c655fca274c5247fcfe29bf31a4ddb1ce3f088539b14e4/aiohttp-3.11.11-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:1dac54e8ce2ed83b1f6b1a54005c87dfed139cf3f777fdc8afc76e7841101226", size = 1567338 }, + { url = "https://files.pythonhosted.org/packages/3b/34/33af1e97aba1862e1812e2e2b96a1e050c5a6e9cecd5a5370591122fb07b/aiohttp-3.11.11-cp39-cp39-win32.whl", hash = "sha256:568c1236b2fde93b7720f95a890741854c1200fba4a3471ff48b2934d2d93fd3", size = 416914 }, + { url = "https://files.pythonhosted.org/packages/2d/47/28b3fbd97026963af2774423c64341e0d4ec180ea3b79a2762a3c18d5d94/aiohttp-3.11.11-cp39-cp39-win_amd64.whl", hash = "sha256:943a8b052e54dfd6439fd7989f67fc6a7f2138d0a2cf0a7de5f18aa4fe7eb3b1", size = 442225 }, +] + +[[package]] +name = "aiosignal" +version = "1.3.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "frozenlist" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ba/b5/6d55e80f6d8a08ce22b982eafa278d823b541c925f11ee774b0b9c43473d/aiosignal-1.3.2.tar.gz", hash = "sha256:a8c255c66fafb1e499c9351d0bf32ff2d8a0321595ebac3b93713656d2436f54", size = 19424 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ec/6a/bc7e17a3e87a2985d3e8f4da4cd0f481060eb78fb08596c42be62c90a4d9/aiosignal-1.3.2-py2.py3-none-any.whl", hash = "sha256:45cde58e409a301715980c2b01d0c28bdde3770d8290b5eb2173759d9acb31a5", size = 7597 }, +] + +[[package]] +name = "airportsdata" +version = "20241001" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ff/75/0af9b73babfec14df1395ca60e2a174e887c5d515e823c39a60f27bbbb30/airportsdata-20241001.tar.gz", hash = "sha256:fa0bd143b4f4be3557cb892fa0612ef210fd91a92bd720b4d8221de576a4fa00", size = 903075 } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/af/57/57635b0af4a23723b2b43109aa4eae7395d346d1eed8a00691cf3acfc5bd/airportsdata-20241001-py3-none-any.whl", hash = "sha256:67d71cf2c5378cc17ff66b62b1e11aa2444043949c894543ac8fd8dafce192fd", size = 912677 }, +] + +[[package]] +name = "annotated-types" +version = "0.7.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ee/67/531ea369ba64dcff5ec9c3402f9f51bf748cec26dde048a2f973a4eea7f5/annotated_types-0.7.0.tar.gz", hash = "sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89", size = 16081 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643 }, +] + +[[package]] +name = "async-timeout" +version = "5.0.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a5/ae/136395dfbfe00dfc94da3f3e136d0b13f394cba8f4841120e34226265780/async_timeout-5.0.1.tar.gz", hash = "sha256:d9321a7a3d5a6a5e187e824d2fa0793ce379a202935782d555d6e9d2735677d3", size = 9274 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fe/ba/e2081de779ca30d473f21f5b30e0e737c438205440784c7dfc81efc2b029/async_timeout-5.0.1-py3-none-any.whl", hash = "sha256:39e3809566ff85354557ec2398b55e096c8364bacac9405a7a1fa429e77fe76c", size = 6233 }, +] + +[[package]] +name = "attention-kernels" +version = "0.1.1" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version >= '3.13'", +] +dependencies = [ + { name = "torch", marker = "python_full_version >= '3.13'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/0a/49/1847d04522158767065d24382b6fe2e0540a63f222113e273745f77ee2c5/attention-kernels-0.1.1.tar.gz", hash = "sha256:aff84a6e61e4720c14a2c2a62242b21c271d5d36bb6a0f0f017b762611b2d477", size = 49652 } + +[[package]] +name = "attention-kernels" +version = "0.1.1" +source = { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp310-cp310-linux_x86_64.whl" } +resolution-markers = [ + "python_full_version == '3.10.*'", +] +dependencies = [ + { name = "torch", marker = "python_full_version == '3.10.*'" }, +] +wheels = [ + { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp310-cp310-linux_x86_64.whl", hash = "sha256:812851d4ce0f54ca764ff3815a731b15f0cb110115d0aa2d0997cd7794d808bb" }, +] + +[package.metadata] +requires-dist = [{ name = "torch" }] + +[[package]] +name = "attention-kernels" +version = "0.1.1" +source = { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp311-cp311-linux_x86_64.whl" } +resolution-markers = [ + "python_full_version == '3.11.*'", +] +dependencies = [ + { name = "torch", marker = "python_full_version == '3.11.*'" }, +] +wheels = [ + { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp311-cp311-linux_x86_64.whl", hash = "sha256:614c402621b11dd1f5741a016b9fd27cb6a68814471f2048bc05206923516268" }, +] + +[package.metadata] +requires-dist = [{ name = "torch" }] + +[[package]] +name = "attention-kernels" +version = "0.1.1" +source = { url = 
"https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp312-cp312-linux_x86_64.whl" } +resolution-markers = [ + "python_full_version == '3.12.*'", +] +dependencies = [ + { name = "torch", marker = "python_full_version == '3.12.*'" }, +] +wheels = [ + { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp312-cp312-linux_x86_64.whl", hash = "sha256:6b2ca7c98997431d5f6c4af7553dce6b1bff8dfdec374c97c6ffba71325a02b7" }, +] + +[package.metadata] +requires-dist = [{ name = "torch" }] + +[[package]] +name = "attention-kernels" +version = "0.1.1" +source = { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp39-cp39-linux_x86_64.whl" } +resolution-markers = [ + "python_full_version < '3.10'", +] +dependencies = [ + { name = "torch", marker = "python_full_version < '3.10'" }, +] +wheels = [ + { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp39-cp39-linux_x86_64.whl", hash = "sha256:a56710c5626e461d6f628ae14b74ffc89833578ebd59c3c0c47f5d6f07461fbf" }, +] + +[package.metadata] +requires-dist = [{ name = "torch" }] + +[[package]] +name = "attrs" +version = "24.3.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/48/c8/6260f8ccc11f0917360fc0da435c5c9c7504e3db174d5a12a1494887b045/attrs-24.3.0.tar.gz", hash = "sha256:8f5c07333d543103541ba7be0e2ce16eeee8130cb0b3f9238ab904ce1e85baff", size = 805984 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/89/aa/ab0f7891a01eeb2d2e338ae8fecbe57fcebea1a24dbb64d45801bfab481d/attrs-24.3.0-py3-none-any.whl", hash = "sha256:ac96cd038792094f438ad1f6ff80837353805ac950cd2aa0e0625ef19850c308", size = 63397 }, +] + +[[package]] +name = "bitsandbytes" +version = "0.45.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, + { name = "numpy", version = "2.2.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "torch" }, + { name = "typing-extensions" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/99/9a/f41d252bf8b0bc5969b4dce1274cd04b7ddc541de1060dd27eca680bc1b2/bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl", hash = "sha256:0f0323de1ff1fdf8383e79bdad1283516a4c05a6fd2b44a363bf4e059422305b", size = 69084267 }, + { url = "https://files.pythonhosted.org/packages/1e/90/a2bbb9b5f997b9c9aa9c15ee4adf553ee71053bb942f89fd48d920a1aa9d/bitsandbytes-0.45.0-py3-none-win_amd64.whl", hash = "sha256:ebbf96e0ecb466716a65ecdeaef3fa1983575447b9ab66b74e5211892507c6ff", size = 68520043 }, +] + +[[package]] +name = "certifi" +version = "2024.12.14" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/0f/bd/1d41ee578ce09523c81a15426705dd20969f5abf006d1afe8aeff0dd776a/certifi-2024.12.14.tar.gz", hash = "sha256:b650d30f370c2b724812bee08008be0c4163b163ddaec3f2546c1caf65f191db", size = 166010 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a5/32/8f6669fc4798494966bf446c8c4a162e0b5d893dff088afddf76414f70e1/certifi-2024.12.14-py3-none-any.whl", hash = "sha256:1275f7a45be9464efc1173084eaa30f866fe2e47d389406136d332ed4967ec56", size = 164927 }, +] + +[[package]] +name = 
"charset-normalizer" +version = "3.4.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/16/b0/572805e227f01586461c80e0fd25d65a2115599cc9dad142fee4b747c357/charset_normalizer-3.4.1.tar.gz", hash = "sha256:44251f18cd68a75b56585dd00dae26183e102cd5e0f9f1466e6df5da2ed64ea3", size = 123188 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0d/58/5580c1716040bc89206c77d8f74418caf82ce519aae06450393ca73475d1/charset_normalizer-3.4.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:91b36a978b5ae0ee86c394f5a54d6ef44db1de0815eb43de826d41d21e4af3de", size = 198013 }, + { url = "https://files.pythonhosted.org/packages/d0/11/00341177ae71c6f5159a08168bcb98c6e6d196d372c94511f9f6c9afe0c6/charset_normalizer-3.4.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7461baadb4dc00fd9e0acbe254e3d7d2112e7f92ced2adc96e54ef6501c5f176", size = 141285 }, + { url = "https://files.pythonhosted.org/packages/01/09/11d684ea5819e5a8f5100fb0b38cf8d02b514746607934134d31233e02c8/charset_normalizer-3.4.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e218488cd232553829be0664c2292d3af2eeeb94b32bea483cf79ac6a694e037", size = 151449 }, + { url = "https://files.pythonhosted.org/packages/08/06/9f5a12939db324d905dc1f70591ae7d7898d030d7662f0d426e2286f68c9/charset_normalizer-3.4.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:80ed5e856eb7f30115aaf94e4a08114ccc8813e6ed1b5efa74f9f82e8509858f", size = 143892 }, + { url = "https://files.pythonhosted.org/packages/93/62/5e89cdfe04584cb7f4d36003ffa2936681b03ecc0754f8e969c2becb7e24/charset_normalizer-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b010a7a4fd316c3c484d482922d13044979e78d1861f0e0650423144c616a46a", size = 146123 }, + { url = "https://files.pythonhosted.org/packages/a9/ac/ab729a15c516da2ab70a05f8722ecfccc3f04ed7a18e45c75bbbaa347d61/charset_normalizer-3.4.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4532bff1b8421fd0a320463030c7520f56a79c9024a4e88f01c537316019005a", size = 147943 }, + { url = "https://files.pythonhosted.org/packages/03/d2/3f392f23f042615689456e9a274640c1d2e5dd1d52de36ab8f7955f8f050/charset_normalizer-3.4.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:d973f03c0cb71c5ed99037b870f2be986c3c05e63622c017ea9816881d2dd247", size = 142063 }, + { url = "https://files.pythonhosted.org/packages/f2/e3/e20aae5e1039a2cd9b08d9205f52142329f887f8cf70da3650326670bddf/charset_normalizer-3.4.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:3a3bd0dcd373514dcec91c411ddb9632c0d7d92aed7093b8c3bbb6d69ca74408", size = 150578 }, + { url = "https://files.pythonhosted.org/packages/8d/af/779ad72a4da0aed925e1139d458adc486e61076d7ecdcc09e610ea8678db/charset_normalizer-3.4.1-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:d9c3cdf5390dcd29aa8056d13e8e99526cda0305acc038b96b30352aff5ff2bb", size = 153629 }, + { url = "https://files.pythonhosted.org/packages/c2/b6/7aa450b278e7aa92cf7732140bfd8be21f5f29d5bf334ae987c945276639/charset_normalizer-3.4.1-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:2bdfe3ac2e1bbe5b59a1a63721eb3b95fc9b6817ae4a46debbb4e11f6232428d", size = 150778 }, + { url = "https://files.pythonhosted.org/packages/39/f4/d9f4f712d0951dcbfd42920d3db81b00dd23b6ab520419626f4023334056/charset_normalizer-3.4.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = 
"sha256:eab677309cdb30d047996b36d34caeda1dc91149e4fdca0b1a039b3f79d9a807", size = 146453 }, + { url = "https://files.pythonhosted.org/packages/49/2b/999d0314e4ee0cff3cb83e6bc9aeddd397eeed693edb4facb901eb8fbb69/charset_normalizer-3.4.1-cp310-cp310-win32.whl", hash = "sha256:c0429126cf75e16c4f0ad00ee0eae4242dc652290f940152ca8c75c3a4b6ee8f", size = 95479 }, + { url = "https://files.pythonhosted.org/packages/2d/ce/3cbed41cff67e455a386fb5e5dd8906cdda2ed92fbc6297921f2e4419309/charset_normalizer-3.4.1-cp310-cp310-win_amd64.whl", hash = "sha256:9f0b8b1c6d84c8034a44893aba5e767bf9c7a211e313a9605d9c617d7083829f", size = 102790 }, + { url = "https://files.pythonhosted.org/packages/72/80/41ef5d5a7935d2d3a773e3eaebf0a9350542f2cab4eac59a7a4741fbbbbe/charset_normalizer-3.4.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:8bfa33f4f2672964266e940dd22a195989ba31669bd84629f05fab3ef4e2d125", size = 194995 }, + { url = "https://files.pythonhosted.org/packages/7a/28/0b9fefa7b8b080ec492110af6d88aa3dea91c464b17d53474b6e9ba5d2c5/charset_normalizer-3.4.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:28bf57629c75e810b6ae989f03c0828d64d6b26a5e205535585f96093e405ed1", size = 139471 }, + { url = "https://files.pythonhosted.org/packages/71/64/d24ab1a997efb06402e3fc07317e94da358e2585165930d9d59ad45fcae2/charset_normalizer-3.4.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f08ff5e948271dc7e18a35641d2f11a4cd8dfd5634f55228b691e62b37125eb3", size = 149831 }, + { url = "https://files.pythonhosted.org/packages/37/ed/be39e5258e198655240db5e19e0b11379163ad7070962d6b0c87ed2c4d39/charset_normalizer-3.4.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:234ac59ea147c59ee4da87a0c0f098e9c8d169f4dc2a159ef720f1a61bbe27cd", size = 142335 }, + { url = "https://files.pythonhosted.org/packages/88/83/489e9504711fa05d8dde1574996408026bdbdbd938f23be67deebb5eca92/charset_normalizer-3.4.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fd4ec41f914fa74ad1b8304bbc634b3de73d2a0889bd32076342a573e0779e00", size = 143862 }, + { url = "https://files.pythonhosted.org/packages/c6/c7/32da20821cf387b759ad24627a9aca289d2822de929b8a41b6241767b461/charset_normalizer-3.4.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:eea6ee1db730b3483adf394ea72f808b6e18cf3cb6454b4d86e04fa8c4327a12", size = 145673 }, + { url = "https://files.pythonhosted.org/packages/68/85/f4288e96039abdd5aeb5c546fa20a37b50da71b5cf01e75e87f16cd43304/charset_normalizer-3.4.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c96836c97b1238e9c9e3fe90844c947d5afbf4f4c92762679acfe19927d81d77", size = 140211 }, + { url = "https://files.pythonhosted.org/packages/28/a3/a42e70d03cbdabc18997baf4f0227c73591a08041c149e710045c281f97b/charset_normalizer-3.4.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:4d86f7aff21ee58f26dcf5ae81a9addbd914115cdebcbb2217e4f0ed8982e146", size = 148039 }, + { url = "https://files.pythonhosted.org/packages/85/e4/65699e8ab3014ecbe6f5c71d1a55d810fb716bbfd74f6283d5c2aa87febf/charset_normalizer-3.4.1-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:09b5e6733cbd160dcc09589227187e242a30a49ca5cefa5a7edd3f9d19ed53fd", size = 151939 }, + { url = "https://files.pythonhosted.org/packages/b1/82/8e9fe624cc5374193de6860aba3ea8070f584c8565ee77c168ec13274bd2/charset_normalizer-3.4.1-cp311-cp311-musllinux_1_2_s390x.whl", hash = 
"sha256:5777ee0881f9499ed0f71cc82cf873d9a0ca8af166dfa0af8ec4e675b7df48e6", size = 149075 }, + { url = "https://files.pythonhosted.org/packages/3d/7b/82865ba54c765560c8433f65e8acb9217cb839a9e32b42af4aa8e945870f/charset_normalizer-3.4.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:237bdbe6159cff53b4f24f397d43c6336c6b0b42affbe857970cefbb620911c8", size = 144340 }, + { url = "https://files.pythonhosted.org/packages/b5/b6/9674a4b7d4d99a0d2df9b215da766ee682718f88055751e1e5e753c82db0/charset_normalizer-3.4.1-cp311-cp311-win32.whl", hash = "sha256:8417cb1f36cc0bc7eaba8ccb0e04d55f0ee52df06df3ad55259b9a323555fc8b", size = 95205 }, + { url = "https://files.pythonhosted.org/packages/1e/ab/45b180e175de4402dcf7547e4fb617283bae54ce35c27930a6f35b6bef15/charset_normalizer-3.4.1-cp311-cp311-win_amd64.whl", hash = "sha256:d7f50a1f8c450f3925cb367d011448c39239bb3eb4117c36a6d354794de4ce76", size = 102441 }, + { url = "https://files.pythonhosted.org/packages/0a/9a/dd1e1cdceb841925b7798369a09279bd1cf183cef0f9ddf15a3a6502ee45/charset_normalizer-3.4.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:73d94b58ec7fecbc7366247d3b0b10a21681004153238750bb67bd9012414545", size = 196105 }, + { url = "https://files.pythonhosted.org/packages/d3/8c/90bfabf8c4809ecb648f39794cf2a84ff2e7d2a6cf159fe68d9a26160467/charset_normalizer-3.4.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dad3e487649f498dd991eeb901125411559b22e8d7ab25d3aeb1af367df5efd7", size = 140404 }, + { url = "https://files.pythonhosted.org/packages/ad/8f/e410d57c721945ea3b4f1a04b74f70ce8fa800d393d72899f0a40526401f/charset_normalizer-3.4.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c30197aa96e8eed02200a83fba2657b4c3acd0f0aa4bdc9f6c1af8e8962e0757", size = 150423 }, + { url = "https://files.pythonhosted.org/packages/f0/b8/e6825e25deb691ff98cf5c9072ee0605dc2acfca98af70c2d1b1bc75190d/charset_normalizer-3.4.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2369eea1ee4a7610a860d88f268eb39b95cb588acd7235e02fd5a5601773d4fa", size = 143184 }, + { url = "https://files.pythonhosted.org/packages/3e/a2/513f6cbe752421f16d969e32f3583762bfd583848b763913ddab8d9bfd4f/charset_normalizer-3.4.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc2722592d8998c870fa4e290c2eec2c1569b87fe58618e67d38b4665dfa680d", size = 145268 }, + { url = "https://files.pythonhosted.org/packages/74/94/8a5277664f27c3c438546f3eb53b33f5b19568eb7424736bdc440a88a31f/charset_normalizer-3.4.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ffc9202a29ab3920fa812879e95a9e78b2465fd10be7fcbd042899695d75e616", size = 147601 }, + { url = "https://files.pythonhosted.org/packages/7c/5f/6d352c51ee763623a98e31194823518e09bfa48be2a7e8383cf691bbb3d0/charset_normalizer-3.4.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:804a4d582ba6e5b747c625bf1255e6b1507465494a40a2130978bda7b932c90b", size = 141098 }, + { url = "https://files.pythonhosted.org/packages/78/d4/f5704cb629ba5ab16d1d3d741396aec6dc3ca2b67757c45b0599bb010478/charset_normalizer-3.4.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:0f55e69f030f7163dffe9fd0752b32f070566451afe180f99dbeeb81f511ad8d", size = 149520 }, + { url = "https://files.pythonhosted.org/packages/c5/96/64120b1d02b81785f222b976c0fb79a35875457fa9bb40827678e54d1bc8/charset_normalizer-3.4.1-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = 
"sha256:c4c3e6da02df6fa1410a7680bd3f63d4f710232d3139089536310d027950696a", size = 152852 }, + { url = "https://files.pythonhosted.org/packages/84/c9/98e3732278a99f47d487fd3468bc60b882920cef29d1fa6ca460a1fdf4e6/charset_normalizer-3.4.1-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:5df196eb874dae23dcfb968c83d4f8fdccb333330fe1fc278ac5ceeb101003a9", size = 150488 }, + { url = "https://files.pythonhosted.org/packages/13/0e/9c8d4cb99c98c1007cc11eda969ebfe837bbbd0acdb4736d228ccaabcd22/charset_normalizer-3.4.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:e358e64305fe12299a08e08978f51fc21fac060dcfcddd95453eabe5b93ed0e1", size = 146192 }, + { url = "https://files.pythonhosted.org/packages/b2/21/2b6b5b860781a0b49427309cb8670785aa543fb2178de875b87b9cc97746/charset_normalizer-3.4.1-cp312-cp312-win32.whl", hash = "sha256:9b23ca7ef998bc739bf6ffc077c2116917eabcc901f88da1b9856b210ef63f35", size = 95550 }, + { url = "https://files.pythonhosted.org/packages/21/5b/1b390b03b1d16c7e382b561c5329f83cc06623916aab983e8ab9239c7d5c/charset_normalizer-3.4.1-cp312-cp312-win_amd64.whl", hash = "sha256:6ff8a4a60c227ad87030d76e99cd1698345d4491638dfa6673027c48b3cd395f", size = 102785 }, + { url = "https://files.pythonhosted.org/packages/38/94/ce8e6f63d18049672c76d07d119304e1e2d7c6098f0841b51c666e9f44a0/charset_normalizer-3.4.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:aabfa34badd18f1da5ec1bc2715cadc8dca465868a4e73a0173466b688f29dda", size = 195698 }, + { url = "https://files.pythonhosted.org/packages/24/2e/dfdd9770664aae179a96561cc6952ff08f9a8cd09a908f259a9dfa063568/charset_normalizer-3.4.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:22e14b5d70560b8dd51ec22863f370d1e595ac3d024cb8ad7d308b4cd95f8313", size = 140162 }, + { url = "https://files.pythonhosted.org/packages/24/4e/f646b9093cff8fc86f2d60af2de4dc17c759de9d554f130b140ea4738ca6/charset_normalizer-3.4.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8436c508b408b82d87dc5f62496973a1805cd46727c34440b0d29d8a2f50a6c9", size = 150263 }, + { url = "https://files.pythonhosted.org/packages/5e/67/2937f8d548c3ef6e2f9aab0f6e21001056f692d43282b165e7c56023e6dd/charset_normalizer-3.4.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2d074908e1aecee37a7635990b2c6d504cd4766c7bc9fc86d63f9c09af3fa11b", size = 142966 }, + { url = "https://files.pythonhosted.org/packages/52/ed/b7f4f07de100bdb95c1756d3a4d17b90c1a3c53715c1a476f8738058e0fa/charset_normalizer-3.4.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:955f8851919303c92343d2f66165294848d57e9bba6cf6e3625485a70a038d11", size = 144992 }, + { url = "https://files.pythonhosted.org/packages/96/2c/d49710a6dbcd3776265f4c923bb73ebe83933dfbaa841c5da850fe0fd20b/charset_normalizer-3.4.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:44ecbf16649486d4aebafeaa7ec4c9fed8b88101f4dd612dcaf65d5e815f837f", size = 147162 }, + { url = "https://files.pythonhosted.org/packages/b4/41/35ff1f9a6bd380303dea55e44c4933b4cc3c4850988927d4082ada230273/charset_normalizer-3.4.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:0924e81d3d5e70f8126529951dac65c1010cdf117bb75eb02dd12339b57749dd", size = 140972 }, + { url = "https://files.pythonhosted.org/packages/fb/43/c6a0b685fe6910d08ba971f62cd9c3e862a85770395ba5d9cad4fede33ab/charset_normalizer-3.4.1-cp313-cp313-musllinux_1_2_i686.whl", hash = 
"sha256:2967f74ad52c3b98de4c3b32e1a44e32975e008a9cd2a8cc8966d6a5218c5cb2", size = 149095 }, + { url = "https://files.pythonhosted.org/packages/4c/ff/a9a504662452e2d2878512115638966e75633519ec11f25fca3d2049a94a/charset_normalizer-3.4.1-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:c75cb2a3e389853835e84a2d8fb2b81a10645b503eca9bcb98df6b5a43eb8886", size = 152668 }, + { url = "https://files.pythonhosted.org/packages/6c/71/189996b6d9a4b932564701628af5cee6716733e9165af1d5e1b285c530ed/charset_normalizer-3.4.1-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:09b26ae6b1abf0d27570633b2b078a2a20419c99d66fb2823173d73f188ce601", size = 150073 }, + { url = "https://files.pythonhosted.org/packages/e4/93/946a86ce20790e11312c87c75ba68d5f6ad2208cfb52b2d6a2c32840d922/charset_normalizer-3.4.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:fa88b843d6e211393a37219e6a1c1df99d35e8fd90446f1118f4216e307e48cd", size = 145732 }, + { url = "https://files.pythonhosted.org/packages/cd/e5/131d2fb1b0dddafc37be4f3a2fa79aa4c037368be9423061dccadfd90091/charset_normalizer-3.4.1-cp313-cp313-win32.whl", hash = "sha256:eb8178fe3dba6450a3e024e95ac49ed3400e506fd4e9e5c32d30adda88cbd407", size = 95391 }, + { url = "https://files.pythonhosted.org/packages/27/f2/4f9a69cc7712b9b5ad8fdb87039fd89abba997ad5cbe690d1835d40405b0/charset_normalizer-3.4.1-cp313-cp313-win_amd64.whl", hash = "sha256:b1ac5992a838106edb89654e0aebfc24f5848ae2547d22c2c3f66454daa11971", size = 102702 }, + { url = "https://files.pythonhosted.org/packages/7f/c0/b913f8f02836ed9ab32ea643c6fe4d3325c3d8627cf6e78098671cafff86/charset_normalizer-3.4.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:b97e690a2118911e39b4042088092771b4ae3fc3aa86518f84b8cf6888dbdb41", size = 197867 }, + { url = "https://files.pythonhosted.org/packages/0f/6c/2bee440303d705b6fb1e2ec789543edec83d32d258299b16eed28aad48e0/charset_normalizer-3.4.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:78baa6d91634dfb69ec52a463534bc0df05dbd546209b79a3880a34487f4b84f", size = 141385 }, + { url = "https://files.pythonhosted.org/packages/3d/04/cb42585f07f6f9fd3219ffb6f37d5a39b4fd2db2355b23683060029c35f7/charset_normalizer-3.4.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1a2bc9f351a75ef49d664206d51f8e5ede9da246602dc2d2726837620ea034b2", size = 151367 }, + { url = "https://files.pythonhosted.org/packages/54/54/2412a5b093acb17f0222de007cc129ec0e0df198b5ad2ce5699355269dfe/charset_normalizer-3.4.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:75832c08354f595c760a804588b9357d34ec00ba1c940c15e31e96d902093770", size = 143928 }, + { url = "https://files.pythonhosted.org/packages/5a/6d/e2773862b043dcf8a221342954f375392bb2ce6487bcd9f2c1b34e1d6781/charset_normalizer-3.4.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0af291f4fe114be0280cdd29d533696a77b5b49cfde5467176ecab32353395c4", size = 146203 }, + { url = "https://files.pythonhosted.org/packages/b9/f8/ca440ef60d8f8916022859885f231abb07ada3c347c03d63f283bec32ef5/charset_normalizer-3.4.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0167ddc8ab6508fe81860a57dd472b2ef4060e8d378f0cc555707126830f2537", size = 148082 }, + { url = "https://files.pythonhosted.org/packages/04/d2/42fd330901aaa4b805a1097856c2edf5095e260a597f65def493f4b8c833/charset_normalizer-3.4.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:2a75d49014d118e4198bcee5ee0a6f25856b29b12dbf7cd012791f8a6cc5c496", 
size = 142053 }, + { url = "https://files.pythonhosted.org/packages/9e/af/3a97a4fa3c53586f1910dadfc916e9c4f35eeada36de4108f5096cb7215f/charset_normalizer-3.4.1-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:363e2f92b0f0174b2f8238240a1a30142e3db7b957a5dd5689b0e75fb717cc78", size = 150625 }, + { url = "https://files.pythonhosted.org/packages/26/ae/23d6041322a3556e4da139663d02fb1b3c59a23ab2e2b56432bd2ad63ded/charset_normalizer-3.4.1-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:ab36c8eb7e454e34e60eb55ca5d241a5d18b2c6244f6827a30e451c42410b5f7", size = 153549 }, + { url = "https://files.pythonhosted.org/packages/94/22/b8f2081c6a77cb20d97e57e0b385b481887aa08019d2459dc2858ed64871/charset_normalizer-3.4.1-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:4c0907b1928a36d5a998d72d64d8eaa7244989f7aaaf947500d3a800c83a3fd6", size = 150945 }, + { url = "https://files.pythonhosted.org/packages/c7/0b/c5ec5092747f801b8b093cdf5610e732b809d6cb11f4c51e35fc28d1d389/charset_normalizer-3.4.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:04432ad9479fa40ec0f387795ddad4437a2b50417c69fa275e212933519ff294", size = 146595 }, + { url = "https://files.pythonhosted.org/packages/0c/5a/0b59704c38470df6768aa154cc87b1ac7c9bb687990a1559dc8765e8627e/charset_normalizer-3.4.1-cp39-cp39-win32.whl", hash = "sha256:3bed14e9c89dcb10e8f3a29f9ccac4955aebe93c71ae803af79265c9ca5644c5", size = 95453 }, + { url = "https://files.pythonhosted.org/packages/85/2d/a9790237cb4d01a6d57afadc8573c8b73c609ade20b80f4cda30802009ee/charset_normalizer-3.4.1-cp39-cp39-win_amd64.whl", hash = "sha256:49402233c892a461407c512a19435d1ce275543138294f7ef013f0b63d5d3765", size = 102811 }, + { url = "https://files.pythonhosted.org/packages/0e/f6/65ecc6878a89bb1c23a086ea335ad4bf21a588990c3f535a227b9eea9108/charset_normalizer-3.4.1-py3-none-any.whl", hash = "sha256:d98b1668f06378c6dbefec3b92299716b931cd4e6061f3c875a71ced1780ab85", size = 49767 }, +] + +[[package]] +name = "click" +version = "8.1.8" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b9/2e/0090cbf739cee7d23781ad4b89a9894a41538e4fcf4c31dcdd705b78eb8b/click-8.1.8.tar.gz", hash = "sha256:ed53c9d8990d83c2a27deae68e4ee337473f6330c040a31d4225c9574d16096a", size = 226593 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7e/d4/7ebdbd03970677812aac39c869717059dbb71a4cfc033ca6e5221787892c/click-8.1.8-py3-none-any.whl", hash = "sha256:63c132bbbed01578a06712a2d1f497bb62d9c1c0d329b7903a866228027263b2", size = 98188 }, +] + +[[package]] +name = "cloudpickle" +version = "3.1.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/52/39/069100b84d7418bc358d81669d5748efb14b9cceacd2f9c75f550424132f/cloudpickle-3.1.1.tar.gz", hash = "sha256:b216fa8ae4019d5482a8ac3c95d8f6346115d8835911fd4aefd1a445e4242c64", size = 22113 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7e/e8/64c37fadfc2816a7701fa8a6ed8d87327c7d54eacfbfb6edab14a2f2be75/cloudpickle-3.1.1-py3-none-any.whl", hash = "sha256:c8c5a44295039331ee9dad40ba100a9c7297b6f988e50e87ccdf3765a668350e", size = 20992 }, +] + +[[package]] +name = "colorama" +version = "0.4.6" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = 
"sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335 }, +] + +[[package]] +name = "compressed-tensors" +version = "0.9.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pydantic" }, + { name = "torch" }, + { name = "transformers" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/9a/58/e7cab9a2c423066b139e6fd5bd2f1979c88fe0cf6dabbee631917288b79e/compressed-tensors-0.9.0.tar.gz", hash = "sha256:80f8b002309b5970493e578d58542a308111fbef26f96811fa2c5322b24e7f1f", size = 63058 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/18/48/9a733034c68d22708f01094228f0642789ed94a2762612c227184cfd90e6/compressed_tensors-0.9.0-py3-none-any.whl", hash = "sha256:c4a0bccf2fd180a18a4bf0646f7746df4ea8ea537c06061f4b571662debf4424", size = 96438 }, +] + +[[package]] +name = "datasets" +version = "2.21.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "aiohttp" }, + { name = "dill" }, + { name = "filelock" }, + { name = "fsspec", extra = ["http"] }, + { name = "huggingface-hub" }, + { name = "multiprocess" }, + { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, + { name = "numpy", version = "2.2.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "packaging" }, + { name = "pandas" }, + { name = "pyarrow" }, + { name = "pyyaml" }, + { name = "requests" }, + { name = "tqdm" }, + { name = "xxhash" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/e5/a5/38719e5cff7aa0537a6be37d21cc1fdd7096e9565e8fce2d46a822e10b5b/datasets-2.21.0.tar.gz", hash = "sha256:998f85a8460f1bd982e5bd058f8a0808eef424249e3df1e8cdd594ccd0dc8ba2", size = 2215317 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/72/b3/33c4ad44fa020e3757e9b2fad8a5de53d9079b501e6bbc45bdd18f82f893/datasets-2.21.0-py3-none-any.whl", hash = "sha256:25e4e097110ce28824b746a107727ada94024cba11db8bc588d468414692b65a", size = 527251 }, +] + +[[package]] +name = "deprecated" +version = "1.2.15" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "wrapt" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/2e/a3/53e7d78a6850ffdd394d7048a31a6f14e44900adedf190f9a165f6b69439/deprecated-1.2.15.tar.gz", hash = "sha256:683e561a90de76239796e6b6feac66b99030d2dd3fcf61ef996330f14bbb9b0d", size = 2977612 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/1d/8f/c7f227eb42cfeaddce3eb0c96c60cbca37797fa7b34f8e1aeadf6c5c0983/Deprecated-1.2.15-py2.py3-none-any.whl", hash = "sha256:353bc4a8ac4bfc96800ddab349d89c25dec1079f65fd53acdcc1e0b975b21320", size = 9941 }, +] + +[[package]] +name = "dill" +version = "0.3.8" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/17/4d/ac7ffa80c69ea1df30a8aa11b3578692a5118e7cd1aa157e3ef73b092d15/dill-0.3.8.tar.gz", hash = "sha256:3ebe3c479ad625c4553aca177444d89b486b1d84982eeacded644afc0cf797ca", size = 184847 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c9/7a/cef76fd8438a42f96db64ddaa85280485a9c395e7df3db8158cfec1eee34/dill-0.3.8-py3-none-any.whl", hash = 
"sha256:c36ca9ffb54365bdd2f8eb3eff7d2a21237f8452b57ace88b1ac615b7e815bd7", size = 116252 }, +] + +[[package]] +name = "diskcache" +version = "5.6.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/3f/21/1c1ffc1a039ddcc459db43cc108658f32c57d271d7289a2794e401d0fdb6/diskcache-5.6.3.tar.gz", hash = "sha256:2c3a3fa2743d8535d832ec61c2054a1641f41775aa7c556758a109941e33e4fc", size = 67916 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3f/27/4570e78fc0bf5ea0ca45eb1de3818a23787af9b390c0b0a0033a1b8236f9/diskcache-5.6.3-py3-none-any.whl", hash = "sha256:5e31b2d5fbad117cc363ebaf6b689474db18a1f6438bc82358b024abd4c2ca19", size = 45550 }, +] + +[[package]] +name = "einops" +version = "0.8.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/79/ca/9f5dcb8bead39959454c3912266bedc4c315839cee0e0ca9f4328f4588c1/einops-0.8.0.tar.gz", hash = "sha256:63486517fed345712a8385c100cb279108d9d47e6ae59099b07657e983deae85", size = 58861 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/44/5a/f0b9ad6c0a9017e62d4735daaeb11ba3b6c009d69a26141b258cd37b5588/einops-0.8.0-py3-none-any.whl", hash = "sha256:9572fb63046264a862693b0a87088af3bdc8c068fde03de63453cbbde245465f", size = 43223 }, +] + +[[package]] +name = "exceptiongroup" +version = "1.2.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/09/35/2495c4ac46b980e4ca1f6ad6db102322ef3ad2410b79fdde159a4b0f3b92/exceptiongroup-1.2.2.tar.gz", hash = "sha256:47c2edf7c6738fafb49fd34290706d1a1a2f4d1c6df275526b62cbb4aa5393cc", size = 28883 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/02/cc/b7e31358aac6ed1ef2bb790a9746ac2c69bcb3c8588b41616914eb106eaf/exceptiongroup-1.2.2-py3-none-any.whl", hash = "sha256:3111b9d131c238bec2f8f516e123e14ba243563fb135d3fe885990585aa7795b", size = 16453 }, +] + +[[package]] +name = "filelock" +version = "3.16.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/9d/db/3ef5bb276dae18d6ec2124224403d1d67bccdbefc17af4cc8f553e341ab1/filelock-3.16.1.tar.gz", hash = "sha256:c249fbfcd5db47e5e2d6d62198e565475ee65e4831e2561c8e313fa7eb961435", size = 18037 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b9/f8/feced7779d755758a52d1f6635d990b8d98dc0a29fa568bbe0625f18fdf3/filelock-3.16.1-py3-none-any.whl", hash = "sha256:2082e5703d51fbf98ea75855d9d5527e33d8ff23099bec374a134febee6946b0", size = 16163 }, +] + +[[package]] +name = "frozenlist" +version = "1.5.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/8f/ed/0f4cec13a93c02c47ec32d81d11c0c1efbadf4a471e3f3ce7cad366cbbd3/frozenlist-1.5.0.tar.gz", hash = "sha256:81d5af29e61b9c8348e876d442253723928dce6433e0e76cd925cd83f1b4b817", size = 39930 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/54/79/29d44c4af36b2b240725dce566b20f63f9b36ef267aaaa64ee7466f4f2f8/frozenlist-1.5.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:5b6a66c18b5b9dd261ca98dffcb826a525334b2f29e7caa54e182255c5f6a65a", size = 94451 }, + { url = "https://files.pythonhosted.org/packages/47/47/0c999aeace6ead8a44441b4f4173e2261b18219e4ad1fe9a479871ca02fc/frozenlist-1.5.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d1b3eb7b05ea246510b43a7e53ed1653e55c2121019a97e60cad7efb881a97bb", size = 54301 }, + { url = 
"https://files.pythonhosted.org/packages/8d/60/107a38c1e54176d12e06e9d4b5d755b677d71d1219217cee063911b1384f/frozenlist-1.5.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:15538c0cbf0e4fa11d1e3a71f823524b0c46299aed6e10ebb4c2089abd8c3bec", size = 52213 }, + { url = "https://files.pythonhosted.org/packages/17/62/594a6829ac5679c25755362a9dc93486a8a45241394564309641425d3ff6/frozenlist-1.5.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e79225373c317ff1e35f210dd5f1344ff31066ba8067c307ab60254cd3a78ad5", size = 240946 }, + { url = "https://files.pythonhosted.org/packages/7e/75/6c8419d8f92c80dd0ee3f63bdde2702ce6398b0ac8410ff459f9b6f2f9cb/frozenlist-1.5.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9272fa73ca71266702c4c3e2d4a28553ea03418e591e377a03b8e3659d94fa76", size = 264608 }, + { url = "https://files.pythonhosted.org/packages/88/3e/82a6f0b84bc6fb7e0be240e52863c6d4ab6098cd62e4f5b972cd31e002e8/frozenlist-1.5.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:498524025a5b8ba81695761d78c8dd7382ac0b052f34e66939c42df860b8ff17", size = 261361 }, + { url = "https://files.pythonhosted.org/packages/fd/85/14e5f9ccac1b64ff2f10c927b3ffdf88772aea875882406f9ba0cec8ad84/frozenlist-1.5.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:92b5278ed9d50fe610185ecd23c55d8b307d75ca18e94c0e7de328089ac5dcba", size = 231649 }, + { url = "https://files.pythonhosted.org/packages/ee/59/928322800306f6529d1852323014ee9008551e9bb027cc38d276cbc0b0e7/frozenlist-1.5.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7f3c8c1dacd037df16e85227bac13cca58c30da836c6f936ba1df0c05d046d8d", size = 241853 }, + { url = "https://files.pythonhosted.org/packages/7d/bd/e01fa4f146a6f6c18c5d34cab8abdc4013774a26c4ff851128cd1bd3008e/frozenlist-1.5.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:f2ac49a9bedb996086057b75bf93538240538c6d9b38e57c82d51f75a73409d2", size = 243652 }, + { url = "https://files.pythonhosted.org/packages/a5/bd/e4771fd18a8ec6757033f0fa903e447aecc3fbba54e3630397b61596acf0/frozenlist-1.5.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:e66cc454f97053b79c2ab09c17fbe3c825ea6b4de20baf1be28919460dd7877f", size = 241734 }, + { url = "https://files.pythonhosted.org/packages/21/13/c83821fa5544af4f60c5d3a65d054af3213c26b14d3f5f48e43e5fb48556/frozenlist-1.5.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:5a3ba5f9a0dfed20337d3e966dc359784c9f96503674c2faf015f7fe8e96798c", size = 260959 }, + { url = "https://files.pythonhosted.org/packages/71/f3/1f91c9a9bf7ed0e8edcf52698d23f3c211d8d00291a53c9f115ceb977ab1/frozenlist-1.5.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:6321899477db90bdeb9299ac3627a6a53c7399c8cd58d25da094007402b039ab", size = 262706 }, + { url = "https://files.pythonhosted.org/packages/4c/22/4a256fdf5d9bcb3ae32622c796ee5ff9451b3a13a68cfe3f68e2c95588ce/frozenlist-1.5.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:76e4753701248476e6286f2ef492af900ea67d9706a0155335a40ea21bf3b2f5", size = 250401 }, + { url = "https://files.pythonhosted.org/packages/af/89/c48ebe1f7991bd2be6d5f4ed202d94960c01b3017a03d6954dd5fa9ea1e8/frozenlist-1.5.0-cp310-cp310-win32.whl", hash = "sha256:977701c081c0241d0955c9586ffdd9ce44f7a7795df39b9151cd9a6fd0ce4cfb", size = 45498 }, + { url = 
"https://files.pythonhosted.org/packages/28/2f/cc27d5f43e023d21fe5c19538e08894db3d7e081cbf582ad5ed366c24446/frozenlist-1.5.0-cp310-cp310-win_amd64.whl", hash = "sha256:189f03b53e64144f90990d29a27ec4f7997d91ed3d01b51fa39d2dbe77540fd4", size = 51622 }, + { url = "https://files.pythonhosted.org/packages/79/43/0bed28bf5eb1c9e4301003b74453b8e7aa85fb293b31dde352aac528dafc/frozenlist-1.5.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:fd74520371c3c4175142d02a976aee0b4cb4a7cc912a60586ffd8d5929979b30", size = 94987 }, + { url = "https://files.pythonhosted.org/packages/bb/bf/b74e38f09a246e8abbe1e90eb65787ed745ccab6eaa58b9c9308e052323d/frozenlist-1.5.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:2f3f7a0fbc219fb4455264cae4d9f01ad41ae6ee8524500f381de64ffaa077d5", size = 54584 }, + { url = "https://files.pythonhosted.org/packages/2c/31/ab01375682f14f7613a1ade30149f684c84f9b8823a4391ed950c8285656/frozenlist-1.5.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f47c9c9028f55a04ac254346e92977bf0f166c483c74b4232bee19a6697e4778", size = 52499 }, + { url = "https://files.pythonhosted.org/packages/98/a8/d0ac0b9276e1404f58fec3ab6e90a4f76b778a49373ccaf6a563f100dfbc/frozenlist-1.5.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0996c66760924da6e88922756d99b47512a71cfd45215f3570bf1e0b694c206a", size = 276357 }, + { url = "https://files.pythonhosted.org/packages/ad/c9/c7761084fa822f07dac38ac29f841d4587570dd211e2262544aa0b791d21/frozenlist-1.5.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a2fe128eb4edeabe11896cb6af88fca5346059f6c8d807e3b910069f39157869", size = 287516 }, + { url = "https://files.pythonhosted.org/packages/a1/ff/cd7479e703c39df7bdab431798cef89dc75010d8aa0ca2514c5b9321db27/frozenlist-1.5.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1a8ea951bbb6cacd492e3948b8da8c502a3f814f5d20935aae74b5df2b19cf3d", size = 283131 }, + { url = "https://files.pythonhosted.org/packages/59/a0/370941beb47d237eca4fbf27e4e91389fd68699e6f4b0ebcc95da463835b/frozenlist-1.5.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:de537c11e4aa01d37db0d403b57bd6f0546e71a82347a97c6a9f0dcc532b3a45", size = 261320 }, + { url = "https://files.pythonhosted.org/packages/b8/5f/c10123e8d64867bc9b4f2f510a32042a306ff5fcd7e2e09e5ae5100ee333/frozenlist-1.5.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9c2623347b933fcb9095841f1cc5d4ff0b278addd743e0e966cb3d460278840d", size = 274877 }, + { url = "https://files.pythonhosted.org/packages/fa/79/38c505601ae29d4348f21706c5d89755ceded02a745016ba2f58bd5f1ea6/frozenlist-1.5.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:cee6798eaf8b1416ef6909b06f7dc04b60755206bddc599f52232606e18179d3", size = 269592 }, + { url = "https://files.pythonhosted.org/packages/19/e2/39f3a53191b8204ba9f0bb574b926b73dd2efba2a2b9d2d730517e8f7622/frozenlist-1.5.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:f5f9da7f5dbc00a604fe74aa02ae7c98bcede8a3b8b9666f9f86fc13993bc71a", size = 265934 }, + { url = "https://files.pythonhosted.org/packages/d5/c9/3075eb7f7f3a91f1a6b00284af4de0a65a9ae47084930916f5528144c9dd/frozenlist-1.5.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:90646abbc7a5d5c7c19461d2e3eeb76eb0b204919e6ece342feb6032c9325ae9", size = 283859 }, + { url = 
"https://files.pythonhosted.org/packages/05/f5/549f44d314c29408b962fa2b0e69a1a67c59379fb143b92a0a065ffd1f0f/frozenlist-1.5.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:bdac3c7d9b705d253b2ce370fde941836a5f8b3c5c2b8fd70940a3ea3af7f4f2", size = 287560 }, + { url = "https://files.pythonhosted.org/packages/9d/f8/cb09b3c24a3eac02c4c07a9558e11e9e244fb02bf62c85ac2106d1eb0c0b/frozenlist-1.5.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:03d33c2ddbc1816237a67f66336616416e2bbb6beb306e5f890f2eb22b959cdf", size = 277150 }, + { url = "https://files.pythonhosted.org/packages/37/48/38c2db3f54d1501e692d6fe058f45b6ad1b358d82cd19436efab80cfc965/frozenlist-1.5.0-cp311-cp311-win32.whl", hash = "sha256:237f6b23ee0f44066219dae14c70ae38a63f0440ce6750f868ee08775073f942", size = 45244 }, + { url = "https://files.pythonhosted.org/packages/ca/8c/2ddffeb8b60a4bce3b196c32fcc30d8830d4615e7b492ec2071da801b8ad/frozenlist-1.5.0-cp311-cp311-win_amd64.whl", hash = "sha256:0cc974cc93d32c42e7b0f6cf242a6bd941c57c61b618e78b6c0a96cb72788c1d", size = 51634 }, + { url = "https://files.pythonhosted.org/packages/79/73/fa6d1a96ab7fd6e6d1c3500700963eab46813847f01ef0ccbaa726181dd5/frozenlist-1.5.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:31115ba75889723431aa9a4e77d5f398f5cf976eea3bdf61749731f62d4a4a21", size = 94026 }, + { url = "https://files.pythonhosted.org/packages/ab/04/ea8bf62c8868b8eada363f20ff1b647cf2e93377a7b284d36062d21d81d1/frozenlist-1.5.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:7437601c4d89d070eac8323f121fcf25f88674627505334654fd027b091db09d", size = 54150 }, + { url = "https://files.pythonhosted.org/packages/d0/9a/8e479b482a6f2070b26bda572c5e6889bb3ba48977e81beea35b5ae13ece/frozenlist-1.5.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:7948140d9f8ece1745be806f2bfdf390127cf1a763b925c4a805c603df5e697e", size = 51927 }, + { url = "https://files.pythonhosted.org/packages/e3/12/2aad87deb08a4e7ccfb33600871bbe8f0e08cb6d8224371387f3303654d7/frozenlist-1.5.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:feeb64bc9bcc6b45c6311c9e9b99406660a9c05ca8a5b30d14a78555088b0b3a", size = 282647 }, + { url = "https://files.pythonhosted.org/packages/77/f2/07f06b05d8a427ea0060a9cef6e63405ea9e0d761846b95ef3fb3be57111/frozenlist-1.5.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:683173d371daad49cffb8309779e886e59c2f369430ad28fe715f66d08d4ab1a", size = 289052 }, + { url = "https://files.pythonhosted.org/packages/bd/9f/8bf45a2f1cd4aa401acd271b077989c9267ae8463e7c8b1eb0d3f561b65e/frozenlist-1.5.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7d57d8f702221405a9d9b40f9da8ac2e4a1a8b5285aac6100f3393675f0a85ee", size = 291719 }, + { url = "https://files.pythonhosted.org/packages/41/d1/1f20fd05a6c42d3868709b7604c9f15538a29e4f734c694c6bcfc3d3b935/frozenlist-1.5.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:30c72000fbcc35b129cb09956836c7d7abf78ab5416595e4857d1cae8d6251a6", size = 267433 }, + { url = "https://files.pythonhosted.org/packages/af/f2/64b73a9bb86f5a89fb55450e97cd5c1f84a862d4ff90d9fd1a73ab0f64a5/frozenlist-1.5.0-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:000a77d6034fbad9b6bb880f7ec073027908f1b40254b5d6f26210d2dab1240e", size = 283591 }, + { url = 
"https://files.pythonhosted.org/packages/29/e2/ffbb1fae55a791fd6c2938dd9ea779509c977435ba3940b9f2e8dc9d5316/frozenlist-1.5.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:5d7f5a50342475962eb18b740f3beecc685a15b52c91f7d975257e13e029eca9", size = 273249 }, + { url = "https://files.pythonhosted.org/packages/2e/6e/008136a30798bb63618a114b9321b5971172a5abddff44a100c7edc5ad4f/frozenlist-1.5.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:87f724d055eb4785d9be84e9ebf0f24e392ddfad00b3fe036e43f489fafc9039", size = 271075 }, + { url = "https://files.pythonhosted.org/packages/ae/f0/4e71e54a026b06724cec9b6c54f0b13a4e9e298cc8db0f82ec70e151f5ce/frozenlist-1.5.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:6e9080bb2fb195a046e5177f10d9d82b8a204c0736a97a153c2466127de87784", size = 285398 }, + { url = "https://files.pythonhosted.org/packages/4d/36/70ec246851478b1c0b59f11ef8ade9c482ff447c1363c2bd5fad45098b12/frozenlist-1.5.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:9b93d7aaa36c966fa42efcaf716e6b3900438632a626fb09c049f6a2f09fc631", size = 294445 }, + { url = "https://files.pythonhosted.org/packages/37/e0/47f87544055b3349b633a03c4d94b405956cf2437f4ab46d0928b74b7526/frozenlist-1.5.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:52ef692a4bc60a6dd57f507429636c2af8b6046db8b31b18dac02cbc8f507f7f", size = 280569 }, + { url = "https://files.pythonhosted.org/packages/f9/7c/490133c160fb6b84ed374c266f42800e33b50c3bbab1652764e6e1fc498a/frozenlist-1.5.0-cp312-cp312-win32.whl", hash = "sha256:29d94c256679247b33a3dc96cce0f93cbc69c23bf75ff715919332fdbb6a32b8", size = 44721 }, + { url = "https://files.pythonhosted.org/packages/b1/56/4e45136ffc6bdbfa68c29ca56ef53783ef4c2fd395f7cbf99a2624aa9aaa/frozenlist-1.5.0-cp312-cp312-win_amd64.whl", hash = "sha256:8969190d709e7c48ea386db202d708eb94bdb29207a1f269bab1196ce0dcca1f", size = 51329 }, + { url = "https://files.pythonhosted.org/packages/da/3b/915f0bca8a7ea04483622e84a9bd90033bab54bdf485479556c74fd5eaf5/frozenlist-1.5.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:7a1a048f9215c90973402e26c01d1cff8a209e1f1b53f72b95c13db61b00f953", size = 91538 }, + { url = "https://files.pythonhosted.org/packages/c7/d1/a7c98aad7e44afe5306a2b068434a5830f1470675f0e715abb86eb15f15b/frozenlist-1.5.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:dd47a5181ce5fcb463b5d9e17ecfdb02b678cca31280639255ce9d0e5aa67af0", size = 52849 }, + { url = "https://files.pythonhosted.org/packages/3a/c8/76f23bf9ab15d5f760eb48701909645f686f9c64fbb8982674c241fbef14/frozenlist-1.5.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:1431d60b36d15cda188ea222033eec8e0eab488f39a272461f2e6d9e1a8e63c2", size = 50583 }, + { url = "https://files.pythonhosted.org/packages/1f/22/462a3dd093d11df623179d7754a3b3269de3b42de2808cddef50ee0f4f48/frozenlist-1.5.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6482a5851f5d72767fbd0e507e80737f9c8646ae7fd303def99bfe813f76cf7f", size = 265636 }, + { url = "https://files.pythonhosted.org/packages/80/cf/e075e407fc2ae7328155a1cd7e22f932773c8073c1fc78016607d19cc3e5/frozenlist-1.5.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:44c49271a937625619e862baacbd037a7ef86dd1ee215afc298a417ff3270608", size = 270214 }, + { url = "https://files.pythonhosted.org/packages/a1/58/0642d061d5de779f39c50cbb00df49682832923f3d2ebfb0fedf02d05f7f/frozenlist-1.5.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:12f78f98c2f1c2429d42e6a485f433722b0061d5c0b0139efa64f396efb5886b", size = 273905 }, + { url = "https://files.pythonhosted.org/packages/ab/66/3fe0f5f8f2add5b4ab7aa4e199f767fd3b55da26e3ca4ce2cc36698e50c4/frozenlist-1.5.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ce3aa154c452d2467487765e3adc730a8c153af77ad84096bc19ce19a2400840", size = 250542 }, + { url = "https://files.pythonhosted.org/packages/f6/b8/260791bde9198c87a465224e0e2bb62c4e716f5d198fc3a1dacc4895dbd1/frozenlist-1.5.0-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9b7dc0c4338e6b8b091e8faf0db3168a37101943e687f373dce00959583f7439", size = 267026 }, + { url = "https://files.pythonhosted.org/packages/2e/a4/3d24f88c527f08f8d44ade24eaee83b2627793fa62fa07cbb7ff7a2f7d42/frozenlist-1.5.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:45e0896250900b5aa25180f9aec243e84e92ac84bd4a74d9ad4138ef3f5c97de", size = 257690 }, + { url = "https://files.pythonhosted.org/packages/de/9a/d311d660420b2beeff3459b6626f2ab4fb236d07afbdac034a4371fe696e/frozenlist-1.5.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:561eb1c9579d495fddb6da8959fd2a1fca2c6d060d4113f5844b433fc02f2641", size = 253893 }, + { url = "https://files.pythonhosted.org/packages/c6/23/e491aadc25b56eabd0f18c53bb19f3cdc6de30b2129ee0bc39cd387cd560/frozenlist-1.5.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:df6e2f325bfee1f49f81aaac97d2aa757c7646534a06f8f577ce184afe2f0a9e", size = 267006 }, + { url = "https://files.pythonhosted.org/packages/08/c4/ab918ce636a35fb974d13d666dcbe03969592aeca6c3ab3835acff01f79c/frozenlist-1.5.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:140228863501b44b809fb39ec56b5d4071f4d0aa6d216c19cbb08b8c5a7eadb9", size = 276157 }, + { url = "https://files.pythonhosted.org/packages/c0/29/3b7a0bbbbe5a34833ba26f686aabfe982924adbdcafdc294a7a129c31688/frozenlist-1.5.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:7707a25d6a77f5d27ea7dc7d1fc608aa0a478193823f88511ef5e6b8a48f9d03", size = 264642 }, + { url = "https://files.pythonhosted.org/packages/ab/42/0595b3dbffc2e82d7fe658c12d5a5bafcd7516c6bf2d1d1feb5387caa9c1/frozenlist-1.5.0-cp313-cp313-win32.whl", hash = "sha256:31a9ac2b38ab9b5a8933b693db4939764ad3f299fcaa931a3e605bc3460e693c", size = 44914 }, + { url = "https://files.pythonhosted.org/packages/17/c4/b7db1206a3fea44bf3b838ca61deb6f74424a8a5db1dd53ecb21da669be6/frozenlist-1.5.0-cp313-cp313-win_amd64.whl", hash = "sha256:11aabdd62b8b9c4b84081a3c246506d1cddd2dd93ff0ad53ede5defec7886b28", size = 51167 }, + { url = "https://files.pythonhosted.org/packages/da/4d/d94ff0fb0f5313902c132817c62d19cdc5bdcd0c195d392006ef4b779fc6/frozenlist-1.5.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:9bbcdfaf4af7ce002694a4e10a0159d5a8d20056a12b05b45cea944a4953f972", size = 95319 }, + { url = "https://files.pythonhosted.org/packages/8c/1b/d90e554ca2b483d31cb2296e393f72c25bdc38d64526579e95576bfda587/frozenlist-1.5.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:1893f948bf6681733aaccf36c5232c231e3b5166d607c5fa77773611df6dc336", size = 54749 }, + { url = "https://files.pythonhosted.org/packages/f8/66/7fdecc9ef49f8db2aa4d9da916e4ecf357d867d87aea292efc11e1b2e932/frozenlist-1.5.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:2b5e23253bb709ef57a8e95e6ae48daa9ac5f265637529e4ce6b003a37b2621f", size = 52718 }, + { url = 
"https://files.pythonhosted.org/packages/08/04/e2fddc92135276e07addbc1cf413acffa0c2d848b3e54cacf684e146df49/frozenlist-1.5.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0f253985bb515ecd89629db13cb58d702035ecd8cfbca7d7a7e29a0e6d39af5f", size = 241756 }, + { url = "https://files.pythonhosted.org/packages/c6/52/be5ff200815d8a341aee5b16b6b707355e0ca3652953852238eb92b120c2/frozenlist-1.5.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:04a5c6babd5e8fb7d3c871dc8b321166b80e41b637c31a995ed844a6139942b6", size = 267718 }, + { url = "https://files.pythonhosted.org/packages/88/be/4bd93a58be57a3722fc544c36debdf9dcc6758f761092e894d78f18b8f20/frozenlist-1.5.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a9fe0f1c29ba24ba6ff6abf688cb0b7cf1efab6b6aa6adc55441773c252f7411", size = 263494 }, + { url = "https://files.pythonhosted.org/packages/32/ba/58348b90193caa096ce9e9befea6ae67f38dabfd3aacb47e46137a6250a8/frozenlist-1.5.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:226d72559fa19babe2ccd920273e767c96a49b9d3d38badd7c91a0fdeda8ea08", size = 232838 }, + { url = "https://files.pythonhosted.org/packages/f6/33/9f152105227630246135188901373c4f322cc026565ca6215b063f4c82f4/frozenlist-1.5.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:15b731db116ab3aedec558573c1a5eec78822b32292fe4f2f0345b7f697745c2", size = 242912 }, + { url = "https://files.pythonhosted.org/packages/a0/10/3db38fb3ccbafadd80a1b0d6800c987b0e3fe3ef2d117c6ced0246eea17a/frozenlist-1.5.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:366d8f93e3edfe5a918c874702f78faac300209a4d5bf38352b2c1bdc07a766d", size = 244763 }, + { url = "https://files.pythonhosted.org/packages/e2/cd/1df468fdce2f66a4608dffe44c40cdc35eeaa67ef7fd1d813f99a9a37842/frozenlist-1.5.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:1b96af8c582b94d381a1c1f51ffaedeb77c821c690ea5f01da3d70a487dd0a9b", size = 242841 }, + { url = "https://files.pythonhosted.org/packages/ee/5f/16097a5ca0bb6b6779c02cc9379c72fe98d56115d4c54d059fb233168fb6/frozenlist-1.5.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:c03eff4a41bd4e38415cbed054bbaff4a075b093e2394b6915dca34a40d1e38b", size = 263407 }, + { url = "https://files.pythonhosted.org/packages/0f/f7/58cd220ee1c2248ee65a32f5b4b93689e3fe1764d85537eee9fc392543bc/frozenlist-1.5.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:50cf5e7ee9b98f22bdecbabf3800ae78ddcc26e4a435515fc72d97903e8488e0", size = 265083 }, + { url = "https://files.pythonhosted.org/packages/62/b8/49768980caabf81ac4a2d156008f7cbd0107e6b36d08a313bb31035d9201/frozenlist-1.5.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:1e76bfbc72353269c44e0bc2cfe171900fbf7f722ad74c9a7b638052afe6a00c", size = 251564 }, + { url = "https://files.pythonhosted.org/packages/cb/83/619327da3b86ef957ee7a0cbf3c166a09ed1e87a3f7f1ff487d7d0284683/frozenlist-1.5.0-cp39-cp39-win32.whl", hash = "sha256:666534d15ba8f0fda3f53969117383d5dc021266b3c1a42c9ec4855e4b58b9d3", size = 45691 }, + { url = "https://files.pythonhosted.org/packages/8b/28/407bc34a745151ed2322c690b6e7d83d7101472e81ed76e1ebdac0b70a78/frozenlist-1.5.0-cp39-cp39-win_amd64.whl", hash = "sha256:5c28f4b5dbef8a0d8aad0d4de24d1e9e981728628afaf4ea0792f5d0939372f0", size = 51767 }, + { url = "https://files.pythonhosted.org/packages/c6/c8/a5be5b7550c10858fcf9b0ea054baccab474da77d37f1e828ce043a3a5d4/frozenlist-1.5.0-py3-none-any.whl", hash 
= "sha256:d994863bba198a4a518b467bb971c56e1db3f180a25c6cf7bb1949c267f748c3", size = 11901 }, +] + +[[package]] +name = "fsspec" +version = "2024.6.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/90/b6/eba5024a9889fcfff396db543a34bef0ab9d002278f163129f9f01005960/fsspec-2024.6.1.tar.gz", hash = "sha256:fad7d7e209dd4c1208e3bbfda706620e0da5142bebbd9c384afb95b07e798e49", size = 284584 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/5e/44/73bea497ac69bafde2ee4269292fa3b41f1198f4bb7bbaaabde30ad29d4a/fsspec-2024.6.1-py3-none-any.whl", hash = "sha256:3cb443f8bcd2efb31295a5b9fdb02aee81d8452c80d28f97a6d0959e6cee101e", size = 177561 }, +] + +[package.optional-dependencies] +http = [ + { name = "aiohttp" }, +] + +[[package]] +name = "googleapis-common-protos" +version = "1.66.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "protobuf" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ff/a7/8e9cccdb1c49870de6faea2a2764fa23f627dd290633103540209f03524c/googleapis_common_protos-1.66.0.tar.gz", hash = "sha256:c3e7b33d15fdca5374cc0a7346dd92ffa847425cc4ea941d970f13680052ec8c", size = 114376 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a0/0f/c0713fb2b3d28af4b2fded3291df1c4d4f79a00d15c2374a9e010870016c/googleapis_common_protos-1.66.0-py2.py3-none-any.whl", hash = "sha256:d7abcd75fabb2e0ec9f74466401f6c119a0b498e27370e9be4c94cb7e382b8ed", size = 221682 }, +] + +[[package]] +name = "grpc-interceptor" +version = "0.15.4" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "grpcio" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/9f/28/57449d5567adf4c1d3e216aaca545913fbc21a915f2da6790d6734aac76e/grpc-interceptor-0.15.4.tar.gz", hash = "sha256:1f45c0bcb58b6f332f37c637632247c9b02bc6af0fdceb7ba7ce8d2ebbfb0926", size = 19322 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/15/ac/8d53f230a7443401ce81791ec50a3b0e54924bf615ad287654fa4a2f5cdc/grpc_interceptor-0.15.4-py3-none-any.whl", hash = "sha256:0035f33228693ed3767ee49d937bac424318db173fef4d2d0170b3215f254d9d", size = 20848 }, +] + +[[package]] +name = "grpcio" +version = "1.69.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/e4/87/06a145284cbe86c91ca517fe6b57be5efbb733c0d6374b407f0992054d18/grpcio-1.69.0.tar.gz", hash = "sha256:936fa44241b5379c5afc344e1260d467bee495747eaf478de825bab2791da6f5", size = 12738244 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b0/6e/2f8ee5fb65aef962d0bd7e46b815e7b52820687e29c138eaee207a688abc/grpcio-1.69.0-cp310-cp310-linux_armv7l.whl", hash = "sha256:2060ca95a8db295ae828d0fc1c7f38fb26ccd5edf9aa51a0f44251f5da332e97", size = 5190753 }, + { url = "https://files.pythonhosted.org/packages/89/07/028dcda44d40f9488f0a0de79c5ffc80e2c1bc5ed89da9483932e3ea67cf/grpcio-1.69.0-cp310-cp310-macosx_12_0_universal2.whl", hash = "sha256:2e52e107261fd8fa8fa457fe44bfadb904ae869d87c1280bf60f93ecd3e79278", size = 11096752 }, + { url = "https://files.pythonhosted.org/packages/99/a0/c727041b1410605ba38b585b6b52c1a289d7fcd70a41bccbc2c58fc643b2/grpcio-1.69.0-cp310-cp310-manylinux_2_17_aarch64.whl", hash = "sha256:316463c0832d5fcdb5e35ff2826d9aa3f26758d29cdfb59a368c1d6c39615a11", size = 5705442 }, + { url = "https://files.pythonhosted.org/packages/7a/2f/1c53f5d127ff882443b19c757d087da1908f41c58c4b098e8eaf6b2bb70a/grpcio-1.69.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", 
hash = "sha256:26c9a9c4ac917efab4704b18eed9082ed3b6ad19595f047e8173b5182fec0d5e", size = 6333796 }, + { url = "https://files.pythonhosted.org/packages/cc/f6/2017da2a1b64e896af710253e5bfbb4188605cdc18bce3930dae5cdbf502/grpcio-1.69.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:90b3646ced2eae3a0599658eeccc5ba7f303bf51b82514c50715bdd2b109e5ec", size = 5954245 }, + { url = "https://files.pythonhosted.org/packages/c1/65/1395bec928e99ba600464fb01b541e7e4cdd462e6db25259d755ef9f8d02/grpcio-1.69.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:3b75aea7c6cb91b341c85e7c1d9db1e09e1dd630b0717f836be94971e015031e", size = 6664854 }, + { url = "https://files.pythonhosted.org/packages/40/57/8b3389cfeb92056c8b44288c9c4ed1d331bcad0215c4eea9ae4629e156d9/grpcio-1.69.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:5cfd14175f9db33d4b74d63de87c64bb0ee29ce475ce3c00c01ad2a3dc2a9e51", size = 6226854 }, + { url = "https://files.pythonhosted.org/packages/cc/61/1f2bbeb7c15544dffc98b3f65c093e746019995e6f1e21dc3655eec3dc23/grpcio-1.69.0-cp310-cp310-win32.whl", hash = "sha256:9031069d36cb949205293cf0e243abd5e64d6c93e01b078c37921493a41b72dc", size = 3662734 }, + { url = "https://files.pythonhosted.org/packages/ef/ba/bf1a6d9f5c17d2da849793d72039776c56c98c889c9527f6721b6ee57e6e/grpcio-1.69.0-cp310-cp310-win_amd64.whl", hash = "sha256:cc89b6c29f3dccbe12d7a3b3f1b3999db4882ae076c1c1f6df231d55dbd767a5", size = 4410306 }, + { url = "https://files.pythonhosted.org/packages/8d/cd/ca256aeef64047881586331347cd5a68a4574ba1a236e293cd8eba34e355/grpcio-1.69.0-cp311-cp311-linux_armv7l.whl", hash = "sha256:8de1b192c29b8ce45ee26a700044717bcbbd21c697fa1124d440548964328561", size = 5198734 }, + { url = "https://files.pythonhosted.org/packages/37/3f/10c1e5e0150bf59aa08ea6aebf38f87622f95f7f33f98954b43d1b2a3200/grpcio-1.69.0-cp311-cp311-macosx_10_14_universal2.whl", hash = "sha256:7e76accf38808f5c5c752b0ab3fd919eb14ff8fafb8db520ad1cc12afff74de6", size = 11135285 }, + { url = "https://files.pythonhosted.org/packages/08/61/61cd116a572203a740684fcba3fef37a3524f1cf032b6568e1e639e59db0/grpcio-1.69.0-cp311-cp311-manylinux_2_17_aarch64.whl", hash = "sha256:d5658c3c2660417d82db51e168b277e0ff036d0b0f859fa7576c0ffd2aec1442", size = 5699468 }, + { url = "https://files.pythonhosted.org/packages/01/f1/a841662e8e2465ba171c973b77d18fa7438ced535519b3c53617b7e6e25c/grpcio-1.69.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5494d0e52bf77a2f7eb17c6da662886ca0a731e56c1c85b93505bece8dc6cf4c", size = 6332337 }, + { url = "https://files.pythonhosted.org/packages/62/b1/c30e932e02c2e0bfdb8df46fe3b0c47f518fb04158ebdc0eb96cc97d642f/grpcio-1.69.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4ed866f9edb574fd9be71bf64c954ce1b88fc93b2a4cbf94af221e9426eb14d6", size = 5949844 }, + { url = "https://files.pythonhosted.org/packages/5e/cb/55327d43b6286100ffae7d1791be6178d13c917382f3e9f43f82e8b393cf/grpcio-1.69.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:c5ba38aeac7a2fe353615c6b4213d1fbb3a3c34f86b4aaa8be08baaaee8cc56d", size = 6661828 }, + { url = "https://files.pythonhosted.org/packages/6f/e4/120d72ae982d51cb9cabcd9672f8a1c6d62011b493a4d049d2abdf564db0/grpcio-1.69.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:f79e05f5bbf551c4057c227d1b041ace0e78462ac8128e2ad39ec58a382536d2", size = 6226026 }, + { url = "https://files.pythonhosted.org/packages/96/e8/2cc15f11db506d7b1778f0587fa7bdd781602b05b3c4d75b7ca13de33d62/grpcio-1.69.0-cp311-cp311-win32.whl", hash = 
"sha256:bf1f8be0da3fcdb2c1e9f374f3c2d043d606d69f425cd685110dd6d0d2d61258", size = 3662653 }, + { url = "https://files.pythonhosted.org/packages/42/78/3c5216829a48237fcb71a077f891328a435e980d9757a9ebc49114d88768/grpcio-1.69.0-cp311-cp311-win_amd64.whl", hash = "sha256:fb9302afc3a0e4ba0b225cd651ef8e478bf0070cf11a529175caecd5ea2474e7", size = 4412824 }, + { url = "https://files.pythonhosted.org/packages/61/1d/8f28f147d7f3f5d6b6082f14e1e0f40d58e50bc2bd30d2377c730c57a286/grpcio-1.69.0-cp312-cp312-linux_armv7l.whl", hash = "sha256:fc18a4de8c33491ad6f70022af5c460b39611e39578a4d84de0fe92f12d5d47b", size = 5161414 }, + { url = "https://files.pythonhosted.org/packages/35/4b/9ab8ea65e515e1844feced1ef9e7a5d8359c48d986c93f3d2a2006fbdb63/grpcio-1.69.0-cp312-cp312-macosx_10_14_universal2.whl", hash = "sha256:0f0270bd9ffbff6961fe1da487bdcd594407ad390cc7960e738725d4807b18c4", size = 11108909 }, + { url = "https://files.pythonhosted.org/packages/99/68/1856fde2b3c3162bdfb9845978608deef3606e6907fdc2c87443fce6ecd0/grpcio-1.69.0-cp312-cp312-manylinux_2_17_aarch64.whl", hash = "sha256:dc48f99cc05e0698e689b51a05933253c69a8c8559a47f605cff83801b03af0e", size = 5658302 }, + { url = "https://files.pythonhosted.org/packages/3e/21/3fa78d38dc5080d0d677103fad3a8cd55091635cc2069a7c06c7a54e6c4d/grpcio-1.69.0-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1e925954b18d41aeb5ae250262116d0970893b38232689c4240024e4333ac084", size = 6306201 }, + { url = "https://files.pythonhosted.org/packages/f3/cb/5c47b82fd1baf43dba973ae399095d51aaf0085ab0439838b4cbb1e87e3c/grpcio-1.69.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:87d222569273720366f68a99cb62e6194681eb763ee1d3b1005840678d4884f9", size = 5919649 }, + { url = "https://files.pythonhosted.org/packages/c6/67/59d1a56a0f9508a29ea03e1ce800bdfacc1f32b4f6b15274b2e057bf8758/grpcio-1.69.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:b62b0f41e6e01a3e5082000b612064c87c93a49b05f7602fe1b7aa9fd5171a1d", size = 6648974 }, + { url = "https://files.pythonhosted.org/packages/f8/fe/ca70c14d98c6400095f19a0f4df8273d09c2106189751b564b26019f1dbe/grpcio-1.69.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:db6f9fd2578dbe37db4b2994c94a1d9c93552ed77dca80e1657bb8a05b898b55", size = 6215144 }, + { url = "https://files.pythonhosted.org/packages/b3/94/b2b0a9fd487fc8262e20e6dd0ec90d9fa462c82a43b4855285620f6e9d01/grpcio-1.69.0-cp312-cp312-win32.whl", hash = "sha256:b192b81076073ed46f4b4dd612b8897d9a1e39d4eabd822e5da7b38497ed77e1", size = 3644552 }, + { url = "https://files.pythonhosted.org/packages/93/99/81aec9f85412e3255a591ae2ccb799238e074be774e5f741abae08a23418/grpcio-1.69.0-cp312-cp312-win_amd64.whl", hash = "sha256:1227ff7836f7b3a4ab04e5754f1d001fa52a730685d3dc894ed8bc262cc96c01", size = 4399532 }, + { url = "https://files.pythonhosted.org/packages/54/47/3ff4501365f56b7cc16617695dbd4fd838c5e362bc7fa9fee09d592f7d78/grpcio-1.69.0-cp313-cp313-linux_armv7l.whl", hash = "sha256:a78a06911d4081a24a1761d16215a08e9b6d4d29cdbb7e427e6c7e17b06bcc5d", size = 5162928 }, + { url = "https://files.pythonhosted.org/packages/c0/63/437174c5fa951052c9ecc5f373f62af6f3baf25f3f5ef35cbf561806b371/grpcio-1.69.0-cp313-cp313-macosx_10_14_universal2.whl", hash = "sha256:dc5a351927d605b2721cbb46158e431dd49ce66ffbacb03e709dc07a491dde35", size = 11103027 }, + { url = "https://files.pythonhosted.org/packages/53/df/53566a6fdc26b6d1f0585896e1cc4825961039bca5a6a314ff29d79b5d5b/grpcio-1.69.0-cp313-cp313-manylinux_2_17_aarch64.whl", hash = 
"sha256:3629d8a8185f5139869a6a17865d03113a260e311e78fbe313f1a71603617589", size = 5659277 }, + { url = "https://files.pythonhosted.org/packages/e6/4c/b8a0c4f71498b6f9be5ca6d290d576cf2af9d95fd9827c47364f023969ad/grpcio-1.69.0-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c9a281878feeb9ae26db0622a19add03922a028d4db684658f16d546601a4870", size = 6305255 }, + { url = "https://files.pythonhosted.org/packages/ef/55/d9aa05eb3dfcf6aa946aaf986740ec07fc5189f20e2cbeb8c5d278ffd00f/grpcio-1.69.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8cc614e895177ab7e4b70f154d1a7c97e152577ea101d76026d132b7aaba003b", size = 5920240 }, + { url = "https://files.pythonhosted.org/packages/ea/eb/774b27c51e3e386dfe6c491a710f6f87ffdb20d88ec6c3581e047d9354a2/grpcio-1.69.0-cp313-cp313-musllinux_1_1_i686.whl", hash = "sha256:1ee76cd7e2e49cf9264f6812d8c9ac1b85dda0eaea063af07292400f9191750e", size = 6652974 }, + { url = "https://files.pythonhosted.org/packages/59/98/96de14e6e7d89123813d58c246d9b0f1fbd24f9277f5295264e60861d9d6/grpcio-1.69.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:0470fa911c503af59ec8bc4c82b371ee4303ececbbdc055f55ce48e38b20fd67", size = 6215757 }, + { url = "https://files.pythonhosted.org/packages/7d/5b/ce922e0785910b10756fabc51fd294260384a44bea41651dadc4e47ddc82/grpcio-1.69.0-cp313-cp313-win32.whl", hash = "sha256:b650f34aceac8b2d08a4c8d7dc3e8a593f4d9e26d86751ebf74ebf5107d927de", size = 3642488 }, + { url = "https://files.pythonhosted.org/packages/5d/04/11329e6ca1ceeb276df2d9c316b5e170835a687a4d0f778dba8294657e36/grpcio-1.69.0-cp313-cp313-win_amd64.whl", hash = "sha256:028337786f11fecb5d7b7fa660475a06aabf7e5e52b5ac2df47414878c0ce7ea", size = 4399968 }, + { url = "https://files.pythonhosted.org/packages/c6/e6/9c6448a9f2b192b4dab8ecba6a99d34aebfb3398da9f407eb8f5a14181d4/grpcio-1.69.0-cp39-cp39-linux_armv7l.whl", hash = "sha256:dd034d68a2905464c49479b0c209c773737a4245d616234c79c975c7c90eca03", size = 5190897 }, + { url = "https://files.pythonhosted.org/packages/4d/ce/fb54596867c813756c70266cb433e37619324c0f18ad917c2bbeeb6b5b21/grpcio-1.69.0-cp39-cp39-macosx_10_14_universal2.whl", hash = "sha256:01f834732c22a130bdf3dc154d1053bdbc887eb3ccb7f3e6285cfbfc33d9d5cc", size = 11124006 }, + { url = "https://files.pythonhosted.org/packages/af/c1/c314372f3b6605b3ef8c03bcecd3deef92a3a5817b26ca4c5a6d519bdf04/grpcio-1.69.0-cp39-cp39-manylinux_2_17_aarch64.whl", hash = "sha256:a7f4ed0dcf202a70fe661329f8874bc3775c14bb3911d020d07c82c766ce0eb1", size = 5703399 }, + { url = "https://files.pythonhosted.org/packages/c6/e4/d4a051b2e3752590e5a8fdfd3270045d8c0e49f0566fd9dacf30e3de1bc3/grpcio-1.69.0-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cd7ea241b10bc5f0bb0f82c0d7896822b7ed122b3ab35c9851b440c1ccf81588", size = 6333585 }, + { url = "https://files.pythonhosted.org/packages/9b/dd/3b0057863f27325ad9371e966684d2e287cdb4ee5861b4cd4fbbb1c7bf91/grpcio-1.69.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1f03dc9b4da4c0dc8a1db7a5420f575251d7319b7a839004d8916257ddbe4816", size = 5953919 }, + { url = "https://files.pythonhosted.org/packages/98/8a/5f782d5493e4c67c64389996d800a666987dc27ab5fbe093864e9fd66982/grpcio-1.69.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:ca71d73a270dff052fe4edf74fef142d6ddd1f84175d9ac4a14b7280572ac519", size = 6666357 }, + { url = "https://files.pythonhosted.org/packages/de/a4/d1a03913df292ba7322086c68301c66e14b3f8f9532e4c3854846442f0a0/grpcio-1.69.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash 
= "sha256:5ccbed100dc43704e94ccff9e07680b540d64e4cc89213ab2832b51b4f68a520", size = 6226574 }, + { url = "https://files.pythonhosted.org/packages/8d/fb/e104bc4296ee4991d803dd39b6c72ed247ba0e18a4e56fd895947aae1249/grpcio-1.69.0-cp39-cp39-win32.whl", hash = "sha256:1514341def9c6ec4b7f0b9628be95f620f9d4b99331b7ef0a1845fd33d9b579c", size = 3663452 }, + { url = "https://files.pythonhosted.org/packages/ad/39/12d48bccd429699a3c909173b395900eb64e4c6bc5eed34d7088e438bc4d/grpcio-1.69.0-cp39-cp39-win_amd64.whl", hash = "sha256:c1fea55d26d647346acb0069b08dca70984101f2dc95066e003019207212e303", size = 4411151 }, +] + +[[package]] +name = "grpcio-reflection" +version = "1.69.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "grpcio" }, + { name = "protobuf" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/bd/88/064538adae2e526f4626ad57f53a99fca5ae5013e627214093ef1d47cc35/grpcio_reflection-1.69.0.tar.gz", hash = "sha256:3474aaedfb10d8da5503135a2fb9fbb4e1f72044ad7ee2d0bf22090934e80908", size = 18824 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/4f/2a/15455e320839c7ded83cf347b7256c38b8a54352b46b402a3d20b05abd65/grpcio_reflection-1.69.0-py3-none-any.whl", hash = "sha256:6dc356cc24a324295550991b19cef1804c8d1fd9d85ea313809af4b03459d5b2", size = 22690 }, +] + +[[package]] +name = "grpcio-status" +version = "1.69.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "googleapis-common-protos" }, + { name = "grpcio" }, + { name = "protobuf" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/02/35/52dc0d8300f879dbf9cdc95764cee9f56d5a212998cfa1a8871b262df2a4/grpcio_status-1.69.0.tar.gz", hash = "sha256:595ef84e5178d6281caa732ccf68ff83259241608d26b0e9c40a5e66eee2a2d2", size = 13662 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f6/e2/346a766a4232f74f45f8bc70e636fc3a6677e6bc3893382187829085f12e/grpcio_status-1.69.0-py3-none-any.whl", hash = "sha256:d6b2a3c9562c03a817c628d7ba9a925e209c228762d6d7677ae5c9401a542853", size = 14428 }, +] + +[[package]] +name = "grpcio-tools" +version = "1.69.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "grpcio" }, + { name = "protobuf" }, + { name = "setuptools" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/64/ec/1c25136ca1697eaa09a02effe3e74959fd9fb6aba9960d7340dd6341c5ce/grpcio_tools-1.69.0.tar.gz", hash = "sha256:3e1a98f4d9decb84979e1ddd3deb09c0a33a84b6e3c0776d5bde4097e3ab66dd", size = 5323319 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/00/90/7df7326552fec627adcf3880cf13e9a5b23c090bbcedba367f64fa2bb54b/grpcio_tools-1.69.0-cp310-cp310-linux_armv7l.whl", hash = "sha256:8c210630faa581c3bd08953dac4ad21a7f49862f3b92d69686e9b436d2f1265d", size = 2388795 }, + { url = "https://files.pythonhosted.org/packages/e2/03/6ccaa58b3ca1734d0868a389148e22ac15248a9be4c223805339f7904e31/grpcio_tools-1.69.0-cp310-cp310-macosx_12_0_universal2.whl", hash = "sha256:09b66ea279fcdaebae4ec34b1baf7577af3b14322738aa980c1c33cfea71f7d7", size = 5703156 }, + { url = "https://files.pythonhosted.org/packages/c9/f6/162b456684d2444b43e45ace4e889087301e5890bbfd16ee6b2aedf36219/grpcio_tools-1.69.0-cp310-cp310-manylinux_2_17_aarch64.whl", hash = "sha256:be94a4bfa56d356aae242cc54072c9ccc2704b659eaae2fd599a94afebf791ce", size = 2350725 }, + { url = 
"https://files.pythonhosted.org/packages/db/3a/2e83fea8c90b9902d68964491d014d688177a6ad0303dbbe6c2c16f25da6/grpcio_tools-1.69.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:28778debad73a8c8e0a0e07e6a2f76eecce43adbc205d17dd244d2d58bb0f0aa", size = 2727230 }, + { url = "https://files.pythonhosted.org/packages/63/06/be27b8f1811ff4cc556bdec64a9004755a929df035dc606466a75c9ac0fa/grpcio_tools-1.69.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:449308d93e4c97ae3a4503510c6d64978748ff5e21429c85da14fdc783c0f498", size = 2472752 }, + { url = "https://files.pythonhosted.org/packages/a3/43/f94578afa1535287b7b0ba39eeb23b2b8304a2a5b8e325ed7079d2ad9cba/grpcio_tools-1.69.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:b9343651e73bc6e0df6bb518c2638bf9cc2194b50d060cdbcf1b2121cd4e4ae3", size = 3344074 }, + { url = "https://files.pythonhosted.org/packages/13/d1/5f9030cbb6195f3bb182e740f349cdaa71d9c38c1b2572f401270709d7d2/grpcio_tools-1.69.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:2f08b063612553e726e328aef3a27adfaea8d92712b229012afc54d59da88a02", size = 2953778 }, + { url = "https://files.pythonhosted.org/packages/0c/cb/4812660e150d197de81296fa04ed6ad012d1aeac23bbe21be5f51493f455/grpcio_tools-1.69.0-cp310-cp310-win32.whl", hash = "sha256:599ffd39525e7bbb6412a63e56a2e6c1af8f3493fe4305260efd4a11d064cce0", size = 957556 }, + { url = "https://files.pythonhosted.org/packages/4e/c7/c7d5f5418909764e63208b9f76812db3287ece4f79500e815178194e1db9/grpcio_tools-1.69.0-cp310-cp310-win_amd64.whl", hash = "sha256:02f92e3c2bae67ece818787f8d3d89df0fa1e5e6bbb7c1493824fd5dfad886dd", size = 1114783 }, + { url = "https://files.pythonhosted.org/packages/7e/f4/575f536bada8d8f5f8943c317ae28faafe7b4aaf95ef84a599f4f3e67db3/grpcio_tools-1.69.0-cp311-cp311-linux_armv7l.whl", hash = "sha256:c18df5d1c8e163a29863583ec51237d08d7059ef8d4f7661ee6d6363d3e38fe3", size = 2388772 }, + { url = "https://files.pythonhosted.org/packages/87/94/1157342b046f51c4d076f21ef76da6d89323929b7e870389204fd49e3f09/grpcio_tools-1.69.0-cp311-cp311-macosx_10_14_universal2.whl", hash = "sha256:37876ae49235ef2e61e5059faf45dc5e7142ca54ae61aec378bb9483e0cd7e95", size = 5726348 }, + { url = "https://files.pythonhosted.org/packages/36/5c/cfd9160ef1867e025844b2695d436bb953c2d5f9c20eaaa7da6fd739ab0c/grpcio_tools-1.69.0-cp311-cp311-manylinux_2_17_aarch64.whl", hash = "sha256:33120920e29959eaa37a1268c6a22af243d086b1a5e5222b4203e29560ece9ce", size = 2350857 }, + { url = "https://files.pythonhosted.org/packages/61/70/10614b8bc39f06548a0586fdd5d97843da4789965e758fba87726bde8c2f/grpcio_tools-1.69.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:788bb3ecd1b44664d829d319b3c1ebc15c7d7b5e7d1f22706ab57d6acd2c6301", size = 2727157 }, + { url = "https://files.pythonhosted.org/packages/37/fb/33faedb3e991dceb7a2bf802d3875bff7d6a6b6a80d314197adc73739cae/grpcio_tools-1.69.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f453b11a112e3774c8957ec2570669f3da1f7fbc8ee242482c38981496e88da2", size = 2472882 }, + { url = "https://files.pythonhosted.org/packages/41/f7/abddc158919a982f6b8e61d4a5c72569b2963304c162c3ca53c6c14d23ee/grpcio_tools-1.69.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:7e5c5dc2b656755cb58b11a7e87b65258a4a8eaff01b6c30ffcb230dd447c03d", size = 3343987 }, + { url = "https://files.pythonhosted.org/packages/ba/46/e7219456aefe29137728246a67199fcbfdaa99ede93d2045a6406f0e4c0b/grpcio_tools-1.69.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = 
"sha256:8eabf0a7a98c14322bc74f9910c96f98feebe311e085624b2d022924d4f652ca", size = 2953659 }, + { url = "https://files.pythonhosted.org/packages/74/be/262c5d2b681930f8c58012500741fe06cb40a770c9d395650efe9042467f/grpcio_tools-1.69.0-cp311-cp311-win32.whl", hash = "sha256:ad567bea43d018c2215e1db10316eda94ca19229a834a3221c15d132d24c1b8a", size = 957447 }, + { url = "https://files.pythonhosted.org/packages/8e/55/68153acca126dced35f888e708a65169df8fa8a4d5f0e78166a395e3fa9c/grpcio_tools-1.69.0-cp311-cp311-win_amd64.whl", hash = "sha256:3d64e801586dbea3530f245d48b9ed031738cc3eb099d5ce2fdb1b3dc2e1fb20", size = 1114753 }, + { url = "https://files.pythonhosted.org/packages/5b/f6/9cd1aa47556664564b873cd187d8dec978ff2f4a539d8c6d5d2f418d3d36/grpcio_tools-1.69.0-cp312-cp312-linux_armv7l.whl", hash = "sha256:8ef8efe8beac4cc1e30d41893e4096ca2601da61001897bd17441645de2d4d3c", size = 2388440 }, + { url = "https://files.pythonhosted.org/packages/62/37/0bcd8431e44b38f648f70368dd60542d10ffaffa109563349ee635013e10/grpcio_tools-1.69.0-cp312-cp312-macosx_10_14_universal2.whl", hash = "sha256:a00e87a0c5a294028115a098819899b08dd18449df5b2aac4a2b87ba865e8681", size = 5726135 }, + { url = "https://files.pythonhosted.org/packages/8b/f5/2ec994bbf522a231ce54c41a2d3621e77bece1240aafe31f12804052af0f/grpcio_tools-1.69.0-cp312-cp312-manylinux_2_17_aarch64.whl", hash = "sha256:7722700346d5b223159532e046e51f2ff743ed4342e5fe3e0457120a4199015e", size = 2350247 }, + { url = "https://files.pythonhosted.org/packages/a9/29/9ebf54315a499a766e4c3bd53124267491162e9049c2d9ed45f43222b98f/grpcio_tools-1.69.0-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a934116fdf202cb675246056ee54645c743e2240632f86a37e52f91a405c7143", size = 2727994 }, + { url = "https://files.pythonhosted.org/packages/f0/2a/1a031018660b5d95c1a4c587a0babd0d28f0aa0c9a40dbca330567049a3f/grpcio_tools-1.69.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7e6a6d44359ca836acfbc58103daf94b3bb8ac919d659bb348dcd7fbecedc293", size = 2472625 }, + { url = "https://files.pythonhosted.org/packages/74/bf/76d24078e1c76976a10760c3193b6c62685a7aed64b1cb0d8242afa16f1d/grpcio_tools-1.69.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:e27662c0597fd1ab5399a583d358b5203edcb6fc2b29d6245099dfacd51a6ddc", size = 3344290 }, + { url = "https://files.pythonhosted.org/packages/f1/f7/4ab645e4955ca1e5240b0bbd557662cec4838f0e21e072ff40f4e191b48d/grpcio_tools-1.69.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:7bbb2b2fb81d95bcdd1d8331defb5f5dc256dbe423bb98b682cf129cdd432366", size = 2953592 }, + { url = "https://files.pythonhosted.org/packages/8f/32/57e67b126f209f289fc32009309d155b8dbe9ac760c32733746e4dda7b51/grpcio_tools-1.69.0-cp312-cp312-win32.whl", hash = "sha256:e11accd10cf4af5031ac86c45f1a13fb08f55e005cea070917c12e78fe6d2aa2", size = 957042 }, + { url = "https://files.pythonhosted.org/packages/19/64/7bfcb4e50a0ce87690c24696cd666f528e672119966abead09ae65a2e1da/grpcio_tools-1.69.0-cp312-cp312-win_amd64.whl", hash = "sha256:6df4c6ac109af338a8ccde29d184e0b0bdab13d78490cb360ff9b192a1aec7e2", size = 1114248 }, + { url = "https://files.pythonhosted.org/packages/0c/ef/a9867f612e3aa5e69d299e47a72ea8dafa476b1f099462c9a1223cd6a83c/grpcio_tools-1.69.0-cp313-cp313-linux_armv7l.whl", hash = "sha256:8c320c4faa1431f2e1252ef2325a970ac23b2fd04ffef6c12f96dd4552c3445c", size = 2388281 }, + { url = 
"https://files.pythonhosted.org/packages/4b/53/b2752d8ec338778e48d76845d605a0f8bca9e43a5f09428e5ed1a76e4e1d/grpcio_tools-1.69.0-cp313-cp313-macosx_10_14_universal2.whl", hash = "sha256:5f1224596ad74dd14444b20c37122b361c5d203b67e14e018b995f3c5d76eede", size = 5725856 }, + { url = "https://files.pythonhosted.org/packages/83/dd/195d3639634c0c1d1e48b6693c074d66a64f16c748df2f40bcee74aa04e2/grpcio_tools-1.69.0-cp313-cp313-manylinux_2_17_aarch64.whl", hash = "sha256:965a0cf656a113bc32d15ac92ca51ed702a75d5370ae0afbdd36f818533a708a", size = 2350180 }, + { url = "https://files.pythonhosted.org/packages/8c/18/c412884fa0e888d8a271f3e31d23e3765cde0efe2404653ab67971c411c2/grpcio_tools-1.69.0-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:978835768c11a7f28778b3b7c40f839d8a57f765c315e80c4246c23900d56149", size = 2726724 }, + { url = "https://files.pythonhosted.org/packages/be/c7/dfb59b7e25d760bfdd93f0aef7dd0e2a37f8437ac3017b8b526c68764e2f/grpcio_tools-1.69.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:094c7cec9bd271a32dfb7c620d4a558c63fcb0122fd1651b9ed73d6afd4ae6fe", size = 2472127 }, + { url = "https://files.pythonhosted.org/packages/f2/b6/af4edf0a181fd7b148a83d491f5677d7d1c9f86f03282f8f0209d9dfb793/grpcio_tools-1.69.0-cp313-cp313-musllinux_1_1_i686.whl", hash = "sha256:b51bf4981b3d7e47c2569efadff08284787124eb3dea0f63f491d39703231d3c", size = 3344015 }, + { url = "https://files.pythonhosted.org/packages/0a/9f/4c2b5ae642f7d3df73c16df6c7d53e9443cb0e49e1dcf2c8d1a49058e0b5/grpcio_tools-1.69.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:ea7aaf0dc1a828e2133357a9e9553fd1bb4e766890d52a506cc132e40632acdc", size = 2952942 }, + { url = "https://files.pythonhosted.org/packages/97/8e/6b707871db5927a17ad7475c070916bff4f32463a51552b424779236ab65/grpcio_tools-1.69.0-cp313-cp313-win32.whl", hash = "sha256:4320f11b79d3a148cc23bad1b81719ce1197808dc2406caa8a8ba0a5cfb0260d", size = 956242 }, + { url = "https://files.pythonhosted.org/packages/27/e2/b419a02b50240143605f77cd50cb07f724caf0fd35a01540a4f044ae9f21/grpcio_tools-1.69.0-cp313-cp313-win_amd64.whl", hash = "sha256:e9bae733654e0eb8ca83aa1d0d6b6c2f4a3525ce70d5ffc07df68d28f6520137", size = 1113616 }, + { url = "https://files.pythonhosted.org/packages/43/22/b1154a8c019698aee265bfb7a3deba46925c4a7b0ce68a75e80dbb649514/grpcio_tools-1.69.0-cp39-cp39-linux_armv7l.whl", hash = "sha256:01121b6570932bfb7d8b2ce2c0055dba902a415477079e249d85fe4494f72db2", size = 2389129 }, + { url = "https://files.pythonhosted.org/packages/9b/11/35856fc16817feeba962e502d17d10543a5e1229b4b57a1761b8918cd2e1/grpcio_tools-1.69.0-cp39-cp39-macosx_10_14_universal2.whl", hash = "sha256:9861e282aa7b3656c67e84d0c25ee0e9210b955e0ec2c64699b8f80483f90853", size = 5726369 }, + { url = "https://files.pythonhosted.org/packages/a7/94/cfba957813713a9b42a2ea561e7faf24ff995c00acc9866576a62591abf4/grpcio_tools-1.69.0-cp39-cp39-manylinux_2_17_aarch64.whl", hash = "sha256:00adf628259e8c314a02ca1580d6a8b14eeef266f5dd5e15bf92c1efbbcf63c0", size = 2350830 }, + { url = "https://files.pythonhosted.org/packages/06/c0/c66345bee75406318536d2a3cbbb254290de1228c42ba4a34cb68d31c066/grpcio_tools-1.69.0-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:371d03ac31b76ba77d44bdba6a8560f344c6d1ed558babab64760da085e392b7", size = 2727551 }, + { url = "https://files.pythonhosted.org/packages/cd/c0/0dec90e2af8f59f20d3042ed219ab55f37f49859de53c425ad339793ff09/grpcio_tools-1.69.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:b6730414c01fe9027ba12538fd6e192e1bea94d5b819a1e03d15e89aab1b4573", size = 2472947 }, + { url = "https://files.pythonhosted.org/packages/e0/e5/fba8010868909ab6301068b4c8615a372420d110e68cac7aae8085c52541/grpcio_tools-1.69.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:5562a1b1b67deffd04fbb1bcf8f1634580538ce35895b77cdfaec1fb115efd95", size = 3344136 }, + { url = "https://files.pythonhosted.org/packages/9f/3b/c03c5919bc207c5e10723f1bdd0268475e756fa39d6f6c184aa3896b9ab1/grpcio_tools-1.69.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:9f8996efddc867134f22bbf8a368b1b2a018d0a9b0ac9d3185cfd81d1abd8066", size = 2953685 }, + { url = "https://files.pythonhosted.org/packages/d4/d7/9e00a22a0a64b9207a3ddf41aa50dd0d20df209056264098885a6a0d5c18/grpcio_tools-1.69.0-cp39-cp39-win32.whl", hash = "sha256:8f5959d8a453d613e7137831f6885b43b5c378ec317943b4ec599046baa97bfc", size = 957547 }, + { url = "https://files.pythonhosted.org/packages/64/51/f6b198152399d17247d962340947728fb1b06da6bc0c0a542446b2ffee49/grpcio_tools-1.69.0-cp39-cp39-win_amd64.whl", hash = "sha256:5d47abf7e0662dd5dbb9cc252c3616e5fbc5f71d34e3f6332cd24bcdf2940abd", size = 1114931 }, +] + +[[package]] +name = "hf-transfer" +version = "0.1.9" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/1a/eb/8fc64f40388c29ce8ce3b2b180a089d4d6b25b1d0d232d016704cb852104/hf_transfer-0.1.9.tar.gz", hash = "sha256:035572865dab29d17e783fbf1e84cf1cb24f3fcf8f1b17db1cfc7fdf139f02bf", size = 25201 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a4/78/0dce00208f585fae675f40033ef9a30dedfa83665d5ac79f16beb4a0a6c2/hf_transfer-0.1.9-cp313-cp313t-macosx_10_12_x86_64.whl", hash = "sha256:6e94e8822da79573c9b6ae4d6b2f847c59a7a06c5327d7db20751b68538dc4f6", size = 1386084 }, + { url = "https://files.pythonhosted.org/packages/ea/2e/3d60b1a9e9f29a2152aa66c823bf5e399ae7be3fef310ff0de86779c5d2d/hf_transfer-0.1.9-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:3ebc4ab9023414880c8b1d3c38174d1c9989eb5022d37e814fa91a3060123eb0", size = 1343558 }, + { url = "https://files.pythonhosted.org/packages/fb/38/130a5ac3747f104033591bcac1c961cb1faadfdc91704f59b09c0b465ff2/hf_transfer-0.1.9-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8674026f21ed369aa2a0a4b46000aca850fc44cd2b54af33a172ce5325b4fc82", size = 3726676 }, + { url = "https://files.pythonhosted.org/packages/15/a1/f4e27c5ad17aac616ae0849e2aede5aae31db8267a948c6b3eeb9fd96446/hf_transfer-0.1.9-cp313-cp313t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3a736dfbb2c84f5a2c975478ad200c0c8bfcb58a25a35db402678fb87ce17fa4", size = 3062920 }, + { url = "https://files.pythonhosted.org/packages/8d/0d/727abdfba39bc3f1132cfa4c970588c2c0bb0d82fe2d645cc10f4e2f8e0b/hf_transfer-0.1.9-cp313-cp313t-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:504b8427fd785dd8546d53b9fafe6e436bd7a3adf76b9dce556507650a7b4567", size = 3578681 }, + { url = "https://files.pythonhosted.org/packages/50/d0/2b213eb1ea8b1252ccaf1a6c804d0aba03fea38aae4124df6a3acb70511a/hf_transfer-0.1.9-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2c7fc1b85f4d0f76e452765d7648c9f4bfd0aedb9ced2ae1ebfece2d8cfaf8e2", size = 3398837 }, + { url = "https://files.pythonhosted.org/packages/8c/8a/79dbce9006e0bd6b74516f97451a7b7c64dbbb426df15d901dd438cfeee3/hf_transfer-0.1.9-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0d991376f0eac70a60f0cbc95602aa708a6f7c8617f28b4945c1431d67b8e3c8", 
size = 3546986 }, + { url = "https://files.pythonhosted.org/packages/a9/f7/9ac239b6ee6fe0bad130325d987a93ea58c4118e50479f0786f1733b37e8/hf_transfer-0.1.9-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:e6ac4eddcd99575ed3735ed911ddf9d1697e2bd13aa3f0ad7e3904dd4863842e", size = 4071715 }, + { url = "https://files.pythonhosted.org/packages/d8/a3/0ed697279f5eeb7a40f279bd783cf50e6d0b91f24120dcf66ef2cf8822b4/hf_transfer-0.1.9-cp313-cp313t-musllinux_1_2_armv7l.whl", hash = "sha256:57fd9880da1ee0f47250f735f791fab788f0aa1ee36afc49f761349869c8b4d9", size = 3388081 }, + { url = "https://files.pythonhosted.org/packages/dc/eb/47e477bdf1d784f31c7540db6cc8c354b777e51a186897a7abda34517f36/hf_transfer-0.1.9-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:5d561f0520f493c66b016d99ceabe69c23289aa90be38dd802d2aef279f15751", size = 3658654 }, + { url = "https://files.pythonhosted.org/packages/45/07/6661e43fbee09594a8a5e9bb778107d95fe38dac4c653982afe03d32bd4d/hf_transfer-0.1.9-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:a5b366d34cd449fe9b20ef25941e6eef0460a2f74e7389f02e673e1f88ebd538", size = 3690551 }, + { url = "https://files.pythonhosted.org/packages/81/f5/461d2e5f307e5048289b1168d5c642ae3bb2504e88dff1a38b92ed990a21/hf_transfer-0.1.9-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:e66acf91df4a8b72f60223059df3003062a5ae111757187ed1a06750a30e911b", size = 1393046 }, + { url = "https://files.pythonhosted.org/packages/41/ba/8d9fd9f1083525edfcb389c93738c802f3559cb749324090d7109c8bf4c2/hf_transfer-0.1.9-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:8669dbcc7a3e2e8d61d42cd24da9c50d57770bd74b445c65123291ca842a7e7a", size = 1348126 }, + { url = "https://files.pythonhosted.org/packages/8e/a2/cd7885bc9959421065a6fae0fe67b6c55becdeda4e69b873e52976f9a9f0/hf_transfer-0.1.9-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8fd0167c4407a3bc4cdd0307e65ada2294ec04f1813d8a69a5243e379b22e9d8", size = 3728604 }, + { url = "https://files.pythonhosted.org/packages/f6/2e/a072cf196edfeda3310c9a5ade0a0fdd785e6154b3ce24fc738c818da2a7/hf_transfer-0.1.9-cp38-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ee8b10afedcb75f71091bcc197c526a6ebf5c58bbbadb34fdeee6160f55f619f", size = 3064995 }, + { url = "https://files.pythonhosted.org/packages/c2/84/aec9ef4c0fab93c1ea2b1badff38c78b4b2f86f0555b26d2051dbc920cde/hf_transfer-0.1.9-cp38-abi3-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5828057e313de59300dd1abb489444bc452efe3f479d3c55b31a8f680936ba42", size = 3580908 }, + { url = "https://files.pythonhosted.org/packages/29/63/b560d39651a56603d64f1a0212d0472a44cbd965db2fa62b99d99cb981bf/hf_transfer-0.1.9-cp38-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:fc6bd19e1cc177c66bdef15ef8636ad3bde79d5a4f608c158021153b4573509d", size = 3400839 }, + { url = "https://files.pythonhosted.org/packages/d6/d8/f87ea6f42456254b48915970ed98e993110521e9263472840174d32c880d/hf_transfer-0.1.9-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cdca9bfb89e6f8f281890cc61a8aff2d3cecaff7e1a4d275574d96ca70098557", size = 3552664 }, + { url = "https://files.pythonhosted.org/packages/d6/56/1267c39b65fc8f4e2113b36297320f102718bf5799b544a6cbe22013aa1d/hf_transfer-0.1.9-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:89a23f58b7b7effbc047b8ca286f131b17728c99a9f972723323003ffd1bb916", size = 4073732 }, + { url = 
"https://files.pythonhosted.org/packages/82/1a/9c748befbe3decf7cb415e34f8a0c3789a0a9c55910dea73d581e48c0ce5/hf_transfer-0.1.9-cp38-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:dc7fff1345980d6c0ebb92c811d24afa4b98b3e07ed070c8e38cc91fd80478c5", size = 3390096 }, + { url = "https://files.pythonhosted.org/packages/72/85/4c03da147b6b4b7cb12e074d3d44eee28604a387ed0eaf7eaaead5069c57/hf_transfer-0.1.9-cp38-abi3-musllinux_1_2_i686.whl", hash = "sha256:1a6bd16c667ebe89a069ca163060127a794fa3a3525292c900b8c8cc47985b0d", size = 3664743 }, + { url = "https://files.pythonhosted.org/packages/e7/6e/e597b04f753f1b09e6893075d53a82a30c13855cbaa791402695b01e369f/hf_transfer-0.1.9-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:d2fde99d502093ade3ab1b53f80da18480e9902aa960dab7f74fb1b9e5bc5746", size = 3695243 }, + { url = "https://files.pythonhosted.org/packages/09/89/d4e234727a26b2546c8fb70a276cd924260d60135f2165bf8b9ed67bb9a4/hf_transfer-0.1.9-cp38-abi3-win32.whl", hash = "sha256:435cc3cdc8524ce57b074032b8fd76eed70a4224d2091232fa6a8cef8fd6803e", size = 1086605 }, + { url = "https://files.pythonhosted.org/packages/a1/14/f1e15b851d1c2af5b0b1a82bf8eb10bda2da62d98180220ba6fd8879bb5b/hf_transfer-0.1.9-cp38-abi3-win_amd64.whl", hash = "sha256:16f208fc678911c37e11aa7b586bc66a37d02e636208f18b6bc53d29b5df40ad", size = 1160240 }, +] + +[[package]] +name = "huggingface-hub" +version = "0.27.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "filelock" }, + { name = "fsspec" }, + { name = "packaging" }, + { name = "pyyaml" }, + { name = "requests" }, + { name = "tqdm" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/e1/d2/d6976de7542792fc077b498d64af64882b6d8bb40679284ec0bff77d5929/huggingface_hub-0.27.1.tar.gz", hash = "sha256:c004463ca870283909d715d20f066ebd6968c2207dae9393fdffb3c1d4d8f98b", size = 379407 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/6c/3f/50f6b25fafdcfb1c089187a328c95081abf882309afd86f4053951507cd1/huggingface_hub-0.27.1-py3-none-any.whl", hash = "sha256:1c5155ca7d60b60c2e2fc38cbb3ffb7f7c3adf48f824015b219af9061771daec", size = 450658 }, +] + +[[package]] +name = "idna" +version = "3.10" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f1/70/7703c29685631f5a7590aa73f1f1d3fa9a380e654b86af429e0934a32f7d/idna-3.10.tar.gz", hash = "sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9", size = 190490 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3", size = 70442 }, +] + +[[package]] +name = "importlib-metadata" +version = "8.5.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "zipp" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/cd/12/33e59336dca5be0c398a7482335911a33aa0e20776128f038019f1a95f1b/importlib_metadata-8.5.0.tar.gz", hash = "sha256:71522656f0abace1d072b9e5481a48f07c138e00f079c38c8f883823f9c26bd7", size = 55304 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a0/d9/a1e041c5e7caa9a05c925f4bdbdfb7f006d1f74996af53467bc394c97be7/importlib_metadata-8.5.0-py3-none-any.whl", hash = "sha256:45e54197d28b7a7f1559e60b95e7c567032b602131fbd588f1497f47880aa68b", size = 26514 }, +] + +[[package]] +name = "iniconfig" +version = "2.0.0" +source = { registry = 
"https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d7/4b/cbd8e699e64a6f16ca3a8220661b5f83792b3017d0f79807cb8708d33913/iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3", size = 4646 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ef/a6/62565a6e1cf69e10f5727360368e451d4b7f58beeac6173dc9db836a5b46/iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374", size = 5892 }, +] + +[[package]] +name = "interegular" +version = "0.3.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/dc/9d/8b6dde58a028a3962ce17e84d5fe73758df61378e00ef8ac3d85da34b0ff/interegular-0.3.3.tar.gz", hash = "sha256:d9b697b21b34884711399ba0f0376914b81899ce670032486d0d048344a76600", size = 24705 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c4/01/72d6472f80651673716d1deda2a5bbb633e563ecf94f4479da5519d69d25/interegular-0.3.3-py37-none-any.whl", hash = "sha256:b0c07007d48c89d6d19f7204972d369b2a77222722e126b6aa63aa721dc3b19c", size = 23635 }, +] + +[[package]] +name = "jinja2" +version = "3.1.5" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "markupsafe" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/af/92/b3130cbbf5591acf9ade8708c365f3238046ac7cb8ccba6e81abccb0ccff/jinja2-3.1.5.tar.gz", hash = "sha256:8fefff8dc3034e27bb80d67c671eb8a9bc424c0ef4c0826edbff304cceff43bb", size = 244674 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/bd/0f/2ba5fbcd631e3e88689309dbe978c5769e883e4b84ebfe7da30b43275c5a/jinja2-3.1.5-py3-none-any.whl", hash = "sha256:aba0f4dc9ed8013c424088f68a5c226f7d6097ed89b246d7749c2ec4175c6adb", size = 134596 }, +] + +[[package]] +name = "jsonschema" +version = "4.23.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "attrs" }, + { name = "jsonschema-specifications" }, + { name = "referencing" }, + { name = "rpds-py" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/38/2e/03362ee4034a4c917f697890ccd4aec0800ccf9ded7f511971c75451deec/jsonschema-4.23.0.tar.gz", hash = "sha256:d71497fef26351a33265337fa77ffeb82423f3ea21283cd9467bb03999266bc4", size = 325778 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/69/4a/4f9dbeb84e8850557c02365a0eee0649abe5eb1d84af92a25731c6c0f922/jsonschema-4.23.0-py3-none-any.whl", hash = "sha256:fbadb6f8b144a8f8cf9f0b89ba94501d143e50411a1278633f56a7acf7fd5566", size = 88462 }, +] + +[[package]] +name = "jsonschema-specifications" +version = "2024.10.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "referencing" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/10/db/58f950c996c793472e336ff3655b13fbcf1e3b359dcf52dcf3ed3b52c352/jsonschema_specifications-2024.10.1.tar.gz", hash = "sha256:0f38b83639958ce1152d02a7f062902c41c8fd20d558b0c34344292d417ae272", size = 15561 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d1/0f/8910b19ac0670a0f80ce1008e5e751c4a57e14d2c4c13a482aa6079fa9d6/jsonschema_specifications-2024.10.1-py3-none-any.whl", hash = "sha256:a09a0680616357d9a0ecf05c12ad234479f549239d0f5b55f3deea67475da9bf", size = 18459 }, +] + +[[package]] +name = "lark" +version = "1.2.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/af/60/bc7622aefb2aee1c0b4ba23c1446d3e30225c8770b38d7aedbfb65ca9d5a/lark-1.2.2.tar.gz", hash = "sha256:ca807d0162cd16cef15a8feecb862d7319e7a09bdb13aef927968e45040fed80", size = 252132 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2d/00/d90b10b962b4277f5e64a78b6609968859ff86889f5b898c1a778c06ec00/lark-1.2.2-py3-none-any.whl", hash = "sha256:c2276486b02f0f1b90be155f2c8ba4a8e194d42775786db622faccd652d8e80c", size = 111036 }, +] + +[[package]] +name = "loguru" +version = "0.7.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, + { name = "win32-setctime", marker = "sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/3a/05/a1dae3dffd1116099471c643b8924f5aa6524411dc6c63fdae648c4f1aca/loguru-0.7.3.tar.gz", hash = "sha256:19480589e77d47b8d85b2c827ad95d49bf31b0dcde16593892eb51dd18706eb6", size = 63559 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0c/29/0348de65b8cc732daa3e33e67806420b2ae89bdce2b04af740289c5c6c8c/loguru-0.7.3-py3-none-any.whl", hash = "sha256:31a33c10c8e1e10422bfd431aeb5d351c7cf7fa671e3c4df004162264b28220c", size = 61595 }, +] + +[[package]] +name = "markdown-it-py" +version = "3.0.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "mdurl" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/38/71/3b932df36c1a044d397a1f92d1cf91ee0a503d91e470cbd670aa66b07ed0/markdown-it-py-3.0.0.tar.gz", hash = "sha256:e3f60a94fa066dc52ec76661e37c851cb232d92f9886b15cb560aaada2df8feb", size = 74596 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/42/d7/1ec15b46af6af88f19b8e5ffea08fa375d433c998b8a7639e76935c14f1f/markdown_it_py-3.0.0-py3-none-any.whl", hash = "sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1", size = 87528 }, +] + +[[package]] +name = "markupsafe" +version = "3.0.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/b2/97/5d42485e71dfc078108a86d6de8fa46db44a1a9295e89c5d6d4a06e23a62/markupsafe-3.0.2.tar.gz", hash = "sha256:ee55d3edf80167e48ea11a923c7386f4669df67d7994554387f84e7d8b0a2bf0", size = 20537 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/04/90/d08277ce111dd22f77149fd1a5d4653eeb3b3eaacbdfcbae5afb2600eebd/MarkupSafe-3.0.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:7e94c425039cde14257288fd61dcfb01963e658efbc0ff54f5306b06054700f8", size = 14357 }, + { url = "https://files.pythonhosted.org/packages/04/e1/6e2194baeae0bca1fae6629dc0cbbb968d4d941469cbab11a3872edff374/MarkupSafe-3.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9e2d922824181480953426608b81967de705c3cef4d1af983af849d7bd619158", size = 12393 }, + { url = "https://files.pythonhosted.org/packages/1d/69/35fa85a8ece0a437493dc61ce0bb6d459dcba482c34197e3efc829aa357f/MarkupSafe-3.0.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:38a9ef736c01fccdd6600705b09dc574584b89bea478200c5fbf112a6b0d5579", size = 21732 }, + { url = "https://files.pythonhosted.org/packages/22/35/137da042dfb4720b638d2937c38a9c2df83fe32d20e8c8f3185dbfef05f7/MarkupSafe-3.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bbcb445fa71794da8f178f0f6d66789a28d7319071af7a496d4d507ed566270d", size = 20866 }, + { url = 
"https://files.pythonhosted.org/packages/29/28/6d029a903727a1b62edb51863232152fd335d602def598dade38996887f0/MarkupSafe-3.0.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:57cb5a3cf367aeb1d316576250f65edec5bb3be939e9247ae594b4bcbc317dfb", size = 20964 }, + { url = "https://files.pythonhosted.org/packages/cc/cd/07438f95f83e8bc028279909d9c9bd39e24149b0d60053a97b2bc4f8aa51/MarkupSafe-3.0.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:3809ede931876f5b2ec92eef964286840ed3540dadf803dd570c3b7e13141a3b", size = 21977 }, + { url = "https://files.pythonhosted.org/packages/29/01/84b57395b4cc062f9c4c55ce0df7d3108ca32397299d9df00fedd9117d3d/MarkupSafe-3.0.2-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:e07c3764494e3776c602c1e78e298937c3315ccc9043ead7e685b7f2b8d47b3c", size = 21366 }, + { url = "https://files.pythonhosted.org/packages/bd/6e/61ebf08d8940553afff20d1fb1ba7294b6f8d279df9fd0c0db911b4bbcfd/MarkupSafe-3.0.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:b424c77b206d63d500bcb69fa55ed8d0e6a3774056bdc4839fc9298a7edca171", size = 21091 }, + { url = "https://files.pythonhosted.org/packages/11/23/ffbf53694e8c94ebd1e7e491de185124277964344733c45481f32ede2499/MarkupSafe-3.0.2-cp310-cp310-win32.whl", hash = "sha256:fcabf5ff6eea076f859677f5f0b6b5c1a51e70a376b0579e0eadef8db48c6b50", size = 15065 }, + { url = "https://files.pythonhosted.org/packages/44/06/e7175d06dd6e9172d4a69a72592cb3f7a996a9c396eee29082826449bbc3/MarkupSafe-3.0.2-cp310-cp310-win_amd64.whl", hash = "sha256:6af100e168aa82a50e186c82875a5893c5597a0c1ccdb0d8b40240b1f28b969a", size = 15514 }, + { url = "https://files.pythonhosted.org/packages/6b/28/bbf83e3f76936960b850435576dd5e67034e200469571be53f69174a2dfd/MarkupSafe-3.0.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:9025b4018f3a1314059769c7bf15441064b2207cb3f065e6ea1e7359cb46db9d", size = 14353 }, + { url = "https://files.pythonhosted.org/packages/6c/30/316d194b093cde57d448a4c3209f22e3046c5bb2fb0820b118292b334be7/MarkupSafe-3.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:93335ca3812df2f366e80509ae119189886b0f3c2b81325d39efdb84a1e2ae93", size = 12392 }, + { url = "https://files.pythonhosted.org/packages/f2/96/9cdafba8445d3a53cae530aaf83c38ec64c4d5427d975c974084af5bc5d2/MarkupSafe-3.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2cb8438c3cbb25e220c2ab33bb226559e7afb3baec11c4f218ffa7308603c832", size = 23984 }, + { url = "https://files.pythonhosted.org/packages/f1/a4/aefb044a2cd8d7334c8a47d3fb2c9f328ac48cb349468cc31c20b539305f/MarkupSafe-3.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a123e330ef0853c6e822384873bef7507557d8e4a082961e1defa947aa59ba84", size = 23120 }, + { url = "https://files.pythonhosted.org/packages/8d/21/5e4851379f88f3fad1de30361db501300d4f07bcad047d3cb0449fc51f8c/MarkupSafe-3.0.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1e084f686b92e5b83186b07e8a17fc09e38fff551f3602b249881fec658d3eca", size = 23032 }, + { url = "https://files.pythonhosted.org/packages/00/7b/e92c64e079b2d0d7ddf69899c98842f3f9a60a1ae72657c89ce2655c999d/MarkupSafe-3.0.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:d8213e09c917a951de9d09ecee036d5c7d36cb6cb7dbaece4c71a60d79fb9798", size = 24057 }, + { url = "https://files.pythonhosted.org/packages/f9/ac/46f960ca323037caa0a10662ef97d0a4728e890334fc156b9f9e52bcc4ca/MarkupSafe-3.0.2-cp311-cp311-musllinux_1_2_i686.whl", 
hash = "sha256:5b02fb34468b6aaa40dfc198d813a641e3a63b98c2b05a16b9f80b7ec314185e", size = 23359 }, + { url = "https://files.pythonhosted.org/packages/69/84/83439e16197337b8b14b6a5b9c2105fff81d42c2a7c5b58ac7b62ee2c3b1/MarkupSafe-3.0.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:0bff5e0ae4ef2e1ae4fdf2dfd5b76c75e5c2fa4132d05fc1b0dabcd20c7e28c4", size = 23306 }, + { url = "https://files.pythonhosted.org/packages/9a/34/a15aa69f01e2181ed8d2b685c0d2f6655d5cca2c4db0ddea775e631918cd/MarkupSafe-3.0.2-cp311-cp311-win32.whl", hash = "sha256:6c89876f41da747c8d3677a2b540fb32ef5715f97b66eeb0c6b66f5e3ef6f59d", size = 15094 }, + { url = "https://files.pythonhosted.org/packages/da/b8/3a3bd761922d416f3dc5d00bfbed11f66b1ab89a0c2b6e887240a30b0f6b/MarkupSafe-3.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:70a87b411535ccad5ef2f1df5136506a10775d267e197e4cf531ced10537bd6b", size = 15521 }, + { url = "https://files.pythonhosted.org/packages/22/09/d1f21434c97fc42f09d290cbb6350d44eb12f09cc62c9476effdb33a18aa/MarkupSafe-3.0.2-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:9778bd8ab0a994ebf6f84c2b949e65736d5575320a17ae8984a77fab08db94cf", size = 14274 }, + { url = "https://files.pythonhosted.org/packages/6b/b0/18f76bba336fa5aecf79d45dcd6c806c280ec44538b3c13671d49099fdd0/MarkupSafe-3.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:846ade7b71e3536c4e56b386c2a47adf5741d2d8b94ec9dc3e92e5e1ee1e2225", size = 12348 }, + { url = "https://files.pythonhosted.org/packages/e0/25/dd5c0f6ac1311e9b40f4af06c78efde0f3b5cbf02502f8ef9501294c425b/MarkupSafe-3.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1c99d261bd2d5f6b59325c92c73df481e05e57f19837bdca8413b9eac4bd8028", size = 24149 }, + { url = "https://files.pythonhosted.org/packages/f3/f0/89e7aadfb3749d0f52234a0c8c7867877876e0a20b60e2188e9850794c17/MarkupSafe-3.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e17c96c14e19278594aa4841ec148115f9c7615a47382ecb6b82bd8fea3ab0c8", size = 23118 }, + { url = "https://files.pythonhosted.org/packages/d5/da/f2eeb64c723f5e3777bc081da884b414671982008c47dcc1873d81f625b6/MarkupSafe-3.0.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:88416bd1e65dcea10bc7569faacb2c20ce071dd1f87539ca2ab364bf6231393c", size = 22993 }, + { url = "https://files.pythonhosted.org/packages/da/0e/1f32af846df486dce7c227fe0f2398dc7e2e51d4a370508281f3c1c5cddc/MarkupSafe-3.0.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:2181e67807fc2fa785d0592dc2d6206c019b9502410671cc905d132a92866557", size = 24178 }, + { url = "https://files.pythonhosted.org/packages/c4/f6/bb3ca0532de8086cbff5f06d137064c8410d10779c4c127e0e47d17c0b71/MarkupSafe-3.0.2-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:52305740fe773d09cffb16f8ed0427942901f00adedac82ec8b67752f58a1b22", size = 23319 }, + { url = "https://files.pythonhosted.org/packages/a2/82/8be4c96ffee03c5b4a034e60a31294daf481e12c7c43ab8e34a1453ee48b/MarkupSafe-3.0.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:ad10d3ded218f1039f11a75f8091880239651b52e9bb592ca27de44eed242a48", size = 23352 }, + { url = "https://files.pythonhosted.org/packages/51/ae/97827349d3fcffee7e184bdf7f41cd6b88d9919c80f0263ba7acd1bbcb18/MarkupSafe-3.0.2-cp312-cp312-win32.whl", hash = "sha256:0f4ca02bea9a23221c0182836703cbf8930c5e9454bacce27e767509fa286a30", size = 15097 }, + { url = 
"https://files.pythonhosted.org/packages/c1/80/a61f99dc3a936413c3ee4e1eecac96c0da5ed07ad56fd975f1a9da5bc630/MarkupSafe-3.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:8e06879fc22a25ca47312fbe7c8264eb0b662f6db27cb2d3bbbc74b1df4b9b87", size = 15601 }, + { url = "https://files.pythonhosted.org/packages/83/0e/67eb10a7ecc77a0c2bbe2b0235765b98d164d81600746914bebada795e97/MarkupSafe-3.0.2-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:ba9527cdd4c926ed0760bc301f6728ef34d841f405abf9d4f959c478421e4efd", size = 14274 }, + { url = "https://files.pythonhosted.org/packages/2b/6d/9409f3684d3335375d04e5f05744dfe7e9f120062c9857df4ab490a1031a/MarkupSafe-3.0.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f8b3d067f2e40fe93e1ccdd6b2e1d16c43140e76f02fb1319a05cf2b79d99430", size = 12352 }, + { url = "https://files.pythonhosted.org/packages/d2/f5/6eadfcd3885ea85fe2a7c128315cc1bb7241e1987443d78c8fe712d03091/MarkupSafe-3.0.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:569511d3b58c8791ab4c2e1285575265991e6d8f8700c7be0e88f86cb0672094", size = 24122 }, + { url = "https://files.pythonhosted.org/packages/0c/91/96cf928db8236f1bfab6ce15ad070dfdd02ed88261c2afafd4b43575e9e9/MarkupSafe-3.0.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:15ab75ef81add55874e7ab7055e9c397312385bd9ced94920f2802310c930396", size = 23085 }, + { url = "https://files.pythonhosted.org/packages/c2/cf/c9d56af24d56ea04daae7ac0940232d31d5a8354f2b457c6d856b2057d69/MarkupSafe-3.0.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f3818cb119498c0678015754eba762e0d61e5b52d34c8b13d770f0719f7b1d79", size = 22978 }, + { url = "https://files.pythonhosted.org/packages/2a/9f/8619835cd6a711d6272d62abb78c033bda638fdc54c4e7f4272cf1c0962b/MarkupSafe-3.0.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:cdb82a876c47801bb54a690c5ae105a46b392ac6099881cdfb9f6e95e4014c6a", size = 24208 }, + { url = "https://files.pythonhosted.org/packages/f9/bf/176950a1792b2cd2102b8ffeb5133e1ed984547b75db47c25a67d3359f77/MarkupSafe-3.0.2-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:cabc348d87e913db6ab4aa100f01b08f481097838bdddf7c7a84b7575b7309ca", size = 23357 }, + { url = "https://files.pythonhosted.org/packages/ce/4f/9a02c1d335caabe5c4efb90e1b6e8ee944aa245c1aaaab8e8a618987d816/MarkupSafe-3.0.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:444dcda765c8a838eaae23112db52f1efaf750daddb2d9ca300bcae1039adc5c", size = 23344 }, + { url = "https://files.pythonhosted.org/packages/ee/55/c271b57db36f748f0e04a759ace9f8f759ccf22b4960c270c78a394f58be/MarkupSafe-3.0.2-cp313-cp313-win32.whl", hash = "sha256:bcf3e58998965654fdaff38e58584d8937aa3096ab5354d493c77d1fdd66d7a1", size = 15101 }, + { url = "https://files.pythonhosted.org/packages/29/88/07df22d2dd4df40aba9f3e402e6dc1b8ee86297dddbad4872bd5e7b0094f/MarkupSafe-3.0.2-cp313-cp313-win_amd64.whl", hash = "sha256:e6a2a455bd412959b57a172ce6328d2dd1f01cb2135efda2e4576e8a23fa3b0f", size = 15603 }, + { url = "https://files.pythonhosted.org/packages/62/6a/8b89d24db2d32d433dffcd6a8779159da109842434f1dd2f6e71f32f738c/MarkupSafe-3.0.2-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:b5a6b3ada725cea8a5e634536b1b01c30bcdcd7f9c6fff4151548d5bf6b3a36c", size = 14510 }, + { url = "https://files.pythonhosted.org/packages/7a/06/a10f955f70a2e5a9bf78d11a161029d278eeacbd35ef806c3fd17b13060d/MarkupSafe-3.0.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = 
"sha256:a904af0a6162c73e3edcb969eeeb53a63ceeb5d8cf642fade7d39e7963a22ddb", size = 12486 }, + { url = "https://files.pythonhosted.org/packages/34/cf/65d4a571869a1a9078198ca28f39fba5fbb910f952f9dbc5220afff9f5e6/MarkupSafe-3.0.2-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4aa4e5faecf353ed117801a068ebab7b7e09ffb6e1d5e412dc852e0da018126c", size = 25480 }, + { url = "https://files.pythonhosted.org/packages/0c/e3/90e9651924c430b885468b56b3d597cabf6d72be4b24a0acd1fa0e12af67/MarkupSafe-3.0.2-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c0ef13eaeee5b615fb07c9a7dadb38eac06a0608b41570d8ade51c56539e509d", size = 23914 }, + { url = "https://files.pythonhosted.org/packages/66/8c/6c7cf61f95d63bb866db39085150df1f2a5bd3335298f14a66b48e92659c/MarkupSafe-3.0.2-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d16a81a06776313e817c951135cf7340a3e91e8c1ff2fac444cfd75fffa04afe", size = 23796 }, + { url = "https://files.pythonhosted.org/packages/bb/35/cbe9238ec3f47ac9a7c8b3df7a808e7cb50fe149dc7039f5f454b3fba218/MarkupSafe-3.0.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:6381026f158fdb7c72a168278597a5e3a5222e83ea18f543112b2662a9b699c5", size = 25473 }, + { url = "https://files.pythonhosted.org/packages/e6/32/7621a4382488aa283cc05e8984a9c219abad3bca087be9ec77e89939ded9/MarkupSafe-3.0.2-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:3d79d162e7be8f996986c064d1c7c817f6df3a77fe3d6859f6f9e7be4b8c213a", size = 24114 }, + { url = "https://files.pythonhosted.org/packages/0d/80/0985960e4b89922cb5a0bac0ed39c5b96cbc1a536a99f30e8c220a996ed9/MarkupSafe-3.0.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:131a3c7689c85f5ad20f9f6fb1b866f402c445b220c19fe4308c0b147ccd2ad9", size = 24098 }, + { url = "https://files.pythonhosted.org/packages/82/78/fedb03c7d5380df2427038ec8d973587e90561b2d90cd472ce9254cf348b/MarkupSafe-3.0.2-cp313-cp313t-win32.whl", hash = "sha256:ba8062ed2cf21c07a9e295d5b8a2a5ce678b913b45fdf68c32d95d6c1291e0b6", size = 15208 }, + { url = "https://files.pythonhosted.org/packages/4f/65/6079a46068dfceaeabb5dcad6d674f5f5c61a6fa5673746f42a9f4c233b3/MarkupSafe-3.0.2-cp313-cp313t-win_amd64.whl", hash = "sha256:e444a31f8db13eb18ada366ab3cf45fd4b31e4db1236a4448f68778c1d1a5a2f", size = 15739 }, + { url = "https://files.pythonhosted.org/packages/a7/ea/9b1530c3fdeeca613faeb0fb5cbcf2389d816072fab72a71b45749ef6062/MarkupSafe-3.0.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:eaa0a10b7f72326f1372a713e73c3f739b524b3af41feb43e4921cb529f5929a", size = 14344 }, + { url = "https://files.pythonhosted.org/packages/4b/c2/fbdbfe48848e7112ab05e627e718e854d20192b674952d9042ebd8c9e5de/MarkupSafe-3.0.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:48032821bbdf20f5799ff537c7ac3d1fba0ba032cfc06194faffa8cda8b560ff", size = 12389 }, + { url = "https://files.pythonhosted.org/packages/f0/25/7a7c6e4dbd4f867d95d94ca15449e91e52856f6ed1905d58ef1de5e211d0/MarkupSafe-3.0.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1a9d3f5f0901fdec14d8d2f66ef7d035f2157240a433441719ac9a3fba440b13", size = 21607 }, + { url = "https://files.pythonhosted.org/packages/53/8f/f339c98a178f3c1e545622206b40986a4c3307fe39f70ccd3d9df9a9e425/MarkupSafe-3.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:88b49a3b9ff31e19998750c38e030fc7bb937398b1f78cfa599aaef92d693144", size = 20728 }, + { url = 
"https://files.pythonhosted.org/packages/1a/03/8496a1a78308456dbd50b23a385c69b41f2e9661c67ea1329849a598a8f9/MarkupSafe-3.0.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cfad01eed2c2e0c01fd0ecd2ef42c492f7f93902e39a42fc9ee1692961443a29", size = 20826 }, + { url = "https://files.pythonhosted.org/packages/e6/cf/0a490a4bd363048c3022f2f475c8c05582179bb179defcee4766fb3dcc18/MarkupSafe-3.0.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:1225beacc926f536dc82e45f8a4d68502949dc67eea90eab715dea3a21c1b5f0", size = 21843 }, + { url = "https://files.pythonhosted.org/packages/19/a3/34187a78613920dfd3cdf68ef6ce5e99c4f3417f035694074beb8848cd77/MarkupSafe-3.0.2-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:3169b1eefae027567d1ce6ee7cae382c57fe26e82775f460f0b2778beaad66c0", size = 21219 }, + { url = "https://files.pythonhosted.org/packages/17/d8/5811082f85bb88410ad7e452263af048d685669bbbfb7b595e8689152498/MarkupSafe-3.0.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:eb7972a85c54febfb25b5c4b4f3af4dcc731994c7da0d8a0b4a6eb0640e1d178", size = 20946 }, + { url = "https://files.pythonhosted.org/packages/7c/31/bd635fb5989440d9365c5e3c47556cfea121c7803f5034ac843e8f37c2f2/MarkupSafe-3.0.2-cp39-cp39-win32.whl", hash = "sha256:8c4e8c3ce11e1f92f6536ff07154f9d49677ebaaafc32db9db4620bc11ed480f", size = 15063 }, + { url = "https://files.pythonhosted.org/packages/b3/73/085399401383ce949f727afec55ec3abd76648d04b9f22e1c0e99cb4bec3/MarkupSafe-3.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:6e296a513ca3d94054c2c881cc913116e90fd030ad1c656b3869762b754f5f8a", size = 15506 }, +] + +[[package]] +name = "marlin-kernels" +version = "0.3.7" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version >= '3.13'", +] +dependencies = [ + { name = "torch", marker = "python_full_version >= '3.13'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b2/82/886d1eece474ef23668c4780f5053ea654999704a0195aadc651631b740d/marlin-kernels-0.3.7.tar.gz", hash = "sha256:8be8a65fd9ae21b2406afba9e460e3922582479b85a1372096e87e3a15684a77", size = 15662 } + +[[package]] +name = "marlin-kernels" +version = "0.3.7" +source = { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp310-cp310-linux_x86_64.whl" } +resolution-markers = [ + "python_full_version == '3.10.*'", +] +dependencies = [ + { name = "torch", marker = "python_full_version == '3.10.*'" }, +] +wheels = [ + { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp310-cp310-linux_x86_64.whl", hash = "sha256:bb416d14623dc0ad0eeb2835446c37a41f994555f1baec8701de6d4c1fc17ec8" }, +] + +[package.metadata] +requires-dist = [{ name = "torch" }] + +[[package]] +name = "marlin-kernels" +version = "0.3.7" +source = { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp311-cp311-linux_x86_64.whl" } +resolution-markers = [ + "python_full_version == '3.11.*'", +] +dependencies = [ + { name = "torch", marker = "python_full_version == '3.11.*'" }, +] +wheels = [ + { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp311-cp311-linux_x86_64.whl", hash = "sha256:a89bb61d718002d4432158641bce95c6fd68f9ee1a7d5402dd283903397f3185" }, +] + +[package.metadata] +requires-dist = [{ name = "torch" }] + +[[package]] +name = "marlin-kernels" +version = "0.3.7" +source = { 
url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp312-cp312-linux_x86_64.whl" } +resolution-markers = [ + "python_full_version == '3.12.*'", +] +dependencies = [ + { name = "torch", marker = "python_full_version == '3.12.*'" }, +] +wheels = [ + { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp312-cp312-linux_x86_64.whl", hash = "sha256:ed938d196fc5e9cce9fc44cd2b889d5adc5ca7475c8a23858f1474d29e38bdbf" }, +] + +[package.metadata] +requires-dist = [{ name = "torch" }] + +[[package]] +name = "marlin-kernels" +version = "0.3.7" +source = { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp39-cp39-linux_x86_64.whl" } +resolution-markers = [ + "python_full_version < '3.10'", +] +dependencies = [ + { name = "torch", marker = "python_full_version < '3.10'" }, +] +wheels = [ + { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp39-cp39-linux_x86_64.whl", hash = "sha256:113c54f68565ad476ca12366b4de92131fa3e9ddb16cbe8ad63272972a15ac28" }, +] + +[package.metadata] +requires-dist = [{ name = "torch" }] + +[[package]] +name = "mdurl" +version = "0.1.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d6/54/cfe61301667036ec958cb99bd3efefba235e65cdeb9c84d24a8293ba1d90/mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba", size = 8729 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b3/38/89ba8ad64ae25be8de66a6d463314cf1eb366222074cfda9ee839c56a4b4/mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8", size = 9979 }, +] + +[[package]] +name = "moe-kernels" +version = "0.7.0" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version >= '3.13'", +] +dependencies = [ + { name = "nvidia-ml-py", marker = "python_full_version >= '3.13'" }, + { name = "torch", marker = "python_full_version >= '3.13'" }, + { name = "triton", marker = "python_full_version >= '3.13'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/9c/a1/76f32a7ce5b18e5b841e64e0e2631fa2b94f432743d6ab76b76fb24fe961/moe-kernels-0.7.0.tar.gz", hash = "sha256:1f564affad32077fe17c24566bd6200e38087e90b248636bab55fac2e1ac6874", size = 23629 } + +[[package]] +name = "moe-kernels" +version = "0.7.0" +source = { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp310-cp310-linux_x86_64.whl" } +resolution-markers = [ + "python_full_version == '3.10.*'", +] +dependencies = [ + { name = "nvidia-ml-py", marker = "python_full_version == '3.10.*'" }, + { name = "torch", marker = "python_full_version == '3.10.*'" }, + { name = "triton", marker = "python_full_version == '3.10.*'" }, +] +wheels = [ + { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp310-cp310-linux_x86_64.whl", hash = "sha256:f8c126395f11522881c6bf1f6120e3670822006a84e2ff74af561c22445746b3" }, +] + +[package.metadata] +requires-dist = [ + { name = "nvidia-ml-py" }, + { name = "torch" }, + { name = "triton" }, +] + +[[package]] +name = "moe-kernels" +version = "0.7.0" +source = { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp311-cp311-linux_x86_64.whl" } 
+resolution-markers = [ + "python_full_version == '3.11.*'", +] +dependencies = [ + { name = "nvidia-ml-py", marker = "python_full_version == '3.11.*'" }, + { name = "torch", marker = "python_full_version == '3.11.*'" }, + { name = "triton", marker = "python_full_version == '3.11.*'" }, +] +wheels = [ + { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp311-cp311-linux_x86_64.whl", hash = "sha256:2afff8346251f01d5d90bab738e3dfaa6b14a414a9c88205d396ab2bae87983a" }, +] + +[package.metadata] +requires-dist = [ + { name = "nvidia-ml-py" }, + { name = "torch" }, + { name = "triton" }, +] + +[[package]] +name = "moe-kernels" +version = "0.7.0" +source = { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp312-cp312-linux_x86_64.whl" } +resolution-markers = [ + "python_full_version == '3.12.*'", +] +dependencies = [ + { name = "nvidia-ml-py", marker = "python_full_version == '3.12.*'" }, + { name = "torch", marker = "python_full_version == '3.12.*'" }, + { name = "triton", marker = "python_full_version == '3.12.*'" }, +] +wheels = [ + { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp312-cp312-linux_x86_64.whl", hash = "sha256:b1a29e33d3b7d85e2b4f8bd47db28211096d1f645e0868d5a1f3666ebb9bd9e3" }, +] + +[package.metadata] +requires-dist = [ + { name = "nvidia-ml-py" }, + { name = "torch" }, + { name = "triton" }, +] + +[[package]] +name = "moe-kernels" +version = "0.7.0" +source = { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp39-cp39-linux_x86_64.whl" } +resolution-markers = [ + "python_full_version < '3.10'", +] +dependencies = [ + { name = "nvidia-ml-py", marker = "python_full_version < '3.10'" }, + { name = "torch", marker = "python_full_version < '3.10'" }, + { name = "triton", marker = "python_full_version < '3.10'" }, +] +wheels = [ + { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp39-cp39-linux_x86_64.whl", hash = "sha256:9573611174cda9f6fafa1816521e38582fd2903b321bbaf78f83cf6e3189ac7d" }, +] + +[package.metadata] +requires-dist = [ + { name = "nvidia-ml-py" }, + { name = "torch" }, + { name = "triton" }, +] + +[[package]] +name = "mpmath" +version = "1.3.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/e0/47/dd32fa426cc72114383ac549964eecb20ecfd886d1e5ccf5340b55b02f57/mpmath-1.3.0.tar.gz", hash = "sha256:7a28eb2a9774d00c7bc92411c19a89209d5da7c4c9a9e227be8330a23a25b91f", size = 508106 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/43/e3/7d92a15f894aa0c9c4b49b8ee9ac9850d6e63b03c9c32c0367a13ae62209/mpmath-1.3.0-py3-none-any.whl", hash = "sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c", size = 536198 }, +] + +[[package]] +name = "multidict" +version = "6.1.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "typing-extensions", marker = "python_full_version < '3.11'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/d6/be/504b89a5e9ca731cd47487e91c469064f8ae5af93b7259758dcfc2b9c848/multidict-6.1.0.tar.gz", hash = "sha256:22ae2ebf9b0c69d206c003e2f6a914ea33f0a932d4aa16f236afc049d9958f4a", size = 64002 } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/29/68/259dee7fd14cf56a17c554125e534f6274c2860159692a414d0b402b9a6d/multidict-6.1.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:3380252550e372e8511d49481bd836264c009adb826b23fefcc5dd3c69692f60", size = 48628 }, + { url = "https://files.pythonhosted.org/packages/50/79/53ba256069fe5386a4a9e80d4e12857ced9de295baf3e20c68cdda746e04/multidict-6.1.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:99f826cbf970077383d7de805c0681799491cb939c25450b9b5b3ced03ca99f1", size = 29327 }, + { url = "https://files.pythonhosted.org/packages/ff/10/71f1379b05b196dae749b5ac062e87273e3f11634f447ebac12a571d90ae/multidict-6.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:a114d03b938376557927ab23f1e950827c3b893ccb94b62fd95d430fd0e5cf53", size = 29689 }, + { url = "https://files.pythonhosted.org/packages/71/45/70bac4f87438ded36ad4793793c0095de6572d433d98575a5752629ef549/multidict-6.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b1c416351ee6271b2f49b56ad7f308072f6f44b37118d69c2cad94f3fa8a40d5", size = 126639 }, + { url = "https://files.pythonhosted.org/packages/80/cf/17f35b3b9509b4959303c05379c4bfb0d7dd05c3306039fc79cf035bbac0/multidict-6.1.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6b5d83030255983181005e6cfbac1617ce9746b219bc2aad52201ad121226581", size = 134315 }, + { url = "https://files.pythonhosted.org/packages/ef/1f/652d70ab5effb33c031510a3503d4d6efc5ec93153562f1ee0acdc895a57/multidict-6.1.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3e97b5e938051226dc025ec80980c285b053ffb1e25a3db2a3aa3bc046bf7f56", size = 129471 }, + { url = "https://files.pythonhosted.org/packages/a6/64/2dd6c4c681688c0165dea3975a6a4eab4944ea30f35000f8b8af1df3148c/multidict-6.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d618649d4e70ac6efcbba75be98b26ef5078faad23592f9b51ca492953012429", size = 124585 }, + { url = "https://files.pythonhosted.org/packages/87/56/e6ee5459894c7e554b57ba88f7257dc3c3d2d379cb15baaa1e265b8c6165/multidict-6.1.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:10524ebd769727ac77ef2278390fb0068d83f3acb7773792a5080f2b0abf7748", size = 116957 }, + { url = "https://files.pythonhosted.org/packages/36/9e/616ce5e8d375c24b84f14fc263c7ef1d8d5e8ef529dbc0f1df8ce71bb5b8/multidict-6.1.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:ff3827aef427c89a25cc96ded1759271a93603aba9fb977a6d264648ebf989db", size = 128609 }, + { url = "https://files.pythonhosted.org/packages/8c/4f/4783e48a38495d000f2124020dc96bacc806a4340345211b1ab6175a6cb4/multidict-6.1.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:06809f4f0f7ab7ea2cabf9caca7d79c22c0758b58a71f9d32943ae13c7ace056", size = 123016 }, + { url = "https://files.pythonhosted.org/packages/3e/b3/4950551ab8fc39862ba5e9907dc821f896aa829b4524b4deefd3e12945ab/multidict-6.1.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:f179dee3b863ab1c59580ff60f9d99f632f34ccb38bf67a33ec6b3ecadd0fd76", size = 133542 }, + { url = "https://files.pythonhosted.org/packages/96/4d/f0ce6ac9914168a2a71df117935bb1f1781916acdecbb43285e225b484b8/multidict-6.1.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:aaed8b0562be4a0876ee3b6946f6869b7bcdb571a5d1496683505944e268b160", size = 130163 }, + { url = 
"https://files.pythonhosted.org/packages/be/72/17c9f67e7542a49dd252c5ae50248607dfb780bcc03035907dafefb067e3/multidict-6.1.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:3c8b88a2ccf5493b6c8da9076fb151ba106960a2df90c2633f342f120751a9e7", size = 126832 }, + { url = "https://files.pythonhosted.org/packages/71/9f/72d719e248cbd755c8736c6d14780533a1606ffb3fbb0fbd77da9f0372da/multidict-6.1.0-cp310-cp310-win32.whl", hash = "sha256:4a9cb68166a34117d6646c0023c7b759bf197bee5ad4272f420a0141d7eb03a0", size = 26402 }, + { url = "https://files.pythonhosted.org/packages/04/5a/d88cd5d00a184e1ddffc82aa2e6e915164a6d2641ed3606e766b5d2f275a/multidict-6.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:20b9b5fbe0b88d0bdef2012ef7dee867f874b72528cf1d08f1d59b0e3850129d", size = 28800 }, + { url = "https://files.pythonhosted.org/packages/93/13/df3505a46d0cd08428e4c8169a196131d1b0c4b515c3649829258843dde6/multidict-6.1.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:3efe2c2cb5763f2f1b275ad2bf7a287d3f7ebbef35648a9726e3b69284a4f3d6", size = 48570 }, + { url = "https://files.pythonhosted.org/packages/f0/e1/a215908bfae1343cdb72f805366592bdd60487b4232d039c437fe8f5013d/multidict-6.1.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c7053d3b0353a8b9de430a4f4b4268ac9a4fb3481af37dfe49825bf45ca24156", size = 29316 }, + { url = "https://files.pythonhosted.org/packages/70/0f/6dc70ddf5d442702ed74f298d69977f904960b82368532c88e854b79f72b/multidict-6.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:27e5fc84ccef8dfaabb09d82b7d179c7cf1a3fbc8a966f8274fcb4ab2eb4cadb", size = 29640 }, + { url = "https://files.pythonhosted.org/packages/d8/6d/9c87b73a13d1cdea30b321ef4b3824449866bd7f7127eceed066ccb9b9ff/multidict-6.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0e2b90b43e696f25c62656389d32236e049568b39320e2735d51f08fd362761b", size = 131067 }, + { url = "https://files.pythonhosted.org/packages/cc/1e/1b34154fef373371fd6c65125b3d42ff5f56c7ccc6bfff91b9b3c60ae9e0/multidict-6.1.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d83a047959d38a7ff552ff94be767b7fd79b831ad1cd9920662db05fec24fe72", size = 138507 }, + { url = "https://files.pythonhosted.org/packages/fb/e0/0bc6b2bac6e461822b5f575eae85da6aae76d0e2a79b6665d6206b8e2e48/multidict-6.1.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d1a9dd711d0877a1ece3d2e4fea11a8e75741ca21954c919406b44e7cf971304", size = 133905 }, + { url = "https://files.pythonhosted.org/packages/ba/af/73d13b918071ff9b2205fcf773d316e0f8fefb4ec65354bbcf0b10908cc6/multidict-6.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ec2abea24d98246b94913b76a125e855eb5c434f7c46546046372fe60f666351", size = 129004 }, + { url = "https://files.pythonhosted.org/packages/74/21/23960627b00ed39643302d81bcda44c9444ebcdc04ee5bedd0757513f259/multidict-6.1.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4867cafcbc6585e4b678876c489b9273b13e9fff9f6d6d66add5e15d11d926cb", size = 121308 }, + { url = "https://files.pythonhosted.org/packages/8b/5c/cf282263ffce4a596ed0bb2aa1a1dddfe1996d6a62d08842a8d4b33dca13/multidict-6.1.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:5b48204e8d955c47c55b72779802b219a39acc3ee3d0116d5080c388970b76e3", size = 132608 }, + { url = "https://files.pythonhosted.org/packages/d7/3e/97e778c041c72063f42b290888daff008d3ab1427f5b09b714f5a8eff294/multidict-6.1.0-cp311-cp311-musllinux_1_2_i686.whl", hash = 
"sha256:d8fff389528cad1618fb4b26b95550327495462cd745d879a8c7c2115248e399", size = 127029 }, + { url = "https://files.pythonhosted.org/packages/47/ac/3efb7bfe2f3aefcf8d103e9a7162572f01936155ab2f7ebcc7c255a23212/multidict-6.1.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:a7a9541cd308eed5e30318430a9c74d2132e9a8cb46b901326272d780bf2d423", size = 137594 }, + { url = "https://files.pythonhosted.org/packages/42/9b/6c6e9e8dc4f915fc90a9b7798c44a30773dea2995fdcb619870e705afe2b/multidict-6.1.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:da1758c76f50c39a2efd5e9859ce7d776317eb1dd34317c8152ac9251fc574a3", size = 134556 }, + { url = "https://files.pythonhosted.org/packages/1d/10/8e881743b26aaf718379a14ac58572a240e8293a1c9d68e1418fb11c0f90/multidict-6.1.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:c943a53e9186688b45b323602298ab727d8865d8c9ee0b17f8d62d14b56f0753", size = 130993 }, + { url = "https://files.pythonhosted.org/packages/45/84/3eb91b4b557442802d058a7579e864b329968c8d0ea57d907e7023c677f2/multidict-6.1.0-cp311-cp311-win32.whl", hash = "sha256:90f8717cb649eea3504091e640a1b8568faad18bd4b9fcd692853a04475a4b80", size = 26405 }, + { url = "https://files.pythonhosted.org/packages/9f/0b/ad879847ecbf6d27e90a6eabb7eff6b62c129eefe617ea45eae7c1f0aead/multidict-6.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:82176036e65644a6cc5bd619f65f6f19781e8ec2e5330f51aa9ada7504cc1926", size = 28795 }, + { url = "https://files.pythonhosted.org/packages/fd/16/92057c74ba3b96d5e211b553895cd6dc7cc4d1e43d9ab8fafc727681ef71/multidict-6.1.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:b04772ed465fa3cc947db808fa306d79b43e896beb677a56fb2347ca1a49c1fa", size = 48713 }, + { url = "https://files.pythonhosted.org/packages/94/3d/37d1b8893ae79716179540b89fc6a0ee56b4a65fcc0d63535c6f5d96f217/multidict-6.1.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:6180c0ae073bddeb5a97a38c03f30c233e0a4d39cd86166251617d1bbd0af436", size = 29516 }, + { url = "https://files.pythonhosted.org/packages/a2/12/adb6b3200c363062f805275b4c1e656be2b3681aada66c80129932ff0bae/multidict-6.1.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:071120490b47aa997cca00666923a83f02c7fbb44f71cf7f136df753f7fa8761", size = 29557 }, + { url = "https://files.pythonhosted.org/packages/47/e9/604bb05e6e5bce1e6a5cf80a474e0f072e80d8ac105f1b994a53e0b28c42/multidict-6.1.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:50b3a2710631848991d0bf7de077502e8994c804bb805aeb2925a981de58ec2e", size = 130170 }, + { url = "https://files.pythonhosted.org/packages/7e/13/9efa50801785eccbf7086b3c83b71a4fb501a4d43549c2f2f80b8787d69f/multidict-6.1.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b58c621844d55e71c1b7f7c498ce5aa6985d743a1a59034c57a905b3f153c1ef", size = 134836 }, + { url = "https://files.pythonhosted.org/packages/bf/0f/93808b765192780d117814a6dfcc2e75de6dcc610009ad408b8814dca3ba/multidict-6.1.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:55b6d90641869892caa9ca42ff913f7ff1c5ece06474fbd32fb2cf6834726c95", size = 133475 }, + { url = "https://files.pythonhosted.org/packages/d3/c8/529101d7176fe7dfe1d99604e48d69c5dfdcadb4f06561f465c8ef12b4df/multidict-6.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4b820514bfc0b98a30e3d85462084779900347e4d49267f747ff54060cc33925", size = 131049 }, + { url = 
"https://files.pythonhosted.org/packages/ca/0c/fc85b439014d5a58063e19c3a158a889deec399d47b5269a0f3b6a2e28bc/multidict-6.1.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:10a9b09aba0c5b48c53761b7c720aaaf7cf236d5fe394cd399c7ba662d5f9966", size = 120370 }, + { url = "https://files.pythonhosted.org/packages/db/46/d4416eb20176492d2258fbd47b4abe729ff3b6e9c829ea4236f93c865089/multidict-6.1.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:1e16bf3e5fc9f44632affb159d30a437bfe286ce9e02754759be5536b169b305", size = 125178 }, + { url = "https://files.pythonhosted.org/packages/5b/46/73697ad7ec521df7de5531a32780bbfd908ded0643cbe457f981a701457c/multidict-6.1.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:76f364861c3bfc98cbbcbd402d83454ed9e01a5224bb3a28bf70002a230f73e2", size = 119567 }, + { url = "https://files.pythonhosted.org/packages/cd/ed/51f060e2cb0e7635329fa6ff930aa5cffa17f4c7f5c6c3ddc3500708e2f2/multidict-6.1.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:820c661588bd01a0aa62a1283f20d2be4281b086f80dad9e955e690c75fb54a2", size = 129822 }, + { url = "https://files.pythonhosted.org/packages/df/9e/ee7d1954b1331da3eddea0c4e08d9142da5f14b1321c7301f5014f49d492/multidict-6.1.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:0e5f362e895bc5b9e67fe6e4ded2492d8124bdf817827f33c5b46c2fe3ffaca6", size = 128656 }, + { url = "https://files.pythonhosted.org/packages/77/00/8538f11e3356b5d95fa4b024aa566cde7a38aa7a5f08f4912b32a037c5dc/multidict-6.1.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:3ec660d19bbc671e3a6443325f07263be452c453ac9e512f5eb935e7d4ac28b3", size = 125360 }, + { url = "https://files.pythonhosted.org/packages/be/05/5d334c1f2462d43fec2363cd00b1c44c93a78c3925d952e9a71caf662e96/multidict-6.1.0-cp312-cp312-win32.whl", hash = "sha256:58130ecf8f7b8112cdb841486404f1282b9c86ccb30d3519faf301b2e5659133", size = 26382 }, + { url = "https://files.pythonhosted.org/packages/a3/bf/f332a13486b1ed0496d624bcc7e8357bb8053823e8cd4b9a18edc1d97e73/multidict-6.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:188215fc0aafb8e03341995e7c4797860181562380f81ed0a87ff455b70bf1f1", size = 28529 }, + { url = "https://files.pythonhosted.org/packages/22/67/1c7c0f39fe069aa4e5d794f323be24bf4d33d62d2a348acdb7991f8f30db/multidict-6.1.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:d569388c381b24671589335a3be6e1d45546c2988c2ebe30fdcada8457a31008", size = 48771 }, + { url = "https://files.pythonhosted.org/packages/3c/25/c186ee7b212bdf0df2519eacfb1981a017bda34392c67542c274651daf23/multidict-6.1.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:052e10d2d37810b99cc170b785945421141bf7bb7d2f8799d431e7db229c385f", size = 29533 }, + { url = "https://files.pythonhosted.org/packages/67/5e/04575fd837e0958e324ca035b339cea174554f6f641d3fb2b4f2e7ff44a2/multidict-6.1.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f90c822a402cb865e396a504f9fc8173ef34212a342d92e362ca498cad308e28", size = 29595 }, + { url = "https://files.pythonhosted.org/packages/d3/b2/e56388f86663810c07cfe4a3c3d87227f3811eeb2d08450b9e5d19d78876/multidict-6.1.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b225d95519a5bf73860323e633a664b0d85ad3d5bede6d30d95b35d4dfe8805b", size = 130094 }, + { url = "https://files.pythonhosted.org/packages/6c/ee/30ae9b4186a644d284543d55d491fbd4239b015d36b23fea43b4c94f7052/multidict-6.1.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:23bfd518810af7de1116313ebd9092cb9aa629beb12f6ed631ad53356ed6b86c", size = 134876 }, + { url = "https://files.pythonhosted.org/packages/84/c7/70461c13ba8ce3c779503c70ec9d0345ae84de04521c1f45a04d5f48943d/multidict-6.1.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5c09fcfdccdd0b57867577b719c69e347a436b86cd83747f179dbf0cc0d4c1f3", size = 133500 }, + { url = "https://files.pythonhosted.org/packages/4a/9f/002af221253f10f99959561123fae676148dd730e2daa2cd053846a58507/multidict-6.1.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bf6bea52ec97e95560af5ae576bdac3aa3aae0b6758c6efa115236d9e07dae44", size = 131099 }, + { url = "https://files.pythonhosted.org/packages/82/42/d1c7a7301d52af79d88548a97e297f9d99c961ad76bbe6f67442bb77f097/multidict-6.1.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:57feec87371dbb3520da6192213c7d6fc892d5589a93db548331954de8248fd2", size = 120403 }, + { url = "https://files.pythonhosted.org/packages/68/f3/471985c2c7ac707547553e8f37cff5158030d36bdec4414cb825fbaa5327/multidict-6.1.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:0c3f390dc53279cbc8ba976e5f8035eab997829066756d811616b652b00a23a3", size = 125348 }, + { url = "https://files.pythonhosted.org/packages/67/2c/e6df05c77e0e433c214ec1d21ddd203d9a4770a1f2866a8ca40a545869a0/multidict-6.1.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:59bfeae4b25ec05b34f1956eaa1cb38032282cd4dfabc5056d0a1ec4d696d3aa", size = 119673 }, + { url = "https://files.pythonhosted.org/packages/c5/cd/bc8608fff06239c9fb333f9db7743a1b2eafe98c2666c9a196e867a3a0a4/multidict-6.1.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:b2f59caeaf7632cc633b5cf6fc449372b83bbdf0da4ae04d5be36118e46cc0aa", size = 129927 }, + { url = "https://files.pythonhosted.org/packages/44/8e/281b69b7bc84fc963a44dc6e0bbcc7150e517b91df368a27834299a526ac/multidict-6.1.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:37bb93b2178e02b7b618893990941900fd25b6b9ac0fa49931a40aecdf083fe4", size = 128711 }, + { url = "https://files.pythonhosted.org/packages/12/a4/63e7cd38ed29dd9f1881d5119f272c898ca92536cdb53ffe0843197f6c85/multidict-6.1.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4e9f48f58c2c523d5a06faea47866cd35b32655c46b443f163d08c6d0ddb17d6", size = 125519 }, + { url = "https://files.pythonhosted.org/packages/38/e0/4f5855037a72cd8a7a2f60a3952d9aa45feedb37ae7831642102604e8a37/multidict-6.1.0-cp313-cp313-win32.whl", hash = "sha256:3a37ffb35399029b45c6cc33640a92bef403c9fd388acce75cdc88f58bd19a81", size = 26426 }, + { url = "https://files.pythonhosted.org/packages/7e/a5/17ee3a4db1e310b7405f5d25834460073a8ccd86198ce044dfaf69eac073/multidict-6.1.0-cp313-cp313-win_amd64.whl", hash = "sha256:e9aa71e15d9d9beaad2c6b9319edcdc0a49a43ef5c0a4c8265ca9ee7d6c67774", size = 28531 }, + { url = "https://files.pythonhosted.org/packages/e7/c9/9e153a6572b38ac5ff4434113af38acf8d5e9957897cdb1f513b3d6614ed/multidict-6.1.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:4e18b656c5e844539d506a0a06432274d7bd52a7487e6828c63a63d69185626c", size = 48550 }, + { url = "https://files.pythonhosted.org/packages/76/f5/79565ddb629eba6c7f704f09a09df085c8dc04643b12506f10f718cee37a/multidict-6.1.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:a185f876e69897a6f3325c3f19f26a297fa058c5e456bfcff8015e9a27e83ae1", size = 29298 }, + { url = 
"https://files.pythonhosted.org/packages/60/1b/9851878b704bc98e641a3e0bce49382ae9e05743dac6d97748feb5b7baba/multidict-6.1.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ab7c4ceb38d91570a650dba194e1ca87c2b543488fe9309b4212694174fd539c", size = 29641 }, + { url = "https://files.pythonhosted.org/packages/89/87/d451d45aab9e422cb0fb2f7720c31a4c1d3012c740483c37f642eba568fb/multidict-6.1.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e617fb6b0b6953fffd762669610c1c4ffd05632c138d61ac7e14ad187870669c", size = 126202 }, + { url = "https://files.pythonhosted.org/packages/fa/b4/27cbe9f3e2e469359887653f2e45470272eef7295139916cc21107c6b48c/multidict-6.1.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:16e5f4bf4e603eb1fdd5d8180f1a25f30056f22e55ce51fb3d6ad4ab29f7d96f", size = 133925 }, + { url = "https://files.pythonhosted.org/packages/4d/a3/afc841899face8adfd004235ce759a37619f6ec99eafd959650c5ce4df57/multidict-6.1.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f4c035da3f544b1882bac24115f3e2e8760f10a0107614fc9839fd232200b875", size = 129039 }, + { url = "https://files.pythonhosted.org/packages/5e/41/0d0fb18c1ad574f807196f5f3d99164edf9de3e169a58c6dc2d6ed5742b9/multidict-6.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:957cf8e4b6e123a9eea554fa7ebc85674674b713551de587eb318a2df3e00255", size = 124072 }, + { url = "https://files.pythonhosted.org/packages/00/22/defd7a2e71a44e6e5b9a5428f972e5b572e7fe28e404dfa6519bbf057c93/multidict-6.1.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:483a6aea59cb89904e1ceabd2b47368b5600fb7de78a6e4a2c2987b2d256cf30", size = 116532 }, + { url = "https://files.pythonhosted.org/packages/91/25/f7545102def0b1d456ab6449388eed2dfd822debba1d65af60194904a23a/multidict-6.1.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:87701f25a2352e5bf7454caa64757642734da9f6b11384c1f9d1a8e699758057", size = 128173 }, + { url = "https://files.pythonhosted.org/packages/45/79/3dbe8d35fc99f5ea610813a72ab55f426cb9cf482f860fa8496e5409be11/multidict-6.1.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:682b987361e5fd7a139ed565e30d81fd81e9629acc7d925a205366877d8c8657", size = 122654 }, + { url = "https://files.pythonhosted.org/packages/97/cb/209e735eeab96e1b160825b5d0b36c56d3862abff828fc43999bb957dcad/multidict-6.1.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:ce2186a7df133a9c895dea3331ddc5ddad42cdd0d1ea2f0a51e5d161e4762f28", size = 133197 }, + { url = "https://files.pythonhosted.org/packages/e4/3a/a13808a7ada62808afccea67837a79d00ad6581440015ef00f726d064c2d/multidict-6.1.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:9f636b730f7e8cb19feb87094949ba54ee5357440b9658b2a32a5ce4bce53972", size = 129754 }, + { url = "https://files.pythonhosted.org/packages/77/dd/8540e139eafb240079242da8f8ffdf9d3f4b4ad1aac5a786cd4050923783/multidict-6.1.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:73eae06aa53af2ea5270cc066dcaf02cc60d2994bbb2c4ef5764949257d10f43", size = 126402 }, + { url = "https://files.pythonhosted.org/packages/86/99/e82e1a275d8b1ea16d3a251474262258dbbe41c05cce0c01bceda1fc8ea5/multidict-6.1.0-cp39-cp39-win32.whl", hash = "sha256:1ca0083e80e791cffc6efce7660ad24af66c8d4079d2a750b29001b53ff59ada", size = 26421 }, + { url = "https://files.pythonhosted.org/packages/86/1c/9fa630272355af7e4446a2c7550c259f11ee422ab2d30ff90a0a71cf3d9e/multidict-6.1.0-cp39-cp39-win_amd64.whl", hash = 
"sha256:aa466da5b15ccea564bdab9c89175c762bc12825f4659c11227f515cee76fa4a", size = 28791 }, + { url = "https://files.pythonhosted.org/packages/99/b7/b9e70fde2c0f0c9af4cc5277782a89b66d35948ea3369ec9f598358c3ac5/multidict-6.1.0-py3-none-any.whl", hash = "sha256:48e171e52d1c4d33888e529b999e5900356b9ae588c2f09a52dcefb158b27506", size = 10051 }, +] + +[[package]] +name = "multiprocess" +version = "0.70.16" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "dill" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b5/ae/04f39c5d0d0def03247c2893d6f2b83c136bf3320a2154d7b8858f2ba72d/multiprocess-0.70.16.tar.gz", hash = "sha256:161af703d4652a0e1410be6abccecde4a7ddffd19341be0a7011b94aeb171ac1", size = 1772603 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ef/76/6e712a2623d146d314f17598df5de7224c85c0060ef63fd95cc15a25b3fa/multiprocess-0.70.16-pp310-pypy310_pp73-macosx_10_13_x86_64.whl", hash = "sha256:476887be10e2f59ff183c006af746cb6f1fd0eadcfd4ef49e605cbe2659920ee", size = 134980 }, + { url = "https://files.pythonhosted.org/packages/0f/ab/1e6e8009e380e22254ff539ebe117861e5bdb3bff1fc977920972237c6c7/multiprocess-0.70.16-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:d951bed82c8f73929ac82c61f01a7b5ce8f3e5ef40f5b52553b4f547ce2b08ec", size = 134982 }, + { url = "https://files.pythonhosted.org/packages/d8/94/8638a89f93c80df329116e6781a060506c7e91e1f4370dc831e9d17a041d/multiprocess-0.70.16-pp39-pypy39_pp73-macosx_10_13_x86_64.whl", hash = "sha256:0dfd078c306e08d46d7a8d06fb120313d87aa43af60d66da43ffff40b44d2f41", size = 133497 }, + { url = "https://files.pythonhosted.org/packages/89/21/222066f6bb8d8af287923ae3bd26cf4699a9ce020228ac273caca1de8250/multiprocess-0.70.16-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:e7b9d0f307cd9bd50851afaac0dba2cb6c44449efff697df7c7645f7d3f2be3a", size = 133498 }, + { url = "https://files.pythonhosted.org/packages/bc/f7/7ec7fddc92e50714ea3745631f79bd9c96424cb2702632521028e57d3a36/multiprocess-0.70.16-py310-none-any.whl", hash = "sha256:c4a9944c67bd49f823687463660a2d6daae94c289adff97e0f9d696ba6371d02", size = 134824 }, + { url = "https://files.pythonhosted.org/packages/50/15/b56e50e8debaf439f44befec5b2af11db85f6e0f344c3113ae0be0593a91/multiprocess-0.70.16-py311-none-any.whl", hash = "sha256:af4cabb0dac72abfb1e794fa7855c325fd2b55a10a44628a3c1ad3311c04127a", size = 143519 }, + { url = "https://files.pythonhosted.org/packages/0a/7d/a988f258104dcd2ccf1ed40fdc97e26c4ac351eeaf81d76e266c52d84e2f/multiprocess-0.70.16-py312-none-any.whl", hash = "sha256:fc0544c531920dde3b00c29863377f87e1632601092ea2daca74e4beb40faa2e", size = 146741 }, + { url = "https://files.pythonhosted.org/packages/ea/89/38df130f2c799090c978b366cfdf5b96d08de5b29a4a293df7f7429fa50b/multiprocess-0.70.16-py38-none-any.whl", hash = "sha256:a71d82033454891091a226dfc319d0cfa8019a4e888ef9ca910372a446de4435", size = 132628 }, + { url = "https://files.pythonhosted.org/packages/da/d9/f7f9379981e39b8c2511c9e0326d212accacb82f12fbfdc1aa2ce2a7b2b6/multiprocess-0.70.16-py39-none-any.whl", hash = "sha256:a0bafd3ae1b732eac64be2e72038231c1ba97724b60b09400d68f229fcc2fbf3", size = 133351 }, +] + +[[package]] +name = "mypy-protobuf" +version = "3.6.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "protobuf" }, + { name = "types-protobuf" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/4d/6f/282d64d66bf48ce60e38a6560753f784e0f88ab245ac2fb5e93f701a36cd/mypy-protobuf-3.6.0.tar.gz", hash = 
"sha256:02f242eb3409f66889f2b1a3aa58356ec4d909cdd0f93115622e9e70366eca3c", size = 24445 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e8/73/d6b999782ae22f16971cc05378b3b33f6a89ede3b9619e8366aa23484bca/mypy_protobuf-3.6.0-py3-none-any.whl", hash = "sha256:56176e4d569070e7350ea620262478b49b7efceba4103d468448f1d21492fd6c", size = 16434 }, +] + +[[package]] +name = "nest-asyncio" +version = "1.6.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/83/f8/51569ac65d696c8ecbee95938f89d4abf00f47d58d48f6fbabfe8f0baefe/nest_asyncio-1.6.0.tar.gz", hash = "sha256:6f172d5449aca15afd6c646851f4e31e02c598d553a667e38cafa997cfec55fe", size = 7418 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a0/c4/c2971a3ba4c6103a3d10c4b0f24f461ddc027f0f09763220cf35ca1401b3/nest_asyncio-1.6.0-py3-none-any.whl", hash = "sha256:87af6efd6b5e897c81050477ef65c62e2b2f35d51703cae01aff2905b1852e1c", size = 5195 }, +] + +[[package]] +name = "networkx" +version = "3.2.1" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version < '3.10'", +] +sdist = { url = "https://files.pythonhosted.org/packages/c4/80/a84676339aaae2f1cfdf9f418701dd634aef9cc76f708ef55c36ff39c3ca/networkx-3.2.1.tar.gz", hash = "sha256:9f1bb5cf3409bf324e0a722c20bdb4c20ee39bf1c30ce8ae499c8502b0b5e0c6", size = 2073928 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d5/f0/8fbc882ca80cf077f1b246c0e3c3465f7f415439bdea6b899f6b19f61f70/networkx-3.2.1-py3-none-any.whl", hash = "sha256:f18c69adc97877c42332c170849c96cefa91881c99a7cb3e95b7c659ebdc1ec2", size = 1647772 }, +] + +[[package]] +name = "networkx" +version = "3.4.2" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version >= '3.13'", + "python_full_version == '3.12.*'", + "python_full_version == '3.11.*'", + "python_full_version == '3.10.*'", +] +sdist = { url = "https://files.pythonhosted.org/packages/fd/1d/06475e1cd5264c0b870ea2cc6fdb3e37177c1e565c43f56ff17a10e3937f/networkx-3.4.2.tar.gz", hash = "sha256:307c3669428c5362aab27c8a1260aa8f47c4e91d3891f48be0141738d8d053e1", size = 2151368 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b9/54/dd730b32ea14ea797530a4479b2ed46a6fb250f682a9cfb997e968bf0261/networkx-3.4.2-py3-none-any.whl", hash = "sha256:df5d4365b724cf81b8c6a7312509d0c22386097011ad1abe274afd5e9d3bbc5f", size = 1723263 }, +] + +[[package]] +name = "numpy" +version = "2.0.2" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version < '3.10'", +] +sdist = { url = "https://files.pythonhosted.org/packages/a9/75/10dd1f8116a8b796cb2c737b674e02d02e80454bda953fa7e65d8c12b016/numpy-2.0.2.tar.gz", hash = "sha256:883c987dee1880e2a864ab0dc9892292582510604156762362d9326444636e78", size = 18902015 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/21/91/3495b3237510f79f5d81f2508f9f13fea78ebfdf07538fc7444badda173d/numpy-2.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:51129a29dbe56f9ca83438b706e2e69a39892b5eda6cedcb6b0c9fdc9b0d3ece", size = 21165245 }, + { url = "https://files.pythonhosted.org/packages/05/33/26178c7d437a87082d11019292dce6d3fe6f0e9026b7b2309cbf3e489b1d/numpy-2.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f15975dfec0cf2239224d80e32c3170b1d168335eaedee69da84fbe9f1f9cd04", size = 13738540 }, + { url = 
"https://files.pythonhosted.org/packages/ec/31/cc46e13bf07644efc7a4bf68df2df5fb2a1a88d0cd0da9ddc84dc0033e51/numpy-2.0.2-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:8c5713284ce4e282544c68d1c3b2c7161d38c256d2eefc93c1d683cf47683e66", size = 5300623 }, + { url = "https://files.pythonhosted.org/packages/6e/16/7bfcebf27bb4f9d7ec67332ffebee4d1bf085c84246552d52dbb548600e7/numpy-2.0.2-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:becfae3ddd30736fe1889a37f1f580e245ba79a5855bff5f2a29cb3ccc22dd7b", size = 6901774 }, + { url = "https://files.pythonhosted.org/packages/f9/a3/561c531c0e8bf082c5bef509d00d56f82e0ea7e1e3e3a7fc8fa78742a6e5/numpy-2.0.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2da5960c3cf0df7eafefd806d4e612c5e19358de82cb3c343631188991566ccd", size = 13907081 }, + { url = "https://files.pythonhosted.org/packages/fa/66/f7177ab331876200ac7563a580140643d1179c8b4b6a6b0fc9838de2a9b8/numpy-2.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:496f71341824ed9f3d2fd36cf3ac57ae2e0165c143b55c3a035ee219413f3318", size = 19523451 }, + { url = "https://files.pythonhosted.org/packages/25/7f/0b209498009ad6453e4efc2c65bcdf0ae08a182b2b7877d7ab38a92dc542/numpy-2.0.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a61ec659f68ae254e4d237816e33171497e978140353c0c2038d46e63282d0c8", size = 19927572 }, + { url = "https://files.pythonhosted.org/packages/3e/df/2619393b1e1b565cd2d4c4403bdd979621e2c4dea1f8532754b2598ed63b/numpy-2.0.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:d731a1c6116ba289c1e9ee714b08a8ff882944d4ad631fd411106a30f083c326", size = 14400722 }, + { url = "https://files.pythonhosted.org/packages/22/ad/77e921b9f256d5da36424ffb711ae79ca3f451ff8489eeca544d0701d74a/numpy-2.0.2-cp310-cp310-win32.whl", hash = "sha256:984d96121c9f9616cd33fbd0618b7f08e0cfc9600a7ee1d6fd9b239186d19d97", size = 6472170 }, + { url = "https://files.pythonhosted.org/packages/10/05/3442317535028bc29cf0c0dd4c191a4481e8376e9f0db6bcf29703cadae6/numpy-2.0.2-cp310-cp310-win_amd64.whl", hash = "sha256:c7b0be4ef08607dd04da4092faee0b86607f111d5ae68036f16cc787e250a131", size = 15905558 }, + { url = "https://files.pythonhosted.org/packages/8b/cf/034500fb83041aa0286e0fb16e7c76e5c8b67c0711bb6e9e9737a717d5fe/numpy-2.0.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:49ca4decb342d66018b01932139c0961a8f9ddc7589611158cb3c27cbcf76448", size = 21169137 }, + { url = "https://files.pythonhosted.org/packages/4a/d9/32de45561811a4b87fbdee23b5797394e3d1504b4a7cf40c10199848893e/numpy-2.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:11a76c372d1d37437857280aa142086476136a8c0f373b2e648ab2c8f18fb195", size = 13703552 }, + { url = "https://files.pythonhosted.org/packages/c1/ca/2f384720020c7b244d22508cb7ab23d95f179fcfff33c31a6eeba8d6c512/numpy-2.0.2-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:807ec44583fd708a21d4a11d94aedf2f4f3c3719035c76a2bbe1fe8e217bdc57", size = 5298957 }, + { url = "https://files.pythonhosted.org/packages/0e/78/a3e4f9fb6aa4e6fdca0c5428e8ba039408514388cf62d89651aade838269/numpy-2.0.2-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:8cafab480740e22f8d833acefed5cc87ce276f4ece12fdaa2e8903db2f82897a", size = 6905573 }, + { url = "https://files.pythonhosted.org/packages/a0/72/cfc3a1beb2caf4efc9d0b38a15fe34025230da27e1c08cc2eb9bfb1c7231/numpy-2.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a15f476a45e6e5a3a79d8a14e62161d27ad897381fecfa4a09ed5322f2085669", size = 13914330 }, + { url = 
"https://files.pythonhosted.org/packages/ba/a8/c17acf65a931ce551fee11b72e8de63bf7e8a6f0e21add4c937c83563538/numpy-2.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:13e689d772146140a252c3a28501da66dfecd77490b498b168b501835041f951", size = 19534895 }, + { url = "https://files.pythonhosted.org/packages/ba/86/8767f3d54f6ae0165749f84648da9dcc8cd78ab65d415494962c86fac80f/numpy-2.0.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:9ea91dfb7c3d1c56a0e55657c0afb38cf1eeae4544c208dc465c3c9f3a7c09f9", size = 19937253 }, + { url = "https://files.pythonhosted.org/packages/df/87/f76450e6e1c14e5bb1eae6836478b1028e096fd02e85c1c37674606ab752/numpy-2.0.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c1c9307701fec8f3f7a1e6711f9089c06e6284b3afbbcd259f7791282d660a15", size = 14414074 }, + { url = "https://files.pythonhosted.org/packages/5c/ca/0f0f328e1e59f73754f06e1adfb909de43726d4f24c6a3f8805f34f2b0fa/numpy-2.0.2-cp311-cp311-win32.whl", hash = "sha256:a392a68bd329eafac5817e5aefeb39038c48b671afd242710b451e76090e81f4", size = 6470640 }, + { url = "https://files.pythonhosted.org/packages/eb/57/3a3f14d3a759dcf9bf6e9eda905794726b758819df4663f217d658a58695/numpy-2.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:286cd40ce2b7d652a6f22efdfc6d1edf879440e53e76a75955bc0c826c7e64dc", size = 15910230 }, + { url = "https://files.pythonhosted.org/packages/45/40/2e117be60ec50d98fa08c2f8c48e09b3edea93cfcabd5a9ff6925d54b1c2/numpy-2.0.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:df55d490dea7934f330006d0f81e8551ba6010a5bf035a249ef61a94f21c500b", size = 20895803 }, + { url = "https://files.pythonhosted.org/packages/46/92/1b8b8dee833f53cef3e0a3f69b2374467789e0bb7399689582314df02651/numpy-2.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8df823f570d9adf0978347d1f926b2a867d5608f434a7cff7f7908c6570dcf5e", size = 13471835 }, + { url = "https://files.pythonhosted.org/packages/7f/19/e2793bde475f1edaea6945be141aef6c8b4c669b90c90a300a8954d08f0a/numpy-2.0.2-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:9a92ae5c14811e390f3767053ff54eaee3bf84576d99a2456391401323f4ec2c", size = 5038499 }, + { url = "https://files.pythonhosted.org/packages/e3/ff/ddf6dac2ff0dd50a7327bcdba45cb0264d0e96bb44d33324853f781a8f3c/numpy-2.0.2-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:a842d573724391493a97a62ebbb8e731f8a5dcc5d285dfc99141ca15a3302d0c", size = 6633497 }, + { url = "https://files.pythonhosted.org/packages/72/21/67f36eac8e2d2cd652a2e69595a54128297cdcb1ff3931cfc87838874bd4/numpy-2.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c05e238064fc0610c840d1cf6a13bf63d7e391717d247f1bf0318172e759e692", size = 13621158 }, + { url = "https://files.pythonhosted.org/packages/39/68/e9f1126d757653496dbc096cb429014347a36b228f5a991dae2c6b6cfd40/numpy-2.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0123ffdaa88fa4ab64835dcbde75dcdf89c453c922f18dced6e27c90d1d0ec5a", size = 19236173 }, + { url = "https://files.pythonhosted.org/packages/d1/e9/1f5333281e4ebf483ba1c888b1d61ba7e78d7e910fdd8e6499667041cc35/numpy-2.0.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:96a55f64139912d61de9137f11bf39a55ec8faec288c75a54f93dfd39f7eb40c", size = 19634174 }, + { url = "https://files.pythonhosted.org/packages/71/af/a469674070c8d8408384e3012e064299f7a2de540738a8e414dcfd639996/numpy-2.0.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:ec9852fb39354b5a45a80bdab5ac02dd02b15f44b3804e9f00c556bf24b4bded", size = 14099701 }, + { url = 
"https://files.pythonhosted.org/packages/d0/3d/08ea9f239d0e0e939b6ca52ad403c84a2bce1bde301a8eb4888c1c1543f1/numpy-2.0.2-cp312-cp312-win32.whl", hash = "sha256:671bec6496f83202ed2d3c8fdc486a8fc86942f2e69ff0e986140339a63bcbe5", size = 6174313 }, + { url = "https://files.pythonhosted.org/packages/b2/b5/4ac39baebf1fdb2e72585c8352c56d063b6126be9fc95bd2bb5ef5770c20/numpy-2.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:cfd41e13fdc257aa5778496b8caa5e856dc4896d4ccf01841daee1d96465467a", size = 15606179 }, + { url = "https://files.pythonhosted.org/packages/43/c1/41c8f6df3162b0c6ffd4437d729115704bd43363de0090c7f913cfbc2d89/numpy-2.0.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9059e10581ce4093f735ed23f3b9d283b9d517ff46009ddd485f1747eb22653c", size = 21169942 }, + { url = "https://files.pythonhosted.org/packages/39/bc/fd298f308dcd232b56a4031fd6ddf11c43f9917fbc937e53762f7b5a3bb1/numpy-2.0.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:423e89b23490805d2a5a96fe40ec507407b8ee786d66f7328be214f9679df6dd", size = 13711512 }, + { url = "https://files.pythonhosted.org/packages/96/ff/06d1aa3eeb1c614eda245c1ba4fb88c483bee6520d361641331872ac4b82/numpy-2.0.2-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:2b2955fa6f11907cf7a70dab0d0755159bca87755e831e47932367fc8f2f2d0b", size = 5306976 }, + { url = "https://files.pythonhosted.org/packages/2d/98/121996dcfb10a6087a05e54453e28e58694a7db62c5a5a29cee14c6e047b/numpy-2.0.2-cp39-cp39-macosx_14_0_x86_64.whl", hash = "sha256:97032a27bd9d8988b9a97a8c4d2c9f2c15a81f61e2f21404d7e8ef00cb5be729", size = 6906494 }, + { url = "https://files.pythonhosted.org/packages/15/31/9dffc70da6b9bbf7968f6551967fc21156207366272c2a40b4ed6008dc9b/numpy-2.0.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1e795a8be3ddbac43274f18588329c72939870a16cae810c2b73461c40718ab1", size = 13912596 }, + { url = "https://files.pythonhosted.org/packages/b9/14/78635daab4b07c0930c919d451b8bf8c164774e6a3413aed04a6d95758ce/numpy-2.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f26b258c385842546006213344c50655ff1555a9338e2e5e02a0756dc3e803dd", size = 19526099 }, + { url = "https://files.pythonhosted.org/packages/26/4c/0eeca4614003077f68bfe7aac8b7496f04221865b3a5e7cb230c9d055afd/numpy-2.0.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5fec9451a7789926bcf7c2b8d187292c9f93ea30284802a0ab3f5be8ab36865d", size = 19932823 }, + { url = "https://files.pythonhosted.org/packages/f1/46/ea25b98b13dccaebddf1a803f8c748680d972e00507cd9bc6dcdb5aa2ac1/numpy-2.0.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:9189427407d88ff25ecf8f12469d4d39d35bee1db5d39fc5c168c6f088a6956d", size = 14404424 }, + { url = "https://files.pythonhosted.org/packages/c8/a6/177dd88d95ecf07e722d21008b1b40e681a929eb9e329684d449c36586b2/numpy-2.0.2-cp39-cp39-win32.whl", hash = "sha256:905d16e0c60200656500c95b6b8dca5d109e23cb24abc701d41c02d74c6b3afa", size = 6476809 }, + { url = "https://files.pythonhosted.org/packages/ea/2b/7fc9f4e7ae5b507c1a3a21f0f15ed03e794c1242ea8a242ac158beb56034/numpy-2.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:a3f4ab0caa7f053f6797fcd4e1e25caee367db3112ef2b6ef82d749530768c73", size = 15911314 }, + { url = "https://files.pythonhosted.org/packages/8f/3b/df5a870ac6a3be3a86856ce195ef42eec7ae50d2a202be1f5a4b3b340e14/numpy-2.0.2-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:7f0a0c6f12e07fa94133c8a67404322845220c06a9e80e85999afe727f7438b8", size = 21025288 }, + { url = 
"https://files.pythonhosted.org/packages/2c/97/51af92f18d6f6f2d9ad8b482a99fb74e142d71372da5d834b3a2747a446e/numpy-2.0.2-pp39-pypy39_pp73-macosx_14_0_x86_64.whl", hash = "sha256:312950fdd060354350ed123c0e25a71327d3711584beaef30cdaa93320c392d4", size = 6762793 }, + { url = "https://files.pythonhosted.org/packages/12/46/de1fbd0c1b5ccaa7f9a005b66761533e2f6a3e560096682683a223631fe9/numpy-2.0.2-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:26df23238872200f63518dd2aa984cfca675d82469535dc7162dc2ee52d9dd5c", size = 19334885 }, + { url = "https://files.pythonhosted.org/packages/cc/dc/d330a6faefd92b446ec0f0dfea4c3207bb1fef3c4771d19cf4543efd2c78/numpy-2.0.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:a46288ec55ebbd58947d31d72be2c63cbf839f0a63b49cb755022310792a3385", size = 15828784 }, +] + +[[package]] +name = "numpy" +version = "2.2.1" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version >= '3.13'", + "python_full_version == '3.12.*'", + "python_full_version == '3.11.*'", + "python_full_version == '3.10.*'", +] +sdist = { url = "https://files.pythonhosted.org/packages/f2/a5/fdbf6a7871703df6160b5cf3dd774074b086d278172285c52c2758b76305/numpy-2.2.1.tar.gz", hash = "sha256:45681fd7128c8ad1c379f0ca0776a8b0c6583d2f69889ddac01559dfe4390918", size = 20227662 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c7/c4/5588367dc9f91e1a813beb77de46ea8cab13f778e1b3a0e661ab031aba44/numpy-2.2.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:5edb4e4caf751c1518e6a26a83501fda79bff41cc59dac48d70e6d65d4ec4440", size = 21213214 }, + { url = "https://files.pythonhosted.org/packages/d8/8b/32dd9f08419023a4cf856c5ad0b4eba9b830da85eafdef841a104c4fc05a/numpy-2.2.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:aa3017c40d513ccac9621a2364f939d39e550c542eb2a894b4c8da92b38896ab", size = 14352248 }, + { url = "https://files.pythonhosted.org/packages/84/2d/0e895d02940ba6e12389f0ab5cac5afcf8dc2dc0ade4e8cad33288a721bd/numpy-2.2.1-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:61048b4a49b1c93fe13426e04e04fdf5a03f456616f6e98c7576144677598675", size = 5391007 }, + { url = "https://files.pythonhosted.org/packages/11/b9/7f1e64a0d46d9c2af6d17966f641fb12d5b8ea3003f31b2308f3e3b9a6aa/numpy-2.2.1-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:7671dc19c7019103ca44e8d94917eba8534c76133523ca8406822efdd19c9308", size = 6926174 }, + { url = "https://files.pythonhosted.org/packages/2e/8c/043fa4418bc9364e364ab7aba8ff6ef5f6b9171ade22de8fbcf0e2fa4165/numpy-2.2.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4250888bcb96617e00bfa28ac24850a83c9f3a16db471eca2ee1f1714df0f957", size = 14330914 }, + { url = "https://files.pythonhosted.org/packages/f7/b6/d8110985501ca8912dfc1c3bbef99d66e62d487f72e46b2337494df77364/numpy-2.2.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a7746f235c47abc72b102d3bce9977714c2444bdfaea7888d241b4c4bb6a78bf", size = 16379607 }, + { url = "https://files.pythonhosted.org/packages/e2/57/bdca9fb8bdaa810c3a4ff2eb3231379b77f618a7c0d24be9f7070db50775/numpy-2.2.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:059e6a747ae84fce488c3ee397cee7e5f905fd1bda5fb18c66bc41807ff119b2", size = 15541760 }, + { url = "https://files.pythonhosted.org/packages/97/55/3b9147b3cbc3b6b1abc2a411dec5337a46c873deca0dd0bf5bef9d0579cc/numpy-2.2.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:f62aa6ee4eb43b024b0e5a01cf65a0bb078ef8c395e8713c6e8a12a697144528", size = 18168476 }, + 
{ url = "https://files.pythonhosted.org/packages/00/e7/7c2cde16c9b87a8e14fdd262ca7849c4681cf48c8a774505f7e6f5e3b643/numpy-2.2.1-cp310-cp310-win32.whl", hash = "sha256:48fd472630715e1c1c89bf1feab55c29098cb403cc184b4859f9c86d4fcb6a95", size = 6570985 }, + { url = "https://files.pythonhosted.org/packages/a1/a8/554b0e99fc4ac11ec481254781a10da180d0559c2ebf2c324232317349ee/numpy-2.2.1-cp310-cp310-win_amd64.whl", hash = "sha256:b541032178a718c165a49638d28272b771053f628382d5e9d1c93df23ff58dbf", size = 12913384 }, + { url = "https://files.pythonhosted.org/packages/59/14/645887347124e101d983e1daf95b48dc3e136bf8525cb4257bf9eab1b768/numpy-2.2.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:40f9e544c1c56ba8f1cf7686a8c9b5bb249e665d40d626a23899ba6d5d9e1484", size = 21217379 }, + { url = "https://files.pythonhosted.org/packages/9f/fd/2279000cf29f58ccfd3778cbf4670dfe3f7ce772df5e198c5abe9e88b7d7/numpy-2.2.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f9b57eaa3b0cd8db52049ed0330747b0364e899e8a606a624813452b8203d5f7", size = 14388520 }, + { url = "https://files.pythonhosted.org/packages/58/b0/034eb5d5ba12d66ab658ff3455a31f20add0b78df8203c6a7451bd1bee21/numpy-2.2.1-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:bc8a37ad5b22c08e2dbd27df2b3ef7e5c0864235805b1e718a235bcb200cf1cb", size = 5389286 }, + { url = "https://files.pythonhosted.org/packages/5d/69/6f3cccde92e82e7835fdb475c2bf439761cbf8a1daa7c07338e1e132dfec/numpy-2.2.1-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:9036d6365d13b6cbe8f27a0eaf73ddcc070cae584e5ff94bb45e3e9d729feab5", size = 6930345 }, + { url = "https://files.pythonhosted.org/packages/d1/72/1cd38e91ab563e67f584293fcc6aca855c9ae46dba42e6b5ff4600022899/numpy-2.2.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:51faf345324db860b515d3f364eaa93d0e0551a88d6218a7d61286554d190d73", size = 14335748 }, + { url = "https://files.pythonhosted.org/packages/f2/d4/f999444e86986f3533e7151c272bd8186c55dda554284def18557e013a2a/numpy-2.2.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:38efc1e56b73cc9b182fe55e56e63b044dd26a72128fd2fbd502f75555d92591", size = 16391057 }, + { url = "https://files.pythonhosted.org/packages/99/7b/85cef6a3ae1b19542b7afd97d0b296526b6ef9e3c43ea0c4d9c4404fb2d0/numpy-2.2.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:31b89fa67a8042e96715c68e071a1200c4e172f93b0fbe01a14c0ff3ff820fc8", size = 15556943 }, + { url = "https://files.pythonhosted.org/packages/69/7e/b83cc884c3508e91af78760f6b17ab46ad649831b1fa35acb3eb26d9e6d2/numpy-2.2.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:4c86e2a209199ead7ee0af65e1d9992d1dce7e1f63c4b9a616500f93820658d0", size = 18180785 }, + { url = "https://files.pythonhosted.org/packages/b2/9f/eb4a9a38867de059dcd4b6e18d47c3867fbd3795d4c9557bb49278f94087/numpy-2.2.1-cp311-cp311-win32.whl", hash = "sha256:b34d87e8a3090ea626003f87f9392b3929a7bbf4104a05b6667348b6bd4bf1cd", size = 6568983 }, + { url = "https://files.pythonhosted.org/packages/6d/1e/be3b9f3073da2f8c7fa361fcdc231b548266b0781029fdbaf75eeab997fd/numpy-2.2.1-cp311-cp311-win_amd64.whl", hash = "sha256:360137f8fb1b753c5cde3ac388597ad680eccbbbb3865ab65efea062c4a1fd16", size = 12917260 }, + { url = "https://files.pythonhosted.org/packages/62/12/b928871c570d4a87ab13d2cc19f8817f17e340d5481621930e76b80ffb7d/numpy-2.2.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:694f9e921a0c8f252980e85bce61ebbd07ed2b7d4fa72d0e4246f2f8aa6642ab", size = 20909861 }, + { url = 
"https://files.pythonhosted.org/packages/3d/c3/59df91ae1d8ad7c5e03efd63fd785dec62d96b0fe56d1f9ab600b55009af/numpy-2.2.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:3683a8d166f2692664262fd4900f207791d005fb088d7fdb973cc8d663626faa", size = 14095776 }, + { url = "https://files.pythonhosted.org/packages/af/4e/8ed5868efc8e601fb69419644a280e9c482b75691466b73bfaab7d86922c/numpy-2.2.1-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:780077d95eafc2ccc3ced969db22377b3864e5b9a0ea5eb347cc93b3ea900315", size = 5126239 }, + { url = "https://files.pythonhosted.org/packages/1a/74/dd0bbe650d7bc0014b051f092f2de65e34a8155aabb1287698919d124d7f/numpy-2.2.1-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:55ba24ebe208344aa7a00e4482f65742969a039c2acfcb910bc6fcd776eb4355", size = 6659296 }, + { url = "https://files.pythonhosted.org/packages/7f/11/4ebd7a3f4a655764dc98481f97bd0a662fb340d1001be6050606be13e162/numpy-2.2.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9b1d07b53b78bf84a96898c1bc139ad7f10fda7423f5fd158fd0f47ec5e01ac7", size = 14047121 }, + { url = "https://files.pythonhosted.org/packages/7f/a7/c1f1d978166eb6b98ad009503e4d93a8c1962d0eb14a885c352ee0276a54/numpy-2.2.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5062dc1a4e32a10dc2b8b13cedd58988261416e811c1dc4dbdea4f57eea61b0d", size = 16096599 }, + { url = "https://files.pythonhosted.org/packages/3d/6d/0e22afd5fcbb4d8d0091f3f46bf4e8906399c458d4293da23292c0ba5022/numpy-2.2.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:fce4f615f8ca31b2e61aa0eb5865a21e14f5629515c9151850aa936c02a1ee51", size = 15243932 }, + { url = "https://files.pythonhosted.org/packages/03/39/e4e5832820131ba424092b9610d996b37e5557180f8e2d6aebb05c31ae54/numpy-2.2.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:67d4cda6fa6ffa073b08c8372aa5fa767ceb10c9a0587c707505a6d426f4e046", size = 17861032 }, + { url = "https://files.pythonhosted.org/packages/5f/8a/3794313acbf5e70df2d5c7d2aba8718676f8d054a05abe59e48417fb2981/numpy-2.2.1-cp312-cp312-win32.whl", hash = "sha256:32cb94448be47c500d2c7a95f93e2f21a01f1fd05dd2beea1ccd049bb6001cd2", size = 6274018 }, + { url = "https://files.pythonhosted.org/packages/17/c1/c31d3637f2641e25c7a19adf2ae822fdaf4ddd198b05d79a92a9ce7cb63e/numpy-2.2.1-cp312-cp312-win_amd64.whl", hash = "sha256:ba5511d8f31c033a5fcbda22dd5c813630af98c70b2661f2d2c654ae3cdfcfc8", size = 12613843 }, + { url = "https://files.pythonhosted.org/packages/20/d6/91a26e671c396e0c10e327b763485ee295f5a5a7a48c553f18417e5a0ed5/numpy-2.2.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:f1d09e520217618e76396377c81fba6f290d5f926f50c35f3a5f72b01a0da780", size = 20896464 }, + { url = "https://files.pythonhosted.org/packages/8c/40/5792ccccd91d45e87d9e00033abc4f6ca8a828467b193f711139ff1f1cd9/numpy-2.2.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:3ecc47cd7f6ea0336042be87d9e7da378e5c7e9b3c8ad0f7c966f714fc10d821", size = 14111350 }, + { url = "https://files.pythonhosted.org/packages/c0/2a/fb0a27f846cb857cef0c4c92bef89f133a3a1abb4e16bba1c4dace2e9b49/numpy-2.2.1-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:f419290bc8968a46c4933158c91a0012b7a99bb2e465d5ef5293879742f8797e", size = 5111629 }, + { url = "https://files.pythonhosted.org/packages/eb/e5/8e81bb9d84db88b047baf4e8b681a3e48d6390bc4d4e4453eca428ecbb49/numpy-2.2.1-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:5b6c390bfaef8c45a260554888966618328d30e72173697e5cabe6b285fb2348", size = 6645865 }, + { url = 
"https://files.pythonhosted.org/packages/7a/1a/a90ceb191dd2f9e2897c69dde93ccc2d57dd21ce2acbd7b0333e8eea4e8d/numpy-2.2.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:526fc406ab991a340744aad7e25251dd47a6720a685fa3331e5c59fef5282a59", size = 14043508 }, + { url = "https://files.pythonhosted.org/packages/f1/5a/e572284c86a59dec0871a49cd4e5351e20b9c751399d5f1d79628c0542cb/numpy-2.2.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f74e6fdeb9a265624ec3a3918430205dff1df7e95a230779746a6af78bc615af", size = 16094100 }, + { url = "https://files.pythonhosted.org/packages/0c/2c/a79d24f364788386d85899dd280a94f30b0950be4b4a545f4fa4ed1d4ca7/numpy-2.2.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:53c09385ff0b72ba79d8715683c1168c12e0b6e84fb0372e97553d1ea91efe51", size = 15239691 }, + { url = "https://files.pythonhosted.org/packages/cf/79/1e20fd1c9ce5a932111f964b544facc5bb9bde7865f5b42f00b4a6a9192b/numpy-2.2.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:f3eac17d9ec51be534685ba877b6ab5edc3ab7ec95c8f163e5d7b39859524716", size = 17856571 }, + { url = "https://files.pythonhosted.org/packages/be/5b/cc155e107f75d694f562bdc84a26cc930569f3dfdfbccb3420b626065777/numpy-2.2.1-cp313-cp313-win32.whl", hash = "sha256:9ad014faa93dbb52c80d8f4d3dcf855865c876c9660cb9bd7553843dd03a4b1e", size = 6270841 }, + { url = "https://files.pythonhosted.org/packages/44/be/0e5cd009d2162e4138d79a5afb3b5d2341f0fe4777ab6e675aa3d4a42e21/numpy-2.2.1-cp313-cp313-win_amd64.whl", hash = "sha256:164a829b6aacf79ca47ba4814b130c4020b202522a93d7bff2202bfb33b61c60", size = 12606618 }, + { url = "https://files.pythonhosted.org/packages/a8/87/04ddf02dd86fb17c7485a5f87b605c4437966d53de1e3745d450343a6f56/numpy-2.2.1-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:4dfda918a13cc4f81e9118dea249e192ab167a0bb1966272d5503e39234d694e", size = 20921004 }, + { url = "https://files.pythonhosted.org/packages/6e/3e/d0e9e32ab14005425d180ef950badf31b862f3839c5b927796648b11f88a/numpy-2.2.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:733585f9f4b62e9b3528dd1070ec4f52b8acf64215b60a845fa13ebd73cd0712", size = 14119910 }, + { url = "https://files.pythonhosted.org/packages/b5/5b/aa2d1905b04a8fb681e08742bb79a7bddfc160c7ce8e1ff6d5c821be0236/numpy-2.2.1-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:89b16a18e7bba224ce5114db863e7029803c179979e1af6ad6a6b11f70545008", size = 5153612 }, + { url = "https://files.pythonhosted.org/packages/ce/35/6831808028df0648d9b43c5df7e1051129aa0d562525bacb70019c5f5030/numpy-2.2.1-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:676f4eebf6b2d430300f1f4f4c2461685f8269f94c89698d832cdf9277f30b84", size = 6668401 }, + { url = "https://files.pythonhosted.org/packages/b1/38/10ef509ad63a5946cc042f98d838daebfe7eaf45b9daaf13df2086b15ff9/numpy-2.2.1-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:27f5cdf9f493b35f7e41e8368e7d7b4bbafaf9660cba53fb21d2cd174ec09631", size = 14014198 }, + { url = "https://files.pythonhosted.org/packages/df/f8/c80968ae01df23e249ee0a4487fae55a4c0fe2f838dfe9cc907aa8aea0fa/numpy-2.2.1-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c1ad395cf254c4fbb5b2132fee391f361a6e8c1adbd28f2cd8e79308a615fe9d", size = 16076211 }, + { url = "https://files.pythonhosted.org/packages/09/69/05c169376016a0b614b432967ac46ff14269eaffab80040ec03ae1ae8e2c/numpy-2.2.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:08ef779aed40dbc52729d6ffe7dd51df85796a702afbf68a4f4e41fafdc8bda5", size = 
15220266 }, + { url = "https://files.pythonhosted.org/packages/f1/ff/94a4ce67ea909f41cf7ea712aebbe832dc67decad22944a1020bb398a5ee/numpy-2.2.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:26c9c4382b19fcfbbed3238a14abf7ff223890ea1936b8890f058e7ba35e8d71", size = 17852844 }, + { url = "https://files.pythonhosted.org/packages/46/72/8a5dbce4020dfc595592333ef2fbb0a187d084ca243b67766d29d03e0096/numpy-2.2.1-cp313-cp313t-win32.whl", hash = "sha256:93cf4e045bae74c90ca833cba583c14b62cb4ba2cba0abd2b141ab52548247e2", size = 6326007 }, + { url = "https://files.pythonhosted.org/packages/7b/9c/4fce9cf39dde2562584e4cfd351a0140240f82c0e3569ce25a250f47037d/numpy-2.2.1-cp313-cp313t-win_amd64.whl", hash = "sha256:bff7d8ec20f5f42607599f9994770fa65d76edca264a87b5e4ea5629bce12268", size = 12693107 }, + { url = "https://files.pythonhosted.org/packages/f1/65/d36a76b811ffe0a4515e290cb05cb0e22171b1b0f0db6bee9141cf023545/numpy-2.2.1-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:7ba9cc93a91d86365a5d270dee221fdc04fb68d7478e6bf6af650de78a8339e3", size = 21044672 }, + { url = "https://files.pythonhosted.org/packages/aa/3f/b644199f165063154df486d95198d814578f13dd4d8c1651e075bf1cb8af/numpy-2.2.1-pp310-pypy310_pp73-macosx_14_0_x86_64.whl", hash = "sha256:3d03883435a19794e41f147612a77a8f56d4e52822337844fff3d4040a142964", size = 6789873 }, + { url = "https://files.pythonhosted.org/packages/d7/df/2adb0bb98a3cbe8a6c3c6d1019aede1f1d8b83927ced228a46cc56c7a206/numpy-2.2.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4511d9e6071452b944207c8ce46ad2f897307910b402ea5fa975da32e0102800", size = 16194933 }, + { url = "https://files.pythonhosted.org/packages/13/3e/1959d5219a9e6d200638d924cedda6a606392f7186a4ed56478252e70d55/numpy-2.2.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:5c5cc0cbabe9452038ed984d05ac87910f89370b9242371bd9079cb4af61811e", size = 12820057 }, +] + +[[package]] +name = "nvidia-cublas-cu12" +version = "12.4.5.8" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ae/71/1c91302526c45ab494c23f61c7a84aa568b8c1f9d196efa5993957faf906/nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl", hash = "sha256:2fc8da60df463fdefa81e323eef2e36489e1c94335b5358bcb38360adf75ac9b", size = 363438805 }, +] + +[[package]] +name = "nvidia-cuda-cupti-cu12" +version = "12.4.127" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/67/42/f4f60238e8194a3106d06a058d494b18e006c10bb2b915655bd9f6ea4cb1/nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:9dec60f5ac126f7bb551c055072b69d85392b13311fcc1bcda2202d172df30fb", size = 13813957 }, +] + +[[package]] +name = "nvidia-cuda-nvrtc-cu12" +version = "12.4.127" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2c/14/91ae57cd4db3f9ef7aa99f4019cfa8d54cb4caa7e00975df6467e9725a9f/nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:a178759ebb095827bd30ef56598ec182b85547f1508941a3d560eb7ea1fbf338", size = 24640306 }, +] + +[[package]] +name = "nvidia-cuda-runtime-cu12" +version = "12.4.127" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ea/27/1795d86fe88ef397885f2e580ac37628ed058a92ed2c39dc8eac3adf0619/nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = 
"sha256:64403288fa2136ee8e467cdc9c9427e0434110899d07c779f25b5c068934faa5", size = 883737 }, +] + +[[package]] +name = "nvidia-cudnn-cu12" +version = "9.1.0.70" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "nvidia-cublas-cu12" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/9f/fd/713452cd72343f682b1c7b9321e23829f00b842ceaedcda96e742ea0b0b3/nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl", hash = "sha256:165764f44ef8c61fcdfdfdbe769d687e06374059fbb388b6c89ecb0e28793a6f", size = 664752741 }, +] + +[[package]] +name = "nvidia-cufft-cu12" +version = "11.2.1.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "nvidia-nvjitlink-cu12" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/27/94/3266821f65b92b3138631e9c8e7fe1fb513804ac934485a8d05776e1dd43/nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl", hash = "sha256:f083fc24912aa410be21fa16d157fed2055dab1cc4b6934a0e03cba69eb242b9", size = 211459117 }, +] + +[[package]] +name = "nvidia-curand-cu12" +version = "10.3.5.147" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8a/6d/44ad094874c6f1b9c654f8ed939590bdc408349f137f9b98a3a23ccec411/nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl", hash = "sha256:a88f583d4e0bb643c49743469964103aa59f7f708d862c3ddb0fc07f851e3b8b", size = 56305206 }, +] + +[[package]] +name = "nvidia-cusolver-cu12" +version = "11.6.1.9" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "nvidia-cublas-cu12" }, + { name = "nvidia-cusparse-cu12" }, + { name = "nvidia-nvjitlink-cu12" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/3a/e1/5b9089a4b2a4790dfdea8b3a006052cfecff58139d5a4e34cb1a51df8d6f/nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl", hash = "sha256:19e33fa442bcfd085b3086c4ebf7e8debc07cfe01e11513cc6d332fd918ac260", size = 127936057 }, +] + +[[package]] +name = "nvidia-cusparse-cu12" +version = "12.3.1.170" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "nvidia-nvjitlink-cu12" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/db/f7/97a9ea26ed4bbbfc2d470994b8b4f338ef663be97b8f677519ac195e113d/nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl", hash = "sha256:ea4f11a2904e2a8dc4b1833cc1b5181cde564edd0d5cd33e3c168eff2d1863f1", size = 207454763 }, +] + +[[package]] +name = "nvidia-ml-py" +version = "12.560.30" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/53/10/5f34de4a71db8b2b7ec4269f4a33287f24c23e2857ea3187c977b7bc3604/nvidia-ml-py-12.560.30.tar.gz", hash = "sha256:f0254dc7400647680a072ee02509bfd46102b60bdfeca321576d4d4817e7fe97", size = 39194 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b7/f3/a69ce0b1a1e12fbf6b2ad9f4c14c9999fdbdf15f2478d210f0fd501ddc98/nvidia_ml_py-12.560.30-py3-none-any.whl", hash = "sha256:fea371c94d63e38a611c17bbb85fe400e9c8ddb9e8684a9cd0e47786a4bc3c73", size = 40526 }, +] + +[[package]] +name = "nvidia-nccl-cu12" +version = "2.21.5" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/df/99/12cd266d6233f47d00daf3a72739872bdc10267d0383508b0b9c84a18bb6/nvidia_nccl_cu12-2.21.5-py3-none-manylinux2014_x86_64.whl", hash = "sha256:8579076d30a8c24988834445f8d633c697d42397e92ffc3f63fa26766d25e0a0", size = 188654414 }, +] + +[[package]] 
+name = "nvidia-nvjitlink-cu12" +version = "12.4.127" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ff/ff/847841bacfbefc97a00036e0fce5a0f086b640756dc38caea5e1bb002655/nvidia_nvjitlink_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:06b3b9b25bf3f8af351d664978ca26a16d2c5127dbd53c0497e28d1fb9611d57", size = 21066810 }, +] + +[[package]] +name = "nvidia-nvtx-cu12" +version = "12.4.127" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/87/20/199b8713428322a2f22b722c62b8cc278cc53dffa9705d744484b5035ee9/nvidia_nvtx_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:781e950d9b9f60d8241ccea575b32f5105a5baf4c2351cab5256a24869f12a1a", size = 99144 }, +] + +[[package]] +name = "opentelemetry-api" +version = "1.29.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "deprecated" }, + { name = "importlib-metadata" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/bc/8e/b886a5e9861afa188d1fe671fb96ff9a1d90a23d57799331e137cc95d573/opentelemetry_api-1.29.0.tar.gz", hash = "sha256:d04a6cf78aad09614f52964ecb38021e248f5714dc32c2e0d8fd99517b4d69cf", size = 62900 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/43/53/5249ea860d417a26a3a6f1bdedfc0748c4f081a3adaec3d398bc0f7c6a71/opentelemetry_api-1.29.0-py3-none-any.whl", hash = "sha256:5fcd94c4141cc49c736271f3e1efb777bebe9cc535759c54c936cca4f1b312b8", size = 64304 }, +] + +[[package]] +name = "opentelemetry-exporter-otlp" +version = "1.29.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "opentelemetry-exporter-otlp-proto-grpc" }, + { name = "opentelemetry-exporter-otlp-proto-http" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/91/23/824e71822969cd3018897f5b0246baf8305bf7635f20df1ce5dfc423c32d/opentelemetry_exporter_otlp-1.29.0.tar.gz", hash = "sha256:ee7dfcccbb5e87ad9b389908452e10b7beeab55f70a83f41ce5b8c4efbde6544", size = 6159 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/cf/54/2a84533f39bb240958d691bb3ddf1c3fb6a92356654fb2e02a210f65ce6b/opentelemetry_exporter_otlp-1.29.0-py3-none-any.whl", hash = "sha256:b8da6e20f5b0ffe604154b1e16a407eade17ce310c42fb85bb4e1246fc3688ad", size = 7011 }, +] + +[[package]] +name = "opentelemetry-exporter-otlp-proto-common" +version = "1.29.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "opentelemetry-proto" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b1/58/f7fd7eaf592b2521999a4271ab3ce1c82fe37fe9b0dc25c348398d95d66a/opentelemetry_exporter_otlp_proto_common-1.29.0.tar.gz", hash = "sha256:e7c39b5dbd1b78fe199e40ddfe477e6983cb61aa74ba836df09c3869a3e3e163", size = 19133 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9e/75/7609bda3d72bf307839570b226180513e854c01443ebe265ed732a4980fc/opentelemetry_exporter_otlp_proto_common-1.29.0-py3-none-any.whl", hash = "sha256:a9d7376c06b4da9cf350677bcddb9618ed4b8255c3f6476975f5e38274ecd3aa", size = 18459 }, +] + +[[package]] +name = "opentelemetry-exporter-otlp-proto-grpc" +version = "1.29.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "deprecated" }, + { name = "googleapis-common-protos" }, + { name = "grpcio" }, + { name = "opentelemetry-api" }, + { name = "opentelemetry-exporter-otlp-proto-common" }, + { name = "opentelemetry-proto" }, + { name = "opentelemetry-sdk" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/41/aa/b3f2190613141f35fe15145bf438334fdd1eac8aeeee4f7ecbc887999443/opentelemetry_exporter_otlp_proto_grpc-1.29.0.tar.gz", hash = "sha256:3d324d07d64574d72ed178698de3d717f62a059a93b6b7685ee3e303384e73ea", size = 26224 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f2/de/4b4127a25d1594851d99032f3a9acb09cb512d11edec713410fb906607f4/opentelemetry_exporter_otlp_proto_grpc-1.29.0-py3-none-any.whl", hash = "sha256:5a2a3a741a2543ed162676cf3eefc2b4150e6f4f0a193187afb0d0e65039c69c", size = 18520 }, +] + +[[package]] +name = "opentelemetry-exporter-otlp-proto-http" +version = "1.29.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "deprecated" }, + { name = "googleapis-common-protos" }, + { name = "opentelemetry-api" }, + { name = "opentelemetry-exporter-otlp-proto-common" }, + { name = "opentelemetry-proto" }, + { name = "opentelemetry-sdk" }, + { name = "requests" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ab/88/e70a2e9fbb1bddb1ab7b6d74fb02c68601bff5948292ce33464c84ee082e/opentelemetry_exporter_otlp_proto_http-1.29.0.tar.gz", hash = "sha256:b10d174e3189716f49d386d66361fbcf6f2b9ad81e05404acdee3f65c8214204", size = 15041 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/31/49/a1c3d24e8fe73b5f422e21b46c24aed3db7fd9427371c06442e7bdfe4d3b/opentelemetry_exporter_otlp_proto_http-1.29.0-py3-none-any.whl", hash = "sha256:b228bdc0f0cfab82eeea834a7f0ffdd2a258b26aa33d89fb426c29e8e934d9d0", size = 17217 }, +] + +[[package]] +name = "opentelemetry-instrumentation" +version = "0.50b0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "opentelemetry-api" }, + { name = "opentelemetry-semantic-conventions" }, + { name = "packaging" }, + { name = "wrapt" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/79/2e/2e59a7cb636dc394bd7cf1758ada5e8ed87590458ca6bb2f9c26e0243847/opentelemetry_instrumentation-0.50b0.tar.gz", hash = "sha256:7d98af72de8dec5323e5202e46122e5f908592b22c6d24733aad619f07d82979", size = 26539 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ff/b1/55a77152a83ec8998e520a3a575f44af1020cfe4bdc000b7538583293b85/opentelemetry_instrumentation-0.50b0-py3-none-any.whl", hash = "sha256:b8f9fc8812de36e1c6dffa5bfc6224df258841fb387b6dfe5df15099daa10630", size = 30728 }, +] + +[[package]] +name = "opentelemetry-instrumentation-grpc" +version = "0.50b0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "opentelemetry-api" }, + { name = "opentelemetry-instrumentation" }, + { name = "opentelemetry-semantic-conventions" }, + { name = "wrapt" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/51/56/658110193718ddde6e8e68ef3ad3fee7850055820a9fc2bd7ec1347afeca/opentelemetry_instrumentation_grpc-0.50b0.tar.gz", hash = "sha256:12381fbc0a7a91410fb9dad5f26f6de5eb5c30cd19c840fa9bfee78b584af7e7", size = 30746 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/85/94/94ea66294e0dca44e53267d68a7de68424c9120f142dbc2e7db503d85f3a/opentelemetry_instrumentation_grpc-0.50b0-py3-none-any.whl", hash = "sha256:d4a13cad1c7ce019429e0253a00bee8852314be90004d8cf0051df51d9da1ac3", size = 27069 }, +] + +[[package]] +name = "opentelemetry-proto" +version = "1.29.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "protobuf" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/80/52/fd3b3d79e1b00ad2dcac92db6885e49bedbf7a6828647954e4952d653132/opentelemetry_proto-1.29.0.tar.gz", hash = "sha256:3c136aa293782e9b44978c738fff72877a4b78b5d21a64e879898db7b2d93e5d", size = 34320 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/bd/66/a500e38ee322d89fce61c74bd7769c8ef3bebc6c2f43fda5f3fc3441286d/opentelemetry_proto-1.29.0-py3-none-any.whl", hash = "sha256:495069c6f5495cbf732501cdcd3b7f60fda2b9d3d4255706ca99b7ca8dec53ff", size = 55818 }, +] + +[[package]] +name = "opentelemetry-sdk" +version = "1.29.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "opentelemetry-api" }, + { name = "opentelemetry-semantic-conventions" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/0c/5a/1ed4c3cf6c09f80565fc085f7e8efa0c222712fd2a9412d07424705dcf72/opentelemetry_sdk-1.29.0.tar.gz", hash = "sha256:b0787ce6aade6ab84315302e72bd7a7f2f014b0fb1b7c3295b88afe014ed0643", size = 157229 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d1/1d/512b86af21795fb463726665e2f61db77d384e8779fdcf4cb0ceec47866d/opentelemetry_sdk-1.29.0-py3-none-any.whl", hash = "sha256:173be3b5d3f8f7d671f20ea37056710217959e774e2749d984355d1f9391a30a", size = 118078 }, +] + +[[package]] +name = "opentelemetry-semantic-conventions" +version = "0.50b0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "deprecated" }, + { name = "opentelemetry-api" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/e7/4e/d7c7c91ff47cd96fe4095dd7231701aec7347426fd66872ff320d6cd1fcc/opentelemetry_semantic_conventions-0.50b0.tar.gz", hash = "sha256:02dc6dbcb62f082de9b877ff19a3f1ffaa3c306300fa53bfac761c4567c83d38", size = 100459 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/da/fb/dc15fad105450a015e913cfa4f5c27b6a5f1bea8fb649f8cae11e699c8af/opentelemetry_semantic_conventions-0.50b0-py3-none-any.whl", hash = "sha256:e87efba8fdb67fb38113efea6a349531e75ed7ffc01562f65b802fcecb5e115e", size = 166602 }, +] + +[[package]] +name = "outlines" +version = "0.1.13" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "airportsdata" }, + { name = "cloudpickle" }, + { name = "diskcache" }, + { name = "interegular" }, + { name = "jinja2" }, + { name = "jsonschema" }, + { name = "lark" }, + { name = "nest-asyncio" }, + { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, + { name = "numpy", version = "2.2.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "outlines-core" }, + { name = "pycountry" }, + { name = "pydantic" }, + { name = "referencing" }, + { name = "requests" }, + { name = "torch" }, + { name = "tqdm" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/35/bf/77024d3775cb4534cd26564f3a54c909a7c4d85bf59aab197ac97ce723f9/outlines-0.1.13.tar.gz", hash = "sha256:0233cb3ffae9cb6b01ad8d3c32b7d87e3f1cf7bdc7b28a0bc82cd3d277c09bca", size = 2490245 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/eb/2e/add66052a28ec22181560cab1dcaf82265a5f74533d76d336d1e0d7a17de/outlines-0.1.13-py3-none-any.whl", hash = "sha256:d34d36dd5001ffc24ae60588aa3afa27b58c8dfd3ba5f9b8ccb92298b91e5903", size = 87680 }, +] + +[[package]] +name = "outlines-core" +version = "0.1.26" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = 
"interegular" }, + { name = "jsonschema" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/d3/f3/274d07f4702728b43581235a77e545ec602b25f9b0098b288a0f3052521d/outlines_core-0.1.26.tar.gz", hash = "sha256:481c4301341e77cc8f1832d616784adb4d461b4fec65878e7c0d2cba7163a189", size = 75139 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e2/df/e9ff00f1dcf671cb8c4c20abcfd53406328b344cafa689a2832e8059c0b4/outlines_core-0.1.26-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:6a962a7452e7ac170fa04d405342cadae2d28fafa5b1830cef7aa610257ed32f", size = 322602 }, + { url = "https://files.pythonhosted.org/packages/3c/f1/e9064f18c462a61f4abbe73b24f25e36d8abef19c593416fa69dce6a83c0/outlines_core-0.1.26-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:15a3684fa29564da2db03934cf0097bef3e871f70d3af0ef2b52fdb886da2e09", size = 301929 }, + { url = "https://files.pythonhosted.org/packages/76/c3/6bc82db40b4818421e573237f43d4026c40a3305fa2558eb0aa1a7aa08f7/outlines_core-0.1.26-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:64e01c0cfa9ba371634d7c3f6ea1862397cef98e4509fe98e3f57faa721a72d6", size = 321355 }, + { url = "https://files.pythonhosted.org/packages/c9/c2/1d85bfeaee3a83327e0d162bee4bdc7d7889bea5998e44fcc66c924dc1fd/outlines_core-0.1.26-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a3c4196148e47f455f1ace78e329d5b97e531cbc406456d681592952adae7e17", size = 343552 }, + { url = "https://files.pythonhosted.org/packages/45/da/1e61d3d997ba1858fb8e71c3127f24a95c30575559da012ea5b45b147ad3/outlines_core-0.1.26-cp310-cp310-win32.whl", hash = "sha256:f38d290a7f6e5e12cbfcaee03269dfc0dbda49b360024b4279d1aba251fdc346", size = 234750 }, + { url = "https://files.pythonhosted.org/packages/1c/04/6d7968019a81df235ad6bc7405eefe32be8da4c4153792655e7490d06c8d/outlines_core-0.1.26-cp310-cp310-win_amd64.whl", hash = "sha256:11ff56af56cb54c563b7f25d86cd9ee77f3fed825f1d4dccd9449bb1e4e89538", size = 243713 }, + { url = "https://files.pythonhosted.org/packages/17/94/19d5c50c303ba71f3465c81620ca9b5af4db07fd8922dfe59ae5a9ae61d1/outlines_core-0.1.26-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:b6787b07b7c673fc3087d2b537719ecac8e03b10a47d032dd1926985c32885b0", size = 322344 }, + { url = "https://files.pythonhosted.org/packages/f2/ea/f44beea7f610f2737ebb908c8dfa37d8324e92ca529468a56b00a77af199/outlines_core-0.1.26-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1e0ea28a76da31d25b6f53242bf13e1b59a0241badf82353c88f55e1cf81b128", size = 301670 }, + { url = "https://files.pythonhosted.org/packages/6a/a6/ceac3760e1feb898b4047aeb54e0a3de975b59e87a17d6ba0a04dec5eaed/outlines_core-0.1.26-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a8932044a3d9329be53a226118850638f85b4d7842f9b863d0a123f23de220cd", size = 321067 }, + { url = "https://files.pythonhosted.org/packages/92/f0/ad0074d6726fed86bb0bba1b9307cbbd67a2af5debd3540d66c69298a001/outlines_core-0.1.26-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a84b7cd2fb6268bf990dd3d479ffb4fa0bace6f571cb85b15b6cdb44b84f5b69", size = 343264 }, + { url = "https://files.pythonhosted.org/packages/e6/bd/198c9a73d5f36e2ecad558a26359af3f0dbe4f5ba11c4629e46fccdfe2d6/outlines_core-0.1.26-cp311-cp311-win32.whl", hash = "sha256:f19765c151abfc970996368080aeea6d2a19e927817fe4e2af6726e639be3de4", size = 234529 }, + { url = 
"https://files.pythonhosted.org/packages/b9/27/354b484045e6368c92f688d954124064ec2ce961681e56711852904e1ec2/outlines_core-0.1.26-cp311-cp311-win_amd64.whl", hash = "sha256:3f59aeccea21ed6ff3cf52102fd163f26d279821c20e5127ddd18d4ea4d0c8d2", size = 243457 }, + { url = "https://files.pythonhosted.org/packages/c6/86/0fb40746e579db38d89f127122a3900d9e0350f76aae8cb61adeaff44cc2/outlines_core-0.1.26-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:f54633bca50055d42ea4d94ae06dcbe52d3d76a9b621b75723b1177d0d952953", size = 321874 }, + { url = "https://files.pythonhosted.org/packages/ab/0c/b91f7bc03843796c1d643ee030b6cd8fd5a8ba2cd4856c855f140c878976/outlines_core-0.1.26-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:9525321b48700dcaaabf60bcdc951e45f9357ba3fb3e1bfc81b662d7d4170e7c", size = 301995 }, + { url = "https://files.pythonhosted.org/packages/ad/db/fa91a2d54288b900de82d86eda3adb2417b3b5b2db6256854a5e8bc85c32/outlines_core-0.1.26-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:00f409f72c11f6ffadb57066950dd384d5388015028c1a1a615c9a64988dae3e", size = 321050 }, + { url = "https://files.pythonhosted.org/packages/e2/1d/a36292b6198986bd9c3ff8c24355deb82ed5475403379ee40b5b5473e2e3/outlines_core-0.1.26-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e86a1bb46adc5cbf6dfd7a7fe4105e0e2a4c6e041732a053126b41c521a1f223", size = 343201 }, + { url = "https://files.pythonhosted.org/packages/08/63/5dd2b5a364412f674b6edcb59b0c21513bdb07cdcc7613b064c1a0660d01/outlines_core-0.1.26-cp312-cp312-win32.whl", hash = "sha256:19f462f6b00935708677ad27cb4df55e0e17f6ffe713ab750f5f2683b090f95d", size = 233970 }, + { url = "https://files.pythonhosted.org/packages/a5/56/8adf0b7446d1e975c2314454813c59eb7b195889908a2932ed34148c113c/outlines_core-0.1.26-cp312-cp312-win_amd64.whl", hash = "sha256:9b36bff12779e58883747116893a17b3551bbd10865878b951b03a44d112229a", size = 243578 }, + { url = "https://files.pythonhosted.org/packages/5b/b8/d38f13e417f2af82f6cd588df5a26ff8ffeb9a3a15b8a1a883d5922048ac/outlines_core-0.1.26-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:7b7849cf40028319ebb9d8ba0fe4c590ef5888eebe524a81b3af30aaa06ea21c", size = 323047 }, + { url = "https://files.pythonhosted.org/packages/3b/59/43f5c7b8ad0e98a663db8f24cbff0421a58e946934e3f3751a17a06ae84d/outlines_core-0.1.26-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:2f8641aab4a6bd84516907492ce82099503129da01b3c29c1dc9ad50320bae77", size = 302547 }, + { url = "https://files.pythonhosted.org/packages/23/30/d9eb8202cbc994536a1fefa60e87c582a714ad1b46367c43af299f3889ba/outlines_core-0.1.26-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bba56604efdbc5932c7a8a88c2b8b0d0c740ab883b0012fb5464a9736796802b", size = 321817 }, + { url = "https://files.pythonhosted.org/packages/80/c7/bd718f6329f5c65b936edff532e40614b66032b153ab9ac89e6cb6d2fcff/outlines_core-0.1.26-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8cc8c87d89bd267356f8149c9066cbb98970425ec162997fbf195c3f1feb7009", size = 344102 }, + { url = "https://files.pythonhosted.org/packages/cb/3e/69ca7394fc89c98c763f2d475c08206fa56f2ffeec210833c1db0195ba51/outlines_core-0.1.26-cp39-cp39-win32.whl", hash = "sha256:9d792a43ed9d8a4e1b38f4d83fe99db442d57aad4404c2edf98b710892eda47e", size = 235081 }, + { url = "https://files.pythonhosted.org/packages/56/7f/38decd8d63f6b2d513c372186c68337455efba3fb45d0ff722ac268f05d2/outlines_core-0.1.26-cp39-cp39-win_amd64.whl", hash = 
"sha256:ad8564ecd7b64bcb840596c5049ff1c1a96346de494302ffcc0f2b188c15675e", size = 244007 }, +] + +[[package]] +name = "packaging" +version = "24.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d0/63/68dbb6eb2de9cb10ee4c9c14a0148804425e13c4fb20d61cce69f53106da/packaging-24.2.tar.gz", hash = "sha256:c228a6dc5e932d346bc5739379109d49e8853dd8223571c7c5b55260edc0b97f", size = 163950 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/88/ef/eb23f262cca3c0c4eb7ab1933c3b1f03d021f2c48f54763065b6f0e321be/packaging-24.2-py3-none-any.whl", hash = "sha256:09abb1bccd265c01f4a3aa3f7a7db064b36514d2cba19a2f694fe6150451a759", size = 65451 }, +] + +[[package]] +name = "pandas" +version = "2.2.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, + { name = "numpy", version = "2.2.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "python-dateutil" }, + { name = "pytz" }, + { name = "tzdata" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/9c/d6/9f8431bacc2e19dca897724cd097b1bb224a6ad5433784a44b587c7c13af/pandas-2.2.3.tar.gz", hash = "sha256:4f18ba62b61d7e192368b84517265a99b4d7ee8912f8708660fb4a366cc82667", size = 4399213 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/aa/70/c853aec59839bceed032d52010ff5f1b8d87dc3114b762e4ba2727661a3b/pandas-2.2.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1948ddde24197a0f7add2bdc4ca83bf2b1ef84a1bc8ccffd95eda17fd836ecb5", size = 12580827 }, + { url = "https://files.pythonhosted.org/packages/99/f2/c4527768739ffa4469b2b4fff05aa3768a478aed89a2f271a79a40eee984/pandas-2.2.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:381175499d3802cde0eabbaf6324cce0c4f5d52ca6f8c377c29ad442f50f6348", size = 11303897 }, + { url = "https://files.pythonhosted.org/packages/ed/12/86c1747ea27989d7a4064f806ce2bae2c6d575b950be087837bdfcabacc9/pandas-2.2.3-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d9c45366def9a3dd85a6454c0e7908f2b3b8e9c138f5dc38fed7ce720d8453ed", size = 66480908 }, + { url = "https://files.pythonhosted.org/packages/44/50/7db2cd5e6373ae796f0ddad3675268c8d59fb6076e66f0c339d61cea886b/pandas-2.2.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:86976a1c5b25ae3f8ccae3a5306e443569ee3c3faf444dfd0f41cda24667ad57", size = 13064210 }, + { url = "https://files.pythonhosted.org/packages/61/61/a89015a6d5536cb0d6c3ba02cebed51a95538cf83472975275e28ebf7d0c/pandas-2.2.3-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:b8661b0238a69d7aafe156b7fa86c44b881387509653fdf857bebc5e4008ad42", size = 16754292 }, + { url = "https://files.pythonhosted.org/packages/ce/0d/4cc7b69ce37fac07645a94e1d4b0880b15999494372c1523508511b09e40/pandas-2.2.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:37e0aced3e8f539eccf2e099f65cdb9c8aa85109b0be6e93e2baff94264bdc6f", size = 14416379 }, + { url = "https://files.pythonhosted.org/packages/31/9e/6ebb433de864a6cd45716af52a4d7a8c3c9aaf3a98368e61db9e69e69a9c/pandas-2.2.3-cp310-cp310-win_amd64.whl", hash = "sha256:56534ce0746a58afaf7942ba4863e0ef81c9c50d3f0ae93e9497d6a41a057645", size = 11598471 }, + { url = "https://files.pythonhosted.org/packages/a8/44/d9502bf0ed197ba9bf1103c9867d5904ddcaf869e52329787fc54ed70cc8/pandas-2.2.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = 
"sha256:66108071e1b935240e74525006034333f98bcdb87ea116de573a6a0dccb6c039", size = 12602222 }, + { url = "https://files.pythonhosted.org/packages/52/11/9eac327a38834f162b8250aab32a6781339c69afe7574368fffe46387edf/pandas-2.2.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7c2875855b0ff77b2a64a0365e24455d9990730d6431b9e0ee18ad8acee13dbd", size = 11321274 }, + { url = "https://files.pythonhosted.org/packages/45/fb/c4beeb084718598ba19aa9f5abbc8aed8b42f90930da861fcb1acdb54c3a/pandas-2.2.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cd8d0c3be0515c12fed0bdbae072551c8b54b7192c7b1fda0ba56059a0179698", size = 15579836 }, + { url = "https://files.pythonhosted.org/packages/cd/5f/4dba1d39bb9c38d574a9a22548c540177f78ea47b32f99c0ff2ec499fac5/pandas-2.2.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c124333816c3a9b03fbeef3a9f230ba9a737e9e5bb4060aa2107a86cc0a497fc", size = 13058505 }, + { url = "https://files.pythonhosted.org/packages/b9/57/708135b90391995361636634df1f1130d03ba456e95bcf576fada459115a/pandas-2.2.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:63cc132e40a2e084cf01adf0775b15ac515ba905d7dcca47e9a251819c575ef3", size = 16744420 }, + { url = "https://files.pythonhosted.org/packages/86/4a/03ed6b7ee323cf30404265c284cee9c65c56a212e0a08d9ee06984ba2240/pandas-2.2.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:29401dbfa9ad77319367d36940cd8a0b3a11aba16063e39632d98b0e931ddf32", size = 14440457 }, + { url = "https://files.pythonhosted.org/packages/ed/8c/87ddf1fcb55d11f9f847e3c69bb1c6f8e46e2f40ab1a2d2abadb2401b007/pandas-2.2.3-cp311-cp311-win_amd64.whl", hash = "sha256:3fc6873a41186404dad67245896a6e440baacc92f5b716ccd1bc9ed2995ab2c5", size = 11617166 }, + { url = "https://files.pythonhosted.org/packages/17/a3/fb2734118db0af37ea7433f57f722c0a56687e14b14690edff0cdb4b7e58/pandas-2.2.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b1d432e8d08679a40e2a6d8b2f9770a5c21793a6f9f47fdd52c5ce1948a5a8a9", size = 12529893 }, + { url = "https://files.pythonhosted.org/packages/e1/0c/ad295fd74bfac85358fd579e271cded3ac969de81f62dd0142c426b9da91/pandas-2.2.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:a5a1595fe639f5988ba6a8e5bc9649af3baf26df3998a0abe56c02609392e0a4", size = 11363475 }, + { url = "https://files.pythonhosted.org/packages/c6/2a/4bba3f03f7d07207481fed47f5b35f556c7441acddc368ec43d6643c5777/pandas-2.2.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:5de54125a92bb4d1c051c0659e6fcb75256bf799a732a87184e5ea503965bce3", size = 15188645 }, + { url = "https://files.pythonhosted.org/packages/38/f8/d8fddee9ed0d0c0f4a2132c1dfcf0e3e53265055da8df952a53e7eaf178c/pandas-2.2.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fffb8ae78d8af97f849404f21411c95062db1496aeb3e56f146f0355c9989319", size = 12739445 }, + { url = "https://files.pythonhosted.org/packages/20/e8/45a05d9c39d2cea61ab175dbe6a2de1d05b679e8de2011da4ee190d7e748/pandas-2.2.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6dfcb5ee8d4d50c06a51c2fffa6cff6272098ad6540aed1a76d15fb9318194d8", size = 16359235 }, + { url = "https://files.pythonhosted.org/packages/1d/99/617d07a6a5e429ff90c90da64d428516605a1ec7d7bea494235e1c3882de/pandas-2.2.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:062309c1b9ea12a50e8ce661145c6aab431b1e99530d3cd60640e255778bd43a", size = 14056756 }, + { url = 
"https://files.pythonhosted.org/packages/29/d4/1244ab8edf173a10fd601f7e13b9566c1b525c4f365d6bee918e68381889/pandas-2.2.3-cp312-cp312-win_amd64.whl", hash = "sha256:59ef3764d0fe818125a5097d2ae867ca3fa64df032331b7e0917cf5d7bf66b13", size = 11504248 }, + { url = "https://files.pythonhosted.org/packages/64/22/3b8f4e0ed70644e85cfdcd57454686b9057c6c38d2f74fe4b8bc2527214a/pandas-2.2.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:f00d1345d84d8c86a63e476bb4955e46458b304b9575dcf71102b5c705320015", size = 12477643 }, + { url = "https://files.pythonhosted.org/packages/e4/93/b3f5d1838500e22c8d793625da672f3eec046b1a99257666c94446969282/pandas-2.2.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:3508d914817e153ad359d7e069d752cdd736a247c322d932eb89e6bc84217f28", size = 11281573 }, + { url = "https://files.pythonhosted.org/packages/f5/94/6c79b07f0e5aab1dcfa35a75f4817f5c4f677931d4234afcd75f0e6a66ca/pandas-2.2.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:22a9d949bfc9a502d320aa04e5d02feab689d61da4e7764b62c30b991c42c5f0", size = 15196085 }, + { url = "https://files.pythonhosted.org/packages/e8/31/aa8da88ca0eadbabd0a639788a6da13bb2ff6edbbb9f29aa786450a30a91/pandas-2.2.3-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f3a255b2c19987fbbe62a9dfd6cff7ff2aa9ccab3fc75218fd4b7530f01efa24", size = 12711809 }, + { url = "https://files.pythonhosted.org/packages/ee/7c/c6dbdb0cb2a4344cacfb8de1c5808ca885b2e4dcfde8008266608f9372af/pandas-2.2.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:800250ecdadb6d9c78eae4990da62743b857b470883fa27f652db8bdde7f6659", size = 16356316 }, + { url = "https://files.pythonhosted.org/packages/57/b7/8b757e7d92023b832869fa8881a992696a0bfe2e26f72c9ae9f255988d42/pandas-2.2.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:6374c452ff3ec675a8f46fd9ab25c4ad0ba590b71cf0656f8b6daa5202bca3fb", size = 14022055 }, + { url = "https://files.pythonhosted.org/packages/3b/bc/4b18e2b8c002572c5a441a64826252ce5da2aa738855747247a971988043/pandas-2.2.3-cp313-cp313-win_amd64.whl", hash = "sha256:61c5ad4043f791b61dd4752191d9f07f0ae412515d59ba8f005832a532f8736d", size = 11481175 }, + { url = "https://files.pythonhosted.org/packages/76/a3/a5d88146815e972d40d19247b2c162e88213ef51c7c25993942c39dbf41d/pandas-2.2.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:3b71f27954685ee685317063bf13c7709a7ba74fc996b84fc6821c59b0f06468", size = 12615650 }, + { url = "https://files.pythonhosted.org/packages/9c/8c/f0fd18f6140ddafc0c24122c8a964e48294acc579d47def376fef12bcb4a/pandas-2.2.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:38cf8125c40dae9d5acc10fa66af8ea6fdf760b2714ee482ca691fc66e6fcb18", size = 11290177 }, + { url = "https://files.pythonhosted.org/packages/ed/f9/e995754eab9c0f14c6777401f7eece0943840b7a9fc932221c19d1abee9f/pandas-2.2.3-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ba96630bc17c875161df3818780af30e43be9b166ce51c9a18c1feae342906c2", size = 14651526 }, + { url = "https://files.pythonhosted.org/packages/25/b0/98d6ae2e1abac4f35230aa756005e8654649d305df9a28b16b9ae4353bff/pandas-2.2.3-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1db71525a1538b30142094edb9adc10be3f3e176748cd7acc2240c2f2e5aa3a4", size = 11871013 }, + { url = "https://files.pythonhosted.org/packages/cc/57/0f72a10f9db6a4628744c8e8f0df4e6e21de01212c7c981d31e50ffc8328/pandas-2.2.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = 
"sha256:15c0e1e02e93116177d29ff83e8b1619c93ddc9c49083f237d4312337a61165d", size = 15711620 }, + { url = "https://files.pythonhosted.org/packages/ab/5f/b38085618b950b79d2d9164a711c52b10aefc0ae6833b96f626b7021b2ed/pandas-2.2.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:ad5b65698ab28ed8d7f18790a0dc58005c7629f227be9ecc1072aa74c0c1d43a", size = 13098436 }, + { url = "https://files.pythonhosted.org/packages/ca/8c/8848a4c9b8fdf5a534fe2077af948bf53cd713d77ffbcd7bd15710348fd7/pandas-2.2.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:bc6b93f9b966093cb0fd62ff1a7e4c09e6d546ad7c1de191767baffc57628f39", size = 12595535 }, + { url = "https://files.pythonhosted.org/packages/9c/b9/5cead4f63b6d31bdefeb21a679bc5a7f4aaf262ca7e07e2bc1c341b68470/pandas-2.2.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:5dbca4c1acd72e8eeef4753eeca07de9b1db4f398669d5994086f788a5d7cc30", size = 11319822 }, + { url = "https://files.pythonhosted.org/packages/31/af/89e35619fb573366fa68dc26dad6ad2c08c17b8004aad6d98f1a31ce4bb3/pandas-2.2.3-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:8cd6d7cc958a3910f934ea8dbdf17b2364827bb4dafc38ce6eef6bb3d65ff09c", size = 15625439 }, + { url = "https://files.pythonhosted.org/packages/3d/dd/bed19c2974296661493d7acc4407b1d2db4e2a482197df100f8f965b6225/pandas-2.2.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:99df71520d25fade9db7c1076ac94eb994f4d2673ef2aa2e86ee039b6746d20c", size = 13068928 }, + { url = "https://files.pythonhosted.org/packages/31/a3/18508e10a31ea108d746c848b5a05c0711e0278fa0d6f1c52a8ec52b80a5/pandas-2.2.3-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:31d0ced62d4ea3e231a9f228366919a5ea0b07440d9d4dac345376fd8e1477ea", size = 16783266 }, + { url = "https://files.pythonhosted.org/packages/c4/a5/3429bd13d82bebc78f4d78c3945efedef63a7cd0c15c17b2eeb838d1121f/pandas-2.2.3-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:7eee9e7cea6adf3e3d24e304ac6b8300646e2a5d1cd3a3c2abed9101b0846761", size = 14450871 }, + { url = "https://files.pythonhosted.org/packages/2f/49/5c30646e96c684570925b772eac4eb0a8cb0ca590fa978f56c5d3ae73ea1/pandas-2.2.3-cp39-cp39-win_amd64.whl", hash = "sha256:4850ba03528b6dd51d6c5d273c46f183f39a9baf3f0143e566b89450965b105e", size = 11618011 }, +] + +[[package]] +name = "peft" +version = "0.14.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "accelerate" }, + { name = "huggingface-hub" }, + { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, + { name = "numpy", version = "2.2.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "packaging" }, + { name = "psutil" }, + { name = "pyyaml" }, + { name = "safetensors" }, + { name = "torch" }, + { name = "tqdm" }, + { name = "transformers" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/21/33/fb0c31eaa8162c01e9250b21aa65d46a5339f17a818a97c68391db2ff44b/peft-0.14.0.tar.gz", hash = "sha256:546d69af7b42f5ef715a3d3261ed818bc917ae6055e5d7e187ed3f2c76ad72dc", size = 411902 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/88/05/e58e3aaa36544d30a917814e336fc65a746f708e5874945e92999bc22fa3/peft-0.14.0-py3-none-any.whl", hash = "sha256:2f04f3a870c3baf30f15e7dcaa5dd70d3e54cfdd146d3c6c187735d3ae0a0700", size = 374831 }, +] + +[[package]] +name = "pillow" +version = "11.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/f3/af/c097e544e7bd278333db77933e535098c259609c4eb3b85381109602fb5b/pillow-11.1.0.tar.gz", hash = "sha256:368da70808b36d73b4b390a8ffac11069f8a5c85f29eff1f1b01bcf3ef5b2a20", size = 46742715 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/50/1c/2dcea34ac3d7bc96a1fd1bd0a6e06a57c67167fec2cff8d95d88229a8817/pillow-11.1.0-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:e1abe69aca89514737465752b4bcaf8016de61b3be1397a8fc260ba33321b3a8", size = 3229983 }, + { url = "https://files.pythonhosted.org/packages/14/ca/6bec3df25e4c88432681de94a3531cc738bd85dea6c7aa6ab6f81ad8bd11/pillow-11.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c640e5a06869c75994624551f45e5506e4256562ead981cce820d5ab39ae2192", size = 3101831 }, + { url = "https://files.pythonhosted.org/packages/d4/2c/668e18e5521e46eb9667b09e501d8e07049eb5bfe39d56be0724a43117e6/pillow-11.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a07dba04c5e22824816b2615ad7a7484432d7f540e6fa86af60d2de57b0fcee2", size = 4314074 }, + { url = "https://files.pythonhosted.org/packages/02/80/79f99b714f0fc25f6a8499ecfd1f810df12aec170ea1e32a4f75746051ce/pillow-11.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e267b0ed063341f3e60acd25c05200df4193e15a4a5807075cd71225a2386e26", size = 4394933 }, + { url = "https://files.pythonhosted.org/packages/81/aa/8d4ad25dc11fd10a2001d5b8a80fdc0e564ac33b293bdfe04ed387e0fd95/pillow-11.1.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:bd165131fd51697e22421d0e467997ad31621b74bfc0b75956608cb2906dda07", size = 4353349 }, + { url = "https://files.pythonhosted.org/packages/84/7a/cd0c3eaf4a28cb2a74bdd19129f7726277a7f30c4f8424cd27a62987d864/pillow-11.1.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:abc56501c3fd148d60659aae0af6ddc149660469082859fa7b066a298bde9482", size = 4476532 }, + { url = "https://files.pythonhosted.org/packages/8f/8b/a907fdd3ae8f01c7670dfb1499c53c28e217c338b47a813af8d815e7ce97/pillow-11.1.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:54ce1c9a16a9561b6d6d8cb30089ab1e5eb66918cb47d457bd996ef34182922e", size = 4279789 }, + { url = "https://files.pythonhosted.org/packages/6f/9a/9f139d9e8cccd661c3efbf6898967a9a337eb2e9be2b454ba0a09533100d/pillow-11.1.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:73ddde795ee9b06257dac5ad42fcb07f3b9b813f8c1f7f870f402f4dc54b5269", size = 4413131 }, + { url = "https://files.pythonhosted.org/packages/a8/68/0d8d461f42a3f37432203c8e6df94da10ac8081b6d35af1c203bf3111088/pillow-11.1.0-cp310-cp310-win32.whl", hash = "sha256:3a5fe20a7b66e8135d7fd617b13272626a28278d0e578c98720d9ba4b2439d49", size = 2291213 }, + { url = "https://files.pythonhosted.org/packages/14/81/d0dff759a74ba87715509af9f6cb21fa21d93b02b3316ed43bda83664db9/pillow-11.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:b6123aa4a59d75f06e9dd3dac5bf8bc9aa383121bb3dd9a7a612e05eabc9961a", size = 2625725 }, + { url = "https://files.pythonhosted.org/packages/ce/1f/8d50c096a1d58ef0584ddc37e6f602828515219e9d2428e14ce50f5ecad1/pillow-11.1.0-cp310-cp310-win_arm64.whl", hash = "sha256:a76da0a31da6fcae4210aa94fd779c65c75786bc9af06289cd1c184451ef7a65", size = 2375213 }, + { url = "https://files.pythonhosted.org/packages/dd/d6/2000bfd8d5414fb70cbbe52c8332f2283ff30ed66a9cde42716c8ecbe22c/pillow-11.1.0-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:e06695e0326d05b06833b40b7ef477e475d0b1ba3a6d27da1bb48c23209bf457", size = 3229968 }, + { url = 
"https://files.pythonhosted.org/packages/d9/45/3fe487010dd9ce0a06adf9b8ff4f273cc0a44536e234b0fad3532a42c15b/pillow-11.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:96f82000e12f23e4f29346e42702b6ed9a2f2fea34a740dd5ffffcc8c539eb35", size = 3101806 }, + { url = "https://files.pythonhosted.org/packages/e3/72/776b3629c47d9d5f1c160113158a7a7ad177688d3a1159cd3b62ded5a33a/pillow-11.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a3cd561ded2cf2bbae44d4605837221b987c216cff94f49dfeed63488bb228d2", size = 4322283 }, + { url = "https://files.pythonhosted.org/packages/e4/c2/e25199e7e4e71d64eeb869f5b72c7ddec70e0a87926398785ab944d92375/pillow-11.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f189805c8be5ca5add39e6f899e6ce2ed824e65fb45f3c28cb2841911da19070", size = 4402945 }, + { url = "https://files.pythonhosted.org/packages/c1/ed/51d6136c9d5911f78632b1b86c45241c712c5a80ed7fa7f9120a5dff1eba/pillow-11.1.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:dd0052e9db3474df30433f83a71b9b23bd9e4ef1de13d92df21a52c0303b8ab6", size = 4361228 }, + { url = "https://files.pythonhosted.org/packages/48/a4/fbfe9d5581d7b111b28f1d8c2762dee92e9821bb209af9fa83c940e507a0/pillow-11.1.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:837060a8599b8f5d402e97197d4924f05a2e0d68756998345c829c33186217b1", size = 4484021 }, + { url = "https://files.pythonhosted.org/packages/39/db/0b3c1a5018117f3c1d4df671fb8e47d08937f27519e8614bbe86153b65a5/pillow-11.1.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:aa8dd43daa836b9a8128dbe7d923423e5ad86f50a7a14dc688194b7be5c0dea2", size = 4287449 }, + { url = "https://files.pythonhosted.org/packages/d9/58/bc128da7fea8c89fc85e09f773c4901e95b5936000e6f303222490c052f3/pillow-11.1.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:0a2f91f8a8b367e7a57c6e91cd25af510168091fb89ec5146003e424e1558a96", size = 4419972 }, + { url = "https://files.pythonhosted.org/packages/5f/bb/58f34379bde9fe197f51841c5bbe8830c28bbb6d3801f16a83b8f2ad37df/pillow-11.1.0-cp311-cp311-win32.whl", hash = "sha256:c12fc111ef090845de2bb15009372175d76ac99969bdf31e2ce9b42e4b8cd88f", size = 2291201 }, + { url = "https://files.pythonhosted.org/packages/3a/c6/fce9255272bcf0c39e15abd2f8fd8429a954cf344469eaceb9d0d1366913/pillow-11.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:fbd43429d0d7ed6533b25fc993861b8fd512c42d04514a0dd6337fb3ccf22761", size = 2625686 }, + { url = "https://files.pythonhosted.org/packages/c8/52/8ba066d569d932365509054859f74f2a9abee273edcef5cd75e4bc3e831e/pillow-11.1.0-cp311-cp311-win_arm64.whl", hash = "sha256:f7955ecf5609dee9442cbface754f2c6e541d9e6eda87fad7f7a989b0bdb9d71", size = 2375194 }, + { url = "https://files.pythonhosted.org/packages/95/20/9ce6ed62c91c073fcaa23d216e68289e19d95fb8188b9fb7a63d36771db8/pillow-11.1.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:2062ffb1d36544d42fcaa277b069c88b01bb7298f4efa06731a7fd6cc290b81a", size = 3226818 }, + { url = "https://files.pythonhosted.org/packages/b9/d8/f6004d98579a2596c098d1e30d10b248798cceff82d2b77aa914875bfea1/pillow-11.1.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:a85b653980faad27e88b141348707ceeef8a1186f75ecc600c395dcac19f385b", size = 3101662 }, + { url = "https://files.pythonhosted.org/packages/08/d9/892e705f90051c7a2574d9f24579c9e100c828700d78a63239676f960b74/pillow-11.1.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9409c080586d1f683df3f184f20e36fb647f2e0bc3988094d4fd8c9f4eb1b3b3", size = 4329317 }, + { url = 
"https://files.pythonhosted.org/packages/8c/aa/7f29711f26680eab0bcd3ecdd6d23ed6bce180d82e3f6380fb7ae35fcf3b/pillow-11.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7fdadc077553621911f27ce206ffcbec7d3f8d7b50e0da39f10997e8e2bb7f6a", size = 4412999 }, + { url = "https://files.pythonhosted.org/packages/c8/c4/8f0fe3b9e0f7196f6d0bbb151f9fba323d72a41da068610c4c960b16632a/pillow-11.1.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:93a18841d09bcdd774dcdc308e4537e1f867b3dec059c131fde0327899734aa1", size = 4368819 }, + { url = "https://files.pythonhosted.org/packages/38/0d/84200ed6a871ce386ddc82904bfadc0c6b28b0c0ec78176871a4679e40b3/pillow-11.1.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:9aa9aeddeed452b2f616ff5507459e7bab436916ccb10961c4a382cd3e03f47f", size = 4496081 }, + { url = "https://files.pythonhosted.org/packages/84/9c/9bcd66f714d7e25b64118e3952d52841a4babc6d97b6d28e2261c52045d4/pillow-11.1.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:3cdcdb0b896e981678eee140d882b70092dac83ac1cdf6b3a60e2216a73f2b91", size = 4296513 }, + { url = "https://files.pythonhosted.org/packages/db/61/ada2a226e22da011b45f7104c95ebda1b63dcbb0c378ad0f7c2a710f8fd2/pillow-11.1.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:36ba10b9cb413e7c7dfa3e189aba252deee0602c86c309799da5a74009ac7a1c", size = 4431298 }, + { url = "https://files.pythonhosted.org/packages/e7/c4/fc6e86750523f367923522014b821c11ebc5ad402e659d8c9d09b3c9d70c/pillow-11.1.0-cp312-cp312-win32.whl", hash = "sha256:cfd5cd998c2e36a862d0e27b2df63237e67273f2fc78f47445b14e73a810e7e6", size = 2291630 }, + { url = "https://files.pythonhosted.org/packages/08/5c/2104299949b9d504baf3f4d35f73dbd14ef31bbd1ddc2c1b66a5b7dfda44/pillow-11.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:a697cd8ba0383bba3d2d3ada02b34ed268cb548b369943cd349007730c92bddf", size = 2626369 }, + { url = "https://files.pythonhosted.org/packages/37/f3/9b18362206b244167c958984b57c7f70a0289bfb59a530dd8af5f699b910/pillow-11.1.0-cp312-cp312-win_arm64.whl", hash = "sha256:4dd43a78897793f60766563969442020e90eb7847463eca901e41ba186a7d4a5", size = 2375240 }, + { url = "https://files.pythonhosted.org/packages/b3/31/9ca79cafdce364fd5c980cd3416c20ce1bebd235b470d262f9d24d810184/pillow-11.1.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:ae98e14432d458fc3de11a77ccb3ae65ddce70f730e7c76140653048c71bfcbc", size = 3226640 }, + { url = "https://files.pythonhosted.org/packages/ac/0f/ff07ad45a1f172a497aa393b13a9d81a32e1477ef0e869d030e3c1532521/pillow-11.1.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:cc1331b6d5a6e144aeb5e626f4375f5b7ae9934ba620c0ac6b3e43d5e683a0f0", size = 3101437 }, + { url = "https://files.pythonhosted.org/packages/08/2f/9906fca87a68d29ec4530be1f893149e0cb64a86d1f9f70a7cfcdfe8ae44/pillow-11.1.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:758e9d4ef15d3560214cddbc97b8ef3ef86ce04d62ddac17ad39ba87e89bd3b1", size = 4326605 }, + { url = "https://files.pythonhosted.org/packages/b0/0f/f3547ee15b145bc5c8b336401b2d4c9d9da67da9dcb572d7c0d4103d2c69/pillow-11.1.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b523466b1a31d0dcef7c5be1f20b942919b62fd6e9a9be199d035509cbefc0ec", size = 4411173 }, + { url = "https://files.pythonhosted.org/packages/b1/df/bf8176aa5db515c5de584c5e00df9bab0713548fd780c82a86cba2c2fedb/pillow-11.1.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:9044b5e4f7083f209c4e35aa5dd54b1dd5b112b108648f5c902ad586d4f945c5", size = 4369145 }, + { url 
= "https://files.pythonhosted.org/packages/de/7c/7433122d1cfadc740f577cb55526fdc39129a648ac65ce64db2eb7209277/pillow-11.1.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:3764d53e09cdedd91bee65c2527815d315c6b90d7b8b79759cc48d7bf5d4f114", size = 4496340 }, + { url = "https://files.pythonhosted.org/packages/25/46/dd94b93ca6bd555588835f2504bd90c00d5438fe131cf01cfa0c5131a19d/pillow-11.1.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:31eba6bbdd27dde97b0174ddf0297d7a9c3a507a8a1480e1e60ef914fe23d352", size = 4296906 }, + { url = "https://files.pythonhosted.org/packages/a8/28/2f9d32014dfc7753e586db9add35b8a41b7a3b46540e965cb6d6bc607bd2/pillow-11.1.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:b5d658fbd9f0d6eea113aea286b21d3cd4d3fd978157cbf2447a6035916506d3", size = 4431759 }, + { url = "https://files.pythonhosted.org/packages/33/48/19c2cbe7403870fbe8b7737d19eb013f46299cdfe4501573367f6396c775/pillow-11.1.0-cp313-cp313-win32.whl", hash = "sha256:f86d3a7a9af5d826744fabf4afd15b9dfef44fe69a98541f666f66fbb8d3fef9", size = 2291657 }, + { url = "https://files.pythonhosted.org/packages/3b/ad/285c556747d34c399f332ba7c1a595ba245796ef3e22eae190f5364bb62b/pillow-11.1.0-cp313-cp313-win_amd64.whl", hash = "sha256:593c5fd6be85da83656b93ffcccc2312d2d149d251e98588b14fbc288fd8909c", size = 2626304 }, + { url = "https://files.pythonhosted.org/packages/e5/7b/ef35a71163bf36db06e9c8729608f78dedf032fc8313d19bd4be5c2588f3/pillow-11.1.0-cp313-cp313-win_arm64.whl", hash = "sha256:11633d58b6ee5733bde153a8dafd25e505ea3d32e261accd388827ee987baf65", size = 2375117 }, + { url = "https://files.pythonhosted.org/packages/79/30/77f54228401e84d6791354888549b45824ab0ffde659bafa67956303a09f/pillow-11.1.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:70ca5ef3b3b1c4a0812b5c63c57c23b63e53bc38e758b37a951e5bc466449861", size = 3230060 }, + { url = "https://files.pythonhosted.org/packages/ce/b1/56723b74b07dd64c1010fee011951ea9c35a43d8020acd03111f14298225/pillow-11.1.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:8000376f139d4d38d6851eb149b321a52bb8893a88dae8ee7d95840431977081", size = 3106192 }, + { url = "https://files.pythonhosted.org/packages/e1/cd/7bf7180e08f80a4dcc6b4c3a0aa9e0b0ae57168562726a05dc8aa8fa66b0/pillow-11.1.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9ee85f0696a17dd28fbcfceb59f9510aa71934b483d1f5601d1030c3c8304f3c", size = 4446805 }, + { url = "https://files.pythonhosted.org/packages/97/42/87c856ea30c8ed97e8efbe672b58c8304dee0573f8c7cab62ae9e31db6ae/pillow-11.1.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:dd0e081319328928531df7a0e63621caf67652c8464303fd102141b785ef9547", size = 4530623 }, + { url = "https://files.pythonhosted.org/packages/ff/41/026879e90c84a88e33fb00cc6bd915ac2743c67e87a18f80270dfe3c2041/pillow-11.1.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:e63e4e5081de46517099dc30abe418122f54531a6ae2ebc8680bcd7096860eab", size = 4465191 }, + { url = "https://files.pythonhosted.org/packages/e5/fb/a7960e838bc5df57a2ce23183bfd2290d97c33028b96bde332a9057834d3/pillow-11.1.0-cp313-cp313t-win32.whl", hash = "sha256:dda60aa465b861324e65a78c9f5cf0f4bc713e4309f83bc387be158b077963d9", size = 2295494 }, + { url = "https://files.pythonhosted.org/packages/d7/6c/6ec83ee2f6f0fda8d4cf89045c6be4b0373ebfc363ba8538f8c999f63fcd/pillow-11.1.0-cp313-cp313t-win_amd64.whl", hash = "sha256:ad5db5781c774ab9a9b2c4302bbf0c1014960a0a7be63278d13ae6fdf88126fe", size = 2631595 }, + { url = 
"https://files.pythonhosted.org/packages/cf/6c/41c21c6c8af92b9fea313aa47c75de49e2f9a467964ee33eb0135d47eb64/pillow-11.1.0-cp313-cp313t-win_arm64.whl", hash = "sha256:67cd427c68926108778a9005f2a04adbd5e67c442ed21d95389fe1d595458756", size = 2377651 }, + { url = "https://files.pythonhosted.org/packages/9a/1f/9df5ac77491fddd2e36c352d16976dc11fbe6ab842f5df85fd7e31b847b9/pillow-11.1.0-cp39-cp39-macosx_10_10_x86_64.whl", hash = "sha256:bf902d7413c82a1bfa08b06a070876132a5ae6b2388e2712aab3a7cbc02205c6", size = 3229995 }, + { url = "https://files.pythonhosted.org/packages/a6/62/c7b359e924dca274173b04922ac06aa63614f7e934d132f2fe1d852509aa/pillow-11.1.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c1eec9d950b6fe688edee07138993e54ee4ae634c51443cfb7c1e7613322718e", size = 3101890 }, + { url = "https://files.pythonhosted.org/packages/7b/63/136f21340a434de895b62bcf2c386005a8aa24066c4facd619f5e0e9f283/pillow-11.1.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8e275ee4cb11c262bd108ab2081f750db2a1c0b8c12c1897f27b160c8bd57bbc", size = 4310366 }, + { url = "https://files.pythonhosted.org/packages/f6/46/0bd0ca03d9d1164a7fa33d285ef6d1c438e963d0c8770e4c5b3737ef5abe/pillow-11.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4db853948ce4e718f2fc775b75c37ba2efb6aaea41a1a5fc57f0af59eee774b2", size = 4391582 }, + { url = "https://files.pythonhosted.org/packages/0c/55/f182db572b28bd833b8e806f933f782ceb2df64c40e4d8bd3d4226a46eca/pillow-11.1.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:ab8a209b8485d3db694fa97a896d96dd6533d63c22829043fd9de627060beade", size = 4350278 }, + { url = "https://files.pythonhosted.org/packages/75/fb/e330fdbbcbc4744214b5f53b84d9d8a9f4ffbebc2e9c2ac10475386e3296/pillow-11.1.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:54251ef02a2309b5eec99d151ebf5c9904b77976c8abdcbce7891ed22df53884", size = 4471768 }, + { url = "https://files.pythonhosted.org/packages/eb/51/20ee6c4da4448d7a67ffb720a5fcdb965115a78e211a1f58f9845ae15f86/pillow-11.1.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:5bb94705aea800051a743aa4874bb1397d4695fb0583ba5e425ee0328757f196", size = 4276549 }, + { url = "https://files.pythonhosted.org/packages/37/f2/a25c0bdaa6d6fd5cc3d4a6f65b5a7ea46e7af58bee00a98efe0a5af79c58/pillow-11.1.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:89dbdb3e6e9594d512780a5a1c42801879628b38e3efc7038094430844e271d8", size = 4409350 }, + { url = "https://files.pythonhosted.org/packages/12/a7/06687947604cd3e47abeea1b78b65d34ffce7feab03cfe0dd985f115dca3/pillow-11.1.0-cp39-cp39-win32.whl", hash = "sha256:e5449ca63da169a2e6068dd0e2fcc8d91f9558aba89ff6d02121ca8ab11e79e5", size = 2291271 }, + { url = "https://files.pythonhosted.org/packages/21/a6/f51d47675940b5c63b08ff0575b3518428b4acb891f88526fa4ee1edab6f/pillow-11.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:3362c6ca227e65c54bf71a5f88b3d4565ff1bcbc63ae72c34b07bbb1cc59a43f", size = 2625783 }, + { url = "https://files.pythonhosted.org/packages/95/56/97750bd33e68648fa432dfadcb8ede7624bd905822d42262d34bcebdd9d7/pillow-11.1.0-cp39-cp39-win_arm64.whl", hash = "sha256:b20be51b37a75cc54c2c55def3fa2c65bb94ba859dde241cd0a4fd302de5ae0a", size = 2375193 }, + { url = "https://files.pythonhosted.org/packages/fa/c5/389961578fb677b8b3244fcd934f720ed25a148b9a5cc81c91bdf59d8588/pillow-11.1.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:8c730dc3a83e5ac137fbc92dfcfe1511ce3b2b5d7578315b63dbbb76f7f51d90", size = 3198345 }, + { url = 
"https://files.pythonhosted.org/packages/c4/fa/803c0e50ffee74d4b965229e816af55276eac1d5806712de86f9371858fd/pillow-11.1.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:7d33d2fae0e8b170b6a6c57400e077412240f6f5bb2a342cf1ee512a787942bb", size = 3072938 }, + { url = "https://files.pythonhosted.org/packages/dc/67/2a3a5f8012b5d8c63fe53958ba906c1b1d0482ebed5618057ef4d22f8076/pillow-11.1.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a8d65b38173085f24bc07f8b6c505cbb7418009fa1a1fcb111b1f4961814a442", size = 3400049 }, + { url = "https://files.pythonhosted.org/packages/e5/a0/514f0d317446c98c478d1872497eb92e7cde67003fed74f696441e647446/pillow-11.1.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:015c6e863faa4779251436db398ae75051469f7c903b043a48f078e437656f83", size = 3422431 }, + { url = "https://files.pythonhosted.org/packages/cd/00/20f40a935514037b7d3f87adfc87d2c538430ea625b63b3af8c3f5578e72/pillow-11.1.0-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:d44ff19eea13ae4acdaaab0179fa68c0c6f2f45d66a4d8ec1eda7d6cecbcc15f", size = 3446208 }, + { url = "https://files.pythonhosted.org/packages/28/3c/7de681727963043e093c72e6c3348411b0185eab3263100d4490234ba2f6/pillow-11.1.0-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:d3d8da4a631471dfaf94c10c85f5277b1f8e42ac42bade1ac67da4b4a7359b73", size = 3509746 }, + { url = "https://files.pythonhosted.org/packages/41/67/936f9814bdd74b2dfd4822f1f7725ab5d8ff4103919a1664eb4874c58b2f/pillow-11.1.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:4637b88343166249fe8aa94e7c4a62a180c4b3898283bb5d3d2fd5fe10d8e4e0", size = 2626353 }, +] + +[[package]] +name = "pluggy" +version = "1.5.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/96/2d/02d4312c973c6050a18b314a5ad0b3210edb65a906f868e31c111dede4a6/pluggy-1.5.0.tar.gz", hash = "sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1", size = 67955 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/88/5f/e351af9a41f866ac3f1fac4ca0613908d9a41741cfcf2228f4ad853b697d/pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669", size = 20556 }, +] + +[[package]] +name = "prometheus-client" +version = "0.21.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/62/14/7d0f567991f3a9af8d1cd4f619040c93b68f09a02b6d0b6ab1b2d1ded5fe/prometheus_client-0.21.1.tar.gz", hash = "sha256:252505a722ac04b0456be05c05f75f45d760c2911ffc45f2a06bcaed9f3ae3fb", size = 78551 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ff/c2/ab7d37426c179ceb9aeb109a85cda8948bb269b7561a0be870cc656eefe4/prometheus_client-0.21.1-py3-none-any.whl", hash = "sha256:594b45c410d6f4f8888940fe80b5cc2521b305a1fafe1c58609ef715a001f301", size = 54682 }, +] + +[[package]] +name = "propcache" +version = "0.2.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/20/c8/2a13f78d82211490855b2fb303b6721348d0787fdd9a12ac46d99d3acde1/propcache-0.2.1.tar.gz", hash = "sha256:3f77ce728b19cb537714499928fe800c3dda29e8d9428778fc7c186da4c09a64", size = 41735 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a7/a5/0ea64c9426959ef145a938e38c832fc551843481d356713ececa9a8a64e8/propcache-0.2.1-cp310-cp310-macosx_10_9_universal2.whl", hash = 
"sha256:6b3f39a85d671436ee3d12c017f8fdea38509e4f25b28eb25877293c98c243f6", size = 79296 }, + { url = "https://files.pythonhosted.org/packages/76/5a/916db1aba735f55e5eca4733eea4d1973845cf77dfe67c2381a2ca3ce52d/propcache-0.2.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:39d51fbe4285d5db5d92a929e3e21536ea3dd43732c5b177c7ef03f918dff9f2", size = 45622 }, + { url = "https://files.pythonhosted.org/packages/2d/62/685d3cf268b8401ec12b250b925b21d152b9d193b7bffa5fdc4815c392c2/propcache-0.2.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:6445804cf4ec763dc70de65a3b0d9954e868609e83850a47ca4f0cb64bd79fea", size = 45133 }, + { url = "https://files.pythonhosted.org/packages/4d/3d/31c9c29ee7192defc05aa4d01624fd85a41cf98e5922aaed206017329944/propcache-0.2.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f9479aa06a793c5aeba49ce5c5692ffb51fcd9a7016e017d555d5e2b0045d212", size = 204809 }, + { url = "https://files.pythonhosted.org/packages/10/a1/e4050776f4797fc86140ac9a480d5dc069fbfa9d499fe5c5d2fa1ae71f07/propcache-0.2.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d9631c5e8b5b3a0fda99cb0d29c18133bca1e18aea9effe55adb3da1adef80d3", size = 219109 }, + { url = "https://files.pythonhosted.org/packages/c9/c0/e7ae0df76343d5e107d81e59acc085cea5fd36a48aa53ef09add7503e888/propcache-0.2.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3156628250f46a0895f1f36e1d4fbe062a1af8718ec3ebeb746f1d23f0c5dc4d", size = 217368 }, + { url = "https://files.pythonhosted.org/packages/fc/e1/e0a2ed6394b5772508868a977d3238f4afb2eebaf9976f0b44a8d347ad63/propcache-0.2.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6b6fb63ae352e13748289f04f37868099e69dba4c2b3e271c46061e82c745634", size = 205124 }, + { url = "https://files.pythonhosted.org/packages/50/c1/e388c232d15ca10f233c778bbdc1034ba53ede14c207a72008de45b2db2e/propcache-0.2.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:887d9b0a65404929641a9fabb6452b07fe4572b269d901d622d8a34a4e9043b2", size = 195463 }, + { url = "https://files.pythonhosted.org/packages/0a/fd/71b349b9def426cc73813dbd0f33e266de77305e337c8c12bfb0a2a82bfb/propcache-0.2.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:a96dc1fa45bd8c407a0af03b2d5218392729e1822b0c32e62c5bf7eeb5fb3958", size = 198358 }, + { url = "https://files.pythonhosted.org/packages/02/f2/d7c497cd148ebfc5b0ae32808e6c1af5922215fe38c7a06e4e722fe937c8/propcache-0.2.1-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:a7e65eb5c003a303b94aa2c3852ef130230ec79e349632d030e9571b87c4698c", size = 195560 }, + { url = "https://files.pythonhosted.org/packages/bb/57/f37041bbe5e0dfed80a3f6be2612a3a75b9cfe2652abf2c99bef3455bbad/propcache-0.2.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:999779addc413181912e984b942fbcc951be1f5b3663cd80b2687758f434c583", size = 196895 }, + { url = "https://files.pythonhosted.org/packages/83/36/ae3cc3e4f310bff2f064e3d2ed5558935cc7778d6f827dce74dcfa125304/propcache-0.2.1-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:19a0f89a7bb9d8048d9c4370c9c543c396e894c76be5525f5e1ad287f1750ddf", size = 207124 }, + { url = "https://files.pythonhosted.org/packages/8c/c4/811b9f311f10ce9d31a32ff14ce58500458443627e4df4ae9c264defba7f/propcache-0.2.1-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:1ac2f5fe02fa75f56e1ad473f1175e11f475606ec9bd0be2e78e4734ad575034", size = 210442 }, + { url = 
"https://files.pythonhosted.org/packages/18/dd/a1670d483a61ecac0d7fc4305d91caaac7a8fc1b200ea3965a01cf03bced/propcache-0.2.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:574faa3b79e8ebac7cb1d7930f51184ba1ccf69adfdec53a12f319a06030a68b", size = 203219 }, + { url = "https://files.pythonhosted.org/packages/f9/2d/30ced5afde41b099b2dc0c6573b66b45d16d73090e85655f1a30c5a24e07/propcache-0.2.1-cp310-cp310-win32.whl", hash = "sha256:03ff9d3f665769b2a85e6157ac8b439644f2d7fd17615a82fa55739bc97863f4", size = 40313 }, + { url = "https://files.pythonhosted.org/packages/23/84/bd9b207ac80da237af77aa6e153b08ffa83264b1c7882495984fcbfcf85c/propcache-0.2.1-cp310-cp310-win_amd64.whl", hash = "sha256:2d3af2e79991102678f53e0dbf4c35de99b6b8b58f29a27ca0325816364caaba", size = 44428 }, + { url = "https://files.pythonhosted.org/packages/bc/0f/2913b6791ebefb2b25b4efd4bb2299c985e09786b9f5b19184a88e5778dd/propcache-0.2.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:1ffc3cca89bb438fb9c95c13fc874012f7b9466b89328c3c8b1aa93cdcfadd16", size = 79297 }, + { url = "https://files.pythonhosted.org/packages/cf/73/af2053aeccd40b05d6e19058419ac77674daecdd32478088b79375b9ab54/propcache-0.2.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f174bbd484294ed9fdf09437f889f95807e5f229d5d93588d34e92106fbf6717", size = 45611 }, + { url = "https://files.pythonhosted.org/packages/3c/09/8386115ba7775ea3b9537730e8cf718d83bbf95bffe30757ccf37ec4e5da/propcache-0.2.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:70693319e0b8fd35dd863e3e29513875eb15c51945bf32519ef52927ca883bc3", size = 45146 }, + { url = "https://files.pythonhosted.org/packages/03/7a/793aa12f0537b2e520bf09f4c6833706b63170a211ad042ca71cbf79d9cb/propcache-0.2.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b480c6a4e1138e1aa137c0079b9b6305ec6dcc1098a8ca5196283e8a49df95a9", size = 232136 }, + { url = "https://files.pythonhosted.org/packages/f1/38/b921b3168d72111769f648314100558c2ea1d52eb3d1ba7ea5c4aa6f9848/propcache-0.2.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d27b84d5880f6d8aa9ae3edb253c59d9f6642ffbb2c889b78b60361eed449787", size = 239706 }, + { url = "https://files.pythonhosted.org/packages/14/29/4636f500c69b5edea7786db3c34eb6166f3384b905665ce312a6e42c720c/propcache-0.2.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:857112b22acd417c40fa4595db2fe28ab900c8c5fe4670c7989b1c0230955465", size = 238531 }, + { url = "https://files.pythonhosted.org/packages/85/14/01fe53580a8e1734ebb704a3482b7829a0ef4ea68d356141cf0994d9659b/propcache-0.2.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cf6c4150f8c0e32d241436526f3c3f9cbd34429492abddbada2ffcff506c51af", size = 231063 }, + { url = "https://files.pythonhosted.org/packages/33/5c/1d961299f3c3b8438301ccfbff0143b69afcc30c05fa28673cface692305/propcache-0.2.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:66d4cfda1d8ed687daa4bc0274fcfd5267873db9a5bc0418c2da19273040eeb7", size = 220134 }, + { url = "https://files.pythonhosted.org/packages/00/d0/ed735e76db279ba67a7d3b45ba4c654e7b02bc2f8050671ec365d8665e21/propcache-0.2.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c2f992c07c0fca81655066705beae35fc95a2fa7366467366db627d9f2ee097f", size = 220009 }, + { url = "https://files.pythonhosted.org/packages/75/90/ee8fab7304ad6533872fee982cfff5a53b63d095d78140827d93de22e2d4/propcache-0.2.1-cp311-cp311-musllinux_1_2_armv7l.whl", hash = 
"sha256:4a571d97dbe66ef38e472703067021b1467025ec85707d57e78711c085984e54", size = 212199 }, + { url = "https://files.pythonhosted.org/packages/eb/ec/977ffaf1664f82e90737275873461695d4c9407d52abc2f3c3e24716da13/propcache-0.2.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:bb6178c241278d5fe853b3de743087be7f5f4c6f7d6d22a3b524d323eecec505", size = 214827 }, + { url = "https://files.pythonhosted.org/packages/57/48/031fb87ab6081764054821a71b71942161619549396224cbb242922525e8/propcache-0.2.1-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:ad1af54a62ffe39cf34db1aa6ed1a1873bd548f6401db39d8e7cd060b9211f82", size = 228009 }, + { url = "https://files.pythonhosted.org/packages/1a/06/ef1390f2524850838f2390421b23a8b298f6ce3396a7cc6d39dedd4047b0/propcache-0.2.1-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:e7048abd75fe40712005bcfc06bb44b9dfcd8e101dda2ecf2f5aa46115ad07ca", size = 231638 }, + { url = "https://files.pythonhosted.org/packages/38/2a/101e6386d5a93358395da1d41642b79c1ee0f3b12e31727932b069282b1d/propcache-0.2.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:160291c60081f23ee43d44b08a7e5fb76681221a8e10b3139618c5a9a291b84e", size = 222788 }, + { url = "https://files.pythonhosted.org/packages/db/81/786f687951d0979007e05ad9346cd357e50e3d0b0f1a1d6074df334b1bbb/propcache-0.2.1-cp311-cp311-win32.whl", hash = "sha256:819ce3b883b7576ca28da3861c7e1a88afd08cc8c96908e08a3f4dd64a228034", size = 40170 }, + { url = "https://files.pythonhosted.org/packages/cf/59/7cc7037b295d5772eceb426358bb1b86e6cab4616d971bd74275395d100d/propcache-0.2.1-cp311-cp311-win_amd64.whl", hash = "sha256:edc9fc7051e3350643ad929df55c451899bb9ae6d24998a949d2e4c87fb596d3", size = 44404 }, + { url = "https://files.pythonhosted.org/packages/4c/28/1d205fe49be8b1b4df4c50024e62480a442b1a7b818e734308bb0d17e7fb/propcache-0.2.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:081a430aa8d5e8876c6909b67bd2d937bfd531b0382d3fdedb82612c618bc41a", size = 79588 }, + { url = "https://files.pythonhosted.org/packages/21/ee/fc4d893f8d81cd4971affef2a6cb542b36617cd1d8ce56b406112cb80bf7/propcache-0.2.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:d2ccec9ac47cf4e04897619c0e0c1a48c54a71bdf045117d3a26f80d38ab1fb0", size = 45825 }, + { url = "https://files.pythonhosted.org/packages/4a/de/bbe712f94d088da1d237c35d735f675e494a816fd6f54e9db2f61ef4d03f/propcache-0.2.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:14d86fe14b7e04fa306e0c43cdbeebe6b2c2156a0c9ce56b815faacc193e320d", size = 45357 }, + { url = "https://files.pythonhosted.org/packages/7f/14/7ae06a6cf2a2f1cb382586d5a99efe66b0b3d0c6f9ac2f759e6f7af9d7cf/propcache-0.2.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:049324ee97bb67285b49632132db351b41e77833678432be52bdd0289c0e05e4", size = 241869 }, + { url = "https://files.pythonhosted.org/packages/cc/59/227a78be960b54a41124e639e2c39e8807ac0c751c735a900e21315f8c2b/propcache-0.2.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1cd9a1d071158de1cc1c71a26014dcdfa7dd3d5f4f88c298c7f90ad6f27bb46d", size = 247884 }, + { url = "https://files.pythonhosted.org/packages/84/58/f62b4ffaedf88dc1b17f04d57d8536601e4e030feb26617228ef930c3279/propcache-0.2.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:98110aa363f1bb4c073e8dcfaefd3a5cea0f0834c2aab23dda657e4dab2f53b5", size = 248486 }, + { url = 
"https://files.pythonhosted.org/packages/1c/07/ebe102777a830bca91bbb93e3479cd34c2ca5d0361b83be9dbd93104865e/propcache-0.2.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:647894f5ae99c4cf6bb82a1bb3a796f6e06af3caa3d32e26d2350d0e3e3faf24", size = 243649 }, + { url = "https://files.pythonhosted.org/packages/ed/bc/4f7aba7f08f520376c4bb6a20b9a981a581b7f2e385fa0ec9f789bb2d362/propcache-0.2.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bfd3223c15bebe26518d58ccf9a39b93948d3dcb3e57a20480dfdd315356baff", size = 229103 }, + { url = "https://files.pythonhosted.org/packages/fe/d5/04ac9cd4e51a57a96f78795e03c5a0ddb8f23ec098b86f92de028d7f2a6b/propcache-0.2.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:d71264a80f3fcf512eb4f18f59423fe82d6e346ee97b90625f283df56aee103f", size = 226607 }, + { url = "https://files.pythonhosted.org/packages/e3/f0/24060d959ea41d7a7cc7fdbf68b31852331aabda914a0c63bdb0e22e96d6/propcache-0.2.1-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:e73091191e4280403bde6c9a52a6999d69cdfde498f1fdf629105247599b57ec", size = 221153 }, + { url = "https://files.pythonhosted.org/packages/77/a7/3ac76045a077b3e4de4859a0753010765e45749bdf53bd02bc4d372da1a0/propcache-0.2.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:3935bfa5fede35fb202c4b569bb9c042f337ca4ff7bd540a0aa5e37131659348", size = 222151 }, + { url = "https://files.pythonhosted.org/packages/e7/af/5e29da6f80cebab3f5a4dcd2a3240e7f56f2c4abf51cbfcc99be34e17f0b/propcache-0.2.1-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:f508b0491767bb1f2b87fdfacaba5f7eddc2f867740ec69ece6d1946d29029a6", size = 233812 }, + { url = "https://files.pythonhosted.org/packages/8c/89/ebe3ad52642cc5509eaa453e9f4b94b374d81bae3265c59d5c2d98efa1b4/propcache-0.2.1-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:1672137af7c46662a1c2be1e8dc78cb6d224319aaa40271c9257d886be4363a6", size = 238829 }, + { url = "https://files.pythonhosted.org/packages/e9/2f/6b32f273fa02e978b7577159eae7471b3cfb88b48563b1c2578b2d7ca0bb/propcache-0.2.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:b74c261802d3d2b85c9df2dfb2fa81b6f90deeef63c2db9f0e029a3cac50b518", size = 230704 }, + { url = "https://files.pythonhosted.org/packages/5c/2e/f40ae6ff5624a5f77edd7b8359b208b5455ea113f68309e2b00a2e1426b6/propcache-0.2.1-cp312-cp312-win32.whl", hash = "sha256:d09c333d36c1409d56a9d29b3a1b800a42c76a57a5a8907eacdbce3f18768246", size = 40050 }, + { url = "https://files.pythonhosted.org/packages/3b/77/a92c3ef994e47180862b9d7d11e37624fb1c00a16d61faf55115d970628b/propcache-0.2.1-cp312-cp312-win_amd64.whl", hash = "sha256:c214999039d4f2a5b2073ac506bba279945233da8c786e490d411dfc30f855c1", size = 44117 }, + { url = "https://files.pythonhosted.org/packages/0f/2a/329e0547cf2def8857157f9477669043e75524cc3e6251cef332b3ff256f/propcache-0.2.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:aca405706e0b0a44cc6bfd41fbe89919a6a56999157f6de7e182a990c36e37bc", size = 77002 }, + { url = "https://files.pythonhosted.org/packages/12/2d/c4df5415e2382f840dc2ecbca0eeb2293024bc28e57a80392f2012b4708c/propcache-0.2.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:12d1083f001ace206fe34b6bdc2cb94be66d57a850866f0b908972f90996b3e9", size = 44639 }, + { url = "https://files.pythonhosted.org/packages/d0/5a/21aaa4ea2f326edaa4e240959ac8b8386ea31dedfdaa636a3544d9e7a408/propcache-0.2.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:d93f3307ad32a27bda2e88ec81134b823c240aa3abb55821a8da553eed8d9439", 
size = 44049 }, + { url = "https://files.pythonhosted.org/packages/4e/3e/021b6cd86c0acc90d74784ccbb66808b0bd36067a1bf3e2deb0f3845f618/propcache-0.2.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ba278acf14471d36316159c94a802933d10b6a1e117b8554fe0d0d9b75c9d536", size = 224819 }, + { url = "https://files.pythonhosted.org/packages/3c/57/c2fdeed1b3b8918b1770a133ba5c43ad3d78e18285b0c06364861ef5cc38/propcache-0.2.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4e6281aedfca15301c41f74d7005e6e3f4ca143584ba696ac69df4f02f40d629", size = 229625 }, + { url = "https://files.pythonhosted.org/packages/9d/81/70d4ff57bf2877b5780b466471bebf5892f851a7e2ca0ae7ffd728220281/propcache-0.2.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5b750a8e5a1262434fb1517ddf64b5de58327f1adc3524a5e44c2ca43305eb0b", size = 232934 }, + { url = "https://files.pythonhosted.org/packages/3c/b9/bb51ea95d73b3fb4100cb95adbd4e1acaf2cbb1fd1083f5468eeb4a099a8/propcache-0.2.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bf72af5e0fb40e9babf594308911436c8efde3cb5e75b6f206c34ad18be5c052", size = 227361 }, + { url = "https://files.pythonhosted.org/packages/f1/20/3c6d696cd6fd70b29445960cc803b1851a1131e7a2e4ee261ee48e002bcd/propcache-0.2.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b2d0a12018b04f4cb820781ec0dffb5f7c7c1d2a5cd22bff7fb055a2cb19ebce", size = 213904 }, + { url = "https://files.pythonhosted.org/packages/a1/cb/1593bfc5ac6d40c010fa823f128056d6bc25b667f5393781e37d62f12005/propcache-0.2.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:e800776a79a5aabdb17dcc2346a7d66d0777e942e4cd251defeb084762ecd17d", size = 212632 }, + { url = "https://files.pythonhosted.org/packages/6d/5c/e95617e222be14a34c709442a0ec179f3207f8a2b900273720501a70ec5e/propcache-0.2.1-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:4160d9283bd382fa6c0c2b5e017acc95bc183570cd70968b9202ad6d8fc48dce", size = 207897 }, + { url = "https://files.pythonhosted.org/packages/8e/3b/56c5ab3dc00f6375fbcdeefdede5adf9bee94f1fab04adc8db118f0f9e25/propcache-0.2.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:30b43e74f1359353341a7adb783c8f1b1c676367b011709f466f42fda2045e95", size = 208118 }, + { url = "https://files.pythonhosted.org/packages/86/25/d7ef738323fbc6ebcbce33eb2a19c5e07a89a3df2fded206065bd5e868a9/propcache-0.2.1-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:58791550b27d5488b1bb52bc96328456095d96206a250d28d874fafe11b3dfaf", size = 217851 }, + { url = "https://files.pythonhosted.org/packages/b3/77/763e6cef1852cf1ba740590364ec50309b89d1c818e3256d3929eb92fabf/propcache-0.2.1-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:0f022d381747f0dfe27e99d928e31bc51a18b65bb9e481ae0af1380a6725dd1f", size = 222630 }, + { url = "https://files.pythonhosted.org/packages/4f/e9/0f86be33602089c701696fbed8d8c4c07b6ee9605c5b7536fd27ed540c5b/propcache-0.2.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:297878dc9d0a334358f9b608b56d02e72899f3b8499fc6044133f0d319e2ec30", size = 216269 }, + { url = "https://files.pythonhosted.org/packages/cc/02/5ac83217d522394b6a2e81a2e888167e7ca629ef6569a3f09852d6dcb01a/propcache-0.2.1-cp313-cp313-win32.whl", hash = "sha256:ddfab44e4489bd79bda09d84c430677fc7f0a4939a73d2bba3073036f487a0a6", size = 39472 }, + { url = 
"https://files.pythonhosted.org/packages/f4/33/d6f5420252a36034bc8a3a01171bc55b4bff5df50d1c63d9caa50693662f/propcache-0.2.1-cp313-cp313-win_amd64.whl", hash = "sha256:556fc6c10989f19a179e4321e5d678db8eb2924131e64652a51fe83e4c3db0e1", size = 43363 }, + { url = "https://files.pythonhosted.org/packages/0a/08/6ab7f65240a16fa01023125e65258acf7e4884f483f267cdd6fcc48f37db/propcache-0.2.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:6a9a8c34fb7bb609419a211e59da8887eeca40d300b5ea8e56af98f6fbbb1541", size = 80403 }, + { url = "https://files.pythonhosted.org/packages/34/fe/e7180285e21b4e6dff7d311fdf22490c9146a09a02834b5232d6248c6004/propcache-0.2.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:ae1aa1cd222c6d205853b3013c69cd04515f9d6ab6de4b0603e2e1c33221303e", size = 46152 }, + { url = "https://files.pythonhosted.org/packages/9c/36/aa74d884af826030ba9cee2ac109b0664beb7e9449c315c9c44db99efbb3/propcache-0.2.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:accb6150ce61c9c4b7738d45550806aa2b71c7668c6942f17b0ac182b6142fd4", size = 45674 }, + { url = "https://files.pythonhosted.org/packages/22/59/6fe80a3fe7720f715f2c0f6df250dacbd7cad42832410dbd84c719c52f78/propcache-0.2.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5eee736daafa7af6d0a2dc15cc75e05c64f37fc37bafef2e00d77c14171c2097", size = 207792 }, + { url = "https://files.pythonhosted.org/packages/4a/68/584cd51dd8f4d0f5fff5b128ce0cdb257cde903898eecfb92156bbc2c780/propcache-0.2.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f7a31fc1e1bd362874863fdeed71aed92d348f5336fd84f2197ba40c59f061bd", size = 223280 }, + { url = "https://files.pythonhosted.org/packages/85/cb/4c3528460c41e61b06ec3f970c0f89f87fa21f63acac8642ed81a886c164/propcache-0.2.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cba4cfa1052819d16699e1d55d18c92b6e094d4517c41dd231a8b9f87b6fa681", size = 221293 }, + { url = "https://files.pythonhosted.org/packages/69/c0/560e050aa6d31eeece3490d1174da508f05ab27536dfc8474af88b97160a/propcache-0.2.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f089118d584e859c62b3da0892b88a83d611c2033ac410e929cb6754eec0ed16", size = 208259 }, + { url = "https://files.pythonhosted.org/packages/0c/87/d6c86a77632eb1ba86a328e3313159f246e7564cb5951e05ed77555826a0/propcache-0.2.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:781e65134efaf88feb447e8c97a51772aa75e48b794352f94cb7ea717dedda0d", size = 198632 }, + { url = "https://files.pythonhosted.org/packages/3a/2b/3690ea7b662dc762ab7af5f3ef0e2d7513c823d193d7b2a1b4cda472c2be/propcache-0.2.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:31f5af773530fd3c658b32b6bdc2d0838543de70eb9a2156c03e410f7b0d3aae", size = 203516 }, + { url = "https://files.pythonhosted.org/packages/4d/b5/afe716c16c23c77657185c257a41918b83e03993b6ccdfa748e5e7d328e9/propcache-0.2.1-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:a7a078f5d37bee6690959c813977da5291b24286e7b962e62a94cec31aa5188b", size = 199402 }, + { url = "https://files.pythonhosted.org/packages/a4/c0/2d2df3aa7f8660d0d4cc4f1e00490c48d5958da57082e70dea7af366f876/propcache-0.2.1-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:cea7daf9fc7ae6687cf1e2c049752f19f146fdc37c2cc376e7d0032cf4f25347", size = 200528 }, + { url = "https://files.pythonhosted.org/packages/21/c8/65ac9142f5e40c8497f7176e71d18826b09e06dd4eb401c9a4ee41aa9c74/propcache-0.2.1-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = 
"sha256:8b3489ff1ed1e8315674d0775dc7d2195fb13ca17b3808721b54dbe9fd020faf", size = 211254 }, + { url = "https://files.pythonhosted.org/packages/09/e4/edb70b447a1d8142df51ec7511e84aa64d7f6ce0a0fdf5eb55363cdd0935/propcache-0.2.1-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:9403db39be1393618dd80c746cb22ccda168efce239c73af13c3763ef56ffc04", size = 214589 }, + { url = "https://files.pythonhosted.org/packages/cb/02/817f309ec8d8883287781d6d9390f80b14db6e6de08bc659dfe798a825c2/propcache-0.2.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:5d97151bc92d2b2578ff7ce779cdb9174337390a535953cbb9452fb65164c587", size = 207283 }, + { url = "https://files.pythonhosted.org/packages/d7/fe/2d18612096ed2212cfef821b6fccdba5d52efc1d64511c206c5c16be28fd/propcache-0.2.1-cp39-cp39-win32.whl", hash = "sha256:9caac6b54914bdf41bcc91e7eb9147d331d29235a7c967c150ef5df6464fd1bb", size = 40866 }, + { url = "https://files.pythonhosted.org/packages/24/2e/b5134802e7b57c403c7b73c7a39374e7a6b7f128d1968b4a4b4c0b700250/propcache-0.2.1-cp39-cp39-win_amd64.whl", hash = "sha256:92fc4500fcb33899b05ba73276dfb684a20d31caa567b7cb5252d48f896a91b1", size = 44975 }, + { url = "https://files.pythonhosted.org/packages/41/b6/c5319caea262f4821995dca2107483b94a3345d4607ad797c76cb9c36bcc/propcache-0.2.1-py3-none-any.whl", hash = "sha256:52277518d6aae65536e9cea52d4e7fd2f7a66f4aa2d30ed3f2fcea620ace3c54", size = 11818 }, +] + +[[package]] +name = "protobuf" +version = "5.29.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f7/d1/e0a911544ca9993e0f17ce6d3cc0932752356c1b0a834397f28e63479344/protobuf-5.29.3.tar.gz", hash = "sha256:5da0f41edaf117bde316404bad1a486cb4ededf8e4a54891296f648e8e076620", size = 424945 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/dc/7a/1e38f3cafa022f477ca0f57a1f49962f21ad25850c3ca0acd3b9d0091518/protobuf-5.29.3-cp310-abi3-win32.whl", hash = "sha256:3ea51771449e1035f26069c4c7fd51fba990d07bc55ba80701c78f886bf9c888", size = 422708 }, + { url = "https://files.pythonhosted.org/packages/61/fa/aae8e10512b83de633f2646506a6d835b151edf4b30d18d73afd01447253/protobuf-5.29.3-cp310-abi3-win_amd64.whl", hash = "sha256:a4fa6f80816a9a0678429e84973f2f98cbc218cca434abe8db2ad0bffc98503a", size = 434508 }, + { url = "https://files.pythonhosted.org/packages/dd/04/3eaedc2ba17a088961d0e3bd396eac764450f431621b58a04ce898acd126/protobuf-5.29.3-cp38-abi3-macosx_10_9_universal2.whl", hash = "sha256:a8434404bbf139aa9e1300dbf989667a83d42ddda9153d8ab76e0d5dcaca484e", size = 417825 }, + { url = "https://files.pythonhosted.org/packages/4f/06/7c467744d23c3979ce250397e26d8ad8eeb2bea7b18ca12ad58313c1b8d5/protobuf-5.29.3-cp38-abi3-manylinux2014_aarch64.whl", hash = "sha256:daaf63f70f25e8689c072cfad4334ca0ac1d1e05a92fc15c54eb9cf23c3efd84", size = 319573 }, + { url = "https://files.pythonhosted.org/packages/a8/45/2ebbde52ad2be18d3675b6bee50e68cd73c9e0654de77d595540b5129df8/protobuf-5.29.3-cp38-abi3-manylinux2014_x86_64.whl", hash = "sha256:c027e08a08be10b67c06bf2370b99c811c466398c357e615ca88c91c07f0910f", size = 319672 }, + { url = "https://files.pythonhosted.org/packages/85/a6/bf65a38f8be5ab8c3b575822acfd338702fdf7ac9abd8c81630cc7c9f4bd/protobuf-5.29.3-cp39-cp39-win32.whl", hash = "sha256:0eb32bfa5219fc8d4111803e9a690658aa2e6366384fd0851064b963b6d1f2a7", size = 422676 }, + { url = "https://files.pythonhosted.org/packages/ac/e2/48d46adc86369ff092eaece3e537f76b3baaab45ca3dde257838cde831d2/protobuf-5.29.3-cp39-cp39-win_amd64.whl", hash = 
"sha256:6ce8cc3389a20693bfde6c6562e03474c40851b44975c9b2bf6df7d8c4f864da", size = 434593 }, + { url = "https://files.pythonhosted.org/packages/fd/b2/ab07b09e0f6d143dfb839693aa05765257bceaa13d03bf1a696b78323e7a/protobuf-5.29.3-py3-none-any.whl", hash = "sha256:0a18ed4a24198528f2333802eb075e59dea9d679ab7a6c5efb017a59004d849f", size = 172550 }, +] + +[[package]] +name = "psutil" +version = "6.1.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/1f/5a/07871137bb752428aa4b659f910b399ba6f291156bdea939be3e96cae7cb/psutil-6.1.1.tar.gz", hash = "sha256:cf8496728c18f2d0b45198f06895be52f36611711746b7f30c464b422b50e2f5", size = 508502 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/61/99/ca79d302be46f7bdd8321089762dd4476ee725fce16fc2b2e1dbba8cac17/psutil-6.1.1-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:fc0ed7fe2231a444fc219b9c42d0376e0a9a1a72f16c5cfa0f68d19f1a0663e8", size = 247511 }, + { url = "https://files.pythonhosted.org/packages/0b/6b/73dbde0dd38f3782905d4587049b9be64d76671042fdcaf60e2430c6796d/psutil-6.1.1-cp36-abi3-macosx_11_0_arm64.whl", hash = "sha256:0bdd4eab935276290ad3cb718e9809412895ca6b5b334f5a9111ee6d9aff9377", size = 248985 }, + { url = "https://files.pythonhosted.org/packages/17/38/c319d31a1d3f88c5b79c68b3116c129e5133f1822157dd6da34043e32ed6/psutil-6.1.1-cp36-abi3-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b6e06c20c05fe95a3d7302d74e7097756d4ba1247975ad6905441ae1b5b66003", size = 284488 }, + { url = "https://files.pythonhosted.org/packages/9c/39/0f88a830a1c8a3aba27fededc642da37613c57cbff143412e3536f89784f/psutil-6.1.1-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:97f7cb9921fbec4904f522d972f0c0e1f4fabbdd4e0287813b21215074a0f160", size = 287477 }, + { url = "https://files.pythonhosted.org/packages/47/da/99f4345d4ddf2845cb5b5bd0d93d554e84542d116934fde07a0c50bd4e9f/psutil-6.1.1-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:33431e84fee02bc84ea36d9e2c4a6d395d479c9dd9bba2376c1f6ee8f3a4e0b3", size = 289017 }, + { url = "https://files.pythonhosted.org/packages/38/53/bd755c2896f4461fd4f36fa6a6dcb66a88a9e4b9fd4e5b66a77cf9d4a584/psutil-6.1.1-cp37-abi3-win32.whl", hash = "sha256:eaa912e0b11848c4d9279a93d7e2783df352b082f40111e078388701fd479e53", size = 250602 }, + { url = "https://files.pythonhosted.org/packages/7b/d7/7831438e6c3ebbfa6e01a927127a6cb42ad3ab844247f3c5b96bea25d73d/psutil-6.1.1-cp37-abi3-win_amd64.whl", hash = "sha256:f35cfccb065fff93529d2afb4a2e89e363fe63ca1e4a5da22b603a85833c2649", size = 254444 }, +] + +[[package]] +name = "py-cpuinfo" +version = "9.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/37/a8/d832f7293ebb21690860d2e01d8115e5ff6f2ae8bbdc953f0eb0fa4bd2c7/py-cpuinfo-9.0.0.tar.gz", hash = "sha256:3cdbbf3fac90dc6f118bfd64384f309edeadd902d7c8fb17f02ffa1fc3f49690", size = 104716 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e0/a9/023730ba63db1e494a271cb018dcd361bd2c917ba7004c3e49d5daf795a2/py_cpuinfo-9.0.0-py3-none-any.whl", hash = "sha256:859625bc251f64e21f077d099d4162689c762b5d6a4c3c97553d56241c9674d5", size = 22335 }, +] + +[[package]] +name = "pyarrow" +version = "19.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/7b/01/fe1fd04744c2aa038e5a11c7a4adb3d62bce09798695e54f7274b5977134/pyarrow-19.0.0.tar.gz", hash = "sha256:8d47c691765cf497aaeed4954d226568563f1b3b74ff61139f2d77876717084b", size = 1129096 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/1c/02/1ad80ffd3c558916858a49c83b6e494a9d93009bbebc603cf0cb8263bea7/pyarrow-19.0.0-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:c318eda14f6627966997a7d8c374a87d084a94e4e38e9abbe97395c215830e0c", size = 30686262 }, + { url = "https://files.pythonhosted.org/packages/1b/f0/adab5f142eb8203db8bfbd3a816816e37a85423ae684567e7f3555658315/pyarrow-19.0.0-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:62ef8360ff256e960f57ce0299090fb86423afed5e46f18f1225f960e05aae3d", size = 32100005 }, + { url = "https://files.pythonhosted.org/packages/94/8b/e674083610e5efc48d2f205c568d842cdfdf683d12f9ff0d546e38757722/pyarrow-19.0.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2795064647add0f16563e57e3d294dbfc067b723f0fd82ecd80af56dad15f503", size = 41144815 }, + { url = "https://files.pythonhosted.org/packages/d5/fb/2726241a792b7f8a58789e5a63d1be9a5a4059206318fd0ff9485a578952/pyarrow-19.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a218670b26fb1bc74796458d97bcab072765f9b524f95b2fccad70158feb8b17", size = 42180380 }, + { url = "https://files.pythonhosted.org/packages/7d/09/7aef12446d8e7002dfc07bb7bc71f594c1d5844ca78b364a49f07efb65b1/pyarrow-19.0.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:66732e39eaa2247996a6b04c8aa33e3503d351831424cdf8d2e9a0582ac54b34", size = 40515021 }, + { url = "https://files.pythonhosted.org/packages/31/55/f05fc5608cc96060c2b24de505324d641888bd62d4eed2fa1dacd872a1e1/pyarrow-19.0.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:e675a3ad4732b92d72e4d24009707e923cab76b0d088e5054914f11a797ebe44", size = 42067488 }, + { url = "https://files.pythonhosted.org/packages/f0/01/097653cec7a944c16313cb748a326771133c142034b252076bd84743b98d/pyarrow-19.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:f094742275586cdd6b1a03655ccff3b24b2610c3af76f810356c4c71d24a2a6c", size = 25276726 }, + { url = "https://files.pythonhosted.org/packages/82/42/fba3a35bef5833bf88ed35e6a810dc1781236e1d4f808d2df824a7d21819/pyarrow-19.0.0-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:8e3a839bf36ec03b4315dc924d36dcde5444a50066f1c10f8290293c0427b46a", size = 30711936 }, + { url = "https://files.pythonhosted.org/packages/88/7a/0da93a3eaaf251a30e32f3221e874263cdcd366c2cd6b7c05293aad91152/pyarrow-19.0.0-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:ce42275097512d9e4e4a39aade58ef2b3798a93aa3026566b7892177c266f735", size = 32133182 }, + { url = "https://files.pythonhosted.org/packages/2f/df/fe43b1c50d3100d0de53f988344118bc20362d0de005f8a407454fa565f8/pyarrow-19.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9348a0137568c45601b031a8d118275069435f151cbb77e6a08a27e8125f59d4", size = 41145489 }, + { url = "https://files.pythonhosted.org/packages/45/bb/6f73b41b342a0342f2516a02db4aa97a4f9569cc35482a5c288090140cd4/pyarrow-19.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2a0144a712d990d60f7f42b7a31f0acaccf4c1e43e957f7b1ad58150d6f639c1", size = 42177823 }, + { url = "https://files.pythonhosted.org/packages/23/7b/f038a96f421e453a71bd7a0f78d62b1b2ae9bcac06ed51179ca532e6a0a2/pyarrow-19.0.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = 
"sha256:2a1a109dfda558eb011e5f6385837daffd920d54ca00669f7a11132d0b1e6042", size = 40530609 }, + { url = "https://files.pythonhosted.org/packages/b8/39/a2a6714b471c000e6dd6af4495dce00d7d1332351b8e3170dfb9f91dad1f/pyarrow-19.0.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:be686bf625aa7b9bada18defb3a3ea3981c1099697239788ff111d87f04cd263", size = 42081534 }, + { url = "https://files.pythonhosted.org/packages/6c/a3/8396fb06ca05d807e89980c177be26617aad15211ece3184e0caa730b8a6/pyarrow-19.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:239ca66d9a05844bdf5af128861af525e14df3c9591bcc05bac25918e650d3a2", size = 25281090 }, + { url = "https://files.pythonhosted.org/packages/bc/2e/152885f5ef421e80dae68b9c133ab261934f93a6d5e16b61d79c0ed597fb/pyarrow-19.0.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:a7bbe7109ab6198688b7079cbad5a8c22de4d47c4880d8e4847520a83b0d1b68", size = 30667964 }, + { url = "https://files.pythonhosted.org/packages/80/c2/08bbee9a8610a47c9a1466845f405baf53a639ddd947c5133d8ba13544b6/pyarrow-19.0.0-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:4624c89d6f777c580e8732c27bb8e77fd1433b89707f17c04af7635dd9638351", size = 32125039 }, + { url = "https://files.pythonhosted.org/packages/d2/56/06994df823212f5688d3c8bf4294928b12c9be36681872853655724d28c6/pyarrow-19.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2b6d3ce4288793350dc2d08d1e184fd70631ea22a4ff9ea5c4ff182130249d9b", size = 41140729 }, + { url = "https://files.pythonhosted.org/packages/94/65/38ad577c98140a9db71e9e1e594b6adb58a7478a5afec6456a8ca2df7f70/pyarrow-19.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:450a7d27e840e4d9a384b5c77199d489b401529e75a3b7a3799d4cd7957f2f9c", size = 42202267 }, + { url = "https://files.pythonhosted.org/packages/b6/1f/966b722251a7354114ccbb71cf1a83922023e69efd8945ebf628a851ec4c/pyarrow-19.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:a08e2a8a039a3f72afb67a6668180f09fddaa38fe0d21f13212b4aba4b5d2451", size = 40505858 }, + { url = "https://files.pythonhosted.org/packages/3b/5e/6bc81aa7fc9affc7d1c03b912fbcc984ca56c2a18513684da267715dab7b/pyarrow-19.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:f43f5aef2a13d4d56adadae5720d1fed4c1356c993eda8b59dace4b5983843c1", size = 42084973 }, + { url = "https://files.pythonhosted.org/packages/53/c3/2f56da818b6a4758cbd514957c67bd0f078ebffa5390ee2e2bf0f9e8defc/pyarrow-19.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:2f672f5364b2d7829ef7c94be199bb88bf5661dd485e21d2d37de12ccb78a136", size = 25241976 }, + { url = "https://files.pythonhosted.org/packages/f5/b9/ba07ed3dd6b6e4f379b78e9c47c50c8886e07862ab7fa6339ac38622d755/pyarrow-19.0.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:cf3bf0ce511b833f7bc5f5bb3127ba731e97222023a444b7359f3a22e2a3b463", size = 30651291 }, + { url = "https://files.pythonhosted.org/packages/ad/10/0d304243c8277035298a68a70807efb76199c6c929bb3363c92ac9be6a0d/pyarrow-19.0.0-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:4d8b0c0de0a73df1f1bf439af1b60f273d719d70648e898bc077547649bb8352", size = 32100461 }, + { url = "https://files.pythonhosted.org/packages/8a/61/bcfc5182e11831bca3f849945b9b106e09fd10ded773dff466658e972a45/pyarrow-19.0.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a92aff08e23d281c69835e4a47b80569242a504095ef6a6223c1f6bb8883431d", size = 41132491 }, + { url = 
"https://files.pythonhosted.org/packages/8e/87/2915a29049ec352dc69a967fbcbd76b0180319233de0daf8bd368df37099/pyarrow-19.0.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c3b78eff5968a1889a0f3bc81ca57e1e19b75f664d9c61a42a604bf9d8402aae", size = 42192529 }, + { url = "https://files.pythonhosted.org/packages/48/18/44e5542b2707a8afaf78b5b88c608f261871ae77787eac07b7c679ca6f0f/pyarrow-19.0.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:b34d3bde38eba66190b215bae441646330f8e9da05c29e4b5dd3e41bde701098", size = 40495363 }, + { url = "https://files.pythonhosted.org/packages/ba/d6/5096deb7599bbd20bc2768058fe23bc725b88eb41bee58303293583a2935/pyarrow-19.0.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:5418d4d0fab3a0ed497bad21d17a7973aad336d66ad4932a3f5f7480d4ca0c04", size = 42074075 }, + { url = "https://files.pythonhosted.org/packages/2c/df/e3c839c04c284c9ec3d62b02a8c452b795d9b07b04079ab91ce33484d4c5/pyarrow-19.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:e82c3d5e44e969c217827b780ed8faf7ac4c53f934ae9238872e749fa531f7c9", size = 25239803 }, + { url = "https://files.pythonhosted.org/packages/6a/d3/a6d4088e906c7b5d47792256212606d2ae679046dc750eee0ae167338e5c/pyarrow-19.0.0-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:f208c3b58a6df3b239e0bb130e13bc7487ed14f39a9ff357b6415e3f6339b560", size = 30695401 }, + { url = "https://files.pythonhosted.org/packages/94/25/70040fd0e397dd1b937f459eaeeec942a76027357491dca0ada09d1322af/pyarrow-19.0.0-cp313-cp313t-macosx_12_0_x86_64.whl", hash = "sha256:c751c1c93955b7a84c06794df46f1cec93e18610dcd5ab7d08e89a81df70a849", size = 32104680 }, + { url = "https://files.pythonhosted.org/packages/4e/f9/92783290cc0d80ca16d34b0c126305bfacca4b87dd889c8f16c6ef2a8fd7/pyarrow-19.0.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b903afaa5df66d50fc38672ad095806443b05f202c792694f3a604ead7c6ea6e", size = 41076754 }, + { url = "https://files.pythonhosted.org/packages/05/46/2c9870f50a495c72e2b8982ae29a9b1680707ea936edc0de444cec48f875/pyarrow-19.0.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a22a4bc0937856263df8b94f2f2781b33dd7f876f787ed746608e06902d691a5", size = 42163133 }, + { url = "https://files.pythonhosted.org/packages/7b/2f/437922b902549228fb15814e8a26105bff2787ece466a8d886eb6699efad/pyarrow-19.0.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:5e8a28b918e2e878c918f6d89137386c06fe577cd08d73a6be8dafb317dc2d73", size = 40452210 }, + { url = "https://files.pythonhosted.org/packages/36/ef/1d7975053af9d106da973bac142d0d4da71b7550a3576cc3e0b3f444d21a/pyarrow-19.0.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:29cd86c8001a94f768f79440bf83fee23963af5e7bc68ce3a7e5f120e17edf89", size = 42077618 }, + { url = "https://files.pythonhosted.org/packages/59/13/e39417005ee632e131d0246cf5c1149618a55554ccdf2a4d887065e672a7/pyarrow-19.0.0-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:c0423393e4a07ff6fea08feb44153302dd261d0551cc3b538ea7a5dc853af43a", size = 30698254 }, + { url = "https://files.pythonhosted.org/packages/06/87/1f9d7df296dd5c065e52ae3e9070dfe611f6bd97e90f28b6a45c410dcb67/pyarrow-19.0.0-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:718947fb6d82409013a74b176bf93e0f49ef952d8a2ecd068fecd192a97885b7", size = 32114467 }, + { url = "https://files.pythonhosted.org/packages/b2/b1/9e7babf5d469bd35f1d062a04721ead72456101f6d851a2e8a43bb07a580/pyarrow-19.0.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:3c1c162c4660e0978411a4761f91113dde8da3433683efa473501254563dcbe8", size = 41157781 }, + { url = "https://files.pythonhosted.org/packages/9f/25/fa8e882a6c06e6d8dd640d3acce3912d7c39358940eb0c7b3c8b962457d0/pyarrow-19.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c73268cf557e688efb60f1ccbc7376f7e18cd8e2acae9e663e98b194c40c1a2d", size = 42190773 }, + { url = "https://files.pythonhosted.org/packages/b8/d6/7fd60aa79cada815306d9804403386b06893ef63e73876174717a62002c4/pyarrow-19.0.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:edfe6d3916e915ada9acc4e48f6dafca7efdbad2e6283db6fd9385a1b23055f1", size = 40528310 }, + { url = "https://files.pythonhosted.org/packages/0e/f1/c1ec7620a5768b8c7f9083572fda7b05b77ea083c3400fbd9e4ae40e63bd/pyarrow-19.0.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:da410b70a7ab8eb524112f037a7a35da7128b33d484f7671a264a4c224ac131d", size = 42080774 }, + { url = "https://files.pythonhosted.org/packages/c4/28/c51c9af2703b5a592d1b66546611b24de8ca01e04c3f5da769c3318bca6c/pyarrow-19.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:597360ffc71fc8cceea1aec1fb60cb510571a744fffc87db33d551d5de919bec", size = 25464978 }, +] + +[[package]] +name = "pycountry" +version = "24.6.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/76/57/c389fa68c50590881a75b7883eeb3dc15e9e73a0fdc001cdd45c13290c92/pycountry-24.6.1.tar.gz", hash = "sha256:b61b3faccea67f87d10c1f2b0fc0be714409e8fcdcc1315613174f6466c10221", size = 6043910 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b1/ec/1fb891d8a2660716aadb2143235481d15ed1cbfe3ad669194690b0604492/pycountry-24.6.1-py3-none-any.whl", hash = "sha256:f1a4fb391cd7214f8eefd39556d740adcc233c778a27f8942c8dca351d6ce06f", size = 6335189 }, +] + +[[package]] +name = "pydantic" +version = "2.10.5" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "annotated-types" }, + { name = "pydantic-core" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/6a/c7/ca334c2ef6f2e046b1144fe4bb2a5da8a4c574e7f2ebf7e16b34a6a2fa92/pydantic-2.10.5.tar.gz", hash = "sha256:278b38dbbaec562011d659ee05f63346951b3a248a6f3642e1bc68894ea2b4ff", size = 761287 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/58/26/82663c79010b28eddf29dcdd0ea723439535fa917fce5905885c0e9ba562/pydantic-2.10.5-py3-none-any.whl", hash = "sha256:4dd4e322dbe55472cb7ca7e73f4b63574eecccf2835ffa2af9021ce113c83c53", size = 431426 }, +] + +[[package]] +name = "pydantic-core" +version = "2.27.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/fc/01/f3e5ac5e7c25833db5eb555f7b7ab24cd6f8c322d3a3ad2d67a952dc0abc/pydantic_core-2.27.2.tar.gz", hash = "sha256:eb026e5a4c1fee05726072337ff51d1efb6f59090b7da90d30ea58625b1ffb39", size = 413443 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3a/bc/fed5f74b5d802cf9a03e83f60f18864e90e3aed7223adaca5ffb7a8d8d64/pydantic_core-2.27.2-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:2d367ca20b2f14095a8f4fa1210f5a7b78b8a20009ecced6b12818f455b1e9fa", size = 1895938 }, + { url = "https://files.pythonhosted.org/packages/71/2a/185aff24ce844e39abb8dd680f4e959f0006944f4a8a0ea372d9f9ae2e53/pydantic_core-2.27.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:491a2b73db93fab69731eaee494f320faa4e093dbed776be1a829c2eb222c34c", size = 1815684 }, + { url = 
"https://files.pythonhosted.org/packages/c3/43/fafabd3d94d159d4f1ed62e383e264f146a17dd4d48453319fd782e7979e/pydantic_core-2.27.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7969e133a6f183be60e9f6f56bfae753585680f3b7307a8e555a948d443cc05a", size = 1829169 }, + { url = "https://files.pythonhosted.org/packages/a2/d1/f2dfe1a2a637ce6800b799aa086d079998959f6f1215eb4497966efd2274/pydantic_core-2.27.2-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3de9961f2a346257caf0aa508a4da705467f53778e9ef6fe744c038119737ef5", size = 1867227 }, + { url = "https://files.pythonhosted.org/packages/7d/39/e06fcbcc1c785daa3160ccf6c1c38fea31f5754b756e34b65f74e99780b5/pydantic_core-2.27.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e2bb4d3e5873c37bb3dd58714d4cd0b0e6238cebc4177ac8fe878f8b3aa8e74c", size = 2037695 }, + { url = "https://files.pythonhosted.org/packages/7a/67/61291ee98e07f0650eb756d44998214231f50751ba7e13f4f325d95249ab/pydantic_core-2.27.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:280d219beebb0752699480fe8f1dc61ab6615c2046d76b7ab7ee38858de0a4e7", size = 2741662 }, + { url = "https://files.pythonhosted.org/packages/32/90/3b15e31b88ca39e9e626630b4c4a1f5a0dfd09076366f4219429e6786076/pydantic_core-2.27.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:47956ae78b6422cbd46f772f1746799cbb862de838fd8d1fbd34a82e05b0983a", size = 1993370 }, + { url = "https://files.pythonhosted.org/packages/ff/83/c06d333ee3a67e2e13e07794995c1535565132940715931c1c43bfc85b11/pydantic_core-2.27.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:14d4a5c49d2f009d62a2a7140d3064f686d17a5d1a268bc641954ba181880236", size = 1996813 }, + { url = "https://files.pythonhosted.org/packages/7c/f7/89be1c8deb6e22618a74f0ca0d933fdcb8baa254753b26b25ad3acff8f74/pydantic_core-2.27.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:337b443af21d488716f8d0b6164de833e788aa6bd7e3a39c005febc1284f4962", size = 2005287 }, + { url = "https://files.pythonhosted.org/packages/b7/7d/8eb3e23206c00ef7feee17b83a4ffa0a623eb1a9d382e56e4aa46fd15ff2/pydantic_core-2.27.2-cp310-cp310-musllinux_1_1_armv7l.whl", hash = "sha256:03d0f86ea3184a12f41a2d23f7ccb79cdb5a18e06993f8a45baa8dfec746f0e9", size = 2128414 }, + { url = "https://files.pythonhosted.org/packages/4e/99/fe80f3ff8dd71a3ea15763878d464476e6cb0a2db95ff1c5c554133b6b83/pydantic_core-2.27.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:7041c36f5680c6e0f08d922aed302e98b3745d97fe1589db0a3eebf6624523af", size = 2155301 }, + { url = "https://files.pythonhosted.org/packages/2b/a3/e50460b9a5789ca1451b70d4f52546fa9e2b420ba3bfa6100105c0559238/pydantic_core-2.27.2-cp310-cp310-win32.whl", hash = "sha256:50a68f3e3819077be2c98110c1f9dcb3817e93f267ba80a2c05bb4f8799e2ff4", size = 1816685 }, + { url = "https://files.pythonhosted.org/packages/57/4c/a8838731cb0f2c2a39d3535376466de6049034d7b239c0202a64aaa05533/pydantic_core-2.27.2-cp310-cp310-win_amd64.whl", hash = "sha256:e0fd26b16394ead34a424eecf8a31a1f5137094cabe84a1bcb10fa6ba39d3d31", size = 1982876 }, + { url = "https://files.pythonhosted.org/packages/c2/89/f3450af9d09d44eea1f2c369f49e8f181d742f28220f88cc4dfaae91ea6e/pydantic_core-2.27.2-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:8e10c99ef58cfdf2a66fc15d66b16c4a04f62bca39db589ae8cba08bc55331bc", size = 1893421 }, + { url = 
"https://files.pythonhosted.org/packages/9e/e3/71fe85af2021f3f386da42d291412e5baf6ce7716bd7101ea49c810eda90/pydantic_core-2.27.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:26f32e0adf166a84d0cb63be85c562ca8a6fa8de28e5f0d92250c6b7e9e2aff7", size = 1814998 }, + { url = "https://files.pythonhosted.org/packages/a6/3c/724039e0d848fd69dbf5806894e26479577316c6f0f112bacaf67aa889ac/pydantic_core-2.27.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8c19d1ea0673cd13cc2f872f6c9ab42acc4e4f492a7ca9d3795ce2b112dd7e15", size = 1826167 }, + { url = "https://files.pythonhosted.org/packages/2b/5b/1b29e8c1fb5f3199a9a57c1452004ff39f494bbe9bdbe9a81e18172e40d3/pydantic_core-2.27.2-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5e68c4446fe0810e959cdff46ab0a41ce2f2c86d227d96dc3847af0ba7def306", size = 1865071 }, + { url = "https://files.pythonhosted.org/packages/89/6c/3985203863d76bb7d7266e36970d7e3b6385148c18a68cc8915fd8c84d57/pydantic_core-2.27.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d9640b0059ff4f14d1f37321b94061c6db164fbe49b334b31643e0528d100d99", size = 2036244 }, + { url = "https://files.pythonhosted.org/packages/0e/41/f15316858a246b5d723f7d7f599f79e37493b2e84bfc789e58d88c209f8a/pydantic_core-2.27.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:40d02e7d45c9f8af700f3452f329ead92da4c5f4317ca9b896de7ce7199ea459", size = 2737470 }, + { url = "https://files.pythonhosted.org/packages/a8/7c/b860618c25678bbd6d1d99dbdfdf0510ccb50790099b963ff78a124b754f/pydantic_core-2.27.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1c1fd185014191700554795c99b347d64f2bb637966c4cfc16998a0ca700d048", size = 1992291 }, + { url = "https://files.pythonhosted.org/packages/bf/73/42c3742a391eccbeab39f15213ecda3104ae8682ba3c0c28069fbcb8c10d/pydantic_core-2.27.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d81d2068e1c1228a565af076598f9e7451712700b673de8f502f0334f281387d", size = 1994613 }, + { url = "https://files.pythonhosted.org/packages/94/7a/941e89096d1175d56f59340f3a8ebaf20762fef222c298ea96d36a6328c5/pydantic_core-2.27.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:1a4207639fb02ec2dbb76227d7c751a20b1a6b4bc52850568e52260cae64ca3b", size = 2002355 }, + { url = "https://files.pythonhosted.org/packages/6e/95/2359937a73d49e336a5a19848713555605d4d8d6940c3ec6c6c0ca4dcf25/pydantic_core-2.27.2-cp311-cp311-musllinux_1_1_armv7l.whl", hash = "sha256:3de3ce3c9ddc8bbd88f6e0e304dea0e66d843ec9de1b0042b0911c1663ffd474", size = 2126661 }, + { url = "https://files.pythonhosted.org/packages/2b/4c/ca02b7bdb6012a1adef21a50625b14f43ed4d11f1fc237f9d7490aa5078c/pydantic_core-2.27.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:30c5f68ded0c36466acede341551106821043e9afaad516adfb6e8fa80a4e6a6", size = 2153261 }, + { url = "https://files.pythonhosted.org/packages/72/9d/a241db83f973049a1092a079272ffe2e3e82e98561ef6214ab53fe53b1c7/pydantic_core-2.27.2-cp311-cp311-win32.whl", hash = "sha256:c70c26d2c99f78b125a3459f8afe1aed4d9687c24fd677c6a4436bc042e50d6c", size = 1812361 }, + { url = "https://files.pythonhosted.org/packages/e8/ef/013f07248041b74abd48a385e2110aa3a9bbfef0fbd97d4e6d07d2f5b89a/pydantic_core-2.27.2-cp311-cp311-win_amd64.whl", hash = "sha256:08e125dbdc505fa69ca7d9c499639ab6407cfa909214d500897d02afb816e7cc", size = 1982484 }, + { url = 
"https://files.pythonhosted.org/packages/10/1c/16b3a3e3398fd29dca77cea0a1d998d6bde3902fa2706985191e2313cc76/pydantic_core-2.27.2-cp311-cp311-win_arm64.whl", hash = "sha256:26f0d68d4b235a2bae0c3fc585c585b4ecc51382db0e3ba402a22cbc440915e4", size = 1867102 }, + { url = "https://files.pythonhosted.org/packages/d6/74/51c8a5482ca447871c93e142d9d4a92ead74de6c8dc5e66733e22c9bba89/pydantic_core-2.27.2-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:9e0c8cfefa0ef83b4da9588448b6d8d2a2bf1a53c3f1ae5fca39eb3061e2f0b0", size = 1893127 }, + { url = "https://files.pythonhosted.org/packages/d3/f3/c97e80721735868313c58b89d2de85fa80fe8dfeeed84dc51598b92a135e/pydantic_core-2.27.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:83097677b8e3bd7eaa6775720ec8e0405f1575015a463285a92bfdfe254529ef", size = 1811340 }, + { url = "https://files.pythonhosted.org/packages/9e/91/840ec1375e686dbae1bd80a9e46c26a1e0083e1186abc610efa3d9a36180/pydantic_core-2.27.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:172fce187655fece0c90d90a678424b013f8fbb0ca8b036ac266749c09438cb7", size = 1822900 }, + { url = "https://files.pythonhosted.org/packages/f6/31/4240bc96025035500c18adc149aa6ffdf1a0062a4b525c932065ceb4d868/pydantic_core-2.27.2-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:519f29f5213271eeeeb3093f662ba2fd512b91c5f188f3bb7b27bc5973816934", size = 1869177 }, + { url = "https://files.pythonhosted.org/packages/fa/20/02fbaadb7808be578317015c462655c317a77a7c8f0ef274bc016a784c54/pydantic_core-2.27.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:05e3a55d124407fffba0dd6b0c0cd056d10e983ceb4e5dbd10dda135c31071d6", size = 2038046 }, + { url = "https://files.pythonhosted.org/packages/06/86/7f306b904e6c9eccf0668248b3f272090e49c275bc488a7b88b0823444a4/pydantic_core-2.27.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9c3ed807c7b91de05e63930188f19e921d1fe90de6b4f5cd43ee7fcc3525cb8c", size = 2685386 }, + { url = "https://files.pythonhosted.org/packages/8d/f0/49129b27c43396581a635d8710dae54a791b17dfc50c70164866bbf865e3/pydantic_core-2.27.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6fb4aadc0b9a0c063206846d603b92030eb6f03069151a625667f982887153e2", size = 1997060 }, + { url = "https://files.pythonhosted.org/packages/0d/0f/943b4af7cd416c477fd40b187036c4f89b416a33d3cc0ab7b82708a667aa/pydantic_core-2.27.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:28ccb213807e037460326424ceb8b5245acb88f32f3d2777427476e1b32c48c4", size = 2004870 }, + { url = "https://files.pythonhosted.org/packages/35/40/aea70b5b1a63911c53a4c8117c0a828d6790483f858041f47bab0b779f44/pydantic_core-2.27.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:de3cd1899e2c279b140adde9357c4495ed9d47131b4a4eaff9052f23398076b3", size = 1999822 }, + { url = "https://files.pythonhosted.org/packages/f2/b3/807b94fd337d58effc5498fd1a7a4d9d59af4133e83e32ae39a96fddec9d/pydantic_core-2.27.2-cp312-cp312-musllinux_1_1_armv7l.whl", hash = "sha256:220f892729375e2d736b97d0e51466252ad84c51857d4d15f5e9692f9ef12be4", size = 2130364 }, + { url = "https://files.pythonhosted.org/packages/fc/df/791c827cd4ee6efd59248dca9369fb35e80a9484462c33c6649a8d02b565/pydantic_core-2.27.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:a0fcd29cd6b4e74fe8ddd2c90330fd8edf2e30cb52acda47f06dd615ae72da57", size = 2158303 }, + { url = 
"https://files.pythonhosted.org/packages/9b/67/4e197c300976af185b7cef4c02203e175fb127e414125916bf1128b639a9/pydantic_core-2.27.2-cp312-cp312-win32.whl", hash = "sha256:1e2cb691ed9834cd6a8be61228471d0a503731abfb42f82458ff27be7b2186fc", size = 1834064 }, + { url = "https://files.pythonhosted.org/packages/1f/ea/cd7209a889163b8dcca139fe32b9687dd05249161a3edda62860430457a5/pydantic_core-2.27.2-cp312-cp312-win_amd64.whl", hash = "sha256:cc3f1a99a4f4f9dd1de4fe0312c114e740b5ddead65bb4102884b384c15d8bc9", size = 1989046 }, + { url = "https://files.pythonhosted.org/packages/bc/49/c54baab2f4658c26ac633d798dab66b4c3a9bbf47cff5284e9c182f4137a/pydantic_core-2.27.2-cp312-cp312-win_arm64.whl", hash = "sha256:3911ac9284cd8a1792d3cb26a2da18f3ca26c6908cc434a18f730dc0db7bfa3b", size = 1885092 }, + { url = "https://files.pythonhosted.org/packages/41/b1/9bc383f48f8002f99104e3acff6cba1231b29ef76cfa45d1506a5cad1f84/pydantic_core-2.27.2-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:7d14bd329640e63852364c306f4d23eb744e0f8193148d4044dd3dacdaacbd8b", size = 1892709 }, + { url = "https://files.pythonhosted.org/packages/10/6c/e62b8657b834f3eb2961b49ec8e301eb99946245e70bf42c8817350cbefc/pydantic_core-2.27.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:82f91663004eb8ed30ff478d77c4d1179b3563df6cdb15c0817cd1cdaf34d154", size = 1811273 }, + { url = "https://files.pythonhosted.org/packages/ba/15/52cfe49c8c986e081b863b102d6b859d9defc63446b642ccbbb3742bf371/pydantic_core-2.27.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:71b24c7d61131bb83df10cc7e687433609963a944ccf45190cfc21e0887b08c9", size = 1823027 }, + { url = "https://files.pythonhosted.org/packages/b1/1c/b6f402cfc18ec0024120602bdbcebc7bdd5b856528c013bd4d13865ca473/pydantic_core-2.27.2-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:fa8e459d4954f608fa26116118bb67f56b93b209c39b008277ace29937453dc9", size = 1868888 }, + { url = "https://files.pythonhosted.org/packages/bd/7b/8cb75b66ac37bc2975a3b7de99f3c6f355fcc4d89820b61dffa8f1e81677/pydantic_core-2.27.2-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ce8918cbebc8da707ba805b7fd0b382816858728ae7fe19a942080c24e5b7cd1", size = 2037738 }, + { url = "https://files.pythonhosted.org/packages/c8/f1/786d8fe78970a06f61df22cba58e365ce304bf9b9f46cc71c8c424e0c334/pydantic_core-2.27.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:eda3f5c2a021bbc5d976107bb302e0131351c2ba54343f8a496dc8783d3d3a6a", size = 2685138 }, + { url = "https://files.pythonhosted.org/packages/a6/74/d12b2cd841d8724dc8ffb13fc5cef86566a53ed358103150209ecd5d1999/pydantic_core-2.27.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bd8086fa684c4775c27f03f062cbb9eaa6e17f064307e86b21b9e0abc9c0f02e", size = 1997025 }, + { url = "https://files.pythonhosted.org/packages/a0/6e/940bcd631bc4d9a06c9539b51f070b66e8f370ed0933f392db6ff350d873/pydantic_core-2.27.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:8d9b3388db186ba0c099a6d20f0604a44eabdeef1777ddd94786cdae158729e4", size = 2004633 }, + { url = "https://files.pythonhosted.org/packages/50/cc/a46b34f1708d82498c227d5d80ce615b2dd502ddcfd8376fc14a36655af1/pydantic_core-2.27.2-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:7a66efda2387de898c8f38c0cf7f14fca0b51a8ef0b24bfea5849f1b3c95af27", size = 1999404 }, + { url = 
"https://files.pythonhosted.org/packages/ca/2d/c365cfa930ed23bc58c41463bae347d1005537dc8db79e998af8ba28d35e/pydantic_core-2.27.2-cp313-cp313-musllinux_1_1_armv7l.whl", hash = "sha256:18a101c168e4e092ab40dbc2503bdc0f62010e95d292b27827871dc85450d7ee", size = 2130130 }, + { url = "https://files.pythonhosted.org/packages/f4/d7/eb64d015c350b7cdb371145b54d96c919d4db516817f31cd1c650cae3b21/pydantic_core-2.27.2-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:ba5dd002f88b78a4215ed2f8ddbdf85e8513382820ba15ad5ad8955ce0ca19a1", size = 2157946 }, + { url = "https://files.pythonhosted.org/packages/a4/99/bddde3ddde76c03b65dfd5a66ab436c4e58ffc42927d4ff1198ffbf96f5f/pydantic_core-2.27.2-cp313-cp313-win32.whl", hash = "sha256:1ebaf1d0481914d004a573394f4be3a7616334be70261007e47c2a6fe7e50130", size = 1834387 }, + { url = "https://files.pythonhosted.org/packages/71/47/82b5e846e01b26ac6f1893d3c5f9f3a2eb6ba79be26eef0b759b4fe72946/pydantic_core-2.27.2-cp313-cp313-win_amd64.whl", hash = "sha256:953101387ecf2f5652883208769a79e48db18c6df442568a0b5ccd8c2723abee", size = 1990453 }, + { url = "https://files.pythonhosted.org/packages/51/b2/b2b50d5ecf21acf870190ae5d093602d95f66c9c31f9d5de6062eb329ad1/pydantic_core-2.27.2-cp313-cp313-win_arm64.whl", hash = "sha256:ac4dbfd1691affb8f48c2c13241a2e3b60ff23247cbcf981759c768b6633cf8b", size = 1885186 }, + { url = "https://files.pythonhosted.org/packages/27/97/3aef1ddb65c5ccd6eda9050036c956ff6ecbfe66cb7eb40f280f121a5bb0/pydantic_core-2.27.2-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:c10eb4f1659290b523af58fa7cffb452a61ad6ae5613404519aee4bfbf1df993", size = 1896475 }, + { url = "https://files.pythonhosted.org/packages/ad/d3/5668da70e373c9904ed2f372cb52c0b996426f302e0dee2e65634c92007d/pydantic_core-2.27.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ef592d4bad47296fb11f96cd7dc898b92e795032b4894dfb4076cfccd43a9308", size = 1772279 }, + { url = "https://files.pythonhosted.org/packages/8a/9e/e44b8cb0edf04a2f0a1f6425a65ee089c1d6f9c4c2dcab0209127b6fdfc2/pydantic_core-2.27.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c61709a844acc6bf0b7dce7daae75195a10aac96a596ea1b776996414791ede4", size = 1829112 }, + { url = "https://files.pythonhosted.org/packages/1c/90/1160d7ac700102effe11616e8119e268770f2a2aa5afb935f3ee6832987d/pydantic_core-2.27.2-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:42c5f762659e47fdb7b16956c71598292f60a03aa92f8b6351504359dbdba6cf", size = 1866780 }, + { url = "https://files.pythonhosted.org/packages/ee/33/13983426df09a36d22c15980008f8d9c77674fc319351813b5a2739b70f3/pydantic_core-2.27.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4c9775e339e42e79ec99c441d9730fccf07414af63eac2f0e48e08fd38a64d76", size = 2037943 }, + { url = "https://files.pythonhosted.org/packages/01/d7/ced164e376f6747e9158c89988c293cd524ab8d215ae4e185e9929655d5c/pydantic_core-2.27.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:57762139821c31847cfb2df63c12f725788bd9f04bc2fb392790959b8f70f118", size = 2740492 }, + { url = "https://files.pythonhosted.org/packages/8b/1f/3dc6e769d5b7461040778816aab2b00422427bcaa4b56cc89e9c653b2605/pydantic_core-2.27.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0d1e85068e818c73e048fe28cfc769040bb1f475524f4745a5dc621f75ac7630", size = 1995714 }, + { url = 
"https://files.pythonhosted.org/packages/07/d7/a0bd09bc39283530b3f7c27033a814ef254ba3bd0b5cfd040b7abf1fe5da/pydantic_core-2.27.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:097830ed52fd9e427942ff3b9bc17fab52913b2f50f2880dc4a5611446606a54", size = 1997163 }, + { url = "https://files.pythonhosted.org/packages/2d/bb/2db4ad1762e1c5699d9b857eeb41959191980de6feb054e70f93085e1bcd/pydantic_core-2.27.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:044a50963a614ecfae59bb1eaf7ea7efc4bc62f49ed594e18fa1e5d953c40e9f", size = 2005217 }, + { url = "https://files.pythonhosted.org/packages/53/5f/23a5a3e7b8403f8dd8fc8a6f8b49f6b55c7d715b77dcf1f8ae919eeb5628/pydantic_core-2.27.2-cp39-cp39-musllinux_1_1_armv7l.whl", hash = "sha256:4e0b4220ba5b40d727c7f879eac379b822eee5d8fff418e9d3381ee45b3b0362", size = 2127899 }, + { url = "https://files.pythonhosted.org/packages/c2/ae/aa38bb8dd3d89c2f1d8362dd890ee8f3b967330821d03bbe08fa01ce3766/pydantic_core-2.27.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5e4f4bb20d75e9325cc9696c6802657b58bc1dbbe3022f32cc2b2b632c3fbb96", size = 2155726 }, + { url = "https://files.pythonhosted.org/packages/98/61/4f784608cc9e98f70839187117ce840480f768fed5d386f924074bf6213c/pydantic_core-2.27.2-cp39-cp39-win32.whl", hash = "sha256:cca63613e90d001b9f2f9a9ceb276c308bfa2a43fafb75c8031c4f66039e8c6e", size = 1817219 }, + { url = "https://files.pythonhosted.org/packages/57/82/bb16a68e4a1a858bb3768c2c8f1ff8d8978014e16598f001ea29a25bf1d1/pydantic_core-2.27.2-cp39-cp39-win_amd64.whl", hash = "sha256:77d1bca19b0f7021b3a982e6f903dcd5b2b06076def36a652e3907f596e29f67", size = 1985382 }, + { url = "https://files.pythonhosted.org/packages/46/72/af70981a341500419e67d5cb45abe552a7c74b66326ac8877588488da1ac/pydantic_core-2.27.2-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:2bf14caea37e91198329b828eae1618c068dfb8ef17bb33287a7ad4b61ac314e", size = 1891159 }, + { url = "https://files.pythonhosted.org/packages/ad/3d/c5913cccdef93e0a6a95c2d057d2c2cba347815c845cda79ddd3c0f5e17d/pydantic_core-2.27.2-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:b0cb791f5b45307caae8810c2023a184c74605ec3bcbb67d13846c28ff731ff8", size = 1768331 }, + { url = "https://files.pythonhosted.org/packages/f6/f0/a3ae8fbee269e4934f14e2e0e00928f9346c5943174f2811193113e58252/pydantic_core-2.27.2-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:688d3fd9fcb71f41c4c015c023d12a79d1c4c0732ec9eb35d96e3388a120dcf3", size = 1822467 }, + { url = "https://files.pythonhosted.org/packages/d7/7a/7bbf241a04e9f9ea24cd5874354a83526d639b02674648af3f350554276c/pydantic_core-2.27.2-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3d591580c34f4d731592f0e9fe40f9cc1b430d297eecc70b962e93c5c668f15f", size = 1979797 }, + { url = "https://files.pythonhosted.org/packages/4f/5f/4784c6107731f89e0005a92ecb8a2efeafdb55eb992b8e9d0a2be5199335/pydantic_core-2.27.2-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:82f986faf4e644ffc189a7f1aafc86e46ef70372bb153e7001e8afccc6e54133", size = 1987839 }, + { url = "https://files.pythonhosted.org/packages/6d/a7/61246562b651dff00de86a5f01b6e4befb518df314c54dec187a78d81c84/pydantic_core-2.27.2-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:bec317a27290e2537f922639cafd54990551725fc844249e64c523301d0822fc", size = 1998861 }, + { url = 
"https://files.pythonhosted.org/packages/86/aa/837821ecf0c022bbb74ca132e117c358321e72e7f9702d1b6a03758545e2/pydantic_core-2.27.2-pp310-pypy310_pp73-musllinux_1_1_armv7l.whl", hash = "sha256:0296abcb83a797db256b773f45773da397da75a08f5fcaef41f2044adec05f50", size = 2116582 }, + { url = "https://files.pythonhosted.org/packages/81/b0/5e74656e95623cbaa0a6278d16cf15e10a51f6002e3ec126541e95c29ea3/pydantic_core-2.27.2-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:0d75070718e369e452075a6017fbf187f788e17ed67a3abd47fa934d001863d9", size = 2151985 }, + { url = "https://files.pythonhosted.org/packages/63/37/3e32eeb2a451fddaa3898e2163746b0cffbbdbb4740d38372db0490d67f3/pydantic_core-2.27.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:7e17b560be3c98a8e3aa66ce828bdebb9e9ac6ad5466fba92eb74c4c95cb1151", size = 2004715 }, + { url = "https://files.pythonhosted.org/packages/29/0e/dcaea00c9dbd0348b723cae82b0e0c122e0fa2b43fa933e1622fd237a3ee/pydantic_core-2.27.2-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:c33939a82924da9ed65dab5a65d427205a73181d8098e79b6b426bdf8ad4e656", size = 1891733 }, + { url = "https://files.pythonhosted.org/packages/86/d3/e797bba8860ce650272bda6383a9d8cad1d1c9a75a640c9d0e848076f85e/pydantic_core-2.27.2-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:00bad2484fa6bda1e216e7345a798bd37c68fb2d97558edd584942aa41b7d278", size = 1768375 }, + { url = "https://files.pythonhosted.org/packages/41/f7/f847b15fb14978ca2b30262548f5fc4872b2724e90f116393eb69008299d/pydantic_core-2.27.2-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c817e2b40aba42bac6f457498dacabc568c3b7a986fc9ba7c8d9d260b71485fb", size = 1822307 }, + { url = "https://files.pythonhosted.org/packages/9c/63/ed80ec8255b587b2f108e514dc03eed1546cd00f0af281e699797f373f38/pydantic_core-2.27.2-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:251136cdad0cb722e93732cb45ca5299fb56e1344a833640bf93b2803f8d1bfd", size = 1979971 }, + { url = "https://files.pythonhosted.org/packages/a9/6d/6d18308a45454a0de0e975d70171cadaf454bc7a0bf86b9c7688e313f0bb/pydantic_core-2.27.2-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d2088237af596f0a524d3afc39ab3b036e8adb054ee57cbb1dcf8e09da5b29cc", size = 1987616 }, + { url = "https://files.pythonhosted.org/packages/82/8a/05f8780f2c1081b800a7ca54c1971e291c2d07d1a50fb23c7e4aef4ed403/pydantic_core-2.27.2-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:d4041c0b966a84b4ae7a09832eb691a35aec90910cd2dbe7a208de59be77965b", size = 1998943 }, + { url = "https://files.pythonhosted.org/packages/5e/3e/fe5b6613d9e4c0038434396b46c5303f5ade871166900b357ada4766c5b7/pydantic_core-2.27.2-pp39-pypy39_pp73-musllinux_1_1_armv7l.whl", hash = "sha256:8083d4e875ebe0b864ffef72a4304827015cff328a1be6e22cc850753bfb122b", size = 2116654 }, + { url = "https://files.pythonhosted.org/packages/db/ad/28869f58938fad8cc84739c4e592989730bfb69b7c90a8fff138dff18e1e/pydantic_core-2.27.2-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:f141ee28a0ad2123b6611b6ceff018039df17f32ada8b534e6aa039545a3efb2", size = 2152292 }, + { url = "https://files.pythonhosted.org/packages/a1/0c/c5c5cd3689c32ed1fe8c5d234b079c12c281c051759770c05b8bed6412b5/pydantic_core-2.27.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:7d0c8399fcc1848491f00e0314bd59fb34a9c008761bcb422a057670c3f65e35", size = 2004961 }, +] + +[[package]] +name = "pygments" +version = "2.19.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/7c/2d/c3338d48ea6cc0feb8446d8e6937e1408088a72a39937982cc6111d17f84/pygments-2.19.1.tar.gz", hash = "sha256:61c16d2a8576dc0649d9f39e089b5f02bcd27fba10d8fb4dcc28173f7a45151f", size = 4968581 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8a/0b/9fcc47d19c48b59121088dd6da2488a49d5f72dacf8262e2790a1d2c7d15/pygments-2.19.1-py3-none-any.whl", hash = "sha256:9ea1544ad55cecf4b8242fab6dd35a93bbce657034b0611ee383099054ab6d8c", size = 1225293 }, +] + +[[package]] +name = "pytest" +version = "7.4.4" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, + { name = "exceptiongroup", marker = "python_full_version < '3.11'" }, + { name = "iniconfig" }, + { name = "packaging" }, + { name = "pluggy" }, + { name = "tomli", marker = "python_full_version < '3.11'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/80/1f/9d8e98e4133ffb16c90f3b405c43e38d3abb715bb5d7a63a5a684f7e46a3/pytest-7.4.4.tar.gz", hash = "sha256:2cf0005922c6ace4a3e2ec8b4080eb0d9753fdc93107415332f50ce9e7994280", size = 1357116 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/51/ff/f6e8b8f39e08547faece4bd80f89d5a8de68a38b2d179cc1c4490ffa3286/pytest-7.4.4-py3-none-any.whl", hash = "sha256:b090cdf5ed60bf4c45261be03239c2c1c22df034fbffe691abe93cd80cea01d8", size = 325287 }, +] + +[[package]] +name = "python-dateutil" +version = "2.9.0.post0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "six" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/66/c0/0c8b6ad9f17a802ee498c46e004a0eb49bc148f2fd230864601a86dcf6db/python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3", size = 342432 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ec/57/56b9bcc3c9c6a792fcbaf139543cee77261f3651ca9da0c93f5c1221264b/python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427", size = 229892 }, +] + +[[package]] +name = "pytz" +version = "2024.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/3a/31/3c70bf7603cc2dca0f19bdc53b4537a797747a58875b552c8c413d963a3f/pytz-2024.2.tar.gz", hash = "sha256:2aa355083c50a0f93fa581709deac0c9ad65cca8a9e9beac660adcbd493c798a", size = 319692 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/11/c3/005fcca25ce078d2cc29fd559379817424e94885510568bc1bc53d7d5846/pytz-2024.2-py2.py3-none-any.whl", hash = "sha256:31c7c1817eb7fae7ca4b8c7ee50c72f93aa2dd863de768e1ef4245d426aa0725", size = 508002 }, +] + +[[package]] +name = "pyyaml" +version = "6.0.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/54/ed/79a089b6be93607fa5cdaedf301d7dfb23af5f25c398d5ead2525b063e17/pyyaml-6.0.2.tar.gz", hash = "sha256:d584d9ec91ad65861cc08d42e834324ef890a082e591037abe114850ff7bbc3e", size = 130631 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9b/95/a3fac87cb7158e231b5a6012e438c647e1a87f09f8e0d123acec8ab8bf71/PyYAML-6.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0a9a2848a5b7feac301353437eb7d5957887edbf81d56e903999a75a3d743086", size = 184199 }, + { url = "https://files.pythonhosted.org/packages/c7/7a/68bd47624dab8fd4afbfd3c48e3b79efe09098ae941de5b58abcbadff5cb/PyYAML-6.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = 
"sha256:29717114e51c84ddfba879543fb232a6ed60086602313ca38cce623c1d62cfbf", size = 171758 }, + { url = "https://files.pythonhosted.org/packages/49/ee/14c54df452143b9ee9f0f29074d7ca5516a36edb0b4cc40c3f280131656f/PyYAML-6.0.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8824b5a04a04a047e72eea5cec3bc266db09e35de6bdfe34c9436ac5ee27d237", size = 718463 }, + { url = "https://files.pythonhosted.org/packages/4d/61/de363a97476e766574650d742205be468921a7b532aa2499fcd886b62530/PyYAML-6.0.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7c36280e6fb8385e520936c3cb3b8042851904eba0e58d277dca80a5cfed590b", size = 719280 }, + { url = "https://files.pythonhosted.org/packages/6b/4e/1523cb902fd98355e2e9ea5e5eb237cbc5f3ad5f3075fa65087aa0ecb669/PyYAML-6.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ec031d5d2feb36d1d1a24380e4db6d43695f3748343d99434e6f5f9156aaa2ed", size = 751239 }, + { url = "https://files.pythonhosted.org/packages/b7/33/5504b3a9a4464893c32f118a9cc045190a91637b119a9c881da1cf6b7a72/PyYAML-6.0.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:936d68689298c36b53b29f23c6dbb74de12b4ac12ca6cfe0e047bedceea56180", size = 695802 }, + { url = "https://files.pythonhosted.org/packages/5c/20/8347dcabd41ef3a3cdc4f7b7a2aff3d06598c8779faa189cdbf878b626a4/PyYAML-6.0.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:23502f431948090f597378482b4812b0caae32c22213aecf3b55325e049a6c68", size = 720527 }, + { url = "https://files.pythonhosted.org/packages/be/aa/5afe99233fb360d0ff37377145a949ae258aaab831bde4792b32650a4378/PyYAML-6.0.2-cp310-cp310-win32.whl", hash = "sha256:2e99c6826ffa974fe6e27cdb5ed0021786b03fc98e5ee3c5bfe1fd5015f42b99", size = 144052 }, + { url = "https://files.pythonhosted.org/packages/b5/84/0fa4b06f6d6c958d207620fc60005e241ecedceee58931bb20138e1e5776/PyYAML-6.0.2-cp310-cp310-win_amd64.whl", hash = "sha256:a4d3091415f010369ae4ed1fc6b79def9416358877534caf6a0fdd2146c87a3e", size = 161774 }, + { url = "https://files.pythonhosted.org/packages/f8/aa/7af4e81f7acba21a4c6be026da38fd2b872ca46226673c89a758ebdc4fd2/PyYAML-6.0.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:cc1c1159b3d456576af7a3e4d1ba7e6924cb39de8f67111c735f6fc832082774", size = 184612 }, + { url = "https://files.pythonhosted.org/packages/8b/62/b9faa998fd185f65c1371643678e4d58254add437edb764a08c5a98fb986/PyYAML-6.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1e2120ef853f59c7419231f3bf4e7021f1b936f6ebd222406c3b60212205d2ee", size = 172040 }, + { url = "https://files.pythonhosted.org/packages/ad/0c/c804f5f922a9a6563bab712d8dcc70251e8af811fce4524d57c2c0fd49a4/PyYAML-6.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5d225db5a45f21e78dd9358e58a98702a0302f2659a3c6cd320564b75b86f47c", size = 736829 }, + { url = "https://files.pythonhosted.org/packages/51/16/6af8d6a6b210c8e54f1406a6b9481febf9c64a3109c541567e35a49aa2e7/PyYAML-6.0.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5ac9328ec4831237bec75defaf839f7d4564be1e6b25ac710bd1a96321cc8317", size = 764167 }, + { url = "https://files.pythonhosted.org/packages/75/e4/2c27590dfc9992f73aabbeb9241ae20220bd9452df27483b6e56d3975cc5/PyYAML-6.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ad2a3decf9aaba3d29c8f537ac4b243e36bef957511b4766cb0057d32b0be85", size = 762952 }, + { url = 
"https://files.pythonhosted.org/packages/9b/97/ecc1abf4a823f5ac61941a9c00fe501b02ac3ab0e373c3857f7d4b83e2b6/PyYAML-6.0.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:ff3824dc5261f50c9b0dfb3be22b4567a6f938ccce4587b38952d85fd9e9afe4", size = 735301 }, + { url = "https://files.pythonhosted.org/packages/45/73/0f49dacd6e82c9430e46f4a027baa4ca205e8b0a9dce1397f44edc23559d/PyYAML-6.0.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:797b4f722ffa07cc8d62053e4cff1486fa6dc094105d13fea7b1de7d8bf71c9e", size = 756638 }, + { url = "https://files.pythonhosted.org/packages/22/5f/956f0f9fc65223a58fbc14459bf34b4cc48dec52e00535c79b8db361aabd/PyYAML-6.0.2-cp311-cp311-win32.whl", hash = "sha256:11d8f3dd2b9c1207dcaf2ee0bbbfd5991f571186ec9cc78427ba5bd32afae4b5", size = 143850 }, + { url = "https://files.pythonhosted.org/packages/ed/23/8da0bbe2ab9dcdd11f4f4557ccaf95c10b9811b13ecced089d43ce59c3c8/PyYAML-6.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:e10ce637b18caea04431ce14fabcf5c64a1c61ec9c56b071a4b7ca131ca52d44", size = 161980 }, + { url = "https://files.pythonhosted.org/packages/86/0c/c581167fc46d6d6d7ddcfb8c843a4de25bdd27e4466938109ca68492292c/PyYAML-6.0.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:c70c95198c015b85feafc136515252a261a84561b7b1d51e3384e0655ddf25ab", size = 183873 }, + { url = "https://files.pythonhosted.org/packages/a8/0c/38374f5bb272c051e2a69281d71cba6fdb983413e6758b84482905e29a5d/PyYAML-6.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ce826d6ef20b1bc864f0a68340c8b3287705cae2f8b4b1d932177dcc76721725", size = 173302 }, + { url = "https://files.pythonhosted.org/packages/c3/93/9916574aa8c00aa06bbac729972eb1071d002b8e158bd0e83a3b9a20a1f7/PyYAML-6.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1f71ea527786de97d1a0cc0eacd1defc0985dcf6b3f17bb77dcfc8c34bec4dc5", size = 739154 }, + { url = "https://files.pythonhosted.org/packages/95/0f/b8938f1cbd09739c6da569d172531567dbcc9789e0029aa070856f123984/PyYAML-6.0.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9b22676e8097e9e22e36d6b7bda33190d0d400f345f23d4065d48f4ca7ae0425", size = 766223 }, + { url = "https://files.pythonhosted.org/packages/b9/2b/614b4752f2e127db5cc206abc23a8c19678e92b23c3db30fc86ab731d3bd/PyYAML-6.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:80bab7bfc629882493af4aa31a4cfa43a4c57c83813253626916b8c7ada83476", size = 767542 }, + { url = "https://files.pythonhosted.org/packages/d4/00/dd137d5bcc7efea1836d6264f049359861cf548469d18da90cd8216cf05f/PyYAML-6.0.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:0833f8694549e586547b576dcfaba4a6b55b9e96098b36cdc7ebefe667dfed48", size = 731164 }, + { url = "https://files.pythonhosted.org/packages/c9/1f/4f998c900485e5c0ef43838363ba4a9723ac0ad73a9dc42068b12aaba4e4/PyYAML-6.0.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8b9c7197f7cb2738065c481a0461e50ad02f18c78cd75775628afb4d7137fb3b", size = 756611 }, + { url = "https://files.pythonhosted.org/packages/df/d1/f5a275fdb252768b7a11ec63585bc38d0e87c9e05668a139fea92b80634c/PyYAML-6.0.2-cp312-cp312-win32.whl", hash = "sha256:ef6107725bd54b262d6dedcc2af448a266975032bc85ef0172c5f059da6325b4", size = 140591 }, + { url = "https://files.pythonhosted.org/packages/0c/e8/4f648c598b17c3d06e8753d7d13d57542b30d56e6c2dedf9c331ae56312e/PyYAML-6.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:7e7401d0de89a9a855c839bc697c079a4af81cf878373abd7dc625847d25cbd8", size = 156338 }, + { url = 
"https://files.pythonhosted.org/packages/ef/e3/3af305b830494fa85d95f6d95ef7fa73f2ee1cc8ef5b495c7c3269fb835f/PyYAML-6.0.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:efdca5630322a10774e8e98e1af481aad470dd62c3170801852d752aa7a783ba", size = 181309 }, + { url = "https://files.pythonhosted.org/packages/45/9f/3b1c20a0b7a3200524eb0076cc027a970d320bd3a6592873c85c92a08731/PyYAML-6.0.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:50187695423ffe49e2deacb8cd10510bc361faac997de9efef88badc3bb9e2d1", size = 171679 }, + { url = "https://files.pythonhosted.org/packages/7c/9a/337322f27005c33bcb656c655fa78325b730324c78620e8328ae28b64d0c/PyYAML-6.0.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0ffe8360bab4910ef1b9e87fb812d8bc0a308b0d0eef8c8f44e0254ab3b07133", size = 733428 }, + { url = "https://files.pythonhosted.org/packages/a3/69/864fbe19e6c18ea3cc196cbe5d392175b4cf3d5d0ac1403ec3f2d237ebb5/PyYAML-6.0.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:17e311b6c678207928d649faa7cb0d7b4c26a0ba73d41e99c4fff6b6c3276484", size = 763361 }, + { url = "https://files.pythonhosted.org/packages/04/24/b7721e4845c2f162d26f50521b825fb061bc0a5afcf9a386840f23ea19fa/PyYAML-6.0.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:70b189594dbe54f75ab3a1acec5f1e3faa7e8cf2f1e08d9b561cb41b845f69d5", size = 759523 }, + { url = "https://files.pythonhosted.org/packages/2b/b2/e3234f59ba06559c6ff63c4e10baea10e5e7df868092bf9ab40e5b9c56b6/PyYAML-6.0.2-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:41e4e3953a79407c794916fa277a82531dd93aad34e29c2a514c2c0c5fe971cc", size = 726660 }, + { url = "https://files.pythonhosted.org/packages/fe/0f/25911a9f080464c59fab9027482f822b86bf0608957a5fcc6eaac85aa515/PyYAML-6.0.2-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:68ccc6023a3400877818152ad9a1033e3db8625d899c72eacb5a668902e4d652", size = 751597 }, + { url = "https://files.pythonhosted.org/packages/14/0d/e2c3b43bbce3cf6bd97c840b46088a3031085179e596d4929729d8d68270/PyYAML-6.0.2-cp313-cp313-win32.whl", hash = "sha256:bc2fa7c6b47d6bc618dd7fb02ef6fdedb1090ec036abab80d4681424b84c1183", size = 140527 }, + { url = "https://files.pythonhosted.org/packages/fa/de/02b54f42487e3d3c6efb3f89428677074ca7bf43aae402517bc7cca949f3/PyYAML-6.0.2-cp313-cp313-win_amd64.whl", hash = "sha256:8388ee1976c416731879ac16da0aff3f63b286ffdd57cdeb95f3f2e085687563", size = 156446 }, + { url = "https://files.pythonhosted.org/packages/65/d8/b7a1db13636d7fb7d4ff431593c510c8b8fca920ade06ca8ef20015493c5/PyYAML-6.0.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:688ba32a1cffef67fd2e9398a2efebaea461578b0923624778664cc1c914db5d", size = 184777 }, + { url = "https://files.pythonhosted.org/packages/0a/02/6ec546cd45143fdf9840b2c6be8d875116a64076218b61d68e12548e5839/PyYAML-6.0.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a8786accb172bd8afb8be14490a16625cbc387036876ab6ba70912730faf8e1f", size = 172318 }, + { url = "https://files.pythonhosted.org/packages/0e/9a/8cc68be846c972bda34f6c2a93abb644fb2476f4dcc924d52175786932c9/PyYAML-6.0.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d8e03406cac8513435335dbab54c0d385e4a49e4945d2909a581c83647ca0290", size = 720891 }, + { url = "https://files.pythonhosted.org/packages/e9/6c/6e1b7f40181bc4805e2e07f4abc10a88ce4648e7e95ff1abe4ae4014a9b2/PyYAML-6.0.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f753120cb8181e736c57ef7636e83f31b9c0d1722c516f7e86cf15b7aa57ff12", size = 722614 }, 
+ { url = "https://files.pythonhosted.org/packages/3d/32/e7bd8535d22ea2874cef6a81021ba019474ace0d13a4819c2a4bce79bd6a/PyYAML-6.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3b1fdb9dc17f5a7677423d508ab4f243a726dea51fa5e70992e59a7411c89d19", size = 737360 }, + { url = "https://files.pythonhosted.org/packages/d7/12/7322c1e30b9be969670b672573d45479edef72c9a0deac3bb2868f5d7469/PyYAML-6.0.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:0b69e4ce7a131fe56b7e4d770c67429700908fc0752af059838b1cfb41960e4e", size = 699006 }, + { url = "https://files.pythonhosted.org/packages/82/72/04fcad41ca56491995076630c3ec1e834be241664c0c09a64c9a2589b507/PyYAML-6.0.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:a9f8c2e67970f13b16084e04f134610fd1d374bf477b17ec1599185cf611d725", size = 723577 }, + { url = "https://files.pythonhosted.org/packages/ed/5e/46168b1f2757f1fcd442bc3029cd8767d88a98c9c05770d8b420948743bb/PyYAML-6.0.2-cp39-cp39-win32.whl", hash = "sha256:6395c297d42274772abc367baaa79683958044e5d3835486c16da75d2a694631", size = 144593 }, + { url = "https://files.pythonhosted.org/packages/19/87/5124b1c1f2412bb95c59ec481eaf936cd32f0fe2a7b16b97b81c4c017a6a/PyYAML-6.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:39693e1f8320ae4f43943590b49779ffb98acb81f788220ea932a6b6c51004d8", size = 162312 }, +] + +[[package]] +name = "referencing" +version = "0.35.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "attrs" }, + { name = "rpds-py" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/99/5b/73ca1f8e72fff6fa52119dbd185f73a907b1989428917b24cff660129b6d/referencing-0.35.1.tar.gz", hash = "sha256:25b42124a6c8b632a425174f24087783efb348a6f1e0008e63cd4466fedf703c", size = 62991 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b7/59/2056f61236782a2c86b33906c025d4f4a0b17be0161b63b70fd9e8775d36/referencing-0.35.1-py3-none-any.whl", hash = "sha256:eda6d3234d62814d1c64e305c1331c9a3a6132da475ab6382eaa997b21ee75de", size = 26684 }, +] + +[[package]] +name = "regex" +version = "2024.11.6" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/8e/5f/bd69653fbfb76cf8604468d3b4ec4c403197144c7bfe0e6a5fc9e02a07cb/regex-2024.11.6.tar.gz", hash = "sha256:7ab159b063c52a0333c884e4679f8d7a85112ee3078fe3d9004b2dd875585519", size = 399494 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/95/3c/4651f6b130c6842a8f3df82461a8950f923925db8b6961063e82744bddcc/regex-2024.11.6-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:ff590880083d60acc0433f9c3f713c51f7ac6ebb9adf889c79a261ecf541aa91", size = 482674 }, + { url = "https://files.pythonhosted.org/packages/15/51/9f35d12da8434b489c7b7bffc205c474a0a9432a889457026e9bc06a297a/regex-2024.11.6-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:658f90550f38270639e83ce492f27d2c8d2cd63805c65a13a14d36ca126753f0", size = 287684 }, + { url = "https://files.pythonhosted.org/packages/bd/18/b731f5510d1b8fb63c6b6d3484bfa9a59b84cc578ac8b5172970e05ae07c/regex-2024.11.6-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:164d8b7b3b4bcb2068b97428060b2a53be050085ef94eca7f240e7947f1b080e", size = 284589 }, + { url = "https://files.pythonhosted.org/packages/78/a2/6dd36e16341ab95e4c6073426561b9bfdeb1a9c9b63ab1b579c2e96cb105/regex-2024.11.6-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d3660c82f209655a06b587d55e723f0b813d3a7db2e32e5e7dc64ac2a9e86fde", size = 782511 }, + { url = 
"https://files.pythonhosted.org/packages/1b/2b/323e72d5d2fd8de0d9baa443e1ed70363ed7e7b2fb526f5950c5cb99c364/regex-2024.11.6-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d22326fcdef5e08c154280b71163ced384b428343ae16a5ab2b3354aed12436e", size = 821149 }, + { url = "https://files.pythonhosted.org/packages/90/30/63373b9ea468fbef8a907fd273e5c329b8c9535fee36fc8dba5fecac475d/regex-2024.11.6-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f1ac758ef6aebfc8943560194e9fd0fa18bcb34d89fd8bd2af18183afd8da3a2", size = 809707 }, + { url = "https://files.pythonhosted.org/packages/f2/98/26d3830875b53071f1f0ae6d547f1d98e964dd29ad35cbf94439120bb67a/regex-2024.11.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:997d6a487ff00807ba810e0f8332c18b4eb8d29463cfb7c820dc4b6e7562d0cf", size = 781702 }, + { url = "https://files.pythonhosted.org/packages/87/55/eb2a068334274db86208ab9d5599ffa63631b9f0f67ed70ea7c82a69bbc8/regex-2024.11.6-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:02a02d2bb04fec86ad61f3ea7f49c015a0681bf76abb9857f945d26159d2968c", size = 771976 }, + { url = "https://files.pythonhosted.org/packages/74/c0/be707bcfe98254d8f9d2cff55d216e946f4ea48ad2fd8cf1428f8c5332ba/regex-2024.11.6-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:f02f93b92358ee3f78660e43b4b0091229260c5d5c408d17d60bf26b6c900e86", size = 697397 }, + { url = "https://files.pythonhosted.org/packages/49/dc/bb45572ceb49e0f6509f7596e4ba7031f6819ecb26bc7610979af5a77f45/regex-2024.11.6-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:06eb1be98df10e81ebaded73fcd51989dcf534e3c753466e4b60c4697a003b67", size = 768726 }, + { url = "https://files.pythonhosted.org/packages/5a/db/f43fd75dc4c0c2d96d0881967897926942e935d700863666f3c844a72ce6/regex-2024.11.6-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:040df6fe1a5504eb0f04f048e6d09cd7c7110fef851d7c567a6b6e09942feb7d", size = 775098 }, + { url = "https://files.pythonhosted.org/packages/99/d7/f94154db29ab5a89d69ff893159b19ada89e76b915c1293e98603d39838c/regex-2024.11.6-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:fdabbfc59f2c6edba2a6622c647b716e34e8e3867e0ab975412c5c2f79b82da2", size = 839325 }, + { url = "https://files.pythonhosted.org/packages/f7/17/3cbfab1f23356fbbf07708220ab438a7efa1e0f34195bf857433f79f1788/regex-2024.11.6-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:8447d2d39b5abe381419319f942de20b7ecd60ce86f16a23b0698f22e1b70008", size = 843277 }, + { url = "https://files.pythonhosted.org/packages/7e/f2/48b393b51900456155de3ad001900f94298965e1cad1c772b87f9cfea011/regex-2024.11.6-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:da8f5fc57d1933de22a9e23eec290a0d8a5927a5370d24bda9a6abe50683fe62", size = 773197 }, + { url = "https://files.pythonhosted.org/packages/45/3f/ef9589aba93e084cd3f8471fded352826dcae8489b650d0b9b27bc5bba8a/regex-2024.11.6-cp310-cp310-win32.whl", hash = "sha256:b489578720afb782f6ccf2840920f3a32e31ba28a4b162e13900c3e6bd3f930e", size = 261714 }, + { url = "https://files.pythonhosted.org/packages/42/7e/5f1b92c8468290c465fd50c5318da64319133231415a8aa6ea5ab995a815/regex-2024.11.6-cp310-cp310-win_amd64.whl", hash = "sha256:5071b2093e793357c9d8b2929dfc13ac5f0a6c650559503bb81189d0a3814519", size = 274042 }, + { url = 
"https://files.pythonhosted.org/packages/58/58/7e4d9493a66c88a7da6d205768119f51af0f684fe7be7bac8328e217a52c/regex-2024.11.6-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:5478c6962ad548b54a591778e93cd7c456a7a29f8eca9c49e4f9a806dcc5d638", size = 482669 }, + { url = "https://files.pythonhosted.org/packages/34/4c/8f8e631fcdc2ff978609eaeef1d6994bf2f028b59d9ac67640ed051f1218/regex-2024.11.6-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:2c89a8cc122b25ce6945f0423dc1352cb9593c68abd19223eebbd4e56612c5b7", size = 287684 }, + { url = "https://files.pythonhosted.org/packages/c5/1b/f0e4d13e6adf866ce9b069e191f303a30ab1277e037037a365c3aad5cc9c/regex-2024.11.6-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:94d87b689cdd831934fa3ce16cc15cd65748e6d689f5d2b8f4f4df2065c9fa20", size = 284589 }, + { url = "https://files.pythonhosted.org/packages/25/4d/ab21047f446693887f25510887e6820b93f791992994f6498b0318904d4a/regex-2024.11.6-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1062b39a0a2b75a9c694f7a08e7183a80c63c0d62b301418ffd9c35f55aaa114", size = 792121 }, + { url = "https://files.pythonhosted.org/packages/45/ee/c867e15cd894985cb32b731d89576c41a4642a57850c162490ea34b78c3b/regex-2024.11.6-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:167ed4852351d8a750da48712c3930b031f6efdaa0f22fa1933716bfcd6bf4a3", size = 831275 }, + { url = "https://files.pythonhosted.org/packages/b3/12/b0f480726cf1c60f6536fa5e1c95275a77624f3ac8fdccf79e6727499e28/regex-2024.11.6-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2d548dafee61f06ebdb584080621f3e0c23fff312f0de1afc776e2a2ba99a74f", size = 818257 }, + { url = "https://files.pythonhosted.org/packages/bf/ce/0d0e61429f603bac433910d99ef1a02ce45a8967ffbe3cbee48599e62d88/regex-2024.11.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f2a19f302cd1ce5dd01a9099aaa19cae6173306d1302a43b627f62e21cf18ac0", size = 792727 }, + { url = "https://files.pythonhosted.org/packages/e4/c1/243c83c53d4a419c1556f43777ccb552bccdf79d08fda3980e4e77dd9137/regex-2024.11.6-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bec9931dfb61ddd8ef2ebc05646293812cb6b16b60cf7c9511a832b6f1854b55", size = 780667 }, + { url = "https://files.pythonhosted.org/packages/c5/f4/75eb0dd4ce4b37f04928987f1d22547ddaf6c4bae697623c1b05da67a8aa/regex-2024.11.6-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:9714398225f299aa85267fd222f7142fcb5c769e73d7733344efc46f2ef5cf89", size = 776963 }, + { url = "https://files.pythonhosted.org/packages/16/5d/95c568574e630e141a69ff8a254c2f188b4398e813c40d49228c9bbd9875/regex-2024.11.6-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:202eb32e89f60fc147a41e55cb086db2a3f8cb82f9a9a88440dcfc5d37faae8d", size = 784700 }, + { url = "https://files.pythonhosted.org/packages/8e/b5/f8495c7917f15cc6fee1e7f395e324ec3e00ab3c665a7dc9d27562fd5290/regex-2024.11.6-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:4181b814e56078e9b00427ca358ec44333765f5ca1b45597ec7446d3a1ef6e34", size = 848592 }, + { url = "https://files.pythonhosted.org/packages/1c/80/6dd7118e8cb212c3c60b191b932dc57db93fb2e36fb9e0e92f72a5909af9/regex-2024.11.6-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:068376da5a7e4da51968ce4c122a7cd31afaaec4fccc7856c92f63876e57b51d", size = 852929 }, + { url = 
"https://files.pythonhosted.org/packages/11/9b/5a05d2040297d2d254baf95eeeb6df83554e5e1df03bc1a6687fc4ba1f66/regex-2024.11.6-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:ac10f2c4184420d881a3475fb2c6f4d95d53a8d50209a2500723d831036f7c45", size = 781213 }, + { url = "https://files.pythonhosted.org/packages/26/b7/b14e2440156ab39e0177506c08c18accaf2b8932e39fb092074de733d868/regex-2024.11.6-cp311-cp311-win32.whl", hash = "sha256:c36f9b6f5f8649bb251a5f3f66564438977b7ef8386a52460ae77e6070d309d9", size = 261734 }, + { url = "https://files.pythonhosted.org/packages/80/32/763a6cc01d21fb3819227a1cc3f60fd251c13c37c27a73b8ff4315433a8e/regex-2024.11.6-cp311-cp311-win_amd64.whl", hash = "sha256:02e28184be537f0e75c1f9b2f8847dc51e08e6e171c6bde130b2687e0c33cf60", size = 274052 }, + { url = "https://files.pythonhosted.org/packages/ba/30/9a87ce8336b172cc232a0db89a3af97929d06c11ceaa19d97d84fa90a8f8/regex-2024.11.6-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:52fb28f528778f184f870b7cf8f225f5eef0a8f6e3778529bdd40c7b3920796a", size = 483781 }, + { url = "https://files.pythonhosted.org/packages/01/e8/00008ad4ff4be8b1844786ba6636035f7ef926db5686e4c0f98093612add/regex-2024.11.6-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:fdd6028445d2460f33136c55eeb1f601ab06d74cb3347132e1c24250187500d9", size = 288455 }, + { url = "https://files.pythonhosted.org/packages/60/85/cebcc0aff603ea0a201667b203f13ba75d9fc8668fab917ac5b2de3967bc/regex-2024.11.6-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:805e6b60c54bf766b251e94526ebad60b7de0c70f70a4e6210ee2891acb70bf2", size = 284759 }, + { url = "https://files.pythonhosted.org/packages/94/2b/701a4b0585cb05472a4da28ee28fdfe155f3638f5e1ec92306d924e5faf0/regex-2024.11.6-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b85c2530be953a890eaffde05485238f07029600e8f098cdf1848d414a8b45e4", size = 794976 }, + { url = "https://files.pythonhosted.org/packages/4b/bf/fa87e563bf5fee75db8915f7352e1887b1249126a1be4813837f5dbec965/regex-2024.11.6-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bb26437975da7dc36b7efad18aa9dd4ea569d2357ae6b783bf1118dabd9ea577", size = 833077 }, + { url = "https://files.pythonhosted.org/packages/a1/56/7295e6bad94b047f4d0834e4779491b81216583c00c288252ef625c01d23/regex-2024.11.6-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:abfa5080c374a76a251ba60683242bc17eeb2c9818d0d30117b4486be10c59d3", size = 823160 }, + { url = "https://files.pythonhosted.org/packages/fb/13/e3b075031a738c9598c51cfbc4c7879e26729c53aa9cca59211c44235314/regex-2024.11.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:70b7fa6606c2881c1db9479b0eaa11ed5dfa11c8d60a474ff0e095099f39d98e", size = 796896 }, + { url = "https://files.pythonhosted.org/packages/24/56/0b3f1b66d592be6efec23a795b37732682520b47c53da5a32c33ed7d84e3/regex-2024.11.6-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0c32f75920cf99fe6b6c539c399a4a128452eaf1af27f39bce8909c9a3fd8cbe", size = 783997 }, + { url = "https://files.pythonhosted.org/packages/f9/a1/eb378dada8b91c0e4c5f08ffb56f25fcae47bf52ad18f9b2f33b83e6d498/regex-2024.11.6-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:982e6d21414e78e1f51cf595d7f321dcd14de1f2881c5dc6a6e23bbbbd68435e", size = 781725 }, + { url = "https://files.pythonhosted.org/packages/83/f2/033e7dec0cfd6dda93390089864732a3409246ffe8b042e9554afa9bff4e/regex-2024.11.6-cp312-cp312-musllinux_1_2_i686.whl", hash = 
"sha256:a7c2155f790e2fb448faed6dd241386719802296ec588a8b9051c1f5c481bc29", size = 789481 }, + { url = "https://files.pythonhosted.org/packages/83/23/15d4552ea28990a74e7696780c438aadd73a20318c47e527b47a4a5a596d/regex-2024.11.6-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:149f5008d286636e48cd0b1dd65018548944e495b0265b45e1bffecce1ef7f39", size = 852896 }, + { url = "https://files.pythonhosted.org/packages/e3/39/ed4416bc90deedbfdada2568b2cb0bc1fdb98efe11f5378d9892b2a88f8f/regex-2024.11.6-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:e5364a4502efca094731680e80009632ad6624084aff9a23ce8c8c6820de3e51", size = 860138 }, + { url = "https://files.pythonhosted.org/packages/93/2d/dd56bb76bd8e95bbce684326302f287455b56242a4f9c61f1bc76e28360e/regex-2024.11.6-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:0a86e7eeca091c09e021db8eb72d54751e527fa47b8d5787caf96d9831bd02ad", size = 787692 }, + { url = "https://files.pythonhosted.org/packages/0b/55/31877a249ab7a5156758246b9c59539abbeba22461b7d8adc9e8475ff73e/regex-2024.11.6-cp312-cp312-win32.whl", hash = "sha256:32f9a4c643baad4efa81d549c2aadefaeba12249b2adc5af541759237eee1c54", size = 262135 }, + { url = "https://files.pythonhosted.org/packages/38/ec/ad2d7de49a600cdb8dd78434a1aeffe28b9d6fc42eb36afab4a27ad23384/regex-2024.11.6-cp312-cp312-win_amd64.whl", hash = "sha256:a93c194e2df18f7d264092dc8539b8ffb86b45b899ab976aa15d48214138e81b", size = 273567 }, + { url = "https://files.pythonhosted.org/packages/90/73/bcb0e36614601016552fa9344544a3a2ae1809dc1401b100eab02e772e1f/regex-2024.11.6-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:a6ba92c0bcdf96cbf43a12c717eae4bc98325ca3730f6b130ffa2e3c3c723d84", size = 483525 }, + { url = "https://files.pythonhosted.org/packages/0f/3f/f1a082a46b31e25291d830b369b6b0c5576a6f7fb89d3053a354c24b8a83/regex-2024.11.6-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:525eab0b789891ac3be914d36893bdf972d483fe66551f79d3e27146191a37d4", size = 288324 }, + { url = "https://files.pythonhosted.org/packages/09/c9/4e68181a4a652fb3ef5099e077faf4fd2a694ea6e0f806a7737aff9e758a/regex-2024.11.6-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:086a27a0b4ca227941700e0b31425e7a28ef1ae8e5e05a33826e17e47fbfdba0", size = 284617 }, + { url = "https://files.pythonhosted.org/packages/fc/fd/37868b75eaf63843165f1d2122ca6cb94bfc0271e4428cf58c0616786dce/regex-2024.11.6-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bde01f35767c4a7899b7eb6e823b125a64de314a8ee9791367c9a34d56af18d0", size = 795023 }, + { url = "https://files.pythonhosted.org/packages/c4/7c/d4cd9c528502a3dedb5c13c146e7a7a539a3853dc20209c8e75d9ba9d1b2/regex-2024.11.6-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b583904576650166b3d920d2bcce13971f6f9e9a396c673187f49811b2769dc7", size = 833072 }, + { url = "https://files.pythonhosted.org/packages/4f/db/46f563a08f969159c5a0f0e722260568425363bea43bb7ae370becb66a67/regex-2024.11.6-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1c4de13f06a0d54fa0d5ab1b7138bfa0d883220965a29616e3ea61b35d5f5fc7", size = 823130 }, + { url = "https://files.pythonhosted.org/packages/db/60/1eeca2074f5b87df394fccaa432ae3fc06c9c9bfa97c5051aed70e6e00c2/regex-2024.11.6-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3cde6e9f2580eb1665965ce9bf17ff4952f34f5b126beb509fee8f4e994f143c", size = 796857 }, + { url = 
"https://files.pythonhosted.org/packages/10/db/ac718a08fcee981554d2f7bb8402f1faa7e868c1345c16ab1ebec54b0d7b/regex-2024.11.6-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0d7f453dca13f40a02b79636a339c5b62b670141e63efd511d3f8f73fba162b3", size = 784006 }, + { url = "https://files.pythonhosted.org/packages/c2/41/7da3fe70216cea93144bf12da2b87367590bcf07db97604edeea55dac9ad/regex-2024.11.6-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:59dfe1ed21aea057a65c6b586afd2a945de04fc7db3de0a6e3ed5397ad491b07", size = 781650 }, + { url = "https://files.pythonhosted.org/packages/a7/d5/880921ee4eec393a4752e6ab9f0fe28009435417c3102fc413f3fe81c4e5/regex-2024.11.6-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:b97c1e0bd37c5cd7902e65f410779d39eeda155800b65fc4d04cc432efa9bc6e", size = 789545 }, + { url = "https://files.pythonhosted.org/packages/dc/96/53770115e507081122beca8899ab7f5ae28ae790bfcc82b5e38976df6a77/regex-2024.11.6-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:f9d1e379028e0fc2ae3654bac3cbbef81bf3fd571272a42d56c24007979bafb6", size = 853045 }, + { url = "https://files.pythonhosted.org/packages/31/d3/1372add5251cc2d44b451bd94f43b2ec78e15a6e82bff6a290ef9fd8f00a/regex-2024.11.6-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:13291b39131e2d002a7940fb176e120bec5145f3aeb7621be6534e46251912c4", size = 860182 }, + { url = "https://files.pythonhosted.org/packages/ed/e3/c446a64984ea9f69982ba1a69d4658d5014bc7a0ea468a07e1a1265db6e2/regex-2024.11.6-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4f51f88c126370dcec4908576c5a627220da6c09d0bff31cfa89f2523843316d", size = 787733 }, + { url = "https://files.pythonhosted.org/packages/2b/f1/e40c8373e3480e4f29f2692bd21b3e05f296d3afebc7e5dcf21b9756ca1c/regex-2024.11.6-cp313-cp313-win32.whl", hash = "sha256:63b13cfd72e9601125027202cad74995ab26921d8cd935c25f09c630436348ff", size = 262122 }, + { url = "https://files.pythonhosted.org/packages/45/94/bc295babb3062a731f52621cdc992d123111282e291abaf23faa413443ea/regex-2024.11.6-cp313-cp313-win_amd64.whl", hash = "sha256:2b3361af3198667e99927da8b84c1b010752fa4b1115ee30beaa332cabc3ef1a", size = 273545 }, + { url = "https://files.pythonhosted.org/packages/89/23/c4a86df398e57e26f93b13ae63acce58771e04bdde86092502496fa57f9c/regex-2024.11.6-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:5704e174f8ccab2026bd2f1ab6c510345ae8eac818b613d7d73e785f1310f839", size = 482682 }, + { url = "https://files.pythonhosted.org/packages/3c/8b/45c24ab7a51a1658441b961b86209c43e6bb9d39caf1e63f46ce6ea03bc7/regex-2024.11.6-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:220902c3c5cc6af55d4fe19ead504de80eb91f786dc102fbd74894b1551f095e", size = 287679 }, + { url = "https://files.pythonhosted.org/packages/7a/d1/598de10b17fdafc452d11f7dada11c3be4e379a8671393e4e3da3c4070df/regex-2024.11.6-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:5e7e351589da0850c125f1600a4c4ba3c722efefe16b297de54300f08d734fbf", size = 284578 }, + { url = "https://files.pythonhosted.org/packages/49/70/c7eaa219efa67a215846766fde18d92d54cb590b6a04ffe43cef30057622/regex-2024.11.6-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5056b185ca113c88e18223183aa1a50e66507769c9640a6ff75859619d73957b", size = 782012 }, + { url = "https://files.pythonhosted.org/packages/89/e5/ef52c7eb117dd20ff1697968219971d052138965a4d3d9b95e92e549f505/regex-2024.11.6-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:2e34b51b650b23ed3354b5a07aab37034d9f923db2a40519139af34f485f77d0", size = 820580 }, + { url = "https://files.pythonhosted.org/packages/5f/3f/9f5da81aff1d4167ac52711acf789df13e789fe6ac9545552e49138e3282/regex-2024.11.6-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5670bce7b200273eee1840ef307bfa07cda90b38ae56e9a6ebcc9f50da9c469b", size = 809110 }, + { url = "https://files.pythonhosted.org/packages/86/44/2101cc0890c3621b90365c9ee8d7291a597c0722ad66eccd6ffa7f1bcc09/regex-2024.11.6-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:08986dce1339bc932923e7d1232ce9881499a0e02925f7402fb7c982515419ef", size = 780919 }, + { url = "https://files.pythonhosted.org/packages/ce/2e/3e0668d8d1c7c3c0d397bf54d92fc182575b3a26939aed5000d3cc78760f/regex-2024.11.6-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:93c0b12d3d3bc25af4ebbf38f9ee780a487e8bf6954c115b9f015822d3bb8e48", size = 771515 }, + { url = "https://files.pythonhosted.org/packages/a6/49/1bc4584254355e3dba930a3a2fd7ad26ccba3ebbab7d9100db0aff2eedb0/regex-2024.11.6-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:764e71f22ab3b305e7f4c21f1a97e1526a25ebdd22513e251cf376760213da13", size = 696957 }, + { url = "https://files.pythonhosted.org/packages/c8/dd/42879c1fc8a37a887cd08e358af3d3ba9e23038cd77c7fe044a86d9450ba/regex-2024.11.6-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:f056bf21105c2515c32372bbc057f43eb02aae2fda61052e2f7622c801f0b4e2", size = 768088 }, + { url = "https://files.pythonhosted.org/packages/89/96/c05a0fe173cd2acd29d5e13c1adad8b706bcaa71b169e1ee57dcf2e74584/regex-2024.11.6-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:69ab78f848845569401469da20df3e081e6b5a11cb086de3eed1d48f5ed57c95", size = 774752 }, + { url = "https://files.pythonhosted.org/packages/b5/f3/a757748066255f97f14506483436c5f6aded7af9e37bca04ec30c90ca683/regex-2024.11.6-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:86fddba590aad9208e2fa8b43b4c098bb0ec74f15718bb6a704e3c63e2cef3e9", size = 838862 }, + { url = "https://files.pythonhosted.org/packages/5c/93/c6d2092fd479dcaeea40fc8fa673822829181ded77d294a7f950f1dda6e2/regex-2024.11.6-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:684d7a212682996d21ca12ef3c17353c021fe9de6049e19ac8481ec35574a70f", size = 842622 }, + { url = "https://files.pythonhosted.org/packages/ff/9c/daa99532c72f25051a90ef90e1413a8d54413a9e64614d9095b0c1c154d0/regex-2024.11.6-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:a03e02f48cd1abbd9f3b7e3586d97c8f7a9721c436f51a5245b3b9483044480b", size = 772713 }, + { url = "https://files.pythonhosted.org/packages/13/5d/61a533ccb8c231b474ac8e3a7d70155b00dfc61af6cafdccd1947df6d735/regex-2024.11.6-cp39-cp39-win32.whl", hash = "sha256:41758407fc32d5c3c5de163888068cfee69cb4c2be844e7ac517a52770f9af57", size = 261756 }, + { url = "https://files.pythonhosted.org/packages/dc/7b/e59b7f7c91ae110d154370c24133f947262525b5d6406df65f23422acc17/regex-2024.11.6-cp39-cp39-win_amd64.whl", hash = "sha256:b2837718570f95dd41675328e111345f9b7095d821bac435aac173ac80b19983", size = 274110 }, +] + +[[package]] +name = "requests" +version = "2.32.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "certifi" }, + { name = "charset-normalizer" }, + { name = "idna" }, + { name = "urllib3" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/63/70/2bf7780ad2d390a8d301ad0b550f1581eadbd9a20f896afe06353c2a2913/requests-2.32.3.tar.gz", hash = "sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760", size = 131218 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f9/9b/335f9764261e915ed497fcdeb11df5dfd6f7bf257d4a6a2a686d80da4d54/requests-2.32.3-py3-none-any.whl", hash = "sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6", size = 64928 }, +] + +[[package]] +name = "rich" +version = "13.9.4" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "markdown-it-py" }, + { name = "pygments" }, + { name = "typing-extensions", marker = "python_full_version < '3.11'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ab/3a/0316b28d0761c6734d6bc14e770d85506c986c85ffb239e688eeaab2c2bc/rich-13.9.4.tar.gz", hash = "sha256:439594978a49a09530cff7ebc4b5c7103ef57baf48d5ea3184f21d9a2befa098", size = 223149 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/19/71/39c7c0d87f8d4e6c020a393182060eaefeeae6c01dab6a84ec346f2567df/rich-13.9.4-py3-none-any.whl", hash = "sha256:6049d5e6ec054bf2779ab3358186963bac2ea89175919d699e378b99738c2a90", size = 242424 }, +] + +[[package]] +name = "rpds-py" +version = "0.22.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/01/80/cce854d0921ff2f0a9fa831ba3ad3c65cee3a46711addf39a2af52df2cfd/rpds_py-0.22.3.tar.gz", hash = "sha256:e32fee8ab45d3c2db6da19a5323bc3362237c8b653c70194414b892fd06a080d", size = 26771 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/42/2a/ead1d09e57449b99dcc190d8d2323e3a167421d8f8fdf0f217c6f6befe47/rpds_py-0.22.3-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:6c7b99ca52c2c1752b544e310101b98a659b720b21db00e65edca34483259967", size = 359514 }, + { url = "https://files.pythonhosted.org/packages/8f/7e/1254f406b7793b586c68e217a6a24ec79040f85e030fff7e9049069284f4/rpds_py-0.22.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:be2eb3f2495ba669d2a985f9b426c1797b7d48d6963899276d22f23e33d47e37", size = 349031 }, + { url = "https://files.pythonhosted.org/packages/aa/da/17c6a2c73730d426df53675ff9cc6653ac7a60b6438d03c18e1c822a576a/rpds_py-0.22.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:70eb60b3ae9245ddea20f8a4190bd79c705a22f8028aaf8bbdebe4716c3fab24", size = 381485 }, + { url = "https://files.pythonhosted.org/packages/aa/13/2dbacd820466aa2a3c4b747afb18d71209523d353cf865bf8f4796c969ea/rpds_py-0.22.3-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:4041711832360a9b75cfb11b25a6a97c8fb49c07b8bd43d0d02b45d0b499a4ff", size = 386794 }, + { url = "https://files.pythonhosted.org/packages/6d/62/96905d0a35ad4e4bc3c098b2f34b2e7266e211d08635baa690643d2227be/rpds_py-0.22.3-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:64607d4cbf1b7e3c3c8a14948b99345eda0e161b852e122c6bb71aab6d1d798c", size = 423523 }, + { url = "https://files.pythonhosted.org/packages/eb/1b/d12770f2b6a9fc2c3ec0d810d7d440f6d465ccd8b7f16ae5385952c28b89/rpds_py-0.22.3-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:81e69b0a0e2537f26d73b4e43ad7bc8c8efb39621639b4434b76a3de50c6966e", size = 446695 }, + { url = "https://files.pythonhosted.org/packages/4d/cf/96f1fd75512a017f8e07408b6d5dbeb492d9ed46bfe0555544294f3681b3/rpds_py-0.22.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:bc27863442d388870c1809a87507727b799c8460573cfbb6dc0eeaef5a11b5ec", size = 381959 }, + { url = "https://files.pythonhosted.org/packages/ab/f0/d1c5b501c8aea85aeb938b555bfdf7612110a2f8cdc21ae0482c93dd0c24/rpds_py-0.22.3-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:e79dd39f1e8c3504be0607e5fc6e86bb60fe3584bec8b782578c3b0fde8d932c", size = 410420 }, + { url = "https://files.pythonhosted.org/packages/33/3b/45b6c58fb6aad5a569ae40fb890fc494c6b02203505a5008ee6dc68e65f7/rpds_py-0.22.3-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:e0fa2d4ec53dc51cf7d3bb22e0aa0143966119f42a0c3e4998293a3dd2856b09", size = 557620 }, + { url = "https://files.pythonhosted.org/packages/83/62/3fdd2d3d47bf0bb9b931c4c73036b4ab3ec77b25e016ae26fab0f02be2af/rpds_py-0.22.3-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:fda7cb070f442bf80b642cd56483b5548e43d366fe3f39b98e67cce780cded00", size = 584202 }, + { url = "https://files.pythonhosted.org/packages/04/f2/5dced98b64874b84ca824292f9cee2e3f30f3bcf231d15a903126684f74d/rpds_py-0.22.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:cff63a0272fcd259dcc3be1657b07c929c466b067ceb1c20060e8d10af56f5bf", size = 552787 }, + { url = "https://files.pythonhosted.org/packages/67/13/2273dea1204eda0aea0ef55145da96a9aa28b3f88bb5c70e994f69eda7c3/rpds_py-0.22.3-cp310-cp310-win32.whl", hash = "sha256:9bd7228827ec7bb817089e2eb301d907c0d9827a9e558f22f762bb690b131652", size = 220088 }, + { url = "https://files.pythonhosted.org/packages/4e/80/8c8176b67ad7f4a894967a7a4014ba039626d96f1d4874d53e409b58d69f/rpds_py-0.22.3-cp310-cp310-win_amd64.whl", hash = "sha256:9beeb01d8c190d7581a4d59522cd3d4b6887040dcfc744af99aa59fef3e041a8", size = 231737 }, + { url = "https://files.pythonhosted.org/packages/15/ad/8d1ddf78f2805a71253fcd388017e7b4a0615c22c762b6d35301fef20106/rpds_py-0.22.3-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:d20cfb4e099748ea39e6f7b16c91ab057989712d31761d3300d43134e26e165f", size = 359773 }, + { url = "https://files.pythonhosted.org/packages/c8/75/68c15732293a8485d79fe4ebe9045525502a067865fa4278f178851b2d87/rpds_py-0.22.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:68049202f67380ff9aa52f12e92b1c30115f32e6895cd7198fa2a7961621fc5a", size = 349214 }, + { url = "https://files.pythonhosted.org/packages/3c/4c/7ce50f3070083c2e1b2bbd0fb7046f3da55f510d19e283222f8f33d7d5f4/rpds_py-0.22.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fb4f868f712b2dd4bcc538b0a0c1f63a2b1d584c925e69a224d759e7070a12d5", size = 380477 }, + { url = "https://files.pythonhosted.org/packages/9a/e9/835196a69cb229d5c31c13b8ae603bd2da9a6695f35fe4270d398e1db44c/rpds_py-0.22.3-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:bc51abd01f08117283c5ebf64844a35144a0843ff7b2983e0648e4d3d9f10dbb", size = 386171 }, + { url = "https://files.pythonhosted.org/packages/f9/8e/33fc4eba6683db71e91e6d594a2cf3a8fbceb5316629f0477f7ece5e3f75/rpds_py-0.22.3-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0f3cec041684de9a4684b1572fe28c7267410e02450f4561700ca5a3bc6695a2", size = 422676 }, + { url = "https://files.pythonhosted.org/packages/37/47/2e82d58f8046a98bb9497a8319604c92b827b94d558df30877c4b3c6ccb3/rpds_py-0.22.3-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7ef9d9da710be50ff6809fed8f1963fecdfecc8b86656cadfca3bc24289414b0", size = 446152 }, + { url = 
"https://files.pythonhosted.org/packages/e1/78/79c128c3e71abbc8e9739ac27af11dc0f91840a86fce67ff83c65d1ba195/rpds_py-0.22.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:59f4a79c19232a5774aee369a0c296712ad0e77f24e62cad53160312b1c1eaa1", size = 381300 }, + { url = "https://files.pythonhosted.org/packages/c9/5b/2e193be0e8b228c1207f31fa3ea79de64dadb4f6a4833111af8145a6bc33/rpds_py-0.22.3-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:1a60bce91f81ddaac922a40bbb571a12c1070cb20ebd6d49c48e0b101d87300d", size = 409636 }, + { url = "https://files.pythonhosted.org/packages/c2/3f/687c7100b762d62186a1c1100ffdf99825f6fa5ea94556844bbbd2d0f3a9/rpds_py-0.22.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:e89391e6d60251560f0a8f4bd32137b077a80d9b7dbe6d5cab1cd80d2746f648", size = 556708 }, + { url = "https://files.pythonhosted.org/packages/8c/a2/c00cbc4b857e8b3d5e7f7fc4c81e23afd8c138b930f4f3ccf9a41a23e9e4/rpds_py-0.22.3-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:e3fb866d9932a3d7d0c82da76d816996d1667c44891bd861a0f97ba27e84fc74", size = 583554 }, + { url = "https://files.pythonhosted.org/packages/d0/08/696c9872cf56effdad9ed617ac072f6774a898d46b8b8964eab39ec562d2/rpds_py-0.22.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:1352ae4f7c717ae8cba93421a63373e582d19d55d2ee2cbb184344c82d2ae55a", size = 552105 }, + { url = "https://files.pythonhosted.org/packages/18/1f/4df560be1e994f5adf56cabd6c117e02de7c88ee238bb4ce03ed50da9d56/rpds_py-0.22.3-cp311-cp311-win32.whl", hash = "sha256:b0b4136a252cadfa1adb705bb81524eee47d9f6aab4f2ee4fa1e9d3cd4581f64", size = 220199 }, + { url = "https://files.pythonhosted.org/packages/b8/1b/c29b570bc5db8237553002788dc734d6bd71443a2ceac2a58202ec06ef12/rpds_py-0.22.3-cp311-cp311-win_amd64.whl", hash = "sha256:8bd7c8cfc0b8247c8799080fbff54e0b9619e17cdfeb0478ba7295d43f635d7c", size = 231775 }, + { url = "https://files.pythonhosted.org/packages/75/47/3383ee3bd787a2a5e65a9b9edc37ccf8505c0a00170e3a5e6ea5fbcd97f7/rpds_py-0.22.3-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:27e98004595899949bd7a7b34e91fa7c44d7a97c40fcaf1d874168bb652ec67e", size = 352334 }, + { url = "https://files.pythonhosted.org/packages/40/14/aa6400fa8158b90a5a250a77f2077c0d0cd8a76fce31d9f2b289f04c6dec/rpds_py-0.22.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:1978d0021e943aae58b9b0b196fb4895a25cc53d3956b8e35e0b7682eefb6d56", size = 342111 }, + { url = "https://files.pythonhosted.org/packages/7d/06/395a13bfaa8a28b302fb433fb285a67ce0ea2004959a027aea8f9c52bad4/rpds_py-0.22.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:655ca44a831ecb238d124e0402d98f6212ac527a0ba6c55ca26f616604e60a45", size = 384286 }, + { url = "https://files.pythonhosted.org/packages/43/52/d8eeaffab047e6b7b7ef7f00d5ead074a07973968ffa2d5820fa131d7852/rpds_py-0.22.3-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:feea821ee2a9273771bae61194004ee2fc33f8ec7db08117ef9147d4bbcbca8e", size = 391739 }, + { url = "https://files.pythonhosted.org/packages/83/31/52dc4bde85c60b63719610ed6f6d61877effdb5113a72007679b786377b8/rpds_py-0.22.3-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:22bebe05a9ffc70ebfa127efbc429bc26ec9e9b4ee4d15a740033efda515cf3d", size = 427306 }, + { url = "https://files.pythonhosted.org/packages/70/d5/1bab8e389c2261dba1764e9e793ed6830a63f830fdbec581a242c7c46bda/rpds_py-0.22.3-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:3af6e48651c4e0d2d166dc1b033b7042ea3f871504b6805ba5f4fe31581d8d38", size = 442717 }, + { url = "https://files.pythonhosted.org/packages/82/a1/a45f3e30835b553379b3a56ea6c4eb622cf11e72008229af840e4596a8ea/rpds_py-0.22.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e67ba3c290821343c192f7eae1d8fd5999ca2dc99994114643e2f2d3e6138b15", size = 385721 }, + { url = "https://files.pythonhosted.org/packages/a6/27/780c942de3120bdd4d0e69583f9c96e179dfff082f6ecbb46b8d6488841f/rpds_py-0.22.3-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:02fbb9c288ae08bcb34fb41d516d5eeb0455ac35b5512d03181d755d80810059", size = 415824 }, + { url = "https://files.pythonhosted.org/packages/94/0b/aa0542ca88ad20ea719b06520f925bae348ea5c1fdf201b7e7202d20871d/rpds_py-0.22.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:f56a6b404f74ab372da986d240e2e002769a7d7102cc73eb238a4f72eec5284e", size = 561227 }, + { url = "https://files.pythonhosted.org/packages/0d/92/3ed77d215f82c8f844d7f98929d56cc321bb0bcfaf8f166559b8ec56e5f1/rpds_py-0.22.3-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:0a0461200769ab3b9ab7e513f6013b7a97fdeee41c29b9db343f3c5a8e2b9e61", size = 587424 }, + { url = "https://files.pythonhosted.org/packages/09/42/cacaeb047a22cab6241f107644f230e2935d4efecf6488859a7dd82fc47d/rpds_py-0.22.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:8633e471c6207a039eff6aa116e35f69f3156b3989ea3e2d755f7bc41754a4a7", size = 555953 }, + { url = "https://files.pythonhosted.org/packages/e6/52/c921dc6d5f5d45b212a456c1f5b17df1a471127e8037eb0972379e39dff4/rpds_py-0.22.3-cp312-cp312-win32.whl", hash = "sha256:593eba61ba0c3baae5bc9be2f5232430453fb4432048de28399ca7376de9c627", size = 221339 }, + { url = "https://files.pythonhosted.org/packages/f2/c7/f82b5be1e8456600395366f86104d1bd8d0faed3802ad511ef6d60c30d98/rpds_py-0.22.3-cp312-cp312-win_amd64.whl", hash = "sha256:d115bffdd417c6d806ea9069237a4ae02f513b778e3789a359bc5856e0404cc4", size = 235786 }, + { url = "https://files.pythonhosted.org/packages/d0/bf/36d5cc1f2c609ae6e8bf0fc35949355ca9d8790eceb66e6385680c951e60/rpds_py-0.22.3-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:ea7433ce7e4bfc3a85654aeb6747babe3f66eaf9a1d0c1e7a4435bbdf27fea84", size = 351657 }, + { url = "https://files.pythonhosted.org/packages/24/2a/f1e0fa124e300c26ea9382e59b2d582cba71cedd340f32d1447f4f29fa4e/rpds_py-0.22.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:6dd9412824c4ce1aca56c47b0991e65bebb7ac3f4edccfd3f156150c96a7bf25", size = 341829 }, + { url = "https://files.pythonhosted.org/packages/cf/c2/0da1231dd16953845bed60d1a586fcd6b15ceaeb965f4d35cdc71f70f606/rpds_py-0.22.3-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:20070c65396f7373f5df4005862fa162db5d25d56150bddd0b3e8214e8ef45b4", size = 384220 }, + { url = "https://files.pythonhosted.org/packages/c7/73/a4407f4e3a00a9d4b68c532bf2d873d6b562854a8eaff8faa6133b3588ec/rpds_py-0.22.3-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:0b09865a9abc0ddff4e50b5ef65467cd94176bf1e0004184eb915cbc10fc05c5", size = 391009 }, + { url = "https://files.pythonhosted.org/packages/a9/c3/04b7353477ab360fe2563f5f0b176d2105982f97cd9ae80a9c5a18f1ae0f/rpds_py-0.22.3-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3453e8d41fe5f17d1f8e9c383a7473cd46a63661628ec58e07777c2fff7196dc", size = 426989 }, + { url = 
"https://files.pythonhosted.org/packages/8d/e6/e4b85b722bcf11398e17d59c0f6049d19cd606d35363221951e6d625fcb0/rpds_py-0.22.3-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f5d36399a1b96e1a5fdc91e0522544580dbebeb1f77f27b2b0ab25559e103b8b", size = 441544 }, + { url = "https://files.pythonhosted.org/packages/27/fc/403e65e56f65fff25f2973216974976d3f0a5c3f30e53758589b6dc9b79b/rpds_py-0.22.3-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:009de23c9c9ee54bf11303a966edf4d9087cd43a6003672e6aa7def643d06518", size = 385179 }, + { url = "https://files.pythonhosted.org/packages/57/9b/2be9ff9700d664d51fd96b33d6595791c496d2778cb0b2a634f048437a55/rpds_py-0.22.3-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:1aef18820ef3e4587ebe8b3bc9ba6e55892a6d7b93bac6d29d9f631a3b4befbd", size = 415103 }, + { url = "https://files.pythonhosted.org/packages/bb/a5/03c2ad8ca10994fcf22dd2150dd1d653bc974fa82d9a590494c84c10c641/rpds_py-0.22.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:f60bd8423be1d9d833f230fdbccf8f57af322d96bcad6599e5a771b151398eb2", size = 560916 }, + { url = "https://files.pythonhosted.org/packages/ba/2e/be4fdfc8b5b576e588782b56978c5b702c5a2307024120d8aeec1ab818f0/rpds_py-0.22.3-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:62d9cfcf4948683a18a9aff0ab7e1474d407b7bab2ca03116109f8464698ab16", size = 587062 }, + { url = "https://files.pythonhosted.org/packages/67/e0/2034c221937709bf9c542603d25ad43a68b4b0a9a0c0b06a742f2756eb66/rpds_py-0.22.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:9253fc214112405f0afa7db88739294295f0e08466987f1d70e29930262b4c8f", size = 555734 }, + { url = "https://files.pythonhosted.org/packages/ea/ce/240bae07b5401a22482b58e18cfbabaa392409b2797da60223cca10d7367/rpds_py-0.22.3-cp313-cp313-win32.whl", hash = "sha256:fb0ba113b4983beac1a2eb16faffd76cb41e176bf58c4afe3e14b9c681f702de", size = 220663 }, + { url = "https://files.pythonhosted.org/packages/cb/f0/d330d08f51126330467edae2fa4efa5cec8923c87551a79299380fdea30d/rpds_py-0.22.3-cp313-cp313-win_amd64.whl", hash = "sha256:c58e2339def52ef6b71b8f36d13c3688ea23fa093353f3a4fee2556e62086ec9", size = 235503 }, + { url = "https://files.pythonhosted.org/packages/f7/c4/dbe1cc03df013bf2feb5ad00615038050e7859f381e96fb5b7b4572cd814/rpds_py-0.22.3-cp313-cp313t-macosx_10_12_x86_64.whl", hash = "sha256:f82a116a1d03628a8ace4859556fb39fd1424c933341a08ea3ed6de1edb0283b", size = 347698 }, + { url = "https://files.pythonhosted.org/packages/a4/3a/684f66dd6b0f37499cad24cd1c0e523541fd768576fa5ce2d0a8799c3cba/rpds_py-0.22.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:3dfcbc95bd7992b16f3f7ba05af8a64ca694331bd24f9157b49dadeeb287493b", size = 337330 }, + { url = "https://files.pythonhosted.org/packages/82/eb/e022c08c2ce2e8f7683baa313476492c0e2c1ca97227fe8a75d9f0181e95/rpds_py-0.22.3-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:59259dc58e57b10e7e18ce02c311804c10c5a793e6568f8af4dead03264584d1", size = 380022 }, + { url = "https://files.pythonhosted.org/packages/e4/21/5a80e653e4c86aeb28eb4fea4add1f72e1787a3299687a9187105c3ee966/rpds_py-0.22.3-cp313-cp313t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5725dd9cc02068996d4438d397e255dcb1df776b7ceea3b9cb972bdb11260a83", size = 390754 }, + { url = "https://files.pythonhosted.org/packages/37/a4/d320a04ae90f72d080b3d74597074e62be0a8ecad7d7321312dfe2dc5a6a/rpds_py-0.22.3-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:99b37292234e61325e7a5bb9689e55e48c3f5f603af88b1642666277a81f1fbd", size = 423840 }, + { url = "https://files.pythonhosted.org/packages/87/70/674dc47d93db30a6624279284e5631be4c3a12a0340e8e4f349153546728/rpds_py-0.22.3-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:27b1d3b3915a99208fee9ab092b8184c420f2905b7d7feb4aeb5e4a9c509b8a1", size = 438970 }, + { url = "https://files.pythonhosted.org/packages/3f/64/9500f4d66601d55cadd21e90784cfd5d5f4560e129d72e4339823129171c/rpds_py-0.22.3-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f612463ac081803f243ff13cccc648578e2279295048f2a8d5eb430af2bae6e3", size = 383146 }, + { url = "https://files.pythonhosted.org/packages/4d/45/630327addb1d17173adcf4af01336fd0ee030c04798027dfcb50106001e0/rpds_py-0.22.3-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f73d3fef726b3243a811121de45193c0ca75f6407fe66f3f4e183c983573e130", size = 408294 }, + { url = "https://files.pythonhosted.org/packages/5f/ef/8efb3373cee54ea9d9980b772e5690a0c9e9214045a4e7fa35046e399fee/rpds_py-0.22.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:3f21f0495edea7fdbaaa87e633a8689cd285f8f4af5c869f27bc8074638ad69c", size = 556345 }, + { url = "https://files.pythonhosted.org/packages/54/01/151d3b9ef4925fc8f15bfb131086c12ec3c3d6dd4a4f7589c335bf8e85ba/rpds_py-0.22.3-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:1e9663daaf7a63ceccbbb8e3808fe90415b0757e2abddbfc2e06c857bf8c5e2b", size = 582292 }, + { url = "https://files.pythonhosted.org/packages/30/89/35fc7a6cdf3477d441c7aca5e9bbf5a14e0f25152aed7f63f4e0b141045d/rpds_py-0.22.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:a76e42402542b1fae59798fab64432b2d015ab9d0c8c47ba7addddbaf7952333", size = 553855 }, + { url = "https://files.pythonhosted.org/packages/8f/e0/830c02b2457c4bd20a8c5bb394d31d81f57fbefce2dbdd2e31feff4f7003/rpds_py-0.22.3-cp313-cp313t-win32.whl", hash = "sha256:69803198097467ee7282750acb507fba35ca22cc3b85f16cf45fb01cb9097730", size = 219100 }, + { url = "https://files.pythonhosted.org/packages/f8/30/7ac943f69855c2db77407ae363484b915d861702dbba1aa82d68d57f42be/rpds_py-0.22.3-cp313-cp313t-win_amd64.whl", hash = "sha256:f5cf2a0c2bdadf3791b5c205d55a37a54025c6e18a71c71f82bb536cf9a454bf", size = 233794 }, + { url = "https://files.pythonhosted.org/packages/db/0f/a8ad17ddac7c880f48d5da50733dd25bfc35ba2be1bec9f23453e8c7a123/rpds_py-0.22.3-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:378753b4a4de2a7b34063d6f95ae81bfa7b15f2c1a04a9518e8644e81807ebea", size = 359735 }, + { url = "https://files.pythonhosted.org/packages/0c/41/430903669397ea3ee76865e0b53ea236e8dc0ffbecde47b2c4c783ad6759/rpds_py-0.22.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3445e07bf2e8ecfeef6ef67ac83de670358abf2996916039b16a218e3d95e97e", size = 348724 }, + { url = "https://files.pythonhosted.org/packages/c9/5c/3496f4f0ee818297544f2d5f641c49dde8ae156392e6834b79c0609ba006/rpds_py-0.22.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7b2513ba235829860b13faa931f3b6846548021846ac808455301c23a101689d", size = 381782 }, + { url = "https://files.pythonhosted.org/packages/b6/dc/db0523ce0cd16ce579185cc9aa9141992de956d0a9c469ecfd1fb5d54ddc/rpds_py-0.22.3-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:eaf16ae9ae519a0e237a0f528fd9f0197b9bb70f40263ee57ae53c2b8d48aeb3", size = 387036 }, + { url = 
"https://files.pythonhosted.org/packages/85/2a/9525c2427d2c257f877348918136a5d4e1b945c205a256e53bec61e54551/rpds_py-0.22.3-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:583f6a1993ca3369e0f80ba99d796d8e6b1a3a2a442dd4e1a79e652116413091", size = 424566 }, + { url = "https://files.pythonhosted.org/packages/b9/1c/f8c012a39794b84069635709f559c0309103d5d74b3f5013916e6ca4f174/rpds_py-0.22.3-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4617e1915a539a0d9a9567795023de41a87106522ff83fbfaf1f6baf8e85437e", size = 447203 }, + { url = "https://files.pythonhosted.org/packages/93/f5/c1c772364570d35b98ba64f36ec90c3c6d0b932bc4d8b9b4efef6dc64b07/rpds_py-0.22.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0c150c7a61ed4a4f4955a96626574e9baf1adf772c2fb61ef6a5027e52803543", size = 382283 }, + { url = "https://files.pythonhosted.org/packages/10/06/f94f61313f94fc75c3c3aa74563f80bbd990e5b25a7c1a38cee7d5d0309b/rpds_py-0.22.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2fa4331c200c2521512595253f5bb70858b90f750d39b8cbfd67465f8d1b596d", size = 410022 }, + { url = "https://files.pythonhosted.org/packages/3f/b0/37ab416a9528419920dfb64886c220f58fcbd66b978e0a91b66e9ee9a993/rpds_py-0.22.3-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:214b7a953d73b5e87f0ebece4a32a5bd83c60a3ecc9d4ec8f1dca968a2d91e99", size = 557817 }, + { url = "https://files.pythonhosted.org/packages/2c/5d/9daa18adcd676dd3b2817c8a7cec3f3ebeeb0ce0d05a1b63bf994fc5114f/rpds_py-0.22.3-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:f47ad3d5f3258bd7058d2d506852217865afefe6153a36eb4b6928758041d831", size = 585099 }, + { url = "https://files.pythonhosted.org/packages/41/3f/ad4e58035d3f848410aa3d59857b5f238bafab81c8b4a844281f80445d62/rpds_py-0.22.3-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:f276b245347e6e36526cbd4a266a417796fc531ddf391e43574cf6466c492520", size = 552818 }, + { url = "https://files.pythonhosted.org/packages/b8/19/123acae8f4cab3c9463097c3ced3cc87c46f405056e249c874940e045309/rpds_py-0.22.3-cp39-cp39-win32.whl", hash = "sha256:bbb232860e3d03d544bc03ac57855cd82ddf19c7a07651a7c0fdb95e9efea8b9", size = 220246 }, + { url = "https://files.pythonhosted.org/packages/8b/8d/9db93e48d96ace1f6713c71ce72e2d94b71d82156c37b6a54e0930486f00/rpds_py-0.22.3-cp39-cp39-win_amd64.whl", hash = "sha256:cfbc454a2880389dbb9b5b398e50d439e2e58669160f27b60e5eca11f68ae17c", size = 231932 }, + { url = "https://files.pythonhosted.org/packages/8b/63/e29f8ee14fcf383574f73b6bbdcbec0fbc2e5fc36b4de44d1ac389b1de62/rpds_py-0.22.3-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:d48424e39c2611ee1b84ad0f44fb3b2b53d473e65de061e3f460fc0be5f1939d", size = 360786 }, + { url = "https://files.pythonhosted.org/packages/d3/e0/771ee28b02a24e81c8c0e645796a371350a2bb6672753144f36ae2d2afc9/rpds_py-0.22.3-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:24e8abb5878e250f2eb0d7859a8e561846f98910326d06c0d51381fed59357bd", size = 350589 }, + { url = "https://files.pythonhosted.org/packages/cf/49/abad4c4a1e6f3adf04785a99c247bfabe55ed868133e2d1881200aa5d381/rpds_py-0.22.3-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4b232061ca880db21fa14defe219840ad9b74b6158adb52ddf0e87bead9e8493", size = 381848 }, + { url = "https://files.pythonhosted.org/packages/3a/7d/f4bc6d6fbe6af7a0d2b5f2ee77079efef7c8528712745659ec0026888998/rpds_py-0.22.3-pp310-pypy310_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = 
"sha256:ac0a03221cdb5058ce0167ecc92a8c89e8d0decdc9e99a2ec23380793c4dcb96", size = 387879 }, + { url = "https://files.pythonhosted.org/packages/13/b0/575c797377fdcd26cedbb00a3324232e4cb2c5d121f6e4b0dbf8468b12ef/rpds_py-0.22.3-pp310-pypy310_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:eb0c341fa71df5a4595f9501df4ac5abfb5a09580081dffbd1ddd4654e6e9123", size = 423916 }, + { url = "https://files.pythonhosted.org/packages/54/78/87157fa39d58f32a68d3326f8a81ad8fb99f49fe2aa7ad9a1b7d544f9478/rpds_py-0.22.3-pp310-pypy310_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:bf9db5488121b596dbfc6718c76092fda77b703c1f7533a226a5a9f65248f8ad", size = 448410 }, + { url = "https://files.pythonhosted.org/packages/59/69/860f89996065a88be1b6ff2d60e96a02b920a262d8aadab99e7903986597/rpds_py-0.22.3-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b8db6b5b2d4491ad5b6bdc2bc7c017eec108acbf4e6785f42a9eb0ba234f4c9", size = 382841 }, + { url = "https://files.pythonhosted.org/packages/bd/d7/bc144e10d27e3cb350f98df2492a319edd3caaf52ddfe1293f37a9afbfd7/rpds_py-0.22.3-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b3d504047aba448d70cf6fa22e06cb09f7cbd761939fdd47604f5e007675c24e", size = 409662 }, + { url = "https://files.pythonhosted.org/packages/14/2a/6bed0b05233c291a94c7e89bc76ffa1c619d4e1979fbfe5d96024020c1fb/rpds_py-0.22.3-pp310-pypy310_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:e61b02c3f7a1e0b75e20c3978f7135fd13cb6cf551bf4a6d29b999a88830a338", size = 558221 }, + { url = "https://files.pythonhosted.org/packages/11/23/cd8f566de444a137bc1ee5795e47069a947e60810ba4152886fe5308e1b7/rpds_py-0.22.3-pp310-pypy310_pp73-musllinux_1_2_i686.whl", hash = "sha256:e35ba67d65d49080e8e5a1dd40101fccdd9798adb9b050ff670b7d74fa41c566", size = 583780 }, + { url = "https://files.pythonhosted.org/packages/8d/63/79c3602afd14d501f751e615a74a59040328da5ef29ed5754ae80d236b84/rpds_py-0.22.3-pp310-pypy310_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:26fd7cac7dd51011a245f29a2cc6489c4608b5a8ce8d75661bb4a1066c52dfbe", size = 553619 }, + { url = "https://files.pythonhosted.org/packages/9f/2e/c5c1689e80298d4e94c75b70faada4c25445739d91b94c211244a3ed7ed1/rpds_py-0.22.3-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:177c7c0fce2855833819c98e43c262007f42ce86651ffbb84f37883308cb0e7d", size = 233338 }, + { url = "https://files.pythonhosted.org/packages/bc/b7/d2c205723e3b4d75b03215694f0297a1b4b395bf834cb5896ad9bbb90f90/rpds_py-0.22.3-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:bb47271f60660803ad11f4c61b42242b8c1312a31c98c578f79ef9387bbde21c", size = 360594 }, + { url = "https://files.pythonhosted.org/packages/d8/8f/c3515f5234cf6055046d4cfe9c80a3742a20acfa7d0b1b290f0d7f56a8db/rpds_py-0.22.3-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:70fb28128acbfd264eda9bf47015537ba3fe86e40d046eb2963d75024be4d055", size = 349594 }, + { url = "https://files.pythonhosted.org/packages/6b/98/5b487cb06afc484befe350c87fda37f4ce11333f04f3380aba43dcf5bce2/rpds_py-0.22.3-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:44d61b4b7d0c2c9ac019c314e52d7cbda0ae31078aabd0f22e583af3e0d79723", size = 381138 }, + { url = "https://files.pythonhosted.org/packages/5e/3a/12308d2c51b3fdfc173619943b7dc5ba41b4850c47112eeda38d9c54ed12/rpds_py-0.22.3-pp39-pypy39_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5f0e260eaf54380380ac3808aa4ebe2d8ca28b9087cf411649f96bad6900c728", size = 387828 }, + { url = 
"https://files.pythonhosted.org/packages/17/b2/c242241ab5a2a206e093f24ccbfa519c4bbf10a762ac90bffe1766c225e0/rpds_py-0.22.3-pp39-pypy39_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b25bc607423935079e05619d7de556c91fb6adeae9d5f80868dde3468657994b", size = 424634 }, + { url = "https://files.pythonhosted.org/packages/d5/c7/52a1b15012139f3ba740f291f1d03c6b632938ba61bc605f24c101952493/rpds_py-0.22.3-pp39-pypy39_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fb6116dfb8d1925cbdb52595560584db42a7f664617a1f7d7f6e32f138cdf37d", size = 447862 }, + { url = "https://files.pythonhosted.org/packages/55/3e/4d3ed8fd01bad77e8ed101116fe63b03f1011940d9596a8f4d82ac80cacd/rpds_py-0.22.3-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a63cbdd98acef6570c62b92a1e43266f9e8b21e699c363c0fef13bd530799c11", size = 382506 }, + { url = "https://files.pythonhosted.org/packages/30/78/df59d6f92470a84369a3757abeae1cfd7f7239c8beb6d948949bf78317d2/rpds_py-0.22.3-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2b8f60e1b739a74bab7e01fcbe3dddd4657ec685caa04681df9d562ef15b625f", size = 410534 }, + { url = "https://files.pythonhosted.org/packages/38/97/ea45d1edd9b753b20084b52dd5db6ee5e1ac3e036a27149972398a413858/rpds_py-0.22.3-pp39-pypy39_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:2e8b55d8517a2fda8d95cb45d62a5a8bbf9dd0ad39c5b25c8833efea07b880ca", size = 557453 }, + { url = "https://files.pythonhosted.org/packages/08/cd/3a1b35eb9da27ffbb981cfffd32a01c7655c4431ccb278cb3064f8887462/rpds_py-0.22.3-pp39-pypy39_pp73-musllinux_1_2_i686.whl", hash = "sha256:2de29005e11637e7a2361fa151f780ff8eb2543a0da1413bb951e9f14b699ef3", size = 584412 }, + { url = "https://files.pythonhosted.org/packages/87/91/31d1c5aeb1606f71188259e0ba6ed6f5c21a3c72f58b51db6a8bd0aa2b5d/rpds_py-0.22.3-pp39-pypy39_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:666ecce376999bf619756a24ce15bb14c5bfaf04bf00abc7e663ce17c3f34fe7", size = 553446 }, + { url = "https://files.pythonhosted.org/packages/e7/ad/03b5ccd1ab492c9dece85b3bf1c96453ab8c47983936fae6880f688f60b3/rpds_py-0.22.3-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:5246b14ca64a8675e0a7161f7af68fe3e910e6b90542b4bfb5439ba752191df6", size = 233013 }, +] + +[[package]] +name = "safetensors" +version = "0.5.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f4/4f/2ef9ef1766f8c194b01b67a63a444d2e557c8fe1d82faf3ebd85f370a917/safetensors-0.5.2.tar.gz", hash = "sha256:cb4a8d98ba12fa016f4241932b1fc5e702e5143f5374bba0bbcf7ddc1c4cf2b8", size = 66957 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/96/d1/017e31e75e274492a11a456a9e7c171f8f7911fe50735b4ec6ff37221220/safetensors-0.5.2-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:45b6092997ceb8aa3801693781a71a99909ab9cc776fbc3fa9322d29b1d3bef2", size = 427067 }, + { url = "https://files.pythonhosted.org/packages/24/84/e9d3ff57ae50dd0028f301c9ee064e5087fe8b00e55696677a0413c377a7/safetensors-0.5.2-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:6d0d6a8ee2215a440e1296b843edf44fd377b055ba350eaba74655a2fe2c4bae", size = 408856 }, + { url = "https://files.pythonhosted.org/packages/f1/1d/fe95f5dd73db16757b11915e8a5106337663182d0381811c81993e0014a9/safetensors-0.5.2-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:86016d40bcaa3bcc9a56cd74d97e654b5f4f4abe42b038c71e4f00a089c4526c", size = 450088 }, + { url = 
"https://files.pythonhosted.org/packages/cf/21/e527961b12d5ab528c6e47b92d5f57f33563c28a972750b238b871924e49/safetensors-0.5.2-cp38-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:990833f70a5f9c7d3fc82c94507f03179930ff7d00941c287f73b6fcbf67f19e", size = 458966 }, + { url = "https://files.pythonhosted.org/packages/a5/8b/1a037d7a57f86837c0b41905040369aea7d8ca1ec4b2a77592372b2ec380/safetensors-0.5.2-cp38-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3dfa7c2f3fe55db34eba90c29df94bcdac4821043fc391cb5d082d9922013869", size = 509915 }, + { url = "https://files.pythonhosted.org/packages/61/3d/03dd5cfd33839df0ee3f4581a20bd09c40246d169c0e4518f20b21d5f077/safetensors-0.5.2-cp38-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:46ff2116150ae70a4e9c490d2ab6b6e1b1b93f25e520e540abe1b81b48560c3a", size = 527664 }, + { url = "https://files.pythonhosted.org/packages/c5/dc/8952caafa9a10a3c0f40fa86bacf3190ae7f55fa5eef87415b97b29cb97f/safetensors-0.5.2-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ab696dfdc060caffb61dbe4066b86419107a24c804a4e373ba59be699ebd8d5", size = 461978 }, + { url = "https://files.pythonhosted.org/packages/60/da/82de1fcf1194e3dbefd4faa92dc98b33c06bed5d67890e0962dd98e18287/safetensors-0.5.2-cp38-abi3-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:03c937100f38c9ff4c1507abea9928a6a9b02c9c1c9c3609ed4fb2bf413d4975", size = 491253 }, + { url = "https://files.pythonhosted.org/packages/5a/9a/d90e273c25f90c3ba1b0196a972003786f04c39e302fbd6649325b1272bb/safetensors-0.5.2-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:a00e737948791b94dad83cf0eafc09a02c4d8c2171a239e8c8572fe04e25960e", size = 628644 }, + { url = "https://files.pythonhosted.org/packages/70/3c/acb23e05aa34b4f5edd2e7f393f8e6480fbccd10601ab42cd03a57d4ab5f/safetensors-0.5.2-cp38-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:d3a06fae62418ec8e5c635b61a8086032c9e281f16c63c3af46a6efbab33156f", size = 721648 }, + { url = "https://files.pythonhosted.org/packages/71/45/eaa3dba5253a7c6931230dc961641455710ab231f8a89cb3c4c2af70f8c8/safetensors-0.5.2-cp38-abi3-musllinux_1_2_i686.whl", hash = "sha256:1506e4c2eda1431099cebe9abf6c76853e95d0b7a95addceaa74c6019c65d8cf", size = 659588 }, + { url = "https://files.pythonhosted.org/packages/b0/71/2f9851164f821064d43b481ddbea0149c2d676c4f4e077b178e7eeaa6660/safetensors-0.5.2-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:5c5b5d9da594f638a259fca766046f44c97244cc7ab8bef161b3e80d04becc76", size = 632533 }, + { url = "https://files.pythonhosted.org/packages/00/f1/5680e2ef61d9c61454fad82c344f0e40b8741a9dbd1e31484f0d31a9b1c3/safetensors-0.5.2-cp38-abi3-win32.whl", hash = "sha256:fe55c039d97090d1f85277d402954dd6ad27f63034fa81985a9cc59655ac3ee2", size = 291167 }, + { url = "https://files.pythonhosted.org/packages/86/ca/aa489392ec6fb59223ffce825461e1f811a3affd417121a2088be7a5758b/safetensors-0.5.2-cp38-abi3-win_amd64.whl", hash = "sha256:78abdddd03a406646107f973c7843276e7b64e5e32623529dc17f3d94a20f589", size = 303756 }, +] + +[[package]] +name = "scipy" +version = "1.13.1" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version < '3.10'", +] +dependencies = [ + { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ae/00/48c2f661e2816ccf2ecd77982f6605b2950afe60f60a52b4cbbc2504aa8f/scipy-1.13.1.tar.gz", hash = 
"sha256:095a87a0312b08dfd6a6155cbbd310a8c51800fc931b8c0b84003014b874ed3c", size = 57210720 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/33/59/41b2529908c002ade869623b87eecff3e11e3ce62e996d0bdcb536984187/scipy-1.13.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:20335853b85e9a49ff7572ab453794298bcf0354d8068c5f6775a0eabf350aca", size = 39328076 }, + { url = "https://files.pythonhosted.org/packages/d5/33/f1307601f492f764062ce7dd471a14750f3360e33cd0f8c614dae208492c/scipy-1.13.1-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:d605e9c23906d1994f55ace80e0125c587f96c020037ea6aa98d01b4bd2e222f", size = 30306232 }, + { url = "https://files.pythonhosted.org/packages/c0/66/9cd4f501dd5ea03e4a4572ecd874936d0da296bd04d1c45ae1a4a75d9c3a/scipy-1.13.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cfa31f1def5c819b19ecc3a8b52d28ffdcc7ed52bb20c9a7589669dd3c250989", size = 33743202 }, + { url = "https://files.pythonhosted.org/packages/a3/ba/7255e5dc82a65adbe83771c72f384d99c43063648456796436c9a5585ec3/scipy-1.13.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f26264b282b9da0952a024ae34710c2aff7d27480ee91a2e82b7b7073c24722f", size = 38577335 }, + { url = "https://files.pythonhosted.org/packages/49/a5/bb9ded8326e9f0cdfdc412eeda1054b914dfea952bda2097d174f8832cc0/scipy-1.13.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:eccfa1906eacc02de42d70ef4aecea45415f5be17e72b61bafcfd329bdc52e94", size = 38820728 }, + { url = "https://files.pythonhosted.org/packages/12/30/df7a8fcc08f9b4a83f5f27cfaaa7d43f9a2d2ad0b6562cced433e5b04e31/scipy-1.13.1-cp310-cp310-win_amd64.whl", hash = "sha256:2831f0dc9c5ea9edd6e51e6e769b655f08ec6db6e2e10f86ef39bd32eb11da54", size = 46210588 }, + { url = "https://files.pythonhosted.org/packages/b4/15/4a4bb1b15bbd2cd2786c4f46e76b871b28799b67891f23f455323a0cdcfb/scipy-1.13.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:27e52b09c0d3a1d5b63e1105f24177e544a222b43611aaf5bc44d4a0979e32f9", size = 39333805 }, + { url = "https://files.pythonhosted.org/packages/ba/92/42476de1af309c27710004f5cdebc27bec62c204db42e05b23a302cb0c9a/scipy-1.13.1-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:54f430b00f0133e2224c3ba42b805bfd0086fe488835effa33fa291561932326", size = 30317687 }, + { url = "https://files.pythonhosted.org/packages/80/ba/8be64fe225360a4beb6840f3cbee494c107c0887f33350d0a47d55400b01/scipy-1.13.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e89369d27f9e7b0884ae559a3a956e77c02114cc60a6058b4e5011572eea9299", size = 33694638 }, + { url = "https://files.pythonhosted.org/packages/36/07/035d22ff9795129c5a847c64cb43c1fa9188826b59344fee28a3ab02e283/scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a78b4b3345f1b6f68a763c6e25c0c9a23a9fd0f39f5f3d200efe8feda560a5fa", size = 38569931 }, + { url = "https://files.pythonhosted.org/packages/d9/10/f9b43de37e5ed91facc0cfff31d45ed0104f359e4f9a68416cbf4e790241/scipy-1.13.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:45484bee6d65633752c490404513b9ef02475b4284c4cfab0ef946def50b3f59", size = 38838145 }, + { url = "https://files.pythonhosted.org/packages/4a/48/4513a1a5623a23e95f94abd675ed91cfb19989c58e9f6f7d03990f6caf3d/scipy-1.13.1-cp311-cp311-win_amd64.whl", hash = "sha256:5713f62f781eebd8d597eb3f88b8bf9274e79eeabf63afb4a737abc6c84ad37b", size = 46196227 }, + { url = 
"https://files.pythonhosted.org/packages/f2/7b/fb6b46fbee30fc7051913068758414f2721003a89dd9a707ad49174e3843/scipy-1.13.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:5d72782f39716b2b3509cd7c33cdc08c96f2f4d2b06d51e52fb45a19ca0c86a1", size = 39357301 }, + { url = "https://files.pythonhosted.org/packages/dc/5a/2043a3bde1443d94014aaa41e0b50c39d046dda8360abd3b2a1d3f79907d/scipy-1.13.1-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:017367484ce5498445aade74b1d5ab377acdc65e27095155e448c88497755a5d", size = 30363348 }, + { url = "https://files.pythonhosted.org/packages/e7/cb/26e4a47364bbfdb3b7fb3363be6d8a1c543bcd70a7753ab397350f5f189a/scipy-1.13.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:949ae67db5fa78a86e8fa644b9a6b07252f449dcf74247108c50e1d20d2b4627", size = 33406062 }, + { url = "https://files.pythonhosted.org/packages/88/ab/6ecdc526d509d33814835447bbbeedbebdec7cca46ef495a61b00a35b4bf/scipy-1.13.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:de3ade0e53bc1f21358aa74ff4830235d716211d7d077e340c7349bc3542e884", size = 38218311 }, + { url = "https://files.pythonhosted.org/packages/0b/00/9f54554f0f8318100a71515122d8f4f503b1a2c4b4cfab3b4b68c0eb08fa/scipy-1.13.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:2ac65fb503dad64218c228e2dc2d0a0193f7904747db43014645ae139c8fad16", size = 38442493 }, + { url = "https://files.pythonhosted.org/packages/3e/df/963384e90733e08eac978cd103c34df181d1fec424de383cdc443f418dd4/scipy-1.13.1-cp312-cp312-win_amd64.whl", hash = "sha256:cdd7dacfb95fea358916410ec61bbc20440f7860333aee6d882bb8046264e949", size = 45910955 }, + { url = "https://files.pythonhosted.org/packages/7f/29/c2ea58c9731b9ecb30b6738113a95d147e83922986b34c685b8f6eefde21/scipy-1.13.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:436bbb42a94a8aeef855d755ce5a465479c721e9d684de76bf61a62e7c2b81d5", size = 39352927 }, + { url = "https://files.pythonhosted.org/packages/5c/c0/e71b94b20ccf9effb38d7147c0064c08c622309fd487b1b677771a97d18c/scipy-1.13.1-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:8335549ebbca860c52bf3d02f80784e91a004b71b059e3eea9678ba994796a24", size = 30324538 }, + { url = "https://files.pythonhosted.org/packages/6d/0f/aaa55b06d474817cea311e7b10aab2ea1fd5d43bc6a2861ccc9caec9f418/scipy-1.13.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d533654b7d221a6a97304ab63c41c96473ff04459e404b83275b60aa8f4b7004", size = 33732190 }, + { url = "https://files.pythonhosted.org/packages/35/f5/d0ad1a96f80962ba65e2ce1de6a1e59edecd1f0a7b55990ed208848012e0/scipy-1.13.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:637e98dcf185ba7f8e663e122ebf908c4702420477ae52a04f9908707456ba4d", size = 38612244 }, + { url = "https://files.pythonhosted.org/packages/8d/02/1165905f14962174e6569076bcc3315809ae1291ed14de6448cc151eedfd/scipy-1.13.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:a014c2b3697bde71724244f63de2476925596c24285c7a637364761f8710891c", size = 38845637 }, + { url = "https://files.pythonhosted.org/packages/3e/77/dab54fe647a08ee4253963bcd8f9cf17509c8ca64d6335141422fe2e2114/scipy-1.13.1-cp39-cp39-win_amd64.whl", hash = "sha256:392e4ec766654852c25ebad4f64e4e584cf19820b980bc04960bca0b0cd6eaa2", size = 46227440 }, +] + +[[package]] +name = "scipy" +version = "1.15.1" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version >= '3.13'", + "python_full_version == '3.12.*'", + "python_full_version == '3.11.*'", + "python_full_version == 
'3.10.*'", +] +dependencies = [ + { name = "numpy", version = "2.2.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/76/c6/8eb0654ba0c7d0bb1bf67bf8fbace101a8e4f250f7722371105e8b6f68fc/scipy-1.15.1.tar.gz", hash = "sha256:033a75ddad1463970c96a88063a1df87ccfddd526437136b6ee81ff0312ebdf6", size = 59407493 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/86/53/b204ce5a4433f1864001b9d16f103b9c25f5002a602ae83585d0ea5f9c4a/scipy-1.15.1-cp310-cp310-macosx_10_13_x86_64.whl", hash = "sha256:c64ded12dcab08afff9e805a67ff4480f5e69993310e093434b10e85dc9d43e1", size = 41414518 }, + { url = "https://files.pythonhosted.org/packages/c7/fc/54ffa7a8847f7f303197a6ba65a66104724beba2e38f328135a78f0dc480/scipy-1.15.1-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:5b190b935e7db569960b48840e5bef71dc513314cc4e79a1b7d14664f57fd4ff", size = 32519265 }, + { url = "https://files.pythonhosted.org/packages/f1/77/a98b8ba03d6f371dc31a38719affd53426d4665729dcffbed4afe296784a/scipy-1.15.1-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:4b17d4220df99bacb63065c76b0d1126d82bbf00167d1730019d2a30d6ae01ea", size = 24792859 }, + { url = "https://files.pythonhosted.org/packages/a7/78/70bb9f0df7444b18b108580934bfef774822e28fd34a68e5c263c7d2828a/scipy-1.15.1-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:63b9b6cd0333d0eb1a49de6f834e8aeaefe438df8f6372352084535ad095219e", size = 27886506 }, + { url = "https://files.pythonhosted.org/packages/14/a7/f40f6033e06de4176ddd6cc8c3ae9f10a226c3bca5d6b4ab883bc9914a14/scipy-1.15.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9f151e9fb60fbf8e52426132f473221a49362091ce7a5e72f8aa41f8e0da4f25", size = 38375041 }, + { url = "https://files.pythonhosted.org/packages/17/03/390a1c5c61fd76b0fa4b3c5aa3bdd7e60f6c46f712924f1a9df5705ec046/scipy-1.15.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:21e10b1dd56ce92fba3e786007322542361984f8463c6d37f6f25935a5a6ef52", size = 40597556 }, + { url = "https://files.pythonhosted.org/packages/4e/70/fa95b3ae026b97eeca58204a90868802e5155ac71b9d7bdee92b68115dd3/scipy-1.15.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:5dff14e75cdbcf07cdaa1c7707db6017d130f0af9ac41f6ce443a93318d6c6e0", size = 42938505 }, + { url = "https://files.pythonhosted.org/packages/d6/07/427859116bdd71847c898180f01802691f203c3e2455a1eb496130ff07c5/scipy-1.15.1-cp310-cp310-win_amd64.whl", hash = "sha256:f82fcf4e5b377f819542fbc8541f7b5fbcf1c0017d0df0bc22c781bf60abc4d8", size = 43909663 }, + { url = "https://files.pythonhosted.org/packages/8e/2e/7b71312da9c2dabff53e7c9a9d08231bc34d9d8fdabe88a6f1155b44591c/scipy-1.15.1-cp311-cp311-macosx_10_13_x86_64.whl", hash = "sha256:5bd8d27d44e2c13d0c1124e6a556454f52cd3f704742985f6b09e75e163d20d2", size = 41424362 }, + { url = "https://files.pythonhosted.org/packages/81/8c/ab85f1aa1cc200c796532a385b6ebf6a81089747adc1da7482a062acc46c/scipy-1.15.1-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:be3deeb32844c27599347faa077b359584ba96664c5c79d71a354b80a0ad0ce0", size = 32535910 }, + { url = "https://files.pythonhosted.org/packages/3b/9c/6f4b787058daa8d8da21ddff881b4320e28de4704a65ec147adb50cb2230/scipy-1.15.1-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:5eb0ca35d4b08e95da99a9f9c400dc9f6c21c424298a0ba876fdc69c7afacedf", size = 24809398 }, + { url = 
"https://files.pythonhosted.org/packages/16/2b/949460a796df75fc7a1ee1becea202cf072edbe325ebe29f6d2029947aa7/scipy-1.15.1-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:74bb864ff7640dea310a1377d8567dc2cb7599c26a79ca852fc184cc851954ac", size = 27918045 }, + { url = "https://files.pythonhosted.org/packages/5f/36/67fe249dd7ccfcd2a38b25a640e3af7e59d9169c802478b6035ba91dfd6d/scipy-1.15.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:667f950bf8b7c3a23b4199db24cb9bf7512e27e86d0e3813f015b74ec2c6e3df", size = 38332074 }, + { url = "https://files.pythonhosted.org/packages/fc/da/452e1119e6f720df3feb588cce3c42c5e3d628d4bfd4aec097bd30b7de0c/scipy-1.15.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:395be70220d1189756068b3173853029a013d8c8dd5fd3d1361d505b2aa58fa7", size = 40588469 }, + { url = "https://files.pythonhosted.org/packages/7f/71/5f94aceeac99a4941478af94fe9f459c6752d497035b6b0761a700f5f9ff/scipy-1.15.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:ce3a000cd28b4430426db2ca44d96636f701ed12e2b3ca1f2b1dd7abdd84b39a", size = 42965214 }, + { url = "https://files.pythonhosted.org/packages/af/25/caa430865749d504271757cafd24066d596217e83326155993980bc22f97/scipy-1.15.1-cp311-cp311-win_amd64.whl", hash = "sha256:3fe1d95944f9cf6ba77aa28b82dd6bb2a5b52f2026beb39ecf05304b8392864b", size = 43896034 }, + { url = "https://files.pythonhosted.org/packages/d8/6e/a9c42d0d39e09ed7fd203d0ac17adfea759cba61ab457671fe66e523dbec/scipy-1.15.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:c09aa9d90f3500ea4c9b393ee96f96b0ccb27f2f350d09a47f533293c78ea776", size = 41478318 }, + { url = "https://files.pythonhosted.org/packages/04/ee/e3e535c81828618878a7433992fecc92fa4df79393f31a8fea1d05615091/scipy-1.15.1-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:0ac102ce99934b162914b1e4a6b94ca7da0f4058b6d6fd65b0cef330c0f3346f", size = 32596696 }, + { url = "https://files.pythonhosted.org/packages/c4/5e/b1b0124be8e76f87115f16b8915003eec4b7060298117715baf13f51942c/scipy-1.15.1-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:09c52320c42d7f5c7748b69e9f0389266fd4f82cf34c38485c14ee976cb8cb04", size = 24870366 }, + { url = "https://files.pythonhosted.org/packages/14/36/c00cb73eefda85946172c27913ab995c6ad4eee00fa4f007572e8c50cd51/scipy-1.15.1-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:cdde8414154054763b42b74fe8ce89d7f3d17a7ac5dd77204f0e142cdc9239e9", size = 28007461 }, + { url = "https://files.pythonhosted.org/packages/68/94/aff5c51b3799349a9d1e67a056772a0f8a47db371e83b498d43467806557/scipy-1.15.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4c9d8fc81d6a3b6844235e6fd175ee1d4c060163905a2becce8e74cb0d7554ce", size = 38068174 }, + { url = "https://files.pythonhosted.org/packages/b0/3c/0de11ca154e24a57b579fb648151d901326d3102115bc4f9a7a86526ce54/scipy-1.15.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0fb57b30f0017d4afa5fe5f5b150b8f807618819287c21cbe51130de7ccdaed2", size = 40249869 }, + { url = "https://files.pythonhosted.org/packages/15/09/472e8d0a6b33199d1bb95e49bedcabc0976c3724edd9b0ef7602ccacf41e/scipy-1.15.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:491d57fe89927fa1aafbe260f4cfa5ffa20ab9f1435025045a5315006a91b8f5", size = 42629068 }, + { url = "https://files.pythonhosted.org/packages/ff/ba/31c7a8131152822b3a2cdeba76398ffb404d81d640de98287d236da90c49/scipy-1.15.1-cp312-cp312-win_amd64.whl", hash = "sha256:900f3fa3db87257510f011c292a5779eb627043dd89731b9c461cd16ef76ab3d", 
size = 43621992 }, + { url = "https://files.pythonhosted.org/packages/2b/bf/dd68965a4c5138a630eeed0baec9ae96e5d598887835bdde96cdd2fe4780/scipy-1.15.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:100193bb72fbff37dbd0bf14322314fc7cbe08b7ff3137f11a34d06dc0ee6b85", size = 41441136 }, + { url = "https://files.pythonhosted.org/packages/ef/5e/4928581312922d7e4d416d74c416a660addec4dd5ea185401df2269ba5a0/scipy-1.15.1-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:2114a08daec64980e4b4cbdf5bee90935af66d750146b1d2feb0d3ac30613692", size = 32533699 }, + { url = "https://files.pythonhosted.org/packages/32/90/03f99c43041852837686898c66767787cd41c5843d7a1509c39ffef683e9/scipy-1.15.1-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:6b3e71893c6687fc5e29208d518900c24ea372a862854c9888368c0b267387ab", size = 24807289 }, + { url = "https://files.pythonhosted.org/packages/9d/52/bfe82b42ae112eaba1af2f3e556275b8727d55ac6e4932e7aef337a9d9d4/scipy-1.15.1-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:837299eec3d19b7e042923448d17d95a86e43941104d33f00da7e31a0f715d3c", size = 27929844 }, + { url = "https://files.pythonhosted.org/packages/f6/77/54ff610bad600462c313326acdb035783accc6a3d5f566d22757ad297564/scipy-1.15.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:82add84e8a9fb12af5c2c1a3a3f1cb51849d27a580cb9e6bd66226195142be6e", size = 38031272 }, + { url = "https://files.pythonhosted.org/packages/f1/26/98585cbf04c7cf503d7eb0a1966df8a268154b5d923c5fe0c1ed13154c49/scipy-1.15.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:070d10654f0cb6abd295bc96c12656f948e623ec5f9a4eab0ddb1466c000716e", size = 40210217 }, + { url = "https://files.pythonhosted.org/packages/fd/3f/3d2285eb6fece8bc5dbb2f9f94d61157d61d155e854fd5fea825b8218f12/scipy-1.15.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:55cc79ce4085c702ac31e49b1e69b27ef41111f22beafb9b49fea67142b696c4", size = 42587785 }, + { url = "https://files.pythonhosted.org/packages/48/7d/5b5251984bf0160d6533695a74a5fddb1fa36edd6f26ffa8c871fbd4782a/scipy-1.15.1-cp313-cp313-win_amd64.whl", hash = "sha256:c352c1b6d7cac452534517e022f8f7b8d139cd9f27e6fbd9f3cbd0bfd39f5bef", size = 43640439 }, + { url = "https://files.pythonhosted.org/packages/e7/b8/0e092f592d280496de52e152582030f8a270b194f87f890e1a97c5599b81/scipy-1.15.1-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:0458839c9f873062db69a03de9a9765ae2e694352c76a16be44f93ea45c28d2b", size = 41619862 }, + { url = "https://files.pythonhosted.org/packages/f6/19/0b6e1173aba4db9e0b7aa27fe45019857fb90d6904038b83927cbe0a6c1d/scipy-1.15.1-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:af0b61c1de46d0565b4b39c6417373304c1d4f5220004058bdad3061c9fa8a95", size = 32610387 }, + { url = "https://files.pythonhosted.org/packages/e7/02/754aae3bd1fa0f2479ade3cfdf1732ecd6b05853f63eee6066a32684563a/scipy-1.15.1-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:71ba9a76c2390eca6e359be81a3e879614af3a71dfdabb96d1d7ab33da6f2364", size = 24883814 }, + { url = "https://files.pythonhosted.org/packages/1f/ac/d7906201604a2ea3b143bb0de51b3966f66441ba50b7dc182c4505b3edf9/scipy-1.15.1-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:14eaa373c89eaf553be73c3affb11ec6c37493b7eaaf31cf9ac5dffae700c2e0", size = 27944865 }, + { url = "https://files.pythonhosted.org/packages/84/9d/8f539002b5e203723af6a6f513a45e0a7671e9dabeedb08f417ac17e4edc/scipy-1.15.1-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:f735bc41bd1c792c96bc426dece66c8723283695f02df61dcc4d0a707a42fc54", size = 39883261 }, + { url = "https://files.pythonhosted.org/packages/97/c0/62fd3bab828bcccc9b864c5997645a3b86372a35941cdaf677565c25c98d/scipy-1.15.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:2722a021a7929d21168830790202a75dbb20b468a8133c74a2c0230c72626b6c", size = 42093299 }, + { url = "https://files.pythonhosted.org/packages/e4/1f/5d46a8d94e9f6d2c913cbb109e57e7eed914de38ea99e2c4d69a9fc93140/scipy-1.15.1-cp313-cp313t-win_amd64.whl", hash = "sha256:bc7136626261ac1ed988dca56cfc4ab5180f75e0ee52e58f1e6aa74b5f3eacd5", size = 43181730 }, +] + +[[package]] +name = "sentencepiece" +version = "0.2.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/c9/d2/b9c7ca067c26d8ff085d252c89b5f69609ca93fb85a00ede95f4857865d4/sentencepiece-0.2.0.tar.gz", hash = "sha256:a52c19171daaf2e697dc6cbe67684e0fa341b1248966f6aebb541de654d15843", size = 2632106 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f6/71/98648c3b64b23edb5403f74bcc906ad21766872a6e1ada26ea3f1eb941ab/sentencepiece-0.2.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:188779e1298a1c8b8253c7d3ad729cb0a9891e5cef5e5d07ce4592c54869e227", size = 2408979 }, + { url = "https://files.pythonhosted.org/packages/77/9f/7efbaa6d4c0c718a9affbecc536b03ca62f99f421bdffb531c16030e2d2b/sentencepiece-0.2.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:bed9cf85b296fa2b76fc2547b9cbb691a523864cebaee86304c43a7b4cb1b452", size = 1238845 }, + { url = "https://files.pythonhosted.org/packages/1c/e4/c2541027a43ec6962ba9b601805d17ba3f86b38bdeae0e8ac65a2981e248/sentencepiece-0.2.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d7b67e724bead13f18db6e1d10b6bbdc454af574d70efbb36f27d90387be1ca3", size = 1181472 }, + { url = "https://files.pythonhosted.org/packages/fd/46/316c1ba6c52b97de76aff7b9da678f7afbb52136afb2987c474d95630e65/sentencepiece-0.2.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2fde4b08cfe237be4484c6c7c2e2c75fb862cfeab6bd5449ce4caeafd97b767a", size = 1259151 }, + { url = "https://files.pythonhosted.org/packages/aa/5a/3c48738a0835d76dd06c62b6ac48d39c923cde78dd0f587353bdcbb99851/sentencepiece-0.2.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4c378492056202d1c48a4979650981635fd97875a00eabb1f00c6a236b013b5e", size = 1355931 }, + { url = "https://files.pythonhosted.org/packages/a6/27/33019685023221ca8ed98e8ceb7ae5e166032686fa3662c68f1f1edf334e/sentencepiece-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1380ce6540a368de2ef6d7e6ba14ba8f3258df650d39ba7d833b79ee68a52040", size = 1301537 }, + { url = "https://files.pythonhosted.org/packages/ca/e4/55f97cef14293171fef5f96e96999919ab5b4d1ce95b53547ad653d7e3bf/sentencepiece-0.2.0-cp310-cp310-win32.whl", hash = "sha256:a1151d6a6dd4b43e552394aed0edfe9292820272f0194bd56c7c1660a0c06c3d", size = 936747 }, + { url = "https://files.pythonhosted.org/packages/85/f4/4ef1a6e0e9dbd8a60780a91df8b7452ada14cfaa0e17b3b8dfa42cecae18/sentencepiece-0.2.0-cp310-cp310-win_amd64.whl", hash = "sha256:d490142b0521ef22bc1085f061d922a2a6666175bb6b42e588ff95c0db6819b2", size = 991525 }, + { url = "https://files.pythonhosted.org/packages/32/43/8f8885168a47a02eba1455bd3f4f169f50ad5b8cebd2402d0f5e20854d04/sentencepiece-0.2.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:17982700c4f6dbb55fa3594f3d7e5dd1c8659a274af3738e33c987d2a27c9d5c", size = 2409036 }, + { url = 
"https://files.pythonhosted.org/packages/0f/35/e63ba28062af0a3d688a9f128e407a1a2608544b2f480cb49bf7f4b1cbb9/sentencepiece-0.2.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:7c867012c0e8bcd5bdad0f791609101cb5c66acb303ab3270218d6debc68a65e", size = 1238921 }, + { url = "https://files.pythonhosted.org/packages/de/42/ae30952c4a0bd773e90c9bf2579f5533037c886dfc8ec68133d5694f4dd2/sentencepiece-0.2.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7fd6071249c74f779c5b27183295b9202f8dedb68034e716784364443879eaa6", size = 1181477 }, + { url = "https://files.pythonhosted.org/packages/e3/ac/2f2ab1d60bb2d795d054eebe5e3f24b164bc21b5a9b75fba7968b3b91b5a/sentencepiece-0.2.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:27f90c55a65013cbb8f4d7aab0599bf925cde4adc67ae43a0d323677b5a1c6cb", size = 1259182 }, + { url = "https://files.pythonhosted.org/packages/45/fb/14633c6ecf262c468759ffcdb55c3a7ee38fe4eda6a70d75ee7c7d63c58b/sentencepiece-0.2.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b293734059ef656dcd65be62ff771507bea8fed0a711b6733976e1ed3add4553", size = 1355537 }, + { url = "https://files.pythonhosted.org/packages/fb/12/2f5c8d4764b00033cf1c935b702d3bb878d10be9f0b87f0253495832d85f/sentencepiece-0.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e58b47f933aca74c6a60a79dcb21d5b9e47416256c795c2d58d55cec27f9551d", size = 1301464 }, + { url = "https://files.pythonhosted.org/packages/4e/b1/67afc0bde24f6dcb3acdea0dd8dcdf4b8b0db240f6bacd39378bd32d09f8/sentencepiece-0.2.0-cp311-cp311-win32.whl", hash = "sha256:c581258cf346b327c62c4f1cebd32691826306f6a41d8c4bec43b010dee08e75", size = 936749 }, + { url = "https://files.pythonhosted.org/packages/a2/f6/587c62fd21fc988555b85351f50bbde43a51524caafd63bc69240ded14fd/sentencepiece-0.2.0-cp311-cp311-win_amd64.whl", hash = "sha256:0993dbc665f4113017892f1b87c3904a44d0640eda510abcacdfb07f74286d36", size = 991520 }, + { url = "https://files.pythonhosted.org/packages/27/5a/141b227ed54293360a9ffbb7bf8252b4e5efc0400cdeac5809340e5d2b21/sentencepiece-0.2.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:ea5f536e32ea8ec96086ee00d7a4a131ce583a1b18d130711707c10e69601cb2", size = 2409370 }, + { url = "https://files.pythonhosted.org/packages/2e/08/a4c135ad6fc2ce26798d14ab72790d66e813efc9589fd30a5316a88ca8d5/sentencepiece-0.2.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:d0cb51f53b6aae3c36bafe41e86167c71af8370a039f542c43b0cce5ef24a68c", size = 1239288 }, + { url = "https://files.pythonhosted.org/packages/49/0a/2fe387f825ac5aad5a0bfe221904882106cac58e1b693ba7818785a882b6/sentencepiece-0.2.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:3212121805afc58d8b00ab4e7dd1f8f76c203ddb9dc94aa4079618a31cf5da0f", size = 1181597 }, + { url = "https://files.pythonhosted.org/packages/cc/38/e4698ee2293fe4835dc033c49796a39b3eebd8752098f6bd0aa53a14af1f/sentencepiece-0.2.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2a3149e3066c2a75e0d68a43eb632d7ae728c7925b517f4c05c40f6f7280ce08", size = 1259220 }, + { url = "https://files.pythonhosted.org/packages/12/24/fd7ef967c9dad2f6e6e5386d0cadaf65cda8b7be6e3861a9ab3121035139/sentencepiece-0.2.0-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:632f3594d3e7ac8b367bca204cb3fd05a01d5b21455acd097ea4c0e30e2f63d7", size = 1355962 }, + { url = 
"https://files.pythonhosted.org/packages/4f/d2/18246f43ca730bb81918f87b7e886531eda32d835811ad9f4657c54eee35/sentencepiece-0.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f295105c6bdbb05bd5e1b0cafbd78ff95036f5d3641e7949455a3f4e5e7c3109", size = 1301706 }, + { url = "https://files.pythonhosted.org/packages/8a/47/ca237b562f420044ab56ddb4c278672f7e8c866e183730a20e413b38a989/sentencepiece-0.2.0-cp312-cp312-win32.whl", hash = "sha256:fb89f811e5efd18bab141afc3fea3de141c3f69f3fe9e898f710ae7fe3aab251", size = 936941 }, + { url = "https://files.pythonhosted.org/packages/c6/97/d159c32642306ee2b70732077632895438867b3b6df282354bd550cf2a67/sentencepiece-0.2.0-cp312-cp312-win_amd64.whl", hash = "sha256:7a673a72aab81fef5ebe755c6e0cc60087d1f3a4700835d40537183c1703a45f", size = 991994 }, + { url = "https://files.pythonhosted.org/packages/e9/18/eb620d94d63f62ca69cecccf4459529864ac3fbb35ec123190bd58dadb46/sentencepiece-0.2.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:1e0f9c4d0a6b0af59b613175f019916e28ade076e21242fd5be24340d8a2f64a", size = 2409003 }, + { url = "https://files.pythonhosted.org/packages/6e/a6/df28bc0b6a2a86416232c0a5f0d69a9cb7244bb95cb5dcdfcbf01cced8a6/sentencepiece-0.2.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:298f21cc1366eb60311aedba3169d30f885c363ddbf44214b0a587d2908141ad", size = 1238898 }, + { url = "https://files.pythonhosted.org/packages/79/91/b54a528e0789cd7986341ed3909bec56365c3b672daef8b10aa4098238f0/sentencepiece-0.2.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3f1ec95aa1e5dab11f37ac7eff190493fd87770f7a8b81ebc9dd768d1a3c8704", size = 1181534 }, + { url = "https://files.pythonhosted.org/packages/a3/69/e96ef68261fa5b82379fdedb325ceaf1d353c6e839ec346d8244e0da5f2f/sentencepiece-0.2.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7b06b70af54daa4b4904cbb90b4eb6d35c9f3252fdc86c9c32d5afd4d30118d8", size = 1259161 }, + { url = "https://files.pythonhosted.org/packages/45/de/461d15856c29ba1ce778cf76e0462572661f647abc8a5373690c52e98a00/sentencepiece-0.2.0-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:22e37bac44dd6603388cb598c64ff7a76e41ca774646f21c23aadfbf5a2228ab", size = 1355945 }, + { url = "https://files.pythonhosted.org/packages/5f/01/c95e42eb86282b2c79305d3e0b0ca5a743f85a61262bb7130999c70b9374/sentencepiece-0.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0461324897735512a32d222e3d886e24ad6a499761952b6bda2a9ee6e4313ea5", size = 1301596 }, + { url = "https://files.pythonhosted.org/packages/be/47/e16f368fe6327e873e8029aa539115025e9f61a4e8ca8f0f8eaf8e6a4c1c/sentencepiece-0.2.0-cp39-cp39-win32.whl", hash = "sha256:38aed822fb76435fa1f12185f10465a94ab9e51d5e8a9159e9a540ce926f0ffd", size = 936757 }, + { url = "https://files.pythonhosted.org/packages/4b/36/497e6407700efd6b97f81bc160913a70d33b9b09227429f68fc86f387bbe/sentencepiece-0.2.0-cp39-cp39-win_amd64.whl", hash = "sha256:d8cf876516548b5a1d6ac4745d8b554f5c07891d55da557925e5c13ff0b4e6ad", size = 991541 }, +] + +[[package]] +name = "setuptools" +version = "75.8.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/92/ec/089608b791d210aec4e7f97488e67ab0d33add3efccb83a056cbafe3a2a6/setuptools-75.8.0.tar.gz", hash = "sha256:c5afc8f407c626b8313a86e10311dd3f661c6cd9c09d4bf8c15c0e11f9f2b0e6", size = 1343222 } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/69/8a/b9dc7678803429e4a3bc9ba462fa3dd9066824d3c607490235c6a796be5a/setuptools-75.8.0-py3-none-any.whl", hash = "sha256:e3982f444617239225d675215d51f6ba05f845d4eec313da4418fdbb56fb27e3", size = 1228782 }, +] + +[[package]] +name = "shellingham" +version = "1.5.4" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/58/15/8b3609fd3830ef7b27b655beb4b4e9c62313a4e8da8c676e142cc210d58e/shellingham-1.5.4.tar.gz", hash = "sha256:8dbca0739d487e5bd35ab3ca4b36e11c4078f3a234bfce294b0a0291363404de", size = 10310 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e0/f9/0595336914c5619e5f28a1fb793285925a8cd4b432c9da0a987836c7f822/shellingham-1.5.4-py2.py3-none-any.whl", hash = "sha256:7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686", size = 9755 }, +] + +[[package]] +name = "six" +version = "1.17.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/94/e7/b2c673351809dca68a0e064b6af791aa332cf192da575fd474ed7d6f16a2/six-1.17.0.tar.gz", hash = "sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81", size = 34031 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b7/ce/149a00dd41f10bc29e5921b496af8b574d8413afcd5e30dfa0ed46c2cc5e/six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274", size = 11050 }, +] + +[[package]] +name = "sympy" +version = "1.13.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "mpmath" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ca/99/5a5b6f19ff9f083671ddf7b9632028436167cd3d33e11015754e41b249a4/sympy-1.13.1.tar.gz", hash = "sha256:9cebf7e04ff162015ce31c9c6c9144daa34a93bd082f54fd8f12deca4f47515f", size = 7533040 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b2/fe/81695a1aa331a842b582453b605175f419fe8540355886031328089d840a/sympy-1.13.1-py3-none-any.whl", hash = "sha256:db36cdc64bf61b9b24578b6f7bab1ecdd2452cf008f34faa33776680c26d66f8", size = 6189177 }, +] + +[[package]] +name = "text-generation-server" +version = "2.0.5.dev0" +source = { virtual = "." 
} +dependencies = [ + { name = "einops" }, + { name = "grpc-interceptor" }, + { name = "grpcio" }, + { name = "grpcio-reflection" }, + { name = "grpcio-status" }, + { name = "hf-transfer" }, + { name = "loguru" }, + { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, + { name = "numpy", version = "2.2.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "opentelemetry-api" }, + { name = "opentelemetry-exporter-otlp" }, + { name = "opentelemetry-instrumentation-grpc" }, + { name = "pillow" }, + { name = "prometheus-client" }, + { name = "protobuf" }, + { name = "py-cpuinfo" }, + { name = "rich" }, + { name = "safetensors" }, + { name = "scipy", version = "1.13.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, + { name = "scipy", version = "1.15.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "sentencepiece" }, + { name = "tokenizers" }, + { name = "typer" }, +] + +[package.optional-dependencies] +accelerate = [ + { name = "accelerate" }, +] +attention = [ + { name = "attention-kernels", version = "0.1.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.13'" }, + { name = "attention-kernels", version = "0.1.1", source = { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp310-cp310-linux_x86_64.whl" }, marker = "python_full_version == '3.10.*'" }, + { name = "attention-kernels", version = "0.1.1", source = { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp311-cp311-linux_x86_64.whl" }, marker = "python_full_version == '3.11.*'" }, + { name = "attention-kernels", version = "0.1.1", source = { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp312-cp312-linux_x86_64.whl" }, marker = "python_full_version == '3.12.*'" }, + { name = "attention-kernels", version = "0.1.1", source = { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp39-cp39-linux_x86_64.whl" }, marker = "python_full_version < '3.10'" }, +] +bnb = [ + { name = "bitsandbytes" }, +] +compressed-tensors = [ + { name = "compressed-tensors" }, +] +dev = [ + { name = "grpcio-tools" }, + { name = "pytest" }, +] +gen = [ + { name = "grpcio-tools" }, + { name = "mypy-protobuf" }, +] +marlin = [ + { name = "marlin-kernels", version = "0.3.7", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.13'" }, + { name = "marlin-kernels", version = "0.3.7", source = { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp310-cp310-linux_x86_64.whl" }, marker = "python_full_version == '3.10.*'" }, + { name = "marlin-kernels", version = "0.3.7", source = { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp311-cp311-linux_x86_64.whl" }, marker = "python_full_version == '3.11.*'" }, + { name = "marlin-kernels", version = "0.3.7", source = { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp312-cp312-linux_x86_64.whl" }, marker = "python_full_version == '3.12.*'" }, + { name = "marlin-kernels", 
version = "0.3.7", source = { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp39-cp39-linux_x86_64.whl" }, marker = "python_full_version < '3.10'" }, +] +moe = [ + { name = "moe-kernels", version = "0.7.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.13'" }, + { name = "moe-kernels", version = "0.7.0", source = { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp310-cp310-linux_x86_64.whl" }, marker = "python_full_version == '3.10.*'" }, + { name = "moe-kernels", version = "0.7.0", source = { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp311-cp311-linux_x86_64.whl" }, marker = "python_full_version == '3.11.*'" }, + { name = "moe-kernels", version = "0.7.0", source = { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp312-cp312-linux_x86_64.whl" }, marker = "python_full_version == '3.12.*'" }, + { name = "moe-kernels", version = "0.7.0", source = { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp39-cp39-linux_x86_64.whl" }, marker = "python_full_version < '3.10'" }, +] +outlines = [ + { name = "outlines" }, +] +peft = [ + { name = "peft" }, +] +quantize = [ + { name = "datasets" }, + { name = "texttable" }, +] + +[package.metadata] +requires-dist = [ + { name = "accelerate", marker = "extra == 'accelerate'", specifier = ">=1.2.1,<2" }, + { name = "attention-kernels", marker = "python_full_version == '3.9.*' and extra == 'attention'", url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp39-cp39-linux_x86_64.whl" }, + { name = "attention-kernels", marker = "(python_full_version < '3.9' and extra == 'attention') or (python_full_version >= '3.13' and extra == 'attention')" }, + { name = "attention-kernels", marker = "python_full_version == '3.10.*' and extra == 'attention'", url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp310-cp310-linux_x86_64.whl" }, + { name = "attention-kernels", marker = "python_full_version == '3.11.*' and extra == 'attention'", url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp311-cp311-linux_x86_64.whl" }, + { name = "attention-kernels", marker = "python_full_version == '3.12.*' and extra == 'attention'", url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp312-cp312-linux_x86_64.whl" }, + { name = "bitsandbytes", marker = "extra == 'bnb'", specifier = ">=0.45.0" }, + { name = "compressed-tensors", marker = "extra == 'compressed-tensors'", specifier = ">=0.9.0" }, + { name = "datasets", marker = "extra == 'quantize'", specifier = ">=2.21,<3" }, + { name = "einops", specifier = ">=0.8.0" }, + { name = "grpc-interceptor", specifier = ">=0.15.4" }, + { name = "grpcio", specifier = ">=1.69.0" }, + { name = "grpcio-reflection", specifier = ">=1.69.0" }, + { name = "grpcio-status", specifier = ">=1.69.0" }, + { name = "grpcio-tools", marker = "extra == 'dev'", specifier = ">=1.51.1,<2.0" }, + { name = "grpcio-tools", marker = "extra == 'gen'", specifier = ">=1.69.0" }, + { name = "hf-transfer", specifier = ">=0.1.9" }, + { name = "loguru", specifier = ">=0.7.3" }, + { name = 
"marlin-kernels", marker = "python_full_version == '3.9.*' and extra == 'marlin'", url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp39-cp39-linux_x86_64.whl" }, + { name = "marlin-kernels", marker = "(python_full_version < '3.9' and extra == 'marlin') or (python_full_version >= '3.13' and extra == 'marlin')" }, + { name = "marlin-kernels", marker = "python_full_version == '3.10.*' and extra == 'marlin'", url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp310-cp310-linux_x86_64.whl" }, + { name = "marlin-kernels", marker = "python_full_version == '3.11.*' and extra == 'marlin'", url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp311-cp311-linux_x86_64.whl" }, + { name = "marlin-kernels", marker = "python_full_version == '3.12.*' and extra == 'marlin'", url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp312-cp312-linux_x86_64.whl" }, + { name = "moe-kernels", marker = "python_full_version == '3.9.*' and extra == 'moe'", url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp39-cp39-linux_x86_64.whl" }, + { name = "moe-kernels", marker = "(python_full_version < '3.9' and extra == 'moe') or (python_full_version >= '3.13' and extra == 'moe')" }, + { name = "moe-kernels", marker = "python_full_version == '3.10.*' and extra == 'moe'", url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp310-cp310-linux_x86_64.whl" }, + { name = "moe-kernels", marker = "python_full_version == '3.11.*' and extra == 'moe'", url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp311-cp311-linux_x86_64.whl" }, + { name = "moe-kernels", marker = "python_full_version == '3.12.*' and extra == 'moe'", url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp312-cp312-linux_x86_64.whl" }, + { name = "mypy-protobuf", marker = "extra == 'gen'", specifier = ">=3.6.0" }, + { name = "numpy", specifier = ">=2.0.2" }, + { name = "opentelemetry-api", specifier = ">=1.29.0" }, + { name = "opentelemetry-exporter-otlp", specifier = ">=1.29.0" }, + { name = "opentelemetry-instrumentation-grpc", specifier = ">=0.50b0" }, + { name = "outlines", marker = "extra == 'outlines'", specifier = ">=0.1.13" }, + { name = "peft", marker = "extra == 'peft'", specifier = ">=0.14.0" }, + { name = "pillow", specifier = ">=11.1.0" }, + { name = "prometheus-client", specifier = ">=0.21.1" }, + { name = "protobuf", specifier = ">=5.29.3" }, + { name = "py-cpuinfo", specifier = ">=9.0.0" }, + { name = "pytest", marker = "extra == 'dev'", specifier = ">=7.3.0,<8" }, + { name = "rich", specifier = ">=13.9.4" }, + { name = "safetensors", specifier = ">=0.5.2" }, + { name = "scipy", specifier = ">=1.13.1" }, + { name = "sentencepiece", specifier = ">=0.2.0" }, + { name = "texttable", marker = "extra == 'quantize'", specifier = ">=1.6.7,<2" }, + { name = "tokenizers", specifier = ">=0.21.0" }, + { name = "typer", specifier = ">=0.15.1" }, +] + +[[package]] +name = "texttable" +version = "1.7.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/1c/dc/0aff23d6036a4d3bf4f1d8c8204c5c79c4437e25e0ae94ffe4bbb55ee3c2/texttable-1.7.0.tar.gz", hash = 
"sha256:2d2068fb55115807d3ac77a4ca68fa48803e84ebb0ee2340f858107a36522638", size = 12831 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/24/99/4772b8e00a136f3e01236de33b0efda31ee7077203ba5967fcc76da94d65/texttable-1.7.0-py2.py3-none-any.whl", hash = "sha256:72227d592c82b3d7f672731ae73e4d1f88cd8e2ef5b075a7a7f01a23a3743917", size = 10768 }, +] + +[[package]] +name = "tokenizers" +version = "0.21.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "huggingface-hub" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/20/41/c2be10975ca37f6ec40d7abd7e98a5213bb04f284b869c1a24e6504fd94d/tokenizers-0.21.0.tar.gz", hash = "sha256:ee0894bf311b75b0c03079f33859ae4b2334d675d4e93f5a4132e1eae2834fe4", size = 343021 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b0/5c/8b09607b37e996dc47e70d6a7b6f4bdd4e4d5ab22fe49d7374565c7fefaf/tokenizers-0.21.0-cp39-abi3-macosx_10_12_x86_64.whl", hash = "sha256:3c4c93eae637e7d2aaae3d376f06085164e1660f89304c0ab2b1d08a406636b2", size = 2647461 }, + { url = "https://files.pythonhosted.org/packages/22/7a/88e58bb297c22633ed1c9d16029316e5b5ac5ee44012164c2edede599a5e/tokenizers-0.21.0-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:f53ea537c925422a2e0e92a24cce96f6bc5046bbef24a1652a5edc8ba975f62e", size = 2563639 }, + { url = "https://files.pythonhosted.org/packages/f7/14/83429177c19364df27d22bc096d4c2e431e0ba43e56c525434f1f9b0fd00/tokenizers-0.21.0-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6b177fb54c4702ef611de0c069d9169f0004233890e0c4c5bd5508ae05abf193", size = 2903304 }, + { url = "https://files.pythonhosted.org/packages/7e/db/3433eab42347e0dc5452d8fcc8da03f638c9accffefe5a7c78146666964a/tokenizers-0.21.0-cp39-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:6b43779a269f4629bebb114e19c3fca0223296ae9fea8bb9a7a6c6fb0657ff8e", size = 2804378 }, + { url = "https://files.pythonhosted.org/packages/57/8b/7da5e6f89736c2ade02816b4733983fca1c226b0c42980b1ae9dc8fcf5cc/tokenizers-0.21.0-cp39-abi3-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9aeb255802be90acfd363626753fda0064a8df06031012fe7d52fd9a905eb00e", size = 3095488 }, + { url = "https://files.pythonhosted.org/packages/4d/f6/5ed6711093dc2c04a4e03f6461798b12669bc5a17c8be7cce1240e0b5ce8/tokenizers-0.21.0-cp39-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d8b09dbeb7a8d73ee204a70f94fc06ea0f17dcf0844f16102b9f414f0b7463ba", size = 3121410 }, + { url = "https://files.pythonhosted.org/packages/81/42/07600892d48950c5e80505b81411044a2d969368cdc0d929b1c847bf6697/tokenizers-0.21.0-cp39-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:400832c0904f77ce87c40f1a8a27493071282f785724ae62144324f171377273", size = 3388821 }, + { url = "https://files.pythonhosted.org/packages/22/06/69d7ce374747edaf1695a4f61b83570d91cc8bbfc51ccfecf76f56ab4aac/tokenizers-0.21.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e84ca973b3a96894d1707e189c14a774b701596d579ffc7e69debfc036a61a04", size = 3008868 }, + { url = "https://files.pythonhosted.org/packages/c8/69/54a0aee4d576045b49a0eb8bffdc495634309c823bf886042e6f46b80058/tokenizers-0.21.0-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:eb7202d231b273c34ec67767378cd04c767e967fda12d4a9e36208a34e2f137e", size = 8975831 }, + { url = "https://files.pythonhosted.org/packages/f7/f3/b776061e4f3ebf2905ba1a25d90380aafd10c02d406437a8ba22d1724d76/tokenizers-0.21.0-cp39-abi3-musllinux_1_2_armv7l.whl", hash = 
"sha256:089d56db6782a73a27fd8abf3ba21779f5b85d4a9f35e3b493c7bbcbbf0d539b", size = 8920746 }, + { url = "https://files.pythonhosted.org/packages/d8/ee/ce83d5ec8b6844ad4c3ecfe3333d58ecc1adc61f0878b323a15355bcab24/tokenizers-0.21.0-cp39-abi3-musllinux_1_2_i686.whl", hash = "sha256:c87ca3dc48b9b1222d984b6b7490355a6fdb411a2d810f6f05977258400ddb74", size = 9161814 }, + { url = "https://files.pythonhosted.org/packages/18/07/3e88e65c0ed28fa93aa0c4d264988428eef3df2764c3126dc83e243cb36f/tokenizers-0.21.0-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:4145505a973116f91bc3ac45988a92e618a6f83eb458f49ea0790df94ee243ff", size = 9357138 }, + { url = "https://files.pythonhosted.org/packages/15/b0/dc4572ca61555fc482ebc933f26cb407c6aceb3dc19c301c68184f8cad03/tokenizers-0.21.0-cp39-abi3-win32.whl", hash = "sha256:eb1702c2f27d25d9dd5b389cc1f2f51813e99f8ca30d9e25348db6585a97e24a", size = 2202266 }, + { url = "https://files.pythonhosted.org/packages/44/69/d21eb253fa91622da25585d362a874fa4710be600f0ea9446d8d0217cec1/tokenizers-0.21.0-cp39-abi3-win_amd64.whl", hash = "sha256:87841da5a25a3a5f70c102de371db120f41873b854ba65e52bccd57df5a3780c", size = 2389192 }, +] + +[[package]] +name = "tomli" +version = "2.2.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/18/87/302344fed471e44a87289cf4967697d07e532f2421fdaf868a303cbae4ff/tomli-2.2.1.tar.gz", hash = "sha256:cd45e1dc79c835ce60f7404ec8119f2eb06d38b1deba146f07ced3bbc44505ff", size = 17175 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/43/ca/75707e6efa2b37c77dadb324ae7d9571cb424e61ea73fad7c56c2d14527f/tomli-2.2.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:678e4fa69e4575eb77d103de3df8a895e1591b48e740211bd1067378c69e8249", size = 131077 }, + { url = "https://files.pythonhosted.org/packages/c7/16/51ae563a8615d472fdbffc43a3f3d46588c264ac4f024f63f01283becfbb/tomli-2.2.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:023aa114dd824ade0100497eb2318602af309e5a55595f76b626d6d9f3b7b0a6", size = 123429 }, + { url = "https://files.pythonhosted.org/packages/f1/dd/4f6cd1e7b160041db83c694abc78e100473c15d54620083dbd5aae7b990e/tomli-2.2.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ece47d672db52ac607a3d9599a9d48dcb2f2f735c6c2d1f34130085bb12b112a", size = 226067 }, + { url = "https://files.pythonhosted.org/packages/a9/6b/c54ede5dc70d648cc6361eaf429304b02f2871a345bbdd51e993d6cdf550/tomli-2.2.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6972ca9c9cc9f0acaa56a8ca1ff51e7af152a9f87fb64623e31d5c83700080ee", size = 236030 }, + { url = "https://files.pythonhosted.org/packages/1f/47/999514fa49cfaf7a92c805a86c3c43f4215621855d151b61c602abb38091/tomli-2.2.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c954d2250168d28797dd4e3ac5cf812a406cd5a92674ee4c8f123c889786aa8e", size = 240898 }, + { url = "https://files.pythonhosted.org/packages/73/41/0a01279a7ae09ee1573b423318e7934674ce06eb33f50936655071d81a24/tomli-2.2.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:8dd28b3e155b80f4d54beb40a441d366adcfe740969820caf156c019fb5c7ec4", size = 229894 }, + { url = "https://files.pythonhosted.org/packages/55/18/5d8bc5b0a0362311ce4d18830a5d28943667599a60d20118074ea1b01bb7/tomli-2.2.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:e59e304978767a54663af13c07b3d1af22ddee3bb2fb0618ca1593e4f593a106", size = 245319 }, + { url = 
"https://files.pythonhosted.org/packages/92/a3/7ade0576d17f3cdf5ff44d61390d4b3febb8a9fc2b480c75c47ea048c646/tomli-2.2.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:33580bccab0338d00994d7f16f4c4ec25b776af3ffaac1ed74e0b3fc95e885a8", size = 238273 }, + { url = "https://files.pythonhosted.org/packages/72/6f/fa64ef058ac1446a1e51110c375339b3ec6be245af9d14c87c4a6412dd32/tomli-2.2.1-cp311-cp311-win32.whl", hash = "sha256:465af0e0875402f1d226519c9904f37254b3045fc5084697cefb9bdde1ff99ff", size = 98310 }, + { url = "https://files.pythonhosted.org/packages/6a/1c/4a2dcde4a51b81be3530565e92eda625d94dafb46dbeb15069df4caffc34/tomli-2.2.1-cp311-cp311-win_amd64.whl", hash = "sha256:2d0f2fdd22b02c6d81637a3c95f8cd77f995846af7414c5c4b8d0545afa1bc4b", size = 108309 }, + { url = "https://files.pythonhosted.org/packages/52/e1/f8af4c2fcde17500422858155aeb0d7e93477a0d59a98e56cbfe75070fd0/tomli-2.2.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:4a8f6e44de52d5e6c657c9fe83b562f5f4256d8ebbfe4ff922c495620a7f6cea", size = 132762 }, + { url = "https://files.pythonhosted.org/packages/03/b8/152c68bb84fc00396b83e7bbddd5ec0bd3dd409db4195e2a9b3e398ad2e3/tomli-2.2.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8d57ca8095a641b8237d5b079147646153d22552f1c637fd3ba7f4b0b29167a8", size = 123453 }, + { url = "https://files.pythonhosted.org/packages/c8/d6/fc9267af9166f79ac528ff7e8c55c8181ded34eb4b0e93daa767b8841573/tomli-2.2.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4e340144ad7ae1533cb897d406382b4b6fede8890a03738ff1683af800d54192", size = 233486 }, + { url = "https://files.pythonhosted.org/packages/5c/51/51c3f2884d7bab89af25f678447ea7d297b53b5a3b5730a7cb2ef6069f07/tomli-2.2.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:db2b95f9de79181805df90bedc5a5ab4c165e6ec3fe99f970d0e302f384ad222", size = 242349 }, + { url = "https://files.pythonhosted.org/packages/ab/df/bfa89627d13a5cc22402e441e8a931ef2108403db390ff3345c05253935e/tomli-2.2.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:40741994320b232529c802f8bc86da4e1aa9f413db394617b9a256ae0f9a7f77", size = 252159 }, + { url = "https://files.pythonhosted.org/packages/9e/6e/fa2b916dced65763a5168c6ccb91066f7639bdc88b48adda990db10c8c0b/tomli-2.2.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:400e720fe168c0f8521520190686ef8ef033fb19fc493da09779e592861b78c6", size = 237243 }, + { url = "https://files.pythonhosted.org/packages/b4/04/885d3b1f650e1153cbb93a6a9782c58a972b94ea4483ae4ac5cedd5e4a09/tomli-2.2.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:02abe224de6ae62c19f090f68da4e27b10af2b93213d36cf44e6e1c5abd19fdd", size = 259645 }, + { url = "https://files.pythonhosted.org/packages/9c/de/6b432d66e986e501586da298e28ebeefd3edc2c780f3ad73d22566034239/tomli-2.2.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:b82ebccc8c8a36f2094e969560a1b836758481f3dc360ce9a3277c65f374285e", size = 244584 }, + { url = "https://files.pythonhosted.org/packages/1c/9a/47c0449b98e6e7d1be6cbac02f93dd79003234ddc4aaab6ba07a9a7482e2/tomli-2.2.1-cp312-cp312-win32.whl", hash = "sha256:889f80ef92701b9dbb224e49ec87c645ce5df3fa2cc548664eb8a25e03127a98", size = 98875 }, + { url = "https://files.pythonhosted.org/packages/ef/60/9b9638f081c6f1261e2688bd487625cd1e660d0a85bd469e91d8db969734/tomli-2.2.1-cp312-cp312-win_amd64.whl", hash = "sha256:7fc04e92e1d624a4a63c76474610238576942d6b8950a2d7f908a340494e67e4", size = 109418 }, + { url = 
"https://files.pythonhosted.org/packages/04/90/2ee5f2e0362cb8a0b6499dc44f4d7d48f8fff06d28ba46e6f1eaa61a1388/tomli-2.2.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:f4039b9cbc3048b2416cc57ab3bda989a6fcf9b36cf8937f01a6e731b64f80d7", size = 132708 }, + { url = "https://files.pythonhosted.org/packages/c0/ec/46b4108816de6b385141f082ba99e315501ccd0a2ea23db4a100dd3990ea/tomli-2.2.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:286f0ca2ffeeb5b9bd4fcc8d6c330534323ec51b2f52da063b11c502da16f30c", size = 123582 }, + { url = "https://files.pythonhosted.org/packages/a0/bd/b470466d0137b37b68d24556c38a0cc819e8febe392d5b199dcd7f578365/tomli-2.2.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a92ef1a44547e894e2a17d24e7557a5e85a9e1d0048b0b5e7541f76c5032cb13", size = 232543 }, + { url = "https://files.pythonhosted.org/packages/d9/e5/82e80ff3b751373f7cead2815bcbe2d51c895b3c990686741a8e56ec42ab/tomli-2.2.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9316dc65bed1684c9a98ee68759ceaed29d229e985297003e494aa825ebb0281", size = 241691 }, + { url = "https://files.pythonhosted.org/packages/05/7e/2a110bc2713557d6a1bfb06af23dd01e7dde52b6ee7dadc589868f9abfac/tomli-2.2.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e85e99945e688e32d5a35c1ff38ed0b3f41f43fad8df0bdf79f72b2ba7bc5272", size = 251170 }, + { url = "https://files.pythonhosted.org/packages/64/7b/22d713946efe00e0adbcdfd6d1aa119ae03fd0b60ebed51ebb3fa9f5a2e5/tomli-2.2.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:ac065718db92ca818f8d6141b5f66369833d4a80a9d74435a268c52bdfa73140", size = 236530 }, + { url = "https://files.pythonhosted.org/packages/38/31/3a76f67da4b0cf37b742ca76beaf819dca0ebef26d78fc794a576e08accf/tomli-2.2.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:d920f33822747519673ee656a4b6ac33e382eca9d331c87770faa3eef562aeb2", size = 258666 }, + { url = "https://files.pythonhosted.org/packages/07/10/5af1293da642aded87e8a988753945d0cf7e00a9452d3911dd3bb354c9e2/tomli-2.2.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:a198f10c4d1b1375d7687bc25294306e551bf1abfa4eace6650070a5c1ae2744", size = 243954 }, + { url = "https://files.pythonhosted.org/packages/5b/b9/1ed31d167be802da0fc95020d04cd27b7d7065cc6fbefdd2f9186f60d7bd/tomli-2.2.1-cp313-cp313-win32.whl", hash = "sha256:d3f5614314d758649ab2ab3a62d4f2004c825922f9e370b29416484086b264ec", size = 98724 }, + { url = "https://files.pythonhosted.org/packages/c7/32/b0963458706accd9afcfeb867c0f9175a741bf7b19cd424230714d722198/tomli-2.2.1-cp313-cp313-win_amd64.whl", hash = "sha256:a38aa0308e754b0e3c67e344754dff64999ff9b513e691d0e786265c93583c69", size = 109383 }, + { url = "https://files.pythonhosted.org/packages/6e/c2/61d3e0f47e2b74ef40a68b9e6ad5984f6241a942f7cd3bbfbdbd03861ea9/tomli-2.2.1-py3-none-any.whl", hash = "sha256:cb55c73c5f4408779d0cf3eef9f762b9c9f147a77de7b258bef0a5628adc85cc", size = 14257 }, +] + +[[package]] +name = "torch" +version = "2.5.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "filelock" }, + { name = "fsspec" }, + { name = "jinja2" }, + { name = "networkx", version = "3.2.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, + { name = "networkx", version = "3.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "nvidia-cublas-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, 
+ { name = "nvidia-cuda-cupti-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cuda-nvrtc-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cuda-runtime-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cudnn-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cufft-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-curand-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cusolver-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cusparse-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-nccl-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-nvjitlink-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-nvtx-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "setuptools", marker = "python_full_version >= '3.12'" }, + { name = "sympy" }, + { name = "triton", marker = "python_full_version < '3.13' and platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "typing-extensions" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/2a/ef/834af4a885b31a0b32fff2d80e1e40f771e1566ea8ded55347502440786a/torch-2.5.1-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:71328e1bbe39d213b8721678f9dcac30dfc452a46d586f1d514a6aa0a99d4744", size = 906446312 }, + { url = "https://files.pythonhosted.org/packages/69/f0/46e74e0d145f43fa506cb336eaefb2d240547e4ce1f496e442711093ab25/torch-2.5.1-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:34bfa1a852e5714cbfa17f27c49d8ce35e1b7af5608c4bc6e81392c352dbc601", size = 91919522 }, + { url = "https://files.pythonhosted.org/packages/a5/13/1eb674c8efbd04d71e4a157ceba991904f633e009a584dd65dccbafbb648/torch-2.5.1-cp310-cp310-win_amd64.whl", hash = "sha256:32a037bd98a241df6c93e4c789b683335da76a2ac142c0973675b715102dc5fa", size = 203088048 }, + { url = "https://files.pythonhosted.org/packages/a9/9d/e0860474ee0ff8f6ef2c50ec8f71a250f38d78a9b9df9fd241ad3397a65b/torch-2.5.1-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:23d062bf70776a3d04dbe74db950db2a5245e1ba4f27208a87f0d743b0d06e86", size = 63877046 }, + { url = "https://files.pythonhosted.org/packages/d1/35/e8b2daf02ce933e4518e6f5682c72fd0ed66c15910ea1fb4168f442b71c4/torch-2.5.1-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:de5b7d6740c4b636ef4db92be922f0edc425b65ed78c5076c43c42d362a45457", size = 906474467 }, + { url = "https://files.pythonhosted.org/packages/40/04/bd91593a4ca178ece93ca55f27e2783aa524aaccbfda66831d59a054c31e/torch-2.5.1-cp311-cp311-manylinux2014_aarch64.whl", hash = "sha256:340ce0432cad0d37f5a31be666896e16788f1adf8ad7be481196b503dad675b9", size = 91919450 }, + { url = "https://files.pythonhosted.org/packages/0d/4a/e51420d46cfc90562e85af2fee912237c662ab31140ab179e49bd69401d6/torch-2.5.1-cp311-cp311-win_amd64.whl", hash = "sha256:603c52d2fe06433c18b747d25f5c333f9c1d58615620578c326d66f258686f9a", size = 203098237 }, + { url = "https://files.pythonhosted.org/packages/d0/db/5d9cbfbc7968d79c5c09a0bc0bc3735da079f2fd07cc10498a62b320a480/torch-2.5.1-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:31f8c39660962f9ae4eeec995e3049b5492eb7360dd4f07377658ef4d728fa4c", size = 63884466 }, 
+ { url = "https://files.pythonhosted.org/packages/8b/5c/36c114d120bfe10f9323ed35061bc5878cc74f3f594003854b0ea298942f/torch-2.5.1-cp312-cp312-manylinux1_x86_64.whl", hash = "sha256:ed231a4b3a5952177fafb661213d690a72caaad97d5824dd4fc17ab9e15cec03", size = 906389343 }, + { url = "https://files.pythonhosted.org/packages/6d/69/d8ada8b6e0a4257556d5b4ddeb4345ea8eeaaef3c98b60d1cca197c7ad8e/torch-2.5.1-cp312-cp312-manylinux2014_aarch64.whl", hash = "sha256:3f4b7f10a247e0dcd7ea97dc2d3bfbfc90302ed36d7f3952b0008d0df264e697", size = 91811673 }, + { url = "https://files.pythonhosted.org/packages/5f/ba/607d013b55b9fd805db2a5c2662ec7551f1910b4eef39653eeaba182c5b2/torch-2.5.1-cp312-cp312-win_amd64.whl", hash = "sha256:73e58e78f7d220917c5dbfad1a40e09df9929d3b95d25e57d9f8558f84c9a11c", size = 203046841 }, + { url = "https://files.pythonhosted.org/packages/57/6c/bf52ff061da33deb9f94f4121fde7ff3058812cb7d2036c97bc167793bd1/torch-2.5.1-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:8c712df61101964eb11910a846514011f0b6f5920c55dbf567bff8a34163d5b1", size = 63858109 }, + { url = "https://files.pythonhosted.org/packages/69/72/20cb30f3b39a9face296491a86adb6ff8f1a47a897e4d14667e6cf89d5c3/torch-2.5.1-cp313-cp313-manylinux1_x86_64.whl", hash = "sha256:9b61edf3b4f6e3b0e0adda8b3960266b9009d02b37555971f4d1c8f7a05afed7", size = 906393265 }, + { url = "https://files.pythonhosted.org/packages/a9/18/81c399e8f4f1580d34bf99d827cb5fb5cf7a18a266bb5d30ca3ec2e89ba6/torch-2.5.1-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:1f3b7fb3cf7ab97fae52161423f81be8c6b8afac8d9760823fd623994581e1a3", size = 906479005 }, + { url = "https://files.pythonhosted.org/packages/5d/86/1c4b168d52cddb8d17952a7b5b25f69ef0da1fc34de1223d73d0d9db1801/torch-2.5.1-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:7974e3dce28b5a21fb554b73e1bc9072c25dde873fa00d54280861e7a009d7dc", size = 91846074 }, + { url = "https://files.pythonhosted.org/packages/76/49/4a0a8b19ce8f9bf32fcab4e863c7e2366f519f9826c84ca250567b11a014/torch-2.5.1-cp39-cp39-win_amd64.whl", hash = "sha256:46c817d3ea33696ad3b9df5e774dba2257e9a4cd3c4a3afbf92f6bb13ac5ce2d", size = 203000888 }, + { url = "https://files.pythonhosted.org/packages/25/07/3548a7cfcf69d0eccec2ee79ee3913f1cdaadb27b36946774db86729ee47/torch-2.5.1-cp39-none-macosx_11_0_arm64.whl", hash = "sha256:8046768b7f6d35b85d101b4b38cba8aa2f3cd51952bc4c06a49580f2ce682291", size = 63876023 }, +] + +[[package]] +name = "tqdm" +version = "4.67.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a8/4b/29b4ef32e036bb34e4ab51796dd745cdba7ed47ad142a9f4a1eb8e0c744d/tqdm-4.67.1.tar.gz", hash = "sha256:f8aef9c52c08c13a65f30ea34f4e5aac3fd1a34959879d7e59e63027286627f2", size = 169737 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d0/30/dc54f88dd4a2b5dc8a0279bdd7270e735851848b762aeb1c1184ed1f6b14/tqdm-4.67.1-py3-none-any.whl", hash = "sha256:26445eca388f82e72884e0d580d5464cd801a3ea01e63e5601bdff9ba6a48de2", size = 78540 }, +] + +[[package]] +name = "transformers" +version = "4.48.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "filelock" }, + { name = "huggingface-hub" }, + { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, + { name = "numpy", version = "2.2.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = 
"packaging" }, + { name = "pyyaml" }, + { name = "regex" }, + { name = "requests" }, + { name = "safetensors" }, + { name = "tokenizers" }, + { name = "tqdm" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ea/71/93a6331682d6f15adf7d646956db0c43e5f1759bbbd05f2ef53029bae107/transformers-4.48.0.tar.gz", hash = "sha256:03fdfcbfb8b0367fb6c9fbe9d1c9aa54dfd847618be9b52400b2811d22799cb1", size = 8372101 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/45/d6/a69764e89fc5c2c957aa473881527c8c35521108d553df703e9ba703daeb/transformers-4.48.0-py3-none-any.whl", hash = "sha256:6d3de6d71cb5f2a10f9775ccc17abce9620195caaf32ec96542bd2a6937f25b0", size = 9673380 }, +] + +[[package]] +name = "triton" +version = "3.1.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "filelock" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/98/29/69aa56dc0b2eb2602b553881e34243475ea2afd9699be042316842788ff5/triton-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6b0dd10a925263abbe9fa37dcde67a5e9b2383fc269fdf59f5657cac38c5d1d8", size = 209460013 }, + { url = "https://files.pythonhosted.org/packages/86/17/d9a5cf4fcf46291856d1e90762e36cbabd2a56c7265da0d1d9508c8e3943/triton-3.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0f34f6e7885d1bf0eaaf7ba875a5f0ce6f3c13ba98f9503651c1e6dc6757ed5c", size = 209506424 }, + { url = "https://files.pythonhosted.org/packages/78/eb/65f5ba83c2a123f6498a3097746607e5b2f16add29e36765305e4ac7fdd8/triton-3.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c8182f42fd8080a7d39d666814fa36c5e30cc00ea7eeeb1a2983dbb4c99a0fdc", size = 209551444 }, + { url = "https://files.pythonhosted.org/packages/c4/69/57e0fed438d547524e08bfedc587078314176ad1c15c8be904d3f03149ec/triton-3.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aafa9a20cd0d9fee523cd4504aa7131807a864cd77dcf6efe7e981f18b8c6c11", size = 209460480 }, +] + +[[package]] +name = "typer" +version = "0.15.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "click" }, + { name = "rich" }, + { name = "shellingham" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/cb/ce/dca7b219718afd37a0068f4f2530a727c2b74a8b6e8e0c0080a4c0de4fcd/typer-0.15.1.tar.gz", hash = "sha256:a0588c0a7fa68a1978a069818657778f86abe6ff5ea6abf472f940a08bfe4f0a", size = 99789 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d0/cc/0a838ba5ca64dc832aa43f727bd586309846b0ffb2ce52422543e6075e8a/typer-0.15.1-py3-none-any.whl", hash = "sha256:7994fb7b8155b64d3402518560648446072864beefd44aa2dc36972a5972e847", size = 44908 }, +] + +[[package]] +name = "types-protobuf" +version = "5.29.1.20241207" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/70/89/b661a447139f665ccea8e39bfdd52a92f803df4b5de0e6001a3537feaacb/types_protobuf-5.29.1.20241207.tar.gz", hash = "sha256:2ebcadb8ab3ef2e3e2f067e0882906d64ba0dc65fc5b0fd7a8b692315b4a0be9", size = 59190 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7e/6e/cdf152187019d6f6d04066b23e48659d961b527e9c6d43b48459d160e332/types_protobuf-5.29.1.20241207-py3-none-any.whl", hash = "sha256:92893c42083e9b718c678badc0af7a9a1307b92afe1599e5cba5f3d35b668b2f", size = 73902 }, +] + +[[package]] +name = "typing-extensions" +version = "4.12.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/df/db/f35a00659bc03fec321ba8bce9420de607a1d37f8342eee1863174c69557/typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8", size = 85321 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/26/9f/ad63fc0248c5379346306f8668cda6e2e2e9c95e01216d2b8ffd9ff037d0/typing_extensions-4.12.2-py3-none-any.whl", hash = "sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d", size = 37438 }, +] + +[[package]] +name = "tzdata" +version = "2024.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/e1/34/943888654477a574a86a98e9896bae89c7aa15078ec29f490fef2f1e5384/tzdata-2024.2.tar.gz", hash = "sha256:7d85cc416e9382e69095b7bdf4afd9e3880418a2413feec7069d533d6b4e31cc", size = 193282 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a6/ab/7e5f53c3b9d14972843a647d8d7a853969a58aecc7559cb3267302c94774/tzdata-2024.2-py2.py3-none-any.whl", hash = "sha256:a48093786cdcde33cad18c2555e8532f34422074448fbc874186f0abd79565cd", size = 346586 }, +] + +[[package]] +name = "urllib3" +version = "2.3.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/aa/63/e53da845320b757bf29ef6a9062f5c669fe997973f966045cb019c3f4b66/urllib3-2.3.0.tar.gz", hash = "sha256:f8c5449b3cf0861679ce7e0503c7b44b5ec981bec0d1d3795a07f1ba96f0204d", size = 307268 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c8/19/4ec628951a74043532ca2cf5d97b7b14863931476d117c471e8e2b1eb39f/urllib3-2.3.0-py3-none-any.whl", hash = "sha256:1cee9ad369867bfdbbb48b7dd50374c0967a0bb7710050facf0dd6911440e3df", size = 128369 }, +] + +[[package]] +name = "win32-setctime" +version = "1.2.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/b3/8f/705086c9d734d3b663af0e9bb3d4de6578d08f46b1b101c2442fd9aecaa2/win32_setctime-1.2.0.tar.gz", hash = "sha256:ae1fdf948f5640aae05c511ade119313fb6a30d7eabe25fef9764dca5873c4c0", size = 4867 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e1/07/c6fe3ad3e685340704d314d765b7912993bcb8dc198f0e7a89382d37974b/win32_setctime-1.2.0-py3-none-any.whl", hash = "sha256:95d644c4e708aba81dc3704a116d8cbc974d70b3bdb8be1d150e36be6e9d1390", size = 4083 }, +] + +[[package]] +name = "wrapt" +version = "1.17.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/c3/fc/e91cc220803d7bc4db93fb02facd8461c37364151b8494762cc88b0fbcef/wrapt-1.17.2.tar.gz", hash = "sha256:41388e9d4d1522446fe79d3213196bd9e3b301a336965b9e27ca2788ebd122f3", size = 55531 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/5a/d1/1daec934997e8b160040c78d7b31789f19b122110a75eca3d4e8da0049e1/wrapt-1.17.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:3d57c572081fed831ad2d26fd430d565b76aa277ed1d30ff4d40670b1c0dd984", size = 53307 }, + { url = "https://files.pythonhosted.org/packages/1b/7b/13369d42651b809389c1a7153baa01d9700430576c81a2f5c5e460df0ed9/wrapt-1.17.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b5e251054542ae57ac7f3fba5d10bfff615b6c2fb09abeb37d2f1463f841ae22", size = 38486 }, + { url = "https://files.pythonhosted.org/packages/62/bf/e0105016f907c30b4bd9e377867c48c34dc9c6c0c104556c9c9126bd89ed/wrapt-1.17.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:80dd7db6a7cb57ffbc279c4394246414ec99537ae81ffd702443335a61dbf3a7", size = 38777 }, + { url = 
"https://files.pythonhosted.org/packages/27/70/0f6e0679845cbf8b165e027d43402a55494779295c4b08414097b258ac87/wrapt-1.17.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0a6e821770cf99cc586d33833b2ff32faebdbe886bd6322395606cf55153246c", size = 83314 }, + { url = "https://files.pythonhosted.org/packages/0f/77/0576d841bf84af8579124a93d216f55d6f74374e4445264cb378a6ed33eb/wrapt-1.17.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b60fb58b90c6d63779cb0c0c54eeb38941bae3ecf7a73c764c52c88c2dcb9d72", size = 74947 }, + { url = "https://files.pythonhosted.org/packages/90/ec/00759565518f268ed707dcc40f7eeec38637d46b098a1f5143bff488fe97/wrapt-1.17.2-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b870b5df5b71d8c3359d21be8f0d6c485fa0ebdb6477dda51a1ea54a9b558061", size = 82778 }, + { url = "https://files.pythonhosted.org/packages/f8/5a/7cffd26b1c607b0b0c8a9ca9d75757ad7620c9c0a9b4a25d3f8a1480fafc/wrapt-1.17.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:4011d137b9955791f9084749cba9a367c68d50ab8d11d64c50ba1688c9b457f2", size = 81716 }, + { url = "https://files.pythonhosted.org/packages/7e/09/dccf68fa98e862df7e6a60a61d43d644b7d095a5fc36dbb591bbd4a1c7b2/wrapt-1.17.2-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:1473400e5b2733e58b396a04eb7f35f541e1fb976d0c0724d0223dd607e0f74c", size = 74548 }, + { url = "https://files.pythonhosted.org/packages/b7/8e/067021fa3c8814952c5e228d916963c1115b983e21393289de15128e867e/wrapt-1.17.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:3cedbfa9c940fdad3e6e941db7138e26ce8aad38ab5fe9dcfadfed9db7a54e62", size = 81334 }, + { url = "https://files.pythonhosted.org/packages/4b/0d/9d4b5219ae4393f718699ca1c05f5ebc0c40d076f7e65fd48f5f693294fb/wrapt-1.17.2-cp310-cp310-win32.whl", hash = "sha256:582530701bff1dec6779efa00c516496968edd851fba224fbd86e46cc6b73563", size = 36427 }, + { url = "https://files.pythonhosted.org/packages/72/6a/c5a83e8f61aec1e1aeef939807602fb880e5872371e95df2137142f5c58e/wrapt-1.17.2-cp310-cp310-win_amd64.whl", hash = "sha256:58705da316756681ad3c9c73fd15499aa4d8c69f9fd38dc8a35e06c12468582f", size = 38774 }, + { url = "https://files.pythonhosted.org/packages/cd/f7/a2aab2cbc7a665efab072344a8949a71081eed1d2f451f7f7d2b966594a2/wrapt-1.17.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:ff04ef6eec3eee8a5efef2401495967a916feaa353643defcc03fc74fe213b58", size = 53308 }, + { url = "https://files.pythonhosted.org/packages/50/ff/149aba8365fdacef52b31a258c4dc1c57c79759c335eff0b3316a2664a64/wrapt-1.17.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4db983e7bca53819efdbd64590ee96c9213894272c776966ca6306b73e4affda", size = 38488 }, + { url = "https://files.pythonhosted.org/packages/65/46/5a917ce85b5c3b490d35c02bf71aedaa9f2f63f2d15d9949cc4ba56e8ba9/wrapt-1.17.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:9abc77a4ce4c6f2a3168ff34b1da9b0f311a8f1cfd694ec96b0603dff1c79438", size = 38776 }, + { url = "https://files.pythonhosted.org/packages/ca/74/336c918d2915a4943501c77566db41d1bd6e9f4dbc317f356b9a244dfe83/wrapt-1.17.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0b929ac182f5ace000d459c59c2c9c33047e20e935f8e39371fa6e3b85d56f4a", size = 83776 }, + { url = 
"https://files.pythonhosted.org/packages/09/99/c0c844a5ccde0fe5761d4305485297f91d67cf2a1a824c5f282e661ec7ff/wrapt-1.17.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f09b286faeff3c750a879d336fb6d8713206fc97af3adc14def0cdd349df6000", size = 75420 }, + { url = "https://files.pythonhosted.org/packages/b4/b0/9fc566b0fe08b282c850063591a756057c3247b2362b9286429ec5bf1721/wrapt-1.17.2-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1a7ed2d9d039bd41e889f6fb9364554052ca21ce823580f6a07c4ec245c1f5d6", size = 83199 }, + { url = "https://files.pythonhosted.org/packages/9d/4b/71996e62d543b0a0bd95dda485219856def3347e3e9380cc0d6cf10cfb2f/wrapt-1.17.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:129a150f5c445165ff941fc02ee27df65940fcb8a22a61828b1853c98763a64b", size = 82307 }, + { url = "https://files.pythonhosted.org/packages/39/35/0282c0d8789c0dc9bcc738911776c762a701f95cfe113fb8f0b40e45c2b9/wrapt-1.17.2-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:1fb5699e4464afe5c7e65fa51d4f99e0b2eadcc176e4aa33600a3df7801d6662", size = 75025 }, + { url = "https://files.pythonhosted.org/packages/4f/6d/90c9fd2c3c6fee181feecb620d95105370198b6b98a0770cba090441a828/wrapt-1.17.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:9a2bce789a5ea90e51a02dfcc39e31b7f1e662bc3317979aa7e5538e3a034f72", size = 81879 }, + { url = "https://files.pythonhosted.org/packages/8f/fa/9fb6e594f2ce03ef03eddbdb5f4f90acb1452221a5351116c7c4708ac865/wrapt-1.17.2-cp311-cp311-win32.whl", hash = "sha256:4afd5814270fdf6380616b321fd31435a462019d834f83c8611a0ce7484c7317", size = 36419 }, + { url = "https://files.pythonhosted.org/packages/47/f8/fb1773491a253cbc123c5d5dc15c86041f746ed30416535f2a8df1f4a392/wrapt-1.17.2-cp311-cp311-win_amd64.whl", hash = "sha256:acc130bc0375999da18e3d19e5a86403667ac0c4042a094fefb7eec8ebac7cf3", size = 38773 }, + { url = "https://files.pythonhosted.org/packages/a1/bd/ab55f849fd1f9a58ed7ea47f5559ff09741b25f00c191231f9f059c83949/wrapt-1.17.2-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:d5e2439eecc762cd85e7bd37161d4714aa03a33c5ba884e26c81559817ca0925", size = 53799 }, + { url = "https://files.pythonhosted.org/packages/53/18/75ddc64c3f63988f5a1d7e10fb204ffe5762bc663f8023f18ecaf31a332e/wrapt-1.17.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:3fc7cb4c1c744f8c05cd5f9438a3caa6ab94ce8344e952d7c45a8ed59dd88392", size = 38821 }, + { url = "https://files.pythonhosted.org/packages/48/2a/97928387d6ed1c1ebbfd4efc4133a0633546bec8481a2dd5ec961313a1c7/wrapt-1.17.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8fdbdb757d5390f7c675e558fd3186d590973244fab0c5fe63d373ade3e99d40", size = 38919 }, + { url = "https://files.pythonhosted.org/packages/73/54/3bfe5a1febbbccb7a2f77de47b989c0b85ed3a6a41614b104204a788c20e/wrapt-1.17.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5bb1d0dbf99411f3d871deb6faa9aabb9d4e744d67dcaaa05399af89d847a91d", size = 88721 }, + { url = "https://files.pythonhosted.org/packages/25/cb/7262bc1b0300b4b64af50c2720ef958c2c1917525238d661c3e9a2b71b7b/wrapt-1.17.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d18a4865f46b8579d44e4fe1e2bcbc6472ad83d98e22a26c963d46e4c125ef0b", size = 80899 }, + { url = 
"https://files.pythonhosted.org/packages/2a/5a/04cde32b07a7431d4ed0553a76fdb7a61270e78c5fd5a603e190ac389f14/wrapt-1.17.2-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc570b5f14a79734437cb7b0500376b6b791153314986074486e0b0fa8d71d98", size = 89222 }, + { url = "https://files.pythonhosted.org/packages/09/28/2e45a4f4771fcfb109e244d5dbe54259e970362a311b67a965555ba65026/wrapt-1.17.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6d9187b01bebc3875bac9b087948a2bccefe464a7d8f627cf6e48b1bbae30f82", size = 86707 }, + { url = "https://files.pythonhosted.org/packages/c6/d2/dcb56bf5f32fcd4bd9aacc77b50a539abdd5b6536872413fd3f428b21bed/wrapt-1.17.2-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:9e8659775f1adf02eb1e6f109751268e493c73716ca5761f8acb695e52a756ae", size = 79685 }, + { url = "https://files.pythonhosted.org/packages/80/4e/eb8b353e36711347893f502ce91c770b0b0929f8f0bed2670a6856e667a9/wrapt-1.17.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:e8b2816ebef96d83657b56306152a93909a83f23994f4b30ad4573b00bd11bb9", size = 87567 }, + { url = "https://files.pythonhosted.org/packages/17/27/4fe749a54e7fae6e7146f1c7d914d28ef599dacd4416566c055564080fe2/wrapt-1.17.2-cp312-cp312-win32.whl", hash = "sha256:468090021f391fe0056ad3e807e3d9034e0fd01adcd3bdfba977b6fdf4213ea9", size = 36672 }, + { url = "https://files.pythonhosted.org/packages/15/06/1dbf478ea45c03e78a6a8c4be4fdc3c3bddea5c8de8a93bc971415e47f0f/wrapt-1.17.2-cp312-cp312-win_amd64.whl", hash = "sha256:ec89ed91f2fa8e3f52ae53cd3cf640d6feff92ba90d62236a81e4e563ac0e991", size = 38865 }, + { url = "https://files.pythonhosted.org/packages/ce/b9/0ffd557a92f3b11d4c5d5e0c5e4ad057bd9eb8586615cdaf901409920b14/wrapt-1.17.2-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:6ed6ffac43aecfe6d86ec5b74b06a5be33d5bb9243d055141e8cabb12aa08125", size = 53800 }, + { url = "https://files.pythonhosted.org/packages/c0/ef/8be90a0b7e73c32e550c73cfb2fa09db62234227ece47b0e80a05073b375/wrapt-1.17.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:35621ae4c00e056adb0009f8e86e28eb4a41a4bfa8f9bfa9fca7d343fe94f998", size = 38824 }, + { url = "https://files.pythonhosted.org/packages/36/89/0aae34c10fe524cce30fe5fc433210376bce94cf74d05b0d68344c8ba46e/wrapt-1.17.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:a604bf7a053f8362d27eb9fefd2097f82600b856d5abe996d623babd067b1ab5", size = 38920 }, + { url = "https://files.pythonhosted.org/packages/3b/24/11c4510de906d77e0cfb5197f1b1445d4fec42c9a39ea853d482698ac681/wrapt-1.17.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5cbabee4f083b6b4cd282f5b817a867cf0b1028c54d445b7ec7cfe6505057cf8", size = 88690 }, + { url = "https://files.pythonhosted.org/packages/71/d7/cfcf842291267bf455b3e266c0c29dcb675b5540ee8b50ba1699abf3af45/wrapt-1.17.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:49703ce2ddc220df165bd2962f8e03b84c89fee2d65e1c24a7defff6f988f4d6", size = 80861 }, + { url = "https://files.pythonhosted.org/packages/d5/66/5d973e9f3e7370fd686fb47a9af3319418ed925c27d72ce16b791231576d/wrapt-1.17.2-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8112e52c5822fc4253f3901b676c55ddf288614dc7011634e2719718eaa187dc", size = 89174 }, + { url = "https://files.pythonhosted.org/packages/a7/d3/8e17bb70f6ae25dabc1aaf990f86824e4fd98ee9cadf197054e068500d27/wrapt-1.17.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = 
"sha256:9fee687dce376205d9a494e9c121e27183b2a3df18037f89d69bd7b35bcf59e2", size = 86721 }, + { url = "https://files.pythonhosted.org/packages/6f/54/f170dfb278fe1c30d0ff864513cff526d624ab8de3254b20abb9cffedc24/wrapt-1.17.2-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:18983c537e04d11cf027fbb60a1e8dfd5190e2b60cc27bc0808e653e7b218d1b", size = 79763 }, + { url = "https://files.pythonhosted.org/packages/4a/98/de07243751f1c4a9b15c76019250210dd3486ce098c3d80d5f729cba029c/wrapt-1.17.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:703919b1633412ab54bcf920ab388735832fdcb9f9a00ae49387f0fe67dad504", size = 87585 }, + { url = "https://files.pythonhosted.org/packages/f9/f0/13925f4bd6548013038cdeb11ee2cbd4e37c30f8bfd5db9e5a2a370d6e20/wrapt-1.17.2-cp313-cp313-win32.whl", hash = "sha256:abbb9e76177c35d4e8568e58650aa6926040d6a9f6f03435b7a522bf1c487f9a", size = 36676 }, + { url = "https://files.pythonhosted.org/packages/bf/ae/743f16ef8c2e3628df3ddfd652b7d4c555d12c84b53f3d8218498f4ade9b/wrapt-1.17.2-cp313-cp313-win_amd64.whl", hash = "sha256:69606d7bb691b50a4240ce6b22ebb319c1cfb164e5f6569835058196e0f3a845", size = 38871 }, + { url = "https://files.pythonhosted.org/packages/3d/bc/30f903f891a82d402ffb5fda27ec1d621cc97cb74c16fea0b6141f1d4e87/wrapt-1.17.2-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:4a721d3c943dae44f8e243b380cb645a709ba5bd35d3ad27bc2ed947e9c68192", size = 56312 }, + { url = "https://files.pythonhosted.org/packages/8a/04/c97273eb491b5f1c918857cd26f314b74fc9b29224521f5b83f872253725/wrapt-1.17.2-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:766d8bbefcb9e00c3ac3b000d9acc51f1b399513f44d77dfe0eb026ad7c9a19b", size = 40062 }, + { url = "https://files.pythonhosted.org/packages/4e/ca/3b7afa1eae3a9e7fefe499db9b96813f41828b9fdb016ee836c4c379dadb/wrapt-1.17.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:e496a8ce2c256da1eb98bd15803a79bee00fc351f5dfb9ea82594a3f058309e0", size = 40155 }, + { url = "https://files.pythonhosted.org/packages/89/be/7c1baed43290775cb9030c774bc53c860db140397047cc49aedaf0a15477/wrapt-1.17.2-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:40d615e4fe22f4ad3528448c193b218e077656ca9ccb22ce2cb20db730f8d306", size = 113471 }, + { url = "https://files.pythonhosted.org/packages/32/98/4ed894cf012b6d6aae5f5cc974006bdeb92f0241775addad3f8cd6ab71c8/wrapt-1.17.2-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a5aaeff38654462bc4b09023918b7f21790efb807f54c000a39d41d69cf552cb", size = 101208 }, + { url = "https://files.pythonhosted.org/packages/ea/fd/0c30f2301ca94e655e5e057012e83284ce8c545df7661a78d8bfca2fac7a/wrapt-1.17.2-cp313-cp313t-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9a7d15bbd2bc99e92e39f49a04653062ee6085c0e18b3b7512a4f2fe91f2d681", size = 109339 }, + { url = "https://files.pythonhosted.org/packages/75/56/05d000de894c4cfcb84bcd6b1df6214297b8089a7bd324c21a4765e49b14/wrapt-1.17.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:e3890b508a23299083e065f435a492b5435eba6e304a7114d2f919d400888cc6", size = 110232 }, + { url = "https://files.pythonhosted.org/packages/53/f8/c3f6b2cf9b9277fb0813418e1503e68414cd036b3b099c823379c9575e6d/wrapt-1.17.2-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:8c8b293cd65ad716d13d8dd3624e42e5a19cc2a2f1acc74b30c2c13f15cb61a6", size = 100476 }, + { url = 
"https://files.pythonhosted.org/packages/a7/b1/0bb11e29aa5139d90b770ebbfa167267b1fc548d2302c30c8f7572851738/wrapt-1.17.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:4c82b8785d98cdd9fed4cac84d765d234ed3251bd6afe34cb7ac523cb93e8b4f", size = 106377 }, + { url = "https://files.pythonhosted.org/packages/6a/e1/0122853035b40b3f333bbb25f1939fc1045e21dd518f7f0922b60c156f7c/wrapt-1.17.2-cp313-cp313t-win32.whl", hash = "sha256:13e6afb7fe71fe7485a4550a8844cc9ffbe263c0f1a1eea569bc7091d4898555", size = 37986 }, + { url = "https://files.pythonhosted.org/packages/09/5e/1655cf481e079c1f22d0cabdd4e51733679932718dc23bf2db175f329b76/wrapt-1.17.2-cp313-cp313t-win_amd64.whl", hash = "sha256:eaf675418ed6b3b31c7a989fd007fa7c3be66ce14e5c3b27336383604c9da85c", size = 40750 }, + { url = "https://files.pythonhosted.org/packages/8a/f4/6ed2b8f6f1c832933283974839b88ec7c983fd12905e01e97889dadf7559/wrapt-1.17.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:99039fa9e6306880572915728d7f6c24a86ec57b0a83f6b2491e1d8ab0235b9a", size = 53308 }, + { url = "https://files.pythonhosted.org/packages/a2/a9/712a53f8f4f4545768ac532619f6e56d5d0364a87b2212531685e89aeef8/wrapt-1.17.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:2696993ee1eebd20b8e4ee4356483c4cb696066ddc24bd70bcbb80fa56ff9061", size = 38489 }, + { url = "https://files.pythonhosted.org/packages/fa/9b/e172c8f28a489a2888df18f953e2f6cb8d33b1a2e78c9dfc52d8bf6a5ead/wrapt-1.17.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:612dff5db80beef9e649c6d803a8d50c409082f1fedc9dbcdfde2983b2025b82", size = 38776 }, + { url = "https://files.pythonhosted.org/packages/cf/cb/7a07b51762dcd59bdbe07aa97f87b3169766cadf240f48d1cbe70a1be9db/wrapt-1.17.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:62c2caa1585c82b3f7a7ab56afef7b3602021d6da34fbc1cf234ff139fed3cd9", size = 83050 }, + { url = "https://files.pythonhosted.org/packages/a5/51/a42757dd41032afd6d8037617aa3bc6803ba971850733b24dfb7d5c627c4/wrapt-1.17.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c958bcfd59bacc2d0249dcfe575e71da54f9dcf4a8bdf89c4cb9a68a1170d73f", size = 74718 }, + { url = "https://files.pythonhosted.org/packages/bf/bb/d552bfe47db02fcfc950fc563073a33500f8108efa5f7b41db2f83a59028/wrapt-1.17.2-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fc78a84e2dfbc27afe4b2bd7c80c8db9bca75cc5b85df52bfe634596a1da846b", size = 82590 }, + { url = "https://files.pythonhosted.org/packages/77/99/77b06b3c3c410dbae411105bf22496facf03a5496bfaca8fbcf9da381889/wrapt-1.17.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:ba0f0eb61ef00ea10e00eb53a9129501f52385c44853dbd6c4ad3f403603083f", size = 81462 }, + { url = "https://files.pythonhosted.org/packages/2d/21/cf0bd85ae66f92600829ea1de8e1da778e5e9f6e574ccbe74b66db0d95db/wrapt-1.17.2-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:1e1fe0e6ab7775fd842bc39e86f6dcfc4507ab0ffe206093e76d61cde37225c8", size = 74309 }, + { url = "https://files.pythonhosted.org/packages/6d/16/112d25e9092398a0dd6fec50ab7ac1b775a0c19b428f049785096067ada9/wrapt-1.17.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:c86563182421896d73858e08e1db93afdd2b947a70064b813d515d66549e15f9", size = 81081 }, + { url = "https://files.pythonhosted.org/packages/2b/49/364a615a0cc0872685646c495c7172e4fc7bf1959e3b12a1807a03014e05/wrapt-1.17.2-cp39-cp39-win32.whl", hash = "sha256:f393cda562f79828f38a819f4788641ac7c4085f30f1ce1a68672baa686482bb", size = 36423 }, + { 
url = "https://files.pythonhosted.org/packages/00/ad/5d2c1b34ba3202cd833d9221833e74d6500ce66730974993a8dc9a94fb8c/wrapt-1.17.2-cp39-cp39-win_amd64.whl", hash = "sha256:36ccae62f64235cf8ddb682073a60519426fdd4725524ae38874adf72b5f2aeb", size = 38772 }, + { url = "https://files.pythonhosted.org/packages/2d/82/f56956041adef78f849db6b289b282e72b55ab8045a75abad81898c28d19/wrapt-1.17.2-py3-none-any.whl", hash = "sha256:b18f2d1533a71f069c7f82d524a52599053d4c7166e9dd374ae2136b7f40f7c8", size = 23594 }, +] + +[[package]] +name = "xxhash" +version = "3.5.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/00/5e/d6e5258d69df8b4ed8c83b6664f2b47d30d2dec551a29ad72a6c69eafd31/xxhash-3.5.0.tar.gz", hash = "sha256:84f2caddf951c9cbf8dc2e22a89d4ccf5d86391ac6418fe81e3c67d0cf60b45f", size = 84241 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/bb/8a/0e9feca390d512d293afd844d31670e25608c4a901e10202aa98785eab09/xxhash-3.5.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ece616532c499ee9afbb83078b1b952beffef121d989841f7f4b3dc5ac0fd212", size = 31970 }, + { url = "https://files.pythonhosted.org/packages/16/e6/be5aa49580cd064a18200ab78e29b88b1127e1a8c7955eb8ecf81f2626eb/xxhash-3.5.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3171f693dbc2cef6477054a665dc255d996646b4023fe56cb4db80e26f4cc520", size = 30801 }, + { url = "https://files.pythonhosted.org/packages/20/ee/b8a99ebbc6d1113b3a3f09e747fa318c3cde5b04bd9c197688fadf0eeae8/xxhash-3.5.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7c5d3e570ef46adaf93fc81b44aca6002b5a4d8ca11bd0580c07eac537f36680", size = 220927 }, + { url = "https://files.pythonhosted.org/packages/58/62/15d10582ef159283a5c2b47f6d799fc3303fe3911d5bb0bcc820e1ef7ff4/xxhash-3.5.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7cb29a034301e2982df8b1fe6328a84f4b676106a13e9135a0d7e0c3e9f806da", size = 200360 }, + { url = "https://files.pythonhosted.org/packages/23/41/61202663ea9b1bd8e53673b8ec9e2619989353dba8cfb68e59a9cbd9ffe3/xxhash-3.5.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5d0d307d27099bb0cbeea7260eb39ed4fdb99c5542e21e94bb6fd29e49c57a23", size = 428528 }, + { url = "https://files.pythonhosted.org/packages/f2/07/d9a3059f702dec5b3b703737afb6dda32f304f6e9da181a229dafd052c29/xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c0342aafd421795d740e514bc9858ebddfc705a75a8c5046ac56d85fe97bf196", size = 194149 }, + { url = "https://files.pythonhosted.org/packages/eb/58/27caadf78226ecf1d62dbd0c01d152ed381c14c1ee4ad01f0d460fc40eac/xxhash-3.5.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3dbbd9892c5ebffeca1ed620cf0ade13eb55a0d8c84e0751a6653adc6ac40d0c", size = 207703 }, + { url = "https://files.pythonhosted.org/packages/b1/08/32d558ce23e1e068453c39aed7b3c1cdc690c177873ec0ca3a90d5808765/xxhash-3.5.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:4cc2d67fdb4d057730c75a64c5923abfa17775ae234a71b0200346bfb0a7f482", size = 216255 }, + { url = "https://files.pythonhosted.org/packages/3f/d4/2b971e2d2b0a61045f842b622ef11e94096cf1f12cd448b6fd426e80e0e2/xxhash-3.5.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:ec28adb204b759306a3d64358a5e5c07d7b1dd0ccbce04aa76cb9377b7b70296", size = 202744 }, + { url = 
"https://files.pythonhosted.org/packages/19/ae/6a6438864a8c4c39915d7b65effd85392ebe22710412902487e51769146d/xxhash-3.5.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:1328f6d8cca2b86acb14104e381225a3d7b42c92c4b86ceae814e5c400dbb415", size = 210115 }, + { url = "https://files.pythonhosted.org/packages/48/7d/b3c27c27d1fc868094d02fe4498ccce8cec9fcc591825c01d6bcb0b4fc49/xxhash-3.5.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:8d47ebd9f5d9607fd039c1fbf4994e3b071ea23eff42f4ecef246ab2b7334198", size = 414247 }, + { url = "https://files.pythonhosted.org/packages/a1/05/918f9e7d2fbbd334b829997045d341d6239b563c44e683b9a7ef8fe50f5d/xxhash-3.5.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:b96d559e0fcddd3343c510a0fe2b127fbff16bf346dd76280b82292567523442", size = 191419 }, + { url = "https://files.pythonhosted.org/packages/08/29/dfe393805b2f86bfc47c290b275f0b7c189dc2f4e136fd4754f32eb18a8d/xxhash-3.5.0-cp310-cp310-win32.whl", hash = "sha256:61c722ed8d49ac9bc26c7071eeaa1f6ff24053d553146d5df031802deffd03da", size = 30114 }, + { url = "https://files.pythonhosted.org/packages/7b/d7/aa0b22c4ebb7c3ccb993d4c565132abc641cd11164f8952d89eb6a501909/xxhash-3.5.0-cp310-cp310-win_amd64.whl", hash = "sha256:9bed5144c6923cc902cd14bb8963f2d5e034def4486ab0bbe1f58f03f042f9a9", size = 30003 }, + { url = "https://files.pythonhosted.org/packages/69/12/f969b81541ee91b55f1ce469d7ab55079593c80d04fd01691b550e535000/xxhash-3.5.0-cp310-cp310-win_arm64.whl", hash = "sha256:893074d651cf25c1cc14e3bea4fceefd67f2921b1bb8e40fcfeba56820de80c6", size = 26773 }, + { url = "https://files.pythonhosted.org/packages/b8/c7/afed0f131fbda960ff15eee7f304fa0eeb2d58770fade99897984852ef23/xxhash-3.5.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:02c2e816896dc6f85922ced60097bcf6f008dedfc5073dcba32f9c8dd786f3c1", size = 31969 }, + { url = "https://files.pythonhosted.org/packages/8c/0c/7c3bc6d87e5235672fcc2fb42fd5ad79fe1033925f71bf549ee068c7d1ca/xxhash-3.5.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:6027dcd885e21581e46d3c7f682cfb2b870942feeed58a21c29583512c3f09f8", size = 30800 }, + { url = "https://files.pythonhosted.org/packages/04/9e/01067981d98069eec1c20201f8c145367698e9056f8bc295346e4ea32dd1/xxhash-3.5.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1308fa542bbdbf2fa85e9e66b1077eea3a88bef38ee8a06270b4298a7a62a166", size = 221566 }, + { url = "https://files.pythonhosted.org/packages/d4/09/d4996de4059c3ce5342b6e1e6a77c9d6c91acce31f6ed979891872dd162b/xxhash-3.5.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c28b2fdcee797e1c1961cd3bcd3d545cab22ad202c846235197935e1df2f8ef7", size = 201214 }, + { url = "https://files.pythonhosted.org/packages/62/f5/6d2dc9f8d55a7ce0f5e7bfef916e67536f01b85d32a9fbf137d4cadbee38/xxhash-3.5.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:924361811732ddad75ff23e90efd9ccfda4f664132feecb90895bade6a1b4623", size = 429433 }, + { url = "https://files.pythonhosted.org/packages/d9/72/9256303f10e41ab004799a4aa74b80b3c5977d6383ae4550548b24bd1971/xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:89997aa1c4b6a5b1e5b588979d1da048a3c6f15e55c11d117a56b75c84531f5a", size = 194822 }, + { url = "https://files.pythonhosted.org/packages/34/92/1a3a29acd08248a34b0e6a94f4e0ed9b8379a4ff471f1668e4dce7bdbaa8/xxhash-3.5.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:685c4f4e8c59837de103344eb1c8a3851f670309eb5c361f746805c5471b8c88", size = 208538 }, + { url = "https://files.pythonhosted.org/packages/53/ad/7fa1a109663366de42f724a1cdb8e796a260dbac45047bce153bc1e18abf/xxhash-3.5.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:dbd2ecfbfee70bc1a4acb7461fa6af7748ec2ab08ac0fa298f281c51518f982c", size = 216953 }, + { url = "https://files.pythonhosted.org/packages/35/02/137300e24203bf2b2a49b48ce898ecce6fd01789c0fcd9c686c0a002d129/xxhash-3.5.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:25b5a51dc3dfb20a10833c8eee25903fd2e14059e9afcd329c9da20609a307b2", size = 203594 }, + { url = "https://files.pythonhosted.org/packages/23/03/aeceb273933d7eee248c4322b98b8e971f06cc3880e5f7602c94e5578af5/xxhash-3.5.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:a8fb786fb754ef6ff8c120cb96629fb518f8eb5a61a16aac3a979a9dbd40a084", size = 210971 }, + { url = "https://files.pythonhosted.org/packages/e3/64/ed82ec09489474cbb35c716b189ddc1521d8b3de12b1b5ab41ce7f70253c/xxhash-3.5.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:a905ad00ad1e1c34fe4e9d7c1d949ab09c6fa90c919860c1534ff479f40fd12d", size = 415050 }, + { url = "https://files.pythonhosted.org/packages/71/43/6db4c02dcb488ad4e03bc86d70506c3d40a384ee73c9b5c93338eb1f3c23/xxhash-3.5.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:963be41bcd49f53af6d795f65c0da9b4cc518c0dd9c47145c98f61cb464f4839", size = 192216 }, + { url = "https://files.pythonhosted.org/packages/22/6d/db4abec29e7a567455344433d095fdb39c97db6955bb4a2c432e486b4d28/xxhash-3.5.0-cp311-cp311-win32.whl", hash = "sha256:109b436096d0a2dd039c355fa3414160ec4d843dfecc64a14077332a00aeb7da", size = 30120 }, + { url = "https://files.pythonhosted.org/packages/52/1c/fa3b61c0cf03e1da4767213672efe186b1dfa4fc901a4a694fb184a513d1/xxhash-3.5.0-cp311-cp311-win_amd64.whl", hash = "sha256:b702f806693201ad6c0a05ddbbe4c8f359626d0b3305f766077d51388a6bac58", size = 30003 }, + { url = "https://files.pythonhosted.org/packages/6b/8e/9e6fc572acf6e1cc7ccb01973c213f895cb8668a9d4c2b58a99350da14b7/xxhash-3.5.0-cp311-cp311-win_arm64.whl", hash = "sha256:c4dcb4120d0cc3cc448624147dba64e9021b278c63e34a38789b688fd0da9bf3", size = 26777 }, + { url = "https://files.pythonhosted.org/packages/07/0e/1bfce2502c57d7e2e787600b31c83535af83746885aa1a5f153d8c8059d6/xxhash-3.5.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:14470ace8bd3b5d51318782cd94e6f94431974f16cb3b8dc15d52f3b69df8e00", size = 31969 }, + { url = "https://files.pythonhosted.org/packages/3f/d6/8ca450d6fe5b71ce521b4e5db69622383d039e2b253e9b2f24f93265b52c/xxhash-3.5.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:59aa1203de1cb96dbeab595ded0ad0c0056bb2245ae11fac11c0ceea861382b9", size = 30787 }, + { url = "https://files.pythonhosted.org/packages/5b/84/de7c89bc6ef63d750159086a6ada6416cc4349eab23f76ab870407178b93/xxhash-3.5.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:08424f6648526076e28fae6ea2806c0a7d504b9ef05ae61d196d571e5c879c84", size = 220959 }, + { url = "https://files.pythonhosted.org/packages/fe/86/51258d3e8a8545ff26468c977101964c14d56a8a37f5835bc0082426c672/xxhash-3.5.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:61a1ff00674879725b194695e17f23d3248998b843eb5e933007ca743310f793", size = 200006 }, + { url = "https://files.pythonhosted.org/packages/02/0a/96973bd325412feccf23cf3680fd2246aebf4b789122f938d5557c54a6b2/xxhash-3.5.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:f2f2c61bee5844d41c3eb015ac652a0229e901074951ae48581d58bfb2ba01be", size = 428326 }, + { url = "https://files.pythonhosted.org/packages/11/a7/81dba5010f7e733de88af9555725146fc133be97ce36533867f4c7e75066/xxhash-3.5.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9d32a592cac88d18cc09a89172e1c32d7f2a6e516c3dfde1b9adb90ab5df54a6", size = 194380 }, + { url = "https://files.pythonhosted.org/packages/fb/7d/f29006ab398a173f4501c0e4977ba288f1c621d878ec217b4ff516810c04/xxhash-3.5.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:70dabf941dede727cca579e8c205e61121afc9b28516752fd65724be1355cc90", size = 207934 }, + { url = "https://files.pythonhosted.org/packages/8a/6e/6e88b8f24612510e73d4d70d9b0c7dff62a2e78451b9f0d042a5462c8d03/xxhash-3.5.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:e5d0ddaca65ecca9c10dcf01730165fd858533d0be84c75c327487c37a906a27", size = 216301 }, + { url = "https://files.pythonhosted.org/packages/af/51/7862f4fa4b75a25c3b4163c8a873f070532fe5f2d3f9b3fc869c8337a398/xxhash-3.5.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:3e5b5e16c5a480fe5f59f56c30abdeba09ffd75da8d13f6b9b6fd224d0b4d0a2", size = 203351 }, + { url = "https://files.pythonhosted.org/packages/22/61/8d6a40f288f791cf79ed5bb113159abf0c81d6efb86e734334f698eb4c59/xxhash-3.5.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:149b7914451eb154b3dfaa721315117ea1dac2cc55a01bfbd4df7c68c5dd683d", size = 210294 }, + { url = "https://files.pythonhosted.org/packages/17/02/215c4698955762d45a8158117190261b2dbefe9ae7e5b906768c09d8bc74/xxhash-3.5.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:eade977f5c96c677035ff39c56ac74d851b1cca7d607ab3d8f23c6b859379cab", size = 414674 }, + { url = "https://files.pythonhosted.org/packages/31/5c/b7a8db8a3237cff3d535261325d95de509f6a8ae439a5a7a4ffcff478189/xxhash-3.5.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:fa9f547bd98f5553d03160967866a71056a60960be00356a15ecc44efb40ba8e", size = 192022 }, + { url = "https://files.pythonhosted.org/packages/78/e3/dd76659b2811b3fd06892a8beb850e1996b63e9235af5a86ea348f053e9e/xxhash-3.5.0-cp312-cp312-win32.whl", hash = "sha256:f7b58d1fd3551b8c80a971199543379be1cee3d0d409e1f6d8b01c1a2eebf1f8", size = 30170 }, + { url = "https://files.pythonhosted.org/packages/d9/6b/1c443fe6cfeb4ad1dcf231cdec96eb94fb43d6498b4469ed8b51f8b59a37/xxhash-3.5.0-cp312-cp312-win_amd64.whl", hash = "sha256:fa0cafd3a2af231b4e113fba24a65d7922af91aeb23774a8b78228e6cd785e3e", size = 30040 }, + { url = "https://files.pythonhosted.org/packages/0f/eb/04405305f290173acc0350eba6d2f1a794b57925df0398861a20fbafa415/xxhash-3.5.0-cp312-cp312-win_arm64.whl", hash = "sha256:586886c7e89cb9828bcd8a5686b12e161368e0064d040e225e72607b43858ba2", size = 26796 }, + { url = "https://files.pythonhosted.org/packages/c9/b8/e4b3ad92d249be5c83fa72916c9091b0965cb0faeff05d9a0a3870ae6bff/xxhash-3.5.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:37889a0d13b0b7d739cfc128b1c902f04e32de17b33d74b637ad42f1c55101f6", size = 31795 }, + { url = "https://files.pythonhosted.org/packages/fc/d8/b3627a0aebfbfa4c12a41e22af3742cf08c8ea84f5cc3367b5de2d039cce/xxhash-3.5.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:97a662338797c660178e682f3bc180277b9569a59abfb5925e8620fba00b9fc5", size = 30792 }, + { url = "https://files.pythonhosted.org/packages/c3/cc/762312960691da989c7cd0545cb120ba2a4148741c6ba458aa723c00a3f8/xxhash-3.5.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:7f85e0108d51092bdda90672476c7d909c04ada6923c14ff9d913c4f7dc8a3bc", size = 220950 }, + { url = "https://files.pythonhosted.org/packages/fe/e9/cc266f1042c3c13750e86a535496b58beb12bf8c50a915c336136f6168dc/xxhash-3.5.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:cd2fd827b0ba763ac919440042302315c564fdb797294d86e8cdd4578e3bc7f3", size = 199980 }, + { url = "https://files.pythonhosted.org/packages/bf/85/a836cd0dc5cc20376de26b346858d0ac9656f8f730998ca4324921a010b9/xxhash-3.5.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:82085c2abec437abebf457c1d12fccb30cc8b3774a0814872511f0f0562c768c", size = 428324 }, + { url = "https://files.pythonhosted.org/packages/b4/0e/15c243775342ce840b9ba34aceace06a1148fa1630cd8ca269e3223987f5/xxhash-3.5.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:07fda5de378626e502b42b311b049848c2ef38784d0d67b6f30bb5008642f8eb", size = 194370 }, + { url = "https://files.pythonhosted.org/packages/87/a1/b028bb02636dfdc190da01951d0703b3d904301ed0ef6094d948983bef0e/xxhash-3.5.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c279f0d2b34ef15f922b77966640ade58b4ccdfef1c4d94b20f2a364617a493f", size = 207911 }, + { url = "https://files.pythonhosted.org/packages/80/d5/73c73b03fc0ac73dacf069fdf6036c9abad82de0a47549e9912c955ab449/xxhash-3.5.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:89e66ceed67b213dec5a773e2f7a9e8c58f64daeb38c7859d8815d2c89f39ad7", size = 216352 }, + { url = "https://files.pythonhosted.org/packages/b6/2a/5043dba5ddbe35b4fe6ea0a111280ad9c3d4ba477dd0f2d1fe1129bda9d0/xxhash-3.5.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:bcd51708a633410737111e998ceb3b45d3dbc98c0931f743d9bb0a209033a326", size = 203410 }, + { url = "https://files.pythonhosted.org/packages/a2/b2/9a8ded888b7b190aed75b484eb5c853ddd48aa2896e7b59bbfbce442f0a1/xxhash-3.5.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:3ff2c0a34eae7df88c868be53a8dd56fbdf592109e21d4bfa092a27b0bf4a7bf", size = 210322 }, + { url = "https://files.pythonhosted.org/packages/98/62/440083fafbc917bf3e4b67c2ade621920dd905517e85631c10aac955c1d2/xxhash-3.5.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:4e28503dccc7d32e0b9817aa0cbfc1f45f563b2c995b7a66c4c8a0d232e840c7", size = 414725 }, + { url = "https://files.pythonhosted.org/packages/75/db/009206f7076ad60a517e016bb0058381d96a007ce3f79fa91d3010f49cc2/xxhash-3.5.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:a6c50017518329ed65a9e4829154626f008916d36295b6a3ba336e2458824c8c", size = 192070 }, + { url = "https://files.pythonhosted.org/packages/1f/6d/c61e0668943a034abc3a569cdc5aeae37d686d9da7e39cf2ed621d533e36/xxhash-3.5.0-cp313-cp313-win32.whl", hash = "sha256:53a068fe70301ec30d868ece566ac90d873e3bb059cf83c32e76012c889b8637", size = 30172 }, + { url = "https://files.pythonhosted.org/packages/96/14/8416dce965f35e3d24722cdf79361ae154fa23e2ab730e5323aa98d7919e/xxhash-3.5.0-cp313-cp313-win_amd64.whl", hash = "sha256:80babcc30e7a1a484eab952d76a4f4673ff601f54d5142c26826502740e70b43", size = 30041 }, + { url = "https://files.pythonhosted.org/packages/27/ee/518b72faa2073f5aa8e3262408d284892cb79cf2754ba0c3a5870645ef73/xxhash-3.5.0-cp313-cp313-win_arm64.whl", hash = "sha256:4811336f1ce11cac89dcbd18f3a25c527c16311709a89313c3acaf771def2d4b", size = 26801 }, + { url = 
"https://files.pythonhosted.org/packages/d4/f6/531dd6858adf8877675270b9d6989b6dacfd1c2d7135b17584fc29866df3/xxhash-3.5.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:bfc8cdd7f33d57f0468b0614ae634cc38ab9202c6957a60e31d285a71ebe0301", size = 31971 }, + { url = "https://files.pythonhosted.org/packages/7c/a8/b2a42b6c9ae46e233f474f3d307c2e7bca8d9817650babeca048d2ad01d6/xxhash-3.5.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:e0c48b6300cd0b0106bf49169c3e0536408dfbeb1ccb53180068a18b03c662ab", size = 30801 }, + { url = "https://files.pythonhosted.org/packages/b4/92/9ac297e3487818f429bcf369c1c6a097edf5b56ed6fc1feff4c1882e87ef/xxhash-3.5.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fe1a92cfbaa0a1253e339ccec42dbe6db262615e52df591b68726ab10338003f", size = 220644 }, + { url = "https://files.pythonhosted.org/packages/86/48/c1426dd3c86fc4a52f983301867463472f6a9013fb32d15991e60c9919b6/xxhash-3.5.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:33513d6cc3ed3b559134fb307aae9bdd94d7e7c02907b37896a6c45ff9ce51bd", size = 200021 }, + { url = "https://files.pythonhosted.org/packages/f3/de/0ab8c79993765c94fc0d0c1a22b454483c58a0161e1b562f58b654f47660/xxhash-3.5.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:eefc37f6138f522e771ac6db71a6d4838ec7933939676f3753eafd7d3f4c40bc", size = 428217 }, + { url = "https://files.pythonhosted.org/packages/b4/b4/332647451ed7d2c021294b7c1e9c144dbb5586b1fb214ad4f5a404642835/xxhash-3.5.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a606c8070ada8aa2a88e181773fa1ef17ba65ce5dd168b9d08038e2a61b33754", size = 193868 }, + { url = "https://files.pythonhosted.org/packages/f4/1c/a42c0a6cac752f84f7b44a90d1a9fa9047cf70bdba5198a304fde7cc471f/xxhash-3.5.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:42eca420c8fa072cc1dd62597635d140e78e384a79bb4944f825fbef8bfeeef6", size = 207403 }, + { url = "https://files.pythonhosted.org/packages/c4/d7/04e1b0daae9dc9b02c73c1664cc8aa527498c3f66ccbc586eeb25bbe9f14/xxhash-3.5.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:604253b2143e13218ff1ef0b59ce67f18b8bd1c4205d2ffda22b09b426386898", size = 215978 }, + { url = "https://files.pythonhosted.org/packages/c4/f4/05e15e67505228fc19ee98a79e427b3a0b9695f5567cd66ced5d66389883/xxhash-3.5.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:6e93a5ad22f434d7876665444a97e713a8f60b5b1a3521e8df11b98309bff833", size = 202416 }, + { url = "https://files.pythonhosted.org/packages/94/fb/e9028d3645bba5412a09de13ee36df276a567e60bdb31d499dafa46d76ae/xxhash-3.5.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:7a46e1d6d2817ba8024de44c4fd79913a90e5f7265434cef97026215b7d30df6", size = 209853 }, + { url = "https://files.pythonhosted.org/packages/02/2c/18c6a622429368274739372d2f86c8125413ec169025c7d8ffb051784bba/xxhash-3.5.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:30eb2efe6503c379b7ab99c81ba4a779748e3830241f032ab46bd182bf5873af", size = 413926 }, + { url = "https://files.pythonhosted.org/packages/72/bb/5b55c391084a0321c3809632a018b9b657e59d5966289664f85a645942ac/xxhash-3.5.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:c8aa771ff2c13dd9cda8166d685d7333d389fae30a4d2bb39d63ab5775de8606", size = 191156 }, + { url = "https://files.pythonhosted.org/packages/86/2b/915049db13401792fec159f57e4f4a5ca7a9768e83ef71d6645b9d0cd749/xxhash-3.5.0-cp39-cp39-win32.whl", hash = 
"sha256:5ed9ebc46f24cf91034544b26b131241b699edbfc99ec5e7f8f3d02d6eb7fba4", size = 30122 }, + { url = "https://files.pythonhosted.org/packages/d5/87/382ef7b24917d7cf4c540ee30f29b283bc87ac5893d2f89b23ea3cdf7d77/xxhash-3.5.0-cp39-cp39-win_amd64.whl", hash = "sha256:220f3f896c6b8d0316f63f16c077d52c412619e475f9372333474ee15133a558", size = 30021 }, + { url = "https://files.pythonhosted.org/packages/e2/47/d06b24e2d9c3dcabccfd734d11b5bbebfdf59ceac2c61509d8205dd20ac6/xxhash-3.5.0-cp39-cp39-win_arm64.whl", hash = "sha256:a7b1d8315d9b5e9f89eb2933b73afae6ec9597a258d52190944437158b49d38e", size = 26780 }, + { url = "https://files.pythonhosted.org/packages/ab/9a/233606bada5bd6f50b2b72c45de3d9868ad551e83893d2ac86dc7bb8553a/xxhash-3.5.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:2014c5b3ff15e64feecb6b713af12093f75b7926049e26a580e94dcad3c73d8c", size = 29732 }, + { url = "https://files.pythonhosted.org/packages/0c/67/f75276ca39e2c6604e3bee6c84e9db8a56a4973fde9bf35989787cf6e8aa/xxhash-3.5.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fab81ef75003eda96239a23eda4e4543cedc22e34c373edcaf744e721a163986", size = 36214 }, + { url = "https://files.pythonhosted.org/packages/0f/f8/f6c61fd794229cc3848d144f73754a0c107854372d7261419dcbbd286299/xxhash-3.5.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4e2febf914ace002132aa09169cc572e0d8959d0f305f93d5828c4836f9bc5a6", size = 32020 }, + { url = "https://files.pythonhosted.org/packages/79/d3/c029c99801526f859e6b38d34ab87c08993bf3dcea34b11275775001638a/xxhash-3.5.0-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5d3a10609c51da2a1c0ea0293fc3968ca0a18bd73838455b5bca3069d7f8e32b", size = 40515 }, + { url = "https://files.pythonhosted.org/packages/62/e3/bef7b82c1997579c94de9ac5ea7626d01ae5858aa22bf4fcb38bf220cb3e/xxhash-3.5.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:5a74f23335b9689b66eb6dbe2a931a88fcd7a4c2cc4b1cb0edba8ce381c7a1da", size = 30064 }, + { url = "https://files.pythonhosted.org/packages/c2/56/30d3df421814947f9d782b20c9b7e5e957f3791cbd89874578011daafcbd/xxhash-3.5.0-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:531af8845aaadcadf951b7e0c1345c6b9c68a990eeb74ff9acd8501a0ad6a1c9", size = 29734 }, + { url = "https://files.pythonhosted.org/packages/82/dd/3c42a1f022ad0d82c852d3cb65493ebac03dcfa8c994465a5fb052b00e3c/xxhash-3.5.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7ce379bcaa9fcc00f19affa7773084dd09f5b59947b3fb47a1ceb0179f91aaa1", size = 36216 }, + { url = "https://files.pythonhosted.org/packages/b2/40/8f902ab3bebda228a9b4de69eba988280285a7f7f167b942bc20bb562df9/xxhash-3.5.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fd1b2281d01723f076df3c8188f43f2472248a6b63118b036e641243656b1b0f", size = 32042 }, + { url = "https://files.pythonhosted.org/packages/db/87/bd06beb8ccaa0e9e577c9b909a49cfa5c5cd2ca46034342d72dd9ce5bc56/xxhash-3.5.0-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9c770750cc80e8694492244bca7251385188bc5597b6a39d98a9f30e8da984e0", size = 40516 }, + { url = "https://files.pythonhosted.org/packages/bb/f8/505385e2fbd753ddcaafd5550eabe86f6232cbebabad3b2508d411b19153/xxhash-3.5.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:b150b8467852e1bd844387459aa6fbe11d7f38b56e901f9f3b3e6aba0d660240", size = 30108 }, +] + +[[package]] +name = "yarl" 
+version = "1.18.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "idna" }, + { name = "multidict" }, + { name = "propcache" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b7/9d/4b94a8e6d2b51b599516a5cb88e5bc99b4d8d4583e468057eaa29d5f0918/yarl-1.18.3.tar.gz", hash = "sha256:ac1801c45cbf77b6c99242eeff4fffb5e4e73a800b5c4ad4fc0be5def634d2e1", size = 181062 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d2/98/e005bc608765a8a5569f58e650961314873c8469c333616eb40bff19ae97/yarl-1.18.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:7df647e8edd71f000a5208fe6ff8c382a1de8edfbccdbbfe649d263de07d8c34", size = 141458 }, + { url = "https://files.pythonhosted.org/packages/df/5d/f8106b263b8ae8a866b46d9be869ac01f9b3fb7f2325f3ecb3df8003f796/yarl-1.18.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c69697d3adff5aa4f874b19c0e4ed65180ceed6318ec856ebc423aa5850d84f7", size = 94365 }, + { url = "https://files.pythonhosted.org/packages/56/3e/d8637ddb9ba69bf851f765a3ee288676f7cf64fb3be13760c18cbc9d10bd/yarl-1.18.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:602d98f2c2d929f8e697ed274fbadc09902c4025c5a9963bf4e9edfc3ab6f7ed", size = 92181 }, + { url = "https://files.pythonhosted.org/packages/76/f9/d616a5c2daae281171de10fba41e1c0e2d8207166fc3547252f7d469b4e1/yarl-1.18.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c654d5207c78e0bd6d749f6dae1dcbbfde3403ad3a4b11f3c5544d9906969dde", size = 315349 }, + { url = "https://files.pythonhosted.org/packages/bb/b4/3ea5e7b6f08f698b3769a06054783e434f6d59857181b5c4e145de83f59b/yarl-1.18.3-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5094d9206c64181d0f6e76ebd8fb2f8fe274950a63890ee9e0ebfd58bf9d787b", size = 330494 }, + { url = "https://files.pythonhosted.org/packages/55/f1/e0fc810554877b1b67420568afff51b967baed5b53bcc983ab164eebf9c9/yarl-1.18.3-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:35098b24e0327fc4ebdc8ffe336cee0a87a700c24ffed13161af80124b7dc8e5", size = 326927 }, + { url = "https://files.pythonhosted.org/packages/a9/42/b1753949b327b36f210899f2dd0a0947c0c74e42a32de3f8eb5c7d93edca/yarl-1.18.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3236da9272872443f81fedc389bace88408f64f89f75d1bdb2256069a8730ccc", size = 319703 }, + { url = "https://files.pythonhosted.org/packages/f0/6d/e87c62dc9635daefb064b56f5c97df55a2e9cc947a2b3afd4fd2f3b841c7/yarl-1.18.3-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e2c08cc9b16f4f4bc522771d96734c7901e7ebef70c6c5c35dd0f10845270bcd", size = 310246 }, + { url = "https://files.pythonhosted.org/packages/e3/ef/e2e8d1785cdcbd986f7622d7f0098205f3644546da7919c24b95790ec65a/yarl-1.18.3-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:80316a8bd5109320d38eef8833ccf5f89608c9107d02d2a7f985f98ed6876990", size = 319730 }, + { url = "https://files.pythonhosted.org/packages/fc/15/8723e22345bc160dfde68c4b3ae8b236e868f9963c74015f1bc8a614101c/yarl-1.18.3-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:c1e1cc06da1491e6734f0ea1e6294ce00792193c463350626571c287c9a704db", size = 321681 }, + { url = "https://files.pythonhosted.org/packages/86/09/bf764e974f1516efa0ae2801494a5951e959f1610dd41edbfc07e5e0f978/yarl-1.18.3-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:fea09ca13323376a2fdfb353a5fa2e59f90cd18d7ca4eaa1fd31f0a8b4f91e62", size = 324812 }, + { url = 
"https://files.pythonhosted.org/packages/f6/4c/20a0187e3b903c97d857cf0272d687c1b08b03438968ae8ffc50fe78b0d6/yarl-1.18.3-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:e3b9fd71836999aad54084906f8663dffcd2a7fb5cdafd6c37713b2e72be1760", size = 337011 }, + { url = "https://files.pythonhosted.org/packages/c9/71/6244599a6e1cc4c9f73254a627234e0dad3883ece40cc33dce6265977461/yarl-1.18.3-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:757e81cae69244257d125ff31663249b3013b5dc0a8520d73694aed497fb195b", size = 338132 }, + { url = "https://files.pythonhosted.org/packages/af/f5/e0c3efaf74566c4b4a41cb76d27097df424052a064216beccae8d303c90f/yarl-1.18.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:b1771de9944d875f1b98a745bc547e684b863abf8f8287da8466cf470ef52690", size = 331849 }, + { url = "https://files.pythonhosted.org/packages/8a/b8/3d16209c2014c2f98a8f658850a57b716efb97930aebf1ca0d9325933731/yarl-1.18.3-cp310-cp310-win32.whl", hash = "sha256:8874027a53e3aea659a6d62751800cf6e63314c160fd607489ba5c2edd753cf6", size = 84309 }, + { url = "https://files.pythonhosted.org/packages/fd/b7/2e9a5b18eb0fe24c3a0e8bae994e812ed9852ab4fd067c0107fadde0d5f0/yarl-1.18.3-cp310-cp310-win_amd64.whl", hash = "sha256:93b2e109287f93db79210f86deb6b9bbb81ac32fc97236b16f7433db7fc437d8", size = 90484 }, + { url = "https://files.pythonhosted.org/packages/40/93/282b5f4898d8e8efaf0790ba6d10e2245d2c9f30e199d1a85cae9356098c/yarl-1.18.3-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:8503ad47387b8ebd39cbbbdf0bf113e17330ffd339ba1144074da24c545f0069", size = 141555 }, + { url = "https://files.pythonhosted.org/packages/6d/9c/0a49af78df099c283ca3444560f10718fadb8a18dc8b3edf8c7bd9fd7d89/yarl-1.18.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:02ddb6756f8f4517a2d5e99d8b2f272488e18dd0bfbc802f31c16c6c20f22193", size = 94351 }, + { url = "https://files.pythonhosted.org/packages/5a/a1/205ab51e148fdcedad189ca8dd587794c6f119882437d04c33c01a75dece/yarl-1.18.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:67a283dd2882ac98cc6318384f565bffc751ab564605959df4752d42483ad889", size = 92286 }, + { url = "https://files.pythonhosted.org/packages/ed/fe/88b690b30f3f59275fb674f5f93ddd4a3ae796c2b62e5bb9ece8a4914b83/yarl-1.18.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d980e0325b6eddc81331d3f4551e2a333999fb176fd153e075c6d1c2530aa8a8", size = 340649 }, + { url = "https://files.pythonhosted.org/packages/07/eb/3b65499b568e01f36e847cebdc8d7ccb51fff716dbda1ae83c3cbb8ca1c9/yarl-1.18.3-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b643562c12680b01e17239be267bc306bbc6aac1f34f6444d1bded0c5ce438ca", size = 356623 }, + { url = "https://files.pythonhosted.org/packages/33/46/f559dc184280b745fc76ec6b1954de2c55595f0ec0a7614238b9ebf69618/yarl-1.18.3-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c017a3b6df3a1bd45b9fa49a0f54005e53fbcad16633870104b66fa1a30a29d8", size = 354007 }, + { url = "https://files.pythonhosted.org/packages/af/ba/1865d85212351ad160f19fb99808acf23aab9a0f8ff31c8c9f1b4d671fc9/yarl-1.18.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:75674776d96d7b851b6498f17824ba17849d790a44d282929c42dbb77d4f17ae", size = 344145 }, + { url = "https://files.pythonhosted.org/packages/94/cb/5c3e975d77755d7b3d5193e92056b19d83752ea2da7ab394e22260a7b824/yarl-1.18.3-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:ccaa3a4b521b780a7e771cc336a2dba389a0861592bbce09a476190bb0c8b4b3", size = 336133 }, + { url = "https://files.pythonhosted.org/packages/19/89/b77d3fd249ab52a5c40859815765d35c91425b6bb82e7427ab2f78f5ff55/yarl-1.18.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:2d06d3005e668744e11ed80812e61efd77d70bb7f03e33c1598c301eea20efbb", size = 347967 }, + { url = "https://files.pythonhosted.org/packages/35/bd/f6b7630ba2cc06c319c3235634c582a6ab014d52311e7d7c22f9518189b5/yarl-1.18.3-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:9d41beda9dc97ca9ab0b9888cb71f7539124bc05df02c0cff6e5acc5a19dcc6e", size = 346397 }, + { url = "https://files.pythonhosted.org/packages/18/1a/0b4e367d5a72d1f095318344848e93ea70da728118221f84f1bf6c1e39e7/yarl-1.18.3-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:ba23302c0c61a9999784e73809427c9dbedd79f66a13d84ad1b1943802eaaf59", size = 350206 }, + { url = "https://files.pythonhosted.org/packages/b5/cf/320fff4367341fb77809a2d8d7fe75b5d323a8e1b35710aafe41fdbf327b/yarl-1.18.3-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:6748dbf9bfa5ba1afcc7556b71cda0d7ce5f24768043a02a58846e4a443d808d", size = 362089 }, + { url = "https://files.pythonhosted.org/packages/57/cf/aadba261d8b920253204085268bad5e8cdd86b50162fcb1b10c10834885a/yarl-1.18.3-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:0b0cad37311123211dc91eadcb322ef4d4a66008d3e1bdc404808992260e1a0e", size = 366267 }, + { url = "https://files.pythonhosted.org/packages/54/58/fb4cadd81acdee6dafe14abeb258f876e4dd410518099ae9a35c88d8097c/yarl-1.18.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:0fb2171a4486bb075316ee754c6d8382ea6eb8b399d4ec62fde2b591f879778a", size = 359141 }, + { url = "https://files.pythonhosted.org/packages/9a/7a/4c571597589da4cd5c14ed2a0b17ac56ec9ee7ee615013f74653169e702d/yarl-1.18.3-cp311-cp311-win32.whl", hash = "sha256:61b1a825a13bef4a5f10b1885245377d3cd0bf87cba068e1d9a88c2ae36880e1", size = 84402 }, + { url = "https://files.pythonhosted.org/packages/ae/7b/8600250b3d89b625f1121d897062f629883c2f45339623b69b1747ec65fa/yarl-1.18.3-cp311-cp311-win_amd64.whl", hash = "sha256:b9d60031cf568c627d028239693fd718025719c02c9f55df0a53e587aab951b5", size = 91030 }, + { url = "https://files.pythonhosted.org/packages/33/85/bd2e2729752ff4c77338e0102914897512e92496375e079ce0150a6dc306/yarl-1.18.3-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:1dd4bdd05407ced96fed3d7f25dbbf88d2ffb045a0db60dbc247f5b3c5c25d50", size = 142644 }, + { url = "https://files.pythonhosted.org/packages/ff/74/1178322cc0f10288d7eefa6e4a85d8d2e28187ccab13d5b844e8b5d7c88d/yarl-1.18.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:7c33dd1931a95e5d9a772d0ac5e44cac8957eaf58e3c8da8c1414de7dd27c576", size = 94962 }, + { url = "https://files.pythonhosted.org/packages/be/75/79c6acc0261e2c2ae8a1c41cf12265e91628c8c58ae91f5ff59e29c0787f/yarl-1.18.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:25b411eddcfd56a2f0cd6a384e9f4f7aa3efee14b188de13048c25b5e91f1640", size = 92795 }, + { url = "https://files.pythonhosted.org/packages/6b/32/927b2d67a412c31199e83fefdce6e645247b4fb164aa1ecb35a0f9eb2058/yarl-1.18.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:436c4fc0a4d66b2badc6c5fc5ef4e47bb10e4fd9bf0c79524ac719a01f3607c2", size = 332368 }, + { url = "https://files.pythonhosted.org/packages/19/e5/859fca07169d6eceeaa4fde1997c91d8abde4e9a7c018e371640c2da2b71/yarl-1.18.3-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:e35ef8683211db69ffe129a25d5634319a677570ab6b2eba4afa860f54eeaf75", size = 342314 }, + { url = "https://files.pythonhosted.org/packages/08/75/76b63ccd91c9e03ab213ef27ae6add2e3400e77e5cdddf8ed2dbc36e3f21/yarl-1.18.3-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:84b2deecba4a3f1a398df819151eb72d29bfeb3b69abb145a00ddc8d30094512", size = 341987 }, + { url = "https://files.pythonhosted.org/packages/1a/e1/a097d5755d3ea8479a42856f51d97eeff7a3a7160593332d98f2709b3580/yarl-1.18.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:00e5a1fea0fd4f5bfa7440a47eff01d9822a65b4488f7cff83155a0f31a2ecba", size = 336914 }, + { url = "https://files.pythonhosted.org/packages/0b/42/e1b4d0e396b7987feceebe565286c27bc085bf07d61a59508cdaf2d45e63/yarl-1.18.3-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d0e883008013c0e4aef84dcfe2a0b172c4d23c2669412cf5b3371003941f72bb", size = 325765 }, + { url = "https://files.pythonhosted.org/packages/7e/18/03a5834ccc9177f97ca1bbb245b93c13e58e8225276f01eedc4cc98ab820/yarl-1.18.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:5a3f356548e34a70b0172d8890006c37be92995f62d95a07b4a42e90fba54272", size = 344444 }, + { url = "https://files.pythonhosted.org/packages/c8/03/a713633bdde0640b0472aa197b5b86e90fbc4c5bc05b727b714cd8a40e6d/yarl-1.18.3-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:ccd17349166b1bee6e529b4add61727d3f55edb7babbe4069b5764c9587a8cc6", size = 340760 }, + { url = "https://files.pythonhosted.org/packages/eb/99/f6567e3f3bbad8fd101886ea0276c68ecb86a2b58be0f64077396cd4b95e/yarl-1.18.3-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:b958ddd075ddba5b09bb0be8a6d9906d2ce933aee81100db289badbeb966f54e", size = 346484 }, + { url = "https://files.pythonhosted.org/packages/8e/a9/84717c896b2fc6cb15bd4eecd64e34a2f0a9fd6669e69170c73a8b46795a/yarl-1.18.3-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:c7d79f7d9aabd6011004e33b22bc13056a3e3fb54794d138af57f5ee9d9032cb", size = 359864 }, + { url = "https://files.pythonhosted.org/packages/1e/2e/d0f5f1bef7ee93ed17e739ec8dbcb47794af891f7d165fa6014517b48169/yarl-1.18.3-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:4891ed92157e5430874dad17b15eb1fda57627710756c27422200c52d8a4e393", size = 364537 }, + { url = "https://files.pythonhosted.org/packages/97/8a/568d07c5d4964da5b02621a517532adb8ec5ba181ad1687191fffeda0ab6/yarl-1.18.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:ce1af883b94304f493698b00d0f006d56aea98aeb49d75ec7d98cd4a777e9285", size = 357861 }, + { url = "https://files.pythonhosted.org/packages/7d/e3/924c3f64b6b3077889df9a1ece1ed8947e7b61b0a933f2ec93041990a677/yarl-1.18.3-cp312-cp312-win32.whl", hash = "sha256:f91c4803173928a25e1a55b943c81f55b8872f0018be83e3ad4938adffb77dd2", size = 84097 }, + { url = "https://files.pythonhosted.org/packages/34/45/0e055320daaabfc169b21ff6174567b2c910c45617b0d79c68d7ab349b02/yarl-1.18.3-cp312-cp312-win_amd64.whl", hash = "sha256:7e2ee16578af3b52ac2f334c3b1f92262f47e02cc6193c598502bd46f5cd1477", size = 90399 }, + { url = "https://files.pythonhosted.org/packages/30/c7/c790513d5328a8390be8f47be5d52e141f78b66c6c48f48d241ca6bd5265/yarl-1.18.3-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:90adb47ad432332d4f0bc28f83a5963f426ce9a1a8809f5e584e704b82685dcb", size = 140789 }, + { url = "https://files.pythonhosted.org/packages/30/aa/a2f84e93554a578463e2edaaf2300faa61c8701f0898725842c704ba5444/yarl-1.18.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = 
"sha256:913829534200eb0f789d45349e55203a091f45c37a2674678744ae52fae23efa", size = 94144 }, + { url = "https://files.pythonhosted.org/packages/c6/fc/d68d8f83714b221a85ce7866832cba36d7c04a68fa6a960b908c2c84f325/yarl-1.18.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:ef9f7768395923c3039055c14334ba4d926f3baf7b776c923c93d80195624782", size = 91974 }, + { url = "https://files.pythonhosted.org/packages/56/4e/d2563d8323a7e9a414b5b25341b3942af5902a2263d36d20fb17c40411e2/yarl-1.18.3-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:88a19f62ff30117e706ebc9090b8ecc79aeb77d0b1f5ec10d2d27a12bc9f66d0", size = 333587 }, + { url = "https://files.pythonhosted.org/packages/25/c9/cfec0bc0cac8d054be223e9f2c7909d3e8442a856af9dbce7e3442a8ec8d/yarl-1.18.3-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e17c9361d46a4d5addf777c6dd5eab0715a7684c2f11b88c67ac37edfba6c482", size = 344386 }, + { url = "https://files.pythonhosted.org/packages/ab/5d/4c532190113b25f1364d25f4c319322e86232d69175b91f27e3ebc2caf9a/yarl-1.18.3-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1a74a13a4c857a84a845505fd2d68e54826a2cd01935a96efb1e9d86c728e186", size = 345421 }, + { url = "https://files.pythonhosted.org/packages/23/d1/6cdd1632da013aa6ba18cee4d750d953104a5e7aac44e249d9410a972bf5/yarl-1.18.3-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:41f7ce59d6ee7741af71d82020346af364949314ed3d87553763a2df1829cc58", size = 339384 }, + { url = "https://files.pythonhosted.org/packages/9a/c4/6b3c39bec352e441bd30f432cda6ba51681ab19bb8abe023f0d19777aad1/yarl-1.18.3-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f52a265001d830bc425f82ca9eabda94a64a4d753b07d623a9f2863fde532b53", size = 326689 }, + { url = "https://files.pythonhosted.org/packages/23/30/07fb088f2eefdc0aa4fc1af4e3ca4eb1a3aadd1ce7d866d74c0f124e6a85/yarl-1.18.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:82123d0c954dc58db301f5021a01854a85bf1f3bb7d12ae0c01afc414a882ca2", size = 345453 }, + { url = "https://files.pythonhosted.org/packages/63/09/d54befb48f9cd8eec43797f624ec37783a0266855f4930a91e3d5c7717f8/yarl-1.18.3-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:2ec9bbba33b2d00999af4631a3397d1fd78290c48e2a3e52d8dd72db3a067ac8", size = 341872 }, + { url = "https://files.pythonhosted.org/packages/91/26/fd0ef9bf29dd906a84b59f0cd1281e65b0c3e08c6aa94b57f7d11f593518/yarl-1.18.3-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:fbd6748e8ab9b41171bb95c6142faf068f5ef1511935a0aa07025438dd9a9bc1", size = 347497 }, + { url = "https://files.pythonhosted.org/packages/d9/b5/14ac7a256d0511b2ac168d50d4b7d744aea1c1aa20c79f620d1059aab8b2/yarl-1.18.3-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:877d209b6aebeb5b16c42cbb377f5f94d9e556626b1bfff66d7b0d115be88d0a", size = 359981 }, + { url = "https://files.pythonhosted.org/packages/ca/b3/d493221ad5cbd18bc07e642894030437e405e1413c4236dd5db6e46bcec9/yarl-1.18.3-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:b464c4ab4bfcb41e3bfd3f1c26600d038376c2de3297760dfe064d2cb7ea8e10", size = 366229 }, + { url = "https://files.pythonhosted.org/packages/04/56/6a3e2a5d9152c56c346df9b8fb8edd2c8888b1e03f96324d457e5cf06d34/yarl-1.18.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:8d39d351e7faf01483cc7ff7c0213c412e38e5a340238826be7e0e4da450fdc8", size = 360383 }, + { url = 
"https://files.pythonhosted.org/packages/fd/b7/4b3c7c7913a278d445cc6284e59b2e62fa25e72758f888b7a7a39eb8423f/yarl-1.18.3-cp313-cp313-win32.whl", hash = "sha256:61ee62ead9b68b9123ec24bc866cbef297dd266175d53296e2db5e7f797f902d", size = 310152 }, + { url = "https://files.pythonhosted.org/packages/f5/d5/688db678e987c3e0fb17867970700b92603cadf36c56e5fb08f23e822a0c/yarl-1.18.3-cp313-cp313-win_amd64.whl", hash = "sha256:578e281c393af575879990861823ef19d66e2b1d0098414855dd367e234f5b3c", size = 315723 }, + { url = "https://files.pythonhosted.org/packages/6a/3b/fec4b08f5e88f68e56ee698a59284a73704df2e0e0b5bdf6536c86e76c76/yarl-1.18.3-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:61e5e68cb65ac8f547f6b5ef933f510134a6bf31bb178be428994b0cb46c2a04", size = 142780 }, + { url = "https://files.pythonhosted.org/packages/ed/85/796b0d6a22d536ec8e14bdbb86519250bad980cec450b6e299b1c2a9079e/yarl-1.18.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:fe57328fbc1bfd0bd0514470ac692630f3901c0ee39052ae47acd1d90a436719", size = 94981 }, + { url = "https://files.pythonhosted.org/packages/ee/0e/a830fd2238f7a29050f6dd0de748b3d6f33a7dbb67dbbc081a970b2bbbeb/yarl-1.18.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a440a2a624683108a1b454705ecd7afc1c3438a08e890a1513d468671d90a04e", size = 92789 }, + { url = "https://files.pythonhosted.org/packages/0f/4f/438c9fd668954779e48f08c0688ee25e0673380a21bb1e8ccc56de5b55d7/yarl-1.18.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:09c7907c8548bcd6ab860e5f513e727c53b4a714f459b084f6580b49fa1b9cee", size = 317327 }, + { url = "https://files.pythonhosted.org/packages/bd/79/a78066f06179b4ed4581186c136c12fcfb928c475cbeb23743e71a991935/yarl-1.18.3-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b4f6450109834af88cb4cc5ecddfc5380ebb9c228695afc11915a0bf82116789", size = 336999 }, + { url = "https://files.pythonhosted.org/packages/55/02/527963cf65f34a06aed1e766ff9a3b3e7d0eaa1c90736b2948a62e528e1d/yarl-1.18.3-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a9ca04806f3be0ac6d558fffc2fdf8fcef767e0489d2684a21912cc4ed0cd1b8", size = 331693 }, + { url = "https://files.pythonhosted.org/packages/a2/2a/167447ae39252ba624b98b8c13c0ba35994d40d9110e8a724c83dbbb5822/yarl-1.18.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:77a6e85b90a7641d2e07184df5557132a337f136250caafc9ccaa4a2a998ca2c", size = 321473 }, + { url = "https://files.pythonhosted.org/packages/55/03/07955fabb20082373be311c91fd78abe458bc7ff9069d34385e8bddad20e/yarl-1.18.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6333c5a377c8e2f5fae35e7b8f145c617b02c939d04110c76f29ee3676b5f9a5", size = 313571 }, + { url = "https://files.pythonhosted.org/packages/95/e2/67c8d3ec58a8cd8ddb1d63bd06eb7e7b91c9f148707a3eeb5a7ed87df0ef/yarl-1.18.3-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:0b3c92fa08759dbf12b3a59579a4096ba9af8dd344d9a813fc7f5070d86bbab1", size = 325004 }, + { url = "https://files.pythonhosted.org/packages/06/43/51ceb3e427368fe6ccd9eccd162be227fd082523e02bad1fd3063daf68da/yarl-1.18.3-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:4ac515b860c36becb81bb84b667466885096b5fc85596948548b667da3bf9f24", size = 322677 }, + { url = "https://files.pythonhosted.org/packages/e4/0e/7ef286bfb23267739a703f7b967a858e2128c10bea898de8fa027e962521/yarl-1.18.3-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:045b8482ce9483ada4f3f23b3774f4e1bf4f23a2d5c912ed5170f68efb053318", size = 
332806 }, + { url = "https://files.pythonhosted.org/packages/c8/94/2d1f060f4bfa47c8bd0bcb652bfe71fba881564bcac06ebb6d8ced9ac3bc/yarl-1.18.3-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:a4bb030cf46a434ec0225bddbebd4b89e6471814ca851abb8696170adb163985", size = 339919 }, + { url = "https://files.pythonhosted.org/packages/8e/8d/73b5f9a6ab69acddf1ca1d5e7bc92f50b69124512e6c26b36844531d7f23/yarl-1.18.3-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:54d6921f07555713b9300bee9c50fb46e57e2e639027089b1d795ecd9f7fa910", size = 340960 }, + { url = "https://files.pythonhosted.org/packages/41/13/ce6bc32be4476b60f4f8694831f49590884b2c975afcffc8d533bf2be7ec/yarl-1.18.3-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:1d407181cfa6e70077df3377938c08012d18893f9f20e92f7d2f314a437c30b1", size = 336592 }, + { url = "https://files.pythonhosted.org/packages/81/d5/6e0460292d6299ac3919945f912b16b104f4e81ab20bf53e0872a1296daf/yarl-1.18.3-cp39-cp39-win32.whl", hash = "sha256:ac36703a585e0929b032fbaab0707b75dc12703766d0b53486eabd5139ebadd5", size = 84833 }, + { url = "https://files.pythonhosted.org/packages/b2/fc/a8aef69156ad5508165d8ae956736d55c3a68890610834bd985540966008/yarl-1.18.3-cp39-cp39-win_amd64.whl", hash = "sha256:ba87babd629f8af77f557b61e49e7c7cac36f22f871156b91e10a6e9d4f829e9", size = 90968 }, + { url = "https://files.pythonhosted.org/packages/f5/4b/a06e0ec3d155924f77835ed2d167ebd3b211a7b0853da1cf8d8414d784ef/yarl-1.18.3-py3-none-any.whl", hash = "sha256:b57f4f58099328dfb26c6a771d09fb20dbbae81d20cfb66141251ea063bd101b", size = 45109 }, +] + +[[package]] +name = "zipp" +version = "3.21.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/3f/50/bad581df71744867e9468ebd0bcd6505de3b275e06f202c2cb016e3ff56f/zipp-3.21.0.tar.gz", hash = "sha256:2c9958f6430a2040341a52eb608ed6dd93ef4392e02ffe219417c1b28b5dd1f4", size = 24545 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b7/1a/7e4798e9339adc931158c9d69ecc34f5e6791489d469f5e50ec15e35f458/zipp-3.21.0-py3-none-any.whl", hash = "sha256:ac1bbe05fd2991f160ebce24ffbac5f6d11d83dc90891255885223d42b3cd931", size = 9630 }, +] From c20025dbf7708b953a4c39600a8569fe9d8f023b Mon Sep 17 00:00:00 2001 From: Mohit Sharma Date: Fri, 17 Jan 2025 18:43:29 +0530 Subject: [PATCH 035/183] Add fp8 kv cache for ROCm (#2856) * add fp8 kv cache for rocm * improvements * update log statement * remove bookkeeping field --- .../layers/attention/kv_cache.py | 57 ++++++++++++------- .../layers/attention/rocm.py | 39 ++++++++----- 2 files changed, 62 insertions(+), 34 deletions(-) diff --git a/server/text_generation_server/layers/attention/kv_cache.py b/server/text_generation_server/layers/attention/kv_cache.py index 003086018..77e761a3c 100644 --- a/server/text_generation_server/layers/attention/kv_cache.py +++ b/server/text_generation_server/layers/attention/kv_cache.py @@ -52,13 +52,18 @@ class KVCache: device: torch.device, ): """Construct the key-value cache for a layer.""" - - if dtype in {torch.float8_e5m2, torch.float8_e4m3fn} and ( - ATTENTION != "flashinfer" or SYSTEM != "cuda" - ): - raise ValueError( - "FP8 KV cache is currently only supported for flashinfer on CUDA" - ) + if dtype in {torch.float8_e5m2, torch.float8_e4m3fn}: + if not ( + (ATTENTION == "flashinfer" and SYSTEM == "cuda") + or (ATTENTION == "paged" and SYSTEM == "rocm") + ): + raise ValueError( + "FP8 KV cache is currently only supported for flashinfer on CUDA and paged attention on ROCm. 
" + ) + if SYSTEM == "rocm" and dtype == torch.float8_e5m2: + raise ValueError( + "float8_e5m2 FP8 KV cache is not supported on AMD ROCm" + ) element_size = torch.tensor([], dtype=dtype).element_size() if SYSTEM == "ipex" and device.type == "xpu": @@ -113,21 +118,17 @@ class KVCache: """Check if the cache can be scaled by the given scales.""" if kv_scales.key_scale_cpu == 1.0 and kv_scales.value_scale_cpu == 1.0: return False - elif ( - self.dtype == torch.float8_e4m3fn - and ATTENTION == "flashinfer" - and SYSTEM == "cuda" + elif self.dtype == torch.float8_e4m3fn and ( + (ATTENTION == "flashinfer" and SYSTEM == "cuda") + or (ATTENTION == "paged" and SYSTEM == "rocm") ): - log_once( - logger.info, - "Using FP8 KV cache scales", - ) + log_once(logger.info, "Using FP8 KV cache scales") return True else: # We have scales, but not the correct FP8 cache type, so warn once. log_once( logger.info, - "Ignoring FP8 KV cache scales, only float8_e4m3fn KV cache on flashinfer is supported", + "Ignoring FP8 KV cache scales, supported only for float8_e4m3fn KV cache with flashinfer on CUDA and paged attention on ROCm", ) return False @@ -161,7 +162,7 @@ class KVCache: key_cache = self.kv_cache[0] value_cache = self.kv_cache[1] - if self.can_scale(kv_scales): + if self.can_scale(kv_scales) and SYSTEM == "cuda": if kv_scales.key_scale_cpu != 1.0: key = fp8_quantize( key.float(), @@ -197,7 +198,15 @@ class KVCache: key, value, key_cache, value_cache, slots ) else: - paged_reshape_and_cache(key, value, key_cache, value_cache, slots) + paged_reshape_and_cache( + key, + value, + key_cache, + value_cache, + slots, + kv_scales.key_scale_cpu, + kv_scales.value_scale_cpu, + ) def paged_reshape_and_cache( @@ -206,7 +215,10 @@ def paged_reshape_and_cache( key_cache: torch.Tensor, value_cache: torch.Tensor, slots: torch.Tensor, + k_scale: float = 1.0, + v_scale: float = 1.0, ): + if SYSTEM == "cuda": try: import attention_kernels @@ -224,8 +236,15 @@ def paged_reshape_and_cache( raise ImportError( f"Could not import vllm paged attention. Make sure your installation is correct. Complete error: {e}" ) + + kv_cache_dtype = "auto" + if key_cache.dtype == torch.float8_e4m3fn: + key_cache = key_cache.view(torch.uint8) + value_cache = value_cache.view(torch.uint8) + kv_cache_dtype = "fp8" + ops.reshape_and_cache( - key, value, key_cache, value_cache, slots, "auto", 1.0, 1.0 + key, value, key_cache, value_cache, slots, kv_cache_dtype, k_scale, v_scale ) elif SYSTEM == "ipex": import intel_extension_for_pytorch as ipex diff --git a/server/text_generation_server/layers/attention/rocm.py b/server/text_generation_server/layers/attention/rocm.py index b94b737dc..65f3ea414 100644 --- a/server/text_generation_server/layers/attention/rocm.py +++ b/server/text_generation_server/layers/attention/rocm.py @@ -133,6 +133,15 @@ def paged_attention( out = torch.empty_like(query) + if kv_cache.dtype == torch.float8_e4m3fn: + key = kv_cache.key.view(torch.uint8) + value = kv_cache.value.view(torch.uint8) + kv_cache_dtype = "fp8" + else: + key = kv_cache.key + value = kv_cache.value + kv_cache_dtype = "auto" + # NOTE(woosuk): We use a simple heuristic to decide whether to use # PagedAttention V1 or V2. If the number of partitions is 1, we use # V1 to avoid the overhead of reduction. 
Also, if the number of @@ -147,8 +156,8 @@ def paged_attention( ops.paged_attention_v1( out, query, - kv_cache.key, - kv_cache.value, + key, + value, num_kv_heads, softmax_scale, block_tables, @@ -156,9 +165,9 @@ def paged_attention( block_size, max_s, None, - "auto", - 1.0, - 1.0, + kv_cache_dtype, + kv_scales.key_scale_cpu, + kv_scales.value_scale_cpu, ) else: # Run PagedAttention V2. @@ -182,8 +191,8 @@ def paged_attention( max_logits, tmp_output, query, - kv_cache.key, - kv_cache.value, + key, + value, num_kv_heads, softmax_scale, block_tables, @@ -191,9 +200,9 @@ def paged_attention( block_size, max_s, None, - "auto", - 1.0, - 1.0, + kv_cache_dtype, + kv_scales.key_scale_cpu, + kv_scales.value_scale_cpu, ) else: ops.paged_attention_rocm( @@ -202,8 +211,8 @@ def paged_attention( max_logits, tmp_output, query, - kv_cache.key, - kv_cache.value, + key, + value, num_kv_heads, softmax_scale, block_tables, @@ -211,9 +220,9 @@ def paged_attention( block_size, max_s, None, - "auto", - 1.0, - 1.0, + kv_cache_dtype, + kv_scales.key_scale_cpu, + kv_scales.value_scale_cpu, None, _PARTITION_SIZE, ) From 6e982f43a1056ed8a04bdbe9a8345990bce0fb13 Mon Sep 17 00:00:00 2001 From: "Wang, Yi" Date: Fri, 17 Jan 2025 22:50:58 +0800 Subject: [PATCH 036/183] fix the crash of meta-llama/Llama-3.2-1B (#2918) * fix the crash of meta-llama/Llama-3.2-1B Signed-off-by: Wang, Yi A * Apply suggestions from code review Simpler fix (which doesn't break vlms). --------- Signed-off-by: Wang, Yi A Co-authored-by: Nicolas Patry --- .../models/custom_modeling/flash_llama_modeling.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py b/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py index 28db42fea..54ec7ac29 100644 --- a/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py @@ -642,9 +642,7 @@ class FlashLlamaForCausalLM(torch.nn.Module): embedding_multiplier = getattr(config, "embedding_multiplier", None) if embedding_multiplier is not None: self.embed_tokens.weight.data *= embedding_multiplier - - prefix = "lm_head" if not prefix or name != "model" else f"{prefix}.{suffix}" - + prefix = suffix if not prefix or name != "model" else f"{prefix}.{suffix}" with no_fp8(weights): self.lm_head = SpeculativeHead.load( config, From eecca27113538716ff14b05738eb6e8c5f7afd8a Mon Sep 17 00:00:00 2001 From: drbh Date: Fri, 17 Jan 2025 11:50:41 -0500 Subject: [PATCH 037/183] feat: improve qwen2-vl startup (#2802) * feat: tokenize each request individually and increase warmup image size * feat: adjust rotary embed and avoid cuda graphs of size 2 and smaller * fix: address image resize and rebase changes * feat: update to run qwen2-vl tests * fix: tweak param types --- backends/client/src/lib.rs | 2 +- backends/v2/src/client/mod.rs | 2 +- backends/v3/src/client/mod.rs | 2 +- .../test_flash_qwen2_vl_simple.json | 26 +++ .../models/test_flash_qwen2_vl.py | 161 +++++++++--------- .../models/test_flash_qwen2_vl_warmup.py | 38 +++++ .../text_generation_server/models/__init__.py | 6 + .../custom_modeling/flash_qwen2_modeling.py | 7 +- .../models/custom_modeling/qwen2_vl.py | 5 +- .../models/flash_causal_lm.py | 18 +- .../models/vlm_causal_lm.py | 1 - 11 files changed, 173 insertions(+), 95 deletions(-) create mode 100644 integration-tests/models/__snapshots__/test_flash_qwen2_vl_warmup/test_flash_qwen2_vl_simple.json 
create mode 100644 integration-tests/models/test_flash_qwen2_vl_warmup.py diff --git a/backends/client/src/lib.rs b/backends/client/src/lib.rs index 45bee10ca..fbe2e7e66 100644 --- a/backends/client/src/lib.rs +++ b/backends/client/src/lib.rs @@ -86,6 +86,6 @@ impl ChunksToString for Vec { } } -static WARMUP_IMAGE_BASE64 :&str = "iVBORw0KGgoAAAANSUhEUgAAABQAAAAUCAIAAAAC64paAAABg2lDQ1BJQ0MgcHJvZmlsZQAAKJF9kT1Iw0AcxV/TSotUROxQxCFDdbKLijjWKhShQqgVWnUwufQLmrQkKS6OgmvBwY/FqoOLs64OroIg+AHi7OCk6CIl/i8ptIjx4Lgf7+497t4BQqvKNDOQADTdMjKppJjLr4rBVwQQwhAERGVm1uckKQ3P8XUPH1/v4jzL+9yfY0AtmAzwicQJVjcs4g3imU2rznmfOMLKskp8Tjxh0AWJH7muuPzGueSwwDMjRjYzTxwhFks9rPQwKxsa8TRxTNV0yhdyLquctzhr1Qbr3JO/MFzQV5a5TnMUKSxiCRJEKGiggiosxGnVSTGRof2kh3/E8UvkUshVASPHAmrQIDt+8D/43a1ZnJp0k8JJoO/Ftj/GgOAu0G7a9vexbbdPAP8zcKV3/bUWMPtJerOrxY6AwW3g4rqrKXvA5Q4QfarLhuxIfppCsQi8n9E35YHhW6B/ze2ts4/TByBLXaVvgINDYLxE2ese7w719vbvmU5/PycecohsjayNAAAACXBIWXMAAC4jAAAuIwF4pT92AAAAB3RJTUUH6AQIEQMnlTSSjwAAABl0RVh0Q29tbWVudABDcmVhdGVkIHdpdGggR0lNUFeBDhcAAAASSURBVDjLY2AYBaNgFIyCoQsABMQAAeRw1DoAAAAASUVORK5CYII="; +static WARMUP_IMAGE_BASE64: &str = "iVBORw0KGgoAAAANSUhEUgAAABQAAAAUCAIAAAAC64paAAABg2lDQ1BJQ0MgcHJvZmlsZQAAKJF9kT1Iw0AcxV/TSotUROxQxCFDdbKLijjWKhShQqgVWnUwufQLmrQkKS6OgmvBwY/FqoOLs64OroIg+AHi7OCk6CIl/i8ptIjx4Lgf7+497t4BQqvKNDOQADTdMjKppJjLr4rBVwQQwhAERGVm1uckKQ3P8XUPH1/v4jzL+9yfY0AtmAzwicQJVjcs4g3imU2rznmfOMLKskp8Tjxh0AWJH7muuPzGueSwwDMjRjYzTxwhFks9rPQwKxsa8TRxTNV0yhdyLquctzhr1Qbr3JO/MFzQV5a5TnMUKSxiCRJEKGiggiosxGnVSTGRof2kh3/E8UvkUshVASPHAmrQIDt+8D/43a1ZnJp0k8JJoO/Ftj/GgOAu0G7a9vexbbdPAP8zcKV3/bUWMPtJerOrxY6AwW3g4rqrKXvA5Q4QfarLhuxIfppCsQi8n9E35YHhW6B/ze2ts4/TByBLXaVvgINDYLxE2ese7w719vbvmU5/PycecohsjayNAAAACXBIWXMAAC4jAAAuIwF4pT92AAAAB3RJTUUH6AQIEQMnlTSSjwAAABl0RVh0Q29tbWVudABDcmVhdGVkIHdpdGggR0lNUFeBDhcAAAASSURBVDjLY2AYBaNgFIyCoQsABMQAAeRw1DoAAAAASUVORK5CYII="; pub type Result = std::result::Result; diff --git a/backends/v2/src/client/mod.rs b/backends/v2/src/client/mod.rs index fa9d44064..9fe114a2c 100644 --- a/backends/v2/src/client/mod.rs +++ b/backends/v2/src/client/mod.rs @@ -63,6 +63,6 @@ impl From for ClientError { } } -static WARMUP_IMAGE_BASE64 :&str = "iVBORw0KGgoAAAANSUhEUgAAABQAAAAUCAIAAAAC64paAAABg2lDQ1BJQ0MgcHJvZmlsZQAAKJF9kT1Iw0AcxV/TSotUROxQxCFDdbKLijjWKhShQqgVWnUwufQLmrQkKS6OgmvBwY/FqoOLs64OroIg+AHi7OCk6CIl/i8ptIjx4Lgf7+497t4BQqvKNDOQADTdMjKppJjLr4rBVwQQwhAERGVm1uckKQ3P8XUPH1/v4jzL+9yfY0AtmAzwicQJVjcs4g3imU2rznmfOMLKskp8Tjxh0AWJH7muuPzGueSwwDMjRjYzTxwhFks9rPQwKxsa8TRxTNV0yhdyLquctzhr1Qbr3JO/MFzQV5a5TnMUKSxiCRJEKGiggiosxGnVSTGRof2kh3/E8UvkUshVASPHAmrQIDt+8D/43a1ZnJp0k8JJoO/Ftj/GgOAu0G7a9vexbbdPAP8zcKV3/bUWMPtJerOrxY6AwW3g4rqrKXvA5Q4QfarLhuxIfppCsQi8n9E35YHhW6B/ze2ts4/TByBLXaVvgINDYLxE2ese7w719vbvmU5/PycecohsjayNAAAACXBIWXMAAC4jAAAuIwF4pT92AAAAB3RJTUUH6AQIEQMnlTSSjwAAABl0RVh0Q29tbWVudABDcmVhdGVkIHdpdGggR0lNUFeBDhcAAAASSURBVDjLY2AYBaNgFIyCoQsABMQAAeRw1DoAAAAASUVORK5CYII="; +static WARMUP_IMAGE_BASE64: &str = 
"iVBORw0KGgoAAAANSUhEUgAAABQAAAAUCAIAAAAC64paAAABg2lDQ1BJQ0MgcHJvZmlsZQAAKJF9kT1Iw0AcxV/TSotUROxQxCFDdbKLijjWKhShQqgVWnUwufQLmrQkKS6OgmvBwY/FqoOLs64OroIg+AHi7OCk6CIl/i8ptIjx4Lgf7+497t4BQqvKNDOQADTdMjKppJjLr4rBVwQQwhAERGVm1uckKQ3P8XUPH1/v4jzL+9yfY0AtmAzwicQJVjcs4g3imU2rznmfOMLKskp8Tjxh0AWJH7muuPzGueSwwDMjRjYzTxwhFks9rPQwKxsa8TRxTNV0yhdyLquctzhr1Qbr3JO/MFzQV5a5TnMUKSxiCRJEKGiggiosxGnVSTGRof2kh3/E8UvkUshVASPHAmrQIDt+8D/43a1ZnJp0k8JJoO/Ftj/GgOAu0G7a9vexbbdPAP8zcKV3/bUWMPtJerOrxY6AwW3g4rqrKXvA5Q4QfarLhuxIfppCsQi8n9E35YHhW6B/ze2ts4/TByBLXaVvgINDYLxE2ese7w719vbvmU5/PycecohsjayNAAAACXBIWXMAAC4jAAAuIwF4pT92AAAAB3RJTUUH6AQIEQMnlTSSjwAAABl0RVh0Q29tbWVudABDcmVhdGVkIHdpdGggR0lNUFeBDhcAAAASSURBVDjLY2AYBaNgFIyCoQsABMQAAeRw1DoAAAAASUVORK5CYII="; pub type Result = std::result::Result; diff --git a/backends/v3/src/client/mod.rs b/backends/v3/src/client/mod.rs index d4ac50c9c..ab4311c3b 100644 --- a/backends/v3/src/client/mod.rs +++ b/backends/v3/src/client/mod.rs @@ -62,6 +62,6 @@ impl From for InputChunk { } } -static WARMUP_IMAGE_BASE64 :&str = "iVBORw0KGgoAAAANSUhEUgAAABQAAAAUCAIAAAAC64paAAABg2lDQ1BJQ0MgcHJvZmlsZQAAKJF9kT1Iw0AcxV/TSotUROxQxCFDdbKLijjWKhShQqgVWnUwufQLmrQkKS6OgmvBwY/FqoOLs64OroIg+AHi7OCk6CIl/i8ptIjx4Lgf7+497t4BQqvKNDOQADTdMjKppJjLr4rBVwQQwhAERGVm1uckKQ3P8XUPH1/v4jzL+9yfY0AtmAzwicQJVjcs4g3imU2rznmfOMLKskp8Tjxh0AWJH7muuPzGueSwwDMjRjYzTxwhFks9rPQwKxsa8TRxTNV0yhdyLquctzhr1Qbr3JO/MFzQV5a5TnMUKSxiCRJEKGiggiosxGnVSTGRof2kh3/E8UvkUshVASPHAmrQIDt+8D/43a1ZnJp0k8JJoO/Ftj/GgOAu0G7a9vexbbdPAP8zcKV3/bUWMPtJerOrxY6AwW3g4rqrKXvA5Q4QfarLhuxIfppCsQi8n9E35YHhW6B/ze2ts4/TByBLXaVvgINDYLxE2ese7w719vbvmU5/PycecohsjayNAAAACXBIWXMAAC4jAAAuIwF4pT92AAAAB3RJTUUH6AQIEQMnlTSSjwAAABl0RVh0Q29tbWVudABDcmVhdGVkIHdpdGggR0lNUFeBDhcAAAASSURBVDjLY2AYBaNgFIyCoQsABMQAAeRw1DoAAAAASUVORK5CYII="; +static WARMUP_IMAGE_BASE64: &str = "iVBORw0KGgoAAAANSUhEUgAAABQAAAAUCAIAAAAC64paAAABg2lDQ1BJQ0MgcHJvZmlsZQAAKJF9kT1Iw0AcxV/TSotUROxQxCFDdbKLijjWKhShQqgVWnUwufQLmrQkKS6OgmvBwY/FqoOLs64OroIg+AHi7OCk6CIl/i8ptIjx4Lgf7+497t4BQqvKNDOQADTdMjKppJjLr4rBVwQQwhAERGVm1uckKQ3P8XUPH1/v4jzL+9yfY0AtmAzwicQJVjcs4g3imU2rznmfOMLKskp8Tjxh0AWJH7muuPzGueSwwDMjRjYzTxwhFks9rPQwKxsa8TRxTNV0yhdyLquctzhr1Qbr3JO/MFzQV5a5TnMUKSxiCRJEKGiggiosxGnVSTGRof2kh3/E8UvkUshVASPHAmrQIDt+8D/43a1ZnJp0k8JJoO/Ftj/GgOAu0G7a9vexbbdPAP8zcKV3/bUWMPtJerOrxY6AwW3g4rqrKXvA5Q4QfarLhuxIfppCsQi8n9E35YHhW6B/ze2ts4/TByBLXaVvgINDYLxE2ese7w719vbvmU5/PycecohsjayNAAAACXBIWXMAAC4jAAAuIwF4pT92AAAAB3RJTUUH6AQIEQMnlTSSjwAAABl0RVh0Q29tbWVudABDcmVhdGVkIHdpdGggR0lNUFeBDhcAAAASSURBVDjLY2AYBaNgFIyCoQsABMQAAeRw1DoAAAAASUVORK5CYII="; pub type Result = std::result::Result; diff --git a/integration-tests/models/__snapshots__/test_flash_qwen2_vl_warmup/test_flash_qwen2_vl_simple.json b/integration-tests/models/__snapshots__/test_flash_qwen2_vl_warmup/test_flash_qwen2_vl_simple.json new file mode 100644 index 000000000..a986510f2 --- /dev/null +++ b/integration-tests/models/__snapshots__/test_flash_qwen2_vl_warmup/test_flash_qwen2_vl_simple.json @@ -0,0 +1,26 @@ +{ + "choices": [ + { + "finish_reason": "stop", + "index": 0, + "logprobs": null, + "message": { + "content": "The correct answer is: blue", + "name": null, + "role": "assistant", + "tool_calls": null + }, + "usage": null + } + ], + "created": 1733445131, + "id": "", + "model": "Qwen/Qwen2-VL-2B-Instruct", + "object": "chat.completion", + "system_fingerprint": "2.4.2-dev0-native", + "usage": { + "completion_tokens": 7, + "prompt_tokens": 27, + "total_tokens": 34 + } +} diff --git a/integration-tests/models/test_flash_qwen2_vl.py 
b/integration-tests/models/test_flash_qwen2_vl.py index 97a533fc5..946ab2f1e 100644 --- a/integration-tests/models/test_flash_qwen2_vl.py +++ b/integration-tests/models/test_flash_qwen2_vl.py @@ -1,81 +1,80 @@ -# Disabled because it's broken. -# import pytest -# -# -# @pytest.fixture(scope="module") -# def flash_qwen2_vl_handle(launcher): -# with launcher("Qwen/Qwen2-VL-7B-Instruct") as handle: -# yield handle -# -# -# @pytest.fixture(scope="module") -# async def flash_qwen2(flash_qwen2_vl_handle): -# await flash_qwen2_vl_handle.health(300) -# return flash_qwen2_vl_handle.client -# -# -# @pytest.mark.private -# async def test_flash_qwen2_vl_simple(flash_qwen2, response_snapshot): -# response = await flash_qwen2.chat( -# max_tokens=100, -# seed=42, -# messages=[ -# { -# "role": "user", -# "content": [ -# { -# "type": "image_url", -# "image_url": { -# "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png" -# }, -# }, -# {"type": "text", "text": "Describe this image."}, -# ], -# }, -# ], -# ) -# -# assert ( -# response.choices[0].message.content -# == "The image depicts an anthropomorphic rabbit, wearing a futuristic spacesuit, in an extraterrestrial environment. The setting appears to be a red planet resembling Mars, with rugged terrain and rocky formations in the background. The moon is visible in the distant sky, adding to the lunar landscape." -# ) -# -# assert response == response_snapshot -# -# -# @pytest.mark.private -# async def test_flash_qwen2_vl_simple_streaming(flash_qwen2, response_snapshot): -# responses = await flash_qwen2.chat( -# max_tokens=100, -# seed=42, -# messages=[ -# { -# "role": "user", -# "content": [ -# { -# "type": "image_url", -# "image_url": { -# "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png" -# }, -# }, -# {"type": "text", "text": "Describe this image."}, -# ], -# }, -# ], -# stream=True, -# ) -# -# count = 0 -# generated = "" -# last_response = None -# async for response in responses: -# count += 1 -# generated += response.choices[0].delta.content -# last_response = response -# -# assert ( -# generated -# == "The image depicts an anthropomorphic rabbit, wearing a futuristic spacesuit, in an extraterrestrial environment. The setting appears to be a red planet resembling Mars, with rugged terrain and rocky formations in the background. The moon is visible in the distant sky, adding to the lunar landscape." -# ) -# assert count == 58 -# assert last_response == response_snapshot +import pytest + + +@pytest.fixture(scope="module") +def flash_qwen2_vl_handle(launcher): + with launcher("Qwen/Qwen2-VL-7B-Instruct") as handle: + yield handle + + +@pytest.fixture(scope="module") +async def flash_qwen2(flash_qwen2_vl_handle): + await flash_qwen2_vl_handle.health(300) + return flash_qwen2_vl_handle.client + + +@pytest.mark.private +async def test_flash_qwen2_vl_simple(flash_qwen2, response_snapshot): + response = await flash_qwen2.chat( + max_tokens=100, + seed=42, + messages=[ + { + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png" + }, + }, + {"type": "text", "text": "Describe this image."}, + ], + }, + ], + ) + + assert ( + response.choices[0].message.content + == "The image depicts an anthropomorphic rabbit, wearing a futuristic spacesuit, in an extraterrestrial environment. 
The setting appears to be a red planet resembling Mars, with rugged terrain and rocky formations in the background. The moon is visible in the distant sky, adding to the lunar landscape." + ) + + assert response == response_snapshot + + +@pytest.mark.private +async def test_flash_qwen2_vl_simple_streaming(flash_qwen2, response_snapshot): + responses = await flash_qwen2.chat( + max_tokens=100, + seed=42, + messages=[ + { + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png" + }, + }, + {"type": "text", "text": "Describe this image."}, + ], + }, + ], + stream=True, + ) + + count = 0 + generated = "" + last_response = None + async for response in responses: + count += 1 + generated += response.choices[0].delta.content + last_response = response + + assert ( + generated + == "The image depicts an anthropomorphic rabbit, wearing a futuristic spacesuit, in an extraterrestrial environment. The setting appears to be a red planet resembling Mars, with rugged terrain and rocky formations in the background. The moon is visible in the distant sky, adding to the lunar landscape." + ) + assert count == 58 + assert last_response == response_snapshot diff --git a/integration-tests/models/test_flash_qwen2_vl_warmup.py b/integration-tests/models/test_flash_qwen2_vl_warmup.py new file mode 100644 index 000000000..5be87ee21 --- /dev/null +++ b/integration-tests/models/test_flash_qwen2_vl_warmup.py @@ -0,0 +1,38 @@ +import pytest + + +@pytest.fixture(scope="module") +def flash_qwen2_vl_handle(launcher): + with launcher( + "Qwen/Qwen2-VL-2B-Instruct", + max_input_length=40, + max_batch_prefill_tokens=50, + max_total_tokens=51, + ) as handle: + yield handle + + +@pytest.fixture(scope="module") +async def flash_qwen2(flash_qwen2_vl_handle): + await flash_qwen2_vl_handle.health(300) + return flash_qwen2_vl_handle.client + + +@pytest.mark.private +async def test_flash_qwen2_vl_simple(flash_qwen2, response_snapshot): + response = await flash_qwen2.chat( + max_tokens=20, + seed=42, + messages=[ + { + "role": "user", + "content": [ + {"type": "text", "text": "What is the color of the sky?"}, + ], + }, + ], + ) + + assert response.choices[0].message.content == "The correct answer is: blue" + + assert response == response_snapshot diff --git a/server/text_generation_server/models/__init__.py b/server/text_generation_server/models/__init__.py index e2d24643e..7521fb465 100644 --- a/server/text_generation_server/models/__init__.py +++ b/server/text_generation_server/models/__init__.py @@ -29,6 +29,7 @@ from text_generation_server.models.custom_modeling.bloom_modeling import ( BloomForCausalLM, ) from text_generation_server.models.globals import ATTENTION +import text_generation_server.models.globals as globals from text_generation_server.models.seq2seq_lm import Seq2SeqLM from text_generation_server.models.galactica import GalacticaCausalLMBatch from text_generation_server.models.custom_modeling.neox_modeling import ( @@ -1217,6 +1218,11 @@ def get_model( else: raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Idefics")) if model_type == QWEN2_VL: + # TODO: remove edge case when cuda graph issue is resolved for BS=2 with Qwen2-VL + logger.warning( + "Qwen2-VL requires cuda graphs to be greater than 2. Removing all cuda graphs with a batch size equal or less than 2." 
+ ) + globals.CUDA_GRAPHS = list(filter(lambda x: x > 2, globals.CUDA_GRAPHS)) return VlmCausalLM( model_id=model_id, model_class=Qwen2VLForConditionalGeneration, diff --git a/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py b/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py index cc4039b1c..01d3bf1a3 100644 --- a/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py @@ -138,7 +138,12 @@ class Qwen2Attention(torch.nn.Module): dim=-1, ) - self.rotary_emb(query, torch.select(kv, dim=1, index=0), cos, sin) + self.rotary_emb( + query, + torch.select(kv, dim=1, index=0), + cos[: query.shape[0], ...], + sin[: query.shape[0], ...], + ) if prefill_cache_indices is not None: kv_to_cache = kv[prefill_cache_indices] diff --git a/server/text_generation_server/models/custom_modeling/qwen2_vl.py b/server/text_generation_server/models/custom_modeling/qwen2_vl.py index a8e1e8c15..95cf6a318 100644 --- a/server/text_generation_server/models/custom_modeling/qwen2_vl.py +++ b/server/text_generation_server/models/custom_modeling/qwen2_vl.py @@ -517,11 +517,11 @@ class Qwen2VLForConditionalGeneration(nn.Module): pixel_values: torch.FloatTensor = None, image_grid_thw: Optional[torch.LongTensor] = None, video_grid_thw: Optional[torch.LongTensor] = None, - pixel_attention_mask=None, + pixel_attention_mask: Optional[torch.Tensor] = None, image_sizes: Optional[torch.LongTensor] = None, adapter_data: Optional[torch.Tensor] = None, cross_attention_states: Optional[torch.Tensor] = None, - image_indices=None, + image_indices: Optional[torch.Tensor] = None, ): inputs_embeds = self.embed_tokens(input_ids) @@ -533,6 +533,7 @@ class Qwen2VLForConditionalGeneration(nn.Module): ).squeeze(0) inputs_embeds[input_ids == self.image_token_id] = image_embeds + max_s = max(max_s, inputs_embeds.size(0)) hidden_states = self.text_model( inputs_embeds=inputs_embeds, position_ids=position_ids, diff --git a/server/text_generation_server/models/flash_causal_lm.py b/server/text_generation_server/models/flash_causal_lm.py index d097c54fc..f2d27db91 100644 --- a/server/text_generation_server/models/flash_causal_lm.py +++ b/server/text_generation_server/models/flash_causal_lm.py @@ -56,11 +56,13 @@ from text_generation_server.models.globals import ( MEM_POOL, ATTENTION, BLOCK_SIZE, - CUDA_GRAPHS, REQUEST_LOGPROBS, TGI_WIGGLE_ROOM, get_adapter_to_index, ) + +# avoid coping CUDA_GRAPHS value by importing globals as a module +import text_generation_server.models.globals as globals from text_generation_server.layers.attention import KVCache, Seqlen from text_generation_server.utils import StoppingCriteria, HeterogeneousNextTokenChooser from text_generation_server.utils.dist import MEMORY_FRACTION @@ -1635,8 +1637,8 @@ class FlashCausalLM(Model): int(val) for val in os.environ["PYTORCH_TUNABLEOP_SEQLENS"].split(",") ] - elif CUDA_GRAPHS is not None: - tuning_sequences = CUDA_GRAPHS + elif globals.CUDA_GRAPHS is not None: + tuning_sequences = globals.CUDA_GRAPHS else: tuning_sequences = [1, 2, 3, 4, 5, 6, 7] @@ -1675,13 +1677,14 @@ class FlashCausalLM(Model): "PyTorch ROCm TunableOp (https://github.com/pytorch/pytorch/tree/main/aten/src/ATen/cuda/tunable) is disabled. TunableOp brings an additional 5-8% latency improvement for small sequence lengths but requires a warmup. 
If necessary, please use the environment variable PYTORCH_TUNABLEOP_ENABLED=1 to enable TunableOp.", ) - if CUDA_GRAPHS: + if globals.CUDA_GRAPHS: try: log_master( - logger.info, f"Cuda Graphs are enabled for sizes {CUDA_GRAPHS}" + logger.info, + f"Cuda Graphs are enabled for sizes {globals.CUDA_GRAPHS}", ) # Warmup cuda graphs - for bs in CUDA_GRAPHS: + for bs in globals.CUDA_GRAPHS: synchronize(self.device) free_memory = get_free_memory( self.device, MEMORY_FRACTION * TGI_WIGGLE_ROOM @@ -1705,7 +1708,8 @@ class FlashCausalLM(Model): logger.exception("Decode cuda graph warmup failed") else: log_master( - logger.info, f"Cuda Graphs are disabled (CUDA_GRAPHS={CUDA_GRAPHS})." + logger.info, + f"Cuda Graphs are disabled (CUDA_GRAPHS={globals.CUDA_GRAPHS}).", ) assert max_input_tokens is not None diff --git a/server/text_generation_server/models/vlm_causal_lm.py b/server/text_generation_server/models/vlm_causal_lm.py index db78341d1..4d6ea84e0 100644 --- a/server/text_generation_server/models/vlm_causal_lm.py +++ b/server/text_generation_server/models/vlm_causal_lm.py @@ -236,7 +236,6 @@ class VlmCausalLMBatch(FlashCausalLMBatch): w = image.width * 2 h = image.height * 2 image = image.resize((w, h)) - if config.model_type == "llava_next": images.append(image) else: From 8f6146f11adf5ee61b157dc06a248f088d2a64b1 Mon Sep 17 00:00:00 2001 From: drbh Date: Fri, 17 Jan 2025 12:09:05 -0500 Subject: [PATCH 038/183] Revert "feat: improve qwen2-vl startup " (#2924) Revert "feat: improve qwen2-vl startup (#2802)" This reverts commit eecca27113538716ff14b05738eb6e8c5f7afd8a. --- backends/client/src/lib.rs | 2 +- backends/v2/src/client/mod.rs | 2 +- backends/v3/src/client/mod.rs | 2 +- .../test_flash_qwen2_vl_simple.json | 26 --- .../models/test_flash_qwen2_vl.py | 161 +++++++++--------- .../models/test_flash_qwen2_vl_warmup.py | 38 ----- .../text_generation_server/models/__init__.py | 6 - .../custom_modeling/flash_qwen2_modeling.py | 7 +- .../models/custom_modeling/qwen2_vl.py | 5 +- .../models/flash_causal_lm.py | 18 +- .../models/vlm_causal_lm.py | 1 + 11 files changed, 95 insertions(+), 173 deletions(-) delete mode 100644 integration-tests/models/__snapshots__/test_flash_qwen2_vl_warmup/test_flash_qwen2_vl_simple.json delete mode 100644 integration-tests/models/test_flash_qwen2_vl_warmup.py diff --git a/backends/client/src/lib.rs b/backends/client/src/lib.rs index fbe2e7e66..45bee10ca 100644 --- a/backends/client/src/lib.rs +++ b/backends/client/src/lib.rs @@ -86,6 +86,6 @@ impl ChunksToString for Vec { } } -static WARMUP_IMAGE_BASE64: &str = "iVBORw0KGgoAAAANSUhEUgAAABQAAAAUCAIAAAAC64paAAABg2lDQ1BJQ0MgcHJvZmlsZQAAKJF9kT1Iw0AcxV/TSotUROxQxCFDdbKLijjWKhShQqgVWnUwufQLmrQkKS6OgmvBwY/FqoOLs64OroIg+AHi7OCk6CIl/i8ptIjx4Lgf7+497t4BQqvKNDOQADTdMjKppJjLr4rBVwQQwhAERGVm1uckKQ3P8XUPH1/v4jzL+9yfY0AtmAzwicQJVjcs4g3imU2rznmfOMLKskp8Tjxh0AWJH7muuPzGueSwwDMjRjYzTxwhFks9rPQwKxsa8TRxTNV0yhdyLquctzhr1Qbr3JO/MFzQV5a5TnMUKSxiCRJEKGiggiosxGnVSTGRof2kh3/E8UvkUshVASPHAmrQIDt+8D/43a1ZnJp0k8JJoO/Ftj/GgOAu0G7a9vexbbdPAP8zcKV3/bUWMPtJerOrxY6AwW3g4rqrKXvA5Q4QfarLhuxIfppCsQi8n9E35YHhW6B/ze2ts4/TByBLXaVvgINDYLxE2ese7w719vbvmU5/PycecohsjayNAAAACXBIWXMAAC4jAAAuIwF4pT92AAAAB3RJTUUH6AQIEQMnlTSSjwAAABl0RVh0Q29tbWVudABDcmVhdGVkIHdpdGggR0lNUFeBDhcAAAASSURBVDjLY2AYBaNgFIyCoQsABMQAAeRw1DoAAAAASUVORK5CYII="; +static WARMUP_IMAGE_BASE64 :&str = 
"iVBORw0KGgoAAAANSUhEUgAAABQAAAAUCAIAAAAC64paAAABg2lDQ1BJQ0MgcHJvZmlsZQAAKJF9kT1Iw0AcxV/TSotUROxQxCFDdbKLijjWKhShQqgVWnUwufQLmrQkKS6OgmvBwY/FqoOLs64OroIg+AHi7OCk6CIl/i8ptIjx4Lgf7+497t4BQqvKNDOQADTdMjKppJjLr4rBVwQQwhAERGVm1uckKQ3P8XUPH1/v4jzL+9yfY0AtmAzwicQJVjcs4g3imU2rznmfOMLKskp8Tjxh0AWJH7muuPzGueSwwDMjRjYzTxwhFks9rPQwKxsa8TRxTNV0yhdyLquctzhr1Qbr3JO/MFzQV5a5TnMUKSxiCRJEKGiggiosxGnVSTGRof2kh3/E8UvkUshVASPHAmrQIDt+8D/43a1ZnJp0k8JJoO/Ftj/GgOAu0G7a9vexbbdPAP8zcKV3/bUWMPtJerOrxY6AwW3g4rqrKXvA5Q4QfarLhuxIfppCsQi8n9E35YHhW6B/ze2ts4/TByBLXaVvgINDYLxE2ese7w719vbvmU5/PycecohsjayNAAAACXBIWXMAAC4jAAAuIwF4pT92AAAAB3RJTUUH6AQIEQMnlTSSjwAAABl0RVh0Q29tbWVudABDcmVhdGVkIHdpdGggR0lNUFeBDhcAAAASSURBVDjLY2AYBaNgFIyCoQsABMQAAeRw1DoAAAAASUVORK5CYII="; pub type Result = std::result::Result; diff --git a/backends/v2/src/client/mod.rs b/backends/v2/src/client/mod.rs index 9fe114a2c..fa9d44064 100644 --- a/backends/v2/src/client/mod.rs +++ b/backends/v2/src/client/mod.rs @@ -63,6 +63,6 @@ impl From for ClientError { } } -static WARMUP_IMAGE_BASE64: &str = "iVBORw0KGgoAAAANSUhEUgAAABQAAAAUCAIAAAAC64paAAABg2lDQ1BJQ0MgcHJvZmlsZQAAKJF9kT1Iw0AcxV/TSotUROxQxCFDdbKLijjWKhShQqgVWnUwufQLmrQkKS6OgmvBwY/FqoOLs64OroIg+AHi7OCk6CIl/i8ptIjx4Lgf7+497t4BQqvKNDOQADTdMjKppJjLr4rBVwQQwhAERGVm1uckKQ3P8XUPH1/v4jzL+9yfY0AtmAzwicQJVjcs4g3imU2rznmfOMLKskp8Tjxh0AWJH7muuPzGueSwwDMjRjYzTxwhFks9rPQwKxsa8TRxTNV0yhdyLquctzhr1Qbr3JO/MFzQV5a5TnMUKSxiCRJEKGiggiosxGnVSTGRof2kh3/E8UvkUshVASPHAmrQIDt+8D/43a1ZnJp0k8JJoO/Ftj/GgOAu0G7a9vexbbdPAP8zcKV3/bUWMPtJerOrxY6AwW3g4rqrKXvA5Q4QfarLhuxIfppCsQi8n9E35YHhW6B/ze2ts4/TByBLXaVvgINDYLxE2ese7w719vbvmU5/PycecohsjayNAAAACXBIWXMAAC4jAAAuIwF4pT92AAAAB3RJTUUH6AQIEQMnlTSSjwAAABl0RVh0Q29tbWVudABDcmVhdGVkIHdpdGggR0lNUFeBDhcAAAASSURBVDjLY2AYBaNgFIyCoQsABMQAAeRw1DoAAAAASUVORK5CYII="; +static WARMUP_IMAGE_BASE64 :&str = "iVBORw0KGgoAAAANSUhEUgAAABQAAAAUCAIAAAAC64paAAABg2lDQ1BJQ0MgcHJvZmlsZQAAKJF9kT1Iw0AcxV/TSotUROxQxCFDdbKLijjWKhShQqgVWnUwufQLmrQkKS6OgmvBwY/FqoOLs64OroIg+AHi7OCk6CIl/i8ptIjx4Lgf7+497t4BQqvKNDOQADTdMjKppJjLr4rBVwQQwhAERGVm1uckKQ3P8XUPH1/v4jzL+9yfY0AtmAzwicQJVjcs4g3imU2rznmfOMLKskp8Tjxh0AWJH7muuPzGueSwwDMjRjYzTxwhFks9rPQwKxsa8TRxTNV0yhdyLquctzhr1Qbr3JO/MFzQV5a5TnMUKSxiCRJEKGiggiosxGnVSTGRof2kh3/E8UvkUshVASPHAmrQIDt+8D/43a1ZnJp0k8JJoO/Ftj/GgOAu0G7a9vexbbdPAP8zcKV3/bUWMPtJerOrxY6AwW3g4rqrKXvA5Q4QfarLhuxIfppCsQi8n9E35YHhW6B/ze2ts4/TByBLXaVvgINDYLxE2ese7w719vbvmU5/PycecohsjayNAAAACXBIWXMAAC4jAAAuIwF4pT92AAAAB3RJTUUH6AQIEQMnlTSSjwAAABl0RVh0Q29tbWVudABDcmVhdGVkIHdpdGggR0lNUFeBDhcAAAASSURBVDjLY2AYBaNgFIyCoQsABMQAAeRw1DoAAAAASUVORK5CYII="; pub type Result = std::result::Result; diff --git a/backends/v3/src/client/mod.rs b/backends/v3/src/client/mod.rs index ab4311c3b..d4ac50c9c 100644 --- a/backends/v3/src/client/mod.rs +++ b/backends/v3/src/client/mod.rs @@ -62,6 +62,6 @@ impl From for InputChunk { } } -static WARMUP_IMAGE_BASE64: &str = 
"iVBORw0KGgoAAAANSUhEUgAAABQAAAAUCAIAAAAC64paAAABg2lDQ1BJQ0MgcHJvZmlsZQAAKJF9kT1Iw0AcxV/TSotUROxQxCFDdbKLijjWKhShQqgVWnUwufQLmrQkKS6OgmvBwY/FqoOLs64OroIg+AHi7OCk6CIl/i8ptIjx4Lgf7+497t4BQqvKNDOQADTdMjKppJjLr4rBVwQQwhAERGVm1uckKQ3P8XUPH1/v4jzL+9yfY0AtmAzwicQJVjcs4g3imU2rznmfOMLKskp8Tjxh0AWJH7muuPzGueSwwDMjRjYzTxwhFks9rPQwKxsa8TRxTNV0yhdyLquctzhr1Qbr3JO/MFzQV5a5TnMUKSxiCRJEKGiggiosxGnVSTGRof2kh3/E8UvkUshVASPHAmrQIDt+8D/43a1ZnJp0k8JJoO/Ftj/GgOAu0G7a9vexbbdPAP8zcKV3/bUWMPtJerOrxY6AwW3g4rqrKXvA5Q4QfarLhuxIfppCsQi8n9E35YHhW6B/ze2ts4/TByBLXaVvgINDYLxE2ese7w719vbvmU5/PycecohsjayNAAAACXBIWXMAAC4jAAAuIwF4pT92AAAAB3RJTUUH6AQIEQMnlTSSjwAAABl0RVh0Q29tbWVudABDcmVhdGVkIHdpdGggR0lNUFeBDhcAAAASSURBVDjLY2AYBaNgFIyCoQsABMQAAeRw1DoAAAAASUVORK5CYII="; +static WARMUP_IMAGE_BASE64 :&str = "iVBORw0KGgoAAAANSUhEUgAAABQAAAAUCAIAAAAC64paAAABg2lDQ1BJQ0MgcHJvZmlsZQAAKJF9kT1Iw0AcxV/TSotUROxQxCFDdbKLijjWKhShQqgVWnUwufQLmrQkKS6OgmvBwY/FqoOLs64OroIg+AHi7OCk6CIl/i8ptIjx4Lgf7+497t4BQqvKNDOQADTdMjKppJjLr4rBVwQQwhAERGVm1uckKQ3P8XUPH1/v4jzL+9yfY0AtmAzwicQJVjcs4g3imU2rznmfOMLKskp8Tjxh0AWJH7muuPzGueSwwDMjRjYzTxwhFks9rPQwKxsa8TRxTNV0yhdyLquctzhr1Qbr3JO/MFzQV5a5TnMUKSxiCRJEKGiggiosxGnVSTGRof2kh3/E8UvkUshVASPHAmrQIDt+8D/43a1ZnJp0k8JJoO/Ftj/GgOAu0G7a9vexbbdPAP8zcKV3/bUWMPtJerOrxY6AwW3g4rqrKXvA5Q4QfarLhuxIfppCsQi8n9E35YHhW6B/ze2ts4/TByBLXaVvgINDYLxE2ese7w719vbvmU5/PycecohsjayNAAAACXBIWXMAAC4jAAAuIwF4pT92AAAAB3RJTUUH6AQIEQMnlTSSjwAAABl0RVh0Q29tbWVudABDcmVhdGVkIHdpdGggR0lNUFeBDhcAAAASSURBVDjLY2AYBaNgFIyCoQsABMQAAeRw1DoAAAAASUVORK5CYII="; pub type Result = std::result::Result; diff --git a/integration-tests/models/__snapshots__/test_flash_qwen2_vl_warmup/test_flash_qwen2_vl_simple.json b/integration-tests/models/__snapshots__/test_flash_qwen2_vl_warmup/test_flash_qwen2_vl_simple.json deleted file mode 100644 index a986510f2..000000000 --- a/integration-tests/models/__snapshots__/test_flash_qwen2_vl_warmup/test_flash_qwen2_vl_simple.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "choices": [ - { - "finish_reason": "stop", - "index": 0, - "logprobs": null, - "message": { - "content": "The correct answer is: blue", - "name": null, - "role": "assistant", - "tool_calls": null - }, - "usage": null - } - ], - "created": 1733445131, - "id": "", - "model": "Qwen/Qwen2-VL-2B-Instruct", - "object": "chat.completion", - "system_fingerprint": "2.4.2-dev0-native", - "usage": { - "completion_tokens": 7, - "prompt_tokens": 27, - "total_tokens": 34 - } -} diff --git a/integration-tests/models/test_flash_qwen2_vl.py b/integration-tests/models/test_flash_qwen2_vl.py index 946ab2f1e..97a533fc5 100644 --- a/integration-tests/models/test_flash_qwen2_vl.py +++ b/integration-tests/models/test_flash_qwen2_vl.py @@ -1,80 +1,81 @@ -import pytest - - -@pytest.fixture(scope="module") -def flash_qwen2_vl_handle(launcher): - with launcher("Qwen/Qwen2-VL-7B-Instruct") as handle: - yield handle - - -@pytest.fixture(scope="module") -async def flash_qwen2(flash_qwen2_vl_handle): - await flash_qwen2_vl_handle.health(300) - return flash_qwen2_vl_handle.client - - -@pytest.mark.private -async def test_flash_qwen2_vl_simple(flash_qwen2, response_snapshot): - response = await flash_qwen2.chat( - max_tokens=100, - seed=42, - messages=[ - { - "role": "user", - "content": [ - { - "type": "image_url", - "image_url": { - "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png" - }, - }, - {"type": "text", "text": "Describe this image."}, - ], - }, - ], - ) - - assert ( - response.choices[0].message.content - == "The image depicts an 
anthropomorphic rabbit, wearing a futuristic spacesuit, in an extraterrestrial environment. The setting appears to be a red planet resembling Mars, with rugged terrain and rocky formations in the background. The moon is visible in the distant sky, adding to the lunar landscape." - ) - - assert response == response_snapshot - - -@pytest.mark.private -async def test_flash_qwen2_vl_simple_streaming(flash_qwen2, response_snapshot): - responses = await flash_qwen2.chat( - max_tokens=100, - seed=42, - messages=[ - { - "role": "user", - "content": [ - { - "type": "image_url", - "image_url": { - "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png" - }, - }, - {"type": "text", "text": "Describe this image."}, - ], - }, - ], - stream=True, - ) - - count = 0 - generated = "" - last_response = None - async for response in responses: - count += 1 - generated += response.choices[0].delta.content - last_response = response - - assert ( - generated - == "The image depicts an anthropomorphic rabbit, wearing a futuristic spacesuit, in an extraterrestrial environment. The setting appears to be a red planet resembling Mars, with rugged terrain and rocky formations in the background. The moon is visible in the distant sky, adding to the lunar landscape." - ) - assert count == 58 - assert last_response == response_snapshot +# Disabled because it's broken. +# import pytest +# +# +# @pytest.fixture(scope="module") +# def flash_qwen2_vl_handle(launcher): +# with launcher("Qwen/Qwen2-VL-7B-Instruct") as handle: +# yield handle +# +# +# @pytest.fixture(scope="module") +# async def flash_qwen2(flash_qwen2_vl_handle): +# await flash_qwen2_vl_handle.health(300) +# return flash_qwen2_vl_handle.client +# +# +# @pytest.mark.private +# async def test_flash_qwen2_vl_simple(flash_qwen2, response_snapshot): +# response = await flash_qwen2.chat( +# max_tokens=100, +# seed=42, +# messages=[ +# { +# "role": "user", +# "content": [ +# { +# "type": "image_url", +# "image_url": { +# "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png" +# }, +# }, +# {"type": "text", "text": "Describe this image."}, +# ], +# }, +# ], +# ) +# +# assert ( +# response.choices[0].message.content +# == "The image depicts an anthropomorphic rabbit, wearing a futuristic spacesuit, in an extraterrestrial environment. The setting appears to be a red planet resembling Mars, with rugged terrain and rocky formations in the background. The moon is visible in the distant sky, adding to the lunar landscape." +# ) +# +# assert response == response_snapshot +# +# +# @pytest.mark.private +# async def test_flash_qwen2_vl_simple_streaming(flash_qwen2, response_snapshot): +# responses = await flash_qwen2.chat( +# max_tokens=100, +# seed=42, +# messages=[ +# { +# "role": "user", +# "content": [ +# { +# "type": "image_url", +# "image_url": { +# "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png" +# }, +# }, +# {"type": "text", "text": "Describe this image."}, +# ], +# }, +# ], +# stream=True, +# ) +# +# count = 0 +# generated = "" +# last_response = None +# async for response in responses: +# count += 1 +# generated += response.choices[0].delta.content +# last_response = response +# +# assert ( +# generated +# == "The image depicts an anthropomorphic rabbit, wearing a futuristic spacesuit, in an extraterrestrial environment. 
The setting appears to be a red planet resembling Mars, with rugged terrain and rocky formations in the background. The moon is visible in the distant sky, adding to the lunar landscape." +# ) +# assert count == 58 +# assert last_response == response_snapshot diff --git a/integration-tests/models/test_flash_qwen2_vl_warmup.py b/integration-tests/models/test_flash_qwen2_vl_warmup.py deleted file mode 100644 index 5be87ee21..000000000 --- a/integration-tests/models/test_flash_qwen2_vl_warmup.py +++ /dev/null @@ -1,38 +0,0 @@ -import pytest - - -@pytest.fixture(scope="module") -def flash_qwen2_vl_handle(launcher): - with launcher( - "Qwen/Qwen2-VL-2B-Instruct", - max_input_length=40, - max_batch_prefill_tokens=50, - max_total_tokens=51, - ) as handle: - yield handle - - -@pytest.fixture(scope="module") -async def flash_qwen2(flash_qwen2_vl_handle): - await flash_qwen2_vl_handle.health(300) - return flash_qwen2_vl_handle.client - - -@pytest.mark.private -async def test_flash_qwen2_vl_simple(flash_qwen2, response_snapshot): - response = await flash_qwen2.chat( - max_tokens=20, - seed=42, - messages=[ - { - "role": "user", - "content": [ - {"type": "text", "text": "What is the color of the sky?"}, - ], - }, - ], - ) - - assert response.choices[0].message.content == "The correct answer is: blue" - - assert response == response_snapshot diff --git a/server/text_generation_server/models/__init__.py b/server/text_generation_server/models/__init__.py index 7521fb465..e2d24643e 100644 --- a/server/text_generation_server/models/__init__.py +++ b/server/text_generation_server/models/__init__.py @@ -29,7 +29,6 @@ from text_generation_server.models.custom_modeling.bloom_modeling import ( BloomForCausalLM, ) from text_generation_server.models.globals import ATTENTION -import text_generation_server.models.globals as globals from text_generation_server.models.seq2seq_lm import Seq2SeqLM from text_generation_server.models.galactica import GalacticaCausalLMBatch from text_generation_server.models.custom_modeling.neox_modeling import ( @@ -1218,11 +1217,6 @@ def get_model( else: raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Idefics")) if model_type == QWEN2_VL: - # TODO: remove edge case when cuda graph issue is resolved for BS=2 with Qwen2-VL - logger.warning( - "Qwen2-VL requires cuda graphs to be greater than 2. Removing all cuda graphs with a batch size equal or less than 2." 
- ) - globals.CUDA_GRAPHS = list(filter(lambda x: x > 2, globals.CUDA_GRAPHS)) return VlmCausalLM( model_id=model_id, model_class=Qwen2VLForConditionalGeneration, diff --git a/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py b/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py index 01d3bf1a3..cc4039b1c 100644 --- a/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py @@ -138,12 +138,7 @@ class Qwen2Attention(torch.nn.Module): dim=-1, ) - self.rotary_emb( - query, - torch.select(kv, dim=1, index=0), - cos[: query.shape[0], ...], - sin[: query.shape[0], ...], - ) + self.rotary_emb(query, torch.select(kv, dim=1, index=0), cos, sin) if prefill_cache_indices is not None: kv_to_cache = kv[prefill_cache_indices] diff --git a/server/text_generation_server/models/custom_modeling/qwen2_vl.py b/server/text_generation_server/models/custom_modeling/qwen2_vl.py index 95cf6a318..a8e1e8c15 100644 --- a/server/text_generation_server/models/custom_modeling/qwen2_vl.py +++ b/server/text_generation_server/models/custom_modeling/qwen2_vl.py @@ -517,11 +517,11 @@ class Qwen2VLForConditionalGeneration(nn.Module): pixel_values: torch.FloatTensor = None, image_grid_thw: Optional[torch.LongTensor] = None, video_grid_thw: Optional[torch.LongTensor] = None, - pixel_attention_mask: Optional[torch.Tensor] = None, + pixel_attention_mask=None, image_sizes: Optional[torch.LongTensor] = None, adapter_data: Optional[torch.Tensor] = None, cross_attention_states: Optional[torch.Tensor] = None, - image_indices: Optional[torch.Tensor] = None, + image_indices=None, ): inputs_embeds = self.embed_tokens(input_ids) @@ -533,7 +533,6 @@ class Qwen2VLForConditionalGeneration(nn.Module): ).squeeze(0) inputs_embeds[input_ids == self.image_token_id] = image_embeds - max_s = max(max_s, inputs_embeds.size(0)) hidden_states = self.text_model( inputs_embeds=inputs_embeds, position_ids=position_ids, diff --git a/server/text_generation_server/models/flash_causal_lm.py b/server/text_generation_server/models/flash_causal_lm.py index f2d27db91..d097c54fc 100644 --- a/server/text_generation_server/models/flash_causal_lm.py +++ b/server/text_generation_server/models/flash_causal_lm.py @@ -56,13 +56,11 @@ from text_generation_server.models.globals import ( MEM_POOL, ATTENTION, BLOCK_SIZE, + CUDA_GRAPHS, REQUEST_LOGPROBS, TGI_WIGGLE_ROOM, get_adapter_to_index, ) - -# avoid coping CUDA_GRAPHS value by importing globals as a module -import text_generation_server.models.globals as globals from text_generation_server.layers.attention import KVCache, Seqlen from text_generation_server.utils import StoppingCriteria, HeterogeneousNextTokenChooser from text_generation_server.utils.dist import MEMORY_FRACTION @@ -1637,8 +1635,8 @@ class FlashCausalLM(Model): int(val) for val in os.environ["PYTORCH_TUNABLEOP_SEQLENS"].split(",") ] - elif globals.CUDA_GRAPHS is not None: - tuning_sequences = globals.CUDA_GRAPHS + elif CUDA_GRAPHS is not None: + tuning_sequences = CUDA_GRAPHS else: tuning_sequences = [1, 2, 3, 4, 5, 6, 7] @@ -1677,14 +1675,13 @@ class FlashCausalLM(Model): "PyTorch ROCm TunableOp (https://github.com/pytorch/pytorch/tree/main/aten/src/ATen/cuda/tunable) is disabled. TunableOp brings an additional 5-8% latency improvement for small sequence lengths but requires a warmup. 
If necessary, please use the environment variable PYTORCH_TUNABLEOP_ENABLED=1 to enable TunableOp.", ) - if globals.CUDA_GRAPHS: + if CUDA_GRAPHS: try: log_master( - logger.info, - f"Cuda Graphs are enabled for sizes {globals.CUDA_GRAPHS}", + logger.info, f"Cuda Graphs are enabled for sizes {CUDA_GRAPHS}" ) # Warmup cuda graphs - for bs in globals.CUDA_GRAPHS: + for bs in CUDA_GRAPHS: synchronize(self.device) free_memory = get_free_memory( self.device, MEMORY_FRACTION * TGI_WIGGLE_ROOM @@ -1708,8 +1705,7 @@ class FlashCausalLM(Model): logger.exception("Decode cuda graph warmup failed") else: log_master( - logger.info, - f"Cuda Graphs are disabled (CUDA_GRAPHS={globals.CUDA_GRAPHS}).", + logger.info, f"Cuda Graphs are disabled (CUDA_GRAPHS={CUDA_GRAPHS})." ) assert max_input_tokens is not None diff --git a/server/text_generation_server/models/vlm_causal_lm.py b/server/text_generation_server/models/vlm_causal_lm.py index 4d6ea84e0..db78341d1 100644 --- a/server/text_generation_server/models/vlm_causal_lm.py +++ b/server/text_generation_server/models/vlm_causal_lm.py @@ -236,6 +236,7 @@ class VlmCausalLMBatch(FlashCausalLMBatch): w = image.width * 2 h = image.height * 2 image = image.resize((w, h)) + if config.model_type == "llava_next": images.append(image) else: From 630f198624b6c405e5fcfb7f08f7f308026f68cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Fri, 17 Jan 2025 18:18:02 +0100 Subject: [PATCH 039/183] flashinfer: switch to plan API (#2904) This change doesn't switch `forward` to `run` yet, since it requires that we have access to the softmax scale and the logit softcap outside the model. --- server/text_generation_server/layers/attention/cuda.py | 1 - .../text_generation_server/layers/attention/flashinfer.py | 6 ++---- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/server/text_generation_server/layers/attention/cuda.py b/server/text_generation_server/layers/attention/cuda.py index 7b5af3c43..f1469b3fb 100644 --- a/server/text_generation_server/layers/attention/cuda.py +++ b/server/text_generation_server/layers/attention/cuda.py @@ -235,7 +235,6 @@ def attention( paged_kv_cache=(kv_cache.key, kv_cache.value), logits_soft_cap=softcap, sm_scale=softmax_scale, - window_left=window_size_left, k_scale=kv_scales.key_scale_cpu if can_scale else 1.0, v_scale=kv_scales.value_scale_cpu if can_scale else 1.0, ) diff --git a/server/text_generation_server/layers/attention/flashinfer.py b/server/text_generation_server/layers/attention/flashinfer.py index 909eea27f..d23451844 100644 --- a/server/text_generation_server/layers/attention/flashinfer.py +++ b/server/text_generation_server/layers/attention/flashinfer.py @@ -84,7 +84,7 @@ def use_prefill_with_paged_kv_state( token = prefill_with_paged_kv_state.set(state) try: - state.begin_forward( + state.plan( qo_indptr=cu_seqlens, paged_kv_indptr=indptr, paged_kv_indices=block_tables, @@ -99,7 +99,6 @@ def use_prefill_with_paged_kv_state( ) yield finally: - state.end_forward() if token is not None: prefill_with_paged_kv_state.reset(token) @@ -200,7 +199,7 @@ def use_decode_state( token = decode_state.set(state) try: - state.begin_forward( + state.plan( indptr=indptr, indices=block_tables, last_page_len=last_page_len, @@ -214,6 +213,5 @@ def use_decode_state( ) yield finally: - state.end_forward() if token is not None: decode_state.reset(token) From 447a5b2f87b26bc7624018d487a47a6181be4913 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Mon, 20 Jan 2025 11:13:46 +0100 Subject: [PATCH 040/183] Fixing TRTLLM 
dockerfile. (#2922) * Fixing TRTLLM dockerfile. * Fixed. * Creating a dummy modification to chekc CI runs. * Removing the cache directive. * Modifying this should cache hit. * Revert "Modifying this should cache hit." This reverts commit 46a2bde108f0ce5f8a3d652c531cfbb6625c59c1. * Modifying this should cache hit. * Unwanted files. --- .devcontainer/Dockerfile_trtllm | 75 ---------------------------- .devcontainer/devcontainer.json | 19 ------- Dockerfile_trtllm | 20 +++++--- server/text_generation_server/cli.py | 2 + 4 files changed, 16 insertions(+), 100 deletions(-) delete mode 100644 .devcontainer/Dockerfile_trtllm delete mode 100644 .devcontainer/devcontainer.json diff --git a/.devcontainer/Dockerfile_trtllm b/.devcontainer/Dockerfile_trtllm deleted file mode 100644 index 239a7bf8c..000000000 --- a/.devcontainer/Dockerfile_trtllm +++ /dev/null @@ -1,75 +0,0 @@ -ARG CUDA_ARCH_LIST="75-real;80-real;86-real;89-real;90-real" -ARG OMPI_VERSION="4.1.7rc1" - -# Build dependencies resolver stage -FROM lukemathwalker/cargo-chef:latest AS chef -WORKDIR /usr/src/text-generation-inference/backends/trtllm - -FROM chef AS planner -COPY . . -RUN cargo chef prepare --recipe-path recipe.json - -# CUDA dependent dependencies resolver stage -FROM nvidia/cuda:12.6.3-cudnn-devel-ubuntu24.04 AS cuda-builder - -RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ - --mount=type=cache,target=/var/lib/apt,sharing=locked \ - apt update && apt install -y \ - build-essential \ - cmake \ - curl \ - gcc-14 \ - g++-14 \ - git \ - git-lfs \ - libssl-dev \ - libucx-dev \ - ninja-build \ - pkg-config \ - pipx \ - python3 \ - python3-dev \ - python3-setuptools \ - tar \ - wget && \ - pipx ensurepath - -ENV TGI_INSTALL_PREFIX=/usr/local/tgi -ENV TENSORRT_INSTALL_PREFIX=/usr/local/tensorrt - -# Install OpenMPI -FROM cuda-builder AS mpi-builder -ARG OMPI_VERSION - -ENV OMPI_TARBALL_FILENAME="openmpi-$OMPI_VERSION.tar.bz2" -RUN wget "https://download.open-mpi.org/release/open-mpi/v4.1/$OMPI_TARBALL_FILENAME" -P /opt/src && \ - mkdir /usr/src/mpi && \ - tar -xf "/opt/src/$OMPI_TARBALL_FILENAME" -C /usr/src/mpi --strip-components=1 && \ - cd /usr/src/mpi && \ - ./configure --prefix=/usr/local/mpi --with-cuda=/usr/local/cuda --with-slurm && \ - make -j all && \ - make install && \ - rm -rf "/opt/src/$OMPI_TARBALL_FILENAME" - -# Install TensorRT -FROM cuda-builder AS trt-builder -COPY backends/trtllm/scripts/install_tensorrt.sh /opt/install_tensorrt.sh -RUN chmod +x /opt/install_tensorrt.sh && \ - /opt/install_tensorrt.sh - -# Build Backend -FROM cuda-builder AS tgi-builder -WORKDIR /usr/src/text-generation-inference - -# Install Rust -RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | bash -s -- -y && \ - chmod -R a+w /root/.rustup && \ - chmod -R a+w /root/.cargo - -ENV PATH="/root/.cargo/bin:$PATH" -RUN cargo install cargo-chef - -COPY --from=trt-builder /usr/local/tensorrt /usr/local/tensorrt -COPY --from=mpi-builder /usr/local/mpi /usr/local/mpi - -ENV MPI_HOME=/usr/local/mpi diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json deleted file mode 100644 index 8b4eb89c8..000000000 --- a/.devcontainer/devcontainer.json +++ /dev/null @@ -1,19 +0,0 @@ -// For format details, see https://aka.ms/devcontainer.json. For config options, see the -// README at: https://github.com/devcontainers/templates/tree/main/src/cpp -{ - "name": "CUDA", - "build": { - "dockerfile": "Dockerfile_trtllm", - "context": ".." 
- }, - "remoteEnv": { - "PATH": "${containerEnv:PATH}:/usr/local/cuda/bin", - "LD_LIBRARY_PATH": "$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64", - "XLA_FLAGS": "--xla_gpu_cuda_data_dir=/usr/local/cuda" - }, - "customizations" : { - "jetbrains" : { - "backend" : "CLion" - } - } -} diff --git a/Dockerfile_trtllm b/Dockerfile_trtllm index e6a16eccd..ecefc048a 100644 --- a/Dockerfile_trtllm +++ b/Dockerfile_trtllm @@ -6,15 +6,19 @@ FROM lukemathwalker/cargo-chef:latest-rust-1.84.0 AS chef WORKDIR /usr/src/text-generation-inference/backends/trtllm FROM chef AS planner -COPY . . +COPY Cargo.lock Cargo.lock +COPY Cargo.toml Cargo.toml +COPY rust-toolchain.toml rust-toolchain.toml +COPY router router +COPY benchmark/ benchmark/ +COPY backends/ backends/ +COPY launcher/ launcher/ RUN cargo chef prepare --recipe-path recipe.json # CUDA dependent dependencies resolver stage FROM nvidia/cuda:12.6.3-cudnn-devel-ubuntu24.04 AS cuda-builder -RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ - --mount=type=cache,target=/var/lib/apt,sharing=locked \ - apt update && apt install -y \ +RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \ build-essential \ cmake \ curl \ @@ -31,7 +35,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ python3-dev \ python3-setuptools \ tar \ - wget && \ + wget --no-install-recommends && \ pipx ensurepath ENV TGI_INSTALL_PREFIX=/usr/local/tgi @@ -79,7 +83,11 @@ ENV CMAKE_PREFIX_PATH="/usr/local/mpi:/usr/local/tensorrt:$CMAKE_PREFIX_PATH" ENV LD_LIBRARY_PATH="/usr/local/mpi/lib:$LD_LIBRARY_PATH" ENV PKG_CONFIG_PATH="/usr/local/mpi/lib/pkgconfig:$PKG_CONFIG_PATH" -COPY . . +COPY Cargo.lock Cargo.lock +COPY Cargo.toml Cargo.toml +COPY rust-toolchain.toml rust-toolchain.toml +COPY router router +COPY backends/trtllm backends/trtllm COPY --from=trt-builder /usr/local/tensorrt /usr/local/tensorrt COPY --from=mpi-builder /usr/local/mpi /usr/local/mpi RUN mkdir $TGI_INSTALL_PREFIX && mkdir "$TGI_INSTALL_PREFIX/include" && mkdir "$TGI_INSTALL_PREFIX/lib" && \ diff --git a/server/text_generation_server/cli.py b/server/text_generation_server/cli.py index d8155b496..cc7726cd1 100644 --- a/server/text_generation_server/cli.py +++ b/server/text_generation_server/cli.py @@ -9,6 +9,8 @@ from enum import Enum from huggingface_hub import hf_hub_download from text_generation_server.utils.adapter import parse_lora_adapters +# Dummy change should cache hit. 
+ app = typer.Typer() From b980848abff7aa1bebcc31588f67b59b7afb7587 Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Tue, 21 Jan 2025 10:01:51 +0100 Subject: [PATCH 041/183] Flash Transformers modeling backend support (#2913) * add transformers_flash * inits * switch version to make it work * Update Makefile-flash-att-v2 * Update Makefile-flash-att-v2 * Update Makefile-flash-att-v2 * Update Makefile-flash-att-v2 * Update Makefile-flash-att-v2 * Update Makefile-flash-att-v2 * runnable version * working * push change * fix high dim * init * default * latest transformers changes * revert * simplify check * remove flag * improve type hints + required args * Update based on transformers PR * small fix * Remove Warpers for Processor * fix compatibility version issue * raise error if needed * Simplify with monkey patch * revert + style + minor improvements * update comment * device check * move the import to avoid device issue * Update __init__.py * check for non-native models * oupsi --------- Co-authored-by: System administrator --- .../layers/gptq/quantize.py | 15 +- .../text_generation_server/models/__init__.py | 67 +++-- .../models/transformers_flash_causal_lm.py | 266 ++++++++++++++++++ .../utils/logits_process.py | 13 +- 4 files changed, 330 insertions(+), 31 deletions(-) create mode 100644 server/text_generation_server/models/transformers_flash_causal_lm.py diff --git a/server/text_generation_server/layers/gptq/quantize.py b/server/text_generation_server/layers/gptq/quantize.py index 66fc15ec0..aa664ea60 100644 --- a/server/text_generation_server/layers/gptq/quantize.py +++ b/server/text_generation_server/layers/gptq/quantize.py @@ -956,15 +956,24 @@ def quantize( pack(model, quantizers, bits, groupsize) from safetensors.torch import save_file - from transformers.modeling_utils import shard_checkpoint + from huggingface_hub import split_torch_state_dict_into_shards state_dict = model.state_dict() state_dict = {k: v.cpu().contiguous() for k, v in state_dict.items()} max_shard_size = "10GB" - shards, index = shard_checkpoint( - state_dict, max_shard_size=max_shard_size, weights_name="model.safetensors" + state_dict_split = split_torch_state_dict_into_shards( + state_dict, + filename_pattern="model.safetensors", + max_shard_size=max_shard_size, ) + index = None + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + shards = state_dict_split.filename_to_tensors os.makedirs(output_dir, exist_ok=True) for shard_file, shard in shards.items(): save_file( diff --git a/server/text_generation_server/models/__init__.py b/server/text_generation_server/models/__init__.py index e2d24643e..3b437ce04 100644 --- a/server/text_generation_server/models/__init__.py +++ b/server/text_generation_server/models/__init__.py @@ -16,10 +16,12 @@ from transformers.models.auto import modeling_auto from huggingface_hub import hf_hub_download, HfApi from typing import Optional, List, Dict from pathlib import Path +import transformers from text_generation_server.utils.speculate import get_speculate, set_speculate from text_generation_server.models.model import Model from text_generation_server.models.causal_lm import CausalLM, CausalLMBatchKeysLast + from text_generation_server.models.custom_modeling.opt_modeling import OPTForCausalLM from text_generation_server.models.custom_modeling.mpt_modeling import ( MPTForCausalLM, @@ -178,6 +180,14 @@ except ImportError as e: if MAMBA_AVAILABLE: __all__.append(Mamba) 
+FLASH_TRANSFORMERS_BACKEND = True +try: + from text_generation_server.models.transformers_flash_causal_lm import ( + TransformersFlashCausalLM, + ) +except ImportError: + FLASH_TRANSFORMERS_BACKEND = False + class ModelType(enum.Enum): DEEPSEEK_V2 = { @@ -381,6 +391,21 @@ def get_model( ) model_type = config_dict.get("model_type", None) + transformers_causal_lm_class = CausalLM + + # Fast transformers path + transformers_model_class = getattr( + transformers, + modeling_auto.MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.get(model_type, ""), + None, + ) + if ( + FLASH_TRANSFORMERS_BACKEND + and transformers_model_class is not None + and transformers_model_class._supports_flex_attn + ): + transformers_causal_lm_class = TransformersFlashCausalLM + quantization_config = config_dict.get("quantization_config", None) if quantization_config is None: quantization_config = config_dict.get("compression_config", None) @@ -624,7 +649,7 @@ def get_model( FLASH_ATT_ERROR_MESSAGE.format("Sharded Deepseek V2") ) else: - return CausalLM.fallback( + return transformers_causal_lm_class.fallback( model_id, revision, quantize=quantize, @@ -683,7 +708,7 @@ def get_model( FLASH_ATT_ERROR_MESSAGE.format("Sharded Santacoder") ) else: - return CausalLM.fallback( + return transformers_causal_lm_class.fallback( model_id=model_id, revision=revision, quantize=quantize, @@ -731,7 +756,7 @@ def get_model( except RuntimeError as e: # Lots of legacy models with various weight names. log_master(logger.warning, f"Couldn't load flash gpt2 variant: {e}") - return CausalLM.fallback( + return transformers_causal_lm_class.fallback( model_id, revision, quantize=quantize, @@ -742,7 +767,7 @@ def get_model( elif sharded: raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Sharded GPT-2")) else: - return CausalLM.fallback( + return transformers_causal_lm_class.fallback( model_id, revision, quantize=quantize, @@ -767,7 +792,7 @@ def get_model( except RuntimeError as e: # Lots of legacy models with various weight names. 
log_master(logger.warning, f"Couldn't load flash gptj variant: {e}") - return CausalLM.fallback( + return transformers_causal_lm_class.fallback( model_id, revision, quantize=quantize, @@ -778,7 +803,7 @@ def get_model( elif sharded: raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Sharded GPT-J")) else: - return CausalLM.fallback( + return transformers_causal_lm_class.fallback( model_id, revision, quantize=quantize, @@ -815,7 +840,7 @@ def get_model( trust_remote_code=trust_remote_code, ) else: - return CausalLM.fallback( + return transformers_causal_lm_class.fallback( model_id, revision, quantize=quantize, @@ -838,7 +863,7 @@ def get_model( lora_adapter_ids=lora_adapter_ids, ) else: - return CausalLM.fallback( + return transformers_causal_lm_class.fallback( model_id, revision, quantize=quantize, @@ -862,7 +887,7 @@ def get_model( lora_adapter_ids=lora_adapter_ids, ) else: - return CausalLM.fallback( + return transformers_causal_lm_class.fallback( model_id, revision, quantize=quantize, @@ -911,7 +936,7 @@ def get_model( FLASH_ATT_ERROR_MESSAGE.format(f"Sharded {model_type}") ) else: - return CausalLM.fallback( + return transformers_causal_lm_class.fallback( model_id, revision, quantize=quantize, @@ -937,7 +962,7 @@ def get_model( elif sharded: raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Sharded Gemma")) else: - return CausalLM.fallback( + return transformers_causal_lm_class.fallback( model_id, revision, quantize=quantize, @@ -963,7 +988,7 @@ def get_model( elif sharded: raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Sharded Gemma2")) else: - return CausalLM.fallback( + return transformers_causal_lm_class.fallback( model_id, revision, quantize=quantize, @@ -988,7 +1013,7 @@ def get_model( elif sharded: raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Sharded Cohere")) else: - return CausalLM.fallback( + return transformers_causal_lm_class.fallback( model_id, revision, quantize=quantize, @@ -1016,7 +1041,7 @@ def get_model( elif sharded: raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Sharded DBRX")) else: - return CausalLM.fallback( + return transformers_causal_lm_class.fallback( model_id, revision, quantize=quantize, @@ -1066,7 +1091,7 @@ def get_model( config_class=RWConfig, ) else: - return CausalLM.fallback( + return transformers_causal_lm_class.fallback( model_id, revision, quantize=quantize, @@ -1091,7 +1116,7 @@ def get_model( elif sharded: raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Sharded Mistral")) else: - return CausalLM.fallback( + return transformers_causal_lm_class.fallback( model_id, revision, quantize=quantize, @@ -1116,7 +1141,7 @@ def get_model( elif sharded: raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Sharded Mixtral")) else: - return CausalLM.fallback( + return transformers_causal_lm_class.fallback( model_id, revision, quantize=quantize, @@ -1143,7 +1168,7 @@ def get_model( FLASH_ATT_ERROR_MESSAGE.format("Sharded Starcoder2") ) else: - return CausalLM.fallback( + return transformers_causal_lm_class.fallback( model_id, revision, quantize=quantize, @@ -1168,7 +1193,7 @@ def get_model( elif sharded: raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Sharded Qwen2")) else: - return CausalLM.fallback( + return transformers_causal_lm_class.fallback( model_id, revision, quantize=quantize, @@ -1329,7 +1354,7 @@ def get_model( elif quantize == "exl2": raise NotImplementedError("exl2 quantization is not supported for AutoModel") if model_type in modeling_auto.MODEL_FOR_CAUSAL_LM_MAPPING_NAMES: 
- return CausalLM.fallback( + return transformers_causal_lm_class.fallback( model_id, revision, quantize=quantize, @@ -1350,7 +1375,7 @@ def get_model( auto_map = config_dict.get("auto_map", None) if trust_remote_code and auto_map is not None: if "AutoModelForCausalLM" in auto_map.keys(): - return CausalLM.fallback( + return transformers_causal_lm_class.fallback( model_id, revision, quantize=quantize, diff --git a/server/text_generation_server/models/transformers_flash_causal_lm.py b/server/text_generation_server/models/transformers_flash_causal_lm.py new file mode 100644 index 000000000..647fabc21 --- /dev/null +++ b/server/text_generation_server/models/transformers_flash_causal_lm.py @@ -0,0 +1,266 @@ +import math +from typing import List, Optional + +import torch +from opentelemetry import trace +from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig +import transformers.modeling_utils + +from text_generation_server.models.flash_causal_lm import FlashCausalLM +from text_generation_server.utils import initialize_torch_distributed + +from text_generation_server.layers.attention import paged_attention, attention, Seqlen +from text_generation_server.layers.attention.kv_cache import KVScales, KVCache +from text_generation_server.models.globals import ATTENTION + + +tracer = trace.get_tracer(__name__) + + +def tgi_flash_attention_forward( + module, + query_states: torch.Tensor, + key_states: torch.Tensor, + value_states: torch.Tensor, + attention_mask: Optional[torch.Tensor], # This is a positional arg in Transformers + kv_cache: List[KVCache], + kv_head_mapping: torch.Tensor, + slots: torch.Tensor, + cu_seqlen_prefill: Optional[torch.Tensor], + seqlen: Seqlen, + block_tables: torch.Tensor, + max_s: int, + kv_scales: KVScales, + softmax_scale: Optional[float] = None, + sliding_window: Optional[int] = None, + softcap: Optional[float] = None, + **kwargs, # This is needed to "absorb" other args passed by Transformers modeling +): + + kv_cache = kv_cache[module.layer_idx] + + query_states = query_states.transpose(1, 2).squeeze(dim=0) + key_states = key_states.transpose(1, 2).squeeze(dim=0) + value_states = value_states.transpose(1, 2).squeeze(dim=0) + + # Take care of updating the cache in-place + kv_cache.store(key=key_states, value=value_states, slots=slots, kv_scales=kv_scales) + + _, num_heads, head_dim = query_states.shape + softmax_scale = 1 / math.sqrt(head_dim) if softmax_scale is None else softmax_scale + sliding_window = -1 if sliding_window is None else sliding_window + + if cu_seqlen_prefill is not None: + attn_output = attention( + query=query_states, + key=key_states, + value=value_states, + kv_cache=kv_cache, + kv_scales=kv_scales, + seqlen=seqlen, + block_tables=block_tables, + softmax_scale=softmax_scale, + window_size_left=sliding_window, + softcap=softcap, + ) + else: + attn_output = paged_attention( + query_states, + kv_cache, + kv_head_mapping, + softmax_scale, + block_tables, + seqlen, + max_s, + kv_scales=kv_scales, + softcap=softcap, + ) + + attn_output = attn_output.view(-1, num_heads * head_dim) + + return attn_output, None + + +transformers.modeling_utils.ALL_ATTENTION_FUNCTIONS["tgi"] = tgi_flash_attention_forward + + +class TransformersFlashCausalLM(FlashCausalLM): + def __init__( + self, + model_id: str, + revision: Optional[str] = None, + quantize: Optional[str] = None, + speculator: Optional[str] = None, + dtype: Optional[torch.dtype] = None, + default_dtype=torch.float16, + trust_remote_code: bool = False, + tokenizer_class=AutoTokenizer, + 
config_class=AutoConfig, + kv_cache_dtype: Optional[torch.dtype] = None, + ): + self.quantize = quantize + self.process_group, rank, world_size = initialize_torch_distributed() + + if speculator: + raise RuntimeError("Speculator decoding is not enabled for AutoModel") + + if torch.cuda.is_available(): + device = torch.device("cuda:0") + dtype = torch.float16 if dtype is None else dtype + elif hasattr(torch, "xpu") and torch.xpu.is_available(): + device = torch.device("xpu") + dtype = torch.float16 if dtype is None else dtype + else: + raise ValueError( + "Flash `Transformers` modeling backend is not available on cpu." + ) + + tokenizer = AutoTokenizer.from_pretrained( + model_id, + revision=revision, + padding_side="left", + truncation_side="left", + trust_remote_code=trust_remote_code, + ) + model = AutoModelForCausalLM.from_pretrained( + model_id, + revision=revision, + torch_dtype=dtype, + device_map="auto", + load_in_8bit=quantize == "bitsandbytes", + trust_remote_code=trust_remote_code, + attn_implementation="tgi", + tp_plan="auto" if world_size > 1 else None, + ) + + if tokenizer.pad_token_id is None: + if model.config.pad_token_id is not None: + tokenizer.pad_token_id = model.config.pad_token_id + elif model.config.eos_token_id is not None and isinstance( + model.config.eos_token_id, int + ): + tokenizer.pad_token_id = model.config.eos_token_id + elif tokenizer.eos_token_id is not None: + tokenizer.pad_token_id = tokenizer.eos_token_id + else: + tokenizer.add_special_tokens({"pad_token": "[PAD]"}) + + self.num_layers = model.config.num_hidden_layers + self.num_heads = model.config.num_attention_heads // self.process_group.size() + self.num_kv_heads = model.config.num_key_value_heads + self.num_kv_heads = ( + self.num_kv_heads // self.process_group.size() + if self.num_kv_heads > 1 + else self.num_kv_heads + ) + self.head_size = model.config.hidden_size // model.config.num_attention_heads + + self.cuda_graphs = {} + self.kv_cache = [] + self.kv_cache_dtype = dtype if kv_cache_dtype is None else kv_cache_dtype + + if ATTENTION == "flashinfer": + from text_generation_server.layers.attention.flashinfer import ( + create_prefill_state, + create_decode_state, + create_prefill_with_paged_kv_state, + ) + + self.prefill_state = create_prefill_state(device=device) + self.prefill_with_paged_kv_state = create_prefill_with_paged_kv_state( + device=device + ) + + self.decode_state = create_decode_state( + device=device, + num_heads=self.num_heads, + num_kv_heads=self.num_kv_heads, + ) + + self.num_groups = self.num_heads // self.num_kv_heads + + # Those will never change and will be used in the forwards + self.kv_head_mapping = torch.arange( + 0, self.num_kv_heads, dtype=torch.int32, device=device + ).repeat_interleave(self.num_groups) + # This means no scale + self.kv_scales = KVScales( + torch.tensor(1.0, device=device), + torch.tensor(1.0, device=device), + ) + + torch.distributed.barrier(group=self.process_group) + # Skip FlashCausalLM init. + super(FlashCausalLM, self).__init__( + model_id=model_id, + model=model, + tokenizer=tokenizer, + requires_padding=False, + dtype=dtype, + device=device, + rank=rank, + world_size=world_size, + ) + + # Monkey patch of `self.model.forward` to match `FlashCausalLM`. 
It avoids duplicating a lot of code + # We first copy the original model.forward because we still need it in the monkey patch + self.model.original_forward = self.model.forward + self.model.forward = self._model_forward + + @classmethod + def fallback( + cls, + model_id: str, + revision: Optional[str] = None, + quantize: Optional[str] = None, + speculator: Optional[str] = None, + dtype: Optional[torch.dtype] = None, + trust_remote_code: bool = False, + ): + return cls( + model_id=model_id, + revision=revision, + quantize=quantize, + speculator=speculator, + dtype=dtype, + trust_remote_code=trust_remote_code, + ) + + def _model_forward( + self, + input_ids: torch.Tensor, + position_ids: torch.Tensor, + cu_seqlen_prefill: Optional[torch.Tensor], + kv_cache: List[KVCache], + block_tables: torch.Tensor, + slots: torch.Tensor, + seqlen: Seqlen, + max_s: int, + lm_head_indices: Optional[torch.Tensor], + prefill_cache_indices=None, # not used, but passed to match original signature + adapter_data=None, # not supported, but passed to match original signature + ): + hidden_states = self.model.model.forward( + input_ids=input_ids.unsqueeze(0), # expand dim to fit Transformers + position_ids=position_ids.unsqueeze(0), # expand dim to fit Transformers + past_key_values=None, # we use self.kv_cache instead of transformers cache object + use_cache=False, # we use self.kv_cache instead of transformers cache object + return_dict=True, + cu_seqlen_prefill=cu_seqlen_prefill, + kv_cache=kv_cache, + block_tables=block_tables, + slots=slots, + seqlen=seqlen, + max_s=max_s, + kv_head_mapping=self.kv_head_mapping, + kv_scales=self.kv_scales, + )[0].squeeze(dim=0) + + # And compute logits from the lm_head, slicing correctly the indices + # NOTE: some logits post-processing (e.g. in gemma2) may be absent here with the split of the modules + # To update with full Transformers support asap + if lm_head_indices is not None: + hidden_states = hidden_states[lm_head_indices] + logits = self.model.lm_head.forward(hidden_states) + + return logits, None diff --git a/server/text_generation_server/utils/logits_process.py b/server/text_generation_server/utils/logits_process.py index 132e441be..64a285b93 100644 --- a/server/text_generation_server/utils/logits_process.py +++ b/server/text_generation_server/utils/logits_process.py @@ -5,13 +5,12 @@ import torch from typing import List, Optional, DefaultDict from loguru import logger -from typing import Dict, Union +from typing import Dict from text_generation_server.pb.generate_pb2 import GrammarType from outlines.fsm.guide import RegexGuide from transformers import ( - LogitsWarper, LogitsProcessor, PreTrainedTokenizerBase, TemperatureLogitsWarper, @@ -219,7 +218,7 @@ class HeterogeneousTemperatureLogitsWarper: return None -class HeterogeneousTopPLogitsWarper(LogitsWarper): +class HeterogeneousTopPLogitsWarper(LogitsProcessor): """ [`LogitsWarper`] that performs top-p, i.e. restricting to top tokens summing to prob_cut_off <= prob_cut_off. This version allows for a separate value for each sample and runs inplace when possible. @@ -278,7 +277,7 @@ class HeterogeneousTopPLogitsWarper(LogitsWarper): return None -class HeterogeneousTopKLogitsWarper(LogitsWarper): +class HeterogeneousTopKLogitsWarper(LogitsProcessor): r""" [`LogitsWarper`] that performs top-k, i.e. restricting to the k highest probability elements. This version allows for a separate value for each sample and runs inplace when possible. 
@@ -359,7 +358,7 @@ class HeterogeneousTopKLogitsWarper(LogitsWarper): return None -class HeterogeneousTypicalLogitsWarper(LogitsWarper): +class HeterogeneousTypicalLogitsWarper(LogitsProcessor): r""" [`LogitsWarper`] that performs typical decoding. See [Typical Decoding for Natural Language Generation](https://arxiv.org/abs/2202.00666) for more information. @@ -453,13 +452,13 @@ class HeterogeneousProcessorWrapper(LogitsProcessor): r""" A wrapper for logit warpers or processors without heterogeneous parameter support. Args: - processors (`Dict[int, Union[LogitsProcessor, LogitsWarper]]`): + processors (`Dict[int, LogitsProcessor]`): A mapping of sample indices to logit warpers or processors, to be run sequentially. """ def __init__( self, - processors: Dict[int, Union[LogitsProcessor, LogitsWarper]], + processors: Dict[int, LogitsProcessor], ): self.processors = processors From 17367438f3baa3bd5e7c2a763c312c2823b76493 Mon Sep 17 00:00:00 2001 From: Funtowicz Morgan Date: Tue, 21 Jan 2025 10:19:16 +0100 Subject: [PATCH 042/183] =?UTF-8?q?Give=20TensorRT-LLMa=20proper=20CI/CD?= =?UTF-8?q?=20=F0=9F=98=8D=20(#2886)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * test(ctest) enable address sanitizer * feat(trtllm): expose finish reason to Rust * feat(trtllm): fix logits retrieval * misc(ci): enabe building tensorrt-llm * misc(ci): update Rust action toolchain * misc(ci): let's try to build the Dockerfile for trtllm # Conflicts: # Dockerfile_trtllm * misc(ci): provide mecanism to cache inside container * misc(ci): export aws creds as output of step * misc(ci): let's try this way * misc(ci): again * misc(ci): again * misc(ci): add debug profile * misc(ci): add debug profile * misc(ci): lets actually use sccache ... * misc(ci): do not build with ssl enabled * misc(ci): WAT * misc(ci): WAT * misc(ci): WAT * misc(ci): WAT * misc(ci): WAT * misc(backend): test with TGI S3 conf * misc(backend): test with TGI S3 conf * misc(backend): once more? * misc(backend): let's try with GHA * misc(backend): missing env directive * misc(backend): make sure to correctly set IS_GHA_BUILD=true in wf * misc(backend): ok let's debug smtg * misc(backend): WWWWWWWWWWWWWAAAAAAAA * misc(backend): kthxbye retry s3 * misc(backend): use session token * misc(backend): add more info * misc(backend): lets try 1h30 * misc(backend): lets try 1h30 * misc(backend): increase to 2h * misc(backend): lets try... * misc(backend): lets try... * misc(backend): let's build for ci-runtime * misc(backend): let's add some more tooling * misc(backend): add some tags * misc(backend): disable Werror for now * misc(backend): added automatic gha detection * misc(backend): remove leak sanitizer which is included in asan * misc(backend): forward env * misc(backend): forward env * misc(backend): let's try * misc(backend): let's try * misc(backend): again * misc(backend): again * misc(backend): again * misc(backend): again * misc(backend): again * misc(backend): fix sscache -> sccache * misc(backend): fix sscache -> sccache * misc(backend): fix sscache -> sccache * misc(backend): let's actually cache things now * misc(backend): let's actually cache things now * misc(backend): attempt to run the testS? * misc(backend): attempt to run the tests? * misc(backend): attempt to run the tests? * change runner size * fix: Correctly tag docker images (#2878) * fix: Correctly tag docker images * fix: Correctly tag docker images * misc(llamacpp): maybe? * misc(llamacpp): maybe? * misc(llamacpp): maybe? 
* misc(ci): gogogo * misc(ci): gogogo * misc(ci): gogogo * misc(ci): gogogo * misc(ci): gogogo * misc(ci): gogogo * misc(ci): go * misc(ci): go * misc(ci): go * misc(ci): use bin folder * misc(ci): make the wf callable for reuse * misc(ci): make the wf callable for reuse (bis) * misc(ci): make the wf callable for reuse (bis) * misc(ci): give the wf a name * Create test-trtllm.yml * Update test-trtllm.yml * Create build-trtllm2 * Rename build-trtllm2 to 1-build-trtllm2 * Rename test-trtllm.yml to 1-test-trtllm2.yml * misc(ci): fw secrets * Update 1-test-trtllm2.yml * Rename 1-build-trtllm2 to 1-build-trtllm2.yml * Update 1-test-trtllm2.yml * misc(ci): use ci-build.yaml as main dispatcher * Delete .github/workflows/1-test-trtllm2.yml * Delete .github/workflows/1-build-trtllm2.yml * misc(ci): rights? * misc(ci): rights? * misc(ci): once more? * misc(ci): once more? * misc(ci): baby more time? * misc(ci): baby more time? * misc(ci): try the permission above again? * misc(ci): try the permission above again? * misc(ci): try the permission scoped again? * misc(ci): install tensorrt_llm_executor_static * misc(ci): attempt to rebuild with sccache? * misc(ci):run the tests on GPU instance * misc(ci): let's actually setup sccache in the build.rs * misc(ci): reintroduce variables * misc(ci): enforce sccache * misc(ci): correct right job name dependency * misc(ci): detect dev profile for debug * misc(ci): detect gha build * misc(ci): detect gha build * misc(ci): ok debug * misc(ci): wtf * misc(ci): wtf2 * misc(ci): wtf3 * misc(ci): use commit HEAD instead of merge commit for image id * misc(ci): wtfinfini * misc(ci): wtfinfini * misc(ci): KAMEHAMEHA * Merge TRTLLM in standard CI * misc(ci): remove input machine * misc(ci): missing id-token for AWS auth * misc(ci): missing id-token for AWS auth * misc(ci): missing id-token for AWS auth * misc(ci): again... * misc(ci): again... * misc(ci): again... * misc(ci): again... 
* misc(ci): missing benchmark * misc(ci): missing backends * misc(ci): missing launcher * misc(ci): give everything aws needs * misc(ci): give everything aws needs * misc(ci): fix warnings * misc(ci): attempt to fix sccache not building trtllm * misc(ci): attempt to fix sccache not building trtllm again --------- Co-authored-by: Guillaume LEGENDRE Co-authored-by: Hugo Larcher Co-authored-by: Pauline Bailly-Masson <155966238+paulinebm@users.noreply.github.com> --- .github/workflows/build.yaml | 54 +++++++++- .github/workflows/ci_build.yaml | 1 + Cargo.toml | 28 ++--- Dockerfile_trtllm | 95 +++++++++++------ backends/trtllm/CMakeLists.txt | 95 +++++++++++------ backends/trtllm/build.rs | 127 ++++++++++++++++++---- backends/trtllm/cmake/spdlog.cmake | 8 +- backends/trtllm/cmake/trtllm.cmake | 6 +- backends/trtllm/csrc/backend.cpp | 19 ++-- backends/trtllm/csrc/ffi.hpp | 129 ++++++++++++++--------- backends/trtllm/scripts/setup_sccache.py | 49 +++++++++ backends/trtllm/src/lib.rs | 35 ++++++ backends/trtllm/src/main.rs | 4 +- backends/trtllm/tests/test_backend.cpp | 18 ++-- 14 files changed, 492 insertions(+), 176 deletions(-) create mode 100644 backends/trtllm/scripts/setup_sccache.py diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index c43d8eb92..2e3fe7efe 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -31,15 +31,28 @@ jobs: group: ${{ github.workflow }}-build-and-push-image-${{ inputs.hardware }}-${{ github.head_ref || github.run_id }} cancel-in-progress: true runs-on: - group: aws-highmemory-32-plus-priv + group: aws-highmemory-64-plus-priv permissions: contents: write packages: write + id-token: write steps: - name: Checkout repository uses: actions/checkout@v4 - name: Inject slug/short variables uses: rlespinasse/github-slug-action@v4.4.1 + - name: Extract TensorRT-LLM version + run: | + echo "TENSORRT_LLM_VERSION=$(grep -oP '([a-z,0-9]{40})' $GITHUB_WORKSPACE/backends/trtllm/cmake/trtllm.cmake)" >> $GITHUB_ENV + echo "TensorRT-LLM version: ${{ env.TENSORRT_LLM_VERSION }}" + - name: "Configure AWS Credentials" + id: aws-creds + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: us-east-1 + role-to-assume: ${{ secrets.AWS_ROLE_GITHUB_TGI_TEST }} + role-duration-seconds: 7200 + output-credentials: true - name: Construct harware variables shell: bash run: | @@ -52,6 +65,7 @@ jobs: export runs_on="aws-g6-12xl-plus-priv-cache" export platform="" export extra_pytest="" + export target="nil" ;; cuda-trtllm) export dockerfile="Dockerfile_trtllm" @@ -61,6 +75,10 @@ jobs: export runs_on="ubuntu-latest" export platform="" export extra_pytest="" + export target="ci-runtime" + export sccache_s3_key_prefix="trtllm" + export sccache_region="us-east-1" + export build_type="dev" ;; rocm) export dockerfile="Dockerfile_amd" @@ -71,6 +89,7 @@ jobs: export runs_on="ubuntu-latest" export platform="" export extra_pytest="-k test_flash_gemma_gptq_load" + export target="nil" ;; intel-xpu) export dockerfile="Dockerfile_intel" @@ -80,6 +99,7 @@ jobs: export runs_on="ubuntu-latest" export platform="xpu" export extra_pytest="" + export target="nil" ;; intel-cpu) export dockerfile="Dockerfile_intel" @@ -90,6 +110,7 @@ jobs: export runs_on="aws-highmemory-32-plus-priv" export platform="cpu" export extra_pytest="-k test_flash_gemma_simple" + export target="nil" ;; esac echo $dockerfile @@ -106,6 +127,10 @@ jobs: echo "RUNS_ON=${runs_on}" >> $GITHUB_ENV echo "EXTRA_PYTEST=${extra_pytest}" >> $GITHUB_ENV echo REGISTRY_MIRROR=$REGISTRY_MIRROR >> 
$GITHUB_ENV + echo "TARGET=${target}" >> $GITHUB_ENV + echo "SCCACHE_S3_KEY_PREFIX=${sccache_s3_key_prefix}" >> $GITHUB_ENV + echo "SCCACHE_REGION=${sccache_region}" >> $GITHUB_ENV + echo "BUILD_TYPE=${build_type}" >> $GITHUB_ENV - name: Initialize Docker Buildx uses: docker/setup-buildx-action@v3 with: @@ -170,6 +195,14 @@ jobs: GIT_SHA=${{ env.GITHUB_SHA }} DOCKER_LABEL=sha-${{ env.GITHUB_SHA_SHORT }}${{ env.LABEL }} PLATFORM=${{ env.PLATFORM }} + build_type=${{ env.BUILD_TYPE }} + is_gha_build=true + aws_access_key_id=${{ steps.aws-creds.outputs.aws-access-key-id }} + aws_secret_access_key=${{ steps.aws-creds.outputs.aws-secret-access-key }} + aws_session_token=${{ steps.aws-creds.outputs.aws-session-token }} + sccache_bucket=${{ secrets.AWS_S3_BUCKET_GITHUB_TGI_TEST }} + sccache_s3_key_prefix=${{ env.SCCACHE_S3_KEY_PREFIX }} + sccache_region=${{ env.SCCACHE_REGION }} tags: ${{ steps.meta.outputs.tags || steps.meta-pr.outputs.tags }} labels: ${{ steps.meta.outputs.labels || steps.meta-pr.outputs.labels }} cache-from: type=s3,region=us-east-1,bucket=ci-docker-buildx-cache,name=text-generation-inference-cache${{ env.LABEL }},mode=min,access_key_id=${{ secrets.S3_CI_DOCKER_BUILDX_CACHE_ACCESS_KEY_ID }},secret_access_key=${{ secrets.S3_CI_DOCKER_BUILDX_CACHE_SECRET_ACCESS_KEY }},mode=min @@ -215,3 +248,22 @@ jobs: echo $DOCKER_IMAGE docker pull $DOCKER_IMAGE pytest -s -vv integration-tests ${PYTEST_FLAGS} ${EXTRA_PYTEST} + + backend_trtllm_cxx_tests: + needs: build-and-push + if: needs.build-and-push.outputs.label == '-trtllm' + concurrency: + group: ${{ github.workflow }}-${{ github.job }}-trtllm-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + runs-on: + group: aws-g6-12xl-plus-priv-cache + container: + image: ${{ needs.build-and-push.outputs.docker_image }} + credentials: + username: ${{ secrets.REGISTRY_USERNAME }} + password: ${{ secrets.REGISTRY_PASSWORD }} + options: --gpus all --shm-size=8g + + steps: + - name: Run C++/CUDA tests + run: /usr/local/tgi/bin/tgi_trtllm_backend_tests diff --git a/.github/workflows/ci_build.yaml b/.github/workflows/ci_build.yaml index 0d87cb29d..d8746b65a 100644 --- a/.github/workflows/ci_build.yaml +++ b/.github/workflows/ci_build.yaml @@ -42,6 +42,7 @@ jobs: permissions: contents: write packages: write + id-token: write with: hardware: ${{ matrix.hardware }} # https://github.com/actions/runner/issues/2206 diff --git a/Cargo.toml b/Cargo.toml index d81551533..9f49c9abe 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,21 +1,21 @@ [workspace] members = [ - "benchmark", - "backends/v2", - "backends/v3", - "backends/grpc-metadata", - "backends/trtllm", - "launcher", - "router" + "benchmark", + "backends/v2", + "backends/v3", + "backends/grpc-metadata", + "backends/trtllm", + "launcher", + "router" ] default-members = [ - "benchmark", - "backends/v2", - "backends/v3", - "backends/grpc-metadata", - # "backends/trtllm", - "launcher", - "router" + "benchmark", + "backends/v2", + "backends/v3", + "backends/grpc-metadata", + # "backends/trtllm", + "launcher", + "router" ] resolver = "2" diff --git a/Dockerfile_trtllm b/Dockerfile_trtllm index ecefc048a..40972764b 100644 --- a/Dockerfile_trtllm +++ b/Dockerfile_trtllm @@ -1,19 +1,7 @@ -ARG CUDA_ARCH_LIST="75-real;80-real;86-real;89-real;90-real" -ARG OMPI_VERSION="4.1.7rc1" - -# Build dependencies resolver stage -FROM lukemathwalker/cargo-chef:latest-rust-1.84.0 AS chef -WORKDIR /usr/src/text-generation-inference/backends/trtllm - -FROM chef AS planner -COPY Cargo.lock Cargo.lock -COPY Cargo.toml 
Cargo.toml -COPY rust-toolchain.toml rust-toolchain.toml -COPY router router -COPY benchmark/ benchmark/ -COPY backends/ backends/ -COPY launcher/ launcher/ -RUN cargo chef prepare --recipe-path recipe.json +ARG cuda_arch_list="75-real;80-real;86-real;89-real;90-real" +ARG ompi_version="4.1.7rc1" +ARG build_type=release +ARG is_gha_build=false # CUDA dependent dependencies resolver stage FROM nvidia/cuda:12.6.3-cudnn-devel-ubuntu24.04 AS cuda-builder @@ -26,8 +14,11 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \ g++-14 \ git \ git-lfs \ + lld \ libssl-dev \ libucx-dev \ + libasan8 \ + libubsan1 \ ninja-build \ pkg-config \ pipx \ @@ -43,9 +34,9 @@ ENV TENSORRT_INSTALL_PREFIX=/usr/local/tensorrt # Install OpenMPI FROM cuda-builder AS mpi-builder -ARG OMPI_VERSION +ARG ompi_version -ENV OMPI_TARBALL_FILENAME="openmpi-$OMPI_VERSION.tar.bz2" +ENV OMPI_TARBALL_FILENAME="openmpi-$ompi_version.tar.bz2" RUN wget "https://download.open-mpi.org/release/open-mpi/v4.1/$OMPI_TARBALL_FILENAME" -P /opt/src && \ mkdir /usr/src/mpi && \ tar -xf "/opt/src/$OMPI_TARBALL_FILENAME" -C /usr/src/mpi --strip-components=1 && \ @@ -65,34 +56,56 @@ RUN chmod +x /opt/install_tensorrt.sh && \ FROM cuda-builder AS tgi-builder WORKDIR /usr/src/text-generation-inference +# Scoped global args reuse +ARG is_gha_build +ARG build_type + # Install Rust +ENV PATH="/root/.cargo/bin:$PATH" RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | bash -s -- -y && \ chmod -R a+w /root/.rustup && \ - chmod -R a+w /root/.cargo + chmod -R a+w /root/.cargo && \ + cargo install sccache --locked -ENV PATH="/root/.cargo/bin:$PATH" -RUN cargo install cargo-chef +# SCCACHE Specifics args - before finding a better, more generic, way... +ARG aws_access_key_id +ARG aws_secret_access_key +ARG aws_session_token +ARG sccache_bucket +ARG sccache_s3_key_prefix +ARG sccache_region -# Cache dependencies -COPY --from=planner /usr/src/text-generation-inference/backends/trtllm/recipe.json . 
-RUN cargo chef cook --release --recipe-path recipe.json +ENV AWS_ACCESS_KEY_ID=$aws_access_key_id +ENV AWS_SECRET_ACCESS_KEY=$aws_secret_access_key +ENV AWS_SESSION_TOKEN=$aws_session_token +ENV SCCACHE_BUCKET=$sccache_bucket +ENV SCCACHE_S3_KEY_PREFIX=$sccache_s3_key_prefix +ENV SCCACHE_REGION=$sccache_region -# Build actual TGI -ARG CUDA_ARCH_LIST -ENV CMAKE_PREFIX_PATH="/usr/local/mpi:/usr/local/tensorrt:$CMAKE_PREFIX_PATH" ENV LD_LIBRARY_PATH="/usr/local/mpi/lib:$LD_LIBRARY_PATH" ENV PKG_CONFIG_PATH="/usr/local/mpi/lib/pkgconfig:$PKG_CONFIG_PATH" +ENV CMAKE_PREFIX_PATH="/usr/local/mpi:/usr/local/tensorrt:$CMAKE_PREFIX_PATH" + +ENV USE_LLD_LINKER=ON +ENV CUDA_ARCH_LIST=${cuda_arch_list} +ENV IS_GHA_BUILD=${is_gha_build} COPY Cargo.lock Cargo.lock COPY Cargo.toml Cargo.toml COPY rust-toolchain.toml rust-toolchain.toml COPY router router -COPY backends/trtllm backends/trtllm +COPY backends backends +COPY benchmark benchmark +COPY launcher launcher COPY --from=trt-builder /usr/local/tensorrt /usr/local/tensorrt COPY --from=mpi-builder /usr/local/mpi /usr/local/mpi + RUN mkdir $TGI_INSTALL_PREFIX && mkdir "$TGI_INSTALL_PREFIX/include" && mkdir "$TGI_INSTALL_PREFIX/lib" && \ - cd backends/trtllm && \ - CMAKE_INSTALL_PREFIX=$TGI_INSTALL_PREFIX cargo build --release + python3 backends/trtllm/scripts/setup_sccache.py --is-gha-build ${is_gha_build} && \ + CMAKE_INSTALL_PREFIX=$TGI_INSTALL_PREFIX \ + RUSTC_WRAPPER=sccache \ + cargo build --profile ${build_type} --package text-generation-backends-trtllm --bin text-generation-backends-trtllm && \ + sccache --show-stats FROM nvidia/cuda:12.6.3-cudnn-runtime-ubuntu24.04 AS runtime RUN apt update && apt install -y libucx0 pipx python3-minimal python3-dev python3-pip python3-venv && \ @@ -116,6 +129,28 @@ FROM runtime LABEL co.huggingface.vendor="Hugging Face Inc." 
LABEL org.opencontainers.image.authors="hardware@hf.co" +LABEL org.opencontainers.title="Text-Generation-Inference TensorRT-LLM Backend" ENTRYPOINT ["./text-generation-launcher"] CMD ["--executor-worker", "/usr/local/tgi/bin/executorWorker"] + +# This is used only for the CI/CD +FROM nvidia/cuda:12.6.3-cudnn-runtime-ubuntu24.04 AS ci-runtime +RUN apt update && apt install -y libasan8 libubsan1 libucx0 pipx python3-minimal python3-dev python3-pip python3-venv && \ + rm -rf /var/lib/{apt,dpkg,cache,log}/ && \ + pipx ensurepath && \ + pipx install --include-deps transformers tokenizers + +WORKDIR /usr/local/tgi/bin + +ENV PATH=/root/.local/share/pipx/venvs/transformers/bin/:$PATH +ENV LD_LIBRARY_PATH="/usr/local/tgi/lib:/usr/local/mpi/lib:/usr/local/tensorrt/lib:/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH" +ENV TOKENIZERS_PARALLELISM=false +ENV OMPI_MCA_plm_rsh_agent="" + +COPY --from=mpi-builder /usr/local/mpi /usr/local/mpi +COPY --from=trt-builder /usr/local/tensorrt /usr/local/tensorrt +COPY --from=tgi-builder /usr/local/tgi /usr/local/tgi + +# Basically we copy from target/debug instead of target/release +COPY --from=tgi-builder /usr/src/text-generation-inference/target/debug/text-generation-backends-trtllm /usr/local/tgi/bin/text-generation-launcher \ No newline at end of file diff --git a/backends/trtllm/CMakeLists.txt b/backends/trtllm/CMakeLists.txt index 9c1f34360..2cbd3a076 100644 --- a/backends/trtllm/CMakeLists.txt +++ b/backends/trtllm/CMakeLists.txt @@ -1,11 +1,18 @@ cmake_minimum_required(VERSION 3.20) -if (NOT DEFINED CMAKE_CXX_COMPILER_LAUNCHER AND CMAKE_BUILD_TYPE STREQUAL "Debug") +if (NOT DEFINED CMAKE_CXX_COMPILER_LAUNCHER) find_program(CCACHE_EXECUTABLE "ccache") if (CCACHE_EXECUTABLE) message(STATUS "Using ccache") - set(CMAKE_CXX_COMPILER_LAUNCHER "${CCACHE_EXECUTABLE}" CACHE PATH "Path to ccache" FORCE) + set(CMAKE_C_COMPILER_LAUNCHER "${CCACHE_EXECUTABLE}") + set(CMAKE_CXX_COMPILER_LAUNCHER "${CCACHE_EXECUTABLE}") + set(CMAKE_CUDA_COMPILER_LAUNCHER "${CCACHE_EXECUTABLE}") endif () +else () + message(STATUS "Using user specified cmake cxx compiler launcher: ${CMAKE_CXX_COMPILER_LAUNCHER}") + set(CMAKE_C_COMPILER_LAUNCHER "${CMAKE_CXX_COMPILER_LAUNCHER}") + set(CMAKE_CXX_COMPILER_LAUNCHER "${CMAKE_CXX_COMPILER_LAUNCHER}") + set(CMAKE_CUDA_COMPILER_LAUNCHER "${CMAKE_CXX_COMPILER_LAUNCHER}") endif () if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.24.0") @@ -21,28 +28,31 @@ include(CheckCXXCompilerFlag) option(TGI_TRTLLM_BACKEND_BUILD_TESTS "Enable building the unittests suite" OFF) option(TGI_TRTLLM_BACKEND_BUILD_EXAMPLES "Enable building the examples suite" OFF) +option(TGI_TRTLLM_BACKEND_BUILD_USE_LLD "Enable lld linker instead of ld" OFF) set(TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST "89-real" CACHE STRING "List of CUDA architectures to support") -set(TGI_TRTLLM_BACKEND_TRT_ROOT "/usr/local/tensorrt" CACHE STRING "Path where TensorRT libraries and headers are located") +set(TGI_TRTLLM_BACKEND_TRT_ROOT "/usr/local/tensorrt" CACHE STRING "Path rgo where TensorRT libraries and headers are located") set(TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR "${TGI_TRTLLM_BACKEND_TRT_ROOT}/include" CACHE STRING "Path where TensorRT headers are located") set(TGI_TRTLLM_BACKEND_TRT_LIB_DIR "${TGI_TRTLLM_BACKEND_TRT_ROOT}/lib" CACHE STRING "Path where TensorRT libraries are located") # We are using nvidia-ml to query at runtime device information to enable some architecture-specific features find_package(CUDAToolkit 12.6 REQUIRED COMPONENTS CUDA::cudart CUDA::nvml) +find_package(MPI REQUIRED) #### 
External dependencies #### include(cmake/json.cmake) include(cmake/spdlog.cmake) include(cmake/trtllm.cmake) -if(${CMAKE_BUILD_TYPE} STREQUAL "Debug") +if (CMAKE_BUILD_TYPE STREQUAL "Debug") + set(TGI_TRTLLM_BACKEND_DEBUG ON) add_compile_definitions(TGI_TRTLLM_BACKEND_DEBUG=1) -endif() + add_compile_definitions(SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_TRACE) +endif () -# This attempt to detect if the compiler can emit warning if it can't apply return value optimization from a function -check_cxx_compiler_flag("-Wnrvo" COMPILER_SUPPORT_WARNING_ON_NVRO) -if(${COMPILER_SUPPORT_WARNING_ON_NVRO}) - set(CMAKE_CXX_FLAGS "{CMAKE_CXX_FLAGS} -Wnvro") -endif() +if (${TGI_TRTLLM_BACKEND_BUILD_USE_LLD}) + message(STATUS "Using lld linker") + add_link_options("-fuse-ld=lld") +endif () # Let's build TRTLLM as part of CMake add_subdirectory("${trtllm_SOURCE_DIR}/cpp" "${trtllm_SOURCE_DIR}/..") @@ -55,51 +65,68 @@ add_library(tgi_trtllm_backend_impl STATIC csrc/hardware.hpp csrc/backend.hpp cs include_directories(${TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR}) target_include_directories(tgi_trtllm_backend_impl PRIVATE $ -# $ + # $ ) target_include_directories(tgi_trtllm_backend_impl PUBLIC "${trtllm_SOURCE_DIR}/cpp/include") target_link_libraries(tgi_trtllm_backend_impl PRIVATE CUDA::cudart CUDA::nvml) target_link_libraries(tgi_trtllm_backend_impl PUBLIC nlohmann_json::nlohmann_json spdlog::spdlog) - -if(${CMAKE_BUILD_TYPE} STREQUAL "Debug") - target_link_libraries(tgi_trtllm_backend_impl PRIVATE tensorrt_llm nvinfer_plugin_tensorrt_llm) -else() - target_link_libraries(tgi_trtllm_backend_impl PRIVATE tensorrt_llm nvinfer_plugin_tensorrt_llm tensorrt_llm_nvrtc_wrapperm) -endif () +target_link_libraries(tgi_trtllm_backend_impl PRIVATE tensorrt_llm nvinfer_plugin_tensorrt_llm tensorrt_llm_nvrtc_wrapper) # This install all the artifacts in CMAKE_INSTALL_PREFIX under include/ lib/ bin/ to make easy to link / find it back -install(TARGETS tgi_trtllm_backend_impl tensorrt_llm nvinfer_plugin_tensorrt_llm decoder_attention executorWorker) -install(FILES ${TRTLLM_NVRTC_WRAPPER_LIBRARY_PATH} ${TRTLLM_EXECUTOR_STATIC_LIBRARY_PATH} TYPE LIB) +install(TARGETS tgi_trtllm_backend_impl) +install(TARGETS tensorrt_llm nvinfer_plugin_tensorrt_llm decoder_attention executorWorker) +install(FILES ${TRTLLM_NVRTC_WRAPPER_LIBRARY_PATH} TYPE LIB) +if (NOT ${TGI_TRTLLM_BACKEND_DEBUG}) + install(FILES ${TRTLLM_EXECUTOR_STATIC_LIBRARY_PATH} TYPE LIB) +endif () + #### Unit Tests #### -if (${TGI_TRTLLM_BACKEND_BUILD_TESTS}) +if (${TGI_TRTLLM_BACKEND_BUILD_TESTS} AND CMAKE_BUILD_TYPE MATCHES "Debug") message(STATUS "Building tests") + option(TGI_TRTLLM_BACKEND_ENABLE_ASAN "Enable AddressSanitizer") + option(TGI_TRTLLM_BACKEND_ENABLE_UBSAN "Enable UndefinedSanitizer") + FetchContent_Declare( Catch2 URL https://github.com/catchorg/Catch2/archive/refs/tags/v3.7.1.tar.gz ) FetchContent_MakeAvailable(Catch2) + # This attempt to detect if the compiler can emit warning if it can't apply return value optimization from a function + check_cxx_compiler_flag("-Wnrvo" COMPILER_SUPPORT_WARNING_ON_NVRO) + if (${COMPILER_SUPPORT_WARNING_ON_NVRO}) + message(STATUS "Enabling non-NVRO detection") + target_compile_options(tgi_trtllm_backend_impl "-Wnvro") + endif () + + cmake_path(GET TRTLLM_NVRTC_WRAPPER_LIBRARY_PATH PARENT_PATH TRTLLM_NVRTC_WRAPPER_PARENT_LIBRARY_PATH) + message(STATUS "Adding linking path: ${TRTLLM_NVRTC_WRAPPER_PARENT_LIBRARY_PATH}") + add_executable(tgi_trtllm_backend_tests tests/test_hardware.cpp tests/test_backend.cpp) + + # 
target_compile_options(tgi_trtllm_backend_tests PRIVATE -Werror) + target_link_directories(tgi_trtllm_backend_tests PRIVATE "${TRTLLM_NVRTC_WRAPPER_PARENT_LIBRARY_PATH}") target_include_directories(tgi_trtllm_backend_tests PUBLIC "${trtllm_SOURCE_DIR}/cpp/include") target_include_directories(tgi_trtllm_backend_tests PUBLIC "csrc/") target_link_libraries(tgi_trtllm_backend_tests PRIVATE ${TRTLLM_LIBS} CUDA::cudart CUDA::nvml) target_link_libraries(tgi_trtllm_backend_tests PUBLIC Catch2::Catch2WithMain nlohmann_json::nlohmann_json spdlog::spdlog tgi_trtllm_backend_impl) + target_link_libraries(tgi_trtllm_backend_tests PRIVATE tensorrt_llm nvinfer_plugin_tensorrt_llm tensorrt_llm_nvrtc_wrapper) - if(${CMAKE_BUILD_TYPE} STREQUAL "Debug") - target_link_libraries(tgi_trtllm_backend_tests PRIVATE tensorrt_llm nvinfer_plugin_tensorrt_llm) - else() - target_link_libraries(tgi_trtllm_backend_tests PRIVATE tensorrt_llm nvinfer_plugin_tensorrt_llm tensorrt_llm_nvrtc_wrapperm) + if (${TGI_TRTLLM_BACKEND_ENABLE_ASAN}) + message(STATUS "Enabled AddressSanitizer") + target_link_options(tgi_trtllm_backend_tests BEFORE PUBLIC -fsanitize=address) endif () - if(CMAKE_BUILD_TYPE MATCHES "Debug") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Werror -fsanitize=undefined -fsanitize=address") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror -fsanitize=undefined -fsanitize=address") - target_link_options(tgi_trtllm_backend_tests BEFORE PUBLIC -fsanitize=undefined PUBLIC -fsanitize=address) - endif() + if (${TGI_TRTLLM_BACKEND_ENABLE_UBSAN}) + message(STATUS "Enabled UndefinedSanitizer") + target_link_options(tgi_trtllm_backend_tests BEFORE PUBLIC -fsanitize=undefined) + endif () - list(APPEND CMAKE_MODULE_PATH ${catch2_SOURCE_DIR}/extras) - include(CTest) - include(Catch) - catch_discover_tests(tgi_trtllm_backend_tests) -endif () + install(TARGETS tgi_trtllm_backend_tests) + + # list(APPEND CMAKE_MODULE_PATH ${catch2_SOURCE_DIR}/extras) + # include(CTest) + # include(Catch) + # catch_discover_tests(tgi_trtllm_backend_tests) +endif () \ No newline at end of file diff --git a/backends/trtllm/build.rs b/backends/trtllm/build.rs index d9c1aa15e..c18b13a94 100644 --- a/backends/trtllm/build.rs +++ b/backends/trtllm/build.rs @@ -3,6 +3,7 @@ use pkg_config; use std::env; use std::env::consts::ARCH; use std::path::{absolute, PathBuf}; +use std::sync::LazyLock; const ADDITIONAL_BACKEND_LINK_LIBRARIES: [&str; 1] = ["spdlog"]; const CUDA_ARCH_LIST: Option<&str> = option_env!("CUDA_ARCH_LIST"); @@ -12,12 +13,20 @@ const INSTALL_PREFIX: Option<&str> = option_env!("CMAKE_INSTALL_PREFIX"); const TENSORRT_ROOT_DIR: Option<&str> = option_env!("TENSORRT_ROOT_DIR"); const NCCL_ROOT_DIR: Option<&str> = option_env!("NCCL_ROOT_DIR"); +const IS_GHA_BUILD: LazyLock = LazyLock::new(|| { + option_env!("IS_GHA_BUILD").map_or(false, |value| match value.to_lowercase().as_str() { + "on" => true, + "true" => true, + "1" => true, + _ => false, + }) +}); + // Dependencies -const BACKEND_DEPS: [&str; 2] = ["tgi_trtllm_backend_impl", "tgi_trtllm_backend"]; +const BACKEND_DEPS: &str = "tgi_trtllm_backend_impl"; const CUDA_TRANSITIVE_DEPS: [&str; 4] = ["cuda", "cudart", "cublas", "nvidia-ml"]; -const TENSORRT_LLM_TRANSITIVE_DEPS: [(&str, &str); 5] = [ +const TENSORRT_LLM_TRANSITIVE_DEPS: [(&str, &str); 4] = [ ("dylib", "tensorrt_llm"), - ("static", "tensorrt_llm_executor_static"), ("dylib", "tensorrt_llm_nvrtc_wrapper"), ("dylib", "nvinfer_plugin_tensorrt_llm"), ("dylib", "decoder_attention"), @@ -32,6 +41,48 @@ macro_rules! 
probe { }; } +fn get_compiler_flag( + switch: bool, + true_case: &'static str, + false_case: &'static str, +) -> &'static str { + match switch { + true => true_case, + false => false_case, + } +} + +fn get_library_architecture() -> &'static str { + let os = env::var("CARGO_CFG_TARGET_OS").unwrap(); + let arch = env::var("CARGO_CFG_TARGET_ARCH").unwrap(); + let env = env::var("CARGO_CFG_TARGET_ENV").unwrap(); + + match os.as_str() { + "linux" => { + if env != "gnu" { + panic!("unsupported linux ABI {env}, only 'gnu' is supported") + } + + match arch.as_str() { + "x86_64" => "x86_64-linux-gnu", + "aarch64" => "aarch64-linux-gnu", + _ => panic!("unsupported linux architecture {arch}"), + } + } + "windows" => { + if env != "msvc" { + panic!("unsupported windows ABI {env}, only 'msvc' is supported") + } + + match arch.as_str() { + "x86_64" => "x86_64-windows-msvc", + _ => panic!("unsupported windows architecture {arch}"), + } + } + _ => panic!("unsupported OS {os}"), + } +} + fn build_backend(is_debug: bool, opt_level: &str, out_dir: &PathBuf) -> (PathBuf, PathBuf) { // Build the backend implementation through CMake let install_path = INSTALL_PREFIX.unwrap_or("/usr/local/tgi"); @@ -54,10 +105,45 @@ fn build_backend(is_debug: bool, opt_level: &str, out_dir: &PathBuf) -> (PathBuf .env("OPT_LEVEL", opt_level) .define("CMAKE_INSTALL_PREFIX", &install_path) .define("CMAKE_CUDA_COMPILER", "/usr/local/cuda/bin/nvcc") - .define("Python3_ROOT_DIR", "../venv") + .define("CMAKE_LIBRARY_ARCHITECTURE", get_library_architecture()) .define("TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST", cuda_arch_list) + .define( + "TGI_TRTLLM_BACKEND_DEBUG", + get_compiler_flag(is_debug, "ON", "OFF"), + ) .define("TGI_TRTLLM_BACKEND_TRT_ROOT", tensorrt_path); + if is_debug || *IS_GHA_BUILD { + config.define("TGI_TRTLLM_BACKEND_BUILD_TESTS", "ON"); + } + + if option_env!("USE_LLD_LINKER").is_some() { + println!("cargo:warning=Using lld linker"); + config.define("TGI_TRTLLM_BACKEND_BUILD_USE_LLD", "ON"); + } + + if (is_debug && option_env!("ENABLE_ASAN").is_some()) || *IS_GHA_BUILD { + println!("cargo:warning=Enabling Address Sanitizer"); + config.define("TGI_TRTLLM_BACKEND_ENABLE_ASAN", "ON"); + } + + if (is_debug && option_env!("ENABLE_UBSAN").is_some()) || *IS_GHA_BUILD { + println!("cargo:warning=Enabling Undefined Sanitizer"); + config.define("TGI_TRTLLM_BACKEND_ENABLE_UBSAN", "ON"); + } + + if let Some(nvcc_host_compiler) = option_env!("CMAKE_CUDA_HOST_COMPILER") { + config.define("CMAKE_CUDA_HOST_COMPILER", nvcc_host_compiler); + } + + if let Some(wrapper) = option_env!("RUSTC_WRAPPER") { + println!("cargo:warning=Using caching tool: {wrapper}"); + + env::set_var("CMAKE_C_COMPILER_LAUNCHER", wrapper); + env::set_var("CMAKE_CXX_COMPILER_LAUNCHER", wrapper); + env::set_var("CMAKE_CUDA_COMPILER_LAUNCHER", wrapper); + } + // Allow to override which Python to use ... 
if let Some(python3) = option_env!("Python3_EXECUTABLE") { config.define("Python3_EXECUTABLE", python3); @@ -78,23 +164,18 @@ fn build_backend(is_debug: bool, opt_level: &str, out_dir: &PathBuf) -> (PathBuf } // Emit linkage information from the artifacts we just built - let install_lib_path = install_path.join("lib"); - - println!( - r"cargo:warning=Adding link search path: {}", - install_lib_path.display() - ); - println!(r"cargo:rustc-link-search={}", install_lib_path.display()); - + for path in ["lib", "lib64"] { + let install_lib_path = install_path.join(path); + println!( + r"cargo:warning=Adding link search path: {}", + install_lib_path.display() + ); + println!(r"cargo:rustc-link-search={}", install_lib_path.display()); + } (PathBuf::from(install_path), deps_folder) } fn build_ffi_layer(deps_folder: &PathBuf, is_debug: bool) { - let ndebug = match is_debug { - true => "1", - false => "0", - }; - CFG.include_prefix = "backends/trtllm"; cxx_build::bridge("src/lib.rs") .static_flag(true) @@ -106,7 +187,10 @@ fn build_ffi_layer(deps_folder: &PathBuf, is_debug: bool) { .include("/usr/local/tensorrt/include") .include("csrc/") .file("csrc/ffi.hpp") - .define("TGI_TRTLLM_BACKEND_DEBUG", ndebug) + .define( + "TGI_TRTLLM_BACKEND_DEBUG", + get_compiler_flag(is_debug, "ON", "OFF"), + ) .compile("tgi_trtllm_backend"); println!("cargo:rerun-if-changed=CMakeLists.txt"); @@ -125,6 +209,7 @@ fn main() { let build_profile = env::var("PROFILE").unwrap(); let (is_debug, opt_level) = match build_profile.as_ref() { "debug" => (true, "0"), + "dev" => (true, "0"), _ => (false, "3"), }; @@ -161,7 +246,5 @@ fn main() { }); // Backend - BACKEND_DEPS.iter().for_each(|name| { - println!("cargo:rustc-link-lib=static={}", name); - }); -} + println!("cargo:rustc-link-lib=static={}", &BACKEND_DEPS); +} \ No newline at end of file diff --git a/backends/trtllm/cmake/spdlog.cmake b/backends/trtllm/cmake/spdlog.cmake index 45e6790a8..e7566cd73 100644 --- a/backends/trtllm/cmake/spdlog.cmake +++ b/backends/trtllm/cmake/spdlog.cmake @@ -4,14 +4,14 @@ set(SPDLOG_FMT_EXTERNAL OFF) # Define the level at which SPDLOG_ compilation level is defined if (${CMAKE_BUILD_TYPE} STREQUAL "Debug") - add_compile_definitions(SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG) + add_compile_definitions(SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_TRACE) else () - add_compile_definitions(SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_INFO) + add_compile_definitions(SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG) endif () fetchcontent_declare( spdlog -# DOWNLOAD_EXTRACT_TIMESTAMP - URL https://github.com/gabime/spdlog/archive/refs/tags/v1.14.1.tar.gz + # DOWNLOAD_EXTRACT_TIMESTAMP + URL https://github.com/gabime/spdlog/archive/refs/tags/v1.15.0.tar.gz ) fetchcontent_makeavailable(spdlog) diff --git a/backends/trtllm/cmake/trtllm.cmake b/backends/trtllm/cmake/trtllm.cmake index 4217892b1..d789b1ebc 100644 --- a/backends/trtllm/cmake/trtllm.cmake +++ b/backends/trtllm/cmake/trtllm.cmake @@ -14,11 +14,13 @@ message(STATUS "Building for CUDA Architectures: ${CMAKE_CUDA_ARCHITECTURES}") set(ENABLE_UCX OFF) if (${CMAKE_BUILD_TYPE} STREQUAL "Debug") set(FAST_BUILD ON) - set(NVTX_DISABLE OFF) + set(NVTX_DISABLE ON) + set(INDEX_RANGE_CHECK ON) else () set(FAST_BUILD OFF) set(FAST_MATH ON) - set(NVTX_DISABLE ON) + set(NVTX_DISABLE OFF) + set(INDEX_RANGE_CHECK OFF) endif () find_package(Python3 REQUIRED Interpreter) diff --git a/backends/trtllm/csrc/backend.cpp b/backends/trtllm/csrc/backend.cpp index b50044d87..2151466be 100644 --- a/backends/trtllm/csrc/backend.cpp +++ 
b/backends/trtllm/csrc/backend.cpp @@ -1,7 +1,6 @@ #include #include -#include #include "backend.hpp" #include "hardware.hpp" @@ -17,7 +16,8 @@ namespace huggingface::tgi::backends::trtllm { if (world_size > 1) { SPDLOG_INFO("Detected sharded engine deployment, using orchestrator mode"); mode = tle::CommunicationMode::kORCHESTRATOR; - orchestratorConfig = std::make_optional(true, executor_worker_path_, nullptr, true); + orchestratorConfig = std::make_optional(true, executor_worker_path_, nullptr, + true); } else { SPDLOG_INFO("Detected single engine deployment, using leader mode"); } @@ -44,21 +44,22 @@ namespace huggingface::tgi::backends::trtllm { } backend_t::backend_t(std::filesystem::path &engines_folder, std::filesystem::path &executor_worker_path) - : workspace(engines_folder, executor_worker_path), executor_(executor_factory_initializer(workspace)) {} + : workspace(engines_folder, executor_worker_path), executor_(executor_factory_initializer(workspace)) {} size_t backend_t::num_tokens_ready() const noexcept { return executor_.getNumResponsesReady(); } std::expected - backend_t::submit(std::span token_ids, const generation_params_t generation_params, const sampling_params_t sampling_params) noexcept { - SPDLOG_DEBUG("Submitting {:d} tokens to the executor for scheduling ({}, {})", token_ids.size(), generation_params, sampling_params); - return executor_.enqueueRequest(tle::Request { + backend_t::submit(std::span token_ids, const generation_params_t g_params, + const sampling_params_t s_params) noexcept { + SPDLOG_DEBUG("Submit {:d} tokens for scheduling ({}, {})", token_ids.size(), g_params, s_params); + return executor_.enqueueRequest(tle::Request{ {token_ids.begin(), token_ids.end()}, // Making actual copy of the tokens - static_cast(generation_params.max_new_tokens), + static_cast(g_params.max_new_tokens), true, - (tle::SamplingConfig) sampling_params, - tle::OutputConfig { /* returnLogProbs= */ true }, + (tle::SamplingConfig) s_params, + tle::OutputConfig{ /* returnLogProbs= */ true}, std::nullopt, std::nullopt, std::nullopt, diff --git a/backends/trtllm/csrc/ffi.hpp b/backends/trtllm/csrc/ffi.hpp index d0342d4bb..840614bbc 100644 --- a/backends/trtllm/csrc/ffi.hpp +++ b/backends/trtllm/csrc/ffi.hpp @@ -28,20 +28,62 @@ namespace huggingface::tgi::backends::trtllm { #include "backends/trtllm/src/lib.rs.h" + namespace huggingface::tgi::backends::trtllm { std::once_flag backend_initialized_flag; + constexpr finish_reason_t as_finish_reason_t(const tle::FinishReason reason) noexcept { + switch (reason) { + case tle::FinishReason::kNOT_FINISHED: + return finish_reason_t::kNOT_FINISHED; + case tle::FinishReason::kSTOP_WORDS: + return finish_reason_t::kSTOP_WORDS; + case tle::FinishReason::kEND_ID: + return finish_reason_t::kEND_ID; + case tle::FinishReason::kLENGTH: + return finish_reason_t::kLENGTH; + default: + std::unreachable(); + } + } + + static auto as_generation_step = [](const tle::Response &r) { + const auto reqId = r.getRequestId(); + if (!r.hasError()) [[likely]] { + const auto result = r.getResult(); + const auto logits = result.logProbs.value()[0]; + return generation_step_t{ + reqId, + static_cast(result.outputTokenIds[0][0]), + logits.back(), + result.isFinal, + as_finish_reason_t(result.finishReasons[0]), + false, + std::string() + }; + } else { + return generation_step_t{ + reqId, + 0, + 0.0, + true, + finish_reason_t::kNOT_FINISHED, + true, + std::move(r.getErrorMsg()) + }; + } + }; + + class tensorrt_llm_backend_t { private: backend_t inner_; public: 
tensorrt_llm_backend_t(std::filesystem::path &&engine_folder, std::filesystem::path &&executor_worker_path) - : inner_(engine_folder, executor_worker_path) {} + : inner_(engine_folder, executor_worker_path) {} - size_t num_tokens_ready() const noexcept { - return inner_.num_tokens_ready(); - } + size_t num_tokens_ready() const noexcept { return inner_.num_tokens_ready(); } request_id_t submit( rust::Slice tokens, @@ -59,13 +101,13 @@ namespace huggingface::tgi::backends::trtllm { // Submit the request to the executor and get back a potential request_id used to track request status const auto signed_tokens = std::vector(tokens.begin(), tokens.end()); const auto maybe_request_id = inner_.submit( - signed_tokens, - {max_new_tokens}, - {top_k, top_p, repetition_penalty, frequency_penalty, temperature, seed} + signed_tokens, + {max_new_tokens}, + {top_k, top_p, repetition_penalty, frequency_penalty, temperature, seed} ); // If we do have a value, let's return the request_id - if(maybe_request_id.has_value()) [[likely]] { + if (maybe_request_id.has_value()) [[likely]] { return *maybe_request_id; } else { SPDLOG_WARN("[FFI] Failed to submit request to the executor"); @@ -74,61 +116,45 @@ namespace huggingface::tgi::backends::trtllm { } std::unique_ptr> pull_tokens() noexcept { - if(num_tokens_ready() > 0) [[likely]] { + if (num_tokens_ready() > 0) [[likely]] { const auto responses = inner_.pull_tokens(); SPDLOG_TRACE("[FFI] Successfully pulled out {:d} responses from executor", responses.size()); - // Transform tle::Response to GenerationStep - auto steps = std::make_unique>(); - std::ranges::transform(responses.begin(), responses.end(), std::back_inserter(*steps), [](const tle::Response &r) { - const auto reqId = r.getRequestId(); - if (!r.hasError()) [[likely]] { - const auto result = r.getResult(); - return generation_step_t{ - reqId, - static_cast(result.outputTokenIds[0][0]), - result.logProbs.value()[0][0], - result.isFinal, - false, - std::string() - }; - } else { - return generation_step_t{ - reqId, - 0, - 0.0, - true, - true, - std::move(r.getErrorMsg()) - }; - } - }); - return steps; + + // Transform tle::Response to generation_step_t +#ifdef __cpp_lib_ranges_to_container + auto steps = responses | std::views::transform(as_generation_step) | std::ranges::to(); +#else + auto steps = std::vector(); + steps.reserve(responses.size()); + std::transform(responses.begin(), responses.end(), std::back_inserter(steps), as_generation_step); +#endif + return std::make_unique>(steps); } else { return std::make_unique>(); } } - void cancel(request_id_t requestId) noexcept { - SPDLOG_DEBUG("[FFI] cancelling request {:d}", requestId); - inner_.cancel(requestId); + void cancel(request_id_t request_id) noexcept { + SPDLOG_DEBUG("[FFI] cancelling request {:d}", request_id); + inner_.cancel(request_id); } }; void initialize_logging() { #ifndef TGI_TRTLLM_BACKEND_DEBUG if (const auto TRTLLM_LOG_LEVEL_CSTR = std::getenv("TRTLLM_LOG_LEVEL")) { - std::string log_level(TRTLLM_LOG_LEVEL_CSTR); - std::transform(log_level.begin(), log_level.end(), log_level.begin(), [](unsigned char c) { - return std::tolower(c); - }); + std::string log_level(TRTLLM_LOG_LEVEL_CSTR); + std::transform(log_level.begin(), log_level.end(), log_level.begin(), [](unsigned char c) { + return std::tolower(c); + }); - if (log_level == "debug") - spdlog::set_level(spdlog::level::debug); - else - spdlog::set_level(spdlog::level::info); - } + if (log_level == "debug") + spdlog::set_level(spdlog::level::debug); + else + 
spdlog::set_level(spdlog::level::info); + } #else spdlog::set_level(spdlog::level::debug); #endif @@ -151,11 +177,14 @@ namespace huggingface::tgi::backends::trtllm { } } - std::unique_ptr create_backend_from_engine_folder(const rust::Str engines_folder, const rust::Str executor_worker_path) { + std::unique_ptr + create_backend_from_engine_folder(const rust::Str engines_folder, const rust::Str executor_worker_path) { std::call_once(backend_initialized_flag, initialize_tensorrt_llm_backend); return std::make_unique( - std::filesystem::path(std::string_view(engines_folder.begin(), engines_folder.end()), std::filesystem::path::format::auto_format), - std::filesystem::path(std::string_view(executor_worker_path.begin(), executor_worker_path.end()), std::filesystem::path::format::auto_format) + std::filesystem::path(std::string_view(engines_folder.begin(), engines_folder.end()), + std::filesystem::path::format::auto_format), + std::filesystem::path(std::string_view(executor_worker_path.begin(), executor_worker_path.end()), + std::filesystem::path::format::auto_format) ); } } diff --git a/backends/trtllm/scripts/setup_sccache.py b/backends/trtllm/scripts/setup_sccache.py new file mode 100644 index 000000000..982f8c77d --- /dev/null +++ b/backends/trtllm/scripts/setup_sccache.py @@ -0,0 +1,49 @@ +from argparse import ArgumentParser + +AWS_S3_CACHING_VARIABLES = { + "AWS_ACCESS_KEY_ID": "aws_access_key_id", + "AWS_SECRET_ACCESS_KEY": "aws_secret_access_key", + "AWS_SESSION_TOKEN": "aws_session_token", + "SCCACHE_REGION": "s3_region", + "SCCACHE_BUCKET": "s3_bucket_name", +} + +ALL_CACHING_STORAGE_VARIABLES = { + "AWS_S3_CACHING_VARIABLES" +} + + +def setup_sccache_locally(): + from os import environ + + print("Setting up Local Caching Layer") + for target in ALL_CACHING_STORAGE_VARIABLES: + for envvar in globals()[target].keys(): + if envvar in environ: + print(f"Deleted {envvar} from environment variables") + del environ[envvar] + + +def setup_sccache_for_s3(): + from os import environ + + print("Setting up AWS S3 Caching Layer") + for envvar in AWS_S3_CACHING_VARIABLES.keys(): + if not envvar in environ or not environ[envvar] or len(environ[envvar]) == 0: + print(f"Missing definition for environment variable {envvar}") + + +if __name__ == "__main__": + parser = ArgumentParser("TensorRT-LLM Build Caching Setup") + + parser.add_argument("--is-gha-build", type=str, default="FALSE", + help="Indicate if the build is from Github Actions") + + # Parse args + args = parser.parse_args() + args.is_gha_build = args.is_gha_build.lower() in {"on", "true", "1"} + + if args.is_gha_build: + setup_sccache_for_s3() + else: + setup_sccache_locally() diff --git a/backends/trtllm/src/lib.rs b/backends/trtllm/src/lib.rs index d6acafa1a..085072561 100644 --- a/backends/trtllm/src/lib.rs +++ b/backends/trtllm/src/lib.rs @@ -6,6 +6,26 @@ mod utils; #[cxx::bridge(namespace = "huggingface::tgi::backends::trtllm")] mod ffi { + #[cxx_name = "finish_reason_t"] + #[derive(Debug, Clone, Copy)] + pub enum FinishReason { + /// The request is not finished. + #[cxx_name = "kNOT_FINISHED"] + NotFinished = 0u8, + + /// The request finished because the end id was generated. + #[cxx_name = "kEND_ID"] + EndTokenId = 1u8, + + /// The request finished because a stop word was generated. + #[cxx_name = "kSTOP_WORDS"] + StopWords = 2u8, + + /// The request finished because the maximum number of tokens was reached. 
+ #[cxx_name = "kLENGTH"] + MaxLength = 3u8, + } + /// Struct used as shared type between rust and C++ to represent the result /// of a single decoding iteration #[cxx_name = "generation_step_t"] @@ -15,6 +35,7 @@ mod ffi { token_id: u32, log_prob: f32, is_final: bool, + finish_reason: FinishReason, has_error: bool, error_msg: String, } @@ -66,3 +87,17 @@ mod ffi { fn cancel(self: Pin<&mut TensorRtLlmBackendImpl>, request_id: u64); } } + +use ffi::FinishReason; +use text_generation_router::FinishReason as InferFinishReason; + +impl From for InferFinishReason { + fn from(reason: FinishReason) -> Self { + match reason { + FinishReason::StopWords => InferFinishReason::StopSequence, + FinishReason::MaxLength => InferFinishReason::Length, + FinishReason::EndTokenId => InferFinishReason::EndOfSequenceToken, + _ => panic!("Cannot convert {reason:?} to text_generation_router::FinishReason"), + } + } +} diff --git a/backends/trtllm/src/main.rs b/backends/trtllm/src/main.rs index af299f7d0..5af96adea 100644 --- a/backends/trtllm/src/main.rs +++ b/backends/trtllm/src/main.rs @@ -11,7 +11,7 @@ use text_generation_router::server::{ get_hub_model_info, legacy_tokenizer_handle, py_resolve_tokenizer, }; use text_generation_router::usage_stats::UsageStatsLevel; -use text_generation_router::{server, HubTokenizerConfig, Tokenizer}; +use text_generation_router::{server, Tokenizer}; /// App Configuration #[derive(Parser, Debug)] @@ -69,7 +69,7 @@ struct Args { async fn get_tokenizer( tokenizer_name: &str, - tokenizer_config_path: Option<&str>, + _tokenizer_config_path: Option<&str>, revision: Option<&str>, ) -> Option { // Parse Huggingface hub token diff --git a/backends/trtllm/tests/test_backend.cpp b/backends/trtllm/tests/test_backend.cpp index 14d92b754..f44cc03f9 100644 --- a/backends/trtllm/tests/test_backend.cpp +++ b/backends/trtllm/tests/test_backend.cpp @@ -8,13 +8,13 @@ #include "backend.hpp" - - using namespace huggingface::tgi::backends::trtllm; TEST_CASE("parse generation_config.json all set", "[generation_config_t]") { - const json config_j = {{"temperature", 0.6}, {"top_p", 0.95}, {"eos_token_id", {1,2,3}}}; + const json config_j = {{"temperature", 0.6}, + {"top_p", 0.95}, + {"eos_token_id", {1, 2, 3}}}; const auto generation_config = generation_config_t(config_j); REQUIRE_THAT(generation_config.temperature, Catch::Matchers::WithinAbs(0.6, 1e-6)); @@ -24,8 +24,9 @@ TEST_CASE("parse generation_config.json all set", "[generation_config_t]") REQUIRE_FALSE(generation_config.stop_words.empty()); REQUIRE(generation_config.stop_words.size() == config_j["/eos_token_id"_json_pointer].size()); - for (auto [lhs, rhs] : std::views::zip(generation_config.stop_words, std::list>{{1}, {2}, {3}})) - { + for (auto [lhs, rhs]: std::views::zip(generation_config.stop_words, std::list>{{1}, + {2}, + {3}})) { // Currently we do not support multi-tokens stop words REQUIRE(lhs.size() == 1); REQUIRE(rhs.size() == 1); @@ -35,7 +36,7 @@ TEST_CASE("parse generation_config.json all set", "[generation_config_t]") TEST_CASE("parse generation_config.json default", "[generation_config_t]") { - const json config_j = {{"eos_token_id", {1,2,3}}}; + const json config_j = {{"eos_token_id", {1, 2, 3}}}; const auto generation_config = generation_config_t(config_j); REQUIRE_THAT(generation_config.temperature, Catch::Matchers::WithinAbs(1.0, 1e-6)); @@ -44,8 +45,9 @@ TEST_CASE("parse generation_config.json default", "[generation_config_t]") REQUIRE_FALSE(generation_config.stop_words.empty()); REQUIRE(generation_config.stop_words.size() 
== config_j["/eos_token_id"_json_pointer].size()); - for (auto [lhs, rhs] : std::views::zip(generation_config.stop_words, std::list>{{1}, {2}, {3}})) - { + for (auto [lhs, rhs]: std::views::zip(generation_config.stop_words, std::list>{{1}, + {2}, + {3}})) { // Currently we do not support multi-tokens stop words REQUIRE(lhs.size() == 1); REQUIRE(rhs.size() == 1); From bdb3e488e4795953eca16e1b682429759a1cdd31 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Tue, 21 Jan 2025 11:06:10 +0100 Subject: [PATCH 043/183] Trying to avoid the random timeout. (#2929) * Trying to avoid the random timeout. * More read timeout ? * Longer timeout ? * Remove legacy ENV directive. * Remove the dummy test, only increase the read timeout. * Wat? --- Dockerfile | 2 +- integration-tests/conftest.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index 18b1d5b0a..f7236e725 100644 --- a/Dockerfile +++ b/Dockerfile @@ -58,7 +58,7 @@ ARG INSTALL_CHANNEL=pytorch # Automatically set by buildx ARG TARGETPLATFORM -ENV PATH /opt/conda/bin:$PATH +ENV PATH=/opt/conda/bin:$PATH RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ build-essential \ diff --git a/integration-tests/conftest.py b/integration-tests/conftest.py index c702ae709..6f8aa715c 100644 --- a/integration-tests/conftest.py +++ b/integration-tests/conftest.py @@ -562,6 +562,7 @@ def launcher(event_loop): docker.types.DeviceRequest(count=gpu_count, capabilities=[["gpu"]]) ] + client.api.timeout = 1000 container = client.containers.run( DOCKER_IMAGE, command=args, @@ -573,7 +574,7 @@ def launcher(event_loop): devices=devices, volumes=volumes, ports={"80/tcp": port}, - healthcheck={"timeout": int(60 * 1e9), "retries": 2}, # 60s + healthcheck={"timeout": int(180 * 1e9), "retries": 2}, # 60s shm_size="1G", ) From 64a33c1f0516c4b8cf6a628dd603b095517cb369 Mon Sep 17 00:00:00 2001 From: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com> Date: Tue, 21 Jan 2025 17:33:33 +0100 Subject: [PATCH 044/183] Run `pre-commit run --all-files` to fix CI (#2933) --- .github/workflows/build.yaml | 2 +- Dockerfile_trtllm | 2 +- backends/trtllm/CMakeLists.txt | 2 +- backends/trtllm/build.rs | 2 +- backends/trtllm/scripts/setup_sccache.py | 14 ++++++++------ 5 files changed, 12 insertions(+), 10 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 2e3fe7efe..27907e397 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -65,7 +65,7 @@ jobs: export runs_on="aws-g6-12xl-plus-priv-cache" export platform="" export extra_pytest="" - export target="nil" + export target="nil" ;; cuda-trtllm) export dockerfile="Dockerfile_trtllm" diff --git a/Dockerfile_trtllm b/Dockerfile_trtllm index 40972764b..d57b12984 100644 --- a/Dockerfile_trtllm +++ b/Dockerfile_trtllm @@ -153,4 +153,4 @@ COPY --from=trt-builder /usr/local/tensorrt /usr/local/tensorrt COPY --from=tgi-builder /usr/local/tgi /usr/local/tgi # Basically we copy from target/debug instead of target/release -COPY --from=tgi-builder /usr/src/text-generation-inference/target/debug/text-generation-backends-trtllm /usr/local/tgi/bin/text-generation-launcher \ No newline at end of file +COPY --from=tgi-builder /usr/src/text-generation-inference/target/debug/text-generation-backends-trtllm /usr/local/tgi/bin/text-generation-launcher diff --git a/backends/trtllm/CMakeLists.txt b/backends/trtllm/CMakeLists.txt index 2cbd3a076..5ce016e15 100644 --- 
a/backends/trtllm/CMakeLists.txt +++ b/backends/trtllm/CMakeLists.txt @@ -129,4 +129,4 @@ if (${TGI_TRTLLM_BACKEND_BUILD_TESTS} AND CMAKE_BUILD_TYPE MATCHES "Debug") # include(CTest) # include(Catch) # catch_discover_tests(tgi_trtllm_backend_tests) -endif () \ No newline at end of file +endif () diff --git a/backends/trtllm/build.rs b/backends/trtllm/build.rs index c18b13a94..8b041860b 100644 --- a/backends/trtllm/build.rs +++ b/backends/trtllm/build.rs @@ -247,4 +247,4 @@ fn main() { // Backend println!("cargo:rustc-link-lib=static={}", &BACKEND_DEPS); -} \ No newline at end of file +} diff --git a/backends/trtllm/scripts/setup_sccache.py b/backends/trtllm/scripts/setup_sccache.py index 982f8c77d..65fdee235 100644 --- a/backends/trtllm/scripts/setup_sccache.py +++ b/backends/trtllm/scripts/setup_sccache.py @@ -8,9 +8,7 @@ AWS_S3_CACHING_VARIABLES = { "SCCACHE_BUCKET": "s3_bucket_name", } -ALL_CACHING_STORAGE_VARIABLES = { - "AWS_S3_CACHING_VARIABLES" -} +ALL_CACHING_STORAGE_VARIABLES = {"AWS_S3_CACHING_VARIABLES"} def setup_sccache_locally(): @@ -29,15 +27,19 @@ def setup_sccache_for_s3(): print("Setting up AWS S3 Caching Layer") for envvar in AWS_S3_CACHING_VARIABLES.keys(): - if not envvar in environ or not environ[envvar] or len(environ[envvar]) == 0: + if envvar not in environ or not environ[envvar] or len(environ[envvar]) == 0: print(f"Missing definition for environment variable {envvar}") if __name__ == "__main__": parser = ArgumentParser("TensorRT-LLM Build Caching Setup") - parser.add_argument("--is-gha-build", type=str, default="FALSE", - help="Indicate if the build is from Github Actions") + parser.add_argument( + "--is-gha-build", + type=str, + default="FALSE", + help="Indicate if the build is from Github Actions", + ) # Parse args args = parser.parse_args() From 2dfe3b3ee6cb17a1f1c3071365a295e1a0e9c4d2 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Wed, 22 Jan 2025 12:20:15 +0100 Subject: [PATCH 045/183] Upgrading the deps to have transformers==4.48.0 necessary (#2937) --- flake.lock | 25 ++++++++++++------------- flake.nix | 2 +- 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/flake.lock b/flake.lock index 772de6ebe..ce23c5e45 100644 --- a/flake.lock +++ b/flake.lock @@ -305,11 +305,11 @@ }, "flake-compat_4": { "locked": { - "lastModified": 1696426674, - "narHash": "sha256-kvjfFW7WAETZlt09AgDn1MrtKzP7t90Vf7vypd3OL1U=", + "lastModified": 1733328505, + "narHash": "sha256-NeCCThCEP3eCl2l/+27kNNK7QrwZB1IJCrXfrbv5oqU=", "owner": "edolstra", "repo": "flake-compat", - "rev": "0f9255e01c2351cc7d116c072cb317785dd33b33", + "rev": "ff81ac966bb2cae68946d5ed5fc4994f96d0ffec", "type": "github" }, "original": { @@ -718,11 +718,11 @@ }, "nixpkgs_6": { "locked": { - "lastModified": 1732034459, - "narHash": "sha256-Zais/zMRuJdlALidkUgEuasXOd37ZZLqkPkF9bIYSrY=", + "lastModified": 1737453259, + "narHash": "sha256-5LaFI9SQwCZmJDasMoYMdzNouWXNk3BvjKcO19tq1Rs=", "owner": "danieldk", "repo": "nixpkgs", - "rev": "40280e7bf9743cdf563494db4ece2a43aa674fa8", + "rev": "e0372dbcfd19ddd783b7c3b3868f19322f83318e", "type": "github" }, "original": { @@ -853,11 +853,11 @@ ] }, "locked": { - "lastModified": 1736994333, - "narHash": "sha256-v4Jrok5yXsZ6dwj2+2uo5cSyUi9fBTurHqHvNHLT1XA=", + "lastModified": 1737512878, + "narHash": "sha256-dgF6htdmfNnZzVInifks6npnCAyVsIHWSpWNs10RSW0=", "owner": "oxalica", "repo": "rust-overlay", - "rev": "848db855cb9e88785996e961951659570fc58814", + "rev": "06b8ed0eee289fe94c66f1202ced9a6a2c59a14c", "type": "github" }, "original": { @@ -978,16 +978,15 @@ 
"nixpkgs": "nixpkgs_6" }, "locked": { - "lastModified": 1737042758, - "narHash": "sha256-yW8IIcNU08IXxFJEv9lqnmx+4qcT6IfTvDqgw8NO1nM=", + "lastModified": 1737540114, + "narHash": "sha256-ubowOFdG8pAodwuxzWHLIoQJtGXJTlb4RtISVVY3Tx0=", "owner": "huggingface", "repo": "text-generation-inference-nix", - "rev": "12137321ce83850131ef5307e846c4e304366e3a", + "rev": "d18053189cc5ce4111250da841df101c8cdf13ff", "type": "github" }, "original": { "owner": "huggingface", - "ref": "pytorch-3.5.1", "repo": "text-generation-inference-nix", "type": "github" } diff --git a/flake.nix b/flake.nix index eb55f5126..83cedfa62 100644 --- a/flake.nix +++ b/flake.nix @@ -5,7 +5,7 @@ inputs.nixpkgs.follows = "tgi-nix/nixpkgs"; }; nix-filter.url = "github:numtide/nix-filter"; - tgi-nix.url = "github:huggingface/text-generation-inference-nix/pytorch-3.5.1"; + tgi-nix.url = "github:huggingface/text-generation-inference-nix"; nixpkgs.follows = "tgi-nix/nixpkgs"; flake-utils.url = "github:numtide/flake-utils"; rust-overlay = { From 1d3c9beba82fe70b2e9b9d8b6ddf82e190a827ac Mon Sep 17 00:00:00 2001 From: "Wang, Yi" Date: Wed, 22 Jan 2025 21:36:15 +0800 Subject: [PATCH 046/183] fix moe in quantization path (#2935) update ipex xpu to support moe for mixtral Signed-off-by: Wang, Yi A --- Dockerfile_intel | 3 +- .../layers/moe/__init__.py | 17 +---- .../layers/moe/fused_moe_ipex.py | 65 +++++++++++++++++++ 3 files changed, 68 insertions(+), 17 deletions(-) create mode 100644 server/text_generation_server/layers/moe/fused_moe_ipex.py diff --git a/Dockerfile_intel b/Dockerfile_intel index 5edd8951e..0f0d43835 100644 --- a/Dockerfile_intel +++ b/Dockerfile_intel @@ -118,8 +118,9 @@ ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/intel/oneapi/pti/0.9/lib:/opt/conda/li ENV CCL_ZE_IPC_EXCHANGE=sockets #ENV TORCH_LLM_ALLREDUCE=1 #ENV CCL_TOPO_FABRIC_VERTEX_CONNECTION_CHECK=0 +ENV TORCH_DEVICE_BACKEND_AUTOLOAD=0 -RUN git clone https://github.com/intel/intel-extension-for-pytorch && cd intel-extension-for-pytorch && git checkout 033af6f63745ac748cccdadee5c6140c7971edf6 +RUN git clone https://github.com/intel/intel-extension-for-pytorch && cd intel-extension-for-pytorch && git checkout 1ccf72b2d11cd00b47aef6d6cd054c088aa6f083 RUN cd intel-extension-for-pytorch && git submodule update --init --recursive && USE_AOT_DEVLIST='pvc,ats-m150' BUILD_SEPARATE_OPS=OFF BUILD_WITH_CPU=OFF USE_XETLA=ON python setup.py install && rm -rf /usr/src/intel-extension-for-pytorch # Install benchmarker diff --git a/server/text_generation_server/layers/moe/__init__.py b/server/text_generation_server/layers/moe/__init__.py index be40d78a8..f0bd76aad 100644 --- a/server/text_generation_server/layers/moe/__init__.py +++ b/server/text_generation_server/layers/moe/__init__.py @@ -25,7 +25,7 @@ from text_generation_server.utils.weights import ( ) if SYSTEM == "ipex": - from intel_extension_for_pytorch.llm.modules import GatedMLPMOE + from .fused_moe_ipex import fused_topk, grouped_topk else: from moe_kernels.fused_moe import fused_topk, grouped_topk @@ -139,10 +139,6 @@ class DenseMoELayer(nn.Module): ) for i in range(self.n_experts) ] - if SYSTEM == "ipex": - self.ipex_fused_moe = GatedMLPMOE( - W13=self.gate_proj, W2=self.down_proj, W3=self.up_proj, use_prepack=True - ) self.process_group = weights.process_group @@ -155,17 +151,6 @@ class DenseMoELayer(nn.Module): input_shape = x.shape x = x.view(-1, input_shape[-1]) - if SYSTEM == "ipex": - return self.ipex_fused_moe( - hidden_states=x, - router_logits=gating_output, - top_k=self.topk, - renormalize=self.renormalize, 
- use_grouped_topk=self.n_expert_group is not None, - num_expert_group=self.n_expert_group, - topk_group=self.topk_group, - ) - if self.n_expert_group is not None and self.topk_group is not None: topk_weights, topk_ids = grouped_topk( x, diff --git a/server/text_generation_server/layers/moe/fused_moe_ipex.py b/server/text_generation_server/layers/moe/fused_moe_ipex.py new file mode 100644 index 000000000..e26ff8770 --- /dev/null +++ b/server/text_generation_server/layers/moe/fused_moe_ipex.py @@ -0,0 +1,65 @@ +# coding=utf-8 +# Copyright 2023, 2024 DeepSeek-AI and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Tuple + +import torch + + +def grouped_topk( + hidden_states: torch.Tensor, + gating_output: torch.Tensor, + topk: int, + renormalize: bool, + num_expert_group: int = 0, + topk_group: int = 0, +) -> Tuple[torch.Tensor, torch.Tensor]: + scores = torch.softmax(gating_output, dim=-1) + num_token = scores.shape[0] + group_scores = ( + scores.view(num_token, num_expert_group, -1).max(dim=-1).values + ) # [n, n_group] + group_idx = torch.topk(group_scores, k=topk_group, dim=-1, sorted=False)[ + 1 + ] # [n, top_k_group] + group_mask = torch.zeros_like(group_scores) # [n, n_group] + group_mask.scatter_(1, group_idx, 1) # [n, n_group] + score_mask = ( + group_mask.unsqueeze(-1) + .expand(num_token, num_expert_group, scores.shape[-1] // num_expert_group) + .reshape(num_token, -1) + ) # [n, e] + tmp_scores = scores.masked_fill(~score_mask.bool(), 0.0) # [n, e] + topk_weights, topk_ids = torch.topk(tmp_scores, k=topk, dim=-1, sorted=False) + + if renormalize: + topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True) + + return topk_weights, topk_ids + + +def fused_topk( + hidden_states: torch.Tensor, + gating_output: torch.Tensor, + topk: int, + renormalize: bool, +) -> Tuple[torch.Tensor, torch.Tensor]: + topk_weights = torch.nn.functional.softmax( + gating_output, dim=1, dtype=torch.float32 + ) + topk_weights, topk_ids = torch.topk(topk_weights, topk, dim=-1) + if renormalize: + topk_weights /= topk_weights.sum(dim=-1, keepdim=True) + return topk_weights, topk_ids From 1dd346666a3b354f2c7542141bc1928a47c452db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Wed, 22 Jan 2025 18:18:11 +0100 Subject: [PATCH 047/183] Clarify FP8-Marlin use on capability 8.9 (#2940) The log message stated that the GPU does not support FP8 on capability 8.9. However we use FP8-Marlin on that capability because it is faster. 
--- server/text_generation_server/layers/fp8.py | 10 ++++++++++ server/text_generation_server/layers/marlin/fp8.py | 4 ---- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/server/text_generation_server/layers/fp8.py b/server/text_generation_server/layers/fp8.py index 4e83ec9d0..715974ff7 100644 --- a/server/text_generation_server/layers/fp8.py +++ b/server/text_generation_server/layers/fp8.py @@ -52,6 +52,16 @@ def get_fp8_linear(force_w8a16: bool = False) -> Type[torch.nn.Module]: # gives better decoding throughput on L4 and L40. from text_generation_server.layers.marlin import GPTQMarlinFP8Linear + if major == 8 and minor == 9: + log_once( + logger.info, + "GPU supports FP8, but using Marlin FP8 kernel for better performance", + ) + else: + log_once( + logger.info, "GPU does not support FP8, using Marlin FP8 kernel" + ) + return GPTQMarlinFP8Linear # On other systems let Torch decide if the hardware supports FP8. diff --git a/server/text_generation_server/layers/marlin/fp8.py b/server/text_generation_server/layers/marlin/fp8.py index 49f5c480f..e07b9fc6c 100644 --- a/server/text_generation_server/layers/marlin/fp8.py +++ b/server/text_generation_server/layers/marlin/fp8.py @@ -2,14 +2,12 @@ from typing import Optional import torch import torch.nn as nn -from loguru import logger from text_generation_server.layers.fp8 import fp8_quantize from text_generation_server.layers.marlin.gptq import _check_valid_shape from text_generation_server.layers.marlin.util import ( _check_marlin_kernels, permute_scales, ) -from text_generation_server.utils.log import log_once try: import marlin_kernels @@ -36,8 +34,6 @@ class GPTQMarlinFP8Linear(nn.Module): _check_marlin_kernels() assert marlin_kernels is not None - log_once(logger.info, "GPU does not support FP8, using Marlin FP8 kernel") - scales = scales.unsqueeze(0) if scales.shape[1] == 1: out_features, in_features = qweight.shape From cc212154e00247074579bf30cfe9977be0d9f84e Mon Sep 17 00:00:00 2001 From: Funtowicz Morgan Date: Thu, 23 Jan 2025 13:54:40 +0100 Subject: [PATCH 048/183] Bump TensorRT-LLM backend dependency to v0.16.0 (#2931) * backend(trtllm): update to 0.16.0 * backend(trtllm): do not use shallow clone * backend(trtllm): use tag instead * backend(trtllm): move to nvidia remote instead of hf * backend(trtllm): reenable shallow clone * backend(trtllm): attempt to use ADD instead of RUN for openmpi * backend(trtllm): make sure we are using correct path for openmpi ADD in dockerfile * backend(trtllm): add correctly untar it --- Dockerfile_trtllm | 12 +++++++----- backends/trtllm/cmake/trtllm.cmake | 4 ++-- backends/trtllm/scripts/install_tensorrt.sh | 4 ++-- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/Dockerfile_trtllm b/Dockerfile_trtllm index d57b12984..dd977a81f 100644 --- a/Dockerfile_trtllm +++ b/Dockerfile_trtllm @@ -1,5 +1,5 @@ ARG cuda_arch_list="75-real;80-real;86-real;89-real;90-real" -ARG ompi_version="4.1.7rc1" +ARG ompi_version="4.1.7" ARG build_type=release ARG is_gha_build=false @@ -37,10 +37,12 @@ FROM cuda-builder AS mpi-builder ARG ompi_version ENV OMPI_TARBALL_FILENAME="openmpi-$ompi_version.tar.bz2" -RUN wget "https://download.open-mpi.org/release/open-mpi/v4.1/$OMPI_TARBALL_FILENAME" -P /opt/src && \ - mkdir /usr/src/mpi && \ - tar -xf "/opt/src/$OMPI_TARBALL_FILENAME" -C /usr/src/mpi --strip-components=1 && \ - cd /usr/src/mpi && \ +ADD --checksum=sha256:54a33cb7ad81ff0976f15a6cc8003c3922f0f3d8ceed14e1813ef3603f22cd34 \ + 
https://download.open-mpi.org/release/open-mpi/v4.1/$OMPI_TARBALL_FILENAME \ + /opt/src/mpi/ + +WORKDIR /opt/src/mpi +RUN tar --strip-components=1 -xf $OMPI_TARBALL_FILENAME &&\ ./configure --prefix=/usr/local/mpi --with-cuda=/usr/local/cuda --with-slurm && \ make -j all && \ make install && \ diff --git a/backends/trtllm/cmake/trtllm.cmake b/backends/trtllm/cmake/trtllm.cmake index d789b1ebc..3e9712c0d 100644 --- a/backends/trtllm/cmake/trtllm.cmake +++ b/backends/trtllm/cmake/trtllm.cmake @@ -27,8 +27,8 @@ find_package(Python3 REQUIRED Interpreter) fetchcontent_declare( trtllm - GIT_REPOSITORY https://github.com/huggingface/TensorRT-LLM.git - GIT_TAG 1bb9ca4688805444f203647674bac1d7219d0579 + GIT_REPOSITORY https://github.com/nvidia/TensorRT-LLM.git + GIT_TAG v0.16.0 GIT_SHALLOW ON DOWNLOAD_EXTRACT_TIMESTAMP ) diff --git a/backends/trtllm/scripts/install_tensorrt.sh b/backends/trtllm/scripts/install_tensorrt.sh index 7deb2fe8b..f3e7270ab 100755 --- a/backends/trtllm/scripts/install_tensorrt.sh +++ b/backends/trtllm/scripts/install_tensorrt.sh @@ -2,8 +2,8 @@ set -ex -TRT_VER_BASE="10.6.0" -TRT_VER_FULL="${TRT_VER_BASE}.26" +TRT_VER_BASE="10.7.0" +TRT_VER_FULL="${TRT_VER_BASE}.23" CUDA_VER="12.6" CUDNN_VER="9.5.0.50-1" NCCL_VER="2.22.3-1+cuda12.6" From 6ab02931cfddaaff42957af46b5e3d86fe196860 Mon Sep 17 00:00:00 2001 From: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com> Date: Thu, 23 Jan 2025 14:18:47 +0100 Subject: [PATCH 049/183] Set `alias` for `max_completion_tokens` in `ChatRequest` (#2932) --- router/src/lib.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/router/src/lib.rs b/router/src/lib.rs index dbd36827f..414d38ed6 100644 --- a/router/src/lib.rs +++ b/router/src/lib.rs @@ -460,7 +460,7 @@ pub struct CompletionRequest { pub prompt: Prompt, /// The maximum number of tokens that can be generated in the chat completion. - #[serde(default, alias = "max_completion_tokens")] + #[serde(default)] #[schema(default = "1024", example = "32")] pub max_tokens: Option, @@ -840,7 +840,7 @@ pub(crate) struct ChatRequest { pub top_logprobs: Option, /// The maximum number of tokens that can be generated in the chat completion. 
- #[serde(default)] + #[serde(default, alias = "max_completion_tokens")] #[schema(default = "1024", example = "32")] pub max_tokens: Option, From 4e172028aa46cfc9c1dc7edba4dc9f75cb1c81b4 Mon Sep 17 00:00:00 2001 From: Nikolai Kolodziej <7687617+kldzj@users.noreply.github.com> Date: Thu, 23 Jan 2025 14:19:21 +0100 Subject: [PATCH 050/183] Add NVIDIA A40 to known cards (#2941) feat: add NVIDIA A40 to known cards --- launcher/src/main.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/launcher/src/main.rs b/launcher/src/main.rs index 394cc1e64..c6f6b6e96 100644 --- a/launcher/src/main.rs +++ b/launcher/src/main.rs @@ -1632,6 +1632,7 @@ enum Gpu { L40, L40S, A10G, + A40, H100, A100, Unknown(String), @@ -1652,6 +1653,7 @@ impl From<&str> for Gpu { "nvidia-l40" => Gpu::L40, "nvidia-l40s" => Gpu::L40S, "nvidia-a10g" => Gpu::A10G, + "nvidia-a40" => Gpu::A40, "nvidia-h100-80gb-hbm3" => Gpu::H100, "nvidia-h100-nvl" => Gpu::H100, "nvidia-h100" => Gpu::H100, @@ -1673,6 +1675,7 @@ impl std::fmt::Display for Gpu { Gpu::L40 => write!(f, "nvida-l40"), Gpu::L40S => write!(f, "nvida-l40s"), Gpu::A10G => write!(f, "nvidia-a10g"), + Gpu::A40 => write!(f, "nvidia-a40"), Gpu::H100 => write!(f, "nvidia-h100-80fb-hbm3"), Gpu::A100 => write!(f, "nvida-a100-sxm4-80gb"), Gpu::Unknown(card) => write!(f, "{}", card), @@ -1696,6 +1699,9 @@ impl ComputeType { Gpu::L40S => Some(363 * 10u64.pow(12)), // https://www.nvidia.com/en-us/data-center/products/a10-gpu/ Gpu::A10G => Some(125 * 10u64.pow(12)), + // https://www.nvidia.com/en-us/data-center/a40/ + // https://images.nvidia.com/content/Solutions/data-center/a40/nvidia-a40-datasheet.pdf + Gpu::A40 => Some(149 * 10u64.pow(12)), // https://www.nvidia.com/en-us/data-center/h100/ // https://www.techpowerup.com/gpu-specs/docs/nvidia-gh100-architecture.pdf Gpu::H100 => Some(900 * 10u64.pow(12)), From 0a899026637b215579d9deb7b29e3230374fc5ed Mon Sep 17 00:00:00 2001 From: Funtowicz Morgan Date: Thu, 23 Jan 2025 16:48:26 +0100 Subject: [PATCH 051/183] [TRTLLM] Expose finish reason (#2841) * feat(trtllm): expose finish reason to Rust * misc(llamacpp): fix typo * misc(backend): update deps --- backends/trtllm/CMakeLists.txt | 2 +- backends/trtllm/Cargo.toml | 10 +++------- backends/trtllm/src/looper.rs | 10 +++++++--- backends/trtllm/src/main.rs | 29 ++++------------------------- 4 files changed, 15 insertions(+), 36 deletions(-) diff --git a/backends/trtllm/CMakeLists.txt b/backends/trtllm/CMakeLists.txt index 5ce016e15..8388f113f 100644 --- a/backends/trtllm/CMakeLists.txt +++ b/backends/trtllm/CMakeLists.txt @@ -30,7 +30,7 @@ option(TGI_TRTLLM_BACKEND_BUILD_TESTS "Enable building the unittests suite" OFF) option(TGI_TRTLLM_BACKEND_BUILD_EXAMPLES "Enable building the examples suite" OFF) option(TGI_TRTLLM_BACKEND_BUILD_USE_LLD "Enable lld linker instead of ld" OFF) set(TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST "89-real" CACHE STRING "List of CUDA architectures to support") -set(TGI_TRTLLM_BACKEND_TRT_ROOT "/usr/local/tensorrt" CACHE STRING "Path rgo where TensorRT libraries and headers are located") +set(TGI_TRTLLM_BACKEND_TRT_ROOT "/usr/local/tensorrt" CACHE STRING "Path where TensorRT libraries and headers are located") set(TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR "${TGI_TRTLLM_BACKEND_TRT_ROOT}/include" CACHE STRING "Path where TensorRT headers are located") set(TGI_TRTLLM_BACKEND_TRT_LIB_DIR "${TGI_TRTLLM_BACKEND_TRT_ROOT}/lib" CACHE STRING "Path where TensorRT libraries are located") diff --git a/backends/trtllm/Cargo.toml b/backends/trtllm/Cargo.toml index 
5d907109a..b6c39346a 100644 --- a/backends/trtllm/Cargo.toml +++ b/backends/trtllm/Cargo.toml @@ -7,20 +7,16 @@ homepage.workspace = true [dependencies] async-trait = "0.1" -#async-stream = "0.3" clap = { version = "4.5", features = ["derive"] } cxx = "1.0" -hashbrown = "0.14" +hashbrown = "0.15" hf-hub = { workspace = true } -#log = { version = "0.4", features = [] } text-generation-router = { path = "../../router" } tokenizers = { workspace = true } -tokio = { version = "1.39", features = ["rt", "rt-multi-thread", "parking_lot", "signal", "sync"] } -tokio-stream = "0.1.15" +tokio = { version = "1.43.0", features = ["rt", "rt-multi-thread", "parking_lot", "signal", "sync"] } +tokio-stream = "0.1.17" thiserror = "1.0.63" tracing = "0.1" -#tracing-opentelemetry = "0.25" -#tracing-subscriber = { version = "0.3", features = ["json", "env-filter"] } pyo3 = { workspace = true } [build-dependencies] diff --git a/backends/trtllm/src/looper.rs b/backends/trtllm/src/looper.rs index 969046d18..b148ec301 100644 --- a/backends/trtllm/src/looper.rs +++ b/backends/trtllm/src/looper.rs @@ -18,10 +18,12 @@ use text_generation_router::validation::ValidationError::{ EmptyInput, Grammar, TopNTokensDisabled, UnsupportedModality, }; use text_generation_router::validation::{Chunk, ValidGenerateRequest}; -use text_generation_router::{FinishReason, Token}; +use text_generation_router::Token; use crate::errors::TensorRtLlmBackendError; -use crate::ffi::{create_backend_from_engine_folder, GenerationStep, TensorRtLlmBackendImpl}; +use crate::ffi::{ + create_backend_from_engine_folder, FinishReason, GenerationStep, TensorRtLlmBackendImpl, +}; use crate::utils::first_line; type InferResult = Result; @@ -40,6 +42,7 @@ struct DecodedToken { id: u32, log_prob: f32, is_final: bool, + finish_reason: FinishReason, } impl<'step> TryFrom<&'step GenerationStep> for DecodedToken { @@ -51,6 +54,7 @@ impl<'step> TryFrom<&'step GenerationStep> for DecodedToken { id: step.token_id, log_prob: step.log_prob, is_final: step.is_final, + finish_reason: step.finish_reason, }) } else { Err(GenerationError(step.error_msg.clone())) @@ -192,7 +196,7 @@ fn post_process_decoded_token( let generated_text = GeneratedText { text: text.unwrap(), generated_tokens: ctx.tokens.len() as u32, - finish_reason: FinishReason::EndOfSequenceToken, // TODO : Map FinishReason + finish_reason: decoded_token.finish_reason.into(), seed: None, }; diff --git a/backends/trtllm/src/main.rs b/backends/trtllm/src/main.rs index 5af96adea..cef225be1 100644 --- a/backends/trtllm/src/main.rs +++ b/backends/trtllm/src/main.rs @@ -67,11 +67,7 @@ struct Args { payload_limit: usize, } -async fn get_tokenizer( - tokenizer_name: &str, - _tokenizer_config_path: Option<&str>, - revision: Option<&str>, -) -> Option { +async fn get_tokenizer(tokenizer_name: &str, revision: Option<&str>) -> Option { // Parse Huggingface hub token let authorization_token = std::env::var("HF_TOKEN") .or_else(|_| std::env::var("HUGGING_FACE_HUB_TOKEN")) @@ -182,19 +178,6 @@ async fn get_tokenizer( } }; - // Read the JSON contents of the file as an instance of 'HubTokenizerConfig'. 
- // let tokenizer_config: Option = if let Some(filename) = tokenizer_config_path - // { - // HubTokenizerConfig::from_file(filename) - // } else { - // tokenizer_config_filename.and_then(HubTokenizerConfig::from_file) - // }; - - // let tokenizer_config = tokenizer_config.unwrap_or_else(|| { - // tracing::warn!("Could not find tokenizer config locally and no API specified"); - // HubTokenizerConfig::default() - // }); - let tokenizer: Tokenizer = { use pyo3::prelude::*; pyo3::Python::with_gil(|py| -> PyResult<()> { @@ -292,13 +275,9 @@ async fn main() -> Result<(), TensorRtLlmBackendError> { } // Create the backend - match get_tokenizer( - &tokenizer_name, - tokenizer_config_path.as_deref(), - revision.as_deref(), - ) - .await - .expect("Failed to retrieve tokenizer implementation") + match get_tokenizer(&tokenizer_name, revision.as_deref()) + .await + .expect("Failed to retrieve tokenizer implementation") { Tokenizer::Python { .. } => Err(TensorRtLlmBackendError::Tokenizer( "Failed to retrieve Rust based tokenizer".to_string(), From 29a0893b67a333aec6cc03976d0878984d0a8241 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Thu, 23 Jan 2025 18:07:30 +0100 Subject: [PATCH 052/183] Tmp tp transformers (#2942) * Upgrade the version number. * Remove modifications in Lock. * Tmp branch to test transformers backend with 2.5.1 and TP>1 * Fixing the transformers backend. inference_mode forces the use of `aten.matmul` instead of `aten.mm` the former doesn't have sharding support crashing the transformers TP support. `lm_head.forward` also crashes because it skips the hook that cast/decast the DTensor. Torch 2.5.1 is required for sharding support. * Put back the attention impl. * Revert the flashinfer (this will fails). * Building AOT. * Using 2.5 kernels. * Remove the archlist, it's defined in the docker anyway. --- Dockerfile | 6 ++--- README.md | 6 ++--- .../basic_tutorials/gated_model_access.md | 2 +- docs/source/conceptual/quantization.md | 6 ++--- docs/source/installation_amd.md | 2 +- docs/source/installation_intel.md | 4 ++-- docs/source/installation_nvidia.md | 2 +- docs/source/quicktour.md | 4 ++-- docs/source/reference/api_reference.md | 2 +- server/Makefile-flashinfer | 5 ++-- server/pyproject.toml | 24 +++++++++---------- .../text_generation_server/models/__init__.py | 8 +++---- .../models/flash_causal_lm.py | 2 +- .../models/transformers_flash_causal_lm.py | 17 ++++++------- server/text_generation_server/server.py | 6 ++--- 15 files changed, 47 insertions(+), 49 deletions(-) diff --git a/Dockerfile b/Dockerfile index f7236e725..720053330 100644 --- a/Dockerfile +++ b/Dockerfile @@ -47,7 +47,7 @@ RUN cargo build --profile release-opt --frozen FROM nvidia/cuda:12.4.1-devel-ubuntu22.04 AS pytorch-install # NOTE: When updating PyTorch version, beware to remove `pip install nvidia-nccl-cu12==2.22.3` below in the Dockerfile. 
Context: https://github.com/huggingface/text-generation-inference/pull/2099 -ARG PYTORCH_VERSION=2.4.0 +ARG PYTORCH_VERSION=2.5.1 ARG PYTHON_VERSION=3.11 # Keep in sync with `server/pyproject.toml @@ -235,8 +235,8 @@ RUN cd server && \ make gen-server && \ python -c "from text_generation_server.pb import generate_pb2" && \ pip install -U pip uv && \ - uv pip install -e ".[attention, bnb, accelerate, compressed-tensors, marlin, moe, quantize, peft, outlines]" --no-cache-dir && \ - uv pip install nvidia-nccl-cu12==2.22.3 + uv pip install -e ".[attention, bnb, accelerate, compressed-tensors, marlin, moe, quantize, peft, outlines]" --no-cache-dir # && \ + # uv pip install nvidia-nccl-cu12==2.22.3 ENV LD_PRELOAD=/opt/conda/lib/python3.11/site-packages/nvidia/nccl/lib/libnccl.so.2 # Required to find libpython within the rust binaries diff --git a/README.md b/README.md index 9842a2a71..6072c9bd5 100644 --- a/README.md +++ b/README.md @@ -84,7 +84,7 @@ model=HuggingFaceH4/zephyr-7b-beta volume=$PWD/data docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data \ - ghcr.io/huggingface/text-generation-inference:3.0.0 --model-id $model + ghcr.io/huggingface/text-generation-inference:3.0.2 --model-id $model ``` And then you can make requests like @@ -121,7 +121,7 @@ curl localhost:8080/v1/chat/completions \ **Note:** To use NVIDIA GPUs, you need to install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). We also recommend using NVIDIA drivers with CUDA version 12.2 or higher. For running the Docker container on a machine with no GPUs or CUDA support, it is enough to remove the `--gpus all` flag and add `--disable-custom-kernels`, please note CPU is not the intended platform for this project, so performance might be subpar. -**Note:** TGI supports AMD Instinct MI210 and MI250 GPUs. Details can be found in the [Supported Hardware documentation](https://huggingface.co/docs/text-generation-inference/installation_amd#using-tgi-with-amd-gpus). To use AMD GPUs, please use `docker run --device /dev/kfd --device /dev/dri --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.0.0-rocm --model-id $model` instead of the command above. +**Note:** TGI supports AMD Instinct MI210 and MI250 GPUs. Details can be found in the [Supported Hardware documentation](https://huggingface.co/docs/text-generation-inference/installation_amd#using-tgi-with-amd-gpus). To use AMD GPUs, please use `docker run --device /dev/kfd --device /dev/dri --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.0.2-rocm --model-id $model` instead of the command above. 
To see all options to serve your models (in the [code](https://github.com/huggingface/text-generation-inference/blob/main/launcher/src/main.rs) or in the cli): ``` @@ -152,7 +152,7 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading token= docker run --gpus all --shm-size 1g -e HF_TOKEN=$token -p 8080:80 -v $volume:/data \ - ghcr.io/huggingface/text-generation-inference:3.0.0 --model-id $model + ghcr.io/huggingface/text-generation-inference:3.0.2 --model-id $model ``` ### A note on Shared Memory (shm) diff --git a/docs/source/basic_tutorials/gated_model_access.md b/docs/source/basic_tutorials/gated_model_access.md index 7c32f0b33..6520a6aba 100644 --- a/docs/source/basic_tutorials/gated_model_access.md +++ b/docs/source/basic_tutorials/gated_model_access.md @@ -19,6 +19,6 @@ docker run --gpus all \ --shm-size 1g \ -e HF_TOKEN=$token \ -p 8080:80 \ - -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.0.1 \ + -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.0.2 \ --model-id $model ``` diff --git a/docs/source/conceptual/quantization.md b/docs/source/conceptual/quantization.md index f7cc89293..a0eb0c512 100644 --- a/docs/source/conceptual/quantization.md +++ b/docs/source/conceptual/quantization.md @@ -19,7 +19,7 @@ bitsandbytes is a library used to apply 8-bit and 4-bit quantization to models. In TGI, you can use 8-bit quantization by adding `--quantize bitsandbytes` like below 👇 ```bash -docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.0.1 --model-id $model --quantize bitsandbytes +docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.0.2 --model-id $model --quantize bitsandbytes ``` 4-bit quantization is also possible with bitsandbytes. You can choose one of the following 4-bit data types: 4-bit float (`fp4`), or 4-bit `NormalFloat` (`nf4`). These data types were introduced in the context of parameter-efficient fine-tuning, but you can apply them for inference by automatically converting the model weights on load. @@ -27,7 +27,7 @@ docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingf In TGI, you can use 4-bit quantization by adding `--quantize bitsandbytes-nf4` or `--quantize bitsandbytes-fp4` like below 👇 ```bash -docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.0.1 --model-id $model --quantize bitsandbytes-nf4 +docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.0.2 --model-id $model --quantize bitsandbytes-nf4 ``` You can get more information about 8-bit quantization by reading this [blog post](https://huggingface.co/blog/hf-bitsandbytes-integration), and 4-bit quantization by reading [this blog post](https://huggingface.co/blog/4bit-transformers-bitsandbytes). @@ -48,7 +48,7 @@ $$({\hat{W}_{l}}^{*} = argmin_{\hat{W_{l}}} ||W_{l}X-\hat{W}_{l}X||^{2}_{2})$$ TGI allows you to both run an already GPTQ quantized model (see available models [here](https://huggingface.co/models?search=gptq)) or quantize a model of your choice using quantization script. 
You can run a quantized model by simply passing --quantize like below 👇 ```bash -docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.0.1 --model-id $model --quantize gptq +docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.0.2 --model-id $model --quantize gptq ``` Note that TGI's GPTQ implementation doesn't use [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) under the hood. However, models quantized using AutoGPTQ or Optimum can still be served by TGI. diff --git a/docs/source/installation_amd.md b/docs/source/installation_amd.md index 033add75b..797f0a5d0 100644 --- a/docs/source/installation_amd.md +++ b/docs/source/installation_amd.md @@ -11,7 +11,7 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading docker run --rm -it --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \ --device=/dev/kfd --device=/dev/dri --group-add video \ --ipc=host --shm-size 256g --net host -v $volume:/data \ - ghcr.io/huggingface/text-generation-inference:3.0.1-rocm \ + ghcr.io/huggingface/text-generation-inference:3.0.2-rocm \ --model-id $model ``` diff --git a/docs/source/installation_intel.md b/docs/source/installation_intel.md index 3329b4763..165d1105d 100644 --- a/docs/source/installation_intel.md +++ b/docs/source/installation_intel.md @@ -12,7 +12,7 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading docker run --rm --privileged --cap-add=sys_nice \ --device=/dev/dri \ --ipc=host --shm-size 1g --net host -v $volume:/data \ - ghcr.io/huggingface/text-generation-inference:3.0.1-intel-xpu \ + ghcr.io/huggingface/text-generation-inference:3.0.2-intel-xpu \ --model-id $model --cuda-graphs 0 ``` @@ -29,7 +29,7 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading docker run --rm --privileged --cap-add=sys_nice \ --device=/dev/dri \ --ipc=host --shm-size 1g --net host -v $volume:/data \ - ghcr.io/huggingface/text-generation-inference:3.0.1-intel-cpu \ + ghcr.io/huggingface/text-generation-inference:3.0.2-intel-cpu \ --model-id $model --cuda-graphs 0 ``` diff --git a/docs/source/installation_nvidia.md b/docs/source/installation_nvidia.md index 71f3febd6..0092f5ef9 100644 --- a/docs/source/installation_nvidia.md +++ b/docs/source/installation_nvidia.md @@ -11,7 +11,7 @@ model=teknium/OpenHermes-2.5-Mistral-7B volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run docker run --gpus all --shm-size 64g -p 8080:80 -v $volume:/data \ - ghcr.io/huggingface/text-generation-inference:3.0.1 \ + ghcr.io/huggingface/text-generation-inference:3.0.2 \ --model-id $model ``` diff --git a/docs/source/quicktour.md b/docs/source/quicktour.md index fc24a7a49..4ffb9728f 100644 --- a/docs/source/quicktour.md +++ b/docs/source/quicktour.md @@ -11,7 +11,7 @@ model=teknium/OpenHermes-2.5-Mistral-7B volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data \ - ghcr.io/huggingface/text-generation-inference:3.0.1 \ + ghcr.io/huggingface/text-generation-inference:3.0.2 \ --model-id $model ``` @@ -96,7 +96,7 @@ curl 127.0.0.1:8080/generate \ To see all possible deploy flags and options, you can use the `--help` flag. It's possible to configure the number of shards, quantization, generation parameters, and more. 
```bash -docker run ghcr.io/huggingface/text-generation-inference:3.0.1 --help +docker run ghcr.io/huggingface/text-generation-inference:3.0.2 --help ``` diff --git a/docs/source/reference/api_reference.md b/docs/source/reference/api_reference.md index 6b333fdcf..f7723187f 100644 --- a/docs/source/reference/api_reference.md +++ b/docs/source/reference/api_reference.md @@ -163,7 +163,7 @@ hub = { # create Hugging Face Model Class huggingface_model = HuggingFaceModel( - image_uri=get_huggingface_llm_image_uri("huggingface",version="3.0.1"), + image_uri=get_huggingface_llm_image_uri("huggingface",version="3.0.2"), env=hub, role=role, ) diff --git a/server/Makefile-flashinfer b/server/Makefile-flashinfer index d5f684ba5..f6cbb5720 100644 --- a/server/Makefile-flashinfer +++ b/server/Makefile-flashinfer @@ -1,5 +1,6 @@ install-flashinfer: # We need fsspec as an additional dependency, but # `pip install flashinfer` cannot resolve it. - pip install fsspec - pip install flashinfer==0.2.0.post1 -i https://flashinfer.ai/whl/cu124/torch2.4 + pip install fsspec sympy==1.13.1 numpy + pip install -U setuptools + FLASHINFER_ENABLE_AOT=1 pip install git+https://github.com/flashinfer-ai/flashinfer.git@v0.2.0.post1#egg=flashinfer --no-build-isolation diff --git a/server/pyproject.toml b/server/pyproject.toml index 6d0b8dfb2..129e9b64f 100644 --- a/server/pyproject.toml +++ b/server/pyproject.toml @@ -69,22 +69,22 @@ gen = [ [tool.uv.sources] attention-kernels = [ - { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp39-cp39-linux_x86_64.whl", marker = "python_version == '3.9'" }, - { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp310-cp310-linux_x86_64.whl", marker = "python_version == '3.10'" }, - { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp311-cp311-linux_x86_64.whl", marker = "python_version == '3.11'" }, - { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp312-cp312-linux_x86_64.whl", marker = "python_version == '3.12'" }, + { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.5-cp39-cp39-linux_x86_64.whl", marker = "python_version == '3.9'" }, + { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.5-cp310-cp310-linux_x86_64.whl", marker = "python_version == '3.10'" }, + { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.5-cp311-cp311-linux_x86_64.whl", marker = "python_version == '3.11'" }, + { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.5-cp312-cp312-linux_x86_64.whl", marker = "python_version == '3.12'" }, ] marlin-kernels = [ - { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp39-cp39-linux_x86_64.whl", marker = "python_version == '3.9'" }, - { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp310-cp310-linux_x86_64.whl", marker = "python_version == '3.10'" }, - { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp311-cp311-linux_x86_64.whl", marker = "python_version == '3.11'" }, 
- { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp312-cp312-linux_x86_64.whl", marker = "python_version == '3.12'" }, + { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp39-cp39-linux_x86_64.whl", marker = "python_version == '3.9'" }, + { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp310-cp310-linux_x86_64.whl", marker = "python_version == '3.10'" }, + { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp311-cp311-linux_x86_64.whl", marker = "python_version == '3.11'" }, + { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp312-cp312-linux_x86_64.whl", marker = "python_version == '3.12'" }, ] moe-kernels = [ - { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp39-cp39-linux_x86_64.whl", marker = "python_version == '3.9'" }, - { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp310-cp310-linux_x86_64.whl", marker = "python_version == '3.10'" }, - { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp311-cp311-linux_x86_64.whl", marker = "python_version == '3.11'" }, - { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp312-cp312-linux_x86_64.whl", marker = "python_version == '3.12'" }, + { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.5-cp39-cp39-linux_x86_64.whl", marker = "python_version == '3.9'" }, + { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.5-cp310-cp310-linux_x86_64.whl", marker = "python_version == '3.10'" }, + { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.5-cp311-cp311-linux_x86_64.whl", marker = "python_version == '3.11'" }, + { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.5-cp312-cp312-linux_x86_64.whl", marker = "python_version == '3.12'" }, ] [tool.pytest.ini_options] diff --git a/server/text_generation_server/models/__init__.py b/server/text_generation_server/models/__init__.py index 3b437ce04..dd116c9ef 100644 --- a/server/text_generation_server/models/__init__.py +++ b/server/text_generation_server/models/__init__.py @@ -931,10 +931,10 @@ def get_model( trust_remote_code=trust_remote_code, lora_adapter_ids=lora_adapter_ids, ) - elif sharded: - raise NotImplementedError( - FLASH_ATT_ERROR_MESSAGE.format(f"Sharded {model_type}") - ) + # elif sharded: + # raise NotImplementedError( + # FLASH_ATT_ERROR_MESSAGE.format(f"Sharded {model_type}") + # ) else: return transformers_causal_lm_class.fallback( model_id, diff --git a/server/text_generation_server/models/flash_causal_lm.py b/server/text_generation_server/models/flash_causal_lm.py index d097c54fc..1073f4f9c 100644 --- a/server/text_generation_server/models/flash_causal_lm.py +++ b/server/text_generation_server/models/flash_causal_lm.py @@ -1571,7 +1571,7 @@ class FlashCausalLM(Model): real_free_memory = get_free_memory(self.device, MEMORY_FRACTION) log_master( logger.debug, - f"Free memory {free_memory/1e9:.2f}GB , (real: {real_free_memory/1e9:.2f}GB", + f"Free memory 
{free_memory / 1e9:.2f}GB , (real: {real_free_memory / 1e9:.2f}GB", ) _, _batch, _ = self.generate_token(batch) diff --git a/server/text_generation_server/models/transformers_flash_causal_lm.py b/server/text_generation_server/models/transformers_flash_causal_lm.py index 647fabc21..c2669dbaf 100644 --- a/server/text_generation_server/models/transformers_flash_causal_lm.py +++ b/server/text_generation_server/models/transformers_flash_causal_lm.py @@ -3,7 +3,7 @@ from typing import List, Optional import torch from opentelemetry import trace -from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig +from transformers import AutoTokenizer, AutoModelForCausalLM import transformers.modeling_utils from text_generation_server.models.flash_causal_lm import FlashCausalLM @@ -36,9 +36,7 @@ def tgi_flash_attention_forward( softcap: Optional[float] = None, **kwargs, # This is needed to "absorb" other args passed by Transformers modeling ): - kv_cache = kv_cache[module.layer_idx] - query_states = query_states.transpose(1, 2).squeeze(dim=0) key_states = key_states.transpose(1, 2).squeeze(dim=0) value_states = value_states.transpose(1, 2).squeeze(dim=0) @@ -95,7 +93,6 @@ class TransformersFlashCausalLM(FlashCausalLM): default_dtype=torch.float16, trust_remote_code: bool = False, tokenizer_class=AutoTokenizer, - config_class=AutoConfig, kv_cache_dtype: Optional[torch.dtype] = None, ): self.quantize = quantize @@ -105,17 +102,17 @@ class TransformersFlashCausalLM(FlashCausalLM): raise RuntimeError("Speculator decoding is not enabled for AutoModel") if torch.cuda.is_available(): - device = torch.device("cuda:0") - dtype = torch.float16 if dtype is None else dtype + device = torch.device(f"cuda:{rank}") + dtype = default_dtype if dtype is None else dtype elif hasattr(torch, "xpu") and torch.xpu.is_available(): device = torch.device("xpu") - dtype = torch.float16 if dtype is None else dtype + dtype = default_dtype if dtype is None else dtype else: raise ValueError( "Flash `Transformers` modeling backend is not available on cpu." 
) - tokenizer = AutoTokenizer.from_pretrained( + tokenizer = tokenizer_class.from_pretrained( model_id, revision=revision, padding_side="left", @@ -126,10 +123,10 @@ class TransformersFlashCausalLM(FlashCausalLM): model_id, revision=revision, torch_dtype=dtype, - device_map="auto", load_in_8bit=quantize == "bitsandbytes", trust_remote_code=trust_remote_code, attn_implementation="tgi", + device_map=device if world_size == 1 else None, tp_plan="auto" if world_size > 1 else None, ) @@ -261,6 +258,6 @@ class TransformersFlashCausalLM(FlashCausalLM): # To update with full Transformers support asap if lm_head_indices is not None: hidden_states = hidden_states[lm_head_indices] - logits = self.model.lm_head.forward(hidden_states) + logits = self.model.lm_head(hidden_states) return logits, None diff --git a/server/text_generation_server/server.py b/server/text_generation_server/server.py index 45b48df86..935e09856 100644 --- a/server/text_generation_server/server.py +++ b/server/text_generation_server/server.py @@ -68,9 +68,9 @@ class TextGenerationService(generate_pb2_grpc.TextGenerationServiceServicer): self.quantize = model.quantize self.server_urls = server_urls # For some reason, inference_mode does not work well with GLOO which we use on CPU - if model.device.type == "cuda": - # Force inference mode for the lifetime of TextGenerationService - self._inference_mode_raii_guard = torch._C._InferenceMode(True) + # if model.device.type == "cuda": + # # Force inference mode for the lifetime of TextGenerationService + # self._inference_mode_raii_guard = torch._C._InferenceMode(True) async def Info(self, request, context): return self.model.info From 18c4607d463658f5bb78b3146b5ac6a513b99ac4 Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Thu, 23 Jan 2025 18:09:57 +0100 Subject: [PATCH 053/183] Transformers backend TP fix (#2945) * init dispatch * cohere fix --- .../text_generation_server/models/__init__.py | 182 +++++++++++++----- .../models/transformers_flash_causal_lm.py | 7 + 2 files changed, 140 insertions(+), 49 deletions(-) diff --git a/server/text_generation_server/models/__init__.py b/server/text_generation_server/models/__init__.py index dd116c9ef..eb5a8de7b 100644 --- a/server/text_generation_server/models/__init__.py +++ b/server/text_generation_server/models/__init__.py @@ -391,21 +391,6 @@ def get_model( ) model_type = config_dict.get("model_type", None) - transformers_causal_lm_class = CausalLM - - # Fast transformers path - transformers_model_class = getattr( - transformers, - modeling_auto.MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.get(model_type, ""), - None, - ) - if ( - FLASH_TRANSFORMERS_BACKEND - and transformers_model_class is not None - and transformers_model_class._supports_flex_attn - ): - transformers_causal_lm_class = TransformersFlashCausalLM - quantization_config = config_dict.get("quantization_config", None) if quantization_config is None: quantization_config = config_dict.get("compression_config", None) @@ -649,7 +634,7 @@ def get_model( FLASH_ATT_ERROR_MESSAGE.format("Sharded Deepseek V2") ) else: - return transformers_causal_lm_class.fallback( + return CausalLM.fallback( model_id, revision, quantize=quantize, @@ -756,7 +741,7 @@ def get_model( except RuntimeError as e: # Lots of legacy models with various weight names. 
log_master(logger.warning, f"Couldn't load flash gpt2 variant: {e}") - return transformers_causal_lm_class.fallback( + return CausalLM.fallback( model_id, revision, quantize=quantize, @@ -767,7 +752,7 @@ def get_model( elif sharded: raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Sharded GPT-2")) else: - return transformers_causal_lm_class.fallback( + return CausalLM.fallback( model_id, revision, quantize=quantize, @@ -792,7 +777,7 @@ def get_model( except RuntimeError as e: # Lots of legacy models with various weight names. log_master(logger.warning, f"Couldn't load flash gptj variant: {e}") - return transformers_causal_lm_class.fallback( + return CausalLM.fallback( model_id, revision, quantize=quantize, @@ -803,7 +788,7 @@ def get_model( elif sharded: raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Sharded GPT-J")) else: - return transformers_causal_lm_class.fallback( + return CausalLM.fallback( model_id, revision, quantize=quantize, @@ -840,7 +825,7 @@ def get_model( trust_remote_code=trust_remote_code, ) else: - return transformers_causal_lm_class.fallback( + return CausalLM.fallback( model_id, revision, quantize=quantize, @@ -863,7 +848,7 @@ def get_model( lora_adapter_ids=lora_adapter_ids, ) else: - return transformers_causal_lm_class.fallback( + return TransformersFlashCausalLM.fallback( model_id, revision, quantize=quantize, @@ -887,7 +872,7 @@ def get_model( lora_adapter_ids=lora_adapter_ids, ) else: - return transformers_causal_lm_class.fallback( + return CausalLM.fallback( model_id, revision, quantize=quantize, @@ -913,12 +898,7 @@ def get_model( trust_remote_code=trust_remote_code, ) - elif ( - model_type == LLAMA - or model_type == BAICHUAN - or model_type == PHI3 - or model_type == GRANITE - ): + elif model_type == LLAMA or model_type == PHI3 or model_type == GRANITE: if FLASH_ATTENTION: return FlashCausalLM( model_id=model_id, @@ -931,12 +911,8 @@ def get_model( trust_remote_code=trust_remote_code, lora_adapter_ids=lora_adapter_ids, ) - # elif sharded: - # raise NotImplementedError( - # FLASH_ATT_ERROR_MESSAGE.format(f"Sharded {model_type}") - # ) - else: - return transformers_causal_lm_class.fallback( + elif FLASH_TRANSFORMERS_BACKEND: + return TransformersFlashCausalLM.fallback( model_id, revision, quantize=quantize, @@ -944,6 +920,47 @@ def get_model( dtype=dtype, trust_remote_code=trust_remote_code, ) + elif sharded: + raise NotImplementedError( + FLASH_ATT_ERROR_MESSAGE.format(f"Sharded {model_type}") + ) + else: + return CausalLM.fallback( + model_id, + revision, + quantize=quantize, + speculator=speculator, + dtype=dtype, + trust_remote_code=trust_remote_code, + ) + + elif model_type == BAICHUAN: + if FLASH_ATTENTION: + return FlashCausalLM( + model_id=model_id, + model_class=FlashLlamaForCausalLM, + revision=revision, + quantize=quantize, + speculator=speculator, + dtype=dtype, + kv_cache_dtype=kv_cache_dtype, + trust_remote_code=trust_remote_code, + lora_adapter_ids=lora_adapter_ids, + ) + elif sharded: + raise NotImplementedError( + FLASH_ATT_ERROR_MESSAGE.format(f"Sharded {model_type}") + ) + else: + return CausalLM.fallback( + model_id, + revision, + quantize=quantize, + speculator=speculator, + dtype=dtype, + trust_remote_code=trust_remote_code, + ) + if model_type == GEMMA: if FLASH_ATTENTION: return FlashCausalLM( @@ -959,10 +976,19 @@ def get_model( trust_remote_code=trust_remote_code, lora_adapter_ids=lora_adapter_ids, ) + elif FLASH_TRANSFORMERS_BACKEND: + return TransformersFlashCausalLM.fallback( + model_id, + revision, + 
quantize=quantize, + speculator=speculator, + dtype=dtype, + trust_remote_code=trust_remote_code, + ) elif sharded: raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Sharded Gemma")) else: - return transformers_causal_lm_class.fallback( + return CausalLM.fallback( model_id, revision, quantize=quantize, @@ -988,7 +1014,7 @@ def get_model( elif sharded: raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Sharded Gemma2")) else: - return transformers_causal_lm_class.fallback( + return CausalLM.fallback( model_id, revision, quantize=quantize, @@ -1010,10 +1036,19 @@ def get_model( trust_remote_code=trust_remote_code, lora_adapter_ids=lora_adapter_ids, ) + elif FLASH_TRANSFORMERS_BACKEND: + return TransformersFlashCausalLM.fallback( + model_id, + revision, + quantize=quantize, + speculator=speculator, + dtype=dtype, + trust_remote_code=trust_remote_code, + ) elif sharded: raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Sharded Cohere")) else: - return transformers_causal_lm_class.fallback( + return CausalLM.fallback( model_id, revision, quantize=quantize, @@ -1041,7 +1076,7 @@ def get_model( elif sharded: raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Sharded DBRX")) else: - return transformers_causal_lm_class.fallback( + return CausalLM.fallback( model_id, revision, quantize=quantize, @@ -1091,7 +1126,7 @@ def get_model( config_class=RWConfig, ) else: - return transformers_causal_lm_class.fallback( + return CausalLM.fallback( model_id, revision, quantize=quantize, @@ -1113,10 +1148,19 @@ def get_model( trust_remote_code=trust_remote_code, lora_adapter_ids=lora_adapter_ids, ) + elif FLASH_TRANSFORMERS_BACKEND: + return TransformersFlashCausalLM.fallback( + model_id, + revision, + quantize=quantize, + speculator=speculator, + dtype=dtype, + trust_remote_code=trust_remote_code, + ) elif sharded: raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Sharded Mistral")) else: - return transformers_causal_lm_class.fallback( + return CausalLM.fallback( model_id, revision, quantize=quantize, @@ -1138,10 +1182,19 @@ def get_model( trust_remote_code=trust_remote_code, lora_adapter_ids=lora_adapter_ids, ) + elif FLASH_TRANSFORMERS_BACKEND: + return TransformersFlashCausalLM.fallback( + model_id, + revision, + quantize=quantize, + speculator=speculator, + dtype=dtype, + trust_remote_code=trust_remote_code, + ) elif sharded: raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Sharded Mixtral")) else: - return transformers_causal_lm_class.fallback( + return CausalLM.fallback( model_id, revision, quantize=quantize, @@ -1163,12 +1216,21 @@ def get_model( trust_remote_code=trust_remote_code, lora_adapter_ids=lora_adapter_ids, ) + elif FLASH_TRANSFORMERS_BACKEND: + return TransformersFlashCausalLM.fallback( + model_id, + revision, + quantize=quantize, + speculator=speculator, + dtype=dtype, + trust_remote_code=trust_remote_code, + ) elif sharded: raise NotImplementedError( FLASH_ATT_ERROR_MESSAGE.format("Sharded Starcoder2") ) else: - return transformers_causal_lm_class.fallback( + return CausalLM.fallback( model_id, revision, quantize=quantize, @@ -1190,10 +1252,19 @@ def get_model( trust_remote_code=trust_remote_code, lora_adapter_ids=lora_adapter_ids, ) + elif FLASH_TRANSFORMERS_BACKEND: + return TransformersFlashCausalLM.fallback( + model_id, + revision, + quantize=quantize, + speculator=speculator, + dtype=dtype, + trust_remote_code=trust_remote_code, + ) elif sharded: raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Sharded Qwen2")) else: - return 
transformers_causal_lm_class.fallback( + return CausalLM.fallback( model_id, revision, quantize=quantize, @@ -1339,8 +1410,6 @@ def get_model( else: raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("LlavaNext")) - if sharded: - raise NotImplementedError("sharded is not supported for AutoModel") if quantize == "gptq": raise NotImplementedError( "gptq quantization is not supported for AutoModel, you can try to quantize it with `text-generation-server quantize ORIGINAL_MODEL_ID NEW_MODEL_ID`" @@ -1353,8 +1422,19 @@ def get_model( raise NotImplementedError("Eetq quantization is not supported for AutoModel") elif quantize == "exl2": raise NotImplementedError("exl2 quantization is not supported for AutoModel") - if model_type in modeling_auto.MODEL_FOR_CAUSAL_LM_MAPPING_NAMES: - return transformers_causal_lm_class.fallback( + + # Fast transformers if available + transformers_model_class = getattr( + transformers, + modeling_auto.MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.get(model_type, ""), + None, + ) + if ( + FLASH_TRANSFORMERS_BACKEND + and transformers_model_class is not None + and transformers_model_class._supports_flex_attn + ): + return TransformersFlashCausalLM.fallback( model_id, revision, quantize=quantize, @@ -1362,6 +1442,10 @@ def get_model( dtype=dtype, trust_remote_code=trust_remote_code, ) + + if sharded: + raise NotImplementedError("sharded is not supported for AutoModel") + if model_type in modeling_auto.MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES: return Seq2SeqLM.fallback( model_id, @@ -1375,7 +1459,7 @@ def get_model( auto_map = config_dict.get("auto_map", None) if trust_remote_code and auto_map is not None: if "AutoModelForCausalLM" in auto_map.keys(): - return transformers_causal_lm_class.fallback( + return CausalLM.fallback( model_id, revision, quantize=quantize, diff --git a/server/text_generation_server/models/transformers_flash_causal_lm.py b/server/text_generation_server/models/transformers_flash_causal_lm.py index c2669dbaf..36de89b4b 100644 --- a/server/text_generation_server/models/transformers_flash_causal_lm.py +++ b/server/text_generation_server/models/transformers_flash_causal_lm.py @@ -260,4 +260,11 @@ class TransformersFlashCausalLM(FlashCausalLM): hidden_states = hidden_states[lm_head_indices] logits = self.model.lm_head(hidden_states) + # For Granite while next transformers version is released and we can use `lm_head_indices` natively + if hasattr(self.model.config, "logits_scaling"): + logits = logits / self.model.config.logits_scaling + # For Cohere for similar reasons + elif hasattr(self.model, "logit_scale"): + logits = logits * self.model.logit_scale + return logits, None From d937eb64da04cfc82ffa7beb54066733ce857117 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Thu, 23 Jan 2025 18:54:34 +0100 Subject: [PATCH 054/183] Fixing cargo lock. --- Cargo.lock | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e63d15407..af3e19027 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1,6 +1,6 @@ # This file is automatically @generated by Cargo. # It is not intended for manual editing. 
-version = 3 +version = 4 [[package]] name = "addr2line" @@ -1544,7 +1544,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" dependencies = [ "ahash", - "allocator-api2", ] [[package]] @@ -2187,9 +2186,9 @@ checksum = "03087c2bad5e1034e8cace5926dec053fb3790248370865f5117a7d0213354c8" [[package]] name = "libc" -version = "0.2.164" +version = "0.2.169" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "433bfe06b8c75da9b2e3fbea6e5329ff87748f0b144ef75306e674c3f6f7c13f" +checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a" [[package]] name = "libfuzzer-sys" @@ -4431,7 +4430,7 @@ dependencies = [ "cmake", "cxx", "cxx-build", - "hashbrown 0.14.5", + "hashbrown 0.15.1", "hf-hub", "pkg-config", "pyo3", @@ -4791,9 +4790,9 @@ dependencies = [ [[package]] name = "tokio" -version = "1.41.1" +version = "1.43.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22cfb5bee7a6a52939ca9224d6ac897bb669134078daa8735560897f69de4d33" +checksum = "3d61fa4ffa3de412bfea335c6ecff681de2b609ba3c77ef3e00e521813a9ed9e" dependencies = [ "backtrace", "bytes", @@ -4819,9 +4818,9 @@ dependencies = [ [[package]] name = "tokio-macros" -version = "2.4.0" +version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "693d596312e88961bc67d7f1f97af8a70227d9f90c31bba5806eec004978d752" +checksum = "6e06d43f1345a3bcd39f6a56dbb7dcab2ba47e68e8ac134855e7e2bdbaf8cab8" dependencies = [ "proc-macro2", "quote", @@ -4862,9 +4861,9 @@ dependencies = [ [[package]] name = "tokio-stream" -version = "0.1.16" +version = "0.1.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4f4e6ce100d0eb49a2734f8c0812bcd324cf357d21810932c5df6b96ef2b86f1" +checksum = "eca58d7bba4a75707817a2c44174253f9236b2d5fbd055602e9d5c07c139a047" dependencies = [ "futures-core", "pin-project-lite", From d9dda117266389e2755099a0eac7f2d7c69d6dca Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Fri, 24 Jan 2025 09:32:17 +0100 Subject: [PATCH 055/183] Trying to put back the archlist (to fix the oom). (#2947) --- server/Makefile-flashinfer | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/Makefile-flashinfer b/server/Makefile-flashinfer index f6cbb5720..093b895bf 100644 --- a/server/Makefile-flashinfer +++ b/server/Makefile-flashinfer @@ -3,4 +3,4 @@ install-flashinfer: # `pip install flashinfer` cannot resolve it. pip install fsspec sympy==1.13.1 numpy pip install -U setuptools - FLASHINFER_ENABLE_AOT=1 pip install git+https://github.com/flashinfer-ai/flashinfer.git@v0.2.0.post1#egg=flashinfer --no-build-isolation + TORCH_CUDA_ARCH_LIST="8.0;8.6;8.9;9.0+PTX" FLASHINFER_ENABLE_AOT=1 pip install git+https://github.com/flashinfer-ai/flashinfer.git@v0.2.0.post1#egg=flashinfer --no-build-isolation From d2ff68e98d91eb71bdccfb533e099a62577b31e0 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Fri, 24 Jan 2025 12:18:28 +0100 Subject: [PATCH 056/183] Remove AWS credentials? 
--- .github/workflows/build.yaml | 8 -------- 1 file changed, 8 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 27907e397..39aa511a2 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -45,14 +45,6 @@ jobs: run: | echo "TENSORRT_LLM_VERSION=$(grep -oP '([a-z,0-9]{40})' $GITHUB_WORKSPACE/backends/trtllm/cmake/trtllm.cmake)" >> $GITHUB_ENV echo "TensorRT-LLM version: ${{ env.TENSORRT_LLM_VERSION }}" - - name: "Configure AWS Credentials" - id: aws-creds - uses: aws-actions/configure-aws-credentials@v4 - with: - aws-region: us-east-1 - role-to-assume: ${{ secrets.AWS_ROLE_GITHUB_TGI_TEST }} - role-duration-seconds: 7200 - output-credentials: true - name: Construct harware variables shell: bash run: | From 6cb41a80a1151f8242a66c9362debfad3598f792 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Fri, 24 Jan 2025 14:34:17 +0100 Subject: [PATCH 057/183] Revert "Remove AWS credentials?" This reverts commit d2ff68e98d91eb71bdccfb533e099a62577b31e0. --- .github/workflows/build.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 39aa511a2..27907e397 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -45,6 +45,14 @@ jobs: run: | echo "TENSORRT_LLM_VERSION=$(grep -oP '([a-z,0-9]{40})' $GITHUB_WORKSPACE/backends/trtllm/cmake/trtllm.cmake)" >> $GITHUB_ENV echo "TensorRT-LLM version: ${{ env.TENSORRT_LLM_VERSION }}" + - name: "Configure AWS Credentials" + id: aws-creds + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: us-east-1 + role-to-assume: ${{ secrets.AWS_ROLE_GITHUB_TGI_TEST }} + role-duration-seconds: 7200 + output-credentials: true - name: Construct harware variables shell: bash run: | From 40b00275b248a4a8a6fdb25430fe7f7917eb4bd1 Mon Sep 17 00:00:00 2001 From: Funtowicz Morgan Date: Mon, 27 Jan 2025 11:21:48 +0100 Subject: [PATCH 058/183] Attempt to remove AWS S3 flaky cache for sccache (#2953) * backend(trtllm): attempt to remove AWS S3 flaky cache for sccache * backend(trtllm): what if we expose ENV instead of inline? * backend(trtllm): and with the right env var for gha sccache * backend(trtllm): relax the way to detect sccache * backend(trtllm): make sccache definition manually * backend(trtllm): ok let's try to define the launchers in build.rs when rustc_wrapper is present * backend(trtllm): export env variable in run mb? 
* backend(trtllm): Cache mode max to cache intermediate layers * backend(trtllm): inject ompi_version build arg in dependent step --- .github/workflows/build.yaml | 47 ++++++++++--------------- Dockerfile_trtllm | 64 ++++++++++++++++------------------ backends/trtllm/CMakeLists.txt | 15 -------- backends/trtllm/build.rs | 9 +++-- 4 files changed, 53 insertions(+), 82 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 27907e397..73a55efe8 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -6,11 +6,11 @@ on: hardware: type: string description: Hardware - # options: - # - cuda - # - cuda-trtllm - # - rocm - # - intel + # options: + # - cuda + # - cuda-trtllm + # - rocm + # - intel required: true release-tests: description: "Run release integration tests" @@ -41,19 +41,18 @@ jobs: uses: actions/checkout@v4 - name: Inject slug/short variables uses: rlespinasse/github-slug-action@v4.4.1 + - name: Inject required variables for sccache to interact with Github Actions Cache + uses: actions/github-script@v7 + with: + script: | + core.exportVariable('ACTIONS_CACHE_URL', process.env.ACTIONS_CACHE_URL || ''); + core.exportVariable('ACTIONS_RUNTIME_TOKEN', process.env.ACTIONS_RUNTIME_TOKEN || ''); + - name: Extract TensorRT-LLM version run: | echo "TENSORRT_LLM_VERSION=$(grep -oP '([a-z,0-9]{40})' $GITHUB_WORKSPACE/backends/trtllm/cmake/trtllm.cmake)" >> $GITHUB_ENV echo "TensorRT-LLM version: ${{ env.TENSORRT_LLM_VERSION }}" - - name: "Configure AWS Credentials" - id: aws-creds - uses: aws-actions/configure-aws-credentials@v4 - with: - aws-region: us-east-1 - role-to-assume: ${{ secrets.AWS_ROLE_GITHUB_TGI_TEST }} - role-duration-seconds: 7200 - output-credentials: true - - name: Construct harware variables + - name: Construct hardware variables shell: bash run: | case ${{ inputs.hardware }} in @@ -75,9 +74,6 @@ jobs: export runs_on="ubuntu-latest" export platform="" export extra_pytest="" - export target="ci-runtime" - export sccache_s3_key_prefix="trtllm" - export sccache_region="us-east-1" export build_type="dev" ;; rocm) @@ -128,8 +124,6 @@ jobs: echo "EXTRA_PYTEST=${extra_pytest}" >> $GITHUB_ENV echo REGISTRY_MIRROR=$REGISTRY_MIRROR >> $GITHUB_ENV echo "TARGET=${target}" >> $GITHUB_ENV - echo "SCCACHE_S3_KEY_PREFIX=${sccache_s3_key_prefix}" >> $GITHUB_ENV - echo "SCCACHE_REGION=${sccache_region}" >> $GITHUB_ENV echo "BUILD_TYPE=${build_type}" >> $GITHUB_ENV - name: Initialize Docker Buildx uses: docker/setup-buildx-action@v3 @@ -196,17 +190,14 @@ jobs: DOCKER_LABEL=sha-${{ env.GITHUB_SHA_SHORT }}${{ env.LABEL }} PLATFORM=${{ env.PLATFORM }} build_type=${{ env.BUILD_TYPE }} - is_gha_build=true - aws_access_key_id=${{ steps.aws-creds.outputs.aws-access-key-id }} - aws_secret_access_key=${{ steps.aws-creds.outputs.aws-secret-access-key }} - aws_session_token=${{ steps.aws-creds.outputs.aws-session-token }} - sccache_bucket=${{ secrets.AWS_S3_BUCKET_GITHUB_TGI_TEST }} - sccache_s3_key_prefix=${{ env.SCCACHE_S3_KEY_PREFIX }} - sccache_region=${{ env.SCCACHE_REGION }} + sccache_gha_enabled=on + actions_cache_url=${{ env.ACTIONS_CACHE_URL }} + actions_runtime_token=${{ env.ACTIONS_RUNTIME_TOKEN }} + tags: ${{ steps.meta.outputs.tags || steps.meta-pr.outputs.tags }} labels: ${{ steps.meta.outputs.labels || steps.meta-pr.outputs.labels }} - cache-from: type=s3,region=us-east-1,bucket=ci-docker-buildx-cache,name=text-generation-inference-cache${{ env.LABEL }},mode=min,access_key_id=${{ secrets.S3_CI_DOCKER_BUILDX_CACHE_ACCESS_KEY_ID 
}},secret_access_key=${{ secrets.S3_CI_DOCKER_BUILDX_CACHE_SECRET_ACCESS_KEY }},mode=min - cache-to: type=s3,region=us-east-1,bucket=ci-docker-buildx-cache,name=text-generation-inference-cache${{ env.LABEL }},mode=min,access_key_id=${{ secrets.S3_CI_DOCKER_BUILDX_CACHE_ACCESS_KEY_ID }},secret_access_key=${{ secrets.S3_CI_DOCKER_BUILDX_CACHE_SECRET_ACCESS_KEY }},mode=min + cache-from: type=s3,region=us-east-1,bucket=ci-docker-buildx-cache,name=text-generation-inference-cache${{ env.LABEL }},mode=min,access_key_id=${{ secrets.S3_CI_DOCKER_BUILDX_CACHE_ACCESS_KEY_ID }},secret_access_key=${{ secrets.S3_CI_DOCKER_BUILDX_CACHE_SECRET_ACCESS_KEY }},mode=max + cache-to: type=s3,region=us-east-1,bucket=ci-docker-buildx-cache,name=text-generation-inference-cache${{ env.LABEL }},mode=min,access_key_id=${{ secrets.S3_CI_DOCKER_BUILDX_CACHE_ACCESS_KEY_ID }},secret_access_key=${{ secrets.S3_CI_DOCKER_BUILDX_CACHE_SECRET_ACCESS_KEY }},mode=max - name: Final id: final run: | diff --git a/Dockerfile_trtllm b/Dockerfile_trtllm index dd977a81f..6538996a5 100644 --- a/Dockerfile_trtllm +++ b/Dockerfile_trtllm @@ -1,7 +1,9 @@ ARG cuda_arch_list="75-real;80-real;86-real;89-real;90-real" -ARG ompi_version="4.1.7" ARG build_type=release -ARG is_gha_build=false +ARG ompi_version=4.1.7 +ARG sccache_gha_enabled=no +ARG actions_cache_url="" +ARG actions_runtime_token="" # CUDA dependent dependencies resolver stage FROM nvidia/cuda:12.6.3-cudnn-devel-ubuntu24.04 AS cuda-builder @@ -34,19 +36,19 @@ ENV TENSORRT_INSTALL_PREFIX=/usr/local/tensorrt # Install OpenMPI FROM cuda-builder AS mpi-builder -ARG ompi_version - -ENV OMPI_TARBALL_FILENAME="openmpi-$ompi_version.tar.bz2" -ADD --checksum=sha256:54a33cb7ad81ff0976f15a6cc8003c3922f0f3d8ceed14e1813ef3603f22cd34 \ - https://download.open-mpi.org/release/open-mpi/v4.1/$OMPI_TARBALL_FILENAME \ - /opt/src/mpi/ - WORKDIR /opt/src/mpi -RUN tar --strip-components=1 -xf $OMPI_TARBALL_FILENAME &&\ + +ARG ompi_version +ENV OMPI_VERSION=${ompi_version} +ENV OMPI_TARBALL_FILENAME=openmpi-${OMPI_VERSION}.tar.bz2 +ADD --checksum=sha256:54a33cb7ad81ff0976f15a6cc8003c3922f0f3d8ceed14e1813ef3603f22cd34 \ + https://download.open-mpi.org/release/open-mpi/v4.1/${OMPI_TARBALL_FILENAME} . + +RUN tar --strip-components=1 -xf ${OMPI_TARBALL_FILENAME} &&\ ./configure --prefix=/usr/local/mpi --with-cuda=/usr/local/cuda --with-slurm && \ make -j all && \ make install && \ - rm -rf "/opt/src/$OMPI_TARBALL_FILENAME" + rm -rf ${OMPI_TARBALL_FILENAME}/.. # Install TensorRT FROM cuda-builder AS trt-builder @@ -59,8 +61,11 @@ FROM cuda-builder AS tgi-builder WORKDIR /usr/src/text-generation-inference # Scoped global args reuse -ARG is_gha_build +ARG cuda_arch_list ARG build_type +ARG sccache_gha_enabled +ARG actions_cache_url +ARG actions_runtime_token # Install Rust ENV PATH="/root/.cargo/bin:$PATH" @@ -69,28 +74,17 @@ RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | bash -s -- -y && chmod -R a+w /root/.cargo && \ cargo install sccache --locked -# SCCACHE Specifics args - before finding a better, more generic, way... 
-ARG aws_access_key_id -ARG aws_secret_access_key -ARG aws_session_token -ARG sccache_bucket -ARG sccache_s3_key_prefix -ARG sccache_region - -ENV AWS_ACCESS_KEY_ID=$aws_access_key_id -ENV AWS_SECRET_ACCESS_KEY=$aws_secret_access_key -ENV AWS_SESSION_TOKEN=$aws_session_token -ENV SCCACHE_BUCKET=$sccache_bucket -ENV SCCACHE_S3_KEY_PREFIX=$sccache_s3_key_prefix -ENV SCCACHE_REGION=$sccache_region - ENV LD_LIBRARY_PATH="/usr/local/mpi/lib:$LD_LIBRARY_PATH" -ENV PKG_CONFIG_PATH="/usr/local/mpi/lib/pkgconfig:$PKG_CONFIG_PATH" -ENV CMAKE_PREFIX_PATH="/usr/local/mpi:/usr/local/tensorrt:$CMAKE_PREFIX_PATH" +ENV PKG_CONFIG_PATH="/usr/local/mpi/lib/pkgconfig" +ENV CMAKE_PREFIX_PATH="/usr/local/mpi:/usr/local/tensorrt" ENV USE_LLD_LINKER=ON ENV CUDA_ARCH_LIST=${cuda_arch_list} -ENV IS_GHA_BUILD=${is_gha_build} + +# SCCACHE Specifics args - before finding a better, more generic, way... +ENV SCCACHE_GHA_ENABLED=${sccache_gha_enabled} +ENV ACTIONS_CACHE_URL=${actions_cache_url} +ENV ACTIONS_RUNTIME_TOKEN=${actions_runtime_token} COPY Cargo.lock Cargo.lock COPY Cargo.toml Cargo.toml @@ -102,10 +96,12 @@ COPY launcher launcher COPY --from=trt-builder /usr/local/tensorrt /usr/local/tensorrt COPY --from=mpi-builder /usr/local/mpi /usr/local/mpi -RUN mkdir $TGI_INSTALL_PREFIX && mkdir "$TGI_INSTALL_PREFIX/include" && mkdir "$TGI_INSTALL_PREFIX/lib" && \ - python3 backends/trtllm/scripts/setup_sccache.py --is-gha-build ${is_gha_build} && \ - CMAKE_INSTALL_PREFIX=$TGI_INSTALL_PREFIX \ - RUSTC_WRAPPER=sccache \ +ENV RUSTC_WRAPPER=sccache +ENV CMAKE_INSTALL_PREFIX=$TGI_INSTALL_PREFIX +RUN export CMAKE_C_COMPILER_LAUNCHER=sccache && \ + export CMAKE_CXX_COMPILER_LAUNCHER=sccache && \ + export CMAKE_CUDA_COMPILER_LAUNCHER=sccache && \ + mkdir $TGI_INSTALL_PREFIX && mkdir "$TGI_INSTALL_PREFIX/include" && mkdir "$TGI_INSTALL_PREFIX/lib" && \ cargo build --profile ${build_type} --package text-generation-backends-trtllm --bin text-generation-backends-trtllm && \ sccache --show-stats diff --git a/backends/trtllm/CMakeLists.txt b/backends/trtllm/CMakeLists.txt index 8388f113f..26af80be3 100644 --- a/backends/trtllm/CMakeLists.txt +++ b/backends/trtllm/CMakeLists.txt @@ -1,20 +1,5 @@ cmake_minimum_required(VERSION 3.20) -if (NOT DEFINED CMAKE_CXX_COMPILER_LAUNCHER) - find_program(CCACHE_EXECUTABLE "ccache") - if (CCACHE_EXECUTABLE) - message(STATUS "Using ccache") - set(CMAKE_C_COMPILER_LAUNCHER "${CCACHE_EXECUTABLE}") - set(CMAKE_CXX_COMPILER_LAUNCHER "${CCACHE_EXECUTABLE}") - set(CMAKE_CUDA_COMPILER_LAUNCHER "${CCACHE_EXECUTABLE}") - endif () -else () - message(STATUS "Using user specified cmake cxx compiler launcher: ${CMAKE_CXX_COMPILER_LAUNCHER}") - set(CMAKE_C_COMPILER_LAUNCHER "${CMAKE_CXX_COMPILER_LAUNCHER}") - set(CMAKE_CXX_COMPILER_LAUNCHER "${CMAKE_CXX_COMPILER_LAUNCHER}") - set(CMAKE_CUDA_COMPILER_LAUNCHER "${CMAKE_CXX_COMPILER_LAUNCHER}") -endif () - if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.24.0") cmake_policy(SET CMP0135 NEW) endif () diff --git a/backends/trtllm/build.rs b/backends/trtllm/build.rs index 8b041860b..4d559fd48 100644 --- a/backends/trtllm/build.rs +++ b/backends/trtllm/build.rs @@ -14,7 +14,7 @@ const TENSORRT_ROOT_DIR: Option<&str> = option_env!("TENSORRT_ROOT_DIR"); const NCCL_ROOT_DIR: Option<&str> = option_env!("NCCL_ROOT_DIR"); const IS_GHA_BUILD: LazyLock = LazyLock::new(|| { - option_env!("IS_GHA_BUILD").map_or(false, |value| match value.to_lowercase().as_str() { + option_env!("SCCACHE_GHA_ENABLED").map_or(false, |value| match value.to_lowercase().as_str() { "on" => true, "true" => 
true, "1" => true, @@ -138,10 +138,9 @@ fn build_backend(is_debug: bool, opt_level: &str, out_dir: &PathBuf) -> (PathBuf if let Some(wrapper) = option_env!("RUSTC_WRAPPER") { println!("cargo:warning=Using caching tool: {wrapper}"); - - env::set_var("CMAKE_C_COMPILER_LAUNCHER", wrapper); - env::set_var("CMAKE_CXX_COMPILER_LAUNCHER", wrapper); - env::set_var("CMAKE_CUDA_COMPILER_LAUNCHER", wrapper); + config.define("CMAKE_C_COMPILER_LAUNCHER", wrapper); + config.define("CMAKE_CXX_COMPILER_LAUNCHER", wrapper); + config.define("CMAKE_CUDA_COMPILER_LAUNCHER", wrapper); } // Allow to override which Python to use ... From db922eb77e34eb9df7e4102148ba089760398ccb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Mon, 27 Jan 2025 11:42:36 +0100 Subject: [PATCH 059/183] Update to attention-kernels 0.2.0 (#2950) This version removes our patches/custom API. Makes it simpler to get changes from upstream. One of which is that we can enable FP8 KV cache for paged attention as well. --- flake.lock | 13 +- flake.nix | 2 +- server/pyproject.toml | 7 +- .../layers/attention/cuda.py | 16 +- .../layers/attention/kv_cache.py | 11 +- server/uv.lock | 184 ++++++------------ 6 files changed, 87 insertions(+), 146 deletions(-) diff --git a/flake.lock b/flake.lock index ce23c5e45..6bf4ba2f8 100644 --- a/flake.lock +++ b/flake.lock @@ -853,11 +853,11 @@ ] }, "locked": { - "lastModified": 1737512878, - "narHash": "sha256-dgF6htdmfNnZzVInifks6npnCAyVsIHWSpWNs10RSW0=", + "lastModified": 1737685583, + "narHash": "sha256-p+NVABRpGi+pT+xxf9HcLcFVxG6L+vEEy+NwzB9T0f8=", "owner": "oxalica", "repo": "rust-overlay", - "rev": "06b8ed0eee289fe94c66f1202ced9a6a2c59a14c", + "rev": "eb64cbcc8eee0fa87ebded92805280d2ec97415a", "type": "github" }, "original": { @@ -978,15 +978,16 @@ "nixpkgs": "nixpkgs_6" }, "locked": { - "lastModified": 1737540114, - "narHash": "sha256-ubowOFdG8pAodwuxzWHLIoQJtGXJTlb4RtISVVY3Tx0=", + "lastModified": 1737715219, + "narHash": "sha256-oIxoNreSeSILjWxcZHXW3cdcoNQHnXO5deXoIiC1tng=", "owner": "huggingface", "repo": "text-generation-inference-nix", - "rev": "d18053189cc5ce4111250da841df101c8cdf13ff", + "rev": "b91a56628f446c6cb79d224f17c1c66fe1a260f6", "type": "github" }, "original": { "owner": "huggingface", + "ref": "attention-kernels-0.2.0", "repo": "text-generation-inference-nix", "type": "github" } diff --git a/flake.nix b/flake.nix index 83cedfa62..883bae91f 100644 --- a/flake.nix +++ b/flake.nix @@ -5,7 +5,7 @@ inputs.nixpkgs.follows = "tgi-nix/nixpkgs"; }; nix-filter.url = "github:numtide/nix-filter"; - tgi-nix.url = "github:huggingface/text-generation-inference-nix"; + tgi-nix.url = "github:huggingface/text-generation-inference-nix/attention-kernels-0.2.0"; nixpkgs.follows = "tgi-nix/nixpkgs"; flake-utils.url = "github:numtide/flake-utils"; rust-overlay = { diff --git a/server/pyproject.toml b/server/pyproject.toml index 129e9b64f..dbc1fa7ab 100644 --- a/server/pyproject.toml +++ b/server/pyproject.toml @@ -68,12 +68,7 @@ gen = [ ] [tool.uv.sources] -attention-kernels = [ - { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.5-cp39-cp39-linux_x86_64.whl", marker = "python_version == '3.9'" }, - { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.5-cp310-cp310-linux_x86_64.whl", marker = "python_version == '3.10'" }, - { url = 
"https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.5-cp311-cp311-linux_x86_64.whl", marker = "python_version == '3.11'" }, - { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.5-cp312-cp312-linux_x86_64.whl", marker = "python_version == '3.12'" }, -] +attention-kernels.url = "https://github.com/danieldk/attention-kernels/releases/download/v0.2.0.post2/attention_kernels-0.2.0.post2+cu123torch2.5-cp39-abi3-linux_x86_64.whl" marlin-kernels = [ { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp39-cp39-linux_x86_64.whl", marker = "python_version == '3.9'" }, { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp310-cp310-linux_x86_64.whl", marker = "python_version == '3.10'" }, diff --git a/server/text_generation_server/layers/attention/cuda.py b/server/text_generation_server/layers/attention/cuda.py index f1469b3fb..6b77f591c 100644 --- a/server/text_generation_server/layers/attention/cuda.py +++ b/server/text_generation_server/layers/attention/cuda.py @@ -111,6 +111,8 @@ def paged_attention( out = torch.empty_like(query) + kv_cache_dtype = "fp8" if kv_cache.dtype == torch.float8_e4m3fn else "auto" + use_v1 = max_s <= 8192 and ( max_num_partitions == 1 or num_seqs * num_heads > 512 ) @@ -120,15 +122,16 @@ def paged_attention( query, kv_cache.key, kv_cache.value, - kv_head_mapping, + kv_cache.key.shape[1], softmax_scale, block_tables, input_lengths, block_size, max_s, None, - "auto", - 1.0, + kv_cache_dtype, + kv_scales.key_scale_cpu, + kv_scales.value_scale_cpu, ) else: # Run PagedAttention V2. @@ -153,15 +156,16 @@ def paged_attention( query, kv_cache.key, kv_cache.value, - kv_head_mapping, + kv_cache.key.shape[1], softmax_scale, block_tables, input_lengths, block_size, max_s, None, - "auto", - 1.0, + kv_cache_dtype, + kv_scales.key_scale_cpu, + kv_scales.value_scale_cpu, ) return out diff --git a/server/text_generation_server/layers/attention/kv_cache.py b/server/text_generation_server/layers/attention/kv_cache.py index 77e761a3c..bca7959bd 100644 --- a/server/text_generation_server/layers/attention/kv_cache.py +++ b/server/text_generation_server/layers/attention/kv_cache.py @@ -55,10 +55,10 @@ class KVCache: if dtype in {torch.float8_e5m2, torch.float8_e4m3fn}: if not ( (ATTENTION == "flashinfer" and SYSTEM == "cuda") - or (ATTENTION == "paged" and SYSTEM == "rocm") + or (ATTENTION == "paged" and SYSTEM in ("cuda", "rocm")) ): raise ValueError( - "FP8 KV cache is currently only supported for flashinfer on CUDA and paged attention on ROCm. " + "FP8 KV cache is currently only supported for flashinfer on CUDA and paged attention on CUDA and ROCm. " ) if SYSTEM == "rocm" and dtype == torch.float8_e5m2: raise ValueError( @@ -226,8 +226,13 @@ def paged_reshape_and_cache( raise ImportError( f"Could not import attention_kernels. Make sure your installation is correct. 
Complete error: {e}" ) + + kv_cache_dtype = "auto" + if key_cache.dtype == torch.float8_e4m3fn: + kv_cache_dtype = "fp8" + attention_kernels.reshape_and_cache( - key, value, key_cache, value_cache, slots, "auto", 1.0 + key, value, key_cache, value_cache, slots, kv_cache_dtype, k_scale, v_scale ) elif SYSTEM == "rocm": try: diff --git a/server/uv.lock b/server/uv.lock index 5e2c9d1ca..ef26b4638 100644 --- a/server/uv.lock +++ b/server/uv.lock @@ -170,79 +170,13 @@ wheels = [ [[package]] name = "attention-kernels" -version = "0.1.1" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.13'", -] +version = "0.2.0.post2" +source = { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.2.0.post2/attention_kernels-0.2.0.post2+cu123torch2.5-cp39-abi3-linux_x86_64.whl" } dependencies = [ - { name = "torch", marker = "python_full_version >= '3.13'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/0a/49/1847d04522158767065d24382b6fe2e0540a63f222113e273745f77ee2c5/attention-kernels-0.1.1.tar.gz", hash = "sha256:aff84a6e61e4720c14a2c2a62242b21c271d5d36bb6a0f0f017b762611b2d477", size = 49652 } - -[[package]] -name = "attention-kernels" -version = "0.1.1" -source = { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp310-cp310-linux_x86_64.whl" } -resolution-markers = [ - "python_full_version == '3.10.*'", -] -dependencies = [ - { name = "torch", marker = "python_full_version == '3.10.*'" }, + { name = "torch" }, ] wheels = [ - { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp310-cp310-linux_x86_64.whl", hash = "sha256:812851d4ce0f54ca764ff3815a731b15f0cb110115d0aa2d0997cd7794d808bb" }, -] - -[package.metadata] -requires-dist = [{ name = "torch" }] - -[[package]] -name = "attention-kernels" -version = "0.1.1" -source = { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp311-cp311-linux_x86_64.whl" } -resolution-markers = [ - "python_full_version == '3.11.*'", -] -dependencies = [ - { name = "torch", marker = "python_full_version == '3.11.*'" }, -] -wheels = [ - { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp311-cp311-linux_x86_64.whl", hash = "sha256:614c402621b11dd1f5741a016b9fd27cb6a68814471f2048bc05206923516268" }, -] - -[package.metadata] -requires-dist = [{ name = "torch" }] - -[[package]] -name = "attention-kernels" -version = "0.1.1" -source = { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp312-cp312-linux_x86_64.whl" } -resolution-markers = [ - "python_full_version == '3.12.*'", -] -dependencies = [ - { name = "torch", marker = "python_full_version == '3.12.*'" }, -] -wheels = [ - { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp312-cp312-linux_x86_64.whl", hash = "sha256:6b2ca7c98997431d5f6c4af7553dce6b1bff8dfdec374c97c6ffba71325a02b7" }, -] - -[package.metadata] -requires-dist = [{ name = "torch" }] - -[[package]] -name = "attention-kernels" -version = "0.1.1" -source = { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp39-cp39-linux_x86_64.whl" } -resolution-markers = [ - "python_full_version < '3.10'", -] -dependencies = [ - 
{ name = "torch", marker = "python_full_version < '3.10'" }, -] -wheels = [ - { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp39-cp39-linux_x86_64.whl", hash = "sha256:a56710c5626e461d6f628ae14b74ffc89833578ebd59c3c0c47f5d6f07461fbf" }, + { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.2.0.post2/attention_kernels-0.2.0.post2+cu123torch2.5-cp39-abi3-linux_x86_64.whl", hash = "sha256:863e02dda4b30e9d04ef6cf4d17d16c154f54bdcb8a8b87b8b46075eabf62d25" }, ] [package.metadata] @@ -987,7 +921,7 @@ sdist = { url = "https://files.pythonhosted.org/packages/b2/82/886d1eece474ef236 [[package]] name = "marlin-kernels" version = "0.3.7" -source = { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp310-cp310-linux_x86_64.whl" } +source = { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp310-cp310-linux_x86_64.whl" } resolution-markers = [ "python_full_version == '3.10.*'", ] @@ -995,7 +929,7 @@ dependencies = [ { name = "torch", marker = "python_full_version == '3.10.*'" }, ] wheels = [ - { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp310-cp310-linux_x86_64.whl", hash = "sha256:bb416d14623dc0ad0eeb2835446c37a41f994555f1baec8701de6d4c1fc17ec8" }, + { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp310-cp310-linux_x86_64.whl", hash = "sha256:dd91a4e2c3b5e954833c5c34b0322e4c02cd92a967eb94654b6bbcece131340b" }, ] [package.metadata] @@ -1004,7 +938,7 @@ requires-dist = [{ name = "torch" }] [[package]] name = "marlin-kernels" version = "0.3.7" -source = { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp311-cp311-linux_x86_64.whl" } +source = { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp311-cp311-linux_x86_64.whl" } resolution-markers = [ "python_full_version == '3.11.*'", ] @@ -1012,7 +946,7 @@ dependencies = [ { name = "torch", marker = "python_full_version == '3.11.*'" }, ] wheels = [ - { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp311-cp311-linux_x86_64.whl", hash = "sha256:a89bb61d718002d4432158641bce95c6fd68f9ee1a7d5402dd283903397f3185" }, + { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp311-cp311-linux_x86_64.whl", hash = "sha256:b24d92135fbd156c55ce43158ab4a90fa880ba0df965528895cf1870b03a64bf" }, ] [package.metadata] @@ -1021,7 +955,7 @@ requires-dist = [{ name = "torch" }] [[package]] name = "marlin-kernels" version = "0.3.7" -source = { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp312-cp312-linux_x86_64.whl" } +source = { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp312-cp312-linux_x86_64.whl" } resolution-markers = [ "python_full_version == '3.12.*'", ] @@ -1029,7 +963,7 @@ dependencies = [ { name = "torch", marker = "python_full_version == '3.12.*'" }, ] wheels = [ - { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp312-cp312-linux_x86_64.whl", hash = 
"sha256:ed938d196fc5e9cce9fc44cd2b889d5adc5ca7475c8a23858f1474d29e38bdbf" }, + { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp312-cp312-linux_x86_64.whl", hash = "sha256:8a407f1435a571a8d4ca3b9f533da83fde323043a9836b739cf8018c77782d49" }, ] [package.metadata] @@ -1038,7 +972,7 @@ requires-dist = [{ name = "torch" }] [[package]] name = "marlin-kernels" version = "0.3.7" -source = { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp39-cp39-linux_x86_64.whl" } +source = { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp39-cp39-linux_x86_64.whl" } resolution-markers = [ "python_full_version < '3.10'", ] @@ -1046,7 +980,7 @@ dependencies = [ { name = "torch", marker = "python_full_version < '3.10'" }, ] wheels = [ - { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp39-cp39-linux_x86_64.whl", hash = "sha256:113c54f68565ad476ca12366b4de92131fa3e9ddb16cbe8ad63272972a15ac28" }, + { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp39-cp39-linux_x86_64.whl", hash = "sha256:bf7003753c364c504b3998fffdfcf619a42ab04f908903dbad8d54347b6b142b" }, ] [package.metadata] @@ -1078,7 +1012,7 @@ sdist = { url = "https://files.pythonhosted.org/packages/9c/a1/76f32a7ce5b18e5b8 [[package]] name = "moe-kernels" version = "0.7.0" -source = { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp310-cp310-linux_x86_64.whl" } +source = { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.5-cp310-cp310-linux_x86_64.whl" } resolution-markers = [ "python_full_version == '3.10.*'", ] @@ -1088,7 +1022,7 @@ dependencies = [ { name = "triton", marker = "python_full_version == '3.10.*'" }, ] wheels = [ - { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp310-cp310-linux_x86_64.whl", hash = "sha256:f8c126395f11522881c6bf1f6120e3670822006a84e2ff74af561c22445746b3" }, + { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.5-cp310-cp310-linux_x86_64.whl", hash = "sha256:242d5de087902aa84dff54b6ba140b4066904fe5e7757f934645343b052ab076" }, ] [package.metadata] @@ -1101,7 +1035,7 @@ requires-dist = [ [[package]] name = "moe-kernels" version = "0.7.0" -source = { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp311-cp311-linux_x86_64.whl" } +source = { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.5-cp311-cp311-linux_x86_64.whl" } resolution-markers = [ "python_full_version == '3.11.*'", ] @@ -1111,7 +1045,7 @@ dependencies = [ { name = "triton", marker = "python_full_version == '3.11.*'" }, ] wheels = [ - { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp311-cp311-linux_x86_64.whl", hash = "sha256:2afff8346251f01d5d90bab738e3dfaa6b14a414a9c88205d396ab2bae87983a" }, + { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.5-cp311-cp311-linux_x86_64.whl", hash = "sha256:8198a3388a03a3248d5f5698097e8ce0a73b6a01f9854fc2338aacc57e554e8a" }, ] [package.metadata] @@ -1124,7 +1058,7 @@ requires-dist = [ 
[[package]] name = "moe-kernels" version = "0.7.0" -source = { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp312-cp312-linux_x86_64.whl" } +source = { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.5-cp312-cp312-linux_x86_64.whl" } resolution-markers = [ "python_full_version == '3.12.*'", ] @@ -1134,7 +1068,7 @@ dependencies = [ { name = "triton", marker = "python_full_version == '3.12.*'" }, ] wheels = [ - { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp312-cp312-linux_x86_64.whl", hash = "sha256:b1a29e33d3b7d85e2b4f8bd47db28211096d1f645e0868d5a1f3666ebb9bd9e3" }, + { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.5-cp312-cp312-linux_x86_64.whl", hash = "sha256:b35fb02ae560b560f4af107791a3308dc97d5ca57d39bab20acec3a0f082ccf2" }, ] [package.metadata] @@ -1147,7 +1081,7 @@ requires-dist = [ [[package]] name = "moe-kernels" version = "0.7.0" -source = { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp39-cp39-linux_x86_64.whl" } +source = { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.5-cp39-cp39-linux_x86_64.whl" } resolution-markers = [ "python_full_version < '3.10'", ] @@ -1157,7 +1091,7 @@ dependencies = [ { name = "triton", marker = "python_full_version < '3.10'" }, ] wheels = [ - { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp39-cp39-linux_x86_64.whl", hash = "sha256:9573611174cda9f6fafa1816521e38582fd2903b321bbaf78f83cf6e3189ac7d" }, + { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.5-cp39-cp39-linux_x86_64.whl", hash = "sha256:cf8d276deb7a4d40fed3eb02e1b6f8d08ccec0f4256260922af13927ca044f56" }, ] [package.metadata] @@ -1460,6 +1394,7 @@ name = "nvidia-cublas-cu12" version = "12.4.5.8" source = { registry = "https://pypi.org/simple" } wheels = [ + { url = "https://files.pythonhosted.org/packages/7f/7f/7fbae15a3982dc9595e49ce0f19332423b260045d0a6afe93cdbe2f1f624/nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_aarch64.whl", hash = "sha256:0f8aa1706812e00b9f19dfe0cdb3999b092ccb8ca168c0db5b8ea712456fd9b3", size = 363333771 }, { url = "https://files.pythonhosted.org/packages/ae/71/1c91302526c45ab494c23f61c7a84aa568b8c1f9d196efa5993957faf906/nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl", hash = "sha256:2fc8da60df463fdefa81e323eef2e36489e1c94335b5358bcb38360adf75ac9b", size = 363438805 }, ] @@ -1468,6 +1403,7 @@ name = "nvidia-cuda-cupti-cu12" version = "12.4.127" source = { registry = "https://pypi.org/simple" } wheels = [ + { url = "https://files.pythonhosted.org/packages/93/b5/9fb3d00386d3361b03874246190dfec7b206fd74e6e287b26a8fcb359d95/nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:79279b35cf6f91da114182a5ce1864997fd52294a87a16179ce275773799458a", size = 12354556 }, { url = "https://files.pythonhosted.org/packages/67/42/f4f60238e8194a3106d06a058d494b18e006c10bb2b915655bd9f6ea4cb1/nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:9dec60f5ac126f7bb551c055072b69d85392b13311fcc1bcda2202d172df30fb", size = 13813957 }, ] @@ -1476,6 +1412,7 @@ name = "nvidia-cuda-nvrtc-cu12" version = "12.4.127" source = { registry = "https://pypi.org/simple" } wheels = 
[ + { url = "https://files.pythonhosted.org/packages/77/aa/083b01c427e963ad0b314040565ea396f914349914c298556484f799e61b/nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:0eedf14185e04b76aa05b1fea04133e59f465b6f960c0cbf4e37c3cb6b0ea198", size = 24133372 }, { url = "https://files.pythonhosted.org/packages/2c/14/91ae57cd4db3f9ef7aa99f4019cfa8d54cb4caa7e00975df6467e9725a9f/nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:a178759ebb095827bd30ef56598ec182b85547f1508941a3d560eb7ea1fbf338", size = 24640306 }, ] @@ -1484,6 +1421,7 @@ name = "nvidia-cuda-runtime-cu12" version = "12.4.127" source = { registry = "https://pypi.org/simple" } wheels = [ + { url = "https://files.pythonhosted.org/packages/a1/aa/b656d755f474e2084971e9a297def515938d56b466ab39624012070cb773/nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:961fe0e2e716a2a1d967aab7caee97512f71767f852f67432d572e36cb3a11f3", size = 894177 }, { url = "https://files.pythonhosted.org/packages/ea/27/1795d86fe88ef397885f2e580ac37628ed058a92ed2c39dc8eac3adf0619/nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:64403288fa2136ee8e467cdc9c9427e0434110899d07c779f25b5c068934faa5", size = 883737 }, ] @@ -1506,6 +1444,7 @@ dependencies = [ { name = "nvidia-nvjitlink-cu12" }, ] wheels = [ + { url = "https://files.pythonhosted.org/packages/7a/8a/0e728f749baca3fbeffad762738276e5df60851958be7783af121a7221e7/nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_aarch64.whl", hash = "sha256:5dad8008fc7f92f5ddfa2101430917ce2ffacd86824914c82e28990ad7f00399", size = 211422548 }, { url = "https://files.pythonhosted.org/packages/27/94/3266821f65b92b3138631e9c8e7fe1fb513804ac934485a8d05776e1dd43/nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl", hash = "sha256:f083fc24912aa410be21fa16d157fed2055dab1cc4b6934a0e03cba69eb242b9", size = 211459117 }, ] @@ -1514,6 +1453,7 @@ name = "nvidia-curand-cu12" version = "10.3.5.147" source = { registry = "https://pypi.org/simple" } wheels = [ + { url = "https://files.pythonhosted.org/packages/80/9c/a79180e4d70995fdf030c6946991d0171555c6edf95c265c6b2bf7011112/nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_aarch64.whl", hash = "sha256:1f173f09e3e3c76ab084aba0de819c49e56614feae5c12f69883f4ae9bb5fad9", size = 56314811 }, { url = "https://files.pythonhosted.org/packages/8a/6d/44ad094874c6f1b9c654f8ed939590bdc408349f137f9b98a3a23ccec411/nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl", hash = "sha256:a88f583d4e0bb643c49743469964103aa59f7f708d862c3ddb0fc07f851e3b8b", size = 56305206 }, ] @@ -1527,6 +1467,7 @@ dependencies = [ { name = "nvidia-nvjitlink-cu12" }, ] wheels = [ + { url = "https://files.pythonhosted.org/packages/46/6b/a5c33cf16af09166845345275c34ad2190944bcc6026797a39f8e0a282e0/nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_aarch64.whl", hash = "sha256:d338f155f174f90724bbde3758b7ac375a70ce8e706d70b018dd3375545fc84e", size = 127634111 }, { url = "https://files.pythonhosted.org/packages/3a/e1/5b9089a4b2a4790dfdea8b3a006052cfecff58139d5a4e34cb1a51df8d6f/nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl", hash = "sha256:19e33fa442bcfd085b3086c4ebf7e8debc07cfe01e11513cc6d332fd918ac260", size = 127936057 }, ] @@ -1538,6 +1479,7 @@ dependencies = [ { name = "nvidia-nvjitlink-cu12" }, ] wheels = [ + { url = 
"https://files.pythonhosted.org/packages/96/a9/c0d2f83a53d40a4a41be14cea6a0bf9e668ffcf8b004bd65633f433050c0/nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_aarch64.whl", hash = "sha256:9d32f62896231ebe0480efd8a7f702e143c98cfaa0e8a76df3386c1ba2b54df3", size = 207381987 }, { url = "https://files.pythonhosted.org/packages/db/f7/97a9ea26ed4bbbfc2d470994b8b4f338ef663be97b8f677519ac195e113d/nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl", hash = "sha256:ea4f11a2904e2a8dc4b1833cc1b5181cde564edd0d5cd33e3c168eff2d1863f1", size = 207454763 }, ] @@ -1563,6 +1505,7 @@ name = "nvidia-nvjitlink-cu12" version = "12.4.127" source = { registry = "https://pypi.org/simple" } wheels = [ + { url = "https://files.pythonhosted.org/packages/02/45/239d52c05074898a80a900f49b1615d81c07fceadd5ad6c4f86a987c0bc4/nvidia_nvjitlink_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:4abe7fef64914ccfa909bc2ba39739670ecc9e820c83ccc7a6ed414122599b83", size = 20552510 }, { url = "https://files.pythonhosted.org/packages/ff/ff/847841bacfbefc97a00036e0fce5a0f086b640756dc38caea5e1bb002655/nvidia_nvjitlink_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:06b3b9b25bf3f8af351d664978ca26a16d2c5127dbd53c0497e28d1fb9611d57", size = 21066810 }, ] @@ -1571,6 +1514,7 @@ name = "nvidia-nvtx-cu12" version = "12.4.127" source = { registry = "https://pypi.org/simple" } wheels = [ + { url = "https://files.pythonhosted.org/packages/06/39/471f581edbb7804b39e8063d92fc8305bdc7a80ae5c07dbe6ea5c50d14a5/nvidia_nvtx_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:7959ad635db13edf4fc65c06a6e9f9e55fc2f92596db928d169c0bb031e88ef3", size = 100417 }, { url = "https://files.pythonhosted.org/packages/87/20/199b8713428322a2f22b722c62b8cc278cc53dffa9705d744484b5035ee9/nvidia_nvtx_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:781e950d9b9f60d8241ccea575b32f5105a5baf4c2351cab5256a24869f12a1a", size = 99144 }, ] @@ -2830,11 +2774,7 @@ accelerate = [ { name = "accelerate" }, ] attention = [ - { name = "attention-kernels", version = "0.1.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.13'" }, - { name = "attention-kernels", version = "0.1.1", source = { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp310-cp310-linux_x86_64.whl" }, marker = "python_full_version == '3.10.*'" }, - { name = "attention-kernels", version = "0.1.1", source = { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp311-cp311-linux_x86_64.whl" }, marker = "python_full_version == '3.11.*'" }, - { name = "attention-kernels", version = "0.1.1", source = { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp312-cp312-linux_x86_64.whl" }, marker = "python_full_version == '3.12.*'" }, - { name = "attention-kernels", version = "0.1.1", source = { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp39-cp39-linux_x86_64.whl" }, marker = "python_full_version < '3.10'" }, + { name = "attention-kernels" }, ] bnb = [ { name = "bitsandbytes" }, @@ -2852,17 +2792,17 @@ gen = [ ] marlin = [ { name = "marlin-kernels", version = "0.3.7", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.13'" }, - { name = "marlin-kernels", version = "0.3.7", source = { url = 
"https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp310-cp310-linux_x86_64.whl" }, marker = "python_full_version == '3.10.*'" }, - { name = "marlin-kernels", version = "0.3.7", source = { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp311-cp311-linux_x86_64.whl" }, marker = "python_full_version == '3.11.*'" }, - { name = "marlin-kernels", version = "0.3.7", source = { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp312-cp312-linux_x86_64.whl" }, marker = "python_full_version == '3.12.*'" }, - { name = "marlin-kernels", version = "0.3.7", source = { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp39-cp39-linux_x86_64.whl" }, marker = "python_full_version < '3.10'" }, + { name = "marlin-kernels", version = "0.3.7", source = { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp310-cp310-linux_x86_64.whl" }, marker = "python_full_version == '3.10.*'" }, + { name = "marlin-kernels", version = "0.3.7", source = { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp311-cp311-linux_x86_64.whl" }, marker = "python_full_version == '3.11.*'" }, + { name = "marlin-kernels", version = "0.3.7", source = { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp312-cp312-linux_x86_64.whl" }, marker = "python_full_version == '3.12.*'" }, + { name = "marlin-kernels", version = "0.3.7", source = { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp39-cp39-linux_x86_64.whl" }, marker = "python_full_version < '3.10'" }, ] moe = [ { name = "moe-kernels", version = "0.7.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.13'" }, - { name = "moe-kernels", version = "0.7.0", source = { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp310-cp310-linux_x86_64.whl" }, marker = "python_full_version == '3.10.*'" }, - { name = "moe-kernels", version = "0.7.0", source = { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp311-cp311-linux_x86_64.whl" }, marker = "python_full_version == '3.11.*'" }, - { name = "moe-kernels", version = "0.7.0", source = { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp312-cp312-linux_x86_64.whl" }, marker = "python_full_version == '3.12.*'" }, - { name = "moe-kernels", version = "0.7.0", source = { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp39-cp39-linux_x86_64.whl" }, marker = "python_full_version < '3.10'" }, + { name = "moe-kernels", version = "0.7.0", source = { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.5-cp310-cp310-linux_x86_64.whl" }, marker = "python_full_version == '3.10.*'" }, + { name = "moe-kernels", version = "0.7.0", source = { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.5-cp311-cp311-linux_x86_64.whl" }, marker = "python_full_version == '3.11.*'" }, + { name = "moe-kernels", version = "0.7.0", source = { url = 
"https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.5-cp312-cp312-linux_x86_64.whl" }, marker = "python_full_version == '3.12.*'" }, + { name = "moe-kernels", version = "0.7.0", source = { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.5-cp39-cp39-linux_x86_64.whl" }, marker = "python_full_version < '3.10'" }, ] outlines = [ { name = "outlines" }, @@ -2878,51 +2818,47 @@ quantize = [ [package.metadata] requires-dist = [ { name = "accelerate", marker = "extra == 'accelerate'", specifier = ">=1.2.1,<2" }, - { name = "attention-kernels", marker = "python_full_version == '3.9.*' and extra == 'attention'", url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp39-cp39-linux_x86_64.whl" }, - { name = "attention-kernels", marker = "(python_full_version < '3.9' and extra == 'attention') or (python_full_version >= '3.13' and extra == 'attention')" }, - { name = "attention-kernels", marker = "python_full_version == '3.10.*' and extra == 'attention'", url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp310-cp310-linux_x86_64.whl" }, - { name = "attention-kernels", marker = "python_full_version == '3.11.*' and extra == 'attention'", url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp311-cp311-linux_x86_64.whl" }, - { name = "attention-kernels", marker = "python_full_version == '3.12.*' and extra == 'attention'", url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp312-cp312-linux_x86_64.whl" }, + { name = "attention-kernels", marker = "extra == 'attention'", url = "https://github.com/danieldk/attention-kernels/releases/download/v0.2.0.post2/attention_kernels-0.2.0.post2+cu123torch2.5-cp39-abi3-linux_x86_64.whl" }, { name = "bitsandbytes", marker = "extra == 'bnb'", specifier = ">=0.45.0" }, { name = "compressed-tensors", marker = "extra == 'compressed-tensors'", specifier = ">=0.9.0" }, { name = "datasets", marker = "extra == 'quantize'", specifier = ">=2.21,<3" }, { name = "einops", specifier = ">=0.8.0" }, { name = "grpc-interceptor", specifier = ">=0.15.4" }, - { name = "grpcio", specifier = ">=1.69.0" }, - { name = "grpcio-reflection", specifier = ">=1.69.0" }, - { name = "grpcio-status", specifier = ">=1.69.0" }, + { name = "grpcio", specifier = ">=1.67.0" }, + { name = "grpcio-reflection", specifier = ">=1.67.0" }, + { name = "grpcio-status", specifier = ">=1.67.0" }, { name = "grpcio-tools", marker = "extra == 'dev'", specifier = ">=1.51.1,<2.0" }, { name = "grpcio-tools", marker = "extra == 'gen'", specifier = ">=1.69.0" }, - { name = "hf-transfer", specifier = ">=0.1.9" }, + { name = "hf-transfer", specifier = ">=0.1.8" }, { name = "loguru", specifier = ">=0.7.3" }, - { name = "marlin-kernels", marker = "python_full_version == '3.9.*' and extra == 'marlin'", url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp39-cp39-linux_x86_64.whl" }, + { name = "marlin-kernels", marker = "python_full_version == '3.9.*' and extra == 'marlin'", url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp39-cp39-linux_x86_64.whl" }, { name = "marlin-kernels", marker = "(python_full_version < '3.9' and extra == 'marlin') or (python_full_version 
>= '3.13' and extra == 'marlin')" }, - { name = "marlin-kernels", marker = "python_full_version == '3.10.*' and extra == 'marlin'", url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp310-cp310-linux_x86_64.whl" }, - { name = "marlin-kernels", marker = "python_full_version == '3.11.*' and extra == 'marlin'", url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp311-cp311-linux_x86_64.whl" }, - { name = "marlin-kernels", marker = "python_full_version == '3.12.*' and extra == 'marlin'", url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.4-cp312-cp312-linux_x86_64.whl" }, - { name = "moe-kernels", marker = "python_full_version == '3.9.*' and extra == 'moe'", url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp39-cp39-linux_x86_64.whl" }, + { name = "marlin-kernels", marker = "python_full_version == '3.10.*' and extra == 'marlin'", url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp310-cp310-linux_x86_64.whl" }, + { name = "marlin-kernels", marker = "python_full_version == '3.11.*' and extra == 'marlin'", url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp311-cp311-linux_x86_64.whl" }, + { name = "marlin-kernels", marker = "python_full_version == '3.12.*' and extra == 'marlin'", url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp312-cp312-linux_x86_64.whl" }, + { name = "moe-kernels", marker = "python_full_version == '3.9.*' and extra == 'moe'", url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.5-cp39-cp39-linux_x86_64.whl" }, { name = "moe-kernels", marker = "(python_full_version < '3.9' and extra == 'moe') or (python_full_version >= '3.13' and extra == 'moe')" }, - { name = "moe-kernels", marker = "python_full_version == '3.10.*' and extra == 'moe'", url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp310-cp310-linux_x86_64.whl" }, - { name = "moe-kernels", marker = "python_full_version == '3.11.*' and extra == 'moe'", url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp311-cp311-linux_x86_64.whl" }, - { name = "moe-kernels", marker = "python_full_version == '3.12.*' and extra == 'moe'", url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.4-cp312-cp312-linux_x86_64.whl" }, + { name = "moe-kernels", marker = "python_full_version == '3.10.*' and extra == 'moe'", url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.5-cp310-cp310-linux_x86_64.whl" }, + { name = "moe-kernels", marker = "python_full_version == '3.11.*' and extra == 'moe'", url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.5-cp311-cp311-linux_x86_64.whl" }, + { name = "moe-kernels", marker = "python_full_version == '3.12.*' and extra == 'moe'", url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.5-cp312-cp312-linux_x86_64.whl" }, { name = "mypy-protobuf", marker = "extra == 'gen'", specifier = ">=3.6.0" }, - { name = "numpy", specifier = ">=2.0.2" }, - { name = 
"opentelemetry-api", specifier = ">=1.29.0" }, - { name = "opentelemetry-exporter-otlp", specifier = ">=1.29.0" }, + { name = "numpy", specifier = ">=1.26,<3" }, + { name = "opentelemetry-api", specifier = ">=1.27.0" }, + { name = "opentelemetry-exporter-otlp", specifier = ">=1.27.0" }, { name = "opentelemetry-instrumentation-grpc", specifier = ">=0.50b0" }, { name = "outlines", marker = "extra == 'outlines'", specifier = ">=0.1.13" }, { name = "peft", marker = "extra == 'peft'", specifier = ">=0.14.0" }, { name = "pillow", specifier = ">=11.1.0" }, - { name = "prometheus-client", specifier = ">=0.21.1" }, - { name = "protobuf", specifier = ">=5.29.3" }, + { name = "prometheus-client", specifier = ">=0.21.0" }, + { name = "protobuf", specifier = ">=5.28.3" }, { name = "py-cpuinfo", specifier = ">=9.0.0" }, { name = "pytest", marker = "extra == 'dev'", specifier = ">=7.3.0,<8" }, - { name = "rich", specifier = ">=13.9.4" }, - { name = "safetensors", specifier = ">=0.5.2" }, + { name = "rich", specifier = ">=13.8.1" }, + { name = "safetensors", specifier = ">=0.4.5" }, { name = "scipy", specifier = ">=1.13.1" }, { name = "sentencepiece", specifier = ">=0.2.0" }, { name = "texttable", marker = "extra == 'quantize'", specifier = ">=1.6.7,<2" }, - { name = "tokenizers", specifier = ">=0.21.0" }, + { name = "tokenizers", specifier = ">=0.20.3" }, { name = "typer", specifier = ">=0.15.1" }, ] From c690da5973491ba6c93f9616859d6b8c70d89b2c Mon Sep 17 00:00:00 2001 From: Hugo Larcher Date: Tue, 28 Jan 2025 10:29:18 +0100 Subject: [PATCH 060/183] fix: Telemetry (#2957) * fix: add telemetry regular pings and fix unhandled errors avoid not sending telemetry stop events. * fix: simplify error handling * fix: update ping delay and update doc. * fix: clippy * doc: Rephrase properly. --- docs/source/usage_statistics.md | 2 +- router/src/server.rs | 47 ++++++++++++++++++++++++++------- router/src/usage_stats.rs | 3 ++- 3 files changed, 40 insertions(+), 12 deletions(-) diff --git a/docs/source/usage_statistics.md b/docs/source/usage_statistics.md index d3878b53c..dbf36c779 100644 --- a/docs/source/usage_statistics.md +++ b/docs/source/usage_statistics.md @@ -3,7 +3,7 @@ Text Generation Inference collects anonymous usage statistics to help us improve the service. The collected data is used to improve TGI and to understand what causes failures. The data is collected transparently and any sensitive information is omitted. -Data is sent twice, once on server startup and once when server stops. Also, usage statistics are only enabled when TGI is running in docker to avoid collecting data then TGI runs directly on the host machine. +Usage statistics are collected only when TGI is running in a Docker container. This prevents data collection when TGI is run directly on the host machine. The collected data includes startup and shutdown events, as well as a heartbeat signal sent every 15 minutes. 
## What data is collected diff --git a/router/src/server.rs b/router/src/server.rs index aef0f8120..2781f9fbf 100644 --- a/router/src/server.rs +++ b/router/src/server.rs @@ -54,6 +54,9 @@ use std::fs::File; use std::io::BufReader; use std::net::{IpAddr, Ipv4Addr, SocketAddr}; use std::path::{Path, PathBuf}; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::Arc; +use std::time::Duration; use thiserror::Error; use tokio::select; use tokio::signal; @@ -1819,9 +1822,9 @@ pub async fn run( HubTokenizerConfig::default() }); - let tokenizer: Tokenizer = { + let tokenizer: Result = { use pyo3::prelude::*; - pyo3::Python::with_gil(|py| -> PyResult<()> { + Python::with_gil(|py| -> PyResult<()> { py_resolve_tokenizer(py, &tokenizer_name, revision.as_deref(), trust_remote_code)?; Ok(()) }) @@ -1832,16 +1835,16 @@ pub async fn run( let out = legacy_tokenizer_handle(config_filename.as_ref()); out.ok_or(err) }) - .expect("We cannot load a tokenizer"); + .map_err(|_| WebServerError::Tokenizer("Unable to load tokenizer.".to_string()))?; let filename = "out/tokenizer.json"; if let Ok(tok) = tokenizers::Tokenizer::from_file(filename) { - Tokenizer::Rust(tok) + Ok(Tokenizer::Rust(tok)) } else { - Tokenizer::Python { + Ok(Tokenizer::Python { tokenizer_name: tokenizer_name.clone(), revision: revision.clone(), trust_remote_code, - } + }) } }; @@ -1901,11 +1904,27 @@ pub async fn run( _ => None, }; - if let Some(ref ua) = user_agent { + let stop_usage_thread = Arc::new(AtomicBool::new(false)); + let stop_usage_thread_clone = stop_usage_thread.clone(); + if let Some(ua) = user_agent.clone() { let start_event = usage_stats::UsageStatsEvent::new(ua.clone(), usage_stats::EventType::Start, None); tokio::spawn(async move { + // send start event start_event.send().await; + let mut last_report = Instant::now(); + while !stop_usage_thread_clone.load(Ordering::Relaxed) { + if last_report.elapsed() > Duration::from_secs(900) { + let report_event = usage_stats::UsageStatsEvent::new( + ua.clone(), + usage_stats::EventType::Ping, + None, + ); + report_event.send().await; + last_report = Instant::now(); + } + tokio::time::sleep(Duration::from_secs(1)).await; + } }); }; let compat_return_full_text = match &model_info.pipeline_tag { @@ -1926,7 +1945,7 @@ pub async fn run( validation_workers, api_key, config, - (tokenizer, tokenizer_config), + (tokenizer?, tokenizer_config), (preprocessor_config, processor_config), hostname, port, @@ -1943,6 +1962,7 @@ pub async fn run( .await; if let Some(ua) = user_agent { + stop_usage_thread.store(true, Ordering::Relaxed); match result { Ok(_) => { let stop_event = usage_stats::UsageStatsEvent::new( @@ -2419,8 +2439,13 @@ async fn start( } } else { // Run server - - let listener = tokio::net::TcpListener::bind(&addr).await.unwrap(); + let listener = match tokio::net::TcpListener::bind(&addr).await { + Ok(listener) => listener, + Err(e) => { + tracing::error!("Failed to bind to {addr}: {e}"); + return Err(WebServerError::Axum(Box::new(e))); + } + }; axum::serve(listener, app) .with_graceful_shutdown(shutdown_signal()) .await @@ -2535,4 +2560,6 @@ impl From for Event { pub enum WebServerError { #[error("Axum error: {0}")] Axum(#[from] axum::BoxError), + #[error("Tokenizer error: {0}")] + Tokenizer(String), } diff --git a/router/src/usage_stats.rs b/router/src/usage_stats.rs index e9d98327e..4139c4c58 100644 --- a/router/src/usage_stats.rs +++ b/router/src/usage_stats.rs @@ -43,6 +43,7 @@ pub enum EventType { Start, Stop, Error, + Ping, } #[derive(Debug, Serialize)] @@ -70,7 +71,7 
@@ impl UsageStatsEvent { .post(TELEMETRY_URL) .headers(headers) .body(body) - .timeout(Duration::from_secs(5)) + .timeout(Duration::from_secs(10)) .send() .await; } From eb3df0f46fd1d0e42188fd14124f255a2a7df199 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Tue, 28 Jan 2025 10:30:28 +0100 Subject: [PATCH 061/183] Fixing the oom maybe with 2.5.1 change. (#2958) --- server/text_generation_server/models/globals.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/text_generation_server/models/globals.py b/server/text_generation_server/models/globals.py index 889de028d..196963727 100644 --- a/server/text_generation_server/models/globals.py +++ b/server/text_generation_server/models/globals.py @@ -28,7 +28,7 @@ if PREFIX_CACHING and ATTENTION not in { raise RuntimeError("Prefix caching is only supported with flashinfer") MEM_POOL = torch.cuda.graph_pool_handle() if torch.cuda.is_available() else None -TGI_WIGGLE_ROOM = float(os.getenv("TGI_WIGGLE_ROOM", "0.95")) +TGI_WIGGLE_ROOM = float(os.getenv("TGI_WIGGLE_ROOM", "0.93")) assert TGI_WIGGLE_ROOM > 0 assert TGI_WIGGLE_ROOM < 1 From 73b7cf83f635287970d28658d9fb9f8a33d49d44 Mon Sep 17 00:00:00 2001 From: Hugo Larcher Date: Tue, 28 Jan 2025 16:53:16 +0100 Subject: [PATCH 062/183] Add backend name to telemetry (#2962) * feat: Add backend name to telemetry --- backends/trtllm/src/looper.rs | 4 ++++ backends/v2/src/backend.rs | 4 ++++ backends/v3/src/backend.rs | 4 ++++ router/src/infer/mod.rs | 2 ++ router/src/server.rs | 1 + router/src/usage_stats.rs | 3 +++ 6 files changed, 18 insertions(+) diff --git a/backends/trtllm/src/looper.rs b/backends/trtllm/src/looper.rs index b148ec301..5fed954ff 100644 --- a/backends/trtllm/src/looper.rs +++ b/backends/trtllm/src/looper.rs @@ -340,4 +340,8 @@ impl Backend for TensorRtLlmBackendV2 { async fn health(&self, _: bool) -> bool { true } + + fn name(&self) -> &'static str { + "TensorRT-LLM" + } } diff --git a/backends/v2/src/backend.rs b/backends/v2/src/backend.rs index cfe87f98f..adca3d5d2 100644 --- a/backends/v2/src/backend.rs +++ b/backends/v2/src/backend.rs @@ -108,6 +108,10 @@ impl Backend for BackendV2 { fn start_health(&self) -> bool { true } + + fn name(&self) -> &'static str { + "tgi-v2" + } } /// Batching logic diff --git a/backends/v3/src/backend.rs b/backends/v3/src/backend.rs index 736301b33..98e8d76f0 100644 --- a/backends/v3/src/backend.rs +++ b/backends/v3/src/backend.rs @@ -115,6 +115,10 @@ impl Backend for BackendV3 { fn start_health(&self) -> bool { true } + + fn name(&self) -> &'static str { + "tgi-v3" + } } /// Batching logic diff --git a/router/src/infer/mod.rs b/router/src/infer/mod.rs index 6497d8578..7eb8a41be 100644 --- a/router/src/infer/mod.rs +++ b/router/src/infer/mod.rs @@ -40,6 +40,8 @@ pub trait Backend { fn start_health(&self) -> bool { false } + + fn name(&self) -> &'static str; } /// Inference struct diff --git a/router/src/server.rs b/router/src/server.rs index 2781f9fbf..9e57af275 100644 --- a/router/src/server.rs +++ b/router/src/server.rs @@ -1898,6 +1898,7 @@ pub async fn run( disable_grammar_support, max_client_batch_size, usage_stats_level, + backend.name(), ); Some(usage_stats::UserAgent::new(reduced_args)) } diff --git a/router/src/usage_stats.rs b/router/src/usage_stats.rs index 4139c4c58..c3df0c80a 100644 --- a/router/src/usage_stats.rs +++ b/router/src/usage_stats.rs @@ -97,6 +97,7 @@ pub struct Args { disable_grammar_support: bool, max_client_batch_size: usize, usage_stats_level: UsageStatsLevel, + backend_name: &'static str, } impl 
Args { @@ -120,6 +121,7 @@ impl Args { disable_grammar_support: bool, max_client_batch_size: usize, usage_stats_level: UsageStatsLevel, + backend_name: &'static str, ) -> Self { Self { model_config, @@ -140,6 +142,7 @@ impl Args { disable_grammar_support, max_client_batch_size, usage_stats_level, + backend_name, } } } From 4ef2e045c9f8236578c81ee7b5aed1231ec5da9f Mon Sep 17 00:00:00 2001 From: Mohit Sharma Date: Wed, 29 Jan 2025 18:26:32 +0530 Subject: [PATCH 063/183] Add fp8 support moe models (#2928) * Add fp8 support moe models * flatten condition --- .../layers/compressed_tensors/w8an_fp.py | 4 +- server/text_generation_server/layers/fp8.py | 12 +- .../layers/moe/__init__.py | 15 +- .../text_generation_server/layers/moe/fp8.py | 162 ++++++++++++++++++ .../layers/moe/unquantized.py | 12 +- 5 files changed, 181 insertions(+), 24 deletions(-) create mode 100644 server/text_generation_server/layers/moe/fp8.py diff --git a/server/text_generation_server/layers/compressed_tensors/w8an_fp.py b/server/text_generation_server/layers/compressed_tensors/w8an_fp.py index ebcc06d68..42c5e633d 100644 --- a/server/text_generation_server/layers/compressed_tensors/w8an_fp.py +++ b/server/text_generation_server/layers/compressed_tensors/w8an_fp.py @@ -7,7 +7,7 @@ from text_generation_server.layers.fp8 import ( Fp8Weight, _load_scalar_or_matrix_scale, requantize_with_max_scale, - normalize_e4m3fn_to_e4m3fnuz, + normalize_e4m3fn_to_native_float8, ) from text_generation_server.utils.weights import Weights, WeightsLoader from text_generation_server.utils.import_utils import SYSTEM @@ -148,7 +148,7 @@ class W8ANFpLoader(WeightsLoader): ) if self.load_weight_scale and SYSTEM == "rocm": - w, weight_scale, input_scale = normalize_e4m3fn_to_e4m3fnuz( + w, weight_scale, input_scale = normalize_e4m3fn_to_native_float8( w, weight_scale, input_scale ) diff --git a/server/text_generation_server/layers/fp8.py b/server/text_generation_server/layers/fp8.py index 715974ff7..04f8d0c2f 100644 --- a/server/text_generation_server/layers/fp8.py +++ b/server/text_generation_server/layers/fp8.py @@ -68,12 +68,12 @@ def get_fp8_linear(force_w8a16: bool = False) -> Type[torch.nn.Module]: return Fp8Linear -def normalize_e4m3fn_to_e4m3fnuz( +def normalize_e4m3fn_to_native_float8( weight: torch.Tensor, weight_scale: torch.Tensor, input_scale: Optional[torch.Tensor] = None, ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: - if weight.dtype == torch.float8_e4m3fn: + if weight.dtype == torch.float8_e4m3fn and SYSTEM == "rocm": # The bits pattern 10000000(-128) represents zero in e4m3fn # but NaN in e4m3fnuz. So here we set it to 0. 
# https://onnx.ai/onnx/technical/float8.html @@ -172,7 +172,7 @@ def fp8_quantize( qweight = qweight.to(qdtype) if SYSTEM == "rocm": - qweight, scale, _ = normalize_e4m3fn_to_e4m3fnuz(qweight, scale) + qweight, scale, _ = normalize_e4m3fn_to_native_float8(qweight, scale) return qweight, scale @@ -295,7 +295,7 @@ class HybridFP8UnquantLoader(WeightsLoader): ) if SYSTEM == "rocm": - w, scale, input_scale = normalize_e4m3fn_to_e4m3fnuz( + w, scale, input_scale = normalize_e4m3fn_to_native_float8( w, scale, input_scale ) @@ -390,8 +390,8 @@ class Fp8Linear(torch.nn.Module): if CUTLASS_FP8_AVAILABLE: log_once(logger.info, "Using cutlass w8a8 kernels") if SYSTEM == "rocm" and qweight.dtype == torch.float8_e4m3fn: - qweight, scale, input_scale = normalize_e4m3fn_to_e4m3fnuz( - weight=qweight, weight_scale=scale + qweight, scale, input_scale = normalize_e4m3fn_to_native_float8( + weight=qweight, weight_scale=scale, input_scale=input_scale ) self.dtype = dtype diff --git a/server/text_generation_server/layers/moe/__init__.py b/server/text_generation_server/layers/moe/__init__.py index f0bd76aad..3a2744601 100644 --- a/server/text_generation_server/layers/moe/__init__.py +++ b/server/text_generation_server/layers/moe/__init__.py @@ -16,6 +16,7 @@ from text_generation_server.layers.moe.gptq_marlin import ( can_use_marlin_moe_gemm, ) from text_generation_server.layers.moe.unquantized import UnquantizedSparseMoELayer +from text_generation_server.layers.moe.fp8 import FP8SparseMoELayer from text_generation_server.utils.import_utils import SYSTEM from text_generation_server.utils.log import log_once from text_generation_server.utils.weights import ( @@ -203,12 +204,16 @@ class SparseMoELayer(nn.Module): down_proj_name: str = "down_proj", ): super().__init__() - - if ( - isinstance(weights.loader, DefaultWeightsLoader) - and isinstance(weights.loader.weight_class, UnquantizedWeight) - ) or isinstance(weights.loader, HybridFP8UnquantLoader): + if isinstance(weights.loader, DefaultWeightsLoader) and isinstance( + weights.loader.weight_class, UnquantizedWeight + ): cls = UnquantizedSparseMoELayer + elif isinstance(weights.loader, HybridFP8UnquantLoader): + cls = ( + FP8SparseMoELayer + if weights.loader.to_fp8 + else UnquantizedSparseMoELayer + ) elif isinstance( weights.loader, GPTQMarlinWeightsLoader ) and can_use_marlin_moe_gemm( diff --git a/server/text_generation_server/layers/moe/fp8.py b/server/text_generation_server/layers/moe/fp8.py new file mode 100644 index 000000000..7ccddb5b0 --- /dev/null +++ b/server/text_generation_server/layers/moe/fp8.py @@ -0,0 +1,162 @@ +from typing import Optional + +import torch +import torch.nn as nn + +from text_generation_server.utils.weights import Weights +from text_generation_server.layers.fp8 import ( + Fp8Weight, + fp8_quantize, + quant_dtype, + normalize_e4m3fn_to_native_float8, +) +from moe_kernels.fused_moe import fused_moe + + +class FP8SparseMoELayer(nn.Module): + def __init__( + self, + *, + n_expert_group: Optional[int], + n_experts: int, + prefix: str, + renormalize: bool, + topk: int, + topk_group: Optional[int], + weights: Weights, + gate_proj_name: str = "gate_proj", + up_proj_name: str = "up_proj", + down_proj_name: str = "down_proj", + ): + super().__init__() + + assert (n_expert_group is None) == ( + topk_group is None + ), "n_expert_group and topk_group must both be None or have some value" + + self.n_expert_group = n_expert_group + self.topk = topk + self.topk_group = topk_group + self.renormalize = renormalize + + ( + self.gate_up_proj, + 
self.gate_up_proj_weight_scale, + self.gate_up_proj_input_scale, + ) = _load_expert_multi_weights_col( + prefix=prefix, + n_experts=n_experts, + gate_proj_name=gate_proj_name, + up_proj_name=up_proj_name, + weights=weights, + ) + + self.down_proj, self.down_proj_weight_scale, self.down_proj_input_scale = ( + _load_expert_weights_row( + prefix=prefix, + n_experts=n_experts, + name=down_proj_name, + weights=weights, + ) + ) + + def forward(self, x: torch.Tensor, *, gating_output: torch.Tensor) -> torch.Tensor: + return fused_moe( + x, + w1=self.gate_up_proj, + w2=self.down_proj, + gating_output=gating_output, + topk=self.topk, + renormalize=self.renormalize, + inplace=True, + use_grouped_topk=self.n_expert_group is not None, + num_expert_group=self.n_expert_group, + topk_group=self.topk_group, + use_fp8_w8a8=True, + w1_scale=self.gate_up_proj_weight_scale, + w2_scale=self.down_proj_weight_scale, + a1_scale=self.gate_up_proj_input_scale, + a2_scale=self.down_proj_input_scale, + ) + + +def _load_expert_weights( + get_weight_fn, + *, + prefix: str, + n_experts: int, + name: str, + weights: Weights, +) -> torch.Tensor: + all_weight = None + all_weight_scales = None + max_input_scale = None + + for i in range(n_experts): + weight = get_weight_fn(prefix, i, name, weights) + + assert isinstance(weight, Fp8Weight) + + if all_weight is None: + all_weight = torch.empty( + (n_experts,) + weight.weight.shape, + dtype=quant_dtype, + device=weight.weight.device, + ) + if all_weight_scales is None: + all_weight_scales = torch.empty( + (n_experts,), + dtype=torch.float32, + device=weight.weight.device, + ) + + if weight.weight.dtype in {torch.float8_e4m3fn, torch.float8_e4m3fnuz}: + all_weight[i], all_weight_scales[i], current_input_scale = ( + normalize_e4m3fn_to_native_float8( + weight.weight, weight.weight_scale, weight.input_scale + ) + ) + if current_input_scale is not None: + if max_input_scale is None or current_input_scale > max_input_scale: + max_input_scale = current_input_scale + else: + all_weight[i], all_weight_scales[i] = fp8_quantize( + weight.weight, scalar=True + ) + + assert all_weight is not None + + return all_weight, all_weight_scales, max_input_scale + + +def _load_expert_multi_weights_col( + *, + prefix: str, + n_experts: int, + gate_proj_name: str, + up_proj_name: str, + weights: Weights, +) -> torch.Tensor: + def get_weight_fn(prefix, i, name, weights): + return weights.get_multi_weights_col( + [f"{prefix}.{i}.{gate_proj_name}", f"{prefix}.{i}.{up_proj_name}"], 0 + ) + + return _load_expert_weights( + get_weight_fn, prefix=prefix, n_experts=n_experts, name=None, weights=weights + ) + + +def _load_expert_weights_row( + *, + prefix: str, + n_experts: int, + name: str, + weights: Weights, +) -> torch.Tensor: + def get_weight_fn(prefix, i, name, weights): + return weights.get_weights_row(f"{prefix}.{i}.{name}") + + return _load_expert_weights( + get_weight_fn, prefix=prefix, n_experts=n_experts, name=name, weights=weights + ) diff --git a/server/text_generation_server/layers/moe/unquantized.py b/server/text_generation_server/layers/moe/unquantized.py index 3c9bcabac..32326653e 100644 --- a/server/text_generation_server/layers/moe/unquantized.py +++ b/server/text_generation_server/layers/moe/unquantized.py @@ -58,17 +58,7 @@ class UnquantizedSparseMoELayer(nn.Module): ) def forward(self, x: torch.Tensor, *, gating_output: torch.Tensor) -> torch.Tensor: - if SYSTEM == "rocm": - return fused_moe( - x, - self.gate_up_proj, - self.down_proj, - gating_output, - self.topk, - 
renormalize=self.renormalize, - inplace=True, - ) - elif SYSTEM == "ipex": + if SYSTEM == "ipex": return self.ipex_fused_moe( hidden_states=x, router_logits=gating_output, From ee0dffcd14e2569c9b544eb7f2ce66c193f0bfe8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Wed, 29 Jan 2025 18:19:55 +0100 Subject: [PATCH 064/183] Update to moe-kernels 0.8.0 (#2966) --- flake.lock | 8 ++-- flake.nix | 2 +- server/pyproject.toml | 7 +-- server/uv.lock | 104 ++---------------------------------------- 4 files changed, 11 insertions(+), 110 deletions(-) diff --git a/flake.lock b/flake.lock index 6bf4ba2f8..4779427f3 100644 --- a/flake.lock +++ b/flake.lock @@ -978,16 +978,16 @@ "nixpkgs": "nixpkgs_6" }, "locked": { - "lastModified": 1737715219, - "narHash": "sha256-oIxoNreSeSILjWxcZHXW3cdcoNQHnXO5deXoIiC1tng=", + "lastModified": 1738163501, + "narHash": "sha256-MW+HVo3Kjr/W8ra7qyeG2nW/Z6fsZ7nDfWs3Uvw9Xko=", "owner": "huggingface", "repo": "text-generation-inference-nix", - "rev": "b91a56628f446c6cb79d224f17c1c66fe1a260f6", + "rev": "bfdd9594c7d99cf8442e06f3bb2b4ab08185affe", "type": "github" }, "original": { "owner": "huggingface", - "ref": "attention-kernels-0.2.0", + "ref": "moe-kernels-0.8.0", "repo": "text-generation-inference-nix", "type": "github" } diff --git a/flake.nix b/flake.nix index 883bae91f..d8a8a6cfc 100644 --- a/flake.nix +++ b/flake.nix @@ -5,7 +5,7 @@ inputs.nixpkgs.follows = "tgi-nix/nixpkgs"; }; nix-filter.url = "github:numtide/nix-filter"; - tgi-nix.url = "github:huggingface/text-generation-inference-nix/attention-kernels-0.2.0"; + tgi-nix.url = "github:huggingface/text-generation-inference-nix/moe-kernels-0.8.0"; nixpkgs.follows = "tgi-nix/nixpkgs"; flake-utils.url = "github:numtide/flake-utils"; rust-overlay = { diff --git a/server/pyproject.toml b/server/pyproject.toml index dbc1fa7ab..ceedc3bdb 100644 --- a/server/pyproject.toml +++ b/server/pyproject.toml @@ -75,12 +75,7 @@ marlin-kernels = [ { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp311-cp311-linux_x86_64.whl", marker = "python_version == '3.11'" }, { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp312-cp312-linux_x86_64.whl", marker = "python_version == '3.12'" }, ] -moe-kernels = [ - { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.5-cp39-cp39-linux_x86_64.whl", marker = "python_version == '3.9'" }, - { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.5-cp310-cp310-linux_x86_64.whl", marker = "python_version == '3.10'" }, - { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.5-cp311-cp311-linux_x86_64.whl", marker = "python_version == '3.11'" }, - { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.5-cp312-cp312-linux_x86_64.whl", marker = "python_version == '3.12'" }, -] +moe-kernels.url = "https://github.com/danieldk/moe-kernels/releases/download/v0.8.0/moe_kernels-0.8.0+cu123torch2.5-cp39-abi3-linux_x86_64.whl" [tool.pytest.ini_options] markers = ["private: marks tests as requiring an admin hf token (deselect with '-m \"not private\"')"] diff --git a/server/uv.lock b/server/uv.lock index ef26b4638..5684d581e 100644 --- a/server/uv.lock +++ b/server/uv.lock @@ -997,101 +997,15 @@ wheels = [ [[package]] name = "moe-kernels" -version = "0.7.0" -source = { 
registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.13'", -] +version = "0.8.0" +source = { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.8.0/moe_kernels-0.8.0+cu123torch2.5-cp39-abi3-linux_x86_64.whl" } dependencies = [ - { name = "nvidia-ml-py", marker = "python_full_version >= '3.13'" }, - { name = "torch", marker = "python_full_version >= '3.13'" }, - { name = "triton", marker = "python_full_version >= '3.13'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/9c/a1/76f32a7ce5b18e5b841e64e0e2631fa2b94f432743d6ab76b76fb24fe961/moe-kernels-0.7.0.tar.gz", hash = "sha256:1f564affad32077fe17c24566bd6200e38087e90b248636bab55fac2e1ac6874", size = 23629 } - -[[package]] -name = "moe-kernels" -version = "0.7.0" -source = { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.5-cp310-cp310-linux_x86_64.whl" } -resolution-markers = [ - "python_full_version == '3.10.*'", -] -dependencies = [ - { name = "nvidia-ml-py", marker = "python_full_version == '3.10.*'" }, - { name = "torch", marker = "python_full_version == '3.10.*'" }, - { name = "triton", marker = "python_full_version == '3.10.*'" }, -] -wheels = [ - { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.5-cp310-cp310-linux_x86_64.whl", hash = "sha256:242d5de087902aa84dff54b6ba140b4066904fe5e7757f934645343b052ab076" }, -] - -[package.metadata] -requires-dist = [ { name = "nvidia-ml-py" }, { name = "torch" }, { name = "triton" }, ] - -[[package]] -name = "moe-kernels" -version = "0.7.0" -source = { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.5-cp311-cp311-linux_x86_64.whl" } -resolution-markers = [ - "python_full_version == '3.11.*'", -] -dependencies = [ - { name = "nvidia-ml-py", marker = "python_full_version == '3.11.*'" }, - { name = "torch", marker = "python_full_version == '3.11.*'" }, - { name = "triton", marker = "python_full_version == '3.11.*'" }, -] wheels = [ - { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.5-cp311-cp311-linux_x86_64.whl", hash = "sha256:8198a3388a03a3248d5f5698097e8ce0a73b6a01f9854fc2338aacc57e554e8a" }, -] - -[package.metadata] -requires-dist = [ - { name = "nvidia-ml-py" }, - { name = "torch" }, - { name = "triton" }, -] - -[[package]] -name = "moe-kernels" -version = "0.7.0" -source = { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.5-cp312-cp312-linux_x86_64.whl" } -resolution-markers = [ - "python_full_version == '3.12.*'", -] -dependencies = [ - { name = "nvidia-ml-py", marker = "python_full_version == '3.12.*'" }, - { name = "torch", marker = "python_full_version == '3.12.*'" }, - { name = "triton", marker = "python_full_version == '3.12.*'" }, -] -wheels = [ - { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.5-cp312-cp312-linux_x86_64.whl", hash = "sha256:b35fb02ae560b560f4af107791a3308dc97d5ca57d39bab20acec3a0f082ccf2" }, -] - -[package.metadata] -requires-dist = [ - { name = "nvidia-ml-py" }, - { name = "torch" }, - { name = "triton" }, -] - -[[package]] -name = "moe-kernels" -version = "0.7.0" -source = { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.5-cp39-cp39-linux_x86_64.whl" } -resolution-markers = [ - "python_full_version < '3.10'", -] 
-dependencies = [ - { name = "nvidia-ml-py", marker = "python_full_version < '3.10'" }, - { name = "torch", marker = "python_full_version < '3.10'" }, - { name = "triton", marker = "python_full_version < '3.10'" }, -] -wheels = [ - { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.5-cp39-cp39-linux_x86_64.whl", hash = "sha256:cf8d276deb7a4d40fed3eb02e1b6f8d08ccec0f4256260922af13927ca044f56" }, + { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.8.0/moe_kernels-0.8.0+cu123torch2.5-cp39-abi3-linux_x86_64.whl", hash = "sha256:92c4e083c037a325458e731dda6770790495cab273c9bbf5f50fb8e262c099de" }, ] [package.metadata] @@ -2798,11 +2712,7 @@ marlin = [ { name = "marlin-kernels", version = "0.3.7", source = { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp39-cp39-linux_x86_64.whl" }, marker = "python_full_version < '3.10'" }, ] moe = [ - { name = "moe-kernels", version = "0.7.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.13'" }, - { name = "moe-kernels", version = "0.7.0", source = { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.5-cp310-cp310-linux_x86_64.whl" }, marker = "python_full_version == '3.10.*'" }, - { name = "moe-kernels", version = "0.7.0", source = { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.5-cp311-cp311-linux_x86_64.whl" }, marker = "python_full_version == '3.11.*'" }, - { name = "moe-kernels", version = "0.7.0", source = { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.5-cp312-cp312-linux_x86_64.whl" }, marker = "python_full_version == '3.12.*'" }, - { name = "moe-kernels", version = "0.7.0", source = { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.5-cp39-cp39-linux_x86_64.whl" }, marker = "python_full_version < '3.10'" }, + { name = "moe-kernels" }, ] outlines = [ { name = "outlines" }, @@ -2836,11 +2746,7 @@ requires-dist = [ { name = "marlin-kernels", marker = "python_full_version == '3.10.*' and extra == 'marlin'", url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp310-cp310-linux_x86_64.whl" }, { name = "marlin-kernels", marker = "python_full_version == '3.11.*' and extra == 'marlin'", url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp311-cp311-linux_x86_64.whl" }, { name = "marlin-kernels", marker = "python_full_version == '3.12.*' and extra == 'marlin'", url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp312-cp312-linux_x86_64.whl" }, - { name = "moe-kernels", marker = "python_full_version == '3.9.*' and extra == 'moe'", url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.5-cp39-cp39-linux_x86_64.whl" }, - { name = "moe-kernels", marker = "(python_full_version < '3.9' and extra == 'moe') or (python_full_version >= '3.13' and extra == 'moe')" }, - { name = "moe-kernels", marker = "python_full_version == '3.10.*' and extra == 'moe'", url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.5-cp310-cp310-linux_x86_64.whl" }, - { name = "moe-kernels", marker = "python_full_version == '3.11.*' and extra 
== 'moe'", url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.5-cp311-cp311-linux_x86_64.whl" }, - { name = "moe-kernels", marker = "python_full_version == '3.12.*' and extra == 'moe'", url = "https://github.com/danieldk/moe-kernels/releases/download/v0.7.0/moe_kernels-0.7.0+cu123torch2.5-cp312-cp312-linux_x86_64.whl" }, + { name = "moe-kernels", marker = "extra == 'moe'", url = "https://github.com/danieldk/moe-kernels/releases/download/v0.8.0/moe_kernels-0.8.0+cu123torch2.5-cp39-abi3-linux_x86_64.whl" }, { name = "mypy-protobuf", marker = "extra == 'gen'", specifier = ">=3.6.0" }, { name = "numpy", specifier = ">=1.26,<3" }, { name = "opentelemetry-api", specifier = ">=1.27.0" }, From 80e7d98f8870364f60e57b66688581d54ea18c65 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Wed, 29 Jan 2025 22:34:41 +0100 Subject: [PATCH 065/183] Hotfixing intel-cpu (not sure how it was working before). (#2967) * Hotfixing intel-cpu (not sure how it was working before). * Do not fail on missing moe-kernels (Intel-cpu). --- server/text_generation_server/layers/moe/fp8.py | 6 +++++- server/text_generation_server/models/__init__.py | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/server/text_generation_server/layers/moe/fp8.py b/server/text_generation_server/layers/moe/fp8.py index 7ccddb5b0..4d0295f60 100644 --- a/server/text_generation_server/layers/moe/fp8.py +++ b/server/text_generation_server/layers/moe/fp8.py @@ -10,7 +10,11 @@ from text_generation_server.layers.fp8 import ( quant_dtype, normalize_e4m3fn_to_native_float8, ) -from moe_kernels.fused_moe import fused_moe + +try: + from moe_kernels.fused_moe import fused_moe +except Exception: + fused_moe = None class FP8SparseMoELayer(nn.Module): diff --git a/server/text_generation_server/models/__init__.py b/server/text_generation_server/models/__init__.py index eb5a8de7b..2d7352271 100644 --- a/server/text_generation_server/models/__init__.py +++ b/server/text_generation_server/models/__init__.py @@ -180,7 +180,7 @@ except ImportError as e: if MAMBA_AVAILABLE: __all__.append(Mamba) -FLASH_TRANSFORMERS_BACKEND = True +FLASH_TRANSFORMERS_BACKEND = torch.cuda.is_available() try: from text_generation_server.models.transformers_flash_causal_lm import ( TransformersFlashCausalLM, From cb747b33da9511f93fc469c2977cb7967f8ba99d Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Thu, 30 Jan 2025 16:40:25 +0100 Subject: [PATCH 066/183] Add deepseekv3 (#2968) * Add fp8 support moe models add deepseekv3 format codfe' update dockerfile update doc * Small modifications. * Moe kernels 0.8.1 * Upgrade to 0.8.1 * Fixing moe import. * Black. * Apply suggestions from code review Co-authored-by: Mohit Sharma * Fixing Mixtral + Nits. * Put link to ref. * Fix other call locations. * Scoring func `softmax` is the only one that works. 
--------- Co-authored-by: Mohit Sharma --- Dockerfile_amd | 2 +- docs/source/supported_models.md | 1 + flake.lock | 8 +- flake.nix | 2 +- launcher/src/main.rs | 9 +- router/src/config.rs | 2 + server/pyproject.toml | 2 +- server/text_generation_server/layers/fp8.py | 76 +- .../layers/moe/__init__.py | 32 +- .../text_generation_server/layers/moe/fp8.py | 9 +- .../layers/moe/gptq_marlin.py | 4 + .../layers/moe/unquantized.py | 8 +- .../text_generation_server/models/__init__.py | 43 ++ .../flash_deepseek_v3_modeling.py | 676 ++++++++++++++++++ server/text_generation_server/utils/dist.py | 2 + .../utils/quantization.py | 14 +- 16 files changed, 864 insertions(+), 26 deletions(-) create mode 100644 server/text_generation_server/models/custom_modeling/flash_deepseek_v3_modeling.py diff --git a/Dockerfile_amd b/Dockerfile_amd index 293cac2de..8b7808bea 100644 --- a/Dockerfile_amd +++ b/Dockerfile_amd @@ -279,7 +279,7 @@ RUN git clone https://github.com/danieldk/marlin-kernels.git && \ FROM kernel-builder AS moe-kernels WORKDIR /usr/src -ENV MOE_KERNELS_BRANCH=a67b35841774b2056a73806c36661134b5054edd +ENV MOE_KERNELS_BRANCH=d7e042bf9f7aff10c631212fc71b24895d66eb59 ENV VLLM_TARGET_DEVICE=rocm RUN git clone https://github.com/danieldk/moe-kernels.git && \ cd moe-kernels && \ diff --git a/docs/source/supported_models.md b/docs/source/supported_models.md index 5ac903516..d6a8b7aed 100644 --- a/docs/source/supported_models.md +++ b/docs/source/supported_models.md @@ -4,6 +4,7 @@ Text Generation Inference enables serving optimized models. The following sections list which models (VLMs & LLMs) are supported. - [Deepseek V2](https://huggingface.co/deepseek-ai/DeepSeek-V2) +- [Deepseek V3](https://huggingface.co/deepseek-ai/DeepSeek-V3) - [Idefics 2](https://huggingface.co/HuggingFaceM4/idefics2-8b) (Multimodal) - [Idefics 3](https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3) (Multimodal) - [Llava Next (1.6)](https://huggingface.co/llava-hf/llava-v1.6-vicuna-13b-hf) (Multimodal) diff --git a/flake.lock b/flake.lock index 4779427f3..ba3a71733 100644 --- a/flake.lock +++ b/flake.lock @@ -978,16 +978,16 @@ "nixpkgs": "nixpkgs_6" }, "locked": { - "lastModified": 1738163501, - "narHash": "sha256-MW+HVo3Kjr/W8ra7qyeG2nW/Z6fsZ7nDfWs3Uvw9Xko=", + "lastModified": 1738229197, + "narHash": "sha256-K/YJSFhzP0vN23GMfM1HVMtSzaM488hh12ggsMtKMG0=", "owner": "huggingface", "repo": "text-generation-inference-nix", - "rev": "bfdd9594c7d99cf8442e06f3bb2b4ab08185affe", + "rev": "cfcddaf3044f59c3fbd335935ac3c0e9f458d824", "type": "github" }, "original": { "owner": "huggingface", - "ref": "moe-kernels-0.8.0", + "ref": "moe_0_8_1", "repo": "text-generation-inference-nix", "type": "github" } diff --git a/flake.nix b/flake.nix index d8a8a6cfc..6c9f50149 100644 --- a/flake.nix +++ b/flake.nix @@ -5,7 +5,7 @@ inputs.nixpkgs.follows = "tgi-nix/nixpkgs"; }; nix-filter.url = "github:numtide/nix-filter"; - tgi-nix.url = "github:huggingface/text-generation-inference-nix/moe-kernels-0.8.0"; + tgi-nix.url = "github:huggingface/text-generation-inference-nix/moe_0_8_1"; nixpkgs.follows = "tgi-nix/nixpkgs"; flake-utils.url = "github:numtide/flake-utils"; rust-overlay = { diff --git a/launcher/src/main.rs b/launcher/src/main.rs index c6f6b6e96..05ed02025 100644 --- a/launcher/src/main.rs +++ b/launcher/src/main.rs @@ -1635,6 +1635,7 @@ enum Gpu { A40, H100, A100, + H200, Unknown(String), } @@ -1661,6 +1662,7 @@ impl From<&str> for Gpu { "nvidia-a100-sxm4-40gb" => Gpu::A100, "nvidia-a100-80gb-pcie" => Gpu::A100, "nvidia-a100" => Gpu::A100, 
+ "nvidia-h200" => Gpu::H200, card => Gpu::Unknown(card.to_string()), } } @@ -1678,6 +1680,7 @@ impl std::fmt::Display for Gpu { Gpu::A40 => write!(f, "nvidia-a40"), Gpu::H100 => write!(f, "nvidia-h100-80fb-hbm3"), Gpu::A100 => write!(f, "nvida-a100-sxm4-80gb"), + Gpu::H200 => write!(f, "nvida-h200"), Gpu::Unknown(card) => write!(f, "{}", card), } } @@ -1702,11 +1705,13 @@ impl ComputeType { // https://www.nvidia.com/en-us/data-center/a40/ // https://images.nvidia.com/content/Solutions/data-center/a40/nvidia-a40-datasheet.pdf Gpu::A40 => Some(149 * 10u64.pow(12)), + // https://www.nvidia.com/content/dam/en-zz/Solutions/Data-Center/a100/pdf/nvidia-a100-datasheet-us-nvidia-1758950-r4-web.pdf + Gpu::A100 => Some(312 * 10u64.pow(12)), // https://www.nvidia.com/en-us/data-center/h100/ // https://www.techpowerup.com/gpu-specs/docs/nvidia-gh100-architecture.pdf Gpu::H100 => Some(900 * 10u64.pow(12)), - // https://www.nvidia.com/content/dam/en-zz/Solutions/Data-Center/a100/pdf/nvidia-a100-datasheet-us-nvidia-1758950-r4-web.pdf - Gpu::A100 => Some(312 * 10u64.pow(12)), + // https://www.nvidia.com/en-us/data-center/h200/ + Gpu::H200 => Some(989 * 10u64.pow(12)), Gpu::Unknown(card) => { tracing::warn!("Unkown compute for card {card}"); None diff --git a/router/src/config.rs b/router/src/config.rs index 4d5fcfa06..a1ac107a2 100644 --- a/router/src/config.rs +++ b/router/src/config.rs @@ -224,6 +224,8 @@ pub enum Config { Qwen2, Opt, T5, + DeepseekV2, + DeepseekV3, } #[derive(Clone, Debug, Serialize, Deserialize)] diff --git a/server/pyproject.toml b/server/pyproject.toml index ceedc3bdb..8888f5c69 100644 --- a/server/pyproject.toml +++ b/server/pyproject.toml @@ -75,7 +75,7 @@ marlin-kernels = [ { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp311-cp311-linux_x86_64.whl", marker = "python_version == '3.11'" }, { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp312-cp312-linux_x86_64.whl", marker = "python_version == '3.12'" }, ] -moe-kernels.url = "https://github.com/danieldk/moe-kernels/releases/download/v0.8.0/moe_kernels-0.8.0+cu123torch2.5-cp39-abi3-linux_x86_64.whl" +moe-kernels.url = "https://github.com/danieldk/moe-kernels/releases/download/v0.8.1/moe_kernels-0.8.1+cu123torch2.5-cp39-abi3-linux_x86_64.whl" [tool.pytest.ini_options] markers = ["private: marks tests as requiring an admin hf token (deselect with '-m \"not private\"')"] diff --git a/server/text_generation_server/layers/fp8.py b/server/text_generation_server/layers/fp8.py index 04f8d0c2f..ae20235dd 100644 --- a/server/text_generation_server/layers/fp8.py +++ b/server/text_generation_server/layers/fp8.py @@ -19,6 +19,12 @@ try: except ImportError: marlin_kernels = None +try: + from moe_kernels.fp8_utils import w8a8_block_fp8_matmul, per_token_group_quant_fp8 +except ImportError: + w8a8_block_fp8_matmul = None + per_token_group_quant_fp8 = None + quant_dtype: torch.dtype = ( torch.float8_e4m3fnuz if SYSTEM == "rocm" else torch.float8_e4m3fn ) @@ -38,7 +44,6 @@ def get_fp8_linear(force_w8a16: bool = False) -> Type[torch.nn.Module]: """ if SYSTEM == "cuda": - major, _ = torch.cuda.get_device_capability() # Marlin is W8A16, use it when: # @@ -180,14 +185,29 @@ def fp8_quantize( class HybridFP8UnquantLoader(WeightsLoader): """Weight loader that loads FP8 and unquantized Torch tensors.""" - def __init__(self, activation_scale_ub: Optional[float], to_fp8: bool): + def __init__( + self, + activation_scale_ub: 
Optional[float], + to_fp8: bool, + weight_block_size: Optional[List[int]] = None, + ): self.activation_scale_ub = activation_scale_ub self.to_fp8 = to_fp8 + self.weight_block_size = weight_block_size def get_weights(self, weights: "Weights", prefix: str): w = weights.get_tensor(f"{prefix}.weight") if w.dtype == torch.float8_e4m3fn: + if self.weight_block_size is not None: + scale = weights.get_tensor(f"{prefix}.weight_scale_inv") + return Fp8Weight( + weight=w, + weight_scale=scale, + activation_scale_ub=self.activation_scale_ub, + dtype=weights.dtype, + weight_block_size=self.weight_block_size, + ) # FP8 branch scale = weights.get_tensor(f"{prefix}.weight_scale", to_dtype=False) @@ -276,6 +296,21 @@ class HybridFP8UnquantLoader(WeightsLoader): # FP8 branch if w.dtype == torch.float8_e4m3fn: + if self.weight_block_size is not None: + scale = [ + weights.get_sharded(f"{p}.weight_scale_inv", dim=0, to_device=False) + for p in prefixes + ] + scale = torch.cat(scale, dim=dim) + scale = scale.to(weights.device) + return Fp8Weight( + weight=w, + weight_scale=scale, + activation_scale_ub=self.activation_scale_ub, + dtype=weights.dtype, + weight_block_size=self.weight_block_size, + ) + scale = [ _load_scalar_or_matrix_scale(weights, f"{p}.weight_scale", shape) for p, shape in zip(prefixes, shapes) @@ -321,6 +356,18 @@ class HybridFP8UnquantLoader(WeightsLoader): w = weights.get_sharded(f"{prefix}.weight", dim=1) # FP8 branch if w.dtype == torch.float8_e4m3fn: + if self.weight_block_size is not None: + # XXX: Yes the weights is named scale_inv, but corresponds to scale it seems. + scale = weights.get_sharded(f"{prefix}.weight_scale_inv", dim=1) + + return Fp8Weight( + weight=w, + weight_scale=scale, + activation_scale_ub=self.activation_scale_ub, + dtype=weights.dtype, + weight_block_size=self.weight_block_size, + ) + scale = weights.get_tensor(f"{prefix}.weight_scale", to_dtype=False) if SYSTEM == "cuda": @@ -355,6 +402,7 @@ class Fp8Weight(Weight): input_scale: Optional[torch.Tensor] = None activation_scale_ub: Optional[float] = None force_w8a16: bool = False + weight_block_size: Optional[List[int]] = None def get_linear(self, bias: torch.Tensor): if self.weight_scale is None: @@ -371,6 +419,7 @@ class Fp8Weight(Weight): bias=bias, input_scale=self.input_scale, scale_upper_bound=self.activation_scale_ub, + weight_block_size=self.weight_block_size, ) @@ -385,6 +434,7 @@ class Fp8Linear(torch.nn.Module): bias: Optional[torch.Tensor] = None, input_scale: Optional[torch.Tensor] = None, scale_upper_bound: Optional[float] = None, + weight_block_size: Optional[List[int]] = None, ) -> None: super().__init__() if CUTLASS_FP8_AVAILABLE: @@ -398,6 +448,7 @@ class Fp8Linear(torch.nn.Module): self.qweight = qweight self.scale = scale.float() self.input_scale = input_scale.float() if input_scale is not None else None + self.weight_block_size = weight_block_size if CUTLASS_FP8_AVAILABLE and scale_upper_bound is not None: self.scale_upper_bound = torch.tensor( @@ -431,6 +482,7 @@ class Fp8Linear(torch.nn.Module): ) -> "Fp8Linear": input_scale = kwargs.get("input_scale", None) scale_upper_bound = kwargs.get("scale_upper_bound", None) + weight_block_size = kwargs.get("weight_block_size", None) return cls( qweight=weight, @@ -439,6 +491,7 @@ class Fp8Linear(torch.nn.Module): scale_upper_bound=scale_upper_bound, bias=bias, dtype=dtype, + weight_block_size=weight_block_size, ) @classmethod @@ -450,6 +503,25 @@ class Fp8Linear(torch.nn.Module): return cls._device_identity_cache[device] def forward(self, input: 
torch.Tensor) -> torch.Tensor: + if self.weight_block_size is not None: + # https://arxiv.org/pdf/2412.19437 + # At a more granular level. As illustrated in Figure 7 (a), (1) for activations, we group and + # scale elements on a 1x128 tile basis (i.e., per token per 128 channels); and (2) for weights, we + # group and scale elements on a 128x128 block basis (i.e., per 128 input channels per 128 output + # channels). + qinput, scale = per_token_group_quant_fp8(input, self.weight_block_size[1]) + output = w8a8_block_fp8_matmul( + qinput, + self.qweight, + scale, + self.scale, + self.weight_block_size, + output_dtype=input.dtype, + ) + + if self.bias is not None: + output = output + self.bias + return output.to(dtype=input.dtype) if CUTLASS_FP8_AVAILABLE: # cutlass FP8 supports per-token scales, so get non-scalar scales. qinput, scale = fp8_quantize( diff --git a/server/text_generation_server/layers/moe/__init__.py b/server/text_generation_server/layers/moe/__init__.py index 3a2744601..23d0d38c3 100644 --- a/server/text_generation_server/layers/moe/__init__.py +++ b/server/text_generation_server/layers/moe/__init__.py @@ -52,6 +52,8 @@ class MoELayer(Protocol): up_proj_name: str = "up_proj", down_proj_name: str = "down_proj", hidden_act: str = "silu", + scoring_func: Optional[str] = None, + e_score_correction_bias: Optional[float] = None, ): ... def forward( @@ -81,9 +83,14 @@ class DenseMoELayer(nn.Module): up_proj_name: str = "up_proj", down_proj_name: str = "down_proj", hidden_act: str = "silu", + scoring_func: Optional[str] = None, + e_score_correction_bias: Optional[float] = None, ): super().__init__() + assert scoring_func is None, "scoring func is not handled" + assert e_score_correction_bias is None, "scoring correction bias is not handled" + log_once( logger.info, "No fused layers are available for this model type, using (slower) dense MoE layer", @@ -199,21 +206,24 @@ class SparseMoELayer(nn.Module): topk: int, topk_group: Optional[int], weights: Weights, + scoring_func: Optional[str] = "softmax", + e_score_correction_bias: Optional[float] = None, gate_proj_name: str = "gate_proj", up_proj_name: str = "up_proj", down_proj_name: str = "down_proj", ): super().__init__() - if isinstance(weights.loader, DefaultWeightsLoader) and isinstance( - weights.loader.weight_class, UnquantizedWeight - ): - cls = UnquantizedSparseMoELayer - elif isinstance(weights.loader, HybridFP8UnquantLoader): - cls = ( - FP8SparseMoELayer - if weights.loader.to_fp8 - else UnquantizedSparseMoELayer - ) + if ( + isinstance(weights.loader, DefaultWeightsLoader) + and isinstance(weights.loader.weight_class, UnquantizedWeight) + ) or isinstance(weights.loader, HybridFP8UnquantLoader): + if ( + isinstance(weights.loader, HybridFP8UnquantLoader) + and weights.loader.to_fp8 + ): + cls = FP8SparseMoELayer + else: + cls = UnquantizedSparseMoELayer elif isinstance( weights.loader, GPTQMarlinWeightsLoader ) and can_use_marlin_moe_gemm( @@ -240,6 +250,8 @@ class SparseMoELayer(nn.Module): topk=topk, topk_group=topk_group, weights=weights, + scoring_func=scoring_func, + e_score_correction_bias=e_score_correction_bias, gate_proj_name=gate_proj_name, up_proj_name=up_proj_name, down_proj_name=down_proj_name, diff --git a/server/text_generation_server/layers/moe/fp8.py b/server/text_generation_server/layers/moe/fp8.py index 4d0295f60..3016c8a24 100644 --- a/server/text_generation_server/layers/moe/fp8.py +++ b/server/text_generation_server/layers/moe/fp8.py @@ -28,6 +28,8 @@ class FP8SparseMoELayer(nn.Module): topk: int, 
topk_group: Optional[int], weights: Weights, + scoring_func: Optional[str] = "softmax", + e_score_correction_bias: Optional[float] = None, gate_proj_name: str = "gate_proj", up_proj_name: str = "up_proj", down_proj_name: str = "down_proj", @@ -42,6 +44,9 @@ class FP8SparseMoELayer(nn.Module): self.topk = topk self.topk_group = topk_group self.renormalize = renormalize + self.weight_block_size = weights.weights_loader.weight_block_size + self.scoring_func = scoring_func + self.e_score_correction_bias = e_score_correction_bias ( self.gate_up_proj, @@ -76,6 +81,8 @@ class FP8SparseMoELayer(nn.Module): use_grouped_topk=self.n_expert_group is not None, num_expert_group=self.n_expert_group, topk_group=self.topk_group, + scoring_func=self.scoring_func, + e_score_correction_bias=self.e_score_correction_bias, use_fp8_w8a8=True, w1_scale=self.gate_up_proj_weight_scale, w2_scale=self.down_proj_weight_scale, @@ -109,7 +116,7 @@ def _load_expert_weights( ) if all_weight_scales is None: all_weight_scales = torch.empty( - (n_experts,), + (n_experts,) + weight.weight_scale.shape, dtype=torch.float32, device=weight.weight.device, ) diff --git a/server/text_generation_server/layers/moe/gptq_marlin.py b/server/text_generation_server/layers/moe/gptq_marlin.py index 3d4ca9d85..014a90dcf 100644 --- a/server/text_generation_server/layers/moe/gptq_marlin.py +++ b/server/text_generation_server/layers/moe/gptq_marlin.py @@ -69,7 +69,11 @@ class GPTQMarlinSparseMoELayer(nn.Module): gate_proj_name: str = "gate_proj", up_proj_name: str = "up_proj", down_proj_name: str = "down_proj", + scoring_func: Optional[str] = None, + e_score_correction_bias: Optional[float] = None, ): + assert scoring_func == "softmax", f"scoring func {scoring_func} is not handled" + assert e_score_correction_bias is None, "scoring correction bias is not handled" super().__init__() if not ( diff --git a/server/text_generation_server/layers/moe/unquantized.py b/server/text_generation_server/layers/moe/unquantized.py index 32326653e..9277384ae 100644 --- a/server/text_generation_server/layers/moe/unquantized.py +++ b/server/text_generation_server/layers/moe/unquantized.py @@ -23,6 +23,8 @@ class UnquantizedSparseMoELayer(nn.Module): topk: int, topk_group: Optional[int], weights: Weights, + scoring_func: Optional[str] = "softmax", + e_score_correction_bias: Optional[float] = None, gate_proj_name: str = "gate_proj", up_proj_name: str = "up_proj", down_proj_name: str = "down_proj", @@ -37,6 +39,9 @@ class UnquantizedSparseMoELayer(nn.Module): self.topk = topk self.topk_group = topk_group self.renormalize = renormalize + self.weight_block_size = weights.weights_loader.weight_block_size + self.scoring_func = scoring_func + self.e_score_correction_bias = e_score_correction_bias self.gate_up_proj = _load_expert_multi_weights_col( prefix=prefix, @@ -68,7 +73,6 @@ class UnquantizedSparseMoELayer(nn.Module): num_expert_group=self.n_expert_group, topk_group=self.topk_group, ) - return fused_moe( x, w1=self.gate_up_proj, @@ -80,6 +84,8 @@ class UnquantizedSparseMoELayer(nn.Module): use_grouped_topk=self.n_expert_group is not None, num_expert_group=self.n_expert_group, topk_group=self.topk_group, + scoring_func=self.scoring_func, + e_score_correction_bias=self.e_score_correction_bias, ) diff --git a/server/text_generation_server/models/__init__.py b/server/text_generation_server/models/__init__.py index 2d7352271..205030e95 100644 --- a/server/text_generation_server/models/__init__.py +++ b/server/text_generation_server/models/__init__.py @@ -89,6 +89,10 @@ 
try: FlashDeepseekV2ForCausalLM, DeepseekV2Config, ) + from text_generation_server.models.custom_modeling.flash_deepseek_v3_modeling import ( + FlashDeepseekV3ForCausalLM, + DeepseekV3Config, + ) from text_generation_server.models.custom_modeling.flash_llama_modeling import ( FlashLlamaForCausalLM, ) @@ -195,6 +199,11 @@ class ModelType(enum.Enum): "name": "Deepseek V2", "url": "https://huggingface.co/deepseek-ai/DeepSeek-V2", } + DEEPSEEK_V3 = { + "type": "deepseek_v3", + "name": "Deepseek V3", + "url": "https://huggingface.co/deepseek-ai/DeepSeek-V3", + } IDEFICS2 = { "type": "idefics2", "name": "Idefics 2", @@ -642,6 +651,40 @@ def get_model( dtype=dtype, trust_remote_code=trust_remote_code, ) + elif model_type == DEEPSEEK_V3: + if FLASH_ATTENTION: + head_size = max( + config_dict.get("qk_nope_dim", 128) + + config_dict.get("qk_rope_dim", 64), + config_dict.get("v_head_dim", 128), + ) + return FlashCausalLM( + model_id=model_id, + model_class=FlashDeepseekV3ForCausalLM, + revision=revision, + quantize=quantize, + speculator=speculator, + default_dtype=torch.bfloat16, + dtype=dtype, + kv_cache_dtype=kv_cache_dtype, + trust_remote_code=trust_remote_code, + lora_adapter_ids=lora_adapter_ids, + config_class=DeepseekV3Config, + head_size=head_size, + ) + elif sharded: + raise NotImplementedError( + FLASH_ATT_ERROR_MESSAGE.format("Sharded Deepseek V3") + ) + else: + return CausalLM.fallback( + model_id, + revision, + quantize=quantize, + speculator=speculator, + dtype=dtype, + trust_remote_code=trust_remote_code, + ) elif model_type == MAMBA: return Mamba( model_id, diff --git a/server/text_generation_server/models/custom_modeling/flash_deepseek_v3_modeling.py b/server/text_generation_server/models/custom_modeling/flash_deepseek_v3_modeling.py new file mode 100644 index 000000000..25fb3cc21 --- /dev/null +++ b/server/text_generation_server/models/custom_modeling/flash_deepseek_v3_modeling.py @@ -0,0 +1,676 @@ +# coding=utf-8 +# Copyright 2023, 2024 DeepSeek-AI and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import List, Optional, Tuple, Type + +import torch +import torch.distributed +from torch import nn +from transformers.activations import ACT2FN +from transformers.configuration_utils import PretrainedConfig + +from text_generation_server.layers import ( + FastLinear, + SpeculativeHead, + TensorParallelColumnLinear, + TensorParallelEmbedding, + TensorParallelRowLinear, + get_linear, +) +from text_generation_server.layers.attention import ( + Seqlen, + attention, + paged_attention, +) +from text_generation_server.layers.attention.kv_cache import KVCache, get_kv_scales +from text_generation_server.layers.layernorm import FastRMSNorm +from text_generation_server.layers.moe import DenseMoELayer, MoELayer, SparseMoELayer +from text_generation_server.layers.rotary import PositionRotaryEmbedding, get_mscale +from text_generation_server.utils.import_utils import SYSTEM +from text_generation_server.utils.weights import Weights + +if SYSTEM == "rocm": + try: + import vllm._custom_ops as ops + except Exception as e: + raise ImportError(f"Could not load `vllm._custom_ops`. Full error: {e}") + + +class DeepseekV3Config(PretrainedConfig): + def __init__( + self, + vocab_size=102400, + hidden_size=4096, + intermediate_size=11008, + moe_intermediate_size=1407, + num_hidden_layers=30, + num_attention_heads=32, + num_key_value_heads=32, + n_shared_experts=2, + n_routed_experts=160, + ep_size=1, + routed_scaling_factor=1.0, + kv_lora_rank=512, + q_lora_rank=1536, + qk_rope_head_dim=64, + v_head_dim=128, + qk_nope_head_dim=128, + topk_method="gready", + n_group=8, + topk_group=3, + num_experts_per_tok=6, + moe_layer_freq=1, + first_k_dense_replace=0, + norm_topk_prob=False, + scoring_func="softmax", + aux_loss_alpha=0.001, + seq_aux=True, + hidden_act="silu", + max_position_embeddings=2048, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + pad_token_id=None, + bos_token_id=100000, + eos_token_id=100001, + pretraining_tp=1, + tie_word_embeddings=False, + rope_theta=10000.0, + rope_scaling=None, + attention_bias=False, + attention_dropout=0.0, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.moe_intermediate_size = moe_intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.n_shared_experts = n_shared_experts + self.n_routed_experts = n_routed_experts + self.ep_size = ep_size + self.routed_scaling_factor = routed_scaling_factor + self.kv_lora_rank = kv_lora_rank + self.q_lora_rank = q_lora_rank + self.qk_rope_head_dim = qk_rope_head_dim + self.v_head_dim = v_head_dim + self.qk_nope_head_dim = qk_nope_head_dim + self.topk_method = topk_method + self.n_group = n_group + self.topk_group = topk_group + self.num_experts_per_tok = num_experts_per_tok + self.moe_layer_freq = moe_layer_freq + self.first_k_dense_replace = first_k_dense_replace + self.norm_topk_prob = norm_topk_prob + self.scoring_func = scoring_func + self.aux_loss_alpha = aux_loss_alpha + self.seq_aux = seq_aux + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.pretraining_tp = pretraining_tp + self.use_cache = use_cache + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + 
self.attention_bias = attention_bias + self.attention_dropout = attention_dropout + + tie_word_embeddings = kwargs.pop("tie_word_embeddings", False) + if tie_word_embeddings: + raise ValueError( + "tie_word_embeddings is not supported for Deepseek V2 models." + ) + + if ep_size != 1: + raise ValueError( + f"Currently only ep_size == 1 is supported for Deepseek V2 models, was {ep_size}" + ) + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + +class DeepseekV3Attention(torch.nn.Module): + def __init__( + self, + prefix: str, + config, + weights: Weights, + ): + super().__init__() + self.num_heads = config.num_attention_heads + self.hidden_size = config.hidden_size + self.kv_lora_rank = config.kv_lora_rank + self.q_lora_rank = config.q_lora_rank + self.qk_nope_head_dim = config.qk_nope_head_dim + self.qk_rope_head_dim = config.qk_rope_head_dim + self.head_size = config.qk_nope_head_dim + config.qk_rope_head_dim + self.value_head_size = config.v_head_dim + self.head_pad_size = max(self.head_size, self.value_head_size) + + self.rotary_emb = PositionRotaryEmbedding.static( + config=config, + dim=self.qk_rope_head_dim, + base=config.rope_theta, + device=weights.device, + ) + + mscale = get_mscale( + self.rotary_emb.scaling_factor, self.rotary_emb.mscale_all_dim + ) + self.softmax_scale = self.head_size**-0.5 * mscale * mscale + + if self.num_heads % weights.process_group.size() != 0: + raise ValueError( + f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} " + f"and `num_shards`: {weights.process_group.size()}" + ) + self.num_heads = self.num_heads // weights.process_group.size() + self.num_key_value_heads = ( + config.num_key_value_heads // weights.process_group.size() + ) + + if self.q_lora_rank is None: + self.q_proj = TensorParallelColumnLinear.load( + config, + prefix=f"{prefix}.q_proj", + weights=weights, + bias=config.attention_bias, + ) + else: + self.q_a_proj = get_linear( + weight=weights.get_weights(f"{prefix}.q_a_proj"), + bias=( + weights.get_tensor(f"{prefix}.q_a_proj.bias") + if config.attention_bias + else None + ), + ) + self.q_a_layernorm = FastRMSNorm.load( + prefix=f"{prefix}.q_a_layernorm", + weights=weights, + eps=config.rms_norm_eps, + ) + self.q_b_proj = TensorParallelColumnLinear.load( + config, + prefix=f"{prefix}.q_b_proj", + weights=weights, + bias=config.attention_bias, + ) + + self.kv_a_proj_with_mqa = get_linear( + weight=weights.get_weights(f"{prefix}.kv_a_proj_with_mqa"), + bias=( + weights.get_tensor(f"{prefix}.kv_a_proj_with_mqa.bias") + if config.attention_bias + else None + ), + ) + + self.kv_scales = get_kv_scales(weights, f"{prefix}") + + self.kv_a_layernorm = FastRMSNorm.load( + prefix=f"{prefix}.kv_a_layernorm", weights=weights, eps=config.rms_norm_eps + ) + + self.kv_b_proj = TensorParallelColumnLinear.load( + config, + prefix=f"{prefix}.kv_b_proj", + weights=weights, + bias=config.attention_bias, + ) + + self.o_proj = TensorParallelRowLinear.load( + config, + prefix=f"{prefix}.o_proj", + weights=weights, + bias=False, + ) + self.num_groups = self.num_heads // self.num_key_value_heads + self.kv_head_mapping = torch.arange( + 0, self.num_key_value_heads, dtype=torch.int32, device=weights.device + ).repeat_interleave(self.num_groups) + + def forward( + self, + hidden_states: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, + cu_seqlen_prefill: torch.Tensor, + kv_cache: KVCache, + block_tables: 
torch.Tensor, + slots: torch.Tensor, + seqlen: Seqlen, + max_s: int, + ): + if self.q_lora_rank is None: + query = self.q_proj(hidden_states) + else: + query = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states))[0]) + query = query.view(-1, self.num_heads, self.head_size) + + _, query_pe = torch.split( + query, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1 + ) + + compressed_kv = self.kv_a_proj_with_mqa(hidden_states) + compressed_kv, key_pe = torch.split( + compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1 + ) + + key_pe = key_pe.view(-1, 1, self.qk_rope_head_dim) + kv = self.kv_b_proj(self.kv_a_layernorm(compressed_kv.contiguous())[0]).view( + -1, self.num_key_value_heads, self.qk_nope_head_dim + self.value_head_size + ) + + key_nope, value = torch.split( + kv, [self.qk_nope_head_dim, self.value_head_size], dim=-1 + ) + + batch_size, heads, head_dim = query_pe.shape + query_pe = ( + query_pe.view(batch_size, heads, head_dim // 2, 2) + .transpose(2, 3) + .reshape(batch_size, heads, head_dim) + ) + batch_size, heads, head_dim = key_pe.shape + key_pe = ( + key_pe.view(batch_size, heads, head_dim // 2, 2) + .transpose(2, 3) + .reshape(batch_size, heads, head_dim) + ) + self.rotary_emb(query_pe, key_pe, cos, sin) + + query[..., self.qk_nope_head_dim :] = query_pe + key = torch.empty_like(query) + key[..., : self.qk_nope_head_dim] = key_nope + key[..., self.qk_nope_head_dim :] = key_pe + + # We need to pad the heads because Flash Attention does not support + # qk and v with different head sizes. + query = torch.nn.functional.pad( + query, (0, self.head_pad_size - self.head_size), value=0 + ) + key = torch.nn.functional.pad( + key, (0, self.head_pad_size - self.head_size), value=0 + ) + value = torch.nn.functional.pad( + value, (0, self.head_pad_size - self.value_head_size), value=0 + ) + + kv_cache.store( + key=key, + value=value, + slots=slots, + kv_scales=self.kv_scales, + ) + + # Prefill + if cu_seqlen_prefill is not None: + # flash attention + attn_output = attention( + query=query, + key=key, + value=value, + kv_cache=kv_cache, + kv_scales=self.kv_scales, + seqlen=seqlen, + block_tables=block_tables, + softmax_scale=self.softmax_scale, + ) + # Decode + else: + attn_output = paged_attention( + query, + kv_cache, + self.kv_head_mapping, + self.softmax_scale, + block_tables, + seqlen, + max_s, + kv_scales=self.kv_scales, + ) + + # Remove padding. + attn_output = attn_output[..., : self.value_head_size] + + return self.o_proj( + attn_output.reshape(-1, self.num_heads * self.value_head_size) + ) + + +class DeepseekV3MLP(nn.Module): + def __init__(self, prefix: str, config, weights, intermediate_size: int): + super().__init__() + self.hidden_act = config.hidden_act + if self.hidden_act != "silu": + # Bail out because MoE only supports silu. + raise NotImplementedError( + "Currently only `silu` is supported as an activation for Deepseek V2." + ) + self.act = ACT2FN[self.hidden_act] + + self.gate_up_proj = TensorParallelColumnLinear.load_multi( + config, + prefixes=[f"{prefix}.gate_proj", f"{prefix}.up_proj"], + weights=weights, + dim=0, + bias=False, + ) + + self.down_proj = TensorParallelRowLinear.load( + config, + prefix=f"{prefix}.down_proj", + weights=weights, + bias=False, + ) + + self.intermediate_size = intermediate_size // weights.process_group.size() + + # TODO: This is a hotfix to be removed & properly refactored. 
+ self.quantize = config.quantize + + def forward(self, hidden_states: torch.Tensor, reduce: bool = True): + if ( + SYSTEM == "rocm" + and self.hidden_act == "silu" + and hidden_states.dtype == torch.float16 + and hidden_states.shape[0] == 1 + and not self.quantize + ): + out = torch.empty( + hidden_states.shape[0], + self.intermediate_size, + dtype=hidden_states.dtype, + device="cuda", + ) + ops.LLMM_Silu(self.gate_up_proj.linear.weight, hidden_states, out, 8) + return self.down_proj(out, reduce=reduce) + else: + gate_up_states = self.gate_up_proj(hidden_states) + gate_up_states = gate_up_states.view(-1, 2, self.intermediate_size) + return self.down_proj( + self.act(gate_up_states[:, 0]) * gate_up_states[:, 1], reduce=reduce + ) + + +class DeepseekV3MoE(nn.Module): + def __init__( + self, + prefix, + config: DeepseekV3Config, + moe_layer_cls: Type[MoELayer], + weights, + ): + super().__init__() + + self.hidden_dim = config.hidden_size + self.moe_intermediate_size = ( + config.moe_intermediate_size // weights.process_group.size() + ) + self.routed_scaling_factor = config.routed_scaling_factor + + # Gating + self.gate = FastLinear.load(config, f"{prefix}.gate", weights, bias=False) + + if config.topk_method == "noaux_tc": + self.gate.e_score_correction_bias = torch.zeros( + config.n_routed_experts, device=weights.device + ) + else: + self.gate.e_score_correction_bias = None + + self.moe_layer = moe_layer_cls( + prefix=f"{prefix}.experts", + n_experts=config.n_routed_experts, + n_expert_group=config.n_group, + renormalize=config.norm_topk_prob, + topk=config.num_experts_per_tok, + topk_group=config.topk_group, + weights=weights, + scoring_func=config.scoring_func, + e_score_correction_bias=self.gate.e_score_correction_bias, + ) + assert isinstance(self.moe_layer, MoELayer) + + if config.n_shared_experts is not None: + self.shared_experts = DeepseekV3MLP( + prefix=f"{prefix}.shared_experts", + config=config, + weights=weights, + intermediate_size=config.moe_intermediate_size + * config.n_shared_experts, + ) + else: + self.shared_experts = None + + self.process_group = weights.process_group + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if self.shared_experts is not None: + shared_output = self.shared_experts(x, reduce=False) + else: + shared_output = None + + router_logits = self.gate(x) + + out = self.moe_layer(x, gating_output=router_logits) + + if shared_output is not None: + out = out + shared_output + + # Reduce sum + if self.process_group.size() > 1: + torch.distributed.all_reduce(out, group=self.process_group) + + return out.view(*x.shape) + + +class DeepseekV3Layer(nn.Module): + def __init__(self, prefix, layer_id, config, weights): + super().__init__() + prefix = f"{prefix}.layers.{layer_id}" + + self.self_attn = DeepseekV3Attention( + prefix=f"{prefix}.self_attn", + config=config, + weights=weights, + ) + + if ( + config.n_routed_experts is not None + and layer_id >= config.first_k_dense_replace + and layer_id % config.moe_layer_freq == 0 + ): + moe_layer_cls = ( + SparseMoELayer + if SparseMoELayer.is_supported(weights) + else DenseMoELayer + ) + self.mlp = DeepseekV3MoE(f"{prefix}.mlp", config, moe_layer_cls, weights) + else: + self.mlp = DeepseekV3MLP( + prefix=f"{prefix}.mlp", + config=config, + weights=weights, + intermediate_size=config.intermediate_size, + ) + + self.input_layernorm = FastRMSNorm.load( + prefix=f"{prefix}.input_layernorm", weights=weights, eps=config.rms_norm_eps + ) + self.post_attention_layernorm = FastRMSNorm.load( + 
prefix=f"{prefix}.post_attention_layernorm", + weights=weights, + eps=config.rms_norm_eps, + ) + + def forward( + self, + hidden_states: torch.Tensor, + residual: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, + cu_seqlen_prefill: torch.Tensor, + kv_cache, + block_tables: torch.Tensor, + slots: torch.Tensor, + seqlen: Seqlen, + max_s: int, + ): + normed_hidden_states, residual = self.input_layernorm(hidden_states, residual) + + # Self Attention + attn_output = self.self_attn( + normed_hidden_states, + cos, + sin, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + seqlen, + max_s, + ) + + # faster post attention rms norm + normed_attn_res_output, residual = self.post_attention_layernorm( + attn_output, residual + ) + + output = self.mlp(normed_attn_res_output) + + return output, residual + + +class DeepseekV3Model(torch.nn.Module): + def __init__(self, prefix: str, config, weights: Weights): + super().__init__() + + self.embed_tokens = TensorParallelEmbedding( + prefix=f"{prefix}.embed_tokens", weights=weights + ) + + self.layers = nn.ModuleList( + [ + DeepseekV3Layer( + prefix, + layer_id, + config, + weights, + ) + for layer_id in range(config.num_hidden_layers) + ] + ) + self.norm = FastRMSNorm.load( + prefix=f"{prefix}.norm", weights=weights, eps=config.rms_norm_eps + ) + + self.head_size = self.layers[0].self_attn.head_size + self.num_heads = self.layers[0].self_attn.num_heads + self.num_key_value_heads = self.layers[0].self_attn.num_key_value_heads + + def forward( + self, + input_ids: torch.Tensor, + position_ids: torch.Tensor, + cu_seqlen_prefill: Optional[torch.Tensor], + kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], + block_tables: torch.Tensor, + slots: torch.Tensor, + seqlen: Seqlen, + max_s: int, + ) -> torch.Tensor: + hidden_states = self.embed_tokens(input_ids) + + # Get rotary cos and sin for this forward + # Avoid to index in each layer + cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin( + position_ids, max_s, hidden_states.dtype + ) + + residual = None + for i, layer in enumerate(self.layers): + hidden_states, residual = layer( + hidden_states, + residual, + cos, + sin, + cu_seqlen_prefill, + kv_cache[i], + block_tables, + slots, + seqlen, + max_s, + ) + + hidden_states, _ = self.norm(hidden_states, residual) + + return hidden_states + + +class FlashDeepseekV3ForCausalLM(torch.nn.Module): + def __init__(self, prefix: str, config, weights: Weights): + super().__init__() + + self.model = DeepseekV3Model( + "model" if not prefix else f"{prefix}.model", config, weights + ) + self.lm_head = SpeculativeHead.load( + config, + prefix="lm_head" if not prefix else f"{prefix}.lm_head", + weights=weights, + ) + + def forward( + self, + input_ids: torch.Tensor, + position_ids: torch.Tensor, + cu_seqlen_prefill: Optional[torch.Tensor], + kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], + block_tables: torch.Tensor, + slots: torch.Tensor, + seqlen: Seqlen, + max_s: int, + prefill_cache_indices: Optional[torch.Tensor], + lm_head_indices: Optional[torch.Tensor] = None, + adapter_data: Optional[torch.Tensor] = None, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + hidden_states = self.model( + input_ids, + position_ids, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + seqlen, + max_s, + ) + if lm_head_indices is not None: + hidden_states = hidden_states[lm_head_indices] + logits, speculative_logits = self.lm_head(hidden_states) + return logits, speculative_logits diff --git a/server/text_generation_server/utils/dist.py 
b/server/text_generation_server/utils/dist.py index 1b766ddf7..613c4784b 100644 --- a/server/text_generation_server/utils/dist.py +++ b/server/text_generation_server/utils/dist.py @@ -81,12 +81,14 @@ def initialize_torch_distributed(): pg_options=options, ) else: + device = torch.device(f"cuda:{RANK}") torch.distributed.init_process_group( backend=backend, world_size=WORLD_SIZE, rank=RANK, timeout=timedelta(seconds=120), pg_options=options, + device_id=device, ) else: logger.warning("torch.distributed is already initialized.") diff --git a/server/text_generation_server/utils/quantization.py b/server/text_generation_server/utils/quantization.py index 0d8949392..e44cf64fe 100644 --- a/server/text_generation_server/utils/quantization.py +++ b/server/text_generation_server/utils/quantization.py @@ -1,7 +1,7 @@ import json import os from dataclasses import dataclass -from typing import Optional +from typing import Optional, List from huggingface_hub import hf_hub_download from text_generation_server.layers.marlin.gptq import can_use_gptq_marlin @@ -20,6 +20,7 @@ class _QuantizerConfig: groupsize: int quant_method: str sym: bool + weight_block_size: Optional[List[int]] @dataclass @@ -49,16 +50,17 @@ def _get_quantizer_config(model_id, revision): checkpoint_format = None sym = False desc_act = False + weight_block_size = None filename = "config.json" try: data = _get_config_json(model_id, revision, filename) - # FP8 config if data["quantization_config"]["quant_method"] == "fbgemm_fp8": return _FP8QuantizerConfig( activation_scale_ub=data["quantization_config"]["activation_scale_ub"] ) + weight_block_size = data["quantization_config"].get("weight_block_size", None) if "zero_point" in data["quantization_config"]: sym = not data["quantization_config"]["zero_point"] @@ -107,6 +109,7 @@ def _get_quantizer_config(model_id, revision): checkpoint_format=checkpoint_format, sym=sym, desc_act=desc_act, + weight_block_size=weight_block_size, ) @@ -196,9 +199,14 @@ def get_loader( # Since the default for the quantize config is _QuantizerConfig, # we need to add this check to not get an attribute error activation_scale_ub = None + weight_block_size = quantizer_config.weight_block_size if isinstance(quantizer_config, _FP8QuantizerConfig): activation_scale_ub = quantizer_config.activation_scale_ub - return HybridFP8UnquantLoader(activation_scale_ub, to_fp8=quantize == "fp8") + return HybridFP8UnquantLoader( + activation_scale_ub, + to_fp8=quantize == "fp8", + weight_block_size=weight_block_size, + ) else: raise ValueError(f"Unknown quantization method: {quantize}") From 065aabb13da8cce02e4096618518a181eaeb5943 Mon Sep 17 00:00:00 2001 From: Hugo Larcher Date: Thu, 30 Jan 2025 18:04:42 +0100 Subject: [PATCH 067/183] doc: Update TRTLLM deployment doc. (#2960) * doc: Update TRTLLM deployment doc. Update TRTLLM CI to allow release builds when tagging TGI. * doc: Update TRTLLM deployment doc. Update TRTLLM CI to allow release builds when tagging TGI. 
* fix: PR comments --- .github/workflows/build.yaml | 18 ++-- Dockerfile_trtllm | 19 ++-- docs/source/backends/trtllm.md | 165 ++++++++++++++++++++++++++------- 3 files changed, 155 insertions(+), 47 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 73a55efe8..2db466a84 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -64,7 +64,7 @@ jobs: export runs_on="aws-g6-12xl-plus-priv-cache" export platform="" export extra_pytest="" - export target="nil" + export target="" ;; cuda-trtllm) export dockerfile="Dockerfile_trtllm" @@ -74,7 +74,13 @@ jobs: export runs_on="ubuntu-latest" export platform="" export extra_pytest="" - export build_type="dev" + if [[ "${GITHUB_REF}" == "refs/tags/*" ]]; then + export build_type="release"; + export target=""; + else + export build_type="dev"; + export target="ci-runtime"; + fi ;; rocm) export dockerfile="Dockerfile_amd" @@ -85,7 +91,7 @@ jobs: export runs_on="ubuntu-latest" export platform="" export extra_pytest="-k test_flash_gemma_gptq_load" - export target="nil" + export target="" ;; intel-xpu) export dockerfile="Dockerfile_intel" @@ -95,7 +101,7 @@ jobs: export runs_on="ubuntu-latest" export platform="xpu" export extra_pytest="" - export target="nil" + export target="" ;; intel-cpu) export dockerfile="Dockerfile_intel" @@ -106,7 +112,7 @@ jobs: export runs_on="aws-highmemory-32-plus-priv" export platform="cpu" export extra_pytest="-k test_flash_gemma_simple" - export target="nil" + export target="" ;; esac echo $dockerfile @@ -193,7 +199,7 @@ jobs: sccache_gha_enabled=on actions_cache_url=${{ env.ACTIONS_CACHE_URL }} actions_runtime_token=${{ env.ACTIONS_RUNTIME_TOKEN }} - + target: ${{ env.TARGET }} tags: ${{ steps.meta.outputs.tags || steps.meta-pr.outputs.tags }} labels: ${{ steps.meta.outputs.labels || steps.meta-pr.outputs.labels }} cache-from: type=s3,region=us-east-1,bucket=ci-docker-buildx-cache,name=text-generation-inference-cache${{ env.LABEL }},mode=min,access_key_id=${{ secrets.S3_CI_DOCKER_BUILDX_CACHE_ACCESS_KEY_ID }},secret_access_key=${{ secrets.S3_CI_DOCKER_BUILDX_CACHE_SECRET_ACCESS_KEY }},mode=max diff --git a/Dockerfile_trtllm b/Dockerfile_trtllm index 6538996a5..999d63d74 100644 --- a/Dockerfile_trtllm +++ b/Dockerfile_trtllm @@ -123,15 +123,6 @@ COPY --from=trt-builder /usr/local/tensorrt /usr/local/tensorrt COPY --from=tgi-builder /usr/local/tgi /usr/local/tgi COPY --from=tgi-builder /usr/src/text-generation-inference/target/release/text-generation-backends-trtllm /usr/local/tgi/bin/text-generation-launcher -FROM runtime - -LABEL co.huggingface.vendor="Hugging Face Inc." -LABEL org.opencontainers.image.authors="hardware@hf.co" -LABEL org.opencontainers.title="Text-Generation-Inference TensorRT-LLM Backend" - -ENTRYPOINT ["./text-generation-launcher"] -CMD ["--executor-worker", "/usr/local/tgi/bin/executorWorker"] - # This is used only for the CI/CD FROM nvidia/cuda:12.6.3-cudnn-runtime-ubuntu24.04 AS ci-runtime RUN apt update && apt install -y libasan8 libubsan1 libucx0 pipx python3-minimal python3-dev python3-pip python3-venv && \ @@ -152,3 +143,13 @@ COPY --from=tgi-builder /usr/local/tgi /usr/local/tgi # Basically we copy from target/debug instead of target/release COPY --from=tgi-builder /usr/src/text-generation-inference/target/debug/text-generation-backends-trtllm /usr/local/tgi/bin/text-generation-launcher + +# This is the final image +FROM runtime + +LABEL co.huggingface.vendor="Hugging Face Inc." 
+LABEL org.opencontainers.image.authors="hardware@hf.co" +LABEL org.opencontainers.title="Text-Generation-Inference TensorRT-LLM Backend" + +ENTRYPOINT ["./text-generation-launcher"] +CMD ["--executor-worker", "/usr/local/tgi/bin/executorWorker"] diff --git a/docs/source/backends/trtllm.md b/docs/source/backends/trtllm.md index be6416b15..e89e8f520 100644 --- a/docs/source/backends/trtllm.md +++ b/docs/source/backends/trtllm.md @@ -4,8 +4,13 @@ The NVIDIA TensorRT-LLM (TRTLLM) backend is a high-performance backend for LLMs that uses NVIDIA's TensorRT library for inference acceleration. It makes use of specific optimizations for NVIDIA GPUs, such as custom kernels. -To use the TRTLLM backend you need to compile `engines` for the models you want to use. -Each `engine` must be compiled on the same GPU architecture that you will use for inference. +To use the TRTLLM backend **you need to compile** `engines` for the models you want to use. +Each `engine` must be compiled for a given set of: +- GPU architecture that you will use for inference (e.g. A100, L40, etc.) +- Maximum batch size +- Maximum input length +- Maximum output length +- Maximum beams width ## Supported models @@ -19,63 +24,159 @@ want to use. ```bash MODEL_NAME="meta-llama/Llama-3.1-8B-Instruct" - -# Install huggingface_cli -python -m pip install huggingface-cli[hf_transfer] - -# Login to the Hugging Face Hub -huggingface-cli login - -# Create a directory to store the model -mkdir -p /tmp/models/$MODEL_NAME - -# Create a directory to store the compiled engine -mkdir -p /tmp/engines/$MODEL_NAME - -# Download the model -HF_HUB_ENABLE_HF_TRANSFER=1 huggingface-cli download --local-dir /tmp/models/$MODEL_NAME $MODEL_NAME - +DESTINATION="/tmp/engines/$MODEL_NAME" +HF_TOKEN="hf_xxx" # Compile the engine using Optimum-NVIDIA +# This will create a compiled engine in the /tmp/engines/meta-llama/Llama-3.1-8B-Instruct +# directory for 1 GPU docker run \ --rm \ -it \ --gpus=1 \ - -v /tmp/models/$MODEL_NAME:/model \ - -v /tmp/engines/$MODEL_NAME:/engine \ - huggingface/optimum-nvidia \ - optimum-cli export trtllm \ + --shm-size=1g \ + -v "$DESTINATION":/engine \ + -e HF_TOKEN=$HF_TOKEN \ + -e HF_HUB_ENABLE_HF_TRANSFER=1 \ + huggingface/optimum-nvidia:v0.1.0b9-py310 \ + bash -c "optimum-cli export trtllm \ --tp=1 \ --pp=1 \ - --max-batch-size=128 \ + --max-batch-size=64 \ --max-input-length 4096 \ --max-output-length 8192 \ --max-beams-width=1 \ - --destination /engine \ - $MODEL_NAME + --destination /tmp/engine \ + $MODEL_NAME && cp -rL /tmp/engine/* /engine/" ``` -Your compiled engine will be saved in the `/tmp/engines/$MODEL_NAME` directory. +Your compiled engine will be saved in the `/tmp/engines/$MODEL_NAME` directory, in a subfolder named after the GPU used to compile the model. ## Using the TRTLLM backend Run TGI-TRTLLM Docker image with the compiled engine: ```bash +MODEL_NAME="meta-llama/Llama-3.1-8B-Instruct" +DESTINATION="/tmp/engines/$MODEL_NAME" +HF_TOKEN="hf_xxx" docker run \ --gpus 1 \ + --shm-size=1g \ -it \ --rm \ -p 3000:3000 \ -e MODEL=$MODEL_NAME \ -e PORT=3000 \ - -e HF_TOKEN='hf_XXX' \ - -v /tmp/engines/$MODEL_NAME:/data \ + -e HF_TOKEN=$HF_TOKEN \ + -v "$DESTINATION"//engines:/data \ ghcr.io/huggingface/text-generation-inference:latest-trtllm \ - --executor-worker executorWorker \ - --model-id /data/$MODEL_NAME + --model-id /data/ \ + --tokenizer-name $MODEL_NAME ``` ## Development -To develop TRTLLM backend, you can use [dev containers](https://containers.dev/) located in -`.devcontainer` directory. 
+To develop TRTLLM backend, you can use [dev containers](https://containers.dev/) with the following `.devcontainer.json` file: +```json +{ + "name": "CUDA", + "build": { + "dockerfile": "Dockerfile_trtllm", + "context": ".." + }, + "remoteEnv": { + "PATH": "${containerEnv:PATH}:/usr/local/cuda/bin", + "LD_LIBRARY_PATH": "$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64", + "XLA_FLAGS": "--xla_gpu_cuda_data_dir=/usr/local/cuda" + }, + "customizations" : { + "jetbrains" : { + "backend" : "CLion" + } + } +} +``` + +and `Dockerfile_trtllm`: + +```Dockerfile +ARG cuda_arch_list="75-real;80-real;86-real;89-real;90-real" +ARG build_type=release +ARG ompi_version=4.1.7 + +# CUDA dependent dependencies resolver stage +FROM nvidia/cuda:12.6.3-cudnn-devel-ubuntu24.04 AS cuda-builder + +RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \ + build-essential \ + cmake \ + curl \ + gcc-14 \ + g++-14 \ + git \ + git-lfs \ + lld \ + libssl-dev \ + libucx-dev \ + libasan8 \ + libubsan1 \ + ninja-build \ + pkg-config \ + pipx \ + python3 \ + python3-dev \ + python3-setuptools \ + tar \ + wget --no-install-recommends && \ + pipx ensurepath + +ENV TGI_INSTALL_PREFIX=/usr/local/tgi +ENV TENSORRT_INSTALL_PREFIX=/usr/local/tensorrt + +# Install OpenMPI +FROM cuda-builder AS mpi-builder +WORKDIR /opt/src/mpi + +ARG ompi_version +ENV OMPI_VERSION=${ompi_version} +ENV OMPI_TARBALL_FILENAME=openmpi-${OMPI_VERSION}.tar.bz2 +ADD --checksum=sha256:54a33cb7ad81ff0976f15a6cc8003c3922f0f3d8ceed14e1813ef3603f22cd34 \ + https://download.open-mpi.org/release/open-mpi/v4.1/${OMPI_TARBALL_FILENAME} . + +RUN tar --strip-components=1 -xf ${OMPI_TARBALL_FILENAME} &&\ + ./configure --prefix=/usr/local/mpi --with-cuda=/usr/local/cuda --with-slurm && \ + make -j all && \ + make install && \ + rm -rf ${OMPI_TARBALL_FILENAME}/.. 
+ +# Install TensorRT +FROM cuda-builder AS trt-builder +COPY backends/trtllm/scripts/install_tensorrt.sh /opt/install_tensorrt.sh +RUN chmod +x /opt/install_tensorrt.sh && \ + /opt/install_tensorrt.sh + +# Build Backend +FROM cuda-builder AS tgi-builder +WORKDIR /usr/src/text-generation-inference + +# Scoped global args reuse +ARG cuda_arch_list +ARG build_type +ARG sccache_gha_enabled +ARG actions_cache_url +ARG actions_runtime_token + +# Install Rust +ENV PATH="/root/.cargo/bin:$PATH" +RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | bash -s -- -y && \ + chmod -R a+w /root/.rustup && \ + chmod -R a+w /root/.cargo && \ + cargo install sccache --locked + +ENV LD_LIBRARY_PATH="/usr/local/mpi/lib:$LD_LIBRARY_PATH" +ENV PKG_CONFIG_PATH="/usr/local/mpi/lib/pkgconfig" +ENV CMAKE_PREFIX_PATH="/usr/local/mpi:/usr/local/tensorrt" + +ENV USE_LLD_LINKER=ON +ENV CUDA_ARCH_LIST=${cuda_arch_list} +``` \ No newline at end of file From c07a2cc82bdd41a81e16ed534b0c6a3fce56eb30 Mon Sep 17 00:00:00 2001 From: Mohit Sharma Date: Fri, 31 Jan 2025 16:10:00 +0530 Subject: [PATCH 068/183] Update moe-kernel to 0.8.2 for rocm (#2977) update moe-kernel for amd --- Dockerfile_amd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile_amd b/Dockerfile_amd index 8b7808bea..205e46d93 100644 --- a/Dockerfile_amd +++ b/Dockerfile_amd @@ -279,7 +279,7 @@ RUN git clone https://github.com/danieldk/marlin-kernels.git && \ FROM kernel-builder AS moe-kernels WORKDIR /usr/src -ENV MOE_KERNELS_BRANCH=d7e042bf9f7aff10c631212fc71b24895d66eb59 +ENV MOE_KERNELS_BRANCH=v0.8.2 ENV VLLM_TARGET_DEVICE=rocm RUN git clone https://github.com/danieldk/moe-kernels.git && \ cd moe-kernels && \ From c9d68945cc5e9dd915e6fcc5280f9740129f3336 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Fri, 31 Jan 2025 14:19:01 +0100 Subject: [PATCH 069/183] Prepare for release 3.1.0 (#2972) * Prepare for release 3.1.0 * Back on main flake. * Fixing stuff. * Upgrade to moe-kernels 0.8.2 for Hip support. * Deactivating the flaky test. 
--- .github/workflows/build.yaml | 10 +++--- Cargo.lock | 14 ++++---- Cargo.toml | 2 +- README.md | 6 ++-- docs/openapi.json | 2 +- docs/source/backends/trtllm.md | 2 +- .../basic_tutorials/gated_model_access.md | 2 +- docs/source/conceptual/quantization.md | 6 ++-- docs/source/installation_amd.md | 2 +- docs/source/installation_intel.md | 4 +-- docs/source/installation_nvidia.md | 2 +- docs/source/quicktour.md | 4 +-- docs/source/reference/api_reference.md | 2 +- flake.lock | 8 ++--- flake.nix | 2 +- .../models/test_flash_starcoder_gptq.py | 32 ++++++++++--------- server/pyproject.toml | 2 +- 17 files changed, 52 insertions(+), 50 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 2db466a84..c991ffcbe 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -74,12 +74,12 @@ jobs: export runs_on="ubuntu-latest" export platform="" export extra_pytest="" - if [[ "${GITHUB_REF}" == "refs/tags/*" ]]; then - export build_type="release"; - export target=""; - else + if [[ "${GITHUB_REF}" == "refs/tags/*" ]]; then + export build_type="release"; + export target=""; + else export build_type="dev"; - export target="ci-runtime"; + export target="ci-runtime"; fi ;; rocm) diff --git a/Cargo.lock b/Cargo.lock index af3e19027..d6883f9de 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4423,7 +4423,7 @@ dependencies = [ [[package]] name = "text-generation-backends-trtllm" -version = "3.0.2-dev0" +version = "3.1.1-dev0" dependencies = [ "async-trait", "clap 4.5.21", @@ -4444,7 +4444,7 @@ dependencies = [ [[package]] name = "text-generation-benchmark" -version = "3.0.2-dev0" +version = "3.1.1-dev0" dependencies = [ "average", "clap 4.5.21", @@ -4464,7 +4464,7 @@ dependencies = [ [[package]] name = "text-generation-client" -version = "3.0.2-dev0" +version = "3.1.1-dev0" dependencies = [ "async-trait", "base64 0.22.1", @@ -4482,7 +4482,7 @@ dependencies = [ [[package]] name = "text-generation-launcher" -version = "3.0.2-dev0" +version = "3.1.1-dev0" dependencies = [ "clap 4.5.21", "ctrlc", @@ -4503,7 +4503,7 @@ dependencies = [ [[package]] name = "text-generation-router" -version = "3.0.2-dev0" +version = "3.1.1-dev0" dependencies = [ "anyhow", "async-stream", @@ -4554,7 +4554,7 @@ dependencies = [ [[package]] name = "text-generation-router-v2" -version = "3.0.2-dev0" +version = "3.1.1-dev0" dependencies = [ "async-stream", "async-trait", @@ -4603,7 +4603,7 @@ dependencies = [ [[package]] name = "text-generation-router-v3" -version = "3.0.2-dev0" +version = "3.1.1-dev0" dependencies = [ "async-stream", "async-trait", diff --git a/Cargo.toml b/Cargo.toml index 9f49c9abe..6fd4b51d7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -20,7 +20,7 @@ default-members = [ resolver = "2" [workspace.package] -version = "3.0.2-dev0" +version = "3.1.1-dev0" edition = "2021" authors = ["Olivier Dehaene"] homepage = "https://github.com/huggingface/text-generation-inference" diff --git a/README.md b/README.md index 6072c9bd5..e4c4486f9 100644 --- a/README.md +++ b/README.md @@ -84,7 +84,7 @@ model=HuggingFaceH4/zephyr-7b-beta volume=$PWD/data docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data \ - ghcr.io/huggingface/text-generation-inference:3.0.2 --model-id $model + ghcr.io/huggingface/text-generation-inference:3.1.0 --model-id $model ``` And then you can make requests like @@ -121,7 +121,7 @@ curl localhost:8080/v1/chat/completions \ **Note:** To use NVIDIA GPUs, you need to install the [NVIDIA Container 
Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). We also recommend using NVIDIA drivers with CUDA version 12.2 or higher. For running the Docker container on a machine with no GPUs or CUDA support, it is enough to remove the `--gpus all` flag and add `--disable-custom-kernels`, please note CPU is not the intended platform for this project, so performance might be subpar. -**Note:** TGI supports AMD Instinct MI210 and MI250 GPUs. Details can be found in the [Supported Hardware documentation](https://huggingface.co/docs/text-generation-inference/installation_amd#using-tgi-with-amd-gpus). To use AMD GPUs, please use `docker run --device /dev/kfd --device /dev/dri --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.0.2-rocm --model-id $model` instead of the command above. +**Note:** TGI supports AMD Instinct MI210 and MI250 GPUs. Details can be found in the [Supported Hardware documentation](https://huggingface.co/docs/text-generation-inference/installation_amd#using-tgi-with-amd-gpus). To use AMD GPUs, please use `docker run --device /dev/kfd --device /dev/dri --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.1.0-rocm --model-id $model` instead of the command above. To see all options to serve your models (in the [code](https://github.com/huggingface/text-generation-inference/blob/main/launcher/src/main.rs) or in the cli): ``` @@ -152,7 +152,7 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading token= docker run --gpus all --shm-size 1g -e HF_TOKEN=$token -p 8080:80 -v $volume:/data \ - ghcr.io/huggingface/text-generation-inference:3.0.2 --model-id $model + ghcr.io/huggingface/text-generation-inference:3.1.0 --model-id $model ``` ### A note on Shared Memory (shm) diff --git a/docs/openapi.json b/docs/openapi.json index 48120f77e..a1df080b4 100644 --- a/docs/openapi.json +++ b/docs/openapi.json @@ -10,7 +10,7 @@ "name": "Apache 2.0", "url": "https://www.apache.org/licenses/LICENSE-2.0" }, - "version": "3.0.2-dev0" + "version": "3.1.1-dev0" }, "paths": { "/": { diff --git a/docs/source/backends/trtllm.md b/docs/source/backends/trtllm.md index e89e8f520..10db4a508 100644 --- a/docs/source/backends/trtllm.md +++ b/docs/source/backends/trtllm.md @@ -179,4 +179,4 @@ ENV CMAKE_PREFIX_PATH="/usr/local/mpi:/usr/local/tensorrt" ENV USE_LLD_LINKER=ON ENV CUDA_ARCH_LIST=${cuda_arch_list} -``` \ No newline at end of file +``` diff --git a/docs/source/basic_tutorials/gated_model_access.md b/docs/source/basic_tutorials/gated_model_access.md index 6520a6aba..949aab56a 100644 --- a/docs/source/basic_tutorials/gated_model_access.md +++ b/docs/source/basic_tutorials/gated_model_access.md @@ -19,6 +19,6 @@ docker run --gpus all \ --shm-size 1g \ -e HF_TOKEN=$token \ -p 8080:80 \ - -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.0.2 \ + -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.1.0 \ --model-id $model ``` diff --git a/docs/source/conceptual/quantization.md b/docs/source/conceptual/quantization.md index a0eb0c512..449cc79bc 100644 --- a/docs/source/conceptual/quantization.md +++ b/docs/source/conceptual/quantization.md @@ -19,7 +19,7 @@ bitsandbytes is a library used to apply 8-bit and 4-bit quantization to models. 
In TGI, you can use 8-bit quantization by adding `--quantize bitsandbytes` like below 👇 ```bash -docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.0.2 --model-id $model --quantize bitsandbytes +docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.1.0 --model-id $model --quantize bitsandbytes ``` 4-bit quantization is also possible with bitsandbytes. You can choose one of the following 4-bit data types: 4-bit float (`fp4`), or 4-bit `NormalFloat` (`nf4`). These data types were introduced in the context of parameter-efficient fine-tuning, but you can apply them for inference by automatically converting the model weights on load. @@ -27,7 +27,7 @@ docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingf In TGI, you can use 4-bit quantization by adding `--quantize bitsandbytes-nf4` or `--quantize bitsandbytes-fp4` like below 👇 ```bash -docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.0.2 --model-id $model --quantize bitsandbytes-nf4 +docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.1.0 --model-id $model --quantize bitsandbytes-nf4 ``` You can get more information about 8-bit quantization by reading this [blog post](https://huggingface.co/blog/hf-bitsandbytes-integration), and 4-bit quantization by reading [this blog post](https://huggingface.co/blog/4bit-transformers-bitsandbytes). @@ -48,7 +48,7 @@ $$({\hat{W}_{l}}^{*} = argmin_{\hat{W_{l}}} ||W_{l}X-\hat{W}_{l}X||^{2}_{2})$$ TGI allows you to both run an already GPTQ quantized model (see available models [here](https://huggingface.co/models?search=gptq)) or quantize a model of your choice using quantization script. You can run a quantized model by simply passing --quantize like below 👇 ```bash -docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.0.2 --model-id $model --quantize gptq +docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.1.0 --model-id $model --quantize gptq ``` Note that TGI's GPTQ implementation doesn't use [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) under the hood. However, models quantized using AutoGPTQ or Optimum can still be served by TGI. 
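As an aside to the quantization doc hunk above: the layer-wise objective that GPTQ minimizes is easy to probe numerically. The sketch below is illustrative only — plain PyTorch with a naive round-to-nearest int8 candidate instead of GPTQ's column-wise updates, and made-up tensor shapes — and is not part of this patch or of TGI's GPTQ path.

```python
# Illustrative sketch of the layer-wise objective ||W X - W_hat X||^2 quoted in the docs above.
# Uses naive round-to-nearest int8; real GPTQ updates weight columns iteratively to lower this error.
import torch

torch.manual_seed(0)
W = torch.randn(256, 128)      # original float weights (out_features x in_features)
X = torch.randn(128, 512)      # calibration activations (in_features x n_samples)

scale = W.abs().max() / 127.0  # toy symmetric per-tensor int8 scale
W_hat = (W / scale).round().clamp(-128, 127) * scale

err = ((W @ X - W_hat @ X) ** 2).sum()
print(f"reconstruction error: {err.item():.3f}")
```

Achieving a lower reconstruction error than naive rounding at the same bit-width is the whole point of GPTQ-style post-training quantization.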
diff --git a/docs/source/installation_amd.md b/docs/source/installation_amd.md index 797f0a5d0..100bc2a99 100644 --- a/docs/source/installation_amd.md +++ b/docs/source/installation_amd.md @@ -11,7 +11,7 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading docker run --rm -it --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \ --device=/dev/kfd --device=/dev/dri --group-add video \ --ipc=host --shm-size 256g --net host -v $volume:/data \ - ghcr.io/huggingface/text-generation-inference:3.0.2-rocm \ + ghcr.io/huggingface/text-generation-inference:3.1.0-rocm \ --model-id $model ``` diff --git a/docs/source/installation_intel.md b/docs/source/installation_intel.md index 165d1105d..b2279bb48 100644 --- a/docs/source/installation_intel.md +++ b/docs/source/installation_intel.md @@ -12,7 +12,7 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading docker run --rm --privileged --cap-add=sys_nice \ --device=/dev/dri \ --ipc=host --shm-size 1g --net host -v $volume:/data \ - ghcr.io/huggingface/text-generation-inference:3.0.2-intel-xpu \ + ghcr.io/huggingface/text-generation-inference:3.1.0-intel-xpu \ --model-id $model --cuda-graphs 0 ``` @@ -29,7 +29,7 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading docker run --rm --privileged --cap-add=sys_nice \ --device=/dev/dri \ --ipc=host --shm-size 1g --net host -v $volume:/data \ - ghcr.io/huggingface/text-generation-inference:3.0.2-intel-cpu \ + ghcr.io/huggingface/text-generation-inference:3.1.0-intel-cpu \ --model-id $model --cuda-graphs 0 ``` diff --git a/docs/source/installation_nvidia.md b/docs/source/installation_nvidia.md index 0092f5ef9..8c4bdaee9 100644 --- a/docs/source/installation_nvidia.md +++ b/docs/source/installation_nvidia.md @@ -11,7 +11,7 @@ model=teknium/OpenHermes-2.5-Mistral-7B volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run docker run --gpus all --shm-size 64g -p 8080:80 -v $volume:/data \ - ghcr.io/huggingface/text-generation-inference:3.0.2 \ + ghcr.io/huggingface/text-generation-inference:3.1.0 \ --model-id $model ``` diff --git a/docs/source/quicktour.md b/docs/source/quicktour.md index 4ffb9728f..bd4956a0a 100644 --- a/docs/source/quicktour.md +++ b/docs/source/quicktour.md @@ -11,7 +11,7 @@ model=teknium/OpenHermes-2.5-Mistral-7B volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data \ - ghcr.io/huggingface/text-generation-inference:3.0.2 \ + ghcr.io/huggingface/text-generation-inference:3.1.0 \ --model-id $model ``` @@ -96,7 +96,7 @@ curl 127.0.0.1:8080/generate \ To see all possible deploy flags and options, you can use the `--help` flag. It's possible to configure the number of shards, quantization, generation parameters, and more. 
```bash -docker run ghcr.io/huggingface/text-generation-inference:3.0.2 --help +docker run ghcr.io/huggingface/text-generation-inference:3.1.0 --help ``` diff --git a/docs/source/reference/api_reference.md b/docs/source/reference/api_reference.md index f7723187f..ee34d5870 100644 --- a/docs/source/reference/api_reference.md +++ b/docs/source/reference/api_reference.md @@ -163,7 +163,7 @@ hub = { # create Hugging Face Model Class huggingface_model = HuggingFaceModel( - image_uri=get_huggingface_llm_image_uri("huggingface",version="3.0.2"), + image_uri=get_huggingface_llm_image_uri("huggingface",version="3.1.0"), env=hub, role=role, ) diff --git a/flake.lock b/flake.lock index ba3a71733..2e7ddb29a 100644 --- a/flake.lock +++ b/flake.lock @@ -978,16 +978,16 @@ "nixpkgs": "nixpkgs_6" }, "locked": { - "lastModified": 1738229197, - "narHash": "sha256-K/YJSFhzP0vN23GMfM1HVMtSzaM488hh12ggsMtKMG0=", + "lastModified": 1738315729, + "narHash": "sha256-tizNB3LbhPWgqs/PGgFdTxudqkttqo+R0NBkaaQP3ak=", "owner": "huggingface", "repo": "text-generation-inference-nix", - "rev": "cfcddaf3044f59c3fbd335935ac3c0e9f458d824", + "rev": "a3872305034ead72328e84628974d66969b46074", "type": "github" }, "original": { "owner": "huggingface", - "ref": "moe_0_8_1", + "ref": "moe_0.8.2", "repo": "text-generation-inference-nix", "type": "github" } diff --git a/flake.nix b/flake.nix index 6c9f50149..28555424d 100644 --- a/flake.nix +++ b/flake.nix @@ -5,7 +5,7 @@ inputs.nixpkgs.follows = "tgi-nix/nixpkgs"; }; nix-filter.url = "github:numtide/nix-filter"; - tgi-nix.url = "github:huggingface/text-generation-inference-nix/moe_0_8_1"; + tgi-nix.url = "github:huggingface/text-generation-inference-nix/moe_0.8.2"; nixpkgs.follows = "tgi-nix/nixpkgs"; flake-utils.url = "github:numtide/flake-utils"; rust-overlay = { diff --git a/integration-tests/models/test_flash_starcoder_gptq.py b/integration-tests/models/test_flash_starcoder_gptq.py index 7a9df329f..4677a6604 100644 --- a/integration-tests/models/test_flash_starcoder_gptq.py +++ b/integration-tests/models/test_flash_starcoder_gptq.py @@ -25,21 +25,23 @@ async def test_flash_starcoder_gptq(flash_starcoder_gptq, generous_response_snap assert response == generous_response_snapshot -@pytest.mark.release -@pytest.mark.asyncio -async def test_flash_starcoder_gptq_default_params( - flash_starcoder_gptq, generous_response_snapshot -): - response = await flash_starcoder_gptq.generate( - "def geometric_mean(L: List[float]):", - max_new_tokens=20, - temperature=0.2, - top_p=0.95, - decoder_input_details=True, - seed=0, - ) - assert response.details.generated_tokens == 2 - assert response == generous_response_snapshot +# Deactivated because it's flaky +# Only this model seems affected and it's only a logprob precision issue. 
+# @pytest.mark.release +# @pytest.mark.asyncio +# async def test_flash_starcoder_gptq_default_params( +# flash_starcoder_gptq, generous_response_snapshot +# ): +# response = await flash_starcoder_gptq.generate( +# "def geometric_mean(L: List[float]):", +# max_new_tokens=20, +# temperature=0.2, +# top_p=0.95, +# decoder_input_details=True, +# seed=0, +# ) +# assert response.details.generated_tokens == 2 +# assert response == generous_response_snapshot @pytest.mark.release diff --git a/server/pyproject.toml b/server/pyproject.toml index 8888f5c69..da3ba820a 100644 --- a/server/pyproject.toml +++ b/server/pyproject.toml @@ -75,7 +75,7 @@ marlin-kernels = [ { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp311-cp311-linux_x86_64.whl", marker = "python_version == '3.11'" }, { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp312-cp312-linux_x86_64.whl", marker = "python_version == '3.12'" }, ] -moe-kernels.url = "https://github.com/danieldk/moe-kernels/releases/download/v0.8.1/moe_kernels-0.8.1+cu123torch2.5-cp39-abi3-linux_x86_64.whl" +moe-kernels.url = "https://github.com/danieldk/moe-kernels/releases/download/v0.8.2/moe_kernels-0.8.2+cu123torch2.5-cp39-abi3-linux_x86_64.whl" [tool.pytest.ini_options] markers = ["private: marks tests as requiring an admin hf token (deselect with '-m \"not private\"')"] From bb69c5b199928fa17097c57e67fcdbff31a25889 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Fri, 31 Jan 2025 14:39:52 +0100 Subject: [PATCH 070/183] Back on nix main. (#2979) --- flake.lock | 7 +++---- flake.nix | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/flake.lock b/flake.lock index 2e7ddb29a..3ad3d698c 100644 --- a/flake.lock +++ b/flake.lock @@ -978,16 +978,15 @@ "nixpkgs": "nixpkgs_6" }, "locked": { - "lastModified": 1738315729, - "narHash": "sha256-tizNB3LbhPWgqs/PGgFdTxudqkttqo+R0NBkaaQP3ak=", + "lastModified": 1738323634, + "narHash": "sha256-lKPzgEm7pEuQJVhacsxFHqg1MOtrUMZvr+9IuJzC5J4=", "owner": "huggingface", "repo": "text-generation-inference-nix", - "rev": "a3872305034ead72328e84628974d66969b46074", + "rev": "eb5fede2756f544f75e01f55a4097f9c9a8c5005", "type": "github" }, "original": { "owner": "huggingface", - "ref": "moe_0.8.2", "repo": "text-generation-inference-nix", "type": "github" } diff --git a/flake.nix b/flake.nix index 28555424d..83cedfa62 100644 --- a/flake.nix +++ b/flake.nix @@ -5,7 +5,7 @@ inputs.nixpkgs.follows = "tgi-nix/nixpkgs"; }; nix-filter.url = "github:numtide/nix-filter"; - tgi-nix.url = "github:huggingface/text-generation-inference-nix/moe_0.8.2"; + tgi-nix.url = "github:huggingface/text-generation-inference-nix"; nixpkgs.follows = "tgi-nix/nixpkgs"; flake-utils.url = "github:numtide/flake-utils"; rust-overlay = { From e3f2018cb5d7a8bfbfbbbc0d2dc63e2195f4229f Mon Sep 17 00:00:00 2001 From: Hugo Larcher Date: Mon, 3 Feb 2025 11:11:15 +0100 Subject: [PATCH 071/183] hotfix: fix trtllm CI build on release (#2981) * hotfix: fix trtllm CI build on release * fix: test release. * fix: test release. * fix: test release. env not recognized https://github.com/actions/runner/issues/1661 * fix: test release. Works. 
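
Note on the actual fix in the hunk below: inside a bash `[[ ... ]]` test, a quoted right-hand side such as `"refs/tags/*"` is compared as a literal string, so real tag refs like `refs/tags/v3.1.0` never match and the release branch of the CI was never taken; leaving the pattern unquoted (`refs/tags/*`) restores glob matching on the tag prefix.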
--- .github/workflows/build.yaml | 3 ++- Dockerfile_trtllm | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index c991ffcbe..1bd7163fe 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -74,7 +74,7 @@ jobs: export runs_on="ubuntu-latest" export platform="" export extra_pytest="" - if [[ "${GITHUB_REF}" == "refs/tags/*" ]]; then + if [[ "${GITHUB_REF}" == refs/tags/* ]]; then export build_type="release"; export target=""; else @@ -263,4 +263,5 @@ jobs: steps: - name: Run C++/CUDA tests + if: ${{ env.LABEL == 'ci-runtime' }} run: /usr/local/tgi/bin/tgi_trtllm_backend_tests diff --git a/Dockerfile_trtllm b/Dockerfile_trtllm index 999d63d74..2a0636ff9 100644 --- a/Dockerfile_trtllm +++ b/Dockerfile_trtllm @@ -1,7 +1,7 @@ ARG cuda_arch_list="75-real;80-real;86-real;89-real;90-real" ARG build_type=release ARG ompi_version=4.1.7 -ARG sccache_gha_enabled=no +ARG sccache_gha_enabled=off ARG actions_cache_url="" ARG actions_runtime_token="" From 88fd56f549637b36529582b682c4c76eb78e36a5 Mon Sep 17 00:00:00 2001 From: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com> Date: Mon, 3 Feb 2025 15:30:48 +0100 Subject: [PATCH 072/183] Add `strftime_now` callable function for `minijinja` chat templates (#2983) * Add `chrono` and `strftime_now` function callable * Fix `test_chat_template_valid_with_strftime_now` * Fix `test_chat_template_valid_with_strftime_now` --- Cargo.lock | 53 ++++++++++++++++++++ router/Cargo.toml | 1 + router/src/infer/chat_template.rs | 83 ++++++++++++++++++++++++++++++- 3 files changed, 136 insertions(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index d6883f9de..915de0d58 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -52,6 +52,21 @@ version = "0.2.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "45862d1c77f2228b9e10bc609d5bc203d86ebc9b87ad8d5d5167a6c9abf739d9" +[[package]] +name = "android-tzdata" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0" + +[[package]] +name = "android_system_properties" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311" +dependencies = [ + "libc", +] + [[package]] name = "anstream" version = "0.6.18" @@ -651,6 +666,20 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" +[[package]] +name = "chrono" +version = "0.4.39" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e36cc9d416881d2e24f9a963be5fb1cd90966419ac844274161d10488b3e825" +dependencies = [ + "android-tzdata", + "iana-time-zone", + "js-sys", + "num-traits", + "wasm-bindgen", + "windows-targets 0.52.6", +] + [[package]] name = "clang-sys" version = "1.8.1" @@ -1801,6 +1830,29 @@ dependencies = [ "tracing", ] +[[package]] +name = "iana-time-zone" +version = "0.1.61" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "235e081f3925a06703c2d0117ea8b91f042756fd6e7a6e5d901e8ca1a996b220" +dependencies = [ + "android_system_properties", + "core-foundation-sys", + "iana-time-zone-haiku", + "js-sys", + "wasm-bindgen", + "windows-core", +] + +[[package]] +name = "iana-time-zone-haiku" +version = "0.1.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f" +dependencies = [ + "cc", +] + [[package]] name = "icu_collections" version = "1.5.0" @@ -4511,6 +4563,7 @@ dependencies = [ "axum 0.7.9", "axum-tracing-opentelemetry", "base64 0.22.1", + "chrono", "clap 4.5.21", "csv", "futures", diff --git a/router/Cargo.toml b/router/Cargo.toml index 2e621dfc6..e4d0179a4 100644 --- a/router/Cargo.toml +++ b/router/Cargo.toml @@ -64,6 +64,7 @@ uuid = { version = "1.9.1", default-features = false, features = [ csv = "1.3.0" ureq = "=2.9" pyo3 = { workspace = true } +chrono = "0.4.39" [build-dependencies] diff --git a/router/src/infer/chat_template.rs b/router/src/infer/chat_template.rs index 2bda71933..8303ee768 100644 --- a/router/src/infer/chat_template.rs +++ b/router/src/infer/chat_template.rs @@ -1,5 +1,6 @@ use crate::infer::InferError; use crate::{ChatTemplateInputs, Message, MessageChunk, TextMessage, TokenizerConfigToken, Tool}; +use chrono::Local; use minijinja::{Environment, ErrorKind, Template}; use minijinja_contrib::pycompat; @@ -8,6 +9,11 @@ pub(crate) fn raise_exception(err_text: String) -> Result Result { + Ok(Local::now().format(&format_str).to_string()) +} + #[derive(Clone)] pub(crate) struct ChatTemplate { template: Template<'static, 'static>, @@ -27,6 +33,7 @@ impl ChatTemplate { env.set_unknown_method_callback(pycompat::unknown_method_callback); let template_str = template.into_boxed_str(); env.add_function("raise_exception", raise_exception); + env.add_function("strftime_now", strftime_now); tracing::debug!("Loading template: {}", template_str); // leaking env and template_str as read-only, static resources for performance. @@ -109,11 +116,12 @@ impl ChatTemplate { // tests #[cfg(test)] mod tests { - use crate::infer::chat_template::raise_exception; + use crate::infer::chat_template::{raise_exception, strftime_now}; use crate::infer::ChatTemplate; use crate::{ ChatTemplateInputs, Message, MessageContent, TextMessage, TokenizerConfigToken, Tool, }; + use chrono::Local; use minijinja::Environment; #[test] @@ -182,6 +190,7 @@ mod tests { fn test_chat_template_invalid_with_raise() { let mut env = Environment::new(); env.add_function("raise_exception", raise_exception); + env.add_function("strftime_now", strftime_now); let source = r#" {{ bos_token }} @@ -253,6 +262,7 @@ mod tests { fn test_chat_template_valid_with_raise() { let mut env = Environment::new(); env.add_function("raise_exception", raise_exception); + env.add_function("strftime_now", strftime_now); let source = r#" {{ bos_token }} @@ -307,10 +317,79 @@ mod tests { assert_eq!(result, "[BOS][INST] Hi! [/INST]Hello how can I help?[EOS][INST] What is Deep Learning? [/INST]magic![EOS]"); } + #[test] + fn test_chat_template_valid_with_strftime_now() { + let mut env = Environment::new(); + env.add_function("raise_exception", raise_exception); + env.add_function("strftime_now", strftime_now); + + let source = r#" + {% set today = strftime_now("%Y-%m-%d") %} + {% set default_system_message = "The current date is " + today + "." 
%} + {{ bos_token }} + {% if messages[0]['role'] == 'system' %} + { set system_message = messages[0]['content'] %} + {%- set loop_messages = messages[1:] %} + {% else %} + {%- set system_message = default_system_message %} + {%- set loop_messages = messages %} + {% endif %} + {{ '[SYSTEM_PROMPT]' + system_message + '[/SYSTEM_PROMPT]' }} + {% for message in loop_messages %} + {% if message['role'] == 'user' %} + {{ '[INST]' + message['content'] + '[/INST]' }} + {% elif message['role'] == 'assistant' %} + {{ message['content'] + eos_token }} + {% else %} + {{ raise_exception('Only user and assistant roles are supported!') }} + {% endif %} + {% endfor %} + "#; + + // trim all the whitespace + let source = source + .lines() + .map(|line| line.trim()) + .collect::>() + .join(""); + + let tmpl = env.template_from_str(&source); + + let chat_template_inputs = ChatTemplateInputs { + messages: vec![ + TextMessage { + role: "user".to_string(), + content: "Hi!".to_string(), + }, + TextMessage { + role: "assistant".to_string(), + content: "Hello how can I help?".to_string(), + }, + TextMessage { + role: "user".to_string(), + content: "What is Deep Learning?".to_string(), + }, + TextMessage { + role: "assistant".to_string(), + content: "magic!".to_string(), + }, + ], + bos_token: Some("[BOS]"), + eos_token: Some("[EOS]"), + add_generation_prompt: true, + ..Default::default() + }; + + let current_date = Local::now().format("%Y-%m-%d").to_string(); + let result = tmpl.unwrap().render(chat_template_inputs).unwrap(); + assert_eq!(result, format!("[BOS][SYSTEM_PROMPT]The current date is {}.[/SYSTEM_PROMPT][INST]Hi![/INST]Hello how can I help?[EOS][INST]What is Deep Learning?[/INST]magic![EOS]", current_date)); + } + #[test] fn test_chat_template_valid_with_add_generation_prompt() { let mut env = Environment::new(); env.add_function("raise_exception", raise_exception); + env.add_function("strftime_now", strftime_now); let source = r#" {% for message in messages %} @@ -502,6 +581,7 @@ mod tests { { let mut env = Environment::new(); env.add_function("raise_exception", raise_exception); + env.add_function("strftime_now", strftime_now); let tmpl = env.template_from_str(chat_template); let result = tmpl.unwrap().render(input).unwrap(); assert_eq!(result, target); @@ -776,6 +856,7 @@ mod tests { { let mut env = Environment::new(); env.add_function("raise_exception", raise_exception); + env.add_function("strftime_now", strftime_now); // trim all the whitespace let chat_template = chat_template .lines() From dd2bd5fdb34dab6778c6d0107c0fc307c361deb8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Tue, 4 Feb 2025 17:01:59 +0100 Subject: [PATCH 073/183] impureWithCuda: fix gcc version (#2990) * impureWithCuda: fix gcc version * trufflehog: do not fail on unverified results --- .github/workflows/trufflehog.yaml | 14 ++++++++------ nix/impure-shell.nix | 18 ++++++++++++++---- 2 files changed, 22 insertions(+), 10 deletions(-) diff --git a/.github/workflows/trufflehog.yaml b/.github/workflows/trufflehog.yaml index b406d43b8..7f6646e11 100644 --- a/.github/workflows/trufflehog.yaml +++ b/.github/workflows/trufflehog.yaml @@ -10,9 +10,11 @@ jobs: trufflehog: runs-on: ubuntu-latest steps: - - name: Checkout code - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - name: Secret Scanning - uses: trufflesecurity/trufflehog@main + - name: Checkout code + uses: actions/checkout@v4 + with: + fetch-depth: 0 + - name: Secret Scanning + uses: trufflesecurity/trufflehog@main + with: + extra_args: 
--results=verified,unknown diff --git a/nix/impure-shell.nix b/nix/impure-shell.nix index a13fd7111..aebdff844 100644 --- a/nix/impure-shell.nix +++ b/nix/impure-shell.nix @@ -94,8 +94,18 @@ mkShell { ( cd clients/python ; python -m pip install --no-dependencies -e . ) ''; - postShellHook = '' - unset SOURCE_DATE_EPOCH - export PATH=$PATH:~/.cargo/bin - ''; + postShellHook = + '' + unset SOURCE_DATE_EPOCH + export PATH=${cudaPackages.backendStdenv.cc}/bin:$PATH:~/.cargo/bin + '' + # At various points in time, the latest gcc supported by CUDA differs + # from the default version in nixpkgs. A lot of the dependencies in + # the impure environment pull in the default gcc from nixpkgs, so we + # end up with the CUDA-supported gcc and the nixpkgs default gcc in + # the path. To ensure that we can build CUDA kernels, put the CUDA + # first in the path. It's a hack, but it works. + + lib.optionalString withCuda '' + export PATH=${cudaPackages.backendStdenv.cc}/bin:$PATH + ''; } From c1cf36c0dc2d154375d061ef54addfdfe49439fd Mon Sep 17 00:00:00 2001 From: drbh Date: Tue, 4 Feb 2025 12:44:18 -0500 Subject: [PATCH 074/183] Improve qwen vl impl (#2943) * feat: refactor model, improve startup and re enable tests * fix: improve multimodal rotary embed caching * fix: limit vision flop calc to qwen2 vl models and update config typing * fix: include clippy lint * feat: refactor position ids in warmup and bump tests * fix: prefer default dtype * fix: enable all cuda graphs and bump snapshots * fix: adjust rotaty init path * fix: simplify get position ids and remove usused vision config * fix: update position ids so first dim is batch, simplify rotary and bump vlm default token limit * fix: improve position id init during cuda warmup for mrope and simplfy rotary forward * fix: check existance before accessing rope type in cuda warmup * fix: check key before access * fix: improve mrope check in cuda graph warmup * fix: remove check for default rope type * fix: add more test and improve model generation * fix: improve and simplify get_cos_sin, refactors and cleanup get_position_ids * fix: adjust signatures with types --- ...essed_tensors_w8a8_int_dynamic_weight.json | 422 +++++++++++++++++- ...rs_w8a8_int_dynamic_weight_all_params.json | 78 ++-- ..._tensors_w8a8_int_dynamic_weight_load.json | 80 ++-- .../test_flash_qwen2_vl_bay.json | 26 ++ .../test_flash_qwen2_vl_inpaint.json | 26 ++ .../test_flash_qwen2_vl_simple.json | 12 +- .../test_flash_qwen2_vl_simple_streaming.json | 4 +- ...pressed_tensors_w8a8_int_dynamic_weight.py | 9 +- .../models/test_flash_qwen2_vl.py | 203 +++++---- launcher/src/main.rs | 11 +- .../text_generation_server/layers/rotary.py | 70 +++ .../text_generation_server/models/__init__.py | 1 + .../custom_modeling/flash_qwen2_modeling.py | 43 +- .../models/custom_modeling/qwen2_vl.py | 187 ++++---- .../models/flash_causal_lm.py | 17 +- 15 files changed, 865 insertions(+), 324 deletions(-) create mode 100644 integration-tests/models/__snapshots__/test_flash_qwen2_vl/test_flash_qwen2_vl_bay.json create mode 100644 integration-tests/models/__snapshots__/test_flash_qwen2_vl/test_flash_qwen2_vl_inpaint.json diff --git a/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight.json b/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight.json index 2525f72cd..7dbfc627c 100644 --- 
a/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight.json +++ b/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight.json @@ -1,73 +1,469 @@ { "details": { "best_of_sequences": null, - "finish_reason": "length", - "generated_tokens": 10, + "finish_reason": "eos_token", + "generated_tokens": 76, "prefill": [], "seed": null, "tokens": [ { "id": 18183, - "logprob": -1.6669922, + "logprob": -1.5195312, "special": false, "text": " Deep" }, { "id": 6832, - "logprob": -0.08959961, + "logprob": -0.06817627, "special": false, "text": " learning" }, { "id": 374, - "logprob": -0.14685059, + "logprob": -0.13122559, "special": false, "text": " is" }, { "id": 264, - "logprob": -0.125, + "logprob": -0.13415527, "special": false, "text": " a" }, { "id": 25993, - "logprob": -0.81640625, + "logprob": -0.8769531, "special": false, "text": " subset" }, { "id": 315, - "logprob": -0.0013418198, + "logprob": -0.0011396408, "special": false, "text": " of" }, { "id": 5662, - "logprob": -0.16027832, + "logprob": -0.16442871, "special": false, "text": " machine" }, { "id": 6832, - "logprob": -0.0016393661, + "logprob": -0.0026416779, "special": false, "text": " learning" }, { "id": 429, - "logprob": -0.4477539, + "logprob": -0.48754883, "special": false, "text": " that" }, { "id": 5711, - "logprob": -1.2802734, + "logprob": -1.2294922, "special": false, "text": " uses" + }, + { + "id": 29728, + "logprob": -0.66503906, + "special": false, + "text": " neural" + }, + { + "id": 14155, + "logprob": -0.02960205, + "special": false, + "text": " networks" + }, + { + "id": 311, + "logprob": -0.7236328, + "special": false, + "text": " to" + }, + { + "id": 3960, + "logprob": -1.1914062, + "special": false, + "text": " learn" + }, + { + "id": 504, + "logprob": -0.7089844, + "special": false, + "text": " from" + }, + { + "id": 821, + "logprob": -0.7729492, + "special": false, + "text": " data" + }, + { + "id": 13, + "logprob": -0.7836914, + "special": false, + "text": "." 
+ }, + { + "id": 1084, + "logprob": -0.9941406, + "special": false, + "text": " It" + }, + { + "id": 374, + "logprob": -0.52441406, + "special": false, + "text": " is" + }, + { + "id": 264, + "logprob": -0.9511719, + "special": false, + "text": " a" + }, + { + "id": 943, + "logprob": -0.8642578, + "special": false, + "text": " type" + }, + { + "id": 315, + "logprob": -0.00030231476, + "special": false, + "text": " of" + }, + { + "id": 20443, + "logprob": -0.14416504, + "special": false, + "text": " artificial" + }, + { + "id": 11229, + "logprob": -0.013824463, + "special": false, + "text": " intelligence" + }, + { + "id": 429, + "logprob": -0.18762207, + "special": false, + "text": " that" + }, + { + "id": 646, + "logprob": -1.0087891, + "special": false, + "text": " can" + }, + { + "id": 3960, + "logprob": -0.90234375, + "special": false, + "text": " learn" + }, + { + "id": 504, + "logprob": -0.54345703, + "special": false, + "text": " from" + }, + { + "id": 323, + "logprob": -1.0400391, + "special": false, + "text": " and" + }, + { + "id": 1281, + "logprob": -0.072509766, + "special": false, + "text": " make" + }, + { + "id": 19898, + "logprob": -0.16516113, + "special": false, + "text": " predictions" + }, + { + "id": 389, + "logprob": -0.4416504, + "special": false, + "text": " on" + }, + { + "id": 3460, + "logprob": -0.5385742, + "special": false, + "text": " large" + }, + { + "id": 14713, + "logprob": -0.4387207, + "special": false, + "text": " amounts" + }, + { + "id": 315, + "logprob": -0.00015091896, + "special": false, + "text": " of" + }, + { + "id": 821, + "logprob": -0.061431885, + "special": false, + "text": " data" + }, + { + "id": 13, + "logprob": -0.71875, + "special": false, + "text": "." + }, + { + "id": 18183, + "logprob": -0.23632812, + "special": false, + "text": " Deep" + }, + { + "id": 6832, + "logprob": -0.0017204285, + "special": false, + "text": " learning" + }, + { + "id": 374, + "logprob": -1.1738281, + "special": false, + "text": " is" + }, + { + "id": 1483, + "logprob": -0.61083984, + "special": false, + "text": " used" + }, + { + "id": 304, + "logprob": -0.035003662, + "special": false, + "text": " in" + }, + { + "id": 264, + "logprob": -0.118652344, + "special": false, + "text": " a" + }, + { + "id": 8045, + "logprob": -0.42016602, + "special": false, + "text": " variety" + }, + { + "id": 315, + "logprob": -1.6212463e-05, + "special": false, + "text": " of" + }, + { + "id": 8357, + "logprob": -0.1315918, + "special": false, + "text": " applications" + }, + { + "id": 11, + "logprob": -0.12915039, + "special": false, + "text": "," + }, + { + "id": 2670, + "logprob": -0.12463379, + "special": false, + "text": " including" + }, + { + "id": 2168, + "logprob": -0.37402344, + "special": false, + "text": " image" + }, + { + "id": 323, + "logprob": -0.1451416, + "special": false, + "text": " and" + }, + { + "id": 8806, + "logprob": -0.028869629, + "special": false, + "text": " speech" + }, + { + "id": 17843, + "logprob": -0.00024068356, + "special": false, + "text": " recognition" + }, + { + "id": 11, + "logprob": -0.00031018257, + "special": false, + "text": "," + }, + { + "id": 5810, + "logprob": -0.019821167, + "special": false, + "text": " natural" + }, + { + "id": 4128, + "logprob": -0.00012528896, + "special": false, + "text": " language" + }, + { + "id": 8692, + "logprob": -0.00089263916, + "special": false, + "text": " processing" + }, + { + "id": 11, + "logprob": -0.00073862076, + "special": false, + "text": "," + }, + { + "id": 323, + "logprob": 
-0.040161133, + "special": false, + "text": " and" + }, + { + "id": 38193, + "logprob": -0.4519043, + "special": false, + "text": " autonomous" + }, + { + "id": 11474, + "logprob": -0.39941406, + "special": false, + "text": " vehicles" + }, + { + "id": 13, + "logprob": -0.21166992, + "special": false, + "text": "." + }, + { + "id": 1084, + "logprob": -0.9082031, + "special": false, + "text": " It" + }, + { + "id": 374, + "logprob": -0.44213867, + "special": false, + "text": " is" + }, + { + "id": 264, + "logprob": -1.2177734, + "special": false, + "text": " a" + }, + { + "id": 18512, + "logprob": -0.5205078, + "special": false, + "text": " rapidly" + }, + { + "id": 7826, + "logprob": -0.15332031, + "special": false, + "text": " growing" + }, + { + "id": 2070, + "logprob": -0.0039978027, + "special": false, + "text": " field" + }, + { + "id": 448, + "logprob": -0.9091797, + "special": false, + "text": " with" + }, + { + "id": 1657, + "logprob": -0.17114258, + "special": false, + "text": " many" + }, + { + "id": 4650, + "logprob": -0.70703125, + "special": false, + "text": " potential" + }, + { + "id": 8357, + "logprob": -0.025131226, + "special": false, + "text": " applications" + }, + { + "id": 304, + "logprob": -0.6699219, + "special": false, + "text": " in" + }, + { + "id": 279, + "logprob": -0.35205078, + "special": false, + "text": " the" + }, + { + "id": 3853, + "logprob": -0.049194336, + "special": false, + "text": " future" + }, + { + "id": 13, + "logprob": -0.21972656, + "special": false, + "text": "." + }, + { + "id": 151643, + "logprob": -2.0019531, + "special": true, + "text": "<|endoftext|>" } ], "top_tokens": null }, - "generated_text": " Deep learning is a subset of machine learning that uses" + "generated_text": " Deep learning is a subset of machine learning that uses neural networks to learn from data. It is a type of artificial intelligence that can learn from and make predictions on large amounts of data. Deep learning is used in a variety of applications, including image and speech recognition, natural language processing, and autonomous vehicles. It is a rapidly growing field with many potential applications in the future." 
} diff --git a/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight_all_params.json b/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight_all_params.json index 6b3f50929..2c840e671 100644 --- a/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight_all_params.json +++ b/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight_all_params.json @@ -7,67 +7,67 @@ "seed": 0, "tokens": [ { - "id": 1939, - "logprob": -2.2460938, + "id": 5267, + "logprob": -1.1464844, "special": false, - "text": "?\n\n" + "text": "?\n" }, { "id": 33464, - "logprob": 0.0, + "logprob": -0.83203125, "special": false, "text": "Deep" }, { "id": 20909, - "logprob": -0.48608398, + "logprob": -0.5625, "special": false, "text": " Learning" }, - { - "id": 4102, - "logprob": -2.265625, - "special": false, - "text": " " - }, - { - "id": 285, - "logprob": 0.0, - "special": false, - "text": "is" - }, - { - "id": 458, - "logprob": -0.6328125, - "special": false, - "text": " an" - }, - { - "id": 20443, - "logprob": -0.1796875, - "special": false, - "text": " artificial" - }, - { - "id": 11229, - "logprob": 0.0, - "special": false, - "text": " intelligence" - }, { "id": 320, - "logprob": -0.37695312, + "logprob": -2.1464844, "special": false, "text": " (" }, { - "id": 15469, + "id": 16524, "logprob": 0.0, "special": false, - "text": "AI" + "text": "DL" + }, + { + "id": 701, + "logprob": -2.2089844, + "special": false, + "text": ")," + }, + { + "id": 476, + "logprob": -0.27368164, + "special": false, + "text": " or" + }, + { + "id": 20443, + "logprob": -0.09442139, + "special": false, + "text": " artificial" + }, + { + "id": 29728, + "logprob": 0.0, + "special": false, + "text": " neural" + }, + { + "id": 14155, + "logprob": 0.0, + "special": false, + "text": " networks" } ], "top_tokens": null }, - "generated_text": "What is deep learning?\n\nDeep Learning is an artificial intelligence (AI" + "generated_text": "What is deep learning?\nDeep Learning (DL), or artificial neural networks" } diff --git a/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight_load.json b/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight_load.json index 1fa4e33aa..aee5698b4 100644 --- a/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight_load.json +++ b/integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight_load.json @@ -9,61 +9,61 @@ "tokens": [ { "id": 18183, - "logprob": -1.4912109, + "logprob": -1.5195312, "special": false, "text": " Deep" }, { "id": 6832, - "logprob": -0.075683594, + "logprob": -0.06817627, "special": false, "text": " learning" }, { "id": 374, - "logprob": -0.12408447, + "logprob": -0.13122559, "special": false, "text": " is" }, { "id": 264, - "logprob": -0.12768555, + "logprob": -0.13415527, "special": false, "text": " a" }, { "id": 25993, - "logprob": -0.82128906, + "logprob": -0.87353516, "special": false, "text": " subset" }, { "id": 315, - "logprob": -0.0012636185, + "logprob": 
-0.0011396408, "special": false, "text": " of" }, { "id": 5662, - "logprob": -0.12878418, + "logprob": -0.16442871, "special": false, "text": " machine" }, { "id": 6832, - "logprob": -0.0015888214, + "logprob": -0.0026416779, "special": false, "text": " learning" }, { "id": 429, - "logprob": -0.49194336, + "logprob": -0.48754883, "special": false, "text": " that" }, { "id": 5711, - "logprob": -1.2626953, + "logprob": -1.2294922, "special": false, "text": " uses" } @@ -82,61 +82,61 @@ "tokens": [ { "id": 18183, - "logprob": -1.4912109, + "logprob": -1.5195312, "special": false, "text": " Deep" }, { "id": 6832, - "logprob": -0.075683594, + "logprob": -0.06817627, "special": false, "text": " learning" }, { "id": 374, - "logprob": -0.12408447, + "logprob": -0.13122559, "special": false, "text": " is" }, { "id": 264, - "logprob": -0.12768555, + "logprob": -0.13415527, "special": false, "text": " a" }, { "id": 25993, - "logprob": -0.82128906, + "logprob": -0.87353516, "special": false, "text": " subset" }, { "id": 315, - "logprob": -0.0012636185, + "logprob": -0.0011396408, "special": false, "text": " of" }, { "id": 5662, - "logprob": -0.12878418, + "logprob": -0.16442871, "special": false, "text": " machine" }, { "id": 6832, - "logprob": -0.0015888214, + "logprob": -0.0026416779, "special": false, "text": " learning" }, { "id": 429, - "logprob": -0.49194336, + "logprob": -0.48754883, "special": false, "text": " that" }, { "id": 5711, - "logprob": -1.2626953, + "logprob": -1.2294922, "special": false, "text": " uses" } @@ -155,61 +155,61 @@ "tokens": [ { "id": 18183, - "logprob": -1.4912109, + "logprob": -1.5195312, "special": false, "text": " Deep" }, { "id": 6832, - "logprob": -0.075683594, + "logprob": -0.06817627, "special": false, "text": " learning" }, { "id": 374, - "logprob": -0.12408447, + "logprob": -0.13122559, "special": false, "text": " is" }, { "id": 264, - "logprob": -0.12768555, + "logprob": -0.13415527, "special": false, "text": " a" }, { "id": 25993, - "logprob": -0.82128906, + "logprob": -0.87353516, "special": false, "text": " subset" }, { "id": 315, - "logprob": -0.0012636185, + "logprob": -0.0011396408, "special": false, "text": " of" }, { "id": 5662, - "logprob": -0.12878418, + "logprob": -0.16442871, "special": false, "text": " machine" }, { "id": 6832, - "logprob": -0.0015888214, + "logprob": -0.0026416779, "special": false, "text": " learning" }, { "id": 429, - "logprob": -0.49194336, + "logprob": -0.48754883, "special": false, "text": " that" }, { "id": 5711, - "logprob": -1.2626953, + "logprob": -1.2294922, "special": false, "text": " uses" } @@ -228,61 +228,61 @@ "tokens": [ { "id": 18183, - "logprob": -1.4912109, + "logprob": -1.5195312, "special": false, "text": " Deep" }, { "id": 6832, - "logprob": -0.075683594, + "logprob": -0.06817627, "special": false, "text": " learning" }, { "id": 374, - "logprob": -0.12408447, + "logprob": -0.13122559, "special": false, "text": " is" }, { "id": 264, - "logprob": -0.12768555, + "logprob": -0.13415527, "special": false, "text": " a" }, { "id": 25993, - "logprob": -0.82128906, + "logprob": -0.87353516, "special": false, "text": " subset" }, { "id": 315, - "logprob": -0.0012636185, + "logprob": -0.0011396408, "special": false, "text": " of" }, { "id": 5662, - "logprob": -0.12878418, + "logprob": -0.16442871, "special": false, "text": " machine" }, { "id": 6832, - "logprob": -0.0015888214, + "logprob": -0.0026416779, "special": false, "text": " learning" }, { "id": 429, - "logprob": -0.49194336, + "logprob": -0.48754883, 
"special": false, "text": " that" }, { "id": 5711, - "logprob": -1.2626953, + "logprob": -1.2294922, "special": false, "text": " uses" } diff --git a/integration-tests/models/__snapshots__/test_flash_qwen2_vl/test_flash_qwen2_vl_bay.json b/integration-tests/models/__snapshots__/test_flash_qwen2_vl/test_flash_qwen2_vl_bay.json new file mode 100644 index 000000000..25a1abc7a --- /dev/null +++ b/integration-tests/models/__snapshots__/test_flash_qwen2_vl/test_flash_qwen2_vl_bay.json @@ -0,0 +1,26 @@ +{ + "choices": [ + { + "finish_reason": "stop", + "index": 0, + "logprobs": null, + "message": { + "content": "The image showcases a stunning cityscape, featuring the iconic Statue of Liberty in the foreground. The image displays Lady Liberty's imposing presence, with her towering base standing beside her. Behind the statue, the city's skyline extends across the horizon, adorned with numerous tall buildings, including the Empire State Building and other notable skyscrapers. The water reflecting the sun's rays creates a serene and picturesque scene, emphasizing the beauty and resilience of this global landmark. The sky is a clear, pale blue, adding to the overall tranquility of the scene.", + "name": null, + "role": "assistant", + "tool_calls": null + }, + "usage": null + } + ], + "created": 1738348090, + "id": "", + "model": "Qwen/Qwen2-VL-7B-Instruct", + "object": "chat.completion", + "system_fingerprint": "3.1.1-dev0-native", + "usage": { + "completion_tokens": 110, + "prompt_tokens": 8736, + "total_tokens": 8846 + } +} diff --git a/integration-tests/models/__snapshots__/test_flash_qwen2_vl/test_flash_qwen2_vl_inpaint.json b/integration-tests/models/__snapshots__/test_flash_qwen2_vl/test_flash_qwen2_vl_inpaint.json new file mode 100644 index 000000000..325e658f1 --- /dev/null +++ b/integration-tests/models/__snapshots__/test_flash_qwen2_vl/test_flash_qwen2_vl_inpaint.json @@ -0,0 +1,26 @@ +{ + "choices": [ + { + "finish_reason": "stop", + "index": 0, + "logprobs": null, + "message": { + "content": "The image shows a stylized scene set in what appears to be a diner or restaurant. In the foreground, there is a table with various food items, including a burger with lettuce and tomato, a bowl of fries, and a drink in a cup with a straw. On the right side of the table, there is an owl sitting alertly, looking directly at the camera. Behind the owl and the table, there is a large, green, dinosaur-like creature resembling Godzilla, with its mouth open and tongue visible. 
In the background, the diner's decor includes various signs and posters, with a green sign reading \"Basta\" and another sign that says \"Tabasco.\" The setting has a retro or vintage feel, with fluorescent lighting overhead and clean, polished surfaces.", + "name": null, + "role": "assistant", + "tool_calls": null + }, + "usage": null + } + ], + "created": 1738348100, + "id": "", + "model": "Qwen/Qwen2-VL-7B-Instruct", + "object": "chat.completion", + "system_fingerprint": "3.1.1-dev0-native", + "usage": { + "completion_tokens": 156, + "prompt_tokens": 5375, + "total_tokens": 5531 + } +} diff --git a/integration-tests/models/__snapshots__/test_flash_qwen2_vl/test_flash_qwen2_vl_simple.json b/integration-tests/models/__snapshots__/test_flash_qwen2_vl/test_flash_qwen2_vl_simple.json index 131631e65..6b6017c94 100644 --- a/integration-tests/models/__snapshots__/test_flash_qwen2_vl/test_flash_qwen2_vl_simple.json +++ b/integration-tests/models/__snapshots__/test_flash_qwen2_vl/test_flash_qwen2_vl_simple.json @@ -5,7 +5,7 @@ "index": 0, "logprobs": null, "message": { - "content": "The image depicts an anthropomorphic rabbit, wearing a futuristic spacesuit, in an extraterrestrial environment. The setting appears to be a red planet resembling Mars, with rugged terrain and rocky formations in the background. The moon is visible in the distant sky, adding to the lunar landscape.", + "content": "The image depicts an anthropomorphic rabbit, wearing a spacesuit, standing in a barren, rocky landscape that resembles the surface of another planet, possibly Mars. The rabbit has a red digestive system label on its chest, and the surrounding environment features red sandy terrain and a hazy, floating planet or moon in the background. The scene has a surreal, fantastical quality, blending elements of science fiction and space exploration with a whimsical character.", "name": null, "role": "assistant", "tool_calls": null @@ -13,14 +13,14 @@ "usage": null } ], - "created": 1730164250, + "created": 1738347908, "id": "", "model": "Qwen/Qwen2-VL-7B-Instruct", "object": "chat.completion", - "system_fingerprint": "2.4.2-dev0-native", + "system_fingerprint": "3.1.1-dev0-native", "usage": { - "completion_tokens": 58, - "prompt_tokens": 349, - "total_tokens": 407 + "completion_tokens": 89, + "prompt_tokens": 1364, + "total_tokens": 1453 } } diff --git a/integration-tests/models/__snapshots__/test_flash_qwen2_vl/test_flash_qwen2_vl_simple_streaming.json b/integration-tests/models/__snapshots__/test_flash_qwen2_vl/test_flash_qwen2_vl_simple_streaming.json index 3e2faca71..3dc8fc6d6 100644 --- a/integration-tests/models/__snapshots__/test_flash_qwen2_vl/test_flash_qwen2_vl_simple_streaming.json +++ b/integration-tests/models/__snapshots__/test_flash_qwen2_vl/test_flash_qwen2_vl_simple_streaming.json @@ -11,10 +11,10 @@ "logprobs": null } ], - "created": 1730416361, + "created": 1737646031, "id": "", "model": "Qwen/Qwen2-VL-7B-Instruct", "object": "chat.completion.chunk", - "system_fingerprint": "2.4.2-dev0-native", + "system_fingerprint": "3.0.2-dev0-native", "usage": null } diff --git a/integration-tests/models/test_compressed_tensors_w8a8_int_dynamic_weight.py b/integration-tests/models/test_compressed_tensors_w8a8_int_dynamic_weight.py index a0b0416b8..17e12c221 100644 --- a/integration-tests/models/test_compressed_tensors_w8a8_int_dynamic_weight.py +++ b/integration-tests/models/test_compressed_tensors_w8a8_int_dynamic_weight.py @@ -27,15 +27,16 @@ async def test_compressed_tensors_w8a8_int_dynamic_weight( ): 
response = await compressed_tensors_w8a8_int_dynamic_weight.generate( "What is deep learning?", - max_new_tokens=10, + # prefer a longer response than the default, allow the llm to end generation + max_new_tokens=1000, decoder_input_details=True, ) assert ( response.generated_text - == " Deep learning is a subset of machine learning that uses" + == " Deep learning is a subset of machine learning that uses neural networks to learn from data. It is a type of artificial intelligence that can learn from and make predictions on large amounts of data. Deep learning is used in a variety of applications, including image and speech recognition, natural language processing, and autonomous vehicles. It is a rapidly growing field with many potential applications in the future." ) - assert response.details.generated_tokens == 10 + assert response.details.generated_tokens == 76 assert response == response_snapshot @@ -64,7 +65,7 @@ async def test_compressed_tensors_w8a8_int_dynamic_weight_all_params( assert response.details.generated_tokens == 10 assert ( response.generated_text - == "What is deep learning?\n\nDeep Learning is an artificial intelligence (AI" + == "What is deep learning?\nDeep Learning (DL), or artificial neural networks" ) assert response == response_snapshot diff --git a/integration-tests/models/test_flash_qwen2_vl.py b/integration-tests/models/test_flash_qwen2_vl.py index 97a533fc5..5a12eba83 100644 --- a/integration-tests/models/test_flash_qwen2_vl.py +++ b/integration-tests/models/test_flash_qwen2_vl.py @@ -1,81 +1,122 @@ -# Disabled because it's broken. -# import pytest -# -# -# @pytest.fixture(scope="module") -# def flash_qwen2_vl_handle(launcher): -# with launcher("Qwen/Qwen2-VL-7B-Instruct") as handle: -# yield handle -# -# -# @pytest.fixture(scope="module") -# async def flash_qwen2(flash_qwen2_vl_handle): -# await flash_qwen2_vl_handle.health(300) -# return flash_qwen2_vl_handle.client -# -# -# @pytest.mark.private -# async def test_flash_qwen2_vl_simple(flash_qwen2, response_snapshot): -# response = await flash_qwen2.chat( -# max_tokens=100, -# seed=42, -# messages=[ -# { -# "role": "user", -# "content": [ -# { -# "type": "image_url", -# "image_url": { -# "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png" -# }, -# }, -# {"type": "text", "text": "Describe this image."}, -# ], -# }, -# ], -# ) -# -# assert ( -# response.choices[0].message.content -# == "The image depicts an anthropomorphic rabbit, wearing a futuristic spacesuit, in an extraterrestrial environment. The setting appears to be a red planet resembling Mars, with rugged terrain and rocky formations in the background. The moon is visible in the distant sky, adding to the lunar landscape." 
-# ) -# -# assert response == response_snapshot -# -# -# @pytest.mark.private -# async def test_flash_qwen2_vl_simple_streaming(flash_qwen2, response_snapshot): -# responses = await flash_qwen2.chat( -# max_tokens=100, -# seed=42, -# messages=[ -# { -# "role": "user", -# "content": [ -# { -# "type": "image_url", -# "image_url": { -# "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png" -# }, -# }, -# {"type": "text", "text": "Describe this image."}, -# ], -# }, -# ], -# stream=True, -# ) -# -# count = 0 -# generated = "" -# last_response = None -# async for response in responses: -# count += 1 -# generated += response.choices[0].delta.content -# last_response = response -# -# assert ( -# generated -# == "The image depicts an anthropomorphic rabbit, wearing a futuristic spacesuit, in an extraterrestrial environment. The setting appears to be a red planet resembling Mars, with rugged terrain and rocky formations in the background. The moon is visible in the distant sky, adding to the lunar landscape." -# ) -# assert count == 58 -# assert last_response == response_snapshot +import pytest + + +@pytest.fixture(scope="module") +def flash_qwen2_vl_handle(launcher): + with launcher("Qwen/Qwen2-VL-7B-Instruct") as handle: + yield handle + + +@pytest.fixture(scope="module") +async def flash_qwen2(flash_qwen2_vl_handle): + await flash_qwen2_vl_handle.health(300) + return flash_qwen2_vl_handle.client + + +@pytest.mark.private +async def test_flash_qwen2_vl_simple(flash_qwen2, response_snapshot): + response = await flash_qwen2.chat( + seed=42, + messages=[ + { + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png" + }, + }, + {"type": "text", "text": "Describe this image."}, + ], + }, + ], + ) + + assert ( + response.choices[0].message.content + == "The image depicts an anthropomorphic rabbit, wearing a spacesuit, standing in a barren, rocky landscape that resembles the surface of another planet, possibly Mars. The rabbit has a red digestive system label on its chest, and the surrounding environment features red sandy terrain and a hazy, floating planet or moon in the background. The scene has a surreal, fantastical quality, blending elements of science fiction and space exploration with a whimsical character." + ) + + assert response == response_snapshot + + +@pytest.mark.private +async def test_flash_qwen2_vl_simple_streaming(flash_qwen2, response_snapshot): + responses = await flash_qwen2.chat( + seed=42, + messages=[ + { + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png" + }, + }, + {"type": "text", "text": "Describe this image."}, + ], + }, + ], + stream=True, + ) + + count = 0 + generated = "" + last_response = None + async for response in responses: + count += 1 + generated += response.choices[0].delta.content + last_response = response + + assert ( + generated + == "The image depicts an anthropomorphic rabbit, wearing a spacesuit, standing in a barren, rocky landscape that resembles the surface of another planet, possibly Mars. The rabbit has a red digestive system label on its chest, and the surrounding environment features red sandy terrain and a hazy, floating planet or moon in the background. 
The scene has a surreal, fantastical quality, blending elements of science fiction and space exploration with a whimsical character." + ) + assert count == 89 + assert last_response == response_snapshot + + +@pytest.mark.private +async def test_flash_qwen2_vl_bay(flash_qwen2, response_snapshot): + response = await flash_qwen2.chat( + seed=42, + messages=[ + { + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg" + }, + }, + {"type": "text", "text": "Describe the image"}, + ], + }, + ], + ) + assert response == response_snapshot + + +@pytest.mark.private +async def test_flash_qwen2_vl_inpaint(flash_qwen2, response_snapshot): + response = await flash_qwen2.chat( + seed=42, + messages=[ + { + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/autopipeline-inpaint.png" + }, + }, + {"type": "text", "text": "Describe the image"}, + ], + }, + ], + ) + assert response == response_snapshot diff --git a/launcher/src/main.rs b/launcher/src/main.rs index 05ed02025..3c9ee8500 100644 --- a/launcher/src/main.rs +++ b/launcher/src/main.rs @@ -2049,7 +2049,16 @@ fn main() -> Result<(), LauncherError> { None => { let compute_type = compute_type(num_shard); let compute_optimal = compute_optimal(config.as_ref(), compute_type.as_ref()); - let default = compute_optimal.unwrap_or(4096); + // TODO: remove this when we correctly estimate the flops for VLMs + // this is a short-term temporary fix to enable VLMs to avoid rejecting images + let default_optimal = match config { + Some(ref config) => match config.model_type.as_deref() { + Some("qwen2_vl") => 10_000, + _ => 4096, + }, + None => 4096, + }; + let default = compute_optimal.unwrap_or(default_optimal); let vram_maximum = vram_maximum( config.as_ref(), compute_type.as_ref(), diff --git a/server/text_generation_server/layers/rotary.py b/server/text_generation_server/layers/rotary.py index e346d0f89..f38f68597 100644 --- a/server/text_generation_server/layers/rotary.py +++ b/server/text_generation_server/layers/rotary.py @@ -86,11 +86,18 @@ class PositionRotaryEmbedding(nn.Module): # `rope_type` is now standard in transformers, but some existing models # have `type` instead. 
rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None)) + mrope_section = rope_scaling.get("mrope_section", None) if rope_type == "linear": pass elif rope_type == "default": pass + elif rope_type == "mrope": + mrope_section = rope_scaling["mrope_section"] + if mrope_section is not None: + return RotaryPositionEmbeddingMultimodalSections( + inv_freq, scaling_factor, mrope_section + ) elif rope_type == "dynamic": scaling_factor = rope_scaling["factor"] return DynamicPositionRotaryEmbedding( @@ -548,3 +555,66 @@ def apply_llama3_scaling( new_freqs.append((1 - smooth) * freq / scaling_factor + smooth * freq) return torch.tensor(new_freqs, dtype=freqs.dtype, device=freqs.device) + + +class RotaryPositionEmbeddingMultimodalSections(PositionRotaryEmbedding): + def __init__(self, inv_freq: torch.Tensor, scaling_factor: float, sections: list): + super().__init__(inv_freq, scaling_factor) + self.sections = sections + self._cos_cached = None + self._sin_cached = None + self.section_indices = ( + torch.arange(len(self.sections)) + .repeat_interleave(torch.tensor(self.sections)) + .view(1, 1, -1) + .to(inv_freq.device) + ) + + def forward( + self, + query: torch.Tensor, + key: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, + ): + # rotate half the sequence length + rot = cos.shape[-1] // 2 + q2 = torch.cat([-query[..., rot:], query[..., :rot]], dim=-1) + k2 = torch.cat([-key[..., rot:], key[..., :rot]], dim=-1) + + # apply the rotation + rotary_emb.apply_rotary(query, q2, cos, sin, query, q2, True) + rotary_emb.apply_rotary(key, k2, cos, sin, key, k2, True) + + def _update_cos_sin_cache( + self, dtype: torch.dtype, device: torch.device, seqlen: int + ): + # always cache the cos/sin for the full sequence length to avoid + # recomputing if the sequence length is smaller than the cached one + if ( + seqlen > self._seq_len_cached + or self._cos_cached.device != device + or self._cos_cached.dtype != dtype + ): + self._seq_len_cached = seqlen + t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype) + freqs = torch.outer(t, self.inv_freq.to(device=t.device)) + self._cos_cached = torch.cos(freqs).to(dtype) + self._sin_cached = torch.sin(freqs).to(dtype) + self._sections = self.section_indices.expand(seqlen, -1, -1) + + def get_cos_sin( + self, + position_ids: torch.Tensor, + max_s: int, + dtype: torch.dtype, + ): + self._update_cos_sin_cache(dtype, position_ids.device, max_s) + slen = position_ids.shape[0] + + cos = self._cos_cached[position_ids].gather(1, self._sections[:slen]) + sin = self._sin_cached[position_ids].gather(1, self._sections[:slen]) + + cos = torch.cat([cos, cos], dim=-1) + sin = torch.cat([sin, sin], dim=-1) + return cos, sin diff --git a/server/text_generation_server/models/__init__.py b/server/text_generation_server/models/__init__.py index 205030e95..f8150b5e6 100644 --- a/server/text_generation_server/models/__init__.py +++ b/server/text_generation_server/models/__init__.py @@ -1363,6 +1363,7 @@ def get_model( quantize=quantize, speculator=speculator, dtype=dtype, + default_dtype=torch.bfloat16, kv_cache_dtype=kv_cache_dtype, trust_remote_code=trust_remote_code, lora_adapter_ids=lora_adapter_ids, diff --git a/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py b/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py index cc4039b1c..d6569a1db 100644 --- a/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py +++ 
b/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py @@ -61,11 +61,6 @@ class Qwen2Attention(torch.nn.Module): config.sliding_window if config.sliding_window is not None else -1 ) self.num_heads = config.num_attention_heads - self.mrope_section = ( - config.rope_scaling.get("mrope_section", None) - if config.rope_scaling is not None - else None - ) self.hidden_size = config.hidden_size self.head_size = self.hidden_size // self.num_heads @@ -127,17 +122,6 @@ class Qwen2Attention(torch.nn.Module): query = query.view(-1, self.num_heads, self.head_size) kv = kv.view(-1, 2, self.num_key_value_heads, self.head_size) - if self.mrope_section is not None: - # if mrope_section is set, we need to split the cos and sin into 3 parts and concatenate them in a specific order - cos = torch.cat( - [m[i % 3] for i, m in enumerate(cos.split(self.mrope_section, dim=-1))], - dim=-1, - ) - sin = torch.cat( - [m[i % 3] for i, m in enumerate(sin.split(self.mrope_section, dim=-1))], - dim=-1, - ) - self.rotary_emb(query, torch.select(kv, dim=1, index=0), cos, sin) if prefill_cache_indices is not None: @@ -251,7 +235,7 @@ class Qwen2Layer(nn.Module): max_s, prefill_cache_indices, ): - normed_hidden_states, res = self.input_layernorm(hidden_states, residual) + normed_hidden_states, residual = self.input_layernorm(hidden_states) # Self Attention attn_output = self.self_attn( @@ -266,15 +250,13 @@ class Qwen2Layer(nn.Module): max_s, prefill_cache_indices, ) + hidden_states = attn_output + residual # faster post attention rms norm - normed_attn_res_output, attn_res = self.post_attention_layernorm( - attn_output, res - ) - - mlp_output = self.mlp(normed_attn_res_output) - - return mlp_output, attn_res + hidden_states, residual = self.post_attention_layernorm(hidden_states) + mlp_output = self.mlp(hidden_states) + hidden_states = mlp_output + residual + return hidden_states class Qwen2Model(torch.nn.Module): @@ -322,18 +304,15 @@ class Qwen2Model(torch.nn.Module): ) -> torch.Tensor: hidden_states = inputs_embeds - # flatten position ids from 2D to 1D cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin( - position_ids.flatten(), true_max_s, hidden_states.dtype + position_ids, + true_max_s, + hidden_states.dtype, ) - # reshape back to 2D if the position_ids were 2D - if position_ids.size(0) != cos.size(0): - cos = cos.view(position_ids.size(0), position_ids.size(-1), -1).unsqueeze(2) - sin = sin.view(position_ids.size(0), position_ids.size(-1), -1).unsqueeze(2) residual = None for i, layer in enumerate(self.layers): - hidden_states, residual = layer( + hidden_states = layer( hidden_states, residual, cos, @@ -347,7 +326,7 @@ class Qwen2Model(torch.nn.Module): prefill_cache_indices, ) - hidden_states, _ = self.norm(hidden_states, residual) + hidden_states, _ = self.norm(hidden_states) return hidden_states diff --git a/server/text_generation_server/models/custom_modeling/qwen2_vl.py b/server/text_generation_server/models/custom_modeling/qwen2_vl.py index a8e1e8c15..2d017e382 100644 --- a/server/text_generation_server/models/custom_modeling/qwen2_vl.py +++ b/server/text_generation_server/models/custom_modeling/qwen2_vl.py @@ -222,12 +222,11 @@ class Qwen2VLVisionBlock(nn.Module): def forward( self, hidden_states, cu_seqlens, rotary_pos_emb, max_seqlen ) -> torch.Tensor: - hidden_states_post_norm1, res = self.norm1(hidden_states) - hidden_states = hidden_states + self.attn( - hidden_states_post_norm1, cu_seqlens, rotary_pos_emb, max_seqlen - ) - hidden_states_post_norm2, res = 
self.norm2(hidden_states) - hidden_states = hidden_states + self.mlp(hidden_states_post_norm2) + norm1_out, residual = self.norm1(hidden_states) + attn_out = self.attn(norm1_out, cu_seqlens, rotary_pos_emb, max_seqlen) + hidden_states = attn_out + residual + norm2_out, residual = self.norm2(hidden_states) + hidden_states = hidden_states + self.mlp(norm2_out) return hidden_states @@ -378,8 +377,12 @@ class Qwen2VLForConditionalGeneration(nn.Module): self.config = config config.vision_config.quantize = None config.vision_config.speculator = config.speculator + # set rope_scaling.type == "mrope" since AutoConfig.from_pretrained incorrectly + # returns rope_scaling.type == "default" for Qwen2-VL model at the moment + config.rope_scaling.update({"rope_type": "mrope"}) self.hidden_size = config.hidden_size self.vision_start_token_id = config.vision_start_token_id + self.vision_end_token_id = config.vision_end_token_id self.image_token_id = config.image_token_id self.video_token_id = config.video_token_id self.spatial_merge_size = config.vision_config.spatial_merge_size @@ -407,99 +410,89 @@ class Qwen2VLForConditionalGeneration(nn.Module): ) self.device = weights.device + # based on https://github.com/huggingface/transformers/blob/e284c7e954abe12c34b50461c17f8115a0afe115/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py#L1391 + # modified to first find segments then initialize position ids for each segment + # Steps: + # locate all vision and text segments + # calculate `vision_segment_lengths` for each vision segment to be use as offset + # calculate `text_segment_lengths` for each text segment to be used as offset + # create position ids for each vision segment based on the image grid + # create position ids for each text segment + # combine all the position ids + # the final segment is the difference between the last vision segment and the end of the input + # combine all the position ids and reshape to (3, input_ids_len) then swap dimensions to (input_ids_len, 3) def get_position_ids( self, - batch_input_ids: torch.Tensor, - image_grid_thw: Optional[torch.LongTensor] = None, - # video_grid_thw is not implemented yet as we do not accept video inputs at the moment - ) -> Tuple[torch.Tensor, torch.Tensor]: - if batch_input_ids.dim() == 1: - batch_input_ids = batch_input_ids.unsqueeze(0) - - position_ids = torch.ones( - 3, - batch_input_ids.shape[0], - batch_input_ids.shape[1], - dtype=batch_input_ids.dtype, - device=batch_input_ids.device, - ) - d = batch_input_ids.device - if image_grid_thw is not None: - image_index = 0 - llm_pos_ids_list = [] - - for i, input_ids in enumerate(batch_input_ids): - vision_start_indices = torch.argwhere( - input_ids == self.vision_start_token_id - ).squeeze(1) - vision_tokens = input_ids[vision_start_indices + 1] - # only copy the sum of the image tokens GPU<->CPU - image_count = (vision_tokens == self.image_token_id).sum().item() - - current_pos = 0 - for _ in range(image_count): - # copy the value position of the next image token from GPU<->CPU - next_image_pos = ( - (input_ids[current_pos:] == self.image_token_id) - .nonzero()[0] - .item() - ) - # TODO: revisit above to get all next_image_pos in one go to avoid copying in the loop - time_steps, height, width = image_grid_thw[image_index].clone() - height //= self.spatial_merge_size - width //= self.spatial_merge_size - - # calculate the length of the text and image tokens - text_length = next_image_pos - start_idx = ( - llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 - ) - - # text position ids 
- text_pos_ids = torch.arange(text_length, device=d) - text_pos_ids = text_pos_ids.view(1, -1).expand(3, -1) + start_idx - llm_pos_ids_list.append(text_pos_ids) - - # image position ids - t_indices = torch.arange(time_steps, device=d).repeat_interleave( - height * width - ) - h_indices = ( - torch.arange(height, device=d) - .repeat_interleave(width) - .repeat(time_steps) - ) - w_indices = torch.arange(width, device=d).repeat( - height * time_steps - ) - - image_pos_ids = ( - torch.stack([t_indices, h_indices, w_indices]) - + text_length - + start_idx - ) - llm_pos_ids_list.append(image_pos_ids) - - current_pos += next_image_pos + time_steps * height * width - image_index += 1 - - if current_pos < batch_input_ids.size(1): - st_idx = ( - llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0 - ) - text_len = batch_input_ids.size(1) - current_pos - llm_pos_ids_list.append( - torch.arange(text_len, device=d).view(1, -1).expand(3, -1) + st_idx - ) - - llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1) - position_ids[:, i, :] = llm_positions.to(position_ids.device) - else: - position_ids = ( - torch.arange(batch_input_ids.shape[1], device=batch_input_ids.device) - .view(1, 1, -1) - .repeat(3, batch_input_ids.shape[0], 1) + input_ids: torch.Tensor, + image_grid_thw: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + if image_grid_thw is None: + return ( + torch.arange(input_ids.shape[0], device=input_ids.device) + .unsqueeze(1) + .repeat(1, 3) ) + + spatial_merge_size = self.spatial_merge_size + vision_start_token_id = self.vision_start_token_id + vision_end_token_id = self.vision_end_token_id + device = input_ids.device + dtype = input_ids.dtype + input_ids_len = input_ids.shape[0] + + vision_starts = torch.where(input_ids == vision_start_token_id)[0] + vision_ends = torch.where(input_ids == vision_end_token_id)[0] + vision_segments = torch.stack((vision_starts, vision_ends), dim=1) + prev_vision_end = torch.cat( + [torch.zeros(1, device=vision_ends.device, dtype=dtype), vision_ends[:-1]] + ) + text_lengths_between_vision = vision_segments[:, 0] - prev_vision_end + 1 + vision_widths_max = torch.cat( + [ + torch.zeros(1, device=image_grid_thw.device, dtype=dtype), + image_grid_thw[:-1, 2] // spatial_merge_size, + ] + ) + vision_segment_lengths = vision_widths_max + text_lengths_between_vision + vision_segment_lengths = vision_segment_lengths.cumsum(dim=0) + text_segment_lengths = vision_segment_lengths - text_lengths_between_vision + + # create position ids for each vision segment based on the image grid + llm_pos_ids_list = [] + for i, _ in enumerate(vision_segments): + t, h, w = ( + image_grid_thw[i][0], + image_grid_thw[i][1] // spatial_merge_size, + image_grid_thw[i][2] // spatial_merge_size, + ) + t_indices = torch.arange(t, device=device).repeat_interleave(h * w) + h_indices = torch.arange(h, device=device).repeat_interleave(w).repeat(t) + w_indices = torch.arange(w, device=device).repeat(t * h) + image_position_ids = torch.stack([t_indices, h_indices, w_indices], dim=0) + + # offset by the position of the last vision segment + im = image_position_ids + vision_segment_lengths[i] + llm_pos_ids_list.append(im) + + # create position ids for each text segment + text_ranges = [ + torch.arange(seq_len, device=device).view(1, -1).expand(3, -1) + + text_segment_lengths[i] + for i, seq_len in enumerate(text_lengths_between_vision) + ] + + full_llm_pos_ids_list = [ + item for sublist in zip(text_ranges, llm_pos_ids_list) for item in sublist + ] + max_s = 
full_llm_pos_ids_list[-1].max() + 1 + final_text_len = input_ids_len - vision_ends[-1] + if final_text_len > 0: + m = torch.arange(final_text_len, device=device).view(1, -1).expand(3, -1) + full_llm_pos_ids_list.append(m + max_s) + + position_ids = ( + torch.cat(full_llm_pos_ids_list, dim=1).reshape(3, -1).transpose(0, 1) + ) return position_ids def forward( @@ -527,6 +520,7 @@ class Qwen2VLForConditionalGeneration(nn.Module): # apply the visual model to the pixel values if they are provided if pixel_values is not None and len(pixel_values) > 0: + pixel_values = pixel_values.to(inputs_embeds.dtype) if pixel_values is not None: image_embeds = self.visual( pixel_values, grid_thw=image_grid_thw @@ -545,7 +539,6 @@ class Qwen2VLForConditionalGeneration(nn.Module): true_max_s=max_s, prefill_cache_indices=prefill_cache_indices, ) - hidden_states, _ = self.norm(hidden_states) if lm_head_indices is not None: hidden_states = hidden_states[lm_head_indices] logits, speculative_logits = self.lm_head(hidden_states) diff --git a/server/text_generation_server/models/flash_causal_lm.py b/server/text_generation_server/models/flash_causal_lm.py index 1073f4f9c..f268e4995 100644 --- a/server/text_generation_server/models/flash_causal_lm.py +++ b/server/text_generation_server/models/flash_causal_lm.py @@ -1401,6 +1401,13 @@ class FlashCausalLM(Model): if max_bs is None: input_ids = torch.zeros(bs, dtype=torch.int64, device=self.device) position_ids = torch.zeros(bs, dtype=torch.int32, device=self.device) + config = getattr(self.model, "config", None) + rope_scaling = getattr(config, "rope_scaling", None) if config else None + if ( # mrope have position_ids per section, if so repeat n times + isinstance(rope_scaling, dict) and rope_scaling["rope_type"] == "mrope" + ): + n_sections = len(self.model.config.rope_scaling["mrope_section"]) + position_ids = position_ids.unsqueeze(1).repeat(1, n_sections) slots = torch.arange(bs, dtype=torch.int64, device=self.device) input_lengths_tensor = ( torch.ones(bs, dtype=torch.int32, device=self.device) * max_s @@ -1456,14 +1463,6 @@ class FlashCausalLM(Model): else: state = None - if ( - hasattr(self.model, "config") - and hasattr(self.model.config, "model_type") - and self.model.config.model_type == "qwen2_vl" - ): - if position_ids.dim() == 1: - position_ids = self.model.get_position_ids(input_ids) - graph = torch.cuda.CUDAGraph() self.cuda_graphs[bs] = { "input_ids": input_ids, @@ -2050,7 +2049,7 @@ class FlashCausalLM(Model): # instantly become of shape [BATCH_SIZE] if prefill and finished_prefilling: indices = batch.cu_seqlen_prefill[1:] - 1 - batch.position_ids = batch.position_ids[(..., indices)] + batch.position_ids = batch.position_ids[indices] batch.slot_indices = batch.slot_indices[indices] batch.adapter_meta.adapter_indices = batch.adapter_meta.adapter_indices[ indices From 0ef8c8a97aa919151c3cf55fa14b1c1a551aa592 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Thu, 6 Feb 2025 12:28:24 +0100 Subject: [PATCH 075/183] Using the "lockfile". (#2992) * Using the "lockfile". * Revert dummy modifications. * Lock on python 3.11 * Another attempt. * .. * Bad cache hits. * The good old monkey. * How in the world... * We need the launcher still. * . * .. * Attempt #42 * Don't break all other builds. * Mode max. * Applying to other builds. 
--- .github/workflows/build.yaml | 4 +- Dockerfile | 72 +++--- Dockerfile_amd | 11 +- Dockerfile_intel | 16 +- server/Makefile | 15 +- server/pyproject.toml | 4 + server/req.txt | 212 ++++++++++++++++ server/requirements_cuda.txt | 439 +++++++++++++++++++++++++++++----- server/requirements_gen.txt | 180 ++++++++++++++ server/requirements_intel.txt | 422 +++++++++++++++++++++++++++----- server/requirements_rocm.txt | 422 +++++++++++++++++++++++++++----- server/uv.lock | 28 +-- tgi-entrypoint.sh | 1 + 13 files changed, 1602 insertions(+), 224 deletions(-) create mode 100644 server/req.txt create mode 100644 server/requirements_gen.txt diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 1bd7163fe..720a13cb3 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -202,8 +202,8 @@ jobs: target: ${{ env.TARGET }} tags: ${{ steps.meta.outputs.tags || steps.meta-pr.outputs.tags }} labels: ${{ steps.meta.outputs.labels || steps.meta-pr.outputs.labels }} - cache-from: type=s3,region=us-east-1,bucket=ci-docker-buildx-cache,name=text-generation-inference-cache${{ env.LABEL }},mode=min,access_key_id=${{ secrets.S3_CI_DOCKER_BUILDX_CACHE_ACCESS_KEY_ID }},secret_access_key=${{ secrets.S3_CI_DOCKER_BUILDX_CACHE_SECRET_ACCESS_KEY }},mode=max - cache-to: type=s3,region=us-east-1,bucket=ci-docker-buildx-cache,name=text-generation-inference-cache${{ env.LABEL }},mode=min,access_key_id=${{ secrets.S3_CI_DOCKER_BUILDX_CACHE_ACCESS_KEY_ID }},secret_access_key=${{ secrets.S3_CI_DOCKER_BUILDX_CACHE_SECRET_ACCESS_KEY }},mode=max + cache-from: type=s3,region=us-east-1,bucket=ci-docker-buildx-cache,name=text-generation-inference-cache${{ env.LABEL }},mode=max,access_key_id=${{ secrets.S3_CI_DOCKER_BUILDX_CACHE_ACCESS_KEY_ID }},secret_access_key=${{ secrets.S3_CI_DOCKER_BUILDX_CACHE_SECRET_ACCESS_KEY }},mode=max + cache-to: type=s3,region=us-east-1,bucket=ci-docker-buildx-cache,name=text-generation-inference-cache${{ env.LABEL }},mode=min,access_key_id=${{ secrets.S3_CI_DOCKER_BUILDX_CACHE_ACCESS_KEY_ID }},secret_access_key=${{ secrets.S3_CI_DOCKER_BUILDX_CACHE_SECRET_ACCESS_KEY }},mode=min - name: Final id: final run: | diff --git a/Dockerfile b/Dockerfile index 720053330..a963db2f0 100644 --- a/Dockerfile +++ b/Dockerfile @@ -195,50 +195,56 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-ins git \ && rm -rf /var/lib/apt/lists/* -# Copy conda with PyTorch installed -COPY --from=pytorch-install /opt/conda /opt/conda - -# Copy build artifacts from flash attention builder -COPY --from=flash-att-builder /usr/src/flash-attention/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages -COPY --from=flash-att-builder /usr/src/flash-attention/csrc/layer_norm/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages -COPY --from=flash-att-builder /usr/src/flash-attention/csrc/rotary/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages - -# Copy build artifacts from flash attention v2 builder -COPY --from=flash-att-v2-builder /opt/conda/lib/python3.11/site-packages/flash_attn_2_cuda.cpython-311-x86_64-linux-gnu.so /opt/conda/lib/python3.11/site-packages - -# Copy build artifacts from custom kernels builder -COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages -# Copy build artifacts from exllama kernels builder -COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311 
/opt/conda/lib/python3.11/site-packages -# Copy build artifacts from exllamav2 kernels builder -COPY --from=exllamav2-kernels-builder /usr/src/exllamav2/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages -# Copy build artifacts from awq kernels builder -COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages -# Copy build artifacts from eetq kernels builder -COPY --from=eetq-kernels-builder /usr/src/eetq/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages -# Copy build artifacts from lorax punica kernels builder -COPY --from=lorax-punica-builder /usr/src/lorax-punica/server/punica_kernels/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages -# Copy build artifacts from mamba builder -COPY --from=mamba-builder /usr/src/mamba/build/lib.linux-x86_64-cpython-311/ /opt/conda/lib/python3.11/site-packages -COPY --from=mamba-builder /usr/src/causal-conv1d/build/lib.linux-x86_64-cpython-311/ /opt/conda/lib/python3.11/site-packages -COPY --from=flashinfer-builder /opt/conda/lib/python3.11/site-packages/flashinfer/ /opt/conda/lib/python3.11/site-packages/flashinfer/ - # Install flash-attention dependencies # RUN pip install einops --no-cache-dir +# Copy conda with PyTorch installed +COPY --from=pytorch-install /opt/conda /opt/conda + # Install server COPY proto proto COPY server server COPY server/Makefile server/Makefile ENV UV_SYSTEM_PYTHON=1 RUN cd server && \ - make gen-server && \ - python -c "from text_generation_server.pb import generate_pb2" && \ pip install -U pip uv && \ - uv pip install -e ".[attention, bnb, accelerate, compressed-tensors, marlin, moe, quantize, peft, outlines]" --no-cache-dir # && \ - # uv pip install nvidia-nccl-cu12==2.22.3 + uv sync --frozen --extra gen --extra attention --extra bnb --extra accelerate --extra compressed-tensors --extra marlin --extra moe --extra quantize --extra peft --extra outlines --no-install-project && \ + . ./.venv/bin/activate && \ + make gen-server-raw -ENV LD_PRELOAD=/opt/conda/lib/python3.11/site-packages/nvidia/nccl/lib/libnccl.so.2 +RUN cd server && \ + uv sync --frozen --extra gen --extra attention --extra bnb --extra accelerate --extra compressed-tensors --extra marlin --extra moe --extra quantize --extra peft --extra outlines && \ + . 
./.venv/bin/activate && \ + pwd && \ + text-generation-server --help + +# Copy build artifacts from flash attention builder +COPY --from=flash-att-builder /usr/src/flash-attention/build/lib.linux-x86_64-cpython-311 /usr/src/server/.venv/lib/python3.11/site-packages +COPY --from=flash-att-builder /usr/src/flash-attention/csrc/layer_norm/build/lib.linux-x86_64-cpython-311 /usr/src/server/.venv/lib/python3.11/site-packages +COPY --from=flash-att-builder /usr/src/flash-attention/csrc/rotary/build/lib.linux-x86_64-cpython-311 /usr/src/server/.venv/lib/python3.11/site-packages + +# Copy build artifacts from flash attention v2 builder +COPY --from=flash-att-v2-builder /opt/conda/lib/python3.11/site-packages/flash_attn_2_cuda.cpython-311-x86_64-linux-gnu.so /usr/src/server/.venv/lib/python3.11/site-packages + +# Copy build artifacts from custom kernels builder +COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311 /usr/src/server/.venv/lib/python3.11/site-packages +# Copy build artifacts from exllama kernels builder +COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311 /usr/src/server/.venv/lib/python3.11/site-packages +# Copy build artifacts from exllamav2 kernels builder +COPY --from=exllamav2-kernels-builder /usr/src/exllamav2/build/lib.linux-x86_64-cpython-311 /usr/src/server/.venv/lib/python3.11/site-packages +# Copy build artifacts from awq kernels builder +COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86_64-cpython-311 /usr/src/server/.venv/lib/python3.11/site-packages +# Copy build artifacts from eetq kernels builder +COPY --from=eetq-kernels-builder /usr/src/eetq/build/lib.linux-x86_64-cpython-311 /usr/src/server/.venv/lib/python3.11/site-packages +# Copy build artifacts from lorax punica kernels builder +COPY --from=lorax-punica-builder /usr/src/lorax-punica/server/punica_kernels/build/lib.linux-x86_64-cpython-311 /usr/src/server/.venv/lib/python3.11/site-packages +# Copy build artifacts from mamba builder +COPY --from=mamba-builder /usr/src/mamba/build/lib.linux-x86_64-cpython-311/ /usr/src/server/.venv/lib/python3.11/site-packages +COPY --from=mamba-builder /usr/src/causal-conv1d/build/lib.linux-x86_64-cpython-311/ /usr/src/server/.venv/lib/python3.11/site-packages +COPY --from=flashinfer-builder /opt/conda/lib/python3.11/site-packages/flashinfer/ /usr/src/server/.venv/lib/python3.11/site-packages/flashinfer/ + + +# ENV LD_PRELOAD=/opt/conda/lib/python3.11/site-packages/nvidia/nccl/lib/libnccl.so.2 # Required to find libpython within the rust binaries ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/conda/lib/" # This is needed because exl2 tries to load flash-attn diff --git a/Dockerfile_amd b/Dockerfile_amd index 205e46d93..f95f3b6d6 100644 --- a/Dockerfile_amd +++ b/Dockerfile_amd @@ -320,9 +320,16 @@ COPY server server COPY server/Makefile server/Makefile ENV UV_SYSTEM_PYTHON=1 RUN cd server && \ - make gen-server && \ pip install -U pip uv && \ - uv pip install -e ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir + uv sync --frozen --extra gen --extra accelerate --extra compressed-tensors --extra quantize --extra peft --extra outlines --no-install-project && \ + . ./.venv/bin/activate && \ + make gen-server-raw + +RUN cd server && \ + uv sync --frozen --extra gen --extra accelerate --extra compressed-tensors --extra quantize --extra peft --extra outlines && \ + . 
./.venv/bin/activate && \ + pwd && \ + text-generation-server --help # Install benchmarker COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark diff --git a/Dockerfile_intel b/Dockerfile_intel index 0f0d43835..3bd697fd7 100644 --- a/Dockerfile_intel +++ b/Dockerfile_intel @@ -215,9 +215,16 @@ COPY server server COPY server/Makefile server/Makefile ENV UV_SYSTEM_PYTHON=1 RUN cd server && \ - make gen-server && \ pip install -U pip uv && \ - uv pip install -e ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir + uv sync --frozen --extra gen --extra accelerate --extra compressed-tensors --extra quantize --extra peft --extra outlines --no-install-project && \ + . ./.venv/bin/activate && \ + make gen-server-raw + +RUN cd server && \ + uv sync --frozen --extra gen --extra accelerate --extra compressed-tensors --extra quantize --extra peft --extra outlines && \ + . ./.venv/bin/activate && \ + pwd && \ + text-generation-server --help # Install benchmarker COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark @@ -231,5 +238,8 @@ ENV ATTENTION=flashdecoding-ipex ENV PREFIX_CACHING=1 ENV PREFILL_CHUNKING=1 ENV CUDA_GRAPHS=0 -ENTRYPOINT ["text-generation-launcher"] +COPY ./tgi-entrypoint.sh /tgi-entrypoint.sh +RUN chmod +x /tgi-entrypoint.sh + +ENTRYPOINT ["/tgi-entrypoint.sh"] CMD ["--json-output"] diff --git a/server/Makefile b/server/Makefile index 252e355d8..746b7faa2 100644 --- a/server/Makefile +++ b/server/Makefile @@ -16,7 +16,14 @@ unit-tests: gen-server: # Compile protos pip install -U pip uv - uv pip install ".[gen]" + uv pip install -r requirements_gen.txt + mkdir text_generation_server/pb || true + python -m grpc_tools.protoc -I../proto/v3 --python_out=text_generation_server/pb \ + --grpc_python_out=text_generation_server/pb --mypy_out=text_generation_server/pb ../proto/v3/generate.proto + find text_generation_server/pb/ -type f -name "*.py" -print0 -exec sed -i -e 's/^\(import.*pb2\)/from . 
\1/g' {} \; + touch text_generation_server/pb/__init__.py + +gen-server-raw: mkdir text_generation_server/pb || true python -m grpc_tools.protoc -I../proto/v3 --python_out=text_generation_server/pb \ --grpc_python_out=text_generation_server/pb --mypy_out=text_generation_server/pb ../proto/v3/generate.proto @@ -35,3 +42,9 @@ install-cuda: install-server install-flash-attention-v2-cuda install-flash-atten uv pip install nvidia-nccl-cu12==2.22.3 install-rocm: install-server install-flash-attention-v2-rocm install-vllm-rocm + +export-requirements: + uv pip compile pyproject.toml --extra gen -o requirements_gen.txt --python-version 3.11 + uv pip compile pyproject.toml --extra attention --extra bnb --extra accelerate --extra compressed-tensors --extra marlin --extra moe --extra quantize --extra peft --extra outlines -o requirements_cuda.txt --python-version 3.11 + uv pip compile pyproject.toml --extra accelerate --extra compressed-tensors --extra quantize --extra peft --extra outlines -o requirements_intel.txt --python-version 3.11 + uv pip compile pyproject.toml --extra accelerate --extra compressed-tensors --extra quantize --extra peft --extra outlines -o requirements_rocm.txt --python-version 3.11 diff --git a/server/pyproject.toml b/server/pyproject.toml index da3ba820a..d64a143fd 100644 --- a/server/pyproject.toml +++ b/server/pyproject.toml @@ -30,6 +30,7 @@ dependencies = [ "sentencepiece>=0.2.0", "tokenizers>=0.20.3", "typer>=0.15.1", + "transformers>=4.48.0" ] [project.scripts] @@ -83,5 +84,8 @@ markers = ["private: marks tests as requiring an admin hf token (deselect with ' [tool.isort] profile = "black" +[tool.uv] +package = true + [tool.setuptools.packages.find] include = ["text_generation_server*"] diff --git a/server/req.txt b/server/req.txt new file mode 100644 index 000000000..f653999e3 --- /dev/null +++ b/server/req.txt @@ -0,0 +1,212 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile pyproject.toml --extra attention --extra bnb -o req.txt +attention-kernels @ https://github.com/danieldk/attention-kernels/releases/download/v0.2.0.post2/attention_kernels-0.2.0.post2+cu123torch2.5-cp39-abi3-linux_x86_64.whl + # via text-generation-server (pyproject.toml) +bitsandbytes==0.45.1 + # via text-generation-server (pyproject.toml) +certifi==2025.1.31 + # via requests +charset-normalizer==3.4.1 + # via requests +click==8.1.8 + # via typer +deprecated==1.2.18 + # via + # opentelemetry-api + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http + # opentelemetry-semantic-conventions +einops==0.8.0 + # via text-generation-server (pyproject.toml) +filelock==3.17.0 + # via + # huggingface-hub + # torch +fsspec==2025.2.0 + # via + # huggingface-hub + # torch +googleapis-common-protos==1.66.0 + # via + # grpcio-status + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http +grpc-interceptor==0.15.4 + # via text-generation-server (pyproject.toml) +grpcio==1.70.0 + # via + # text-generation-server (pyproject.toml) + # grpc-interceptor + # grpcio-reflection + # grpcio-status + # opentelemetry-exporter-otlp-proto-grpc +grpcio-reflection==1.70.0 + # via text-generation-server (pyproject.toml) +grpcio-status==1.70.0 + # via text-generation-server (pyproject.toml) +hf-transfer==0.1.9 + # via text-generation-server (pyproject.toml) +huggingface-hub==0.28.1 + # via tokenizers +idna==3.10 + # via requests +importlib-metadata==8.5.0 + # via opentelemetry-api +jinja2==3.1.5 + # via torch +loguru==0.7.3 + # via 
text-generation-server (pyproject.toml) +markdown-it-py==3.0.0 + # via rich +markupsafe==3.0.2 + # via jinja2 +mdurl==0.1.2 + # via markdown-it-py +mpmath==1.3.0 + # via sympy +networkx==3.4.2 + # via torch +numpy==2.2.2 + # via + # text-generation-server (pyproject.toml) + # bitsandbytes + # scipy +nvidia-cublas-cu12==12.4.5.8 + # via + # nvidia-cudnn-cu12 + # nvidia-cusolver-cu12 + # torch +nvidia-cuda-cupti-cu12==12.4.127 + # via torch +nvidia-cuda-nvrtc-cu12==12.4.127 + # via torch +nvidia-cuda-runtime-cu12==12.4.127 + # via torch +nvidia-cudnn-cu12==9.1.0.70 + # via torch +nvidia-cufft-cu12==11.2.1.3 + # via torch +nvidia-curand-cu12==10.3.5.147 + # via torch +nvidia-cusolver-cu12==11.6.1.9 + # via torch +nvidia-cusparse-cu12==12.3.1.170 + # via + # nvidia-cusolver-cu12 + # torch +nvidia-cusparselt-cu12==0.6.2 + # via torch +nvidia-nccl-cu12==2.21.5 + # via torch +nvidia-nvjitlink-cu12==12.4.127 + # via + # nvidia-cusolver-cu12 + # nvidia-cusparse-cu12 + # torch +nvidia-nvtx-cu12==12.4.127 + # via torch +opentelemetry-api==1.30.0 + # via + # text-generation-server (pyproject.toml) + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http + # opentelemetry-instrumentation + # opentelemetry-instrumentation-grpc + # opentelemetry-sdk + # opentelemetry-semantic-conventions +opentelemetry-exporter-otlp==1.30.0 + # via text-generation-server (pyproject.toml) +opentelemetry-exporter-otlp-proto-common==1.30.0 + # via + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http +opentelemetry-exporter-otlp-proto-grpc==1.30.0 + # via opentelemetry-exporter-otlp +opentelemetry-exporter-otlp-proto-http==1.30.0 + # via opentelemetry-exporter-otlp +opentelemetry-instrumentation==0.51b0 + # via opentelemetry-instrumentation-grpc +opentelemetry-instrumentation-grpc==0.51b0 + # via text-generation-server (pyproject.toml) +opentelemetry-proto==1.30.0 + # via + # opentelemetry-exporter-otlp-proto-common + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http +opentelemetry-sdk==1.30.0 + # via + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http +opentelemetry-semantic-conventions==0.51b0 + # via + # opentelemetry-instrumentation + # opentelemetry-instrumentation-grpc + # opentelemetry-sdk +packaging==24.2 + # via + # huggingface-hub + # opentelemetry-instrumentation +pillow==11.1.0 + # via text-generation-server (pyproject.toml) +prometheus-client==0.21.1 + # via text-generation-server (pyproject.toml) +protobuf==5.29.3 + # via + # text-generation-server (pyproject.toml) + # googleapis-common-protos + # grpcio-reflection + # grpcio-status + # opentelemetry-proto +py-cpuinfo==9.0.0 + # via text-generation-server (pyproject.toml) +pygments==2.19.1 + # via rich +pyyaml==6.0.2 + # via huggingface-hub +requests==2.32.3 + # via + # huggingface-hub + # opentelemetry-exporter-otlp-proto-http +rich==13.9.4 + # via + # text-generation-server (pyproject.toml) + # typer +safetensors==0.5.2 + # via text-generation-server (pyproject.toml) +scipy==1.15.1 + # via text-generation-server (pyproject.toml) +sentencepiece==0.2.0 + # via text-generation-server (pyproject.toml) +setuptools==75.8.0 + # via torch +shellingham==1.5.4 + # via typer +sympy==1.13.1 + # via torch +tokenizers==0.21.0 + # via text-generation-server (pyproject.toml) +torch==2.6.0 + # via + # attention-kernels + # bitsandbytes +tqdm==4.67.1 + # via huggingface-hub +triton==3.2.0 + # via torch +typer==0.15.1 + # via text-generation-server 
(pyproject.toml) +typing-extensions==4.12.2 + # via + # huggingface-hub + # opentelemetry-sdk + # torch + # typer +urllib3==2.3.0 + # via requests +wrapt==1.17.2 + # via + # deprecated + # opentelemetry-instrumentation + # opentelemetry-instrumentation-grpc +zipp==3.21.0 + # via importlib-metadata diff --git a/server/requirements_cuda.txt b/server/requirements_cuda.txt index ee75b2b5a..051045bc9 100644 --- a/server/requirements_cuda.txt +++ b/server/requirements_cuda.txt @@ -1,55 +1,384 @@ -certifi==2024.8.30 ; python_version >= "3.9" and python_version < "3.13" -charset-normalizer==3.4.0 ; python_version >= "3.9" and python_version < "3.13" -click==8.1.7 ; python_version >= "3.9" and python_version < "3.13" -colorama==0.4.6 ; python_version >= "3.9" and python_version < "3.13" and (sys_platform == "win32" or platform_system == "Windows") -deprecated==1.2.14 ; python_version >= "3.9" and python_version < "3.13" -einops==0.8.0 ; python_version >= "3.9" and python_version < "3.13" -filelock==3.16.1 ; python_version >= "3.9" and python_version < "3.13" -fsspec==2024.6.1 ; python_version >= "3.9" and python_version < "3.13" -googleapis-common-protos==1.65.0 ; python_version >= "3.9" and python_version < "3.13" -grpc-interceptor==0.15.4 ; python_version >= "3.9" and python_version < "3.13" -grpcio-reflection==1.62.3 ; python_version >= "3.9" and python_version < "3.13" -grpcio-status==1.62.3 ; python_version >= "3.9" and python_version < "3.13" -grpcio==1.68.0 ; python_version >= "3.9" and python_version < "3.13" -hf-transfer==0.1.8 ; python_version >= "3.9" and python_version < "3.13" -huggingface-hub==0.23.5 ; python_version >= "3.9" and python_version < "3.13" -idna==3.10 ; python_version >= "3.9" and python_version < "3.13" -importlib-metadata==7.1.0 ; python_version >= "3.9" and python_version < "3.13" -loguru==0.7.2 ; python_version >= "3.9" and python_version < "3.13" -markdown-it-py==3.0.0 ; python_version >= "3.9" and python_version < "3.13" -mdurl==0.1.2 ; python_version >= "3.9" and python_version < "3.13" -numpy==1.26.4 ; python_version >= "3.9" and python_version < "3.13" -opentelemetry-api==1.27.0 ; python_version >= "3.9" and python_version < "3.13" -opentelemetry-exporter-otlp-proto-common==1.27.0 ; python_version >= "3.9" and python_version < "3.13" -opentelemetry-exporter-otlp-proto-grpc==1.27.0 ; python_version >= "3.9" and python_version < "3.13" -opentelemetry-exporter-otlp-proto-http==1.27.0 ; python_version >= "3.9" and python_version < "3.13" -opentelemetry-exporter-otlp==1.27.0 ; python_version >= "3.9" and python_version < "3.13" -opentelemetry-instrumentation-grpc==0.48b0 ; python_version >= "3.9" and python_version < "3.13" -opentelemetry-instrumentation==0.48b0 ; python_version >= "3.9" and python_version < "3.13" -opentelemetry-proto==1.27.0 ; python_version >= "3.9" and python_version < "3.13" -opentelemetry-sdk==1.27.0 ; python_version >= "3.9" and python_version < "3.13" -opentelemetry-semantic-conventions==0.48b0 ; python_version >= "3.9" and python_version < "3.13" -packaging==24.1 ; python_version >= "3.9" and python_version < "3.13" -pillow==11.0.0 ; python_version >= "3.9" and python_version < "3.13" -prometheus-client==0.20.0 ; python_version >= "3.9" and python_version < "3.13" -protobuf==4.25.5 ; python_version >= "3.9" and python_version < "3.13" -py-cpuinfo==9.0.0 ; python_version >= "3.9" and python_version < "3.13" -pygments==2.18.0 ; python_version >= "3.9" and python_version < "3.13" -pyyaml==6.0.2 ; python_version >= "3.9" and python_version < 
"3.13" -regex==2024.9.11 ; python_version >= "3.9" and python_version < "3.13" -requests==2.32.3 ; python_version >= "3.9" and python_version < "3.13" -rich==13.9.4 ; python_version >= "3.9" and python_version < "3.13" -safetensors==0.4.5 ; python_version >= "3.9" and python_version < "3.13" -scipy==1.13.1 ; python_version >= "3.9" and python_version < "3.13" -sentencepiece==0.2.0 ; python_version >= "3.9" and python_version < "3.13" -setuptools==75.2.0 ; python_version >= "3.9" and python_version < "3.13" -shellingham==1.5.4 ; python_version >= "3.9" and python_version < "3.13" -tokenizers==0.20.3 ; python_version >= "3.9" and python_version < "3.13" -tqdm==4.66.5 ; python_version >= "3.9" and python_version < "3.13" -transformers==4.46.3 ; python_version >= "3.9" and python_version < "3.13" -typer==0.12.5 ; python_version >= "3.9" and python_version < "3.13" -typing-extensions==4.12.2 ; python_version >= "3.9" and python_version < "3.13" -urllib3==2.2.3 ; python_version >= "3.9" and python_version < "3.13" -win32-setctime==1.1.0 ; python_version >= "3.9" and python_version < "3.13" and sys_platform == "win32" -wrapt==1.16.0 ; python_version >= "3.9" and python_version < "3.13" -zipp==3.20.2 ; python_version >= "3.9" and python_version < "3.13" +# This file was autogenerated by uv via the following command: +# uv pip compile pyproject.toml --extra attention --extra bnb --extra accelerate --extra compressed-tensors --extra marlin --extra moe --extra quantize --extra peft --extra outlines -o requirements_cuda.txt --python-version 3.11 +accelerate==1.3.0 + # via + # text-generation-server (pyproject.toml) + # peft +aiohappyeyeballs==2.4.4 + # via aiohttp +aiohttp==3.11.11 + # via + # datasets + # fsspec +aiosignal==1.3.2 + # via aiohttp +airportsdata==20241001 + # via outlines +annotated-types==0.7.0 + # via pydantic +attention-kernels @ https://github.com/danieldk/attention-kernels/releases/download/v0.2.0.post2/attention_kernels-0.2.0.post2+cu123torch2.5-cp39-abi3-linux_x86_64.whl + # via text-generation-server (pyproject.toml) +attrs==25.1.0 + # via + # aiohttp + # jsonschema + # referencing +bitsandbytes==0.45.1 + # via text-generation-server (pyproject.toml) +certifi==2024.8.30 + # via requests +charset-normalizer==3.4.0 + # via requests +click==8.1.7 + # via typer +cloudpickle==3.1.1 + # via outlines +compressed-tensors==0.9.1 + # via text-generation-server (pyproject.toml) +datasets==2.21.0 + # via text-generation-server (pyproject.toml) +deprecated==1.2.14 + # via + # opentelemetry-api + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http + # opentelemetry-semantic-conventions +dill==0.3.8 + # via + # datasets + # multiprocess +diskcache==5.6.3 + # via outlines +einops==0.8.0 + # via text-generation-server (pyproject.toml) +filelock==3.16.1 + # via + # datasets + # huggingface-hub + # torch + # transformers +frozenlist==1.5.0 + # via + # aiohttp + # aiosignal +fsspec==2024.6.1 + # via + # datasets + # huggingface-hub + # torch +genson==1.3.0 + # via outlines +googleapis-common-protos==1.65.0 + # via + # grpcio-status + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http +grpc-interceptor==0.15.4 + # via text-generation-server (pyproject.toml) +grpcio==1.68.0 + # via + # text-generation-server (pyproject.toml) + # grpc-interceptor + # grpcio-reflection + # grpcio-status + # opentelemetry-exporter-otlp-proto-grpc +grpcio-reflection==1.68.0 + # via text-generation-server (pyproject.toml) +grpcio-status==1.68.0 + # via 
text-generation-server (pyproject.toml) +hf-transfer==0.1.8 + # via text-generation-server (pyproject.toml) +huggingface-hub==0.28.1 + # via + # accelerate + # datasets + # peft + # tokenizers + # transformers +idna==3.10 + # via + # requests + # yarl +importlib-metadata==7.1.0 + # via opentelemetry-api +interegular==0.3.3 + # via + # outlines + # outlines-core +jinja2==3.1.5 + # via + # outlines + # torch +jsonschema==4.23.0 + # via + # outlines + # outlines-core +jsonschema-specifications==2024.10.1 + # via jsonschema +lark==1.2.2 + # via outlines +loguru==0.7.3 + # via text-generation-server (pyproject.toml) +markdown-it-py==3.0.0 + # via rich +markupsafe==3.0.2 + # via jinja2 +marlin-kernels @ https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp311-cp311-linux_x86_64.whl + # via text-generation-server (pyproject.toml) +mdurl==0.1.2 + # via markdown-it-py +moe-kernels @ https://github.com/danieldk/moe-kernels/releases/download/v0.8.2/moe_kernels-0.8.2+cu123torch2.5-cp39-abi3-linux_x86_64.whl + # via text-generation-server (pyproject.toml) +mpmath==1.3.0 + # via sympy +multidict==6.1.0 + # via + # aiohttp + # yarl +multiprocess==0.70.16 + # via datasets +nest-asyncio==1.6.0 + # via outlines +networkx==3.4.2 + # via torch +numpy==1.26.4 + # via + # text-generation-server (pyproject.toml) + # accelerate + # bitsandbytes + # datasets + # outlines + # pandas + # peft + # scipy + # transformers +nvidia-cublas-cu12==12.4.5.8 + # via + # nvidia-cudnn-cu12 + # nvidia-cusolver-cu12 + # torch +nvidia-cuda-cupti-cu12==12.4.127 + # via torch +nvidia-cuda-nvrtc-cu12==12.4.127 + # via torch +nvidia-cuda-runtime-cu12==12.4.127 + # via torch +nvidia-cudnn-cu12==9.1.0.70 + # via torch +nvidia-cufft-cu12==11.2.1.3 + # via torch +nvidia-curand-cu12==10.3.5.147 + # via torch +nvidia-cusolver-cu12==11.6.1.9 + # via torch +nvidia-cusparse-cu12==12.3.1.170 + # via + # nvidia-cusolver-cu12 + # torch +nvidia-cusparselt-cu12==0.6.2 + # via torch +nvidia-ml-py==12.570.86 + # via moe-kernels +nvidia-nccl-cu12==2.21.5 + # via torch +nvidia-nvjitlink-cu12==12.4.127 + # via + # nvidia-cusolver-cu12 + # nvidia-cusparse-cu12 + # torch +nvidia-nvtx-cu12==12.4.127 + # via torch +opentelemetry-api==1.30.0 + # via + # text-generation-server (pyproject.toml) + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http + # opentelemetry-instrumentation + # opentelemetry-instrumentation-grpc + # opentelemetry-sdk + # opentelemetry-semantic-conventions +opentelemetry-exporter-otlp==1.30.0 + # via text-generation-server (pyproject.toml) +opentelemetry-exporter-otlp-proto-common==1.30.0 + # via + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http +opentelemetry-exporter-otlp-proto-grpc==1.30.0 + # via opentelemetry-exporter-otlp +opentelemetry-exporter-otlp-proto-http==1.30.0 + # via opentelemetry-exporter-otlp +opentelemetry-instrumentation==0.51b0 + # via opentelemetry-instrumentation-grpc +opentelemetry-instrumentation-grpc==0.51b0 + # via text-generation-server (pyproject.toml) +opentelemetry-proto==1.30.0 + # via + # opentelemetry-exporter-otlp-proto-common + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http +opentelemetry-sdk==1.30.0 + # via + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http +opentelemetry-semantic-conventions==0.51b0 + # via + # opentelemetry-instrumentation + # opentelemetry-instrumentation-grpc + # opentelemetry-sdk 
+outlines==0.1.14 + # via text-generation-server (pyproject.toml) +outlines-core==0.1.26 + # via outlines +packaging==24.1 + # via + # accelerate + # datasets + # huggingface-hub + # opentelemetry-instrumentation + # peft + # transformers +pandas==2.2.3 + # via datasets +peft==0.14.0 + # via text-generation-server (pyproject.toml) +pillow==11.1.0 + # via text-generation-server (pyproject.toml) +prometheus-client==0.21.1 + # via text-generation-server (pyproject.toml) +propcache==0.2.1 + # via + # aiohttp + # yarl +protobuf==5.29.3 + # via + # text-generation-server (pyproject.toml) + # googleapis-common-protos + # grpcio-reflection + # grpcio-status + # opentelemetry-proto +psutil==6.1.1 + # via + # accelerate + # peft +py-cpuinfo==9.0.0 + # via text-generation-server (pyproject.toml) +pyarrow==19.0.0 + # via datasets +pycountry==24.6.1 + # via outlines +pydantic==2.10.6 + # via + # compressed-tensors + # outlines +pydantic-core==2.27.2 + # via pydantic +pygments==2.18.0 + # via rich +python-dateutil==2.9.0.post0 + # via pandas +pytz==2025.1 + # via pandas +pyyaml==6.0.2 + # via + # accelerate + # datasets + # huggingface-hub + # peft + # transformers +referencing==0.36.2 + # via + # jsonschema + # jsonschema-specifications + # outlines +regex==2024.9.11 + # via transformers +requests==2.32.3 + # via + # datasets + # huggingface-hub + # opentelemetry-exporter-otlp-proto-http + # outlines + # transformers +rich==13.9.4 + # via + # text-generation-server (pyproject.toml) + # typer +rpds-py==0.22.3 + # via + # jsonschema + # referencing +safetensors==0.4.5 + # via + # text-generation-server (pyproject.toml) + # accelerate + # peft + # transformers +scipy==1.13.1 + # via text-generation-server (pyproject.toml) +sentencepiece==0.2.0 + # via text-generation-server (pyproject.toml) +shellingham==1.5.4 + # via typer +six==1.17.0 + # via python-dateutil +sympy==1.13.1 + # via torch +texttable==1.7.0 + # via text-generation-server (pyproject.toml) +tokenizers==0.21.0 + # via + # text-generation-server (pyproject.toml) + # transformers +torch==2.6.0 + # via + # accelerate + # attention-kernels + # bitsandbytes + # compressed-tensors + # marlin-kernels + # moe-kernels + # outlines + # peft +tqdm==4.66.5 + # via + # datasets + # huggingface-hub + # outlines + # peft + # transformers +transformers==4.48.2 + # via + # text-generation-server (pyproject.toml) + # compressed-tensors + # peft +triton==3.2.0 + # via + # moe-kernels + # torch +typer==0.15.1 + # via text-generation-server (pyproject.toml) +typing-extensions==4.12.2 + # via + # huggingface-hub + # opentelemetry-sdk + # outlines + # pydantic + # pydantic-core + # referencing + # torch + # typer +tzdata==2025.1 + # via pandas +urllib3==2.2.3 + # via requests +wrapt==1.16.0 + # via + # deprecated + # opentelemetry-instrumentation + # opentelemetry-instrumentation-grpc +xxhash==3.5.0 + # via datasets +yarl==1.18.3 + # via aiohttp +zipp==3.20.2 + # via importlib-metadata diff --git a/server/requirements_gen.txt b/server/requirements_gen.txt new file mode 100644 index 000000000..d9836ad71 --- /dev/null +++ b/server/requirements_gen.txt @@ -0,0 +1,180 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile pyproject.toml --extra gen -o requirements_gen.txt --python-version 3.11 +certifi==2025.1.31 + # via requests +charset-normalizer==3.4.1 + # via requests +click==8.1.8 + # via typer +deprecated==1.2.18 + # via + # opentelemetry-api + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http + # 
opentelemetry-semantic-conventions +einops==0.8.0 + # via text-generation-server (pyproject.toml) +filelock==3.17.0 + # via + # huggingface-hub + # transformers +fsspec==2025.2.0 + # via huggingface-hub +googleapis-common-protos==1.66.0 + # via + # grpcio-status + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http +grpc-interceptor==0.15.4 + # via text-generation-server (pyproject.toml) +grpcio==1.70.0 + # via + # text-generation-server (pyproject.toml) + # grpc-interceptor + # grpcio-reflection + # grpcio-status + # grpcio-tools + # opentelemetry-exporter-otlp-proto-grpc +grpcio-reflection==1.70.0 + # via text-generation-server (pyproject.toml) +grpcio-status==1.70.0 + # via text-generation-server (pyproject.toml) +grpcio-tools==1.70.0 + # via text-generation-server (pyproject.toml) +hf-transfer==0.1.9 + # via text-generation-server (pyproject.toml) +huggingface-hub==0.28.1 + # via + # tokenizers + # transformers +idna==3.10 + # via requests +importlib-metadata==8.5.0 + # via opentelemetry-api +loguru==0.7.3 + # via text-generation-server (pyproject.toml) +markdown-it-py==3.0.0 + # via rich +mdurl==0.1.2 + # via markdown-it-py +mypy-protobuf==3.6.0 + # via text-generation-server (pyproject.toml) +numpy==2.2.2 + # via + # text-generation-server (pyproject.toml) + # scipy + # transformers +opentelemetry-api==1.30.0 + # via + # text-generation-server (pyproject.toml) + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http + # opentelemetry-instrumentation + # opentelemetry-instrumentation-grpc + # opentelemetry-sdk + # opentelemetry-semantic-conventions +opentelemetry-exporter-otlp==1.30.0 + # via text-generation-server (pyproject.toml) +opentelemetry-exporter-otlp-proto-common==1.30.0 + # via + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http +opentelemetry-exporter-otlp-proto-grpc==1.30.0 + # via opentelemetry-exporter-otlp +opentelemetry-exporter-otlp-proto-http==1.30.0 + # via opentelemetry-exporter-otlp +opentelemetry-instrumentation==0.51b0 + # via opentelemetry-instrumentation-grpc +opentelemetry-instrumentation-grpc==0.51b0 + # via text-generation-server (pyproject.toml) +opentelemetry-proto==1.30.0 + # via + # opentelemetry-exporter-otlp-proto-common + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http +opentelemetry-sdk==1.30.0 + # via + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http +opentelemetry-semantic-conventions==0.51b0 + # via + # opentelemetry-instrumentation + # opentelemetry-instrumentation-grpc + # opentelemetry-sdk +packaging==24.2 + # via + # huggingface-hub + # opentelemetry-instrumentation + # transformers +pillow==11.1.0 + # via text-generation-server (pyproject.toml) +prometheus-client==0.21.1 + # via text-generation-server (pyproject.toml) +protobuf==5.29.3 + # via + # text-generation-server (pyproject.toml) + # googleapis-common-protos + # grpcio-reflection + # grpcio-status + # grpcio-tools + # mypy-protobuf + # opentelemetry-proto +py-cpuinfo==9.0.0 + # via text-generation-server (pyproject.toml) +pygments==2.19.1 + # via rich +pyyaml==6.0.2 + # via + # huggingface-hub + # transformers +regex==2024.11.6 + # via transformers +requests==2.32.3 + # via + # huggingface-hub + # opentelemetry-exporter-otlp-proto-http + # transformers +rich==13.9.4 + # via + # text-generation-server (pyproject.toml) + # typer +safetensors==0.5.2 + # via + # text-generation-server (pyproject.toml) + # transformers 
+scipy==1.15.1 + # via text-generation-server (pyproject.toml) +sentencepiece==0.2.0 + # via text-generation-server (pyproject.toml) +setuptools==75.8.0 + # via grpcio-tools +shellingham==1.5.4 + # via typer +tokenizers==0.21.0 + # via + # text-generation-server (pyproject.toml) + # transformers +tqdm==4.67.1 + # via + # huggingface-hub + # transformers +transformers==4.48.2 + # via text-generation-server (pyproject.toml) +typer==0.15.1 + # via text-generation-server (pyproject.toml) +types-protobuf==5.29.1.20241207 + # via mypy-protobuf +typing-extensions==4.12.2 + # via + # huggingface-hub + # opentelemetry-sdk + # typer +urllib3==2.3.0 + # via requests +wrapt==1.17.2 + # via + # deprecated + # opentelemetry-instrumentation + # opentelemetry-instrumentation-grpc +zipp==3.21.0 + # via importlib-metadata diff --git a/server/requirements_intel.txt b/server/requirements_intel.txt index ee75b2b5a..778d892ec 100644 --- a/server/requirements_intel.txt +++ b/server/requirements_intel.txt @@ -1,55 +1,367 @@ -certifi==2024.8.30 ; python_version >= "3.9" and python_version < "3.13" -charset-normalizer==3.4.0 ; python_version >= "3.9" and python_version < "3.13" -click==8.1.7 ; python_version >= "3.9" and python_version < "3.13" -colorama==0.4.6 ; python_version >= "3.9" and python_version < "3.13" and (sys_platform == "win32" or platform_system == "Windows") -deprecated==1.2.14 ; python_version >= "3.9" and python_version < "3.13" -einops==0.8.0 ; python_version >= "3.9" and python_version < "3.13" -filelock==3.16.1 ; python_version >= "3.9" and python_version < "3.13" -fsspec==2024.6.1 ; python_version >= "3.9" and python_version < "3.13" -googleapis-common-protos==1.65.0 ; python_version >= "3.9" and python_version < "3.13" -grpc-interceptor==0.15.4 ; python_version >= "3.9" and python_version < "3.13" -grpcio-reflection==1.62.3 ; python_version >= "3.9" and python_version < "3.13" -grpcio-status==1.62.3 ; python_version >= "3.9" and python_version < "3.13" -grpcio==1.68.0 ; python_version >= "3.9" and python_version < "3.13" -hf-transfer==0.1.8 ; python_version >= "3.9" and python_version < "3.13" -huggingface-hub==0.23.5 ; python_version >= "3.9" and python_version < "3.13" -idna==3.10 ; python_version >= "3.9" and python_version < "3.13" -importlib-metadata==7.1.0 ; python_version >= "3.9" and python_version < "3.13" -loguru==0.7.2 ; python_version >= "3.9" and python_version < "3.13" -markdown-it-py==3.0.0 ; python_version >= "3.9" and python_version < "3.13" -mdurl==0.1.2 ; python_version >= "3.9" and python_version < "3.13" -numpy==1.26.4 ; python_version >= "3.9" and python_version < "3.13" -opentelemetry-api==1.27.0 ; python_version >= "3.9" and python_version < "3.13" -opentelemetry-exporter-otlp-proto-common==1.27.0 ; python_version >= "3.9" and python_version < "3.13" -opentelemetry-exporter-otlp-proto-grpc==1.27.0 ; python_version >= "3.9" and python_version < "3.13" -opentelemetry-exporter-otlp-proto-http==1.27.0 ; python_version >= "3.9" and python_version < "3.13" -opentelemetry-exporter-otlp==1.27.0 ; python_version >= "3.9" and python_version < "3.13" -opentelemetry-instrumentation-grpc==0.48b0 ; python_version >= "3.9" and python_version < "3.13" -opentelemetry-instrumentation==0.48b0 ; python_version >= "3.9" and python_version < "3.13" -opentelemetry-proto==1.27.0 ; python_version >= "3.9" and python_version < "3.13" -opentelemetry-sdk==1.27.0 ; python_version >= "3.9" and python_version < "3.13" -opentelemetry-semantic-conventions==0.48b0 ; python_version >= "3.9" and 
python_version < "3.13" -packaging==24.1 ; python_version >= "3.9" and python_version < "3.13" -pillow==11.0.0 ; python_version >= "3.9" and python_version < "3.13" -prometheus-client==0.20.0 ; python_version >= "3.9" and python_version < "3.13" -protobuf==4.25.5 ; python_version >= "3.9" and python_version < "3.13" -py-cpuinfo==9.0.0 ; python_version >= "3.9" and python_version < "3.13" -pygments==2.18.0 ; python_version >= "3.9" and python_version < "3.13" -pyyaml==6.0.2 ; python_version >= "3.9" and python_version < "3.13" -regex==2024.9.11 ; python_version >= "3.9" and python_version < "3.13" -requests==2.32.3 ; python_version >= "3.9" and python_version < "3.13" -rich==13.9.4 ; python_version >= "3.9" and python_version < "3.13" -safetensors==0.4.5 ; python_version >= "3.9" and python_version < "3.13" -scipy==1.13.1 ; python_version >= "3.9" and python_version < "3.13" -sentencepiece==0.2.0 ; python_version >= "3.9" and python_version < "3.13" -setuptools==75.2.0 ; python_version >= "3.9" and python_version < "3.13" -shellingham==1.5.4 ; python_version >= "3.9" and python_version < "3.13" -tokenizers==0.20.3 ; python_version >= "3.9" and python_version < "3.13" -tqdm==4.66.5 ; python_version >= "3.9" and python_version < "3.13" -transformers==4.46.3 ; python_version >= "3.9" and python_version < "3.13" -typer==0.12.5 ; python_version >= "3.9" and python_version < "3.13" -typing-extensions==4.12.2 ; python_version >= "3.9" and python_version < "3.13" -urllib3==2.2.3 ; python_version >= "3.9" and python_version < "3.13" -win32-setctime==1.1.0 ; python_version >= "3.9" and python_version < "3.13" and sys_platform == "win32" -wrapt==1.16.0 ; python_version >= "3.9" and python_version < "3.13" -zipp==3.20.2 ; python_version >= "3.9" and python_version < "3.13" +# This file was autogenerated by uv via the following command: +# uv pip compile pyproject.toml --extra accelerate --extra compressed-tensors --extra quantize --extra peft --extra outlines -o requirements_intel.txt --python-version 3.11 +accelerate==1.3.0 + # via + # text-generation-server (pyproject.toml) + # peft +aiohappyeyeballs==2.4.4 + # via aiohttp +aiohttp==3.11.11 + # via + # datasets + # fsspec +aiosignal==1.3.2 + # via aiohttp +airportsdata==20241001 + # via outlines +annotated-types==0.7.0 + # via pydantic +attrs==25.1.0 + # via + # aiohttp + # jsonschema + # referencing +certifi==2024.8.30 + # via requests +charset-normalizer==3.4.0 + # via requests +click==8.1.7 + # via typer +cloudpickle==3.1.1 + # via outlines +compressed-tensors==0.9.1 + # via text-generation-server (pyproject.toml) +datasets==2.21.0 + # via text-generation-server (pyproject.toml) +deprecated==1.2.14 + # via + # opentelemetry-api + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http + # opentelemetry-semantic-conventions +dill==0.3.8 + # via + # datasets + # multiprocess +diskcache==5.6.3 + # via outlines +einops==0.8.0 + # via text-generation-server (pyproject.toml) +filelock==3.16.1 + # via + # datasets + # huggingface-hub + # torch + # transformers +frozenlist==1.5.0 + # via + # aiohttp + # aiosignal +fsspec==2024.6.1 + # via + # datasets + # huggingface-hub + # torch +genson==1.3.0 + # via outlines +googleapis-common-protos==1.65.0 + # via + # grpcio-status + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http +grpc-interceptor==0.15.4 + # via text-generation-server (pyproject.toml) +grpcio==1.68.0 + # via + # text-generation-server (pyproject.toml) + # grpc-interceptor + # 
grpcio-reflection + # grpcio-status + # opentelemetry-exporter-otlp-proto-grpc +grpcio-reflection==1.68.0 + # via text-generation-server (pyproject.toml) +grpcio-status==1.68.0 + # via text-generation-server (pyproject.toml) +hf-transfer==0.1.8 + # via text-generation-server (pyproject.toml) +huggingface-hub==0.28.1 + # via + # accelerate + # datasets + # peft + # tokenizers + # transformers +idna==3.10 + # via + # requests + # yarl +importlib-metadata==7.1.0 + # via opentelemetry-api +interegular==0.3.3 + # via + # outlines + # outlines-core +jinja2==3.1.5 + # via + # outlines + # torch +jsonschema==4.23.0 + # via + # outlines + # outlines-core +jsonschema-specifications==2024.10.1 + # via jsonschema +lark==1.2.2 + # via outlines +loguru==0.7.3 + # via text-generation-server (pyproject.toml) +markdown-it-py==3.0.0 + # via rich +markupsafe==3.0.2 + # via jinja2 +mdurl==0.1.2 + # via markdown-it-py +mpmath==1.3.0 + # via sympy +multidict==6.1.0 + # via + # aiohttp + # yarl +multiprocess==0.70.16 + # via datasets +nest-asyncio==1.6.0 + # via outlines +networkx==3.4.2 + # via torch +numpy==1.26.4 + # via + # text-generation-server (pyproject.toml) + # accelerate + # datasets + # outlines + # pandas + # peft + # scipy + # transformers +nvidia-cublas-cu12==12.4.5.8 + # via + # nvidia-cudnn-cu12 + # nvidia-cusolver-cu12 + # torch +nvidia-cuda-cupti-cu12==12.4.127 + # via torch +nvidia-cuda-nvrtc-cu12==12.4.127 + # via torch +nvidia-cuda-runtime-cu12==12.4.127 + # via torch +nvidia-cudnn-cu12==9.1.0.70 + # via torch +nvidia-cufft-cu12==11.2.1.3 + # via torch +nvidia-curand-cu12==10.3.5.147 + # via torch +nvidia-cusolver-cu12==11.6.1.9 + # via torch +nvidia-cusparse-cu12==12.3.1.170 + # via + # nvidia-cusolver-cu12 + # torch +nvidia-cusparselt-cu12==0.6.2 + # via torch +nvidia-nccl-cu12==2.21.5 + # via torch +nvidia-nvjitlink-cu12==12.4.127 + # via + # nvidia-cusolver-cu12 + # nvidia-cusparse-cu12 + # torch +nvidia-nvtx-cu12==12.4.127 + # via torch +opentelemetry-api==1.30.0 + # via + # text-generation-server (pyproject.toml) + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http + # opentelemetry-instrumentation + # opentelemetry-instrumentation-grpc + # opentelemetry-sdk + # opentelemetry-semantic-conventions +opentelemetry-exporter-otlp==1.30.0 + # via text-generation-server (pyproject.toml) +opentelemetry-exporter-otlp-proto-common==1.30.0 + # via + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http +opentelemetry-exporter-otlp-proto-grpc==1.30.0 + # via opentelemetry-exporter-otlp +opentelemetry-exporter-otlp-proto-http==1.30.0 + # via opentelemetry-exporter-otlp +opentelemetry-instrumentation==0.51b0 + # via opentelemetry-instrumentation-grpc +opentelemetry-instrumentation-grpc==0.51b0 + # via text-generation-server (pyproject.toml) +opentelemetry-proto==1.30.0 + # via + # opentelemetry-exporter-otlp-proto-common + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http +opentelemetry-sdk==1.30.0 + # via + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http +opentelemetry-semantic-conventions==0.51b0 + # via + # opentelemetry-instrumentation + # opentelemetry-instrumentation-grpc + # opentelemetry-sdk +outlines==0.1.14 + # via text-generation-server (pyproject.toml) +outlines-core==0.1.26 + # via outlines +packaging==24.1 + # via + # accelerate + # datasets + # huggingface-hub + # opentelemetry-instrumentation + # peft + # transformers +pandas==2.2.3 + # via datasets 
+peft==0.14.0 + # via text-generation-server (pyproject.toml) +pillow==11.1.0 + # via text-generation-server (pyproject.toml) +prometheus-client==0.21.1 + # via text-generation-server (pyproject.toml) +propcache==0.2.1 + # via + # aiohttp + # yarl +protobuf==5.29.3 + # via + # text-generation-server (pyproject.toml) + # googleapis-common-protos + # grpcio-reflection + # grpcio-status + # opentelemetry-proto +psutil==6.1.1 + # via + # accelerate + # peft +py-cpuinfo==9.0.0 + # via text-generation-server (pyproject.toml) +pyarrow==19.0.0 + # via datasets +pycountry==24.6.1 + # via outlines +pydantic==2.10.6 + # via + # compressed-tensors + # outlines +pydantic-core==2.27.2 + # via pydantic +pygments==2.18.0 + # via rich +python-dateutil==2.9.0.post0 + # via pandas +pytz==2025.1 + # via pandas +pyyaml==6.0.2 + # via + # accelerate + # datasets + # huggingface-hub + # peft + # transformers +referencing==0.36.2 + # via + # jsonschema + # jsonschema-specifications + # outlines +regex==2024.9.11 + # via transformers +requests==2.32.3 + # via + # datasets + # huggingface-hub + # opentelemetry-exporter-otlp-proto-http + # outlines + # transformers +rich==13.9.4 + # via + # text-generation-server (pyproject.toml) + # typer +rpds-py==0.22.3 + # via + # jsonschema + # referencing +safetensors==0.4.5 + # via + # text-generation-server (pyproject.toml) + # accelerate + # peft + # transformers +scipy==1.13.1 + # via text-generation-server (pyproject.toml) +sentencepiece==0.2.0 + # via text-generation-server (pyproject.toml) +shellingham==1.5.4 + # via typer +six==1.17.0 + # via python-dateutil +sympy==1.13.1 + # via torch +texttable==1.7.0 + # via text-generation-server (pyproject.toml) +tokenizers==0.21.0 + # via + # text-generation-server (pyproject.toml) + # transformers +torch==2.6.0 + # via + # accelerate + # compressed-tensors + # outlines + # peft +tqdm==4.66.5 + # via + # datasets + # huggingface-hub + # outlines + # peft + # transformers +transformers==4.48.2 + # via + # text-generation-server (pyproject.toml) + # compressed-tensors + # peft +triton==3.2.0 + # via torch +typer==0.15.1 + # via text-generation-server (pyproject.toml) +typing-extensions==4.12.2 + # via + # huggingface-hub + # opentelemetry-sdk + # outlines + # pydantic + # pydantic-core + # referencing + # torch + # typer +tzdata==2025.1 + # via pandas +urllib3==2.2.3 + # via requests +wrapt==1.16.0 + # via + # deprecated + # opentelemetry-instrumentation + # opentelemetry-instrumentation-grpc +xxhash==3.5.0 + # via datasets +yarl==1.18.3 + # via aiohttp +zipp==3.20.2 + # via importlib-metadata diff --git a/server/requirements_rocm.txt b/server/requirements_rocm.txt index ee75b2b5a..65eb998b7 100644 --- a/server/requirements_rocm.txt +++ b/server/requirements_rocm.txt @@ -1,55 +1,367 @@ -certifi==2024.8.30 ; python_version >= "3.9" and python_version < "3.13" -charset-normalizer==3.4.0 ; python_version >= "3.9" and python_version < "3.13" -click==8.1.7 ; python_version >= "3.9" and python_version < "3.13" -colorama==0.4.6 ; python_version >= "3.9" and python_version < "3.13" and (sys_platform == "win32" or platform_system == "Windows") -deprecated==1.2.14 ; python_version >= "3.9" and python_version < "3.13" -einops==0.8.0 ; python_version >= "3.9" and python_version < "3.13" -filelock==3.16.1 ; python_version >= "3.9" and python_version < "3.13" -fsspec==2024.6.1 ; python_version >= "3.9" and python_version < "3.13" -googleapis-common-protos==1.65.0 ; python_version >= "3.9" and python_version < "3.13" -grpc-interceptor==0.15.4 ; 
python_version >= "3.9" and python_version < "3.13" -grpcio-reflection==1.62.3 ; python_version >= "3.9" and python_version < "3.13" -grpcio-status==1.62.3 ; python_version >= "3.9" and python_version < "3.13" -grpcio==1.68.0 ; python_version >= "3.9" and python_version < "3.13" -hf-transfer==0.1.8 ; python_version >= "3.9" and python_version < "3.13" -huggingface-hub==0.23.5 ; python_version >= "3.9" and python_version < "3.13" -idna==3.10 ; python_version >= "3.9" and python_version < "3.13" -importlib-metadata==7.1.0 ; python_version >= "3.9" and python_version < "3.13" -loguru==0.7.2 ; python_version >= "3.9" and python_version < "3.13" -markdown-it-py==3.0.0 ; python_version >= "3.9" and python_version < "3.13" -mdurl==0.1.2 ; python_version >= "3.9" and python_version < "3.13" -numpy==1.26.4 ; python_version >= "3.9" and python_version < "3.13" -opentelemetry-api==1.27.0 ; python_version >= "3.9" and python_version < "3.13" -opentelemetry-exporter-otlp-proto-common==1.27.0 ; python_version >= "3.9" and python_version < "3.13" -opentelemetry-exporter-otlp-proto-grpc==1.27.0 ; python_version >= "3.9" and python_version < "3.13" -opentelemetry-exporter-otlp-proto-http==1.27.0 ; python_version >= "3.9" and python_version < "3.13" -opentelemetry-exporter-otlp==1.27.0 ; python_version >= "3.9" and python_version < "3.13" -opentelemetry-instrumentation-grpc==0.48b0 ; python_version >= "3.9" and python_version < "3.13" -opentelemetry-instrumentation==0.48b0 ; python_version >= "3.9" and python_version < "3.13" -opentelemetry-proto==1.27.0 ; python_version >= "3.9" and python_version < "3.13" -opentelemetry-sdk==1.27.0 ; python_version >= "3.9" and python_version < "3.13" -opentelemetry-semantic-conventions==0.48b0 ; python_version >= "3.9" and python_version < "3.13" -packaging==24.1 ; python_version >= "3.9" and python_version < "3.13" -pillow==11.0.0 ; python_version >= "3.9" and python_version < "3.13" -prometheus-client==0.20.0 ; python_version >= "3.9" and python_version < "3.13" -protobuf==4.25.5 ; python_version >= "3.9" and python_version < "3.13" -py-cpuinfo==9.0.0 ; python_version >= "3.9" and python_version < "3.13" -pygments==2.18.0 ; python_version >= "3.9" and python_version < "3.13" -pyyaml==6.0.2 ; python_version >= "3.9" and python_version < "3.13" -regex==2024.9.11 ; python_version >= "3.9" and python_version < "3.13" -requests==2.32.3 ; python_version >= "3.9" and python_version < "3.13" -rich==13.9.4 ; python_version >= "3.9" and python_version < "3.13" -safetensors==0.4.5 ; python_version >= "3.9" and python_version < "3.13" -scipy==1.13.1 ; python_version >= "3.9" and python_version < "3.13" -sentencepiece==0.2.0 ; python_version >= "3.9" and python_version < "3.13" -setuptools==75.2.0 ; python_version >= "3.9" and python_version < "3.13" -shellingham==1.5.4 ; python_version >= "3.9" and python_version < "3.13" -tokenizers==0.20.3 ; python_version >= "3.9" and python_version < "3.13" -tqdm==4.66.5 ; python_version >= "3.9" and python_version < "3.13" -transformers==4.46.3 ; python_version >= "3.9" and python_version < "3.13" -typer==0.12.5 ; python_version >= "3.9" and python_version < "3.13" -typing-extensions==4.12.2 ; python_version >= "3.9" and python_version < "3.13" -urllib3==2.2.3 ; python_version >= "3.9" and python_version < "3.13" -win32-setctime==1.1.0 ; python_version >= "3.9" and python_version < "3.13" and sys_platform == "win32" -wrapt==1.16.0 ; python_version >= "3.9" and python_version < "3.13" -zipp==3.20.2 ; python_version >= "3.9" and python_version 
< "3.13" +# This file was autogenerated by uv via the following command: +# uv pip compile pyproject.toml --extra accelerate --extra compressed-tensors --extra quantize --extra peft --extra outlines -o requirements_rocm.txt --python-version 3.11 +accelerate==1.3.0 + # via + # text-generation-server (pyproject.toml) + # peft +aiohappyeyeballs==2.4.4 + # via aiohttp +aiohttp==3.11.11 + # via + # datasets + # fsspec +aiosignal==1.3.2 + # via aiohttp +airportsdata==20241001 + # via outlines +annotated-types==0.7.0 + # via pydantic +attrs==25.1.0 + # via + # aiohttp + # jsonschema + # referencing +certifi==2024.8.30 + # via requests +charset-normalizer==3.4.0 + # via requests +click==8.1.7 + # via typer +cloudpickle==3.1.1 + # via outlines +compressed-tensors==0.9.1 + # via text-generation-server (pyproject.toml) +datasets==2.21.0 + # via text-generation-server (pyproject.toml) +deprecated==1.2.14 + # via + # opentelemetry-api + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http + # opentelemetry-semantic-conventions +dill==0.3.8 + # via + # datasets + # multiprocess +diskcache==5.6.3 + # via outlines +einops==0.8.0 + # via text-generation-server (pyproject.toml) +filelock==3.16.1 + # via + # datasets + # huggingface-hub + # torch + # transformers +frozenlist==1.5.0 + # via + # aiohttp + # aiosignal +fsspec==2024.6.1 + # via + # datasets + # huggingface-hub + # torch +genson==1.3.0 + # via outlines +googleapis-common-protos==1.65.0 + # via + # grpcio-status + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http +grpc-interceptor==0.15.4 + # via text-generation-server (pyproject.toml) +grpcio==1.68.0 + # via + # text-generation-server (pyproject.toml) + # grpc-interceptor + # grpcio-reflection + # grpcio-status + # opentelemetry-exporter-otlp-proto-grpc +grpcio-reflection==1.68.0 + # via text-generation-server (pyproject.toml) +grpcio-status==1.68.0 + # via text-generation-server (pyproject.toml) +hf-transfer==0.1.8 + # via text-generation-server (pyproject.toml) +huggingface-hub==0.28.1 + # via + # accelerate + # datasets + # peft + # tokenizers + # transformers +idna==3.10 + # via + # requests + # yarl +importlib-metadata==7.1.0 + # via opentelemetry-api +interegular==0.3.3 + # via + # outlines + # outlines-core +jinja2==3.1.5 + # via + # outlines + # torch +jsonschema==4.23.0 + # via + # outlines + # outlines-core +jsonschema-specifications==2024.10.1 + # via jsonschema +lark==1.2.2 + # via outlines +loguru==0.7.3 + # via text-generation-server (pyproject.toml) +markdown-it-py==3.0.0 + # via rich +markupsafe==3.0.2 + # via jinja2 +mdurl==0.1.2 + # via markdown-it-py +mpmath==1.3.0 + # via sympy +multidict==6.1.0 + # via + # aiohttp + # yarl +multiprocess==0.70.16 + # via datasets +nest-asyncio==1.6.0 + # via outlines +networkx==3.4.2 + # via torch +numpy==1.26.4 + # via + # text-generation-server (pyproject.toml) + # accelerate + # datasets + # outlines + # pandas + # peft + # scipy + # transformers +nvidia-cublas-cu12==12.4.5.8 + # via + # nvidia-cudnn-cu12 + # nvidia-cusolver-cu12 + # torch +nvidia-cuda-cupti-cu12==12.4.127 + # via torch +nvidia-cuda-nvrtc-cu12==12.4.127 + # via torch +nvidia-cuda-runtime-cu12==12.4.127 + # via torch +nvidia-cudnn-cu12==9.1.0.70 + # via torch +nvidia-cufft-cu12==11.2.1.3 + # via torch +nvidia-curand-cu12==10.3.5.147 + # via torch +nvidia-cusolver-cu12==11.6.1.9 + # via torch +nvidia-cusparse-cu12==12.3.1.170 + # via + # nvidia-cusolver-cu12 + # torch +nvidia-cusparselt-cu12==0.6.2 + # via torch 
+nvidia-nccl-cu12==2.21.5 + # via torch +nvidia-nvjitlink-cu12==12.4.127 + # via + # nvidia-cusolver-cu12 + # nvidia-cusparse-cu12 + # torch +nvidia-nvtx-cu12==12.4.127 + # via torch +opentelemetry-api==1.30.0 + # via + # text-generation-server (pyproject.toml) + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http + # opentelemetry-instrumentation + # opentelemetry-instrumentation-grpc + # opentelemetry-sdk + # opentelemetry-semantic-conventions +opentelemetry-exporter-otlp==1.30.0 + # via text-generation-server (pyproject.toml) +opentelemetry-exporter-otlp-proto-common==1.30.0 + # via + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http +opentelemetry-exporter-otlp-proto-grpc==1.30.0 + # via opentelemetry-exporter-otlp +opentelemetry-exporter-otlp-proto-http==1.30.0 + # via opentelemetry-exporter-otlp +opentelemetry-instrumentation==0.51b0 + # via opentelemetry-instrumentation-grpc +opentelemetry-instrumentation-grpc==0.51b0 + # via text-generation-server (pyproject.toml) +opentelemetry-proto==1.30.0 + # via + # opentelemetry-exporter-otlp-proto-common + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http +opentelemetry-sdk==1.30.0 + # via + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http +opentelemetry-semantic-conventions==0.51b0 + # via + # opentelemetry-instrumentation + # opentelemetry-instrumentation-grpc + # opentelemetry-sdk +outlines==0.1.14 + # via text-generation-server (pyproject.toml) +outlines-core==0.1.26 + # via outlines +packaging==24.1 + # via + # accelerate + # datasets + # huggingface-hub + # opentelemetry-instrumentation + # peft + # transformers +pandas==2.2.3 + # via datasets +peft==0.14.0 + # via text-generation-server (pyproject.toml) +pillow==11.1.0 + # via text-generation-server (pyproject.toml) +prometheus-client==0.21.1 + # via text-generation-server (pyproject.toml) +propcache==0.2.1 + # via + # aiohttp + # yarl +protobuf==5.29.3 + # via + # text-generation-server (pyproject.toml) + # googleapis-common-protos + # grpcio-reflection + # grpcio-status + # opentelemetry-proto +psutil==6.1.1 + # via + # accelerate + # peft +py-cpuinfo==9.0.0 + # via text-generation-server (pyproject.toml) +pyarrow==19.0.0 + # via datasets +pycountry==24.6.1 + # via outlines +pydantic==2.10.6 + # via + # compressed-tensors + # outlines +pydantic-core==2.27.2 + # via pydantic +pygments==2.18.0 + # via rich +python-dateutil==2.9.0.post0 + # via pandas +pytz==2025.1 + # via pandas +pyyaml==6.0.2 + # via + # accelerate + # datasets + # huggingface-hub + # peft + # transformers +referencing==0.36.2 + # via + # jsonschema + # jsonschema-specifications + # outlines +regex==2024.9.11 + # via transformers +requests==2.32.3 + # via + # datasets + # huggingface-hub + # opentelemetry-exporter-otlp-proto-http + # outlines + # transformers +rich==13.9.4 + # via + # text-generation-server (pyproject.toml) + # typer +rpds-py==0.22.3 + # via + # jsonschema + # referencing +safetensors==0.4.5 + # via + # text-generation-server (pyproject.toml) + # accelerate + # peft + # transformers +scipy==1.13.1 + # via text-generation-server (pyproject.toml) +sentencepiece==0.2.0 + # via text-generation-server (pyproject.toml) +shellingham==1.5.4 + # via typer +six==1.17.0 + # via python-dateutil +sympy==1.13.1 + # via torch +texttable==1.7.0 + # via text-generation-server (pyproject.toml) +tokenizers==0.21.0 + # via + # text-generation-server (pyproject.toml) + # transformers 
+torch==2.6.0 + # via + # accelerate + # compressed-tensors + # outlines + # peft +tqdm==4.66.5 + # via + # datasets + # huggingface-hub + # outlines + # peft + # transformers +transformers==4.48.2 + # via + # text-generation-server (pyproject.toml) + # compressed-tensors + # peft +triton==3.2.0 + # via torch +typer==0.15.1 + # via text-generation-server (pyproject.toml) +typing-extensions==4.12.2 + # via + # huggingface-hub + # opentelemetry-sdk + # outlines + # pydantic + # pydantic-core + # referencing + # torch + # typer +tzdata==2025.1 + # via pandas +urllib3==2.2.3 + # via requests +wrapt==1.16.0 + # via + # deprecated + # opentelemetry-instrumentation + # opentelemetry-instrumentation-grpc +xxhash==3.5.0 + # via datasets +yarl==1.18.3 + # via aiohttp +zipp==3.20.2 + # via importlib-metadata diff --git a/server/uv.lock b/server/uv.lock index 5684d581e..5410a58cb 100644 --- a/server/uv.lock +++ b/server/uv.lock @@ -997,15 +997,15 @@ wheels = [ [[package]] name = "moe-kernels" -version = "0.8.0" -source = { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.8.0/moe_kernels-0.8.0+cu123torch2.5-cp39-abi3-linux_x86_64.whl" } +version = "0.8.2" +source = { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.8.2/moe_kernels-0.8.2+cu123torch2.5-cp39-abi3-linux_x86_64.whl" } dependencies = [ { name = "nvidia-ml-py" }, { name = "torch" }, { name = "triton" }, ] wheels = [ - { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.8.0/moe_kernels-0.8.0+cu123torch2.5-cp39-abi3-linux_x86_64.whl", hash = "sha256:92c4e083c037a325458e731dda6770790495cab273c9bbf5f50fb8e262c099de" }, + { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.8.2/moe_kernels-0.8.2+cu123torch2.5-cp39-abi3-linux_x86_64.whl", hash = "sha256:1ed5b26f52339d25ea2513e99e8b6239cf1921af3eac54e03a46bb8f8efb380b" }, ] [package.metadata] @@ -1308,7 +1308,6 @@ name = "nvidia-cublas-cu12" version = "12.4.5.8" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/7f/7f/7fbae15a3982dc9595e49ce0f19332423b260045d0a6afe93cdbe2f1f624/nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_aarch64.whl", hash = "sha256:0f8aa1706812e00b9f19dfe0cdb3999b092ccb8ca168c0db5b8ea712456fd9b3", size = 363333771 }, { url = "https://files.pythonhosted.org/packages/ae/71/1c91302526c45ab494c23f61c7a84aa568b8c1f9d196efa5993957faf906/nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl", hash = "sha256:2fc8da60df463fdefa81e323eef2e36489e1c94335b5358bcb38360adf75ac9b", size = 363438805 }, ] @@ -1317,7 +1316,6 @@ name = "nvidia-cuda-cupti-cu12" version = "12.4.127" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/93/b5/9fb3d00386d3361b03874246190dfec7b206fd74e6e287b26a8fcb359d95/nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:79279b35cf6f91da114182a5ce1864997fd52294a87a16179ce275773799458a", size = 12354556 }, { url = "https://files.pythonhosted.org/packages/67/42/f4f60238e8194a3106d06a058d494b18e006c10bb2b915655bd9f6ea4cb1/nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:9dec60f5ac126f7bb551c055072b69d85392b13311fcc1bcda2202d172df30fb", size = 13813957 }, ] @@ -1326,7 +1324,6 @@ name = "nvidia-cuda-nvrtc-cu12" version = "12.4.127" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = 
"https://files.pythonhosted.org/packages/77/aa/083b01c427e963ad0b314040565ea396f914349914c298556484f799e61b/nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:0eedf14185e04b76aa05b1fea04133e59f465b6f960c0cbf4e37c3cb6b0ea198", size = 24133372 }, { url = "https://files.pythonhosted.org/packages/2c/14/91ae57cd4db3f9ef7aa99f4019cfa8d54cb4caa7e00975df6467e9725a9f/nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:a178759ebb095827bd30ef56598ec182b85547f1508941a3d560eb7ea1fbf338", size = 24640306 }, ] @@ -1335,7 +1332,6 @@ name = "nvidia-cuda-runtime-cu12" version = "12.4.127" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/a1/aa/b656d755f474e2084971e9a297def515938d56b466ab39624012070cb773/nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:961fe0e2e716a2a1d967aab7caee97512f71767f852f67432d572e36cb3a11f3", size = 894177 }, { url = "https://files.pythonhosted.org/packages/ea/27/1795d86fe88ef397885f2e580ac37628ed058a92ed2c39dc8eac3adf0619/nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:64403288fa2136ee8e467cdc9c9427e0434110899d07c779f25b5c068934faa5", size = 883737 }, ] @@ -1358,7 +1354,6 @@ dependencies = [ { name = "nvidia-nvjitlink-cu12" }, ] wheels = [ - { url = "https://files.pythonhosted.org/packages/7a/8a/0e728f749baca3fbeffad762738276e5df60851958be7783af121a7221e7/nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_aarch64.whl", hash = "sha256:5dad8008fc7f92f5ddfa2101430917ce2ffacd86824914c82e28990ad7f00399", size = 211422548 }, { url = "https://files.pythonhosted.org/packages/27/94/3266821f65b92b3138631e9c8e7fe1fb513804ac934485a8d05776e1dd43/nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl", hash = "sha256:f083fc24912aa410be21fa16d157fed2055dab1cc4b6934a0e03cba69eb242b9", size = 211459117 }, ] @@ -1367,7 +1362,6 @@ name = "nvidia-curand-cu12" version = "10.3.5.147" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/80/9c/a79180e4d70995fdf030c6946991d0171555c6edf95c265c6b2bf7011112/nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_aarch64.whl", hash = "sha256:1f173f09e3e3c76ab084aba0de819c49e56614feae5c12f69883f4ae9bb5fad9", size = 56314811 }, { url = "https://files.pythonhosted.org/packages/8a/6d/44ad094874c6f1b9c654f8ed939590bdc408349f137f9b98a3a23ccec411/nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl", hash = "sha256:a88f583d4e0bb643c49743469964103aa59f7f708d862c3ddb0fc07f851e3b8b", size = 56305206 }, ] @@ -1381,7 +1375,6 @@ dependencies = [ { name = "nvidia-nvjitlink-cu12" }, ] wheels = [ - { url = "https://files.pythonhosted.org/packages/46/6b/a5c33cf16af09166845345275c34ad2190944bcc6026797a39f8e0a282e0/nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_aarch64.whl", hash = "sha256:d338f155f174f90724bbde3758b7ac375a70ce8e706d70b018dd3375545fc84e", size = 127634111 }, { url = "https://files.pythonhosted.org/packages/3a/e1/5b9089a4b2a4790dfdea8b3a006052cfecff58139d5a4e34cb1a51df8d6f/nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl", hash = "sha256:19e33fa442bcfd085b3086c4ebf7e8debc07cfe01e11513cc6d332fd918ac260", size = 127936057 }, ] @@ -1393,7 +1386,6 @@ dependencies = [ { name = "nvidia-nvjitlink-cu12" }, ] wheels = [ - { url = 
"https://files.pythonhosted.org/packages/96/a9/c0d2f83a53d40a4a41be14cea6a0bf9e668ffcf8b004bd65633f433050c0/nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_aarch64.whl", hash = "sha256:9d32f62896231ebe0480efd8a7f702e143c98cfaa0e8a76df3386c1ba2b54df3", size = 207381987 }, { url = "https://files.pythonhosted.org/packages/db/f7/97a9ea26ed4bbbfc2d470994b8b4f338ef663be97b8f677519ac195e113d/nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl", hash = "sha256:ea4f11a2904e2a8dc4b1833cc1b5181cde564edd0d5cd33e3c168eff2d1863f1", size = 207454763 }, ] @@ -1419,7 +1411,6 @@ name = "nvidia-nvjitlink-cu12" version = "12.4.127" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/02/45/239d52c05074898a80a900f49b1615d81c07fceadd5ad6c4f86a987c0bc4/nvidia_nvjitlink_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:4abe7fef64914ccfa909bc2ba39739670ecc9e820c83ccc7a6ed414122599b83", size = 20552510 }, { url = "https://files.pythonhosted.org/packages/ff/ff/847841bacfbefc97a00036e0fce5a0f086b640756dc38caea5e1bb002655/nvidia_nvjitlink_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:06b3b9b25bf3f8af351d664978ca26a16d2c5127dbd53c0497e28d1fb9611d57", size = 21066810 }, ] @@ -1428,7 +1419,6 @@ name = "nvidia-nvtx-cu12" version = "12.4.127" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/06/39/471f581edbb7804b39e8063d92fc8305bdc7a80ae5c07dbe6ea5c50d14a5/nvidia_nvtx_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:7959ad635db13edf4fc65c06a6e9f9e55fc2f92596db928d169c0bb031e88ef3", size = 100417 }, { url = "https://files.pythonhosted.org/packages/87/20/199b8713428322a2f22b722c62b8cc278cc53dffa9705d744484b5035ee9/nvidia_nvtx_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:781e950d9b9f60d8241ccea575b32f5105a5baf4c2351cab5256a24869f12a1a", size = 99144 }, ] @@ -2656,7 +2646,7 @@ wheels = [ [[package]] name = "text-generation-server" version = "2.0.5.dev0" -source = { virtual = "." } +source = { editable = "." 
} dependencies = [ { name = "einops" }, { name = "grpc-interceptor" }, @@ -2680,6 +2670,7 @@ dependencies = [ { name = "scipy", version = "1.15.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, { name = "sentencepiece" }, { name = "tokenizers" }, + { name = "transformers" }, { name = "typer" }, ] @@ -2746,7 +2737,7 @@ requires-dist = [ { name = "marlin-kernels", marker = "python_full_version == '3.10.*' and extra == 'marlin'", url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp310-cp310-linux_x86_64.whl" }, { name = "marlin-kernels", marker = "python_full_version == '3.11.*' and extra == 'marlin'", url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp311-cp311-linux_x86_64.whl" }, { name = "marlin-kernels", marker = "python_full_version == '3.12.*' and extra == 'marlin'", url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp312-cp312-linux_x86_64.whl" }, - { name = "moe-kernels", marker = "extra == 'moe'", url = "https://github.com/danieldk/moe-kernels/releases/download/v0.8.0/moe_kernels-0.8.0+cu123torch2.5-cp39-abi3-linux_x86_64.whl" }, + { name = "moe-kernels", marker = "extra == 'moe'", url = "https://github.com/danieldk/moe-kernels/releases/download/v0.8.2/moe_kernels-0.8.2+cu123torch2.5-cp39-abi3-linux_x86_64.whl" }, { name = "mypy-protobuf", marker = "extra == 'gen'", specifier = ">=3.6.0" }, { name = "numpy", specifier = ">=1.26,<3" }, { name = "opentelemetry-api", specifier = ">=1.27.0" }, @@ -2765,6 +2756,7 @@ requires-dist = [ { name = "sentencepiece", specifier = ">=0.2.0" }, { name = "texttable", marker = "extra == 'quantize'", specifier = ">=1.6.7,<2" }, { name = "tokenizers", specifier = ">=0.20.3" }, + { name = "transformers", specifier = ">=4.48.0" }, { name = "typer", specifier = ">=0.15.1" }, ] @@ -2902,7 +2894,7 @@ wheels = [ [[package]] name = "transformers" -version = "4.48.0" +version = "4.48.2" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "filelock" }, @@ -2917,9 +2909,9 @@ dependencies = [ { name = "tokenizers" }, { name = "tqdm" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/ea/71/93a6331682d6f15adf7d646956db0c43e5f1759bbbd05f2ef53029bae107/transformers-4.48.0.tar.gz", hash = "sha256:03fdfcbfb8b0367fb6c9fbe9d1c9aa54dfd847618be9b52400b2811d22799cb1", size = 8372101 } +sdist = { url = "https://files.pythonhosted.org/packages/c5/cf/1093586e09c8d889d2f6b8ffe6a1369e1e179eb7b8e732fc0f348a8fe58f/transformers-4.48.2.tar.gz", hash = "sha256:dcfb73473e61f22fb3366fe2471ed2e42779ecdd49527a1bdf1937574855d516", size = 8370945 } wheels = [ - { url = "https://files.pythonhosted.org/packages/45/d6/a69764e89fc5c2c957aa473881527c8c35521108d553df703e9ba703daeb/transformers-4.48.0-py3-none-any.whl", hash = "sha256:6d3de6d71cb5f2a10f9775ccc17abce9620195caaf32ec96542bd2a6937f25b0", size = 9673380 }, + { url = "https://files.pythonhosted.org/packages/bd/40/902c95a2a6f5d2d120c940ac4bd1f937c01035af529803c13d65ca33c2d1/transformers-4.48.2-py3-none-any.whl", hash = "sha256:493bc5b0268b116eff305edf6656367fc89cf570e7a9d5891369e04751db698a", size = 9667774 }, ] [[package]] diff --git a/tgi-entrypoint.sh b/tgi-entrypoint.sh index 278c7d961..94ea9436c 100755 --- a/tgi-entrypoint.sh +++ b/tgi-entrypoint.sh @@ -2,4 +2,5 @@ ldconfig 2>/dev/null || echo 'unable to refresh ld cache, not a big deal in most cases' +source 
./server/.venv/bin/activate
 exec text-generation-launcher $@

From 36223f834e44a117bc11ad3e1d83f5d793dd0f23 Mon Sep 17 00:00:00 2001
From: "Wang, Yi"
Date: Thu, 6 Feb 2025 19:28:41 +0800
Subject: [PATCH 076/183] Triton fix (#2995)

Pin triton to 3.1.0 to fix the ipex import issue

Signed-off-by: Wang, Yi A
---
 Dockerfile_intel | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile_intel b/Dockerfile_intel
index 3bd697fd7..786176417 100644
--- a/Dockerfile_intel
+++ b/Dockerfile_intel
@@ -190,7 +190,7 @@ RUN pip install https://download.pytorch.org/whl/nightly/cpu/torch-2.5.0.dev2024
 RUN pip install https://download.pytorch.org/whl/nightly/cpu/torchvision-0.20.0.dev20240815%2Bcpu-cp311-cp311-linux_x86_64.whl
 RUN pip install https://download.pytorch.org/whl/nightly/cpu/torchaudio-2.4.0.dev20240815%2Bcpu-cp311-cp311-linux_x86_64.whl

-RUN pip install triton py-libnuma
+RUN pip install triton==3.1.0 py-libnuma

 WORKDIR /usr/src

From 856709d5c3d24a9cad53c2ab94835c6190bacf3c Mon Sep 17 00:00:00 2001
From: Funtowicz Morgan
Date: Thu, 6 Feb 2025 16:45:03 +0100
Subject: [PATCH 077/183] [Backend] Bump TRTLLM to v.0.17.0 (#2991)

* backend(trtllm): bump TRTLLM to v.0.17.0

* backend(trtllm): forgot to bump dockerfile

* backend(trtllm): use arg instead of env

* backend(trtllm): use correct library reference decoder_attention_src

* backend(trtllm): link against decoder_attention_{0|1}

* backend(trtllm): build against gcc-14 with cuda12.8

* backend(trtllm): use return value optimization flag as an error if available

* backend(trtllm): make sure we escalate all warnings as errors on the backend impl in debug mode

* backend(trtllm): link against CUDA 12.8
---
 Dockerfile_trtllm                           | 14 +++++++++-----
 backends/trtllm/CMakeLists.txt              |  7 +++++--
 backends/trtllm/build.rs                    |  7 ++++---
 backends/trtllm/cmake/trtllm.cmake          |  2 +-
 backends/trtllm/scripts/install_tensorrt.sh | 16 ++++++++--------
 5 files changed, 27 insertions(+), 19 deletions(-)

diff --git a/Dockerfile_trtllm b/Dockerfile_trtllm
index 2a0636ff9..14a74c00b 100644
--- a/Dockerfile_trtllm
+++ b/Dockerfile_trtllm
@@ -1,12 +1,14 @@
-ARG cuda_arch_list="75-real;80-real;86-real;89-real;90-real"
+ARG cuda_arch_list="75-real;80-real;86-real;89-real;90-real;100-real;120-real"
+ARG cuda_base=12.8.0
 ARG build_type=release
 ARG ompi_version=4.1.7
 ARG sccache_gha_enabled=off
 ARG actions_cache_url=""
 ARG actions_runtime_token=""

+
 # CUDA dependent dependencies resolver stage
-FROM nvidia/cuda:12.6.3-cudnn-devel-ubuntu24.04 AS cuda-builder
+FROM nvidia/cuda:${cuda_base}-cudnn-devel-ubuntu24.04 AS cuda-builder

 RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \
     build-essential \
@@ -98,14 +100,16 @@ COPY --from=mpi-builder /usr/local/mpi /usr/local/mpi
 ENV RUSTC_WRAPPER=sccache
 ENV CMAKE_INSTALL_PREFIX=$TGI_INSTALL_PREFIX

-RUN export CMAKE_C_COMPILER_LAUNCHER=sccache && \
+RUN export CC=gcc-14 \
+    export CXX=g++-14 \
+    export CMAKE_C_COMPILER_LAUNCHER=sccache && \
     export CMAKE_CXX_COMPILER_LAUNCHER=sccache && \
     export CMAKE_CUDA_COMPILER_LAUNCHER=sccache && \
     mkdir $TGI_INSTALL_PREFIX && mkdir "$TGI_INSTALL_PREFIX/include" && mkdir "$TGI_INSTALL_PREFIX/lib" && \
     cargo build --profile ${build_type} --package text-generation-backends-trtllm --bin text-generation-backends-trtllm && \
     sccache --show-stats

-FROM nvidia/cuda:12.6.3-cudnn-runtime-ubuntu24.04 AS runtime
+FROM nvidia/cuda:${cuda_base}-cudnn-runtime-ubuntu24.04 AS runtime
 RUN apt update && apt install -y libucx0 pipx python3-minimal python3-dev python3-pip python3-venv && \
     rm -rf
/var/lib/{apt,dpkg,cache,log}/ && \
     pipx ensurepath && \
@@ -124,7 +128,7 @@ COPY --from=tgi-builder /usr/local/tgi /usr/local/tgi
 COPY --from=tgi-builder /usr/src/text-generation-inference/target/release/text-generation-backends-trtllm /usr/local/tgi/bin/text-generation-launcher

 # This is used only for the CI/CD
-FROM nvidia/cuda:12.6.3-cudnn-runtime-ubuntu24.04 AS ci-runtime
+FROM nvidia/cuda:${cuda_base}-cudnn-runtime-ubuntu24.04 AS ci-runtime
 RUN apt update && apt install -y libasan8 libubsan1 libucx0 pipx python3-minimal python3-dev python3-pip python3-venv && \
     rm -rf /var/lib/{apt,dpkg,cache,log}/ && \
     pipx ensurepath && \

diff --git a/backends/trtllm/CMakeLists.txt b/backends/trtllm/CMakeLists.txt
index 26af80be3..e54fd1169 100644
--- a/backends/trtllm/CMakeLists.txt
+++ b/backends/trtllm/CMakeLists.txt
@@ -59,7 +59,9 @@ target_link_libraries(tgi_trtllm_backend_impl PRIVATE tensorrt_llm nvinfer_plugi

 # This install all the artifacts in CMAKE_INSTALL_PREFIX under include/ lib/ bin/ to make easy to link / find it back
 install(TARGETS tgi_trtllm_backend_impl)
-install(TARGETS tensorrt_llm nvinfer_plugin_tensorrt_llm decoder_attention executorWorker)
+#install(TARGETS cutlass_src fb_gemm_src fpA_intB_gemm_src gemm_swiglu_sm90_src kernels_src)
+install(TARGETS decoder_attention_0 decoder_attention_1)
+install(TARGETS tensorrt_llm nvinfer_plugin_tensorrt_llm decoder_attention_src executorWorker)
 install(FILES ${TRTLLM_NVRTC_WRAPPER_LIBRARY_PATH} TYPE LIB)
 if (NOT ${TGI_TRTLLM_BACKEND_DEBUG})
     install(FILES ${TRTLLM_EXECUTOR_STATIC_LIBRARY_PATH} TYPE LIB)
@@ -82,8 +84,9 @@ if (${TGI_TRTLLM_BACKEND_BUILD_TESTS} AND CMAKE_BUILD_TYPE MATCHES "Debug")
     check_cxx_compiler_flag("-Wnrvo" COMPILER_SUPPORT_WARNING_ON_NVRO)
     if (${COMPILER_SUPPORT_WARNING_ON_NVRO})
         message(STATUS "Enabling non-NVRO detection")
-        target_compile_options(tgi_trtllm_backend_impl "-Wnvro")
+        target_compile_options(tgi_trtllm_backend_impl PRIVATE -Wnrvo)
     endif ()
+    target_compile_options(tgi_trtllm_backend_impl PRIVATE -Wall)

     cmake_path(GET TRTLLM_NVRTC_WRAPPER_LIBRARY_PATH PARENT_PATH TRTLLM_NVRTC_WRAPPER_PARENT_LIBRARY_PATH)
     message(STATUS "Adding linking path: ${TRTLLM_NVRTC_WRAPPER_PARENT_LIBRARY_PATH}")

diff --git a/backends/trtllm/build.rs b/backends/trtllm/build.rs
index 4d559fd48..c9918e2c5 100644
--- a/backends/trtllm/build.rs
+++ b/backends/trtllm/build.rs
@@ -7,7 +7,7 @@ use std::sync::LazyLock;

 const ADDITIONAL_BACKEND_LINK_LIBRARIES: [&str; 1] = ["spdlog"];
 const CUDA_ARCH_LIST: Option<&str> = option_env!("CUDA_ARCH_LIST");
-const CUDA_REQUIRED_VERSION: &str = "12.6";
+const CUDA_REQUIRED_VERSION: &str = "12.8";
 const MPI_REQUIRED_VERSION: &str = "4.1";
 const INSTALL_PREFIX: Option<&str> = option_env!("CMAKE_INSTALL_PREFIX");
 const TENSORRT_ROOT_DIR: Option<&str> = option_env!("TENSORRT_ROOT_DIR");
@@ -25,11 +25,12 @@ const IS_GHA_BUILD: LazyLock = LazyLock::new(|| {
 // Dependencies
 const BACKEND_DEPS: &str = "tgi_trtllm_backend_impl";
 const CUDA_TRANSITIVE_DEPS: [&str; 4] = ["cuda", "cudart", "cublas", "nvidia-ml"];
-const TENSORRT_LLM_TRANSITIVE_DEPS: [(&str, &str); 4] = [
+const TENSORRT_LLM_TRANSITIVE_DEPS: [(&str, &str); 5] = [
     ("dylib", "tensorrt_llm"),
     ("dylib", "tensorrt_llm_nvrtc_wrapper"),
     ("dylib", "nvinfer_plugin_tensorrt_llm"),
-    ("dylib", "decoder_attention"),
+    ("dylib", "decoder_attention_0"),
+    ("dylib", "decoder_attention_1"),
 ];

 macro_rules!
probe { diff --git a/backends/trtllm/cmake/trtllm.cmake b/backends/trtllm/cmake/trtllm.cmake index 3e9712c0d..95a99e9bc 100644 --- a/backends/trtllm/cmake/trtllm.cmake +++ b/backends/trtllm/cmake/trtllm.cmake @@ -28,7 +28,7 @@ find_package(Python3 REQUIRED Interpreter) fetchcontent_declare( trtllm GIT_REPOSITORY https://github.com/nvidia/TensorRT-LLM.git - GIT_TAG v0.16.0 + GIT_TAG v0.17.0 GIT_SHALLOW ON DOWNLOAD_EXTRACT_TIMESTAMP ) diff --git a/backends/trtllm/scripts/install_tensorrt.sh b/backends/trtllm/scripts/install_tensorrt.sh index f3e7270ab..e09db6b12 100755 --- a/backends/trtllm/scripts/install_tensorrt.sh +++ b/backends/trtllm/scripts/install_tensorrt.sh @@ -2,13 +2,13 @@ set -ex -TRT_VER_BASE="10.7.0" -TRT_VER_FULL="${TRT_VER_BASE}.23" -CUDA_VER="12.6" -CUDNN_VER="9.5.0.50-1" -NCCL_VER="2.22.3-1+cuda12.6" -CUBLAS_VER="12.6.3.3-1" -NVRTC_VER="12.6.77-1" +TRT_VER_BASE="10.8.0" +TRT_VER_FULL="${TRT_VER_BASE}.43" +CUDA_VER="12.8" +CUDNN_VER="9.7.0.66-1" +NCCL_VER="2.25.1-1+cuda${CUDA_VER}" +CUBLAS_VER="${CUDA_VER}.3.14-1" +NVRTC_VER="${CUDA_VER}.61-1" for i in "$@"; do case $i in @@ -73,7 +73,7 @@ install_centos_requirements() { install_tensorrt() { #PY_VERSION=$(python3 -c 'import sys; print(".".join(map(str, sys.version_info[0:2])))') #PARSED_PY_VERSION=$(echo "${PY_VERSION//./}") - TRT_CUDA_VERSION="12.6" + TRT_CUDA_VERSION="12.8" if [ -z "$RELEASE_URL_TRT" ];then ARCH=${TRT_TARGETARCH} From 4b8cda684b45b799de01a65e3fe3422a34a621d3 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Fri, 7 Feb 2025 10:38:13 +0100 Subject: [PATCH 078/183] Updating mllama after strftime. (#2993) * Updating mllama after strftime. * Town instead village. * Forgot the integration snapshot. * Attempt to fix intel CPU. * Intel extension fix. * Workaround intel. * Moving those deps directly into pyproject. * Revert "Moving those deps directly into pyproject." This reverts commit 98c1496ea61d90ee912caa8461f872568bf73a2f. * Non system uv. * Fixing the docker environment hopefully. * Missed a step. * Move workdir up a bit. * Bailing out of reproducible python env. * Triton version. --- Dockerfile_intel | 16 +---- .../test_mllama/test_mllama_load.json | 64 ++----------------- .../test_mllama/test_mllama_simpl.json | 6 +- integration-tests/models/test_mllama.py | 17 +++-- 4 files changed, 20 insertions(+), 83 deletions(-) diff --git a/Dockerfile_intel b/Dockerfile_intel index 786176417..be2488661 100644 --- a/Dockerfile_intel +++ b/Dockerfile_intel @@ -215,16 +215,9 @@ COPY server server COPY server/Makefile server/Makefile ENV UV_SYSTEM_PYTHON=1 RUN cd server && \ + make gen-server && \ pip install -U pip uv && \ - uv sync --frozen --extra gen --extra accelerate --extra compressed-tensors --extra quantize --extra peft --extra outlines --no-install-project && \ - . ./.venv/bin/activate && \ - make gen-server-raw - -RUN cd server && \ - uv sync --frozen --extra gen --extra accelerate --extra compressed-tensors --extra quantize --extra peft --extra outlines && \ - . 
./.venv/bin/activate && \ - pwd && \ - text-generation-server --help + uv pip install -e ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir # Install benchmarker COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark @@ -238,8 +231,5 @@ ENV ATTENTION=flashdecoding-ipex ENV PREFIX_CACHING=1 ENV PREFILL_CHUNKING=1 ENV CUDA_GRAPHS=0 -COPY ./tgi-entrypoint.sh /tgi-entrypoint.sh -RUN chmod +x /tgi-entrypoint.sh - -ENTRYPOINT ["/tgi-entrypoint.sh"] +ENTRYPOINT ["text-generation-launcher"] CMD ["--json-output"] diff --git a/integration-tests/models/__snapshots__/test_mllama/test_mllama_load.json b/integration-tests/models/__snapshots__/test_mllama/test_mllama_load.json index d719a7f33..65154e89b 100644 --- a/integration-tests/models/__snapshots__/test_mllama/test_mllama_load.json +++ b/integration-tests/models/__snapshots__/test_mllama/test_mllama_load.json @@ -6,7 +6,7 @@ "index": 0, "logprobs": null, "message": { - "content": "In a bustling city, a chicken named Cluck", + "content": "In a small town, a chicken named Cluck", "name": null, "role": "assistant", "tool_calls": null @@ -14,11 +14,11 @@ "usage": null } ], - "created": 1727773835, + "created": 1738753835, "id": "", "model": "meta-llama/Llama-3.2-11B-Vision-Instruct", "object": "chat.completion", - "system_fingerprint": "2.4.2-dev0-native", + "system_fingerprint": "3.1.1-dev0-native", "usage": { "completion_tokens": 10, "prompt_tokens": 50, @@ -32,7 +32,7 @@ "index": 0, "logprobs": null, "message": { - "content": "In a world where even chickens could dream big,", + "content": "In a small town, a chicken named Cluck", "name": null, "role": "assistant", "tool_calls": null @@ -40,63 +40,11 @@ "usage": null } ], - "created": 1727773835, + "created": 1738753835, "id": "", "model": "meta-llama/Llama-3.2-11B-Vision-Instruct", "object": "chat.completion", - "system_fingerprint": "2.4.2-dev0-native", - "usage": { - "completion_tokens": 10, - "prompt_tokens": 50, - "total_tokens": 60 - } - }, - { - "choices": [ - { - "finish_reason": "length", - "index": 0, - "logprobs": null, - "message": { - "content": "In a world where even chickens could dream big,", - "name": null, - "role": "assistant", - "tool_calls": null - }, - "usage": null - } - ], - "created": 1727773835, - "id": "", - "model": "meta-llama/Llama-3.2-11B-Vision-Instruct", - "object": "chat.completion", - "system_fingerprint": "2.4.2-dev0-native", - "usage": { - "completion_tokens": 10, - "prompt_tokens": 50, - "total_tokens": 60 - } - }, - { - "choices": [ - { - "finish_reason": "length", - "index": 0, - "logprobs": null, - "message": { - "content": "In a world where even chickens could dream big,", - "name": null, - "role": "assistant", - "tool_calls": null - }, - "usage": null - } - ], - "created": 1727773835, - "id": "", - "model": "meta-llama/Llama-3.2-11B-Vision-Instruct", - "object": "chat.completion", - "system_fingerprint": "2.4.2-dev0-native", + "system_fingerprint": "3.1.1-dev0-native", "usage": { "completion_tokens": 10, "prompt_tokens": 50, diff --git a/integration-tests/models/__snapshots__/test_mllama/test_mllama_simpl.json b/integration-tests/models/__snapshots__/test_mllama/test_mllama_simpl.json index a520e797f..14ca3f4eb 100644 --- a/integration-tests/models/__snapshots__/test_mllama/test_mllama_simpl.json +++ b/integration-tests/models/__snapshots__/test_mllama/test_mllama_simpl.json @@ -5,7 +5,7 @@ "index": 0, "logprobs": null, "message": { - "content": "In a bustling city, a chicken named 
Cluck", + "content": "In a small town, a chicken named Cluck", "name": null, "role": "assistant", "tool_calls": null @@ -13,11 +13,11 @@ "usage": null } ], - "created": 1727556016, + "created": 1738753833, "id": "", "model": "meta-llama/Llama-3.2-11B-Vision-Instruct", "object": "chat.completion", - "system_fingerprint": "2.4.2-dev0-native", + "system_fingerprint": "3.1.1-dev0-native", "usage": { "completion_tokens": 10, "prompt_tokens": 50, diff --git a/integration-tests/models/test_mllama.py b/integration-tests/models/test_mllama.py index f23356908..90b2bff19 100644 --- a/integration-tests/models/test_mllama.py +++ b/integration-tests/models/test_mllama.py @@ -47,8 +47,7 @@ async def test_mllama_simpl(mllama, response_snapshot): "total_tokens": 60, } assert ( - response.choices[0].message.content - == "In a bustling city, a chicken named Cluck" + response.choices[0].message.content == "In a small town, a chicken named Cluck" ) assert response == response_snapshot @@ -84,12 +83,12 @@ async def test_mllama_load(mllama, generate_load, response_snapshot): ] responses = await asyncio.gather(*futures) - _ = [response.choices[0].message.content for response in responses] + generated_texts = [response.choices[0].message.content for response in responses] # XXX: TODO: Fix this test. - # assert generated_texts[0] == "In a bustling city, a chicken named Cluck" - # assert len(generated_texts) == 4 - # assert generated_texts, all( - # [text == generated_texts[0] for text in generated_texts] - # ) - # assert responses == response_snapshot + assert generated_texts[0] == "In a small town, a chicken named Cluck" + assert len(generated_texts) == 2 + assert generated_texts, all( + [text == generated_texts[0] for text in generated_texts] + ) + assert responses == response_snapshot From 571ac9b5071441ba7045df45ea844c845ebfe628 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Mon, 10 Feb 2025 19:19:25 +0100 Subject: [PATCH 079/183] Use kernels from the kernel hub (#2988) * Use Hub kernels for Marlin and cutlass quantization kernels * Use hub kernels for MoE/GPTQ-Marlin MoE * Use attention kernels from the Hub * Cache the kernels in the Docker image * Update moe kernels * Support loading local kernels for development * Support latest moe kernels * Update to moe 0.1.1 * CI: download locked kernels for server tests * Fixup some imports * CI: activate venv * Fix unused imports * Nix: add attention/moe/quantization kernels * Update hf-kernels to 0.1.5 * Update kernels * Update tgi-nix flake for hf-kernels * Fix EOF * Take `load_kernel` out of a frequently-called function * Hoist another case of kernel loading out of a somewhat hot function * marlin-kernels -> quantization * attention -> paged-attention * EOF fix * Update hf-kernels, fixup Docker * ipex fix * Remove outdated TODO --- .github/workflows/tests.yaml | 4 + Dockerfile | 4 +- flake.lock | 12 +- nix/impure-shell.nix | 2 +- nix/server.nix | 14 +- server/hf-kernels.lock | 6740 +++++++++++++++++ server/pyproject.toml | 23 +- .../layers/attention/cuda.py | 26 +- .../layers/attention/kv_cache.py | 33 +- .../layers/compressed_tensors/w8a8_int.py | 24 +- server/text_generation_server/layers/fp8.py | 22 +- .../layers/marlin/fp8.py | 20 +- .../layers/marlin/gptq.py | 43 +- .../layers/marlin/marlin.py | 34 +- .../layers/marlin/util.py | 13 +- .../layers/moe/__init__.py | 5 + .../text_generation_server/layers/moe/fp8.py | 2 +- .../layers/moe/gptq_marlin.py | 118 +- .../layers/moe/unquantized.py | 126 +- .../custom_modeling/flash_dbrx_modeling.py 
| 7 +- .../text_generation_server/utils/kernels.py | 22 + server/uv.lock | 169 +- 22 files changed, 7206 insertions(+), 257 deletions(-) create mode 100644 server/hf-kernels.lock create mode 100644 server/text_generation_server/utils/kernels.py diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index cbb3392a0..f9ccb8420 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -48,6 +48,10 @@ jobs: uv venv source ./.venv/bin/activate make install-cpu + - name: Download locked kernels + run: | + source ./.venv/bin/activate + hf-kernels download server - name: Run server tests run: | source ./.venv/bin/activate diff --git a/Dockerfile b/Dockerfile index a963db2f0..ead6c8c5e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -206,11 +206,13 @@ COPY proto proto COPY server server COPY server/Makefile server/Makefile ENV UV_SYSTEM_PYTHON=1 +ENV HF_KERNELS_CACHE=/kernels RUN cd server && \ pip install -U pip uv && \ uv sync --frozen --extra gen --extra attention --extra bnb --extra accelerate --extra compressed-tensors --extra marlin --extra moe --extra quantize --extra peft --extra outlines --no-install-project && \ . ./.venv/bin/activate && \ - make gen-server-raw + make gen-server-raw && \ + hf-kernels download . RUN cd server && \ uv sync --frozen --extra gen --extra attention --extra bnb --extra accelerate --extra compressed-tensors --extra marlin --extra moe --extra quantize --extra peft --extra outlines && \ diff --git a/flake.lock b/flake.lock index 3ad3d698c..3a9d9c7cf 100644 --- a/flake.lock +++ b/flake.lock @@ -853,11 +853,11 @@ ] }, "locked": { - "lastModified": 1737685583, - "narHash": "sha256-p+NVABRpGi+pT+xxf9HcLcFVxG6L+vEEy+NwzB9T0f8=", + "lastModified": 1738549608, + "narHash": "sha256-GdyT9QEUSx5k/n8kILuNy83vxxdyUfJ8jL5mMpQZWfw=", "owner": "oxalica", "repo": "rust-overlay", - "rev": "eb64cbcc8eee0fa87ebded92805280d2ec97415a", + "rev": "35c6f8c4352f995ecd53896200769f80a3e8f22d", "type": "github" }, "original": { @@ -978,11 +978,11 @@ "nixpkgs": "nixpkgs_6" }, "locked": { - "lastModified": 1738323634, - "narHash": "sha256-lKPzgEm7pEuQJVhacsxFHqg1MOtrUMZvr+9IuJzC5J4=", + "lastModified": 1738769628, + "narHash": "sha256-hgHf1mscFbH9XtT3dYtFQcxRfict9N+Vi6QSW1c+FjU=", "owner": "huggingface", "repo": "text-generation-inference-nix", - "rev": "eb5fede2756f544f75e01f55a4097f9c9a8c5005", + "rev": "9a5a58219dead9704d83d9d32f105b6b90bd31f2", "type": "github" }, "original": { diff --git a/nix/impure-shell.nix b/nix/impure-shell.nix index aebdff844..9b8d7e35e 100644 --- a/nix/impure-shell.nix +++ b/nix/impure-shell.nix @@ -90,7 +90,7 @@ mkShell { postVenvCreation = '' unset SOURCE_DATE_EPOCH - ( cd server ; python -m pip install --no-dependencies -e . ) + ( cd server ; python -m pip install --no-build-isolation --no-dependencies -e . ) ( cd clients/python ; python -m pip install --no-dependencies -e . 
) ''; diff --git a/nix/server.nix b/nix/server.nix index 237102a8c..b638449b8 100644 --- a/nix/server.nix +++ b/nix/server.nix @@ -3,7 +3,6 @@ buildPythonPackage, poetry-core, mypy-protobuf, - attention-kernels, awq-inference-engine, causal-conv1d, compressed-tensors, @@ -19,22 +18,24 @@ grpcio-reflection, grpcio-status, grpcio-tools, + hf-kernels, hf-transfer, loguru, mamba-ssm, - marlin-kernels, - moe-kernels, + moe, opentelemetry-api, opentelemetry-exporter-otlp, opentelemetry-instrumentation-grpc, opentelemetry-semantic-conventions, outlines, + paged-attention, peft, pillow, prometheus-client, punica-kernels, py-cpuinfo, pydantic, + quantization, safetensors, tokenizers, torch, @@ -78,7 +79,6 @@ buildPythonPackage { pythonRemoveDeps = [ "scipy" ]; dependencies = [ - attention-kernels awq-inference-engine eetq causal-conv1d @@ -93,22 +93,24 @@ buildPythonPackage { grpcio-reflection grpcio-status grpcio-tools + hf-kernels hf-transfer loguru mamba-ssm - marlin-kernels - moe-kernels + moe opentelemetry-api opentelemetry-exporter-otlp opentelemetry-instrumentation-grpc opentelemetry-semantic-conventions outlines + paged-attention peft pillow prometheus-client punica-kernels py-cpuinfo pydantic + quantization safetensors sentencepiece tokenizers diff --git a/server/hf-kernels.lock b/server/hf-kernels.lock new file mode 100644 index 000000000..43f7f17d6 --- /dev/null +++ b/server/hf-kernels.lock @@ -0,0 +1,6740 @@ +[ + { + "repo_id": "kernels-community/paged-attention", + "sha": "331b7e63a6b592799c8bc992f681bb1ee2c865a2", + "files": [ + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/paged_attention/__init__.py", + "blob_id": "9de56043369487facc1f163df6bd319c9806e5ca" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/paged_attention/_custom_ops.py", + "blob_id": "a0c0b8db085468dee5100c98d14106a9ee917bf2" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/paged_attention/_ops.py", + "blob_id": "609570440c63122010e6254ac2f92d4e4e52ec02" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/paged_attention/_paged_attention_fao6f4gjjrpl6.abi3.so", + "blob_id": "a4e60f2c567eb63c84430e9b80acaa0aa6974b1e" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/paged_attention/platforms.py", + "blob_id": "aa06132e74cd7fb634044a76e528979b02a3559b" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/paged_attention/__init__.py", + "blob_id": "9de56043369487facc1f163df6bd319c9806e5ca" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/paged_attention/_custom_ops.py", + "blob_id": "a0c0b8db085468dee5100c98d14106a9ee917bf2" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/paged_attention/_ops.py", + "blob_id": "9e52382b912b4e2d07f84982f762345debdbbfc8" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/paged_attention/_paged_attention_eo7ts45r6k64y.abi3.so", + "blob_id": "c20f9501a41daa820dfda27434674d032931b51e" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/paged_attention/platforms.py", + "blob_id": "aa06132e74cd7fb634044a76e528979b02a3559b" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/paged_attention/__init__.py", + "blob_id": "9de56043369487facc1f163df6bd319c9806e5ca" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/paged_attention/_custom_ops.py", + "blob_id": "a0c0b8db085468dee5100c98d14106a9ee917bf2" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/paged_attention/_ops.py", + "blob_id": "5f01e3f8c4ae3a031f109f78e010014d34347647" + }, + { + 
"filename": "build/torch25-cxx11-cu124-x86_64-linux/paged_attention/_paged_attention_5odgyxqhwqtv2.abi3.so", + "blob_id": "74f9714690337f49661c641a4f60f6e1e1f56cfa" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/paged_attention/platforms.py", + "blob_id": "aa06132e74cd7fb634044a76e528979b02a3559b" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/paged_attention/__init__.py", + "blob_id": "9de56043369487facc1f163df6bd319c9806e5ca" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/paged_attention/_custom_ops.py", + "blob_id": "a0c0b8db085468dee5100c98d14106a9ee917bf2" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/paged_attention/_ops.py", + "blob_id": "a3016a6b1cd7ae051012084bbd39d6f2e0913ace" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/paged_attention/_paged_attention_uy2moinaww2jc.abi3.so", + "blob_id": "445652acd4719542710cda86a2d08c70a56c8094" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/paged_attention/platforms.py", + "blob_id": "aa06132e74cd7fb634044a76e528979b02a3559b" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/paged_attention/__init__.py", + "blob_id": "9de56043369487facc1f163df6bd319c9806e5ca" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/paged_attention/_custom_ops.py", + "blob_id": "a0c0b8db085468dee5100c98d14106a9ee917bf2" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/paged_attention/_ops.py", + "blob_id": "e2cd992a80d4b938f243f0e6060e863278aca7f6" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/paged_attention/_paged_attention_35dt23tewn2p2.abi3.so", + "blob_id": "1f6414c382a753edb7512927ac5f3e31b196531d" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/paged_attention/platforms.py", + "blob_id": "aa06132e74cd7fb634044a76e528979b02a3559b" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/paged_attention/__init__.py", + "blob_id": "9de56043369487facc1f163df6bd319c9806e5ca" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/paged_attention/_custom_ops.py", + "blob_id": "a0c0b8db085468dee5100c98d14106a9ee917bf2" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/paged_attention/_ops.py", + "blob_id": "150412d67365be8ae5668f83d1939148bb576050" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/paged_attention/_paged_attention_fhq57q56w3m5o.abi3.so", + "blob_id": "ee97eee26a4de8d14d7ccdadaf406eed8405de39" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/paged_attention/platforms.py", + "blob_id": "aa06132e74cd7fb634044a76e528979b02a3559b" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/paged_attention/__init__.py", + "blob_id": "9de56043369487facc1f163df6bd319c9806e5ca" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/paged_attention/_custom_ops.py", + "blob_id": "a0c0b8db085468dee5100c98d14106a9ee917bf2" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/paged_attention/_ops.py", + "blob_id": "2bfef111c96308e595eb628bc88ab660a443089c" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/paged_attention/_paged_attention_xvepb4loq5mm2.abi3.so", + "blob_id": "1ea51bd49f8ec76bbe306a261021da52fe6a980f" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/paged_attention/platforms.py", + "blob_id": "aa06132e74cd7fb634044a76e528979b02a3559b" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/paged_attention/__init__.py", + "blob_id": "9de56043369487facc1f163df6bd319c9806e5ca" + 
}, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/paged_attention/_custom_ops.py", + "blob_id": "a0c0b8db085468dee5100c98d14106a9ee917bf2" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/paged_attention/_ops.py", + "blob_id": "8928daeec47128544cef187bf18f214fc2238019" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/paged_attention/_paged_attention_uyfdujhnc2xoe.abi3.so", + "blob_id": "cf8ebe40f27db0fa87c46d7b4066494e65843820" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/paged_attention/platforms.py", + "blob_id": "aa06132e74cd7fb634044a76e528979b02a3559b" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/paged_attention/__init__.py", + "blob_id": "9de56043369487facc1f163df6bd319c9806e5ca" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/paged_attention/_custom_ops.py", + "blob_id": "a0c0b8db085468dee5100c98d14106a9ee917bf2" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/paged_attention/_ops.py", + "blob_id": "dff8537df63e1ef37769a6b7ba6b8c58192d7faa" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/paged_attention/_paged_attention_pervvqmod6pi4.abi3.so", + "blob_id": "77eb42e3471e9aa84d1f5d9854995c9737ed6bf3" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/paged_attention/platforms.py", + "blob_id": "aa06132e74cd7fb634044a76e528979b02a3559b" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/paged_attention/__init__.py", + "blob_id": "9de56043369487facc1f163df6bd319c9806e5ca" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/paged_attention/_custom_ops.py", + "blob_id": "a0c0b8db085468dee5100c98d14106a9ee917bf2" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/paged_attention/_ops.py", + "blob_id": "543c64d1589cb1747d7dc1ac29bd8f2cbeb61ab7" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/paged_attention/_paged_attention_24rowhxd5ebcc.abi3.so", + "blob_id": "43ec3529d8eac816c31cc1eaad4cc2baa3cbd3d6" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/paged_attention/platforms.py", + "blob_id": "aa06132e74cd7fb634044a76e528979b02a3559b" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/paged_attention/__init__.py", + "blob_id": "9de56043369487facc1f163df6bd319c9806e5ca" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/paged_attention/_custom_ops.py", + "blob_id": "a0c0b8db085468dee5100c98d14106a9ee917bf2" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/paged_attention/_ops.py", + "blob_id": "1d62b9bb1cfb040d7f68cd108ac9067100b4cf2d" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/paged_attention/_paged_attention_5yleoqr3zje4w.abi3.so", + "blob_id": "ffed60cc0a3948bdea6aa7fb4d486d9b943215ec" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/paged_attention/platforms.py", + "blob_id": "aa06132e74cd7fb634044a76e528979b02a3559b" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/paged_attention/__init__.py", + "blob_id": "9de56043369487facc1f163df6bd319c9806e5ca" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/paged_attention/_custom_ops.py", + "blob_id": "a0c0b8db085468dee5100c98d14106a9ee917bf2" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/paged_attention/_ops.py", + "blob_id": "ee817d13be64b46e3cb44ad192af4a5f3817bbf7" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/paged_attention/_paged_attention_3rbp7xipfucgo.abi3.so", + "blob_id": 
"5d5b3ffda2fd6a830d12341bab26dc5ec03f4a86" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/paged_attention/platforms.py", + "blob_id": "aa06132e74cd7fb634044a76e528979b02a3559b" + } + ] + }, + { + "repo_id": "kernels-community/moe", + "sha": "605a216f507b9a97b543140dee8937a4622069a8", + "files": [ + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/__init__.py", + "blob_id": "cc806778863c03ccb3157343cd6331c1c6ca332c" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/_moe_2f2wzwk42r5t2.abi3.so", + "blob_id": "b1f0ac7d52d2cbb7b49dd4e3e23eaf0b6acd3364" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/_ops.py", + "blob_id": "83a6a6a42d633c9b40e263b40b028086d2609b80" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "56c1a4e3af0b4a93fff71028d8e04bf73f0abb29" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "d3677bebb82a7f3f19344ef6471626493cf2c5bb" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "265768fb900ccfe9612b4a0d25973e6618f22a79" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "d3be23dfc903ba61d3d4d79c0230952b24d2ead0" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "589f5d39f31418d5121e7cbb2e6f2894b0a7ed32" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", + "blob_id": "2c78bfaba7890772bf266721f5577202ea443882" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "4da841e74a79f9589fecac1fa557ea132d34805f" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "200356713c0d0a76e199671c7ec8f10d0e5ee0ac" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "e076615ee541a5043556f630ecf0946c4e2c1408" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "ee896554b921040d7810bb6e9368cc200777951d" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "05aed8b1c81492151d128ef251afc510d8cc8ed5" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json", + "blob_id": "9262a74a4a0e1e3789f260a3ef7f6cb9551f3f2b" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "d251f9b5accaec977fc87a0999cd56ee387fc650" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "0ecf814a28a9441e89f892eb3d63dcf8dcb0dd97" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + 
"blob_id": "51ad5b299eb22465fa80530d12bdd5d7a03ce398" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "ee5119182556cf49434c10e56cf04e3baeb26408" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "68793c77b33c4f4b97d0a4b780fcbe8043c799de" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "612910720ed9439e56c4af4c03f30fee224fac80" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "039a10ed127b77836a7f41c03513292613852b30" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "3793fcafee60bc7e8f5f12d601cb3192abfa9ca8" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "51d03d8607122d7b9bc20ba48d8432d62367fa00" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", + "blob_id": "26f9abd6b789e9dd0f83ec7721fd1bae8aa76bec" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "cd0cdbea0c3372674cb610870dd0b30325864549" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "64be6e6591422aa0f441c3747b6c49850929652e" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "0a6a6a73fa45e270f01ba7ebdc6d9d55bf9daad3" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "ba9041d008507e31ae4179ef2bc863a49c606582" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "7a7508aab04599cb06641c835d8b0a14f54d0716" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "dbf9a2dd6f048d8adee290961e2aea72035f7615" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", + "blob_id": "bbb2386046b1135a2cc7ab7cb26c1d0b039bcf3a" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "57055453aa24c831dad9ac8e37fdab707c63ef91" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "8cc6c643f236d2f7f9ad29354d9e469d00b20d3f" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "d4c9ddd12972ac0b5fd2be11a9cd1075906e3978" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "b2799ed3a866e25b78d60d92910c000ebb21ff71" + }, + { + "filename": 
"build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "b8d3be2313fa14025d8aeb2fd11e0d1ee997ffa6" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json", + "blob_id": "6a976788f9b10af19ebcfe582a69cbc627f9457b" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "3f3ccdafa88f3452a695efad4cb9622d6ae79e6a" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json", + "blob_id": "0a46390b2e31bba6a7c3ab2c9f6c8de6004857bb" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json", + "blob_id": "f4c0f8417b384870050a95e0cf57edbdf6352b23" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "5c8185cfdeec167ec4b88de51b4b395e28769cc5" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "97c9f4445b166657ad29f1db9fc8281f9c463ec4" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "0bb423b28f5ab3825929a4870b96393262a9dd9f" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "55571873395464a3b58f549523905f439a8f1716" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "26bcbf26970c7a77c99e2c8eacd83eefa86967bf" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json", + "blob_id": "91011e64c7de4505e9bb462bc70e6a3e7affa878" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json", + "blob_id": "b41f9d443e50678334f906b44fce6d018d69500e" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "edf2a38d12ad3f420f232d2cd61ab149ad138725" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "673bae2ba8ef80ed4d4930739ca7daf0e8f28ee1" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "b2100cebb7f589747430be9ca8c8db368c152d78" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json", + "blob_id": "d720deb4bdd73d194b1023c99e190b8fcfecdaef" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "dbc624731f5cb9afcdc9213183d00d1e5edd4a00" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "cc614e635ea57327c610ce79e99ae5339614f22e" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "32c0c9da471cbe479044095e0ed14a0f54b73620" + }, + { + "filename": 
"build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json", + "blob_id": "f807d4a5abaed9dd686df26837f2dd9f6161300f" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "f578c8d0160ac3ef85b53c8539d3675455a97173" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "918f6839620cbab1f30b0f9383a9129c2cf2cf3d" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "e341a67917d5177bacb3f6767e7b6d92539826ad" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "34b916e574f88c65db1dac5889d74a990dc25e9b" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "6496a38fba8ae09b3025a75f357815b9d6a5e3f4" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "3618053b65831b95c4bb0f20ef3b9aa816b2d637" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "46a982f5ee9a4bd67ce244b101c576efeeb53b78" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "035ec027fa56622196b24a03a5042ce010deaebf" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "8b49f2781cb54d19a2789767ebb7e8c3fb55b981" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "851bc9f9f0b50b41451b929eaa518869b6a05412" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "d1227c2157990216d2ca51c69ad0944017f53b6a" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "77ba0d7477bdbcb036a43263e7aaa6b6913f8f4e" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "1c61451fb34e52deec827f8f63c80fb15830c202" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "63e661c80de6a7b1422f7a994a2ee7a4b724911c" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "cf354037903c0d1fcd077c4647aabce026a723fb" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "0a5d7bfdba4852da9ed08d1bc27cd7d521d09965" + }, + 
{ + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "eccb86a76df0d7302b760ab6d83a8ceb9fa9d0d9" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "cb91a279d423d0ca25197e0edd5e8c2f4da58720" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "88af48431d8b8791af8df03429704606b670f1f7" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "dd069726d7ed4dcbb449af243f4f4af21815f854" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "7febe3d272b4bb76500f7c6b523396129fd53680" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "56b939e52fac3ed53a4e0ba640c40010cb3af30a" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "63d9a0bf5d79ddaaad547d44338ad4b959ad72b1" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "7fa398c15a2a535401709b0f25e20f6e4b23e58e" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "f15d8f64c7090bd71d0091a524c65d7818fec38e" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "9d7658bfc41b2c8fd4daf3fbdf62d15936d3d546" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "cd3e07804fdec10c2cfb291c1ede3ba67b753f9c" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "9d5a329d7466a37c0ca68a65a089fbb99f9327a9" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "03dba5ad15ba5f7f49100a5c78e8685e64334b2a" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "96e1594a3eabbaedc792b84b07f05ae8752b7251" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "5ffd367df833d773355590220598a3c7eceba4e0" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "9a5ff48b8942957dde9b862aed848390dd267948" + }, + { + "filename": 
"build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "eabc423949a24c2a1fb2368a73e5249caf8d07df" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "386928de139ce718f28222b9c1a6555df3958491" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "51e237b91b8e775a36bcf783c078c2c1cecbcbd2" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "6280219c9ee7d26f7e2fd3625dc92d847ddc7982" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "40c01c0b92b4b26fe480879dda33f18c5eb59a6d" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "c6fd3659799bc31e17f3577e7f0e8d7268faf1fb" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "160f12ed3f95a6967439ff53bc3e3a2cdc97c700" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "e5c4a1d2c94e5c7864f462e083ea5f530b8efe3f" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "2bf5eb27e38208871d50348b170c8c74b80fc519" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "9c908e80406587da4d246ce4e3a8a98a14c875b1" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "0a1e14cffbb2a894a701352193947d272427db0d" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "15b1c93f60fc5068ba11b82b6d5924dd2024a824" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "f78e7060e6840ff721d306db556636b0bbc8d9b3" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "8ff12e64c172f5a5d0fbdf900728fe60b33877e2" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "4532f93681e2be175b1bf94f81bfde711821cd60" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "1d3ce5c94c2d9a4a1637204efb3b14f7a5579bdb" + }, + { + "filename": 
"build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "ca7f32b9552b479dc05495792b7e426db5eb1b56" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "5acea242cc0ad094cba8ee5f568ff88afb1b41ae" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "3ab5796ee15b6ec8d4ab1f4ab5a594fecb30e4b4" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "58cdd93e90b8c29bc7a211861711565dbeeb529a" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "b72e0371d1421a1decc9d57860f83eea8f790942" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "3cb7eaa07c745fd3aa2b3242780a7061bedac1de" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "293adce387e066fce75b6e606d4b8b6a5aa10bdb" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/fp8.py", + "blob_id": "23bd7d6703104b0020671cc6ba6f78a6df37e4bf" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/fp8_utils.py", + "blob_id": "acb4f3e3bb1a34f209fdac9ecca8c123aaf67f12" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/fused_marlin_moe.py", + "blob_id": "b3e0a5c24599730faf973fad3cf3fb6031a30522" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/fused_moe.py", + "blob_id": "af2d798cbe5d7c3c1760ce79f717ab5f6d7700ba" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/platforms.py", + "blob_id": "735fab87f2add390f7bf6408ebe31d1f5de6d02b" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/scalar_type.py", + "blob_id": "ea749fe8247b6846620ccbba30ddf48d914ca4e1" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/utils/__init__.py", + "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/utils/marlin_utils.py", + "blob_id": "5037f774b8a8b7e88d822efacbb3b4ea5b95d356" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/utils/marlin_utils_test.py", + "blob_id": "83faac032ca93b3564c620c5b4b1ef63c74aaddf" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/utils/quant_utils.py", + "blob_id": "5819ab753e57655185572ce1e49c24e6268171b4" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/__init__.py", + "blob_id": "cc806778863c03ccb3157343cd6331c1c6ca332c" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/_moe_qx7m4hiw6tx7s.abi3.so", + "blob_id": "cfdb823fabc296c258f58ce8e03a347be7eb558f" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/_ops.py", + "blob_id": "0d77c5d3e29106cf62a45153770fafbff59b2932" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "56c1a4e3af0b4a93fff71028d8e04bf73f0abb29" + }, + { + 
"filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "d3677bebb82a7f3f19344ef6471626493cf2c5bb" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "265768fb900ccfe9612b4a0d25973e6618f22a79" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "d3be23dfc903ba61d3d4d79c0230952b24d2ead0" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "589f5d39f31418d5121e7cbb2e6f2894b0a7ed32" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", + "blob_id": "2c78bfaba7890772bf266721f5577202ea443882" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "4da841e74a79f9589fecac1fa557ea132d34805f" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "200356713c0d0a76e199671c7ec8f10d0e5ee0ac" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "e076615ee541a5043556f630ecf0946c4e2c1408" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "ee896554b921040d7810bb6e9368cc200777951d" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "05aed8b1c81492151d128ef251afc510d8cc8ed5" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json", + "blob_id": "9262a74a4a0e1e3789f260a3ef7f6cb9551f3f2b" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "d251f9b5accaec977fc87a0999cd56ee387fc650" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "0ecf814a28a9441e89f892eb3d63dcf8dcb0dd97" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "51ad5b299eb22465fa80530d12bdd5d7a03ce398" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "ee5119182556cf49434c10e56cf04e3baeb26408" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "68793c77b33c4f4b97d0a4b780fcbe8043c799de" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "612910720ed9439e56c4af4c03f30fee224fac80" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "039a10ed127b77836a7f41c03513292613852b30" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": 
"3793fcafee60bc7e8f5f12d601cb3192abfa9ca8" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "51d03d8607122d7b9bc20ba48d8432d62367fa00" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", + "blob_id": "26f9abd6b789e9dd0f83ec7721fd1bae8aa76bec" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "cd0cdbea0c3372674cb610870dd0b30325864549" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "64be6e6591422aa0f441c3747b6c49850929652e" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "0a6a6a73fa45e270f01ba7ebdc6d9d55bf9daad3" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "ba9041d008507e31ae4179ef2bc863a49c606582" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "7a7508aab04599cb06641c835d8b0a14f54d0716" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "dbf9a2dd6f048d8adee290961e2aea72035f7615" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", + "blob_id": "bbb2386046b1135a2cc7ab7cb26c1d0b039bcf3a" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "57055453aa24c831dad9ac8e37fdab707c63ef91" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "8cc6c643f236d2f7f9ad29354d9e469d00b20d3f" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "d4c9ddd12972ac0b5fd2be11a9cd1075906e3978" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "b2799ed3a866e25b78d60d92910c000ebb21ff71" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "b8d3be2313fa14025d8aeb2fd11e0d1ee997ffa6" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json", + "blob_id": "6a976788f9b10af19ebcfe582a69cbc627f9457b" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "3f3ccdafa88f3452a695efad4cb9622d6ae79e6a" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json", + "blob_id": "0a46390b2e31bba6a7c3ab2c9f6c8de6004857bb" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json", + "blob_id": "f4c0f8417b384870050a95e0cf57edbdf6352b23" + }, + { + "filename": 
"build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "5c8185cfdeec167ec4b88de51b4b395e28769cc5" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "97c9f4445b166657ad29f1db9fc8281f9c463ec4" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "0bb423b28f5ab3825929a4870b96393262a9dd9f" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "55571873395464a3b58f549523905f439a8f1716" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "26bcbf26970c7a77c99e2c8eacd83eefa86967bf" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json", + "blob_id": "91011e64c7de4505e9bb462bc70e6a3e7affa878" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json", + "blob_id": "b41f9d443e50678334f906b44fce6d018d69500e" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "edf2a38d12ad3f420f232d2cd61ab149ad138725" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "673bae2ba8ef80ed4d4930739ca7daf0e8f28ee1" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "b2100cebb7f589747430be9ca8c8db368c152d78" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json", + "blob_id": "d720deb4bdd73d194b1023c99e190b8fcfecdaef" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "dbc624731f5cb9afcdc9213183d00d1e5edd4a00" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "cc614e635ea57327c610ce79e99ae5339614f22e" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "32c0c9da471cbe479044095e0ed14a0f54b73620" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json", + "blob_id": "f807d4a5abaed9dd686df26837f2dd9f6161300f" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "f578c8d0160ac3ef85b53c8539d3675455a97173" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "918f6839620cbab1f30b0f9383a9129c2cf2cf3d" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "e341a67917d5177bacb3f6767e7b6d92539826ad" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "34b916e574f88c65db1dac5889d74a990dc25e9b" + }, + { + "filename": 
"build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "6496a38fba8ae09b3025a75f357815b9d6a5e3f4" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "3618053b65831b95c4bb0f20ef3b9aa816b2d637" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "46a982f5ee9a4bd67ce244b101c576efeeb53b78" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "035ec027fa56622196b24a03a5042ce010deaebf" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "8b49f2781cb54d19a2789767ebb7e8c3fb55b981" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "851bc9f9f0b50b41451b929eaa518869b6a05412" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "d1227c2157990216d2ca51c69ad0944017f53b6a" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "77ba0d7477bdbcb036a43263e7aaa6b6913f8f4e" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "1c61451fb34e52deec827f8f63c80fb15830c202" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "63e661c80de6a7b1422f7a994a2ee7a4b724911c" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "cf354037903c0d1fcd077c4647aabce026a723fb" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "0a5d7bfdba4852da9ed08d1bc27cd7d521d09965" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "eccb86a76df0d7302b760ab6d83a8ceb9fa9d0d9" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "cb91a279d423d0ca25197e0edd5e8c2f4da58720" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "88af48431d8b8791af8df03429704606b670f1f7" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "dd069726d7ed4dcbb449af243f4f4af21815f854" + }, + { + "filename": 
"build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "7febe3d272b4bb76500f7c6b523396129fd53680" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "56b939e52fac3ed53a4e0ba640c40010cb3af30a" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "63d9a0bf5d79ddaaad547d44338ad4b959ad72b1" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "7fa398c15a2a535401709b0f25e20f6e4b23e58e" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "f15d8f64c7090bd71d0091a524c65d7818fec38e" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "9d7658bfc41b2c8fd4daf3fbdf62d15936d3d546" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "cd3e07804fdec10c2cfb291c1ede3ba67b753f9c" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "9d5a329d7466a37c0ca68a65a089fbb99f9327a9" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "03dba5ad15ba5f7f49100a5c78e8685e64334b2a" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "96e1594a3eabbaedc792b84b07f05ae8752b7251" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "5ffd367df833d773355590220598a3c7eceba4e0" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "9a5ff48b8942957dde9b862aed848390dd267948" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "eabc423949a24c2a1fb2368a73e5249caf8d07df" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "386928de139ce718f28222b9c1a6555df3958491" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "51e237b91b8e775a36bcf783c078c2c1cecbcbd2" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "6280219c9ee7d26f7e2fd3625dc92d847ddc7982" + }, + { + "filename": 
"build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "40c01c0b92b4b26fe480879dda33f18c5eb59a6d" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "c6fd3659799bc31e17f3577e7f0e8d7268faf1fb" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "160f12ed3f95a6967439ff53bc3e3a2cdc97c700" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "e5c4a1d2c94e5c7864f462e083ea5f530b8efe3f" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "2bf5eb27e38208871d50348b170c8c74b80fc519" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "9c908e80406587da4d246ce4e3a8a98a14c875b1" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "0a1e14cffbb2a894a701352193947d272427db0d" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "15b1c93f60fc5068ba11b82b6d5924dd2024a824" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "f78e7060e6840ff721d306db556636b0bbc8d9b3" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "8ff12e64c172f5a5d0fbdf900728fe60b33877e2" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "4532f93681e2be175b1bf94f81bfde711821cd60" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "1d3ce5c94c2d9a4a1637204efb3b14f7a5579bdb" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "ca7f32b9552b479dc05495792b7e426db5eb1b56" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "5acea242cc0ad094cba8ee5f568ff88afb1b41ae" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "3ab5796ee15b6ec8d4ab1f4ab5a594fecb30e4b4" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "58cdd93e90b8c29bc7a211861711565dbeeb529a" + }, + { + "filename": 
"build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "b72e0371d1421a1decc9d57860f83eea8f790942" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "3cb7eaa07c745fd3aa2b3242780a7061bedac1de" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "293adce387e066fce75b6e606d4b8b6a5aa10bdb" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/fp8.py", + "blob_id": "23bd7d6703104b0020671cc6ba6f78a6df37e4bf" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/fp8_utils.py", + "blob_id": "acb4f3e3bb1a34f209fdac9ecca8c123aaf67f12" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/fused_marlin_moe.py", + "blob_id": "b3e0a5c24599730faf973fad3cf3fb6031a30522" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/fused_moe.py", + "blob_id": "af2d798cbe5d7c3c1760ce79f717ab5f6d7700ba" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/platforms.py", + "blob_id": "735fab87f2add390f7bf6408ebe31d1f5de6d02b" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/scalar_type.py", + "blob_id": "ea749fe8247b6846620ccbba30ddf48d914ca4e1" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/utils/__init__.py", + "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/utils/marlin_utils.py", + "blob_id": "5037f774b8a8b7e88d822efacbb3b4ea5b95d356" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/utils/marlin_utils_test.py", + "blob_id": "83faac032ca93b3564c620c5b4b1ef63c74aaddf" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/utils/quant_utils.py", + "blob_id": "5819ab753e57655185572ce1e49c24e6268171b4" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/__init__.py", + "blob_id": "cc806778863c03ccb3157343cd6331c1c6ca332c" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/_moe_ctvji3e7dq64w.abi3.so", + "blob_id": "db5c3be15bc329bc0aa5b87d34223b747751484e" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/_ops.py", + "blob_id": "b22bd5b27938464e6c7359b1974db9b472effa6b" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "56c1a4e3af0b4a93fff71028d8e04bf73f0abb29" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "d3677bebb82a7f3f19344ef6471626493cf2c5bb" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "265768fb900ccfe9612b4a0d25973e6618f22a79" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "d3be23dfc903ba61d3d4d79c0230952b24d2ead0" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "589f5d39f31418d5121e7cbb2e6f2894b0a7ed32" + }, + { + "filename": 
"build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", + "blob_id": "2c78bfaba7890772bf266721f5577202ea443882" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "4da841e74a79f9589fecac1fa557ea132d34805f" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "200356713c0d0a76e199671c7ec8f10d0e5ee0ac" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "e076615ee541a5043556f630ecf0946c4e2c1408" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "ee896554b921040d7810bb6e9368cc200777951d" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "05aed8b1c81492151d128ef251afc510d8cc8ed5" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json", + "blob_id": "9262a74a4a0e1e3789f260a3ef7f6cb9551f3f2b" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "d251f9b5accaec977fc87a0999cd56ee387fc650" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "0ecf814a28a9441e89f892eb3d63dcf8dcb0dd97" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "51ad5b299eb22465fa80530d12bdd5d7a03ce398" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "ee5119182556cf49434c10e56cf04e3baeb26408" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "68793c77b33c4f4b97d0a4b780fcbe8043c799de" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "612910720ed9439e56c4af4c03f30fee224fac80" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "039a10ed127b77836a7f41c03513292613852b30" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "3793fcafee60bc7e8f5f12d601cb3192abfa9ca8" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "51d03d8607122d7b9bc20ba48d8432d62367fa00" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", + "blob_id": "26f9abd6b789e9dd0f83ec7721fd1bae8aa76bec" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "cd0cdbea0c3372674cb610870dd0b30325864549" + }, + { + "filename": 
"build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "64be6e6591422aa0f441c3747b6c49850929652e" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "0a6a6a73fa45e270f01ba7ebdc6d9d55bf9daad3" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "ba9041d008507e31ae4179ef2bc863a49c606582" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "7a7508aab04599cb06641c835d8b0a14f54d0716" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "dbf9a2dd6f048d8adee290961e2aea72035f7615" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", + "blob_id": "bbb2386046b1135a2cc7ab7cb26c1d0b039bcf3a" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "57055453aa24c831dad9ac8e37fdab707c63ef91" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "8cc6c643f236d2f7f9ad29354d9e469d00b20d3f" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "d4c9ddd12972ac0b5fd2be11a9cd1075906e3978" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "b2799ed3a866e25b78d60d92910c000ebb21ff71" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "b8d3be2313fa14025d8aeb2fd11e0d1ee997ffa6" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json", + "blob_id": "6a976788f9b10af19ebcfe582a69cbc627f9457b" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "3f3ccdafa88f3452a695efad4cb9622d6ae79e6a" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json", + "blob_id": "0a46390b2e31bba6a7c3ab2c9f6c8de6004857bb" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json", + "blob_id": "f4c0f8417b384870050a95e0cf57edbdf6352b23" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "5c8185cfdeec167ec4b88de51b4b395e28769cc5" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "97c9f4445b166657ad29f1db9fc8281f9c463ec4" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "0bb423b28f5ab3825929a4870b96393262a9dd9f" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": 
"55571873395464a3b58f549523905f439a8f1716" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "26bcbf26970c7a77c99e2c8eacd83eefa86967bf" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json", + "blob_id": "91011e64c7de4505e9bb462bc70e6a3e7affa878" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json", + "blob_id": "b41f9d443e50678334f906b44fce6d018d69500e" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "edf2a38d12ad3f420f232d2cd61ab149ad138725" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "673bae2ba8ef80ed4d4930739ca7daf0e8f28ee1" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "b2100cebb7f589747430be9ca8c8db368c152d78" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json", + "blob_id": "d720deb4bdd73d194b1023c99e190b8fcfecdaef" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "dbc624731f5cb9afcdc9213183d00d1e5edd4a00" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "cc614e635ea57327c610ce79e99ae5339614f22e" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "32c0c9da471cbe479044095e0ed14a0f54b73620" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json", + "blob_id": "f807d4a5abaed9dd686df26837f2dd9f6161300f" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "f578c8d0160ac3ef85b53c8539d3675455a97173" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "918f6839620cbab1f30b0f9383a9129c2cf2cf3d" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "e341a67917d5177bacb3f6767e7b6d92539826ad" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "34b916e574f88c65db1dac5889d74a990dc25e9b" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "6496a38fba8ae09b3025a75f357815b9d6a5e3f4" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "3618053b65831b95c4bb0f20ef3b9aa816b2d637" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "46a982f5ee9a4bd67ce244b101c576efeeb53b78" + }, + { + "filename": 
"build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "035ec027fa56622196b24a03a5042ce010deaebf" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "8b49f2781cb54d19a2789767ebb7e8c3fb55b981" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "851bc9f9f0b50b41451b929eaa518869b6a05412" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "d1227c2157990216d2ca51c69ad0944017f53b6a" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "77ba0d7477bdbcb036a43263e7aaa6b6913f8f4e" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "1c61451fb34e52deec827f8f63c80fb15830c202" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "63e661c80de6a7b1422f7a994a2ee7a4b724911c" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "cf354037903c0d1fcd077c4647aabce026a723fb" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "0a5d7bfdba4852da9ed08d1bc27cd7d521d09965" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "eccb86a76df0d7302b760ab6d83a8ceb9fa9d0d9" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "cb91a279d423d0ca25197e0edd5e8c2f4da58720" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "88af48431d8b8791af8df03429704606b670f1f7" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "dd069726d7ed4dcbb449af243f4f4af21815f854" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "7febe3d272b4bb76500f7c6b523396129fd53680" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "56b939e52fac3ed53a4e0ba640c40010cb3af30a" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "63d9a0bf5d79ddaaad547d44338ad4b959ad72b1" + }, + { + "filename": 
"build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "7fa398c15a2a535401709b0f25e20f6e4b23e58e" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "f15d8f64c7090bd71d0091a524c65d7818fec38e" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "9d7658bfc41b2c8fd4daf3fbdf62d15936d3d546" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "cd3e07804fdec10c2cfb291c1ede3ba67b753f9c" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "9d5a329d7466a37c0ca68a65a089fbb99f9327a9" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "03dba5ad15ba5f7f49100a5c78e8685e64334b2a" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "96e1594a3eabbaedc792b84b07f05ae8752b7251" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "5ffd367df833d773355590220598a3c7eceba4e0" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "9a5ff48b8942957dde9b862aed848390dd267948" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "eabc423949a24c2a1fb2368a73e5249caf8d07df" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "386928de139ce718f28222b9c1a6555df3958491" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "51e237b91b8e775a36bcf783c078c2c1cecbcbd2" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "6280219c9ee7d26f7e2fd3625dc92d847ddc7982" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "40c01c0b92b4b26fe480879dda33f18c5eb59a6d" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "c6fd3659799bc31e17f3577e7f0e8d7268faf1fb" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "160f12ed3f95a6967439ff53bc3e3a2cdc97c700" + }, + { + "filename": 
"build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "e5c4a1d2c94e5c7864f462e083ea5f530b8efe3f" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "2bf5eb27e38208871d50348b170c8c74b80fc519" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "9c908e80406587da4d246ce4e3a8a98a14c875b1" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "0a1e14cffbb2a894a701352193947d272427db0d" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "15b1c93f60fc5068ba11b82b6d5924dd2024a824" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "f78e7060e6840ff721d306db556636b0bbc8d9b3" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "8ff12e64c172f5a5d0fbdf900728fe60b33877e2" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "4532f93681e2be175b1bf94f81bfde711821cd60" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "1d3ce5c94c2d9a4a1637204efb3b14f7a5579bdb" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "ca7f32b9552b479dc05495792b7e426db5eb1b56" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "5acea242cc0ad094cba8ee5f568ff88afb1b41ae" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "3ab5796ee15b6ec8d4ab1f4ab5a594fecb30e4b4" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "58cdd93e90b8c29bc7a211861711565dbeeb529a" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "b72e0371d1421a1decc9d57860f83eea8f790942" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "3cb7eaa07c745fd3aa2b3242780a7061bedac1de" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "293adce387e066fce75b6e606d4b8b6a5aa10bdb" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/fp8.py", + "blob_id": 
"23bd7d6703104b0020671cc6ba6f78a6df37e4bf" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/fp8_utils.py", + "blob_id": "acb4f3e3bb1a34f209fdac9ecca8c123aaf67f12" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/fused_marlin_moe.py", + "blob_id": "b3e0a5c24599730faf973fad3cf3fb6031a30522" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/fused_moe.py", + "blob_id": "af2d798cbe5d7c3c1760ce79f717ab5f6d7700ba" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/platforms.py", + "blob_id": "735fab87f2add390f7bf6408ebe31d1f5de6d02b" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/scalar_type.py", + "blob_id": "ea749fe8247b6846620ccbba30ddf48d914ca4e1" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/utils/__init__.py", + "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/utils/marlin_utils.py", + "blob_id": "5037f774b8a8b7e88d822efacbb3b4ea5b95d356" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/utils/marlin_utils_test.py", + "blob_id": "83faac032ca93b3564c620c5b4b1ef63c74aaddf" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/utils/quant_utils.py", + "blob_id": "5819ab753e57655185572ce1e49c24e6268171b4" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/__init__.py", + "blob_id": "cc806778863c03ccb3157343cd6331c1c6ca332c" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/_moe_2dhx2xkm6c5wu.abi3.so", + "blob_id": "56dc9c91ddc02b281dcf7c996071bee341ef026c" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/_ops.py", + "blob_id": "97f4f6344f4a61a52c8077cdc7884400e56c558b" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "56c1a4e3af0b4a93fff71028d8e04bf73f0abb29" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "d3677bebb82a7f3f19344ef6471626493cf2c5bb" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "265768fb900ccfe9612b4a0d25973e6618f22a79" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "d3be23dfc903ba61d3d4d79c0230952b24d2ead0" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "589f5d39f31418d5121e7cbb2e6f2894b0a7ed32" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", + "blob_id": "2c78bfaba7890772bf266721f5577202ea443882" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "4da841e74a79f9589fecac1fa557ea132d34805f" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "200356713c0d0a76e199671c7ec8f10d0e5ee0ac" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "e076615ee541a5043556f630ecf0946c4e2c1408" + }, + { + "filename": 
"build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "ee896554b921040d7810bb6e9368cc200777951d" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "05aed8b1c81492151d128ef251afc510d8cc8ed5" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json", + "blob_id": "9262a74a4a0e1e3789f260a3ef7f6cb9551f3f2b" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "d251f9b5accaec977fc87a0999cd56ee387fc650" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "0ecf814a28a9441e89f892eb3d63dcf8dcb0dd97" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "51ad5b299eb22465fa80530d12bdd5d7a03ce398" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "ee5119182556cf49434c10e56cf04e3baeb26408" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "68793c77b33c4f4b97d0a4b780fcbe8043c799de" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "612910720ed9439e56c4af4c03f30fee224fac80" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "039a10ed127b77836a7f41c03513292613852b30" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "3793fcafee60bc7e8f5f12d601cb3192abfa9ca8" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "51d03d8607122d7b9bc20ba48d8432d62367fa00" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", + "blob_id": "26f9abd6b789e9dd0f83ec7721fd1bae8aa76bec" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "cd0cdbea0c3372674cb610870dd0b30325864549" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "64be6e6591422aa0f441c3747b6c49850929652e" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "0a6a6a73fa45e270f01ba7ebdc6d9d55bf9daad3" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "ba9041d008507e31ae4179ef2bc863a49c606582" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "7a7508aab04599cb06641c835d8b0a14f54d0716" + }, + { + "filename": 
"build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "dbf9a2dd6f048d8adee290961e2aea72035f7615" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", + "blob_id": "bbb2386046b1135a2cc7ab7cb26c1d0b039bcf3a" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "57055453aa24c831dad9ac8e37fdab707c63ef91" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "8cc6c643f236d2f7f9ad29354d9e469d00b20d3f" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "d4c9ddd12972ac0b5fd2be11a9cd1075906e3978" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "b2799ed3a866e25b78d60d92910c000ebb21ff71" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "b8d3be2313fa14025d8aeb2fd11e0d1ee997ffa6" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json", + "blob_id": "6a976788f9b10af19ebcfe582a69cbc627f9457b" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "3f3ccdafa88f3452a695efad4cb9622d6ae79e6a" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json", + "blob_id": "0a46390b2e31bba6a7c3ab2c9f6c8de6004857bb" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json", + "blob_id": "f4c0f8417b384870050a95e0cf57edbdf6352b23" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "5c8185cfdeec167ec4b88de51b4b395e28769cc5" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "97c9f4445b166657ad29f1db9fc8281f9c463ec4" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "0bb423b28f5ab3825929a4870b96393262a9dd9f" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "55571873395464a3b58f549523905f439a8f1716" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "26bcbf26970c7a77c99e2c8eacd83eefa86967bf" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json", + "blob_id": "91011e64c7de4505e9bb462bc70e6a3e7affa878" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json", + "blob_id": "b41f9d443e50678334f906b44fce6d018d69500e" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "edf2a38d12ad3f420f232d2cd61ab149ad138725" + }, + { + "filename": 
"build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "673bae2ba8ef80ed4d4930739ca7daf0e8f28ee1" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "b2100cebb7f589747430be9ca8c8db368c152d78" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json", + "blob_id": "d720deb4bdd73d194b1023c99e190b8fcfecdaef" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "dbc624731f5cb9afcdc9213183d00d1e5edd4a00" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "cc614e635ea57327c610ce79e99ae5339614f22e" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "32c0c9da471cbe479044095e0ed14a0f54b73620" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json", + "blob_id": "f807d4a5abaed9dd686df26837f2dd9f6161300f" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "f578c8d0160ac3ef85b53c8539d3675455a97173" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "918f6839620cbab1f30b0f9383a9129c2cf2cf3d" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "e341a67917d5177bacb3f6767e7b6d92539826ad" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "34b916e574f88c65db1dac5889d74a990dc25e9b" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "6496a38fba8ae09b3025a75f357815b9d6a5e3f4" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "3618053b65831b95c4bb0f20ef3b9aa816b2d637" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "46a982f5ee9a4bd67ce244b101c576efeeb53b78" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "035ec027fa56622196b24a03a5042ce010deaebf" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "8b49f2781cb54d19a2789767ebb7e8c3fb55b981" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "851bc9f9f0b50b41451b929eaa518869b6a05412" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": 
"d1227c2157990216d2ca51c69ad0944017f53b6a" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "77ba0d7477bdbcb036a43263e7aaa6b6913f8f4e" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "1c61451fb34e52deec827f8f63c80fb15830c202" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "63e661c80de6a7b1422f7a994a2ee7a4b724911c" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "cf354037903c0d1fcd077c4647aabce026a723fb" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "0a5d7bfdba4852da9ed08d1bc27cd7d521d09965" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "eccb86a76df0d7302b760ab6d83a8ceb9fa9d0d9" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "cb91a279d423d0ca25197e0edd5e8c2f4da58720" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "88af48431d8b8791af8df03429704606b670f1f7" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "dd069726d7ed4dcbb449af243f4f4af21815f854" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "7febe3d272b4bb76500f7c6b523396129fd53680" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "56b939e52fac3ed53a4e0ba640c40010cb3af30a" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "63d9a0bf5d79ddaaad547d44338ad4b959ad72b1" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "7fa398c15a2a535401709b0f25e20f6e4b23e58e" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "f15d8f64c7090bd71d0091a524c65d7818fec38e" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "9d7658bfc41b2c8fd4daf3fbdf62d15936d3d546" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "cd3e07804fdec10c2cfb291c1ede3ba67b753f9c" + }, + { + "filename": 
"build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "9d5a329d7466a37c0ca68a65a089fbb99f9327a9" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "03dba5ad15ba5f7f49100a5c78e8685e64334b2a" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "96e1594a3eabbaedc792b84b07f05ae8752b7251" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "5ffd367df833d773355590220598a3c7eceba4e0" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "9a5ff48b8942957dde9b862aed848390dd267948" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "eabc423949a24c2a1fb2368a73e5249caf8d07df" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "386928de139ce718f28222b9c1a6555df3958491" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "51e237b91b8e775a36bcf783c078c2c1cecbcbd2" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "6280219c9ee7d26f7e2fd3625dc92d847ddc7982" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "40c01c0b92b4b26fe480879dda33f18c5eb59a6d" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "c6fd3659799bc31e17f3577e7f0e8d7268faf1fb" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "160f12ed3f95a6967439ff53bc3e3a2cdc97c700" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "e5c4a1d2c94e5c7864f462e083ea5f530b8efe3f" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "2bf5eb27e38208871d50348b170c8c74b80fc519" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "9c908e80406587da4d246ce4e3a8a98a14c875b1" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "0a1e14cffbb2a894a701352193947d272427db0d" + }, + { + "filename": 
"build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "15b1c93f60fc5068ba11b82b6d5924dd2024a824" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "f78e7060e6840ff721d306db556636b0bbc8d9b3" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "8ff12e64c172f5a5d0fbdf900728fe60b33877e2" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "4532f93681e2be175b1bf94f81bfde711821cd60" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "1d3ce5c94c2d9a4a1637204efb3b14f7a5579bdb" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "ca7f32b9552b479dc05495792b7e426db5eb1b56" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "5acea242cc0ad094cba8ee5f568ff88afb1b41ae" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "3ab5796ee15b6ec8d4ab1f4ab5a594fecb30e4b4" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "58cdd93e90b8c29bc7a211861711565dbeeb529a" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "b72e0371d1421a1decc9d57860f83eea8f790942" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "3cb7eaa07c745fd3aa2b3242780a7061bedac1de" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "293adce387e066fce75b6e606d4b8b6a5aa10bdb" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/fp8.py", + "blob_id": "23bd7d6703104b0020671cc6ba6f78a6df37e4bf" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/fp8_utils.py", + "blob_id": "acb4f3e3bb1a34f209fdac9ecca8c123aaf67f12" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/fused_marlin_moe.py", + "blob_id": "b3e0a5c24599730faf973fad3cf3fb6031a30522" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/fused_moe.py", + "blob_id": "af2d798cbe5d7c3c1760ce79f717ab5f6d7700ba" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/platforms.py", + "blob_id": "735fab87f2add390f7bf6408ebe31d1f5de6d02b" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/scalar_type.py", + "blob_id": "ea749fe8247b6846620ccbba30ddf48d914ca4e1" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/utils/__init__.py", + "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" + 
}, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/utils/marlin_utils.py", + "blob_id": "5037f774b8a8b7e88d822efacbb3b4ea5b95d356" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/utils/marlin_utils_test.py", + "blob_id": "83faac032ca93b3564c620c5b4b1ef63c74aaddf" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/utils/quant_utils.py", + "blob_id": "5819ab753e57655185572ce1e49c24e6268171b4" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/__init__.py", + "blob_id": "cc806778863c03ccb3157343cd6331c1c6ca332c" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/_moe_wfwoejktckaue.abi3.so", + "blob_id": "a679d38667a74beffaca30bb9c6628c6b7d0b1c0" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/_ops.py", + "blob_id": "9aebf2be86d230b6de5510163c7b53dcc3aa7c51" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "56c1a4e3af0b4a93fff71028d8e04bf73f0abb29" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "d3677bebb82a7f3f19344ef6471626493cf2c5bb" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "265768fb900ccfe9612b4a0d25973e6618f22a79" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "d3be23dfc903ba61d3d4d79c0230952b24d2ead0" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "589f5d39f31418d5121e7cbb2e6f2894b0a7ed32" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", + "blob_id": "2c78bfaba7890772bf266721f5577202ea443882" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "4da841e74a79f9589fecac1fa557ea132d34805f" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "200356713c0d0a76e199671c7ec8f10d0e5ee0ac" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "e076615ee541a5043556f630ecf0946c4e2c1408" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "ee896554b921040d7810bb6e9368cc200777951d" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "05aed8b1c81492151d128ef251afc510d8cc8ed5" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json", + "blob_id": "9262a74a4a0e1e3789f260a3ef7f6cb9551f3f2b" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "d251f9b5accaec977fc87a0999cd56ee387fc650" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "0ecf814a28a9441e89f892eb3d63dcf8dcb0dd97" + }, + { + 
"filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "51ad5b299eb22465fa80530d12bdd5d7a03ce398" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "ee5119182556cf49434c10e56cf04e3baeb26408" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "68793c77b33c4f4b97d0a4b780fcbe8043c799de" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "612910720ed9439e56c4af4c03f30fee224fac80" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "039a10ed127b77836a7f41c03513292613852b30" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "3793fcafee60bc7e8f5f12d601cb3192abfa9ca8" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "51d03d8607122d7b9bc20ba48d8432d62367fa00" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", + "blob_id": "26f9abd6b789e9dd0f83ec7721fd1bae8aa76bec" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "cd0cdbea0c3372674cb610870dd0b30325864549" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "64be6e6591422aa0f441c3747b6c49850929652e" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "0a6a6a73fa45e270f01ba7ebdc6d9d55bf9daad3" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "ba9041d008507e31ae4179ef2bc863a49c606582" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "7a7508aab04599cb06641c835d8b0a14f54d0716" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "dbf9a2dd6f048d8adee290961e2aea72035f7615" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", + "blob_id": "bbb2386046b1135a2cc7ab7cb26c1d0b039bcf3a" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "57055453aa24c831dad9ac8e37fdab707c63ef91" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "8cc6c643f236d2f7f9ad29354d9e469d00b20d3f" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "d4c9ddd12972ac0b5fd2be11a9cd1075906e3978" + }, + { + "filename": 
"build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "b2799ed3a866e25b78d60d92910c000ebb21ff71" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "b8d3be2313fa14025d8aeb2fd11e0d1ee997ffa6" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json", + "blob_id": "6a976788f9b10af19ebcfe582a69cbc627f9457b" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "3f3ccdafa88f3452a695efad4cb9622d6ae79e6a" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json", + "blob_id": "0a46390b2e31bba6a7c3ab2c9f6c8de6004857bb" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json", + "blob_id": "f4c0f8417b384870050a95e0cf57edbdf6352b23" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "5c8185cfdeec167ec4b88de51b4b395e28769cc5" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "97c9f4445b166657ad29f1db9fc8281f9c463ec4" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "0bb423b28f5ab3825929a4870b96393262a9dd9f" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "55571873395464a3b58f549523905f439a8f1716" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "26bcbf26970c7a77c99e2c8eacd83eefa86967bf" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json", + "blob_id": "91011e64c7de4505e9bb462bc70e6a3e7affa878" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json", + "blob_id": "b41f9d443e50678334f906b44fce6d018d69500e" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "edf2a38d12ad3f420f232d2cd61ab149ad138725" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "673bae2ba8ef80ed4d4930739ca7daf0e8f28ee1" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "b2100cebb7f589747430be9ca8c8db368c152d78" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json", + "blob_id": "d720deb4bdd73d194b1023c99e190b8fcfecdaef" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "dbc624731f5cb9afcdc9213183d00d1e5edd4a00" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "cc614e635ea57327c610ce79e99ae5339614f22e" + }, + { + "filename": 
"build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "32c0c9da471cbe479044095e0ed14a0f54b73620" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json", + "blob_id": "f807d4a5abaed9dd686df26837f2dd9f6161300f" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "f578c8d0160ac3ef85b53c8539d3675455a97173" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "918f6839620cbab1f30b0f9383a9129c2cf2cf3d" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "e341a67917d5177bacb3f6767e7b6d92539826ad" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "34b916e574f88c65db1dac5889d74a990dc25e9b" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "6496a38fba8ae09b3025a75f357815b9d6a5e3f4" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "3618053b65831b95c4bb0f20ef3b9aa816b2d637" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "46a982f5ee9a4bd67ce244b101c576efeeb53b78" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "035ec027fa56622196b24a03a5042ce010deaebf" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "8b49f2781cb54d19a2789767ebb7e8c3fb55b981" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "851bc9f9f0b50b41451b929eaa518869b6a05412" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "d1227c2157990216d2ca51c69ad0944017f53b6a" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "77ba0d7477bdbcb036a43263e7aaa6b6913f8f4e" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "1c61451fb34e52deec827f8f63c80fb15830c202" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "63e661c80de6a7b1422f7a994a2ee7a4b724911c" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "cf354037903c0d1fcd077c4647aabce026a723fb" + }, + { + "filename": 
"build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "0a5d7bfdba4852da9ed08d1bc27cd7d521d09965" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "eccb86a76df0d7302b760ab6d83a8ceb9fa9d0d9" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "cb91a279d423d0ca25197e0edd5e8c2f4da58720" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "88af48431d8b8791af8df03429704606b670f1f7" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "dd069726d7ed4dcbb449af243f4f4af21815f854" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "7febe3d272b4bb76500f7c6b523396129fd53680" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "56b939e52fac3ed53a4e0ba640c40010cb3af30a" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "63d9a0bf5d79ddaaad547d44338ad4b959ad72b1" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "7fa398c15a2a535401709b0f25e20f6e4b23e58e" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "f15d8f64c7090bd71d0091a524c65d7818fec38e" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "9d7658bfc41b2c8fd4daf3fbdf62d15936d3d546" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "cd3e07804fdec10c2cfb291c1ede3ba67b753f9c" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "9d5a329d7466a37c0ca68a65a089fbb99f9327a9" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "03dba5ad15ba5f7f49100a5c78e8685e64334b2a" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "96e1594a3eabbaedc792b84b07f05ae8752b7251" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "5ffd367df833d773355590220598a3c7eceba4e0" + }, + { + "filename": 
"build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "9a5ff48b8942957dde9b862aed848390dd267948" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "eabc423949a24c2a1fb2368a73e5249caf8d07df" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "386928de139ce718f28222b9c1a6555df3958491" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "51e237b91b8e775a36bcf783c078c2c1cecbcbd2" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "6280219c9ee7d26f7e2fd3625dc92d847ddc7982" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "40c01c0b92b4b26fe480879dda33f18c5eb59a6d" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "c6fd3659799bc31e17f3577e7f0e8d7268faf1fb" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "160f12ed3f95a6967439ff53bc3e3a2cdc97c700" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "e5c4a1d2c94e5c7864f462e083ea5f530b8efe3f" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "2bf5eb27e38208871d50348b170c8c74b80fc519" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "9c908e80406587da4d246ce4e3a8a98a14c875b1" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "0a1e14cffbb2a894a701352193947d272427db0d" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "15b1c93f60fc5068ba11b82b6d5924dd2024a824" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "f78e7060e6840ff721d306db556636b0bbc8d9b3" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "8ff12e64c172f5a5d0fbdf900728fe60b33877e2" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "4532f93681e2be175b1bf94f81bfde711821cd60" + }, + { + "filename": 
"build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "1d3ce5c94c2d9a4a1637204efb3b14f7a5579bdb" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "ca7f32b9552b479dc05495792b7e426db5eb1b56" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "5acea242cc0ad094cba8ee5f568ff88afb1b41ae" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "3ab5796ee15b6ec8d4ab1f4ab5a594fecb30e4b4" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "58cdd93e90b8c29bc7a211861711565dbeeb529a" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "b72e0371d1421a1decc9d57860f83eea8f790942" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "3cb7eaa07c745fd3aa2b3242780a7061bedac1de" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "293adce387e066fce75b6e606d4b8b6a5aa10bdb" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/fp8.py", + "blob_id": "23bd7d6703104b0020671cc6ba6f78a6df37e4bf" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/fp8_utils.py", + "blob_id": "acb4f3e3bb1a34f209fdac9ecca8c123aaf67f12" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/fused_marlin_moe.py", + "blob_id": "b3e0a5c24599730faf973fad3cf3fb6031a30522" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/fused_moe.py", + "blob_id": "af2d798cbe5d7c3c1760ce79f717ab5f6d7700ba" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/platforms.py", + "blob_id": "735fab87f2add390f7bf6408ebe31d1f5de6d02b" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/scalar_type.py", + "blob_id": "ea749fe8247b6846620ccbba30ddf48d914ca4e1" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/utils/__init__.py", + "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/utils/marlin_utils.py", + "blob_id": "5037f774b8a8b7e88d822efacbb3b4ea5b95d356" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/utils/marlin_utils_test.py", + "blob_id": "83faac032ca93b3564c620c5b4b1ef63c74aaddf" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/utils/quant_utils.py", + "blob_id": "5819ab753e57655185572ce1e49c24e6268171b4" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/__init__.py", + "blob_id": "cc806778863c03ccb3157343cd6331c1c6ca332c" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/_moe_plhnk6yxrdq3c.abi3.so", + "blob_id": "a16cd30c7ff53b3d73fa081369b6443efa5fb184" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/_ops.py", + "blob_id": "5b5d43b13c586ead5f177bcb71ba17c078eb016d" + }, + 
{ + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "56c1a4e3af0b4a93fff71028d8e04bf73f0abb29" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "d3677bebb82a7f3f19344ef6471626493cf2c5bb" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "265768fb900ccfe9612b4a0d25973e6618f22a79" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "d3be23dfc903ba61d3d4d79c0230952b24d2ead0" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "589f5d39f31418d5121e7cbb2e6f2894b0a7ed32" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", + "blob_id": "2c78bfaba7890772bf266721f5577202ea443882" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "4da841e74a79f9589fecac1fa557ea132d34805f" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "200356713c0d0a76e199671c7ec8f10d0e5ee0ac" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "e076615ee541a5043556f630ecf0946c4e2c1408" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "ee896554b921040d7810bb6e9368cc200777951d" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "05aed8b1c81492151d128ef251afc510d8cc8ed5" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json", + "blob_id": "9262a74a4a0e1e3789f260a3ef7f6cb9551f3f2b" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "d251f9b5accaec977fc87a0999cd56ee387fc650" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "0ecf814a28a9441e89f892eb3d63dcf8dcb0dd97" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "51ad5b299eb22465fa80530d12bdd5d7a03ce398" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "ee5119182556cf49434c10e56cf04e3baeb26408" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "68793c77b33c4f4b97d0a4b780fcbe8043c799de" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "612910720ed9439e56c4af4c03f30fee224fac80" + }, + { + "filename": 
"build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "039a10ed127b77836a7f41c03513292613852b30" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "3793fcafee60bc7e8f5f12d601cb3192abfa9ca8" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "51d03d8607122d7b9bc20ba48d8432d62367fa00" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", + "blob_id": "26f9abd6b789e9dd0f83ec7721fd1bae8aa76bec" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "cd0cdbea0c3372674cb610870dd0b30325864549" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "64be6e6591422aa0f441c3747b6c49850929652e" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "0a6a6a73fa45e270f01ba7ebdc6d9d55bf9daad3" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "ba9041d008507e31ae4179ef2bc863a49c606582" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "7a7508aab04599cb06641c835d8b0a14f54d0716" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "dbf9a2dd6f048d8adee290961e2aea72035f7615" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", + "blob_id": "bbb2386046b1135a2cc7ab7cb26c1d0b039bcf3a" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "57055453aa24c831dad9ac8e37fdab707c63ef91" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "8cc6c643f236d2f7f9ad29354d9e469d00b20d3f" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "d4c9ddd12972ac0b5fd2be11a9cd1075906e3978" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "b2799ed3a866e25b78d60d92910c000ebb21ff71" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "b8d3be2313fa14025d8aeb2fd11e0d1ee997ffa6" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json", + "blob_id": "6a976788f9b10af19ebcfe582a69cbc627f9457b" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "3f3ccdafa88f3452a695efad4cb9622d6ae79e6a" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json", 
+ "blob_id": "0a46390b2e31bba6a7c3ab2c9f6c8de6004857bb" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json", + "blob_id": "f4c0f8417b384870050a95e0cf57edbdf6352b23" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "5c8185cfdeec167ec4b88de51b4b395e28769cc5" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "97c9f4445b166657ad29f1db9fc8281f9c463ec4" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "0bb423b28f5ab3825929a4870b96393262a9dd9f" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "55571873395464a3b58f549523905f439a8f1716" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "26bcbf26970c7a77c99e2c8eacd83eefa86967bf" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json", + "blob_id": "91011e64c7de4505e9bb462bc70e6a3e7affa878" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json", + "blob_id": "b41f9d443e50678334f906b44fce6d018d69500e" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "edf2a38d12ad3f420f232d2cd61ab149ad138725" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "673bae2ba8ef80ed4d4930739ca7daf0e8f28ee1" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "b2100cebb7f589747430be9ca8c8db368c152d78" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json", + "blob_id": "d720deb4bdd73d194b1023c99e190b8fcfecdaef" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "dbc624731f5cb9afcdc9213183d00d1e5edd4a00" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "cc614e635ea57327c610ce79e99ae5339614f22e" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "32c0c9da471cbe479044095e0ed14a0f54b73620" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json", + "blob_id": "f807d4a5abaed9dd686df26837f2dd9f6161300f" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "f578c8d0160ac3ef85b53c8539d3675455a97173" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "918f6839620cbab1f30b0f9383a9129c2cf2cf3d" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": 
"e341a67917d5177bacb3f6767e7b6d92539826ad" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "34b916e574f88c65db1dac5889d74a990dc25e9b" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "6496a38fba8ae09b3025a75f357815b9d6a5e3f4" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "3618053b65831b95c4bb0f20ef3b9aa816b2d637" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "46a982f5ee9a4bd67ce244b101c576efeeb53b78" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "035ec027fa56622196b24a03a5042ce010deaebf" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "8b49f2781cb54d19a2789767ebb7e8c3fb55b981" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "851bc9f9f0b50b41451b929eaa518869b6a05412" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "d1227c2157990216d2ca51c69ad0944017f53b6a" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "77ba0d7477bdbcb036a43263e7aaa6b6913f8f4e" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "1c61451fb34e52deec827f8f63c80fb15830c202" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "63e661c80de6a7b1422f7a994a2ee7a4b724911c" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "cf354037903c0d1fcd077c4647aabce026a723fb" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "0a5d7bfdba4852da9ed08d1bc27cd7d521d09965" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "eccb86a76df0d7302b760ab6d83a8ceb9fa9d0d9" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "cb91a279d423d0ca25197e0edd5e8c2f4da58720" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "88af48431d8b8791af8df03429704606b670f1f7" + }, + { + "filename": 
"build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "dd069726d7ed4dcbb449af243f4f4af21815f854" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "7febe3d272b4bb76500f7c6b523396129fd53680" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "56b939e52fac3ed53a4e0ba640c40010cb3af30a" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "63d9a0bf5d79ddaaad547d44338ad4b959ad72b1" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "7fa398c15a2a535401709b0f25e20f6e4b23e58e" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "f15d8f64c7090bd71d0091a524c65d7818fec38e" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "9d7658bfc41b2c8fd4daf3fbdf62d15936d3d546" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "cd3e07804fdec10c2cfb291c1ede3ba67b753f9c" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "9d5a329d7466a37c0ca68a65a089fbb99f9327a9" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "03dba5ad15ba5f7f49100a5c78e8685e64334b2a" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "96e1594a3eabbaedc792b84b07f05ae8752b7251" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "5ffd367df833d773355590220598a3c7eceba4e0" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "9a5ff48b8942957dde9b862aed848390dd267948" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "eabc423949a24c2a1fb2368a73e5249caf8d07df" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "386928de139ce718f28222b9c1a6555df3958491" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "51e237b91b8e775a36bcf783c078c2c1cecbcbd2" + }, + { + "filename": 
"build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "6280219c9ee7d26f7e2fd3625dc92d847ddc7982" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "40c01c0b92b4b26fe480879dda33f18c5eb59a6d" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "c6fd3659799bc31e17f3577e7f0e8d7268faf1fb" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "160f12ed3f95a6967439ff53bc3e3a2cdc97c700" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "e5c4a1d2c94e5c7864f462e083ea5f530b8efe3f" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "2bf5eb27e38208871d50348b170c8c74b80fc519" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "9c908e80406587da4d246ce4e3a8a98a14c875b1" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "0a1e14cffbb2a894a701352193947d272427db0d" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "15b1c93f60fc5068ba11b82b6d5924dd2024a824" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "f78e7060e6840ff721d306db556636b0bbc8d9b3" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "8ff12e64c172f5a5d0fbdf900728fe60b33877e2" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "4532f93681e2be175b1bf94f81bfde711821cd60" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "1d3ce5c94c2d9a4a1637204efb3b14f7a5579bdb" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "ca7f32b9552b479dc05495792b7e426db5eb1b56" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "5acea242cc0ad094cba8ee5f568ff88afb1b41ae" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "3ab5796ee15b6ec8d4ab1f4ab5a594fecb30e4b4" + }, + { + "filename": 
"build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "58cdd93e90b8c29bc7a211861711565dbeeb529a" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "b72e0371d1421a1decc9d57860f83eea8f790942" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "3cb7eaa07c745fd3aa2b3242780a7061bedac1de" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "293adce387e066fce75b6e606d4b8b6a5aa10bdb" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/fp8.py", + "blob_id": "23bd7d6703104b0020671cc6ba6f78a6df37e4bf" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/fp8_utils.py", + "blob_id": "acb4f3e3bb1a34f209fdac9ecca8c123aaf67f12" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/fused_marlin_moe.py", + "blob_id": "b3e0a5c24599730faf973fad3cf3fb6031a30522" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/fused_moe.py", + "blob_id": "af2d798cbe5d7c3c1760ce79f717ab5f6d7700ba" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/platforms.py", + "blob_id": "735fab87f2add390f7bf6408ebe31d1f5de6d02b" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/scalar_type.py", + "blob_id": "ea749fe8247b6846620ccbba30ddf48d914ca4e1" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/utils/__init__.py", + "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/utils/marlin_utils.py", + "blob_id": "5037f774b8a8b7e88d822efacbb3b4ea5b95d356" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/utils/marlin_utils_test.py", + "blob_id": "83faac032ca93b3564c620c5b4b1ef63c74aaddf" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/utils/quant_utils.py", + "blob_id": "5819ab753e57655185572ce1e49c24e6268171b4" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/__init__.py", + "blob_id": "cc806778863c03ccb3157343cd6331c1c6ca332c" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/_moe_mrhezivmofzdg.abi3.so", + "blob_id": "5423719c9ac75f96528fc0b7386a108aedc996b1" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/_ops.py", + "blob_id": "f78c4d78eceaa86a9d245eea4d5562167db8f59b" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "56c1a4e3af0b4a93fff71028d8e04bf73f0abb29" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "d3677bebb82a7f3f19344ef6471626493cf2c5bb" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "265768fb900ccfe9612b4a0d25973e6618f22a79" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "d3be23dfc903ba61d3d4d79c0230952b24d2ead0" + }, + { + "filename": 
"build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "589f5d39f31418d5121e7cbb2e6f2894b0a7ed32" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", + "blob_id": "2c78bfaba7890772bf266721f5577202ea443882" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "4da841e74a79f9589fecac1fa557ea132d34805f" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "200356713c0d0a76e199671c7ec8f10d0e5ee0ac" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "e076615ee541a5043556f630ecf0946c4e2c1408" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "ee896554b921040d7810bb6e9368cc200777951d" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "05aed8b1c81492151d128ef251afc510d8cc8ed5" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json", + "blob_id": "9262a74a4a0e1e3789f260a3ef7f6cb9551f3f2b" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "d251f9b5accaec977fc87a0999cd56ee387fc650" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "0ecf814a28a9441e89f892eb3d63dcf8dcb0dd97" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "51ad5b299eb22465fa80530d12bdd5d7a03ce398" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "ee5119182556cf49434c10e56cf04e3baeb26408" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "68793c77b33c4f4b97d0a4b780fcbe8043c799de" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "612910720ed9439e56c4af4c03f30fee224fac80" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "039a10ed127b77836a7f41c03513292613852b30" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "3793fcafee60bc7e8f5f12d601cb3192abfa9ca8" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "51d03d8607122d7b9bc20ba48d8432d62367fa00" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", + "blob_id": "26f9abd6b789e9dd0f83ec7721fd1bae8aa76bec" + }, + { + "filename": 
"build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "cd0cdbea0c3372674cb610870dd0b30325864549" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "64be6e6591422aa0f441c3747b6c49850929652e" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "0a6a6a73fa45e270f01ba7ebdc6d9d55bf9daad3" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "ba9041d008507e31ae4179ef2bc863a49c606582" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "7a7508aab04599cb06641c835d8b0a14f54d0716" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "dbf9a2dd6f048d8adee290961e2aea72035f7615" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", + "blob_id": "bbb2386046b1135a2cc7ab7cb26c1d0b039bcf3a" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "57055453aa24c831dad9ac8e37fdab707c63ef91" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "8cc6c643f236d2f7f9ad29354d9e469d00b20d3f" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "d4c9ddd12972ac0b5fd2be11a9cd1075906e3978" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "b2799ed3a866e25b78d60d92910c000ebb21ff71" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "b8d3be2313fa14025d8aeb2fd11e0d1ee997ffa6" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json", + "blob_id": "6a976788f9b10af19ebcfe582a69cbc627f9457b" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "3f3ccdafa88f3452a695efad4cb9622d6ae79e6a" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json", + "blob_id": "0a46390b2e31bba6a7c3ab2c9f6c8de6004857bb" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json", + "blob_id": "f4c0f8417b384870050a95e0cf57edbdf6352b23" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "5c8185cfdeec167ec4b88de51b4b395e28769cc5" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "97c9f4445b166657ad29f1db9fc8281f9c463ec4" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": 
"0bb423b28f5ab3825929a4870b96393262a9dd9f" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "55571873395464a3b58f549523905f439a8f1716" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "26bcbf26970c7a77c99e2c8eacd83eefa86967bf" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json", + "blob_id": "91011e64c7de4505e9bb462bc70e6a3e7affa878" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json", + "blob_id": "b41f9d443e50678334f906b44fce6d018d69500e" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "edf2a38d12ad3f420f232d2cd61ab149ad138725" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "673bae2ba8ef80ed4d4930739ca7daf0e8f28ee1" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "b2100cebb7f589747430be9ca8c8db368c152d78" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json", + "blob_id": "d720deb4bdd73d194b1023c99e190b8fcfecdaef" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "dbc624731f5cb9afcdc9213183d00d1e5edd4a00" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "cc614e635ea57327c610ce79e99ae5339614f22e" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "32c0c9da471cbe479044095e0ed14a0f54b73620" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json", + "blob_id": "f807d4a5abaed9dd686df26837f2dd9f6161300f" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "f578c8d0160ac3ef85b53c8539d3675455a97173" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "918f6839620cbab1f30b0f9383a9129c2cf2cf3d" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "e341a67917d5177bacb3f6767e7b6d92539826ad" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "34b916e574f88c65db1dac5889d74a990dc25e9b" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "6496a38fba8ae09b3025a75f357815b9d6a5e3f4" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "3618053b65831b95c4bb0f20ef3b9aa816b2d637" + }, + { + "filename": 
"build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "46a982f5ee9a4bd67ce244b101c576efeeb53b78" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "035ec027fa56622196b24a03a5042ce010deaebf" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "8b49f2781cb54d19a2789767ebb7e8c3fb55b981" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "851bc9f9f0b50b41451b929eaa518869b6a05412" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "d1227c2157990216d2ca51c69ad0944017f53b6a" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "77ba0d7477bdbcb036a43263e7aaa6b6913f8f4e" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "1c61451fb34e52deec827f8f63c80fb15830c202" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "63e661c80de6a7b1422f7a994a2ee7a4b724911c" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "cf354037903c0d1fcd077c4647aabce026a723fb" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "0a5d7bfdba4852da9ed08d1bc27cd7d521d09965" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "eccb86a76df0d7302b760ab6d83a8ceb9fa9d0d9" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "cb91a279d423d0ca25197e0edd5e8c2f4da58720" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "88af48431d8b8791af8df03429704606b670f1f7" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "dd069726d7ed4dcbb449af243f4f4af21815f854" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "7febe3d272b4bb76500f7c6b523396129fd53680" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "56b939e52fac3ed53a4e0ba640c40010cb3af30a" + }, + { + "filename": 
"build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "63d9a0bf5d79ddaaad547d44338ad4b959ad72b1" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "7fa398c15a2a535401709b0f25e20f6e4b23e58e" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "f15d8f64c7090bd71d0091a524c65d7818fec38e" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "9d7658bfc41b2c8fd4daf3fbdf62d15936d3d546" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "cd3e07804fdec10c2cfb291c1ede3ba67b753f9c" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "9d5a329d7466a37c0ca68a65a089fbb99f9327a9" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "03dba5ad15ba5f7f49100a5c78e8685e64334b2a" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "96e1594a3eabbaedc792b84b07f05ae8752b7251" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "5ffd367df833d773355590220598a3c7eceba4e0" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "9a5ff48b8942957dde9b862aed848390dd267948" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "eabc423949a24c2a1fb2368a73e5249caf8d07df" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "386928de139ce718f28222b9c1a6555df3958491" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "51e237b91b8e775a36bcf783c078c2c1cecbcbd2" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "6280219c9ee7d26f7e2fd3625dc92d847ddc7982" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "40c01c0b92b4b26fe480879dda33f18c5eb59a6d" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "c6fd3659799bc31e17f3577e7f0e8d7268faf1fb" + }, + { + "filename": 
"build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "160f12ed3f95a6967439ff53bc3e3a2cdc97c700" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "e5c4a1d2c94e5c7864f462e083ea5f530b8efe3f" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "2bf5eb27e38208871d50348b170c8c74b80fc519" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "9c908e80406587da4d246ce4e3a8a98a14c875b1" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "0a1e14cffbb2a894a701352193947d272427db0d" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "15b1c93f60fc5068ba11b82b6d5924dd2024a824" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "f78e7060e6840ff721d306db556636b0bbc8d9b3" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "8ff12e64c172f5a5d0fbdf900728fe60b33877e2" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "4532f93681e2be175b1bf94f81bfde711821cd60" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "1d3ce5c94c2d9a4a1637204efb3b14f7a5579bdb" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "ca7f32b9552b479dc05495792b7e426db5eb1b56" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "5acea242cc0ad094cba8ee5f568ff88afb1b41ae" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "3ab5796ee15b6ec8d4ab1f4ab5a594fecb30e4b4" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "58cdd93e90b8c29bc7a211861711565dbeeb529a" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "b72e0371d1421a1decc9d57860f83eea8f790942" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "3cb7eaa07c745fd3aa2b3242780a7061bedac1de" + }, + { + "filename": 
"build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "293adce387e066fce75b6e606d4b8b6a5aa10bdb" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/fp8.py", + "blob_id": "23bd7d6703104b0020671cc6ba6f78a6df37e4bf" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/fp8_utils.py", + "blob_id": "acb4f3e3bb1a34f209fdac9ecca8c123aaf67f12" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/fused_marlin_moe.py", + "blob_id": "b3e0a5c24599730faf973fad3cf3fb6031a30522" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/fused_moe.py", + "blob_id": "af2d798cbe5d7c3c1760ce79f717ab5f6d7700ba" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/platforms.py", + "blob_id": "735fab87f2add390f7bf6408ebe31d1f5de6d02b" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/scalar_type.py", + "blob_id": "ea749fe8247b6846620ccbba30ddf48d914ca4e1" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/utils/__init__.py", + "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/utils/marlin_utils.py", + "blob_id": "5037f774b8a8b7e88d822efacbb3b4ea5b95d356" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/utils/marlin_utils_test.py", + "blob_id": "83faac032ca93b3564c620c5b4b1ef63c74aaddf" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/utils/quant_utils.py", + "blob_id": "5819ab753e57655185572ce1e49c24e6268171b4" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/__init__.py", + "blob_id": "cc806778863c03ccb3157343cd6331c1c6ca332c" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/_moe_lvqy7x44edhqo.abi3.so", + "blob_id": "51f9c5792c1d7bcb03a8906e9bc60e779ba1b343" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/_ops.py", + "blob_id": "510c892dcc2479877c6c2fc5c20f6a534dc90d51" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "56c1a4e3af0b4a93fff71028d8e04bf73f0abb29" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "d3677bebb82a7f3f19344ef6471626493cf2c5bb" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "265768fb900ccfe9612b4a0d25973e6618f22a79" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "d3be23dfc903ba61d3d4d79c0230952b24d2ead0" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "589f5d39f31418d5121e7cbb2e6f2894b0a7ed32" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", + "blob_id": "2c78bfaba7890772bf266721f5577202ea443882" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "4da841e74a79f9589fecac1fa557ea132d34805f" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": 
"200356713c0d0a76e199671c7ec8f10d0e5ee0ac" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "e076615ee541a5043556f630ecf0946c4e2c1408" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "ee896554b921040d7810bb6e9368cc200777951d" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "05aed8b1c81492151d128ef251afc510d8cc8ed5" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json", + "blob_id": "9262a74a4a0e1e3789f260a3ef7f6cb9551f3f2b" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "d251f9b5accaec977fc87a0999cd56ee387fc650" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "0ecf814a28a9441e89f892eb3d63dcf8dcb0dd97" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "51ad5b299eb22465fa80530d12bdd5d7a03ce398" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "ee5119182556cf49434c10e56cf04e3baeb26408" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "68793c77b33c4f4b97d0a4b780fcbe8043c799de" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "612910720ed9439e56c4af4c03f30fee224fac80" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "039a10ed127b77836a7f41c03513292613852b30" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "3793fcafee60bc7e8f5f12d601cb3192abfa9ca8" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "51d03d8607122d7b9bc20ba48d8432d62367fa00" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", + "blob_id": "26f9abd6b789e9dd0f83ec7721fd1bae8aa76bec" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "cd0cdbea0c3372674cb610870dd0b30325864549" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "64be6e6591422aa0f441c3747b6c49850929652e" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "0a6a6a73fa45e270f01ba7ebdc6d9d55bf9daad3" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "ba9041d008507e31ae4179ef2bc863a49c606582" + }, + { + "filename": 
"build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "7a7508aab04599cb06641c835d8b0a14f54d0716" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "dbf9a2dd6f048d8adee290961e2aea72035f7615" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", + "blob_id": "bbb2386046b1135a2cc7ab7cb26c1d0b039bcf3a" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "57055453aa24c831dad9ac8e37fdab707c63ef91" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "8cc6c643f236d2f7f9ad29354d9e469d00b20d3f" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "d4c9ddd12972ac0b5fd2be11a9cd1075906e3978" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "b2799ed3a866e25b78d60d92910c000ebb21ff71" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "b8d3be2313fa14025d8aeb2fd11e0d1ee997ffa6" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json", + "blob_id": "6a976788f9b10af19ebcfe582a69cbc627f9457b" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "3f3ccdafa88f3452a695efad4cb9622d6ae79e6a" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json", + "blob_id": "0a46390b2e31bba6a7c3ab2c9f6c8de6004857bb" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json", + "blob_id": "f4c0f8417b384870050a95e0cf57edbdf6352b23" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "5c8185cfdeec167ec4b88de51b4b395e28769cc5" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "97c9f4445b166657ad29f1db9fc8281f9c463ec4" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "0bb423b28f5ab3825929a4870b96393262a9dd9f" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "55571873395464a3b58f549523905f439a8f1716" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "26bcbf26970c7a77c99e2c8eacd83eefa86967bf" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json", + "blob_id": "91011e64c7de4505e9bb462bc70e6a3e7affa878" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json", + "blob_id": "b41f9d443e50678334f906b44fce6d018d69500e" + }, + { + 
"filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "edf2a38d12ad3f420f232d2cd61ab149ad138725" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "673bae2ba8ef80ed4d4930739ca7daf0e8f28ee1" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "b2100cebb7f589747430be9ca8c8db368c152d78" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json", + "blob_id": "d720deb4bdd73d194b1023c99e190b8fcfecdaef" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "dbc624731f5cb9afcdc9213183d00d1e5edd4a00" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "cc614e635ea57327c610ce79e99ae5339614f22e" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "32c0c9da471cbe479044095e0ed14a0f54b73620" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json", + "blob_id": "f807d4a5abaed9dd686df26837f2dd9f6161300f" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "f578c8d0160ac3ef85b53c8539d3675455a97173" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "918f6839620cbab1f30b0f9383a9129c2cf2cf3d" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "e341a67917d5177bacb3f6767e7b6d92539826ad" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "34b916e574f88c65db1dac5889d74a990dc25e9b" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "6496a38fba8ae09b3025a75f357815b9d6a5e3f4" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "3618053b65831b95c4bb0f20ef3b9aa816b2d637" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "46a982f5ee9a4bd67ce244b101c576efeeb53b78" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "035ec027fa56622196b24a03a5042ce010deaebf" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "8b49f2781cb54d19a2789767ebb7e8c3fb55b981" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "851bc9f9f0b50b41451b929eaa518869b6a05412" 
+ }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "d1227c2157990216d2ca51c69ad0944017f53b6a" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "77ba0d7477bdbcb036a43263e7aaa6b6913f8f4e" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "1c61451fb34e52deec827f8f63c80fb15830c202" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "63e661c80de6a7b1422f7a994a2ee7a4b724911c" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "cf354037903c0d1fcd077c4647aabce026a723fb" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "0a5d7bfdba4852da9ed08d1bc27cd7d521d09965" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "eccb86a76df0d7302b760ab6d83a8ceb9fa9d0d9" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "cb91a279d423d0ca25197e0edd5e8c2f4da58720" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "88af48431d8b8791af8df03429704606b670f1f7" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "dd069726d7ed4dcbb449af243f4f4af21815f854" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "7febe3d272b4bb76500f7c6b523396129fd53680" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "56b939e52fac3ed53a4e0ba640c40010cb3af30a" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "63d9a0bf5d79ddaaad547d44338ad4b959ad72b1" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "7fa398c15a2a535401709b0f25e20f6e4b23e58e" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "f15d8f64c7090bd71d0091a524c65d7818fec38e" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "9d7658bfc41b2c8fd4daf3fbdf62d15936d3d546" + }, + { + "filename": 
"build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "cd3e07804fdec10c2cfb291c1ede3ba67b753f9c" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "9d5a329d7466a37c0ca68a65a089fbb99f9327a9" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "03dba5ad15ba5f7f49100a5c78e8685e64334b2a" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "96e1594a3eabbaedc792b84b07f05ae8752b7251" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "5ffd367df833d773355590220598a3c7eceba4e0" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "9a5ff48b8942957dde9b862aed848390dd267948" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "eabc423949a24c2a1fb2368a73e5249caf8d07df" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "386928de139ce718f28222b9c1a6555df3958491" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "51e237b91b8e775a36bcf783c078c2c1cecbcbd2" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "6280219c9ee7d26f7e2fd3625dc92d847ddc7982" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "40c01c0b92b4b26fe480879dda33f18c5eb59a6d" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "c6fd3659799bc31e17f3577e7f0e8d7268faf1fb" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "160f12ed3f95a6967439ff53bc3e3a2cdc97c700" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "e5c4a1d2c94e5c7864f462e083ea5f530b8efe3f" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "2bf5eb27e38208871d50348b170c8c74b80fc519" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "9c908e80406587da4d246ce4e3a8a98a14c875b1" + }, + { + "filename": 
"build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "0a1e14cffbb2a894a701352193947d272427db0d" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "15b1c93f60fc5068ba11b82b6d5924dd2024a824" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "f78e7060e6840ff721d306db556636b0bbc8d9b3" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "8ff12e64c172f5a5d0fbdf900728fe60b33877e2" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "4532f93681e2be175b1bf94f81bfde711821cd60" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "1d3ce5c94c2d9a4a1637204efb3b14f7a5579bdb" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "ca7f32b9552b479dc05495792b7e426db5eb1b56" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "5acea242cc0ad094cba8ee5f568ff88afb1b41ae" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "3ab5796ee15b6ec8d4ab1f4ab5a594fecb30e4b4" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "58cdd93e90b8c29bc7a211861711565dbeeb529a" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "b72e0371d1421a1decc9d57860f83eea8f790942" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "3cb7eaa07c745fd3aa2b3242780a7061bedac1de" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "293adce387e066fce75b6e606d4b8b6a5aa10bdb" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/fp8.py", + "blob_id": "23bd7d6703104b0020671cc6ba6f78a6df37e4bf" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/fp8_utils.py", + "blob_id": "acb4f3e3bb1a34f209fdac9ecca8c123aaf67f12" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/fused_marlin_moe.py", + "blob_id": "b3e0a5c24599730faf973fad3cf3fb6031a30522" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/fused_moe.py", + "blob_id": "af2d798cbe5d7c3c1760ce79f717ab5f6d7700ba" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/platforms.py", + "blob_id": "735fab87f2add390f7bf6408ebe31d1f5de6d02b" + }, + { + "filename": 
"build/torch26-cxx11-cu124-x86_64-linux/moe/scalar_type.py", + "blob_id": "ea749fe8247b6846620ccbba30ddf48d914ca4e1" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/utils/__init__.py", + "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/utils/marlin_utils.py", + "blob_id": "5037f774b8a8b7e88d822efacbb3b4ea5b95d356" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/utils/marlin_utils_test.py", + "blob_id": "83faac032ca93b3564c620c5b4b1ef63c74aaddf" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/utils/quant_utils.py", + "blob_id": "5819ab753e57655185572ce1e49c24e6268171b4" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/__init__.py", + "blob_id": "cc806778863c03ccb3157343cd6331c1c6ca332c" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/_moe_gasoel7noy6kw.abi3.so", + "blob_id": "cd914e2830fbe3fbdcf31c5fa2f37c384d2c36d5" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/_ops.py", + "blob_id": "c69fb498baf329dda803ec0f90dc4b7756fb1ff0" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "56c1a4e3af0b4a93fff71028d8e04bf73f0abb29" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "d3677bebb82a7f3f19344ef6471626493cf2c5bb" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "265768fb900ccfe9612b4a0d25973e6618f22a79" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "d3be23dfc903ba61d3d4d79c0230952b24d2ead0" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "589f5d39f31418d5121e7cbb2e6f2894b0a7ed32" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", + "blob_id": "2c78bfaba7890772bf266721f5577202ea443882" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "4da841e74a79f9589fecac1fa557ea132d34805f" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "200356713c0d0a76e199671c7ec8f10d0e5ee0ac" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "e076615ee541a5043556f630ecf0946c4e2c1408" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "ee896554b921040d7810bb6e9368cc200777951d" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "05aed8b1c81492151d128ef251afc510d8cc8ed5" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json", + "blob_id": "9262a74a4a0e1e3789f260a3ef7f6cb9551f3f2b" + }, + { + "filename": 
"build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "d251f9b5accaec977fc87a0999cd56ee387fc650" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "0ecf814a28a9441e89f892eb3d63dcf8dcb0dd97" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "51ad5b299eb22465fa80530d12bdd5d7a03ce398" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "ee5119182556cf49434c10e56cf04e3baeb26408" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "68793c77b33c4f4b97d0a4b780fcbe8043c799de" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "612910720ed9439e56c4af4c03f30fee224fac80" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "039a10ed127b77836a7f41c03513292613852b30" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "3793fcafee60bc7e8f5f12d601cb3192abfa9ca8" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "51d03d8607122d7b9bc20ba48d8432d62367fa00" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", + "blob_id": "26f9abd6b789e9dd0f83ec7721fd1bae8aa76bec" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "cd0cdbea0c3372674cb610870dd0b30325864549" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "64be6e6591422aa0f441c3747b6c49850929652e" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "0a6a6a73fa45e270f01ba7ebdc6d9d55bf9daad3" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "ba9041d008507e31ae4179ef2bc863a49c606582" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "7a7508aab04599cb06641c835d8b0a14f54d0716" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "dbf9a2dd6f048d8adee290961e2aea72035f7615" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", + "blob_id": "bbb2386046b1135a2cc7ab7cb26c1d0b039bcf3a" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "57055453aa24c831dad9ac8e37fdab707c63ef91" + }, + { + "filename": 
"build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "8cc6c643f236d2f7f9ad29354d9e469d00b20d3f" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "d4c9ddd12972ac0b5fd2be11a9cd1075906e3978" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "b2799ed3a866e25b78d60d92910c000ebb21ff71" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "b8d3be2313fa14025d8aeb2fd11e0d1ee997ffa6" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json", + "blob_id": "6a976788f9b10af19ebcfe582a69cbc627f9457b" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "3f3ccdafa88f3452a695efad4cb9622d6ae79e6a" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json", + "blob_id": "0a46390b2e31bba6a7c3ab2c9f6c8de6004857bb" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json", + "blob_id": "f4c0f8417b384870050a95e0cf57edbdf6352b23" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "5c8185cfdeec167ec4b88de51b4b395e28769cc5" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "97c9f4445b166657ad29f1db9fc8281f9c463ec4" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "0bb423b28f5ab3825929a4870b96393262a9dd9f" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "55571873395464a3b58f549523905f439a8f1716" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "26bcbf26970c7a77c99e2c8eacd83eefa86967bf" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json", + "blob_id": "91011e64c7de4505e9bb462bc70e6a3e7affa878" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json", + "blob_id": "b41f9d443e50678334f906b44fce6d018d69500e" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "edf2a38d12ad3f420f232d2cd61ab149ad138725" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "673bae2ba8ef80ed4d4930739ca7daf0e8f28ee1" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "b2100cebb7f589747430be9ca8c8db368c152d78" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json", + "blob_id": "d720deb4bdd73d194b1023c99e190b8fcfecdaef" + }, + { + "filename": 
"build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "dbc624731f5cb9afcdc9213183d00d1e5edd4a00" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "cc614e635ea57327c610ce79e99ae5339614f22e" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "32c0c9da471cbe479044095e0ed14a0f54b73620" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json", + "blob_id": "f807d4a5abaed9dd686df26837f2dd9f6161300f" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "f578c8d0160ac3ef85b53c8539d3675455a97173" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "918f6839620cbab1f30b0f9383a9129c2cf2cf3d" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "e341a67917d5177bacb3f6767e7b6d92539826ad" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "34b916e574f88c65db1dac5889d74a990dc25e9b" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "6496a38fba8ae09b3025a75f357815b9d6a5e3f4" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "3618053b65831b95c4bb0f20ef3b9aa816b2d637" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "46a982f5ee9a4bd67ce244b101c576efeeb53b78" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "035ec027fa56622196b24a03a5042ce010deaebf" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "8b49f2781cb54d19a2789767ebb7e8c3fb55b981" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "851bc9f9f0b50b41451b929eaa518869b6a05412" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "d1227c2157990216d2ca51c69ad0944017f53b6a" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "77ba0d7477bdbcb036a43263e7aaa6b6913f8f4e" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "1c61451fb34e52deec827f8f63c80fb15830c202" + }, + { + "filename": 
"build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "63e661c80de6a7b1422f7a994a2ee7a4b724911c" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "cf354037903c0d1fcd077c4647aabce026a723fb" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "0a5d7bfdba4852da9ed08d1bc27cd7d521d09965" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "eccb86a76df0d7302b760ab6d83a8ceb9fa9d0d9" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "cb91a279d423d0ca25197e0edd5e8c2f4da58720" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "88af48431d8b8791af8df03429704606b670f1f7" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "dd069726d7ed4dcbb449af243f4f4af21815f854" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "7febe3d272b4bb76500f7c6b523396129fd53680" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "56b939e52fac3ed53a4e0ba640c40010cb3af30a" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "63d9a0bf5d79ddaaad547d44338ad4b959ad72b1" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "7fa398c15a2a535401709b0f25e20f6e4b23e58e" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "f15d8f64c7090bd71d0091a524c65d7818fec38e" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "9d7658bfc41b2c8fd4daf3fbdf62d15936d3d546" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "cd3e07804fdec10c2cfb291c1ede3ba67b753f9c" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "9d5a329d7466a37c0ca68a65a089fbb99f9327a9" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "03dba5ad15ba5f7f49100a5c78e8685e64334b2a" + }, + { + "filename": 
"build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "96e1594a3eabbaedc792b84b07f05ae8752b7251" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "5ffd367df833d773355590220598a3c7eceba4e0" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "9a5ff48b8942957dde9b862aed848390dd267948" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "eabc423949a24c2a1fb2368a73e5249caf8d07df" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "386928de139ce718f28222b9c1a6555df3958491" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "51e237b91b8e775a36bcf783c078c2c1cecbcbd2" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "6280219c9ee7d26f7e2fd3625dc92d847ddc7982" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "40c01c0b92b4b26fe480879dda33f18c5eb59a6d" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "c6fd3659799bc31e17f3577e7f0e8d7268faf1fb" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "160f12ed3f95a6967439ff53bc3e3a2cdc97c700" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "e5c4a1d2c94e5c7864f462e083ea5f530b8efe3f" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "2bf5eb27e38208871d50348b170c8c74b80fc519" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "9c908e80406587da4d246ce4e3a8a98a14c875b1" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "0a1e14cffbb2a894a701352193947d272427db0d" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "15b1c93f60fc5068ba11b82b6d5924dd2024a824" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "f78e7060e6840ff721d306db556636b0bbc8d9b3" + }, + { + "filename": 
"build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "8ff12e64c172f5a5d0fbdf900728fe60b33877e2" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "4532f93681e2be175b1bf94f81bfde711821cd60" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "1d3ce5c94c2d9a4a1637204efb3b14f7a5579bdb" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "ca7f32b9552b479dc05495792b7e426db5eb1b56" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "5acea242cc0ad094cba8ee5f568ff88afb1b41ae" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "3ab5796ee15b6ec8d4ab1f4ab5a594fecb30e4b4" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "58cdd93e90b8c29bc7a211861711565dbeeb529a" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "b72e0371d1421a1decc9d57860f83eea8f790942" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "3cb7eaa07c745fd3aa2b3242780a7061bedac1de" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "293adce387e066fce75b6e606d4b8b6a5aa10bdb" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/fp8.py", + "blob_id": "23bd7d6703104b0020671cc6ba6f78a6df37e4bf" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/fp8_utils.py", + "blob_id": "acb4f3e3bb1a34f209fdac9ecca8c123aaf67f12" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/fused_marlin_moe.py", + "blob_id": "b3e0a5c24599730faf973fad3cf3fb6031a30522" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/fused_moe.py", + "blob_id": "af2d798cbe5d7c3c1760ce79f717ab5f6d7700ba" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/platforms.py", + "blob_id": "735fab87f2add390f7bf6408ebe31d1f5de6d02b" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/scalar_type.py", + "blob_id": "ea749fe8247b6846620ccbba30ddf48d914ca4e1" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/utils/__init__.py", + "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/utils/marlin_utils.py", + "blob_id": "5037f774b8a8b7e88d822efacbb3b4ea5b95d356" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/utils/marlin_utils_test.py", + "blob_id": "83faac032ca93b3564c620c5b4b1ef63c74aaddf" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/utils/quant_utils.py", + "blob_id": 
"5819ab753e57655185572ce1e49c24e6268171b4" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/__init__.py", + "blob_id": "cc806778863c03ccb3157343cd6331c1c6ca332c" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/_moe_cobe53r755p6a.abi3.so", + "blob_id": "0082e984366e264ad72eb429f4e138d45f5cbcaf" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/_ops.py", + "blob_id": "153d250d92d9f1bf4b6e318287370706aa5cd385" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "56c1a4e3af0b4a93fff71028d8e04bf73f0abb29" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "d3677bebb82a7f3f19344ef6471626493cf2c5bb" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "265768fb900ccfe9612b4a0d25973e6618f22a79" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "d3be23dfc903ba61d3d4d79c0230952b24d2ead0" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "589f5d39f31418d5121e7cbb2e6f2894b0a7ed32" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", + "blob_id": "2c78bfaba7890772bf266721f5577202ea443882" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "4da841e74a79f9589fecac1fa557ea132d34805f" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "200356713c0d0a76e199671c7ec8f10d0e5ee0ac" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "e076615ee541a5043556f630ecf0946c4e2c1408" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "ee896554b921040d7810bb6e9368cc200777951d" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "05aed8b1c81492151d128ef251afc510d8cc8ed5" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json", + "blob_id": "9262a74a4a0e1e3789f260a3ef7f6cb9551f3f2b" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "d251f9b5accaec977fc87a0999cd56ee387fc650" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "0ecf814a28a9441e89f892eb3d63dcf8dcb0dd97" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "51ad5b299eb22465fa80530d12bdd5d7a03ce398" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "ee5119182556cf49434c10e56cf04e3baeb26408" + }, + { + "filename": 
"build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "68793c77b33c4f4b97d0a4b780fcbe8043c799de" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "612910720ed9439e56c4af4c03f30fee224fac80" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "039a10ed127b77836a7f41c03513292613852b30" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "3793fcafee60bc7e8f5f12d601cb3192abfa9ca8" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "51d03d8607122d7b9bc20ba48d8432d62367fa00" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", + "blob_id": "26f9abd6b789e9dd0f83ec7721fd1bae8aa76bec" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "cd0cdbea0c3372674cb610870dd0b30325864549" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "64be6e6591422aa0f441c3747b6c49850929652e" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "0a6a6a73fa45e270f01ba7ebdc6d9d55bf9daad3" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "ba9041d008507e31ae4179ef2bc863a49c606582" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "7a7508aab04599cb06641c835d8b0a14f54d0716" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "dbf9a2dd6f048d8adee290961e2aea72035f7615" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", + "blob_id": "bbb2386046b1135a2cc7ab7cb26c1d0b039bcf3a" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "57055453aa24c831dad9ac8e37fdab707c63ef91" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "8cc6c643f236d2f7f9ad29354d9e469d00b20d3f" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "d4c9ddd12972ac0b5fd2be11a9cd1075906e3978" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "b2799ed3a866e25b78d60d92910c000ebb21ff71" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "b8d3be2313fa14025d8aeb2fd11e0d1ee997ffa6" + }, + { + "filename": 
"build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json", + "blob_id": "6a976788f9b10af19ebcfe582a69cbc627f9457b" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "3f3ccdafa88f3452a695efad4cb9622d6ae79e6a" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json", + "blob_id": "0a46390b2e31bba6a7c3ab2c9f6c8de6004857bb" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json", + "blob_id": "f4c0f8417b384870050a95e0cf57edbdf6352b23" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "5c8185cfdeec167ec4b88de51b4b395e28769cc5" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "97c9f4445b166657ad29f1db9fc8281f9c463ec4" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "0bb423b28f5ab3825929a4870b96393262a9dd9f" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "55571873395464a3b58f549523905f439a8f1716" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "26bcbf26970c7a77c99e2c8eacd83eefa86967bf" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json", + "blob_id": "91011e64c7de4505e9bb462bc70e6a3e7affa878" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json", + "blob_id": "b41f9d443e50678334f906b44fce6d018d69500e" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "edf2a38d12ad3f420f232d2cd61ab149ad138725" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "673bae2ba8ef80ed4d4930739ca7daf0e8f28ee1" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "b2100cebb7f589747430be9ca8c8db368c152d78" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json", + "blob_id": "d720deb4bdd73d194b1023c99e190b8fcfecdaef" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "dbc624731f5cb9afcdc9213183d00d1e5edd4a00" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "cc614e635ea57327c610ce79e99ae5339614f22e" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "32c0c9da471cbe479044095e0ed14a0f54b73620" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json", + "blob_id": "f807d4a5abaed9dd686df26837f2dd9f6161300f" + }, + { + "filename": 
"build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "f578c8d0160ac3ef85b53c8539d3675455a97173" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "918f6839620cbab1f30b0f9383a9129c2cf2cf3d" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "e341a67917d5177bacb3f6767e7b6d92539826ad" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "34b916e574f88c65db1dac5889d74a990dc25e9b" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "6496a38fba8ae09b3025a75f357815b9d6a5e3f4" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "3618053b65831b95c4bb0f20ef3b9aa816b2d637" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "46a982f5ee9a4bd67ce244b101c576efeeb53b78" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "035ec027fa56622196b24a03a5042ce010deaebf" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "8b49f2781cb54d19a2789767ebb7e8c3fb55b981" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "851bc9f9f0b50b41451b929eaa518869b6a05412" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "d1227c2157990216d2ca51c69ad0944017f53b6a" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "77ba0d7477bdbcb036a43263e7aaa6b6913f8f4e" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "1c61451fb34e52deec827f8f63c80fb15830c202" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "63e661c80de6a7b1422f7a994a2ee7a4b724911c" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "cf354037903c0d1fcd077c4647aabce026a723fb" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "0a5d7bfdba4852da9ed08d1bc27cd7d521d09965" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": 
"eccb86a76df0d7302b760ab6d83a8ceb9fa9d0d9" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "cb91a279d423d0ca25197e0edd5e8c2f4da58720" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "88af48431d8b8791af8df03429704606b670f1f7" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "dd069726d7ed4dcbb449af243f4f4af21815f854" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "7febe3d272b4bb76500f7c6b523396129fd53680" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "56b939e52fac3ed53a4e0ba640c40010cb3af30a" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "63d9a0bf5d79ddaaad547d44338ad4b959ad72b1" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "7fa398c15a2a535401709b0f25e20f6e4b23e58e" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "f15d8f64c7090bd71d0091a524c65d7818fec38e" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "9d7658bfc41b2c8fd4daf3fbdf62d15936d3d546" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "cd3e07804fdec10c2cfb291c1ede3ba67b753f9c" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "9d5a329d7466a37c0ca68a65a089fbb99f9327a9" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "03dba5ad15ba5f7f49100a5c78e8685e64334b2a" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "96e1594a3eabbaedc792b84b07f05ae8752b7251" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "5ffd367df833d773355590220598a3c7eceba4e0" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "9a5ff48b8942957dde9b862aed848390dd267948" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "eabc423949a24c2a1fb2368a73e5249caf8d07df" + }, + { + "filename": 
"build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "386928de139ce718f28222b9c1a6555df3958491" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "51e237b91b8e775a36bcf783c078c2c1cecbcbd2" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "6280219c9ee7d26f7e2fd3625dc92d847ddc7982" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "40c01c0b92b4b26fe480879dda33f18c5eb59a6d" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "c6fd3659799bc31e17f3577e7f0e8d7268faf1fb" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "160f12ed3f95a6967439ff53bc3e3a2cdc97c700" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "e5c4a1d2c94e5c7864f462e083ea5f530b8efe3f" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "2bf5eb27e38208871d50348b170c8c74b80fc519" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "9c908e80406587da4d246ce4e3a8a98a14c875b1" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "0a1e14cffbb2a894a701352193947d272427db0d" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "15b1c93f60fc5068ba11b82b6d5924dd2024a824" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "f78e7060e6840ff721d306db556636b0bbc8d9b3" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "8ff12e64c172f5a5d0fbdf900728fe60b33877e2" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "4532f93681e2be175b1bf94f81bfde711821cd60" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "1d3ce5c94c2d9a4a1637204efb3b14f7a5579bdb" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "ca7f32b9552b479dc05495792b7e426db5eb1b56" + }, + { + "filename": 
"build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "5acea242cc0ad094cba8ee5f568ff88afb1b41ae" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "3ab5796ee15b6ec8d4ab1f4ab5a594fecb30e4b4" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "58cdd93e90b8c29bc7a211861711565dbeeb529a" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "b72e0371d1421a1decc9d57860f83eea8f790942" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "3cb7eaa07c745fd3aa2b3242780a7061bedac1de" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "293adce387e066fce75b6e606d4b8b6a5aa10bdb" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/fp8.py", + "blob_id": "23bd7d6703104b0020671cc6ba6f78a6df37e4bf" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/fp8_utils.py", + "blob_id": "acb4f3e3bb1a34f209fdac9ecca8c123aaf67f12" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/fused_marlin_moe.py", + "blob_id": "b3e0a5c24599730faf973fad3cf3fb6031a30522" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/fused_moe.py", + "blob_id": "af2d798cbe5d7c3c1760ce79f717ab5f6d7700ba" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/platforms.py", + "blob_id": "735fab87f2add390f7bf6408ebe31d1f5de6d02b" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/scalar_type.py", + "blob_id": "ea749fe8247b6846620ccbba30ddf48d914ca4e1" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/utils/__init__.py", + "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/utils/marlin_utils.py", + "blob_id": "5037f774b8a8b7e88d822efacbb3b4ea5b95d356" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/utils/marlin_utils_test.py", + "blob_id": "83faac032ca93b3564c620c5b4b1ef63c74aaddf" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/utils/quant_utils.py", + "blob_id": "5819ab753e57655185572ce1e49c24e6268171b4" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/__init__.py", + "blob_id": "cc806778863c03ccb3157343cd6331c1c6ca332c" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/_moe_iqmfy23diekyw.abi3.so", + "blob_id": "25f80aad80865365eae32a1609be0219f7e6582e" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/_ops.py", + "blob_id": "73e0213d234c5717aea5d708cff8e0938f14bce9" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "56c1a4e3af0b4a93fff71028d8e04bf73f0abb29" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "d3677bebb82a7f3f19344ef6471626493cf2c5bb" + }, + { + "filename": 
"build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "265768fb900ccfe9612b4a0d25973e6618f22a79" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "d3be23dfc903ba61d3d4d79c0230952b24d2ead0" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "589f5d39f31418d5121e7cbb2e6f2894b0a7ed32" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", + "blob_id": "2c78bfaba7890772bf266721f5577202ea443882" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "4da841e74a79f9589fecac1fa557ea132d34805f" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "200356713c0d0a76e199671c7ec8f10d0e5ee0ac" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "e076615ee541a5043556f630ecf0946c4e2c1408" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "ee896554b921040d7810bb6e9368cc200777951d" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "05aed8b1c81492151d128ef251afc510d8cc8ed5" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json", + "blob_id": "9262a74a4a0e1e3789f260a3ef7f6cb9551f3f2b" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "d251f9b5accaec977fc87a0999cd56ee387fc650" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "0ecf814a28a9441e89f892eb3d63dcf8dcb0dd97" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "51ad5b299eb22465fa80530d12bdd5d7a03ce398" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "ee5119182556cf49434c10e56cf04e3baeb26408" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "68793c77b33c4f4b97d0a4b780fcbe8043c799de" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "612910720ed9439e56c4af4c03f30fee224fac80" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "039a10ed127b77836a7f41c03513292613852b30" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "3793fcafee60bc7e8f5f12d601cb3192abfa9ca8" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + 
"blob_id": "51d03d8607122d7b9bc20ba48d8432d62367fa00" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", + "blob_id": "26f9abd6b789e9dd0f83ec7721fd1bae8aa76bec" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "cd0cdbea0c3372674cb610870dd0b30325864549" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "64be6e6591422aa0f441c3747b6c49850929652e" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "0a6a6a73fa45e270f01ba7ebdc6d9d55bf9daad3" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "ba9041d008507e31ae4179ef2bc863a49c606582" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "7a7508aab04599cb06641c835d8b0a14f54d0716" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "dbf9a2dd6f048d8adee290961e2aea72035f7615" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", + "blob_id": "bbb2386046b1135a2cc7ab7cb26c1d0b039bcf3a" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "57055453aa24c831dad9ac8e37fdab707c63ef91" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "8cc6c643f236d2f7f9ad29354d9e469d00b20d3f" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "d4c9ddd12972ac0b5fd2be11a9cd1075906e3978" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "b2799ed3a866e25b78d60d92910c000ebb21ff71" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "b8d3be2313fa14025d8aeb2fd11e0d1ee997ffa6" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json", + "blob_id": "6a976788f9b10af19ebcfe582a69cbc627f9457b" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "3f3ccdafa88f3452a695efad4cb9622d6ae79e6a" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json", + "blob_id": "0a46390b2e31bba6a7c3ab2c9f6c8de6004857bb" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json", + "blob_id": "f4c0f8417b384870050a95e0cf57edbdf6352b23" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "5c8185cfdeec167ec4b88de51b4b395e28769cc5" + }, + { + "filename": 
"build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "97c9f4445b166657ad29f1db9fc8281f9c463ec4" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "0bb423b28f5ab3825929a4870b96393262a9dd9f" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "55571873395464a3b58f549523905f439a8f1716" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "26bcbf26970c7a77c99e2c8eacd83eefa86967bf" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json", + "blob_id": "91011e64c7de4505e9bb462bc70e6a3e7affa878" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json", + "blob_id": "b41f9d443e50678334f906b44fce6d018d69500e" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "edf2a38d12ad3f420f232d2cd61ab149ad138725" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "673bae2ba8ef80ed4d4930739ca7daf0e8f28ee1" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "b2100cebb7f589747430be9ca8c8db368c152d78" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json", + "blob_id": "d720deb4bdd73d194b1023c99e190b8fcfecdaef" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "dbc624731f5cb9afcdc9213183d00d1e5edd4a00" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "cc614e635ea57327c610ce79e99ae5339614f22e" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "32c0c9da471cbe479044095e0ed14a0f54b73620" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json", + "blob_id": "f807d4a5abaed9dd686df26837f2dd9f6161300f" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "f578c8d0160ac3ef85b53c8539d3675455a97173" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "918f6839620cbab1f30b0f9383a9129c2cf2cf3d" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "e341a67917d5177bacb3f6767e7b6d92539826ad" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "34b916e574f88c65db1dac5889d74a990dc25e9b" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": 
"6496a38fba8ae09b3025a75f357815b9d6a5e3f4" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "3618053b65831b95c4bb0f20ef3b9aa816b2d637" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "46a982f5ee9a4bd67ce244b101c576efeeb53b78" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "035ec027fa56622196b24a03a5042ce010deaebf" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "8b49f2781cb54d19a2789767ebb7e8c3fb55b981" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "851bc9f9f0b50b41451b929eaa518869b6a05412" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "d1227c2157990216d2ca51c69ad0944017f53b6a" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "77ba0d7477bdbcb036a43263e7aaa6b6913f8f4e" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "1c61451fb34e52deec827f8f63c80fb15830c202" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "63e661c80de6a7b1422f7a994a2ee7a4b724911c" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "cf354037903c0d1fcd077c4647aabce026a723fb" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "0a5d7bfdba4852da9ed08d1bc27cd7d521d09965" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "eccb86a76df0d7302b760ab6d83a8ceb9fa9d0d9" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "cb91a279d423d0ca25197e0edd5e8c2f4da58720" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "88af48431d8b8791af8df03429704606b670f1f7" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "dd069726d7ed4dcbb449af243f4f4af21815f854" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "7febe3d272b4bb76500f7c6b523396129fd53680" + }, + { + "filename": 
"build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "56b939e52fac3ed53a4e0ba640c40010cb3af30a" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "63d9a0bf5d79ddaaad547d44338ad4b959ad72b1" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "7fa398c15a2a535401709b0f25e20f6e4b23e58e" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "f15d8f64c7090bd71d0091a524c65d7818fec38e" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "9d7658bfc41b2c8fd4daf3fbdf62d15936d3d546" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "cd3e07804fdec10c2cfb291c1ede3ba67b753f9c" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "9d5a329d7466a37c0ca68a65a089fbb99f9327a9" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "03dba5ad15ba5f7f49100a5c78e8685e64334b2a" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "96e1594a3eabbaedc792b84b07f05ae8752b7251" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "5ffd367df833d773355590220598a3c7eceba4e0" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "9a5ff48b8942957dde9b862aed848390dd267948" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "eabc423949a24c2a1fb2368a73e5249caf8d07df" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "386928de139ce718f28222b9c1a6555df3958491" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "51e237b91b8e775a36bcf783c078c2c1cecbcbd2" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "6280219c9ee7d26f7e2fd3625dc92d847ddc7982" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "40c01c0b92b4b26fe480879dda33f18c5eb59a6d" + }, + { + "filename": 
"build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "c6fd3659799bc31e17f3577e7f0e8d7268faf1fb" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "160f12ed3f95a6967439ff53bc3e3a2cdc97c700" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "e5c4a1d2c94e5c7864f462e083ea5f530b8efe3f" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "2bf5eb27e38208871d50348b170c8c74b80fc519" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "9c908e80406587da4d246ce4e3a8a98a14c875b1" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "0a1e14cffbb2a894a701352193947d272427db0d" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "15b1c93f60fc5068ba11b82b6d5924dd2024a824" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "f78e7060e6840ff721d306db556636b0bbc8d9b3" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "8ff12e64c172f5a5d0fbdf900728fe60b33877e2" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "4532f93681e2be175b1bf94f81bfde711821cd60" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "1d3ce5c94c2d9a4a1637204efb3b14f7a5579bdb" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "ca7f32b9552b479dc05495792b7e426db5eb1b56" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "5acea242cc0ad094cba8ee5f568ff88afb1b41ae" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "3ab5796ee15b6ec8d4ab1f4ab5a594fecb30e4b4" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "58cdd93e90b8c29bc7a211861711565dbeeb529a" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "b72e0371d1421a1decc9d57860f83eea8f790942" + }, + { + "filename": 
"build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "3cb7eaa07c745fd3aa2b3242780a7061bedac1de" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "293adce387e066fce75b6e606d4b8b6a5aa10bdb" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/fp8.py", + "blob_id": "23bd7d6703104b0020671cc6ba6f78a6df37e4bf" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/fp8_utils.py", + "blob_id": "acb4f3e3bb1a34f209fdac9ecca8c123aaf67f12" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/fused_marlin_moe.py", + "blob_id": "b3e0a5c24599730faf973fad3cf3fb6031a30522" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/fused_moe.py", + "blob_id": "af2d798cbe5d7c3c1760ce79f717ab5f6d7700ba" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/platforms.py", + "blob_id": "735fab87f2add390f7bf6408ebe31d1f5de6d02b" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/scalar_type.py", + "blob_id": "ea749fe8247b6846620ccbba30ddf48d914ca4e1" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/utils/__init__.py", + "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/utils/marlin_utils.py", + "blob_id": "5037f774b8a8b7e88d822efacbb3b4ea5b95d356" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/utils/marlin_utils_test.py", + "blob_id": "83faac032ca93b3564c620c5b4b1ef63c74aaddf" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/utils/quant_utils.py", + "blob_id": "5819ab753e57655185572ce1e49c24e6268171b4" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/__init__.py", + "blob_id": "cc806778863c03ccb3157343cd6331c1c6ca332c" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/_moe_6xluzhr5x6fw4.abi3.so", + "blob_id": "63a2723397b7c031719534dd6c23e3eaa1a85a23" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/_ops.py", + "blob_id": "82fd79cf07706341ad17f8ea5d841b6d018cd676" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "56c1a4e3af0b4a93fff71028d8e04bf73f0abb29" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "d3677bebb82a7f3f19344ef6471626493cf2c5bb" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "265768fb900ccfe9612b4a0d25973e6618f22a79" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "d3be23dfc903ba61d3d4d79c0230952b24d2ead0" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "589f5d39f31418d5121e7cbb2e6f2894b0a7ed32" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", + "blob_id": "2c78bfaba7890772bf266721f5577202ea443882" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": 
"4da841e74a79f9589fecac1fa557ea132d34805f" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "200356713c0d0a76e199671c7ec8f10d0e5ee0ac" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "e076615ee541a5043556f630ecf0946c4e2c1408" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "ee896554b921040d7810bb6e9368cc200777951d" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "05aed8b1c81492151d128ef251afc510d8cc8ed5" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json", + "blob_id": "9262a74a4a0e1e3789f260a3ef7f6cb9551f3f2b" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "d251f9b5accaec977fc87a0999cd56ee387fc650" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "0ecf814a28a9441e89f892eb3d63dcf8dcb0dd97" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "51ad5b299eb22465fa80530d12bdd5d7a03ce398" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "ee5119182556cf49434c10e56cf04e3baeb26408" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "68793c77b33c4f4b97d0a4b780fcbe8043c799de" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "612910720ed9439e56c4af4c03f30fee224fac80" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "039a10ed127b77836a7f41c03513292613852b30" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "3793fcafee60bc7e8f5f12d601cb3192abfa9ca8" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "51d03d8607122d7b9bc20ba48d8432d62367fa00" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", + "blob_id": "26f9abd6b789e9dd0f83ec7721fd1bae8aa76bec" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "cd0cdbea0c3372674cb610870dd0b30325864549" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "64be6e6591422aa0f441c3747b6c49850929652e" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "0a6a6a73fa45e270f01ba7ebdc6d9d55bf9daad3" + }, + { + "filename": 
"build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "ba9041d008507e31ae4179ef2bc863a49c606582" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", + "blob_id": "7a7508aab04599cb06641c835d8b0a14f54d0716" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "dbf9a2dd6f048d8adee290961e2aea72035f7615" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", + "blob_id": "bbb2386046b1135a2cc7ab7cb26c1d0b039bcf3a" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "57055453aa24c831dad9ac8e37fdab707c63ef91" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "8cc6c643f236d2f7f9ad29354d9e469d00b20d3f" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "d4c9ddd12972ac0b5fd2be11a9cd1075906e3978" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "b2799ed3a866e25b78d60d92910c000ebb21ff71" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "b8d3be2313fa14025d8aeb2fd11e0d1ee997ffa6" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json", + "blob_id": "6a976788f9b10af19ebcfe582a69cbc627f9457b" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "3f3ccdafa88f3452a695efad4cb9622d6ae79e6a" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json", + "blob_id": "0a46390b2e31bba6a7c3ab2c9f6c8de6004857bb" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json", + "blob_id": "f4c0f8417b384870050a95e0cf57edbdf6352b23" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "5c8185cfdeec167ec4b88de51b4b395e28769cc5" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "97c9f4445b166657ad29f1db9fc8281f9c463ec4" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "0bb423b28f5ab3825929a4870b96393262a9dd9f" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "55571873395464a3b58f549523905f439a8f1716" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "26bcbf26970c7a77c99e2c8eacd83eefa86967bf" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json", + "blob_id": "91011e64c7de4505e9bb462bc70e6a3e7affa878" + }, 
+ { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json", + "blob_id": "b41f9d443e50678334f906b44fce6d018d69500e" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "edf2a38d12ad3f420f232d2cd61ab149ad138725" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "673bae2ba8ef80ed4d4930739ca7daf0e8f28ee1" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "b2100cebb7f589747430be9ca8c8db368c152d78" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json", + "blob_id": "d720deb4bdd73d194b1023c99e190b8fcfecdaef" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "dbc624731f5cb9afcdc9213183d00d1e5edd4a00" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "cc614e635ea57327c610ce79e99ae5339614f22e" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "32c0c9da471cbe479044095e0ed14a0f54b73620" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json", + "blob_id": "f807d4a5abaed9dd686df26837f2dd9f6161300f" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", + "blob_id": "f578c8d0160ac3ef85b53c8539d3675455a97173" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "918f6839620cbab1f30b0f9383a9129c2cf2cf3d" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json", + "blob_id": "e341a67917d5177bacb3f6767e7b6d92539826ad" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", + "blob_id": "34b916e574f88c65db1dac5889d74a990dc25e9b" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "6496a38fba8ae09b3025a75f357815b9d6a5e3f4" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "3618053b65831b95c4bb0f20ef3b9aa816b2d637" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "46a982f5ee9a4bd67ce244b101c576efeeb53b78" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "035ec027fa56622196b24a03a5042ce010deaebf" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "8b49f2781cb54d19a2789767ebb7e8c3fb55b981" + }, + { + "filename": 
"build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "851bc9f9f0b50b41451b929eaa518869b6a05412" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "d1227c2157990216d2ca51c69ad0944017f53b6a" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "77ba0d7477bdbcb036a43263e7aaa6b6913f8f4e" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "1c61451fb34e52deec827f8f63c80fb15830c202" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "63e661c80de6a7b1422f7a994a2ee7a4b724911c" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "cf354037903c0d1fcd077c4647aabce026a723fb" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "0a5d7bfdba4852da9ed08d1bc27cd7d521d09965" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "eccb86a76df0d7302b760ab6d83a8ceb9fa9d0d9" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "cb91a279d423d0ca25197e0edd5e8c2f4da58720" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "88af48431d8b8791af8df03429704606b670f1f7" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "dd069726d7ed4dcbb449af243f4f4af21815f854" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "7febe3d272b4bb76500f7c6b523396129fd53680" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "56b939e52fac3ed53a4e0ba640c40010cb3af30a" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "63d9a0bf5d79ddaaad547d44338ad4b959ad72b1" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "7fa398c15a2a535401709b0f25e20f6e4b23e58e" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "f15d8f64c7090bd71d0091a524c65d7818fec38e" + }, + { + "filename": 
"build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "9d7658bfc41b2c8fd4daf3fbdf62d15936d3d546" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "cd3e07804fdec10c2cfb291c1ede3ba67b753f9c" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "9d5a329d7466a37c0ca68a65a089fbb99f9327a9" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "03dba5ad15ba5f7f49100a5c78e8685e64334b2a" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "96e1594a3eabbaedc792b84b07f05ae8752b7251" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "5ffd367df833d773355590220598a3c7eceba4e0" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "9a5ff48b8942957dde9b862aed848390dd267948" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "eabc423949a24c2a1fb2368a73e5249caf8d07df" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "386928de139ce718f28222b9c1a6555df3958491" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "51e237b91b8e775a36bcf783c078c2c1cecbcbd2" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "6280219c9ee7d26f7e2fd3625dc92d847ddc7982" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "40c01c0b92b4b26fe480879dda33f18c5eb59a6d" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "c6fd3659799bc31e17f3577e7f0e8d7268faf1fb" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "160f12ed3f95a6967439ff53bc3e3a2cdc97c700" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "e5c4a1d2c94e5c7864f462e083ea5f530b8efe3f" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "2bf5eb27e38208871d50348b170c8c74b80fc519" + }, + { + "filename": 
"build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "9c908e80406587da4d246ce4e3a8a98a14c875b1" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "0a1e14cffbb2a894a701352193947d272427db0d" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "15b1c93f60fc5068ba11b82b6d5924dd2024a824" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "f78e7060e6840ff721d306db556636b0bbc8d9b3" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "8ff12e64c172f5a5d0fbdf900728fe60b33877e2" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "4532f93681e2be175b1bf94f81bfde711821cd60" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "1d3ce5c94c2d9a4a1637204efb3b14f7a5579bdb" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "ca7f32b9552b479dc05495792b7e426db5eb1b56" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "5acea242cc0ad094cba8ee5f568ff88afb1b41ae" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "3ab5796ee15b6ec8d4ab1f4ab5a594fecb30e4b4" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "58cdd93e90b8c29bc7a211861711565dbeeb529a" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "b72e0371d1421a1decc9d57860f83eea8f790942" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "3cb7eaa07c745fd3aa2b3242780a7061bedac1de" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", + "blob_id": "293adce387e066fce75b6e606d4b8b6a5aa10bdb" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/fp8.py", + "blob_id": "23bd7d6703104b0020671cc6ba6f78a6df37e4bf" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/fp8_utils.py", + "blob_id": "acb4f3e3bb1a34f209fdac9ecca8c123aaf67f12" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/fused_marlin_moe.py", + "blob_id": "b3e0a5c24599730faf973fad3cf3fb6031a30522" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/fused_moe.py", + "blob_id": 
"af2d798cbe5d7c3c1760ce79f717ab5f6d7700ba" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/platforms.py", + "blob_id": "735fab87f2add390f7bf6408ebe31d1f5de6d02b" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/scalar_type.py", + "blob_id": "ea749fe8247b6846620ccbba30ddf48d914ca4e1" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/utils/__init__.py", + "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/utils/marlin_utils.py", + "blob_id": "5037f774b8a8b7e88d822efacbb3b4ea5b95d356" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/utils/marlin_utils_test.py", + "blob_id": "83faac032ca93b3564c620c5b4b1ef63c74aaddf" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/utils/quant_utils.py", + "blob_id": "5819ab753e57655185572ce1e49c24e6268171b4" + } + ] + }, + { + "repo_id": "kernels-community/quantization", + "sha": "95272c71ca71b1ddbacb0105dab54e5d5240bd5c", + "files": [ + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/__init__.py", + "blob_id": "c3ab3b032c29f7bbafd549915dbc677c45a33837" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/_ops.py", + "blob_id": "07486f19bf899de0eef6e7de9a2a1b08f48e4530" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/_quantization_diycyzqnjjd5k.abi3.so", + "blob_id": "4f729314a4b4a86bc21b05b5e50f6677e7a83a06" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/compressed_tensors.py", + "blob_id": "c3ba30bac87979a307fc5061a46f5d2cbf0efbf9" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/cutlass.py", + "blob_id": "c378b846d0c59de183a321fcad4b403c47b3d750" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/marlin.py", + "blob_id": "44d5d28a2fb67af955c017af3cf1403feeecbd32" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/scalar_type.py", + "blob_id": "9d711b0debcd8aaa343818edc9d6bbca20587d0a" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/utils/__init__.py", + "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/utils/marlin_utils.py", + "blob_id": "b1c94c38858a5cd6f02eb134d1a94b99a2b15566" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/utils/marlin_utils_fp8.py", + "blob_id": "b269fa6a4cee316e8299ecc86c3e7594b336b499" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/utils/marlin_utils_test.py", + "blob_id": "7d4f5f3cfbb872bf7b32e0972d6143b43f354a5e" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/utils/marlin_utils_test_24.py", + "blob_id": "927fa9016ba25f381c09d768db0c468066193a76" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/utils/marlin_utils_test_qqq.py", + "blob_id": "cb58eb945836393c58c53f5c6d702d53861c33f9" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/utils/quant_utils.py", + "blob_id": "d97e03913fa5980e0be73b160088c8e4f5f49a52" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization/__init__.py", + "blob_id": "c3ab3b032c29f7bbafd549915dbc677c45a33837" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization/_ops.py", + "blob_id": "5430a686ea7ef8be7217b48a28b5429606cf93d6" + }, + { + "filename": 
"build/torch25-cxx11-cu121-x86_64-linux/quantization/_quantization_elb4wso45znfy.abi3.so", + "blob_id": "831303548db0b8341f67b8c281c5dff35d68c5e3" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization/compressed_tensors.py", + "blob_id": "c3ba30bac87979a307fc5061a46f5d2cbf0efbf9" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization/cutlass.py", + "blob_id": "c378b846d0c59de183a321fcad4b403c47b3d750" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization/marlin.py", + "blob_id": "44d5d28a2fb67af955c017af3cf1403feeecbd32" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization/scalar_type.py", + "blob_id": "9d711b0debcd8aaa343818edc9d6bbca20587d0a" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization/utils/__init__.py", + "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization/utils/marlin_utils.py", + "blob_id": "b1c94c38858a5cd6f02eb134d1a94b99a2b15566" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization/utils/marlin_utils_fp8.py", + "blob_id": "b269fa6a4cee316e8299ecc86c3e7594b336b499" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization/utils/marlin_utils_test.py", + "blob_id": "7d4f5f3cfbb872bf7b32e0972d6143b43f354a5e" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization/utils/marlin_utils_test_24.py", + "blob_id": "927fa9016ba25f381c09d768db0c468066193a76" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization/utils/marlin_utils_test_qqq.py", + "blob_id": "cb58eb945836393c58c53f5c6d702d53861c33f9" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization/utils/quant_utils.py", + "blob_id": "d97e03913fa5980e0be73b160088c8e4f5f49a52" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/__init__.py", + "blob_id": "c3ab3b032c29f7bbafd549915dbc677c45a33837" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/_ops.py", + "blob_id": "f5217e75a63c8f07110478117396d92226c0c0ca" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/_quantization_unicgkq3a7la6.abi3.so", + "blob_id": "c16ea0b257847bdac8068a18921e0a3b5dce6b92" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/compressed_tensors.py", + "blob_id": "c3ba30bac87979a307fc5061a46f5d2cbf0efbf9" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/cutlass.py", + "blob_id": "c378b846d0c59de183a321fcad4b403c47b3d750" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/marlin.py", + "blob_id": "44d5d28a2fb67af955c017af3cf1403feeecbd32" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/scalar_type.py", + "blob_id": "9d711b0debcd8aaa343818edc9d6bbca20587d0a" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/utils/__init__.py", + "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/utils/marlin_utils.py", + "blob_id": "b1c94c38858a5cd6f02eb134d1a94b99a2b15566" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/utils/marlin_utils_fp8.py", + "blob_id": "b269fa6a4cee316e8299ecc86c3e7594b336b499" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/utils/marlin_utils_test.py", + "blob_id": "7d4f5f3cfbb872bf7b32e0972d6143b43f354a5e" + }, + { + 
"filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/utils/marlin_utils_test_24.py", + "blob_id": "927fa9016ba25f381c09d768db0c468066193a76" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/utils/marlin_utils_test_qqq.py", + "blob_id": "cb58eb945836393c58c53f5c6d702d53861c33f9" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/utils/quant_utils.py", + "blob_id": "d97e03913fa5980e0be73b160088c8e4f5f49a52" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/__init__.py", + "blob_id": "c3ab3b032c29f7bbafd549915dbc677c45a33837" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/_ops.py", + "blob_id": "1e16b3f78f15e21f94d1241338a5ad6fcf9fa6dc" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/_quantization_f4o2yj2oj7kni.abi3.so", + "blob_id": "bda41c03212d0ca513c15046c8b4ca07edf23544" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/compressed_tensors.py", + "blob_id": "c3ba30bac87979a307fc5061a46f5d2cbf0efbf9" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/cutlass.py", + "blob_id": "c378b846d0c59de183a321fcad4b403c47b3d750" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/marlin.py", + "blob_id": "44d5d28a2fb67af955c017af3cf1403feeecbd32" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/scalar_type.py", + "blob_id": "9d711b0debcd8aaa343818edc9d6bbca20587d0a" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/utils/__init__.py", + "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/utils/marlin_utils.py", + "blob_id": "b1c94c38858a5cd6f02eb134d1a94b99a2b15566" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/utils/marlin_utils_fp8.py", + "blob_id": "b269fa6a4cee316e8299ecc86c3e7594b336b499" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/utils/marlin_utils_test.py", + "blob_id": "7d4f5f3cfbb872bf7b32e0972d6143b43f354a5e" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/utils/marlin_utils_test_24.py", + "blob_id": "927fa9016ba25f381c09d768db0c468066193a76" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/utils/marlin_utils_test_qqq.py", + "blob_id": "cb58eb945836393c58c53f5c6d702d53861c33f9" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/utils/quant_utils.py", + "blob_id": "d97e03913fa5980e0be73b160088c8e4f5f49a52" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/__init__.py", + "blob_id": "c3ab3b032c29f7bbafd549915dbc677c45a33837" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/_ops.py", + "blob_id": "0d812eef750061481eba1b7ed5fa708cfec31f42" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/_quantization_6nd6n6ctlfohq.abi3.so", + "blob_id": "3a69017602930aa71d55f179769d10bcba21b444" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/compressed_tensors.py", + "blob_id": "c3ba30bac87979a307fc5061a46f5d2cbf0efbf9" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/cutlass.py", + "blob_id": "c378b846d0c59de183a321fcad4b403c47b3d750" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/marlin.py", + "blob_id": "44d5d28a2fb67af955c017af3cf1403feeecbd32" + }, + { + 
"filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/scalar_type.py", + "blob_id": "9d711b0debcd8aaa343818edc9d6bbca20587d0a" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/utils/__init__.py", + "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/utils/marlin_utils.py", + "blob_id": "b1c94c38858a5cd6f02eb134d1a94b99a2b15566" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/utils/marlin_utils_fp8.py", + "blob_id": "b269fa6a4cee316e8299ecc86c3e7594b336b499" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/utils/marlin_utils_test.py", + "blob_id": "7d4f5f3cfbb872bf7b32e0972d6143b43f354a5e" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/utils/marlin_utils_test_24.py", + "blob_id": "927fa9016ba25f381c09d768db0c468066193a76" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/utils/marlin_utils_test_qqq.py", + "blob_id": "cb58eb945836393c58c53f5c6d702d53861c33f9" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/utils/quant_utils.py", + "blob_id": "d97e03913fa5980e0be73b160088c8e4f5f49a52" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/__init__.py", + "blob_id": "c3ab3b032c29f7bbafd549915dbc677c45a33837" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/_ops.py", + "blob_id": "e5bdaf0f73a5c870ed0d8ae6345cbec989274e16" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/_quantization_cxckebwxmlb3i.abi3.so", + "blob_id": "b51fa64aa95165e295a32926f8ca1e9ecea61ed3" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/compressed_tensors.py", + "blob_id": "c3ba30bac87979a307fc5061a46f5d2cbf0efbf9" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/cutlass.py", + "blob_id": "c378b846d0c59de183a321fcad4b403c47b3d750" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/marlin.py", + "blob_id": "44d5d28a2fb67af955c017af3cf1403feeecbd32" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/scalar_type.py", + "blob_id": "9d711b0debcd8aaa343818edc9d6bbca20587d0a" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/utils/__init__.py", + "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/utils/marlin_utils.py", + "blob_id": "b1c94c38858a5cd6f02eb134d1a94b99a2b15566" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/utils/marlin_utils_fp8.py", + "blob_id": "b269fa6a4cee316e8299ecc86c3e7594b336b499" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/utils/marlin_utils_test.py", + "blob_id": "7d4f5f3cfbb872bf7b32e0972d6143b43f354a5e" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/utils/marlin_utils_test_24.py", + "blob_id": "927fa9016ba25f381c09d768db0c468066193a76" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/utils/marlin_utils_test_qqq.py", + "blob_id": "cb58eb945836393c58c53f5c6d702d53861c33f9" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/utils/quant_utils.py", + "blob_id": "d97e03913fa5980e0be73b160088c8e4f5f49a52" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/quantization/__init__.py", + "blob_id": 
"c3ab3b032c29f7bbafd549915dbc677c45a33837" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/quantization/_ops.py", + "blob_id": "fd057ea2d2b103efdf01641c9767641e093ed947" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/quantization/_quantization_vsrdj55erbiei.abi3.so", + "blob_id": "eb6e2055b9b9330bcb04c316d2fb4570918d24c3" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/quantization/compressed_tensors.py", + "blob_id": "c3ba30bac87979a307fc5061a46f5d2cbf0efbf9" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/quantization/cutlass.py", + "blob_id": "c378b846d0c59de183a321fcad4b403c47b3d750" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/quantization/marlin.py", + "blob_id": "44d5d28a2fb67af955c017af3cf1403feeecbd32" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/quantization/scalar_type.py", + "blob_id": "9d711b0debcd8aaa343818edc9d6bbca20587d0a" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/quantization/utils/__init__.py", + "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/quantization/utils/marlin_utils.py", + "blob_id": "b1c94c38858a5cd6f02eb134d1a94b99a2b15566" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/quantization/utils/marlin_utils_fp8.py", + "blob_id": "b269fa6a4cee316e8299ecc86c3e7594b336b499" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/quantization/utils/marlin_utils_test.py", + "blob_id": "7d4f5f3cfbb872bf7b32e0972d6143b43f354a5e" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/quantization/utils/marlin_utils_test_24.py", + "blob_id": "927fa9016ba25f381c09d768db0c468066193a76" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/quantization/utils/marlin_utils_test_qqq.py", + "blob_id": "cb58eb945836393c58c53f5c6d702d53861c33f9" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/quantization/utils/quant_utils.py", + "blob_id": "d97e03913fa5980e0be73b160088c8e4f5f49a52" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/quantization/__init__.py", + "blob_id": "c3ab3b032c29f7bbafd549915dbc677c45a33837" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/quantization/_ops.py", + "blob_id": "0527a08325accf310054df153fbcaf26151b921e" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/quantization/_quantization_gsd2xjzq76rwy.abi3.so", + "blob_id": "46d5dc19eb270598aecb4654446449097da3ad8f" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/quantization/compressed_tensors.py", + "blob_id": "c3ba30bac87979a307fc5061a46f5d2cbf0efbf9" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/quantization/cutlass.py", + "blob_id": "c378b846d0c59de183a321fcad4b403c47b3d750" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/quantization/marlin.py", + "blob_id": "44d5d28a2fb67af955c017af3cf1403feeecbd32" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/quantization/scalar_type.py", + "blob_id": "9d711b0debcd8aaa343818edc9d6bbca20587d0a" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/quantization/utils/__init__.py", + "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/quantization/utils/marlin_utils.py", + "blob_id": "b1c94c38858a5cd6f02eb134d1a94b99a2b15566" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/quantization/utils/marlin_utils_fp8.py", + "blob_id": 
"b269fa6a4cee316e8299ecc86c3e7594b336b499" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/quantization/utils/marlin_utils_test.py", + "blob_id": "7d4f5f3cfbb872bf7b32e0972d6143b43f354a5e" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/quantization/utils/marlin_utils_test_24.py", + "blob_id": "927fa9016ba25f381c09d768db0c468066193a76" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/quantization/utils/marlin_utils_test_qqq.py", + "blob_id": "cb58eb945836393c58c53f5c6d702d53861c33f9" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/quantization/utils/quant_utils.py", + "blob_id": "d97e03913fa5980e0be73b160088c8e4f5f49a52" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/quantization/__init__.py", + "blob_id": "c3ab3b032c29f7bbafd549915dbc677c45a33837" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/quantization/_ops.py", + "blob_id": "83eb9dbc2c91b6b305544ea9f84d41ef73f5ea01" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/quantization/_quantization_hbfrcozzte6aq.abi3.so", + "blob_id": "d8feec129f933b3eecb28862aec4167876fd5ca8" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/quantization/compressed_tensors.py", + "blob_id": "c3ba30bac87979a307fc5061a46f5d2cbf0efbf9" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/quantization/cutlass.py", + "blob_id": "c378b846d0c59de183a321fcad4b403c47b3d750" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/quantization/marlin.py", + "blob_id": "44d5d28a2fb67af955c017af3cf1403feeecbd32" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/quantization/scalar_type.py", + "blob_id": "9d711b0debcd8aaa343818edc9d6bbca20587d0a" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/quantization/utils/__init__.py", + "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/quantization/utils/marlin_utils.py", + "blob_id": "b1c94c38858a5cd6f02eb134d1a94b99a2b15566" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/quantization/utils/marlin_utils_fp8.py", + "blob_id": "b269fa6a4cee316e8299ecc86c3e7594b336b499" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/quantization/utils/marlin_utils_test.py", + "blob_id": "7d4f5f3cfbb872bf7b32e0972d6143b43f354a5e" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/quantization/utils/marlin_utils_test_24.py", + "blob_id": "927fa9016ba25f381c09d768db0c468066193a76" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/quantization/utils/marlin_utils_test_qqq.py", + "blob_id": "cb58eb945836393c58c53f5c6d702d53861c33f9" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/quantization/utils/quant_utils.py", + "blob_id": "d97e03913fa5980e0be73b160088c8e4f5f49a52" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/quantization/__init__.py", + "blob_id": "c3ab3b032c29f7bbafd549915dbc677c45a33837" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/quantization/_ops.py", + "blob_id": "3b54f0bfc1030429b27a48d4e574778ebe86f820" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/quantization/_quantization_womr3pvjbirhe.abi3.so", + "blob_id": "f799e8bfc915d482daef4890ee6f8d6b342e0da1" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/quantization/compressed_tensors.py", + "blob_id": "c3ba30bac87979a307fc5061a46f5d2cbf0efbf9" + }, + { + "filename": 
"build/torch26-cxx98-cu118-x86_64-linux/quantization/cutlass.py", + "blob_id": "c378b846d0c59de183a321fcad4b403c47b3d750" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/quantization/marlin.py", + "blob_id": "44d5d28a2fb67af955c017af3cf1403feeecbd32" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/quantization/scalar_type.py", + "blob_id": "9d711b0debcd8aaa343818edc9d6bbca20587d0a" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/quantization/utils/__init__.py", + "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/quantization/utils/marlin_utils.py", + "blob_id": "b1c94c38858a5cd6f02eb134d1a94b99a2b15566" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/quantization/utils/marlin_utils_fp8.py", + "blob_id": "b269fa6a4cee316e8299ecc86c3e7594b336b499" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/quantization/utils/marlin_utils_test.py", + "blob_id": "7d4f5f3cfbb872bf7b32e0972d6143b43f354a5e" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/quantization/utils/marlin_utils_test_24.py", + "blob_id": "927fa9016ba25f381c09d768db0c468066193a76" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/quantization/utils/marlin_utils_test_qqq.py", + "blob_id": "cb58eb945836393c58c53f5c6d702d53861c33f9" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/quantization/utils/quant_utils.py", + "blob_id": "d97e03913fa5980e0be73b160088c8e4f5f49a52" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/quantization/__init__.py", + "blob_id": "c3ab3b032c29f7bbafd549915dbc677c45a33837" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/quantization/_ops.py", + "blob_id": "c83bf352ab1ca3283391b3dd8c209c1bf6a60eb1" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/quantization/_quantization_55qjuxe2uqrp6.abi3.so", + "blob_id": "4d1b45a1c98552ca7e4ba4c0e583e266e9f70060" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/quantization/compressed_tensors.py", + "blob_id": "c3ba30bac87979a307fc5061a46f5d2cbf0efbf9" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/quantization/cutlass.py", + "blob_id": "c378b846d0c59de183a321fcad4b403c47b3d750" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/quantization/marlin.py", + "blob_id": "44d5d28a2fb67af955c017af3cf1403feeecbd32" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/quantization/scalar_type.py", + "blob_id": "9d711b0debcd8aaa343818edc9d6bbca20587d0a" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/quantization/utils/__init__.py", + "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/quantization/utils/marlin_utils.py", + "blob_id": "b1c94c38858a5cd6f02eb134d1a94b99a2b15566" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/quantization/utils/marlin_utils_fp8.py", + "blob_id": "b269fa6a4cee316e8299ecc86c3e7594b336b499" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/quantization/utils/marlin_utils_test.py", + "blob_id": "7d4f5f3cfbb872bf7b32e0972d6143b43f354a5e" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/quantization/utils/marlin_utils_test_24.py", + "blob_id": "927fa9016ba25f381c09d768db0c468066193a76" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/quantization/utils/marlin_utils_test_qqq.py", + "blob_id": "cb58eb945836393c58c53f5c6d702d53861c33f9" + }, + { + 
"filename": "build/torch26-cxx98-cu124-x86_64-linux/quantization/utils/quant_utils.py", + "blob_id": "d97e03913fa5980e0be73b160088c8e4f5f49a52" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/quantization/__init__.py", + "blob_id": "c3ab3b032c29f7bbafd549915dbc677c45a33837" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/quantization/_ops.py", + "blob_id": "7552c079934b6f76bb1c221358e0ac2f1ca449be" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/quantization/_quantization_cyhrq7sx4uskw.abi3.so", + "blob_id": "0a4c95a688fcacd6a10c04e101e1deec5a03ffc9" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/quantization/compressed_tensors.py", + "blob_id": "c3ba30bac87979a307fc5061a46f5d2cbf0efbf9" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/quantization/cutlass.py", + "blob_id": "c378b846d0c59de183a321fcad4b403c47b3d750" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/quantization/marlin.py", + "blob_id": "44d5d28a2fb67af955c017af3cf1403feeecbd32" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/quantization/scalar_type.py", + "blob_id": "9d711b0debcd8aaa343818edc9d6bbca20587d0a" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/quantization/utils/__init__.py", + "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/quantization/utils/marlin_utils.py", + "blob_id": "b1c94c38858a5cd6f02eb134d1a94b99a2b15566" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/quantization/utils/marlin_utils_fp8.py", + "blob_id": "b269fa6a4cee316e8299ecc86c3e7594b336b499" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/quantization/utils/marlin_utils_test.py", + "blob_id": "7d4f5f3cfbb872bf7b32e0972d6143b43f354a5e" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/quantization/utils/marlin_utils_test_24.py", + "blob_id": "927fa9016ba25f381c09d768db0c468066193a76" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/quantization/utils/marlin_utils_test_qqq.py", + "blob_id": "cb58eb945836393c58c53f5c6d702d53861c33f9" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/quantization/utils/quant_utils.py", + "blob_id": "d97e03913fa5980e0be73b160088c8e4f5f49a52" + } + ] + } +] diff --git a/server/pyproject.toml b/server/pyproject.toml index d64a143fd..3f02b4ec5 100644 --- a/server/pyproject.toml +++ b/server/pyproject.toml @@ -14,6 +14,7 @@ dependencies = [ "grpcio>=1.67.0", "grpcio-reflection>=1.67.0", "grpcio-status>=1.67.0", + "hf-kernels>=0.1.5", "hf-transfer>=0.1.8", "loguru>=0.7.3", "numpy>=1.26,<3", @@ -33,6 +34,15 @@ dependencies = [ "transformers>=4.48.0" ] +[build-system] +requires = ["hf-kernels>=0.1.2", "setuptools"] +build-backend = "setuptools.build_meta" + +[tool.kernels.dependencies] +"kernels-community/paged-attention" = ">=0.0.2" +"kernels-community/moe" = ">=0.1.1" +"kernels-community/quantization" = ">=0.0.3" + [project.scripts] text-generation-server = "text_generation_server.cli:app" @@ -60,24 +70,11 @@ quantize = [ "texttable>=1.6.7,<2", "datasets>=2.21,<3", ] -moe = [ "moe-kernels" ] -attention = [ "attention-kernels" ] -marlin = [ "marlin-kernels" ] gen = [ "grpcio-tools>=1.69.0", "mypy-protobuf>=3.6.0", ] -[tool.uv.sources] -attention-kernels.url = "https://github.com/danieldk/attention-kernels/releases/download/v0.2.0.post2/attention_kernels-0.2.0.post2+cu123torch2.5-cp39-abi3-linux_x86_64.whl" -marlin-kernels = [ - { url = 
"https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp39-cp39-linux_x86_64.whl", marker = "python_version == '3.9'" }, - { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp310-cp310-linux_x86_64.whl", marker = "python_version == '3.10'" }, - { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp311-cp311-linux_x86_64.whl", marker = "python_version == '3.11'" }, - { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp312-cp312-linux_x86_64.whl", marker = "python_version == '3.12'" }, -] -moe-kernels.url = "https://github.com/danieldk/moe-kernels/releases/download/v0.8.2/moe_kernels-0.8.2+cu123torch2.5-cp39-abi3-linux_x86_64.whl" - [tool.pytest.ini_options] markers = ["private: marks tests as requiring an admin hf token (deselect with '-m \"not private\"')"] diff --git a/server/text_generation_server/layers/attention/cuda.py b/server/text_generation_server/layers/attention/cuda.py index 6b77f591c..4f25cc192 100644 --- a/server/text_generation_server/layers/attention/cuda.py +++ b/server/text_generation_server/layers/attention/cuda.py @@ -1,6 +1,7 @@ import torch from text_generation_server.layers.attention.kv_cache import KVCache, KVScales from text_generation_server.utils.import_utils import SYSTEM +from text_generation_server.utils.kernels import load_kernel from text_generation_server.models.globals import ( ATTENTION, BLOCK_SIZE, @@ -13,6 +14,18 @@ major, minor = torch.cuda.get_device_capability() is_sm75 = major == 7 and minor == 5 _PARTITION_SIZE = 512 +if SYSTEM == "cuda": + try: + paged_attention_kernels = load_kernel( + module="paged_attention", repo_id="kernels-community/paged-attention" + ) + except Exception as e: + raise ImportError( + f"Could not import attention kernels. Make sure your installation is correct. Complete error: {e}" + ) +else: + paged_attention_kernels = None + def paged_attention( query: torch.Tensor, @@ -107,7 +120,6 @@ def paged_attention( if softcap is not None: raise RuntimeError("Paged attention doesn't support softcapping") input_lengths = seqlen.input_lengths + seqlen.cache_lengths - import attention_kernels out = torch.empty_like(query) @@ -117,7 +129,7 @@ def paged_attention( max_num_partitions == 1 or num_seqs * num_heads > 512 ) if use_v1: - attention_kernels.paged_attention_v1( + paged_attention_kernels.paged_attention_v1( out, query, kv_cache.key, @@ -130,8 +142,8 @@ def paged_attention( max_s, None, kv_cache_dtype, - kv_scales.key_scale_cpu, - kv_scales.value_scale_cpu, + torch.tensor(kv_scales.key_scale_cpu if can_scale else 1.0), + torch.tensor(kv_scales.value_scale_cpu if can_scale else 1.0), ) else: # Run PagedAttention V2. 
@@ -148,7 +160,7 @@ def paged_attention( ) max_logits = torch.empty_like(exp_sums) - attention_kernels.paged_attention_v2( + paged_attention_kernels.paged_attention_v2( out, exp_sums, max_logits, @@ -164,8 +176,8 @@ def paged_attention( max_s, None, kv_cache_dtype, - kv_scales.key_scale_cpu, - kv_scales.value_scale_cpu, + torch.tensor(kv_scales.key_scale_cpu if can_scale else 1.0), + torch.tensor(kv_scales.value_scale_cpu if can_scale else 1.0), ) return out diff --git a/server/text_generation_server/layers/attention/kv_cache.py b/server/text_generation_server/layers/attention/kv_cache.py index bca7959bd..aaf4d2b22 100644 --- a/server/text_generation_server/layers/attention/kv_cache.py +++ b/server/text_generation_server/layers/attention/kv_cache.py @@ -7,9 +7,22 @@ import torch from text_generation_server.layers.fp8 import fp8_quantize from text_generation_server.models.globals import ATTENTION, BLOCK_SIZE from text_generation_server.utils.import_utils import SYSTEM +from text_generation_server.utils.kernels import load_kernel from text_generation_server.utils.log import log_once from text_generation_server.utils.weights import Weights +if SYSTEM == "cuda": + try: + paged_attention = load_kernel( + module="paged_attention", repo_id="kernels-community/paged-attention" + ) + except Exception as e: + raise ImportError( + f"Could not import attention kernels. Make sure your installation is correct. Complete error: {e}" + ) +else: + paged_attention = None + @dataclass class KVScales: @@ -119,7 +132,7 @@ class KVCache: if kv_scales.key_scale_cpu == 1.0 and kv_scales.value_scale_cpu == 1.0: return False elif self.dtype == torch.float8_e4m3fn and ( - (ATTENTION == "flashinfer" and SYSTEM == "cuda") + (ATTENTION in ("paged", "flashinfer") and SYSTEM == "cuda") or (ATTENTION == "paged" and SYSTEM == "rocm") ): log_once(logger.info, "Using FP8 KV cache scales") @@ -220,19 +233,19 @@ def paged_reshape_and_cache( ): if SYSTEM == "cuda": - try: - import attention_kernels - except Exception as e: - raise ImportError( - f"Could not import attention_kernels. Make sure your installation is correct. 
Complete error: {e}" - ) - kv_cache_dtype = "auto" if key_cache.dtype == torch.float8_e4m3fn: kv_cache_dtype = "fp8" - attention_kernels.reshape_and_cache( - key, value, key_cache, value_cache, slots, kv_cache_dtype, k_scale, v_scale + paged_attention.reshape_and_cache( + key, + value, + key_cache, + value_cache, + slots, + kv_cache_dtype, + torch.tensor(k_scale), + torch.tensor(v_scale), ) elif SYSTEM == "rocm": try: diff --git a/server/text_generation_server/layers/compressed_tensors/w8a8_int.py b/server/text_generation_server/layers/compressed_tensors/w8a8_int.py index fc6d81e49..b66057eca 100644 --- a/server/text_generation_server/layers/compressed_tensors/w8a8_int.py +++ b/server/text_generation_server/layers/compressed_tensors/w8a8_int.py @@ -6,13 +6,17 @@ import torch from compressed_tensors.quantization import QuantizationArgs, QuantizationType from text_generation_server.layers.fp8 import _load_scalar_or_matrix_scale +from text_generation_server.utils.import_utils import SYSTEM +from text_generation_server.utils.kernels import load_kernel from text_generation_server.utils.log import log_once from text_generation_server.utils.weights import Weight, Weights, WeightsLoader -try: - import marlin_kernels -except ImportError: - marlin_kernels = None +if SYSTEM == "cuda": + quantization = load_kernel( + module="quantization", repo_id="kernels-community/quantization" + ) +else: + quantization = None class W8A8IntLoader(WeightsLoader): @@ -159,8 +163,8 @@ class Int8Weight(Weight): def get_linear(self, bias: torch.Tensor): if self.weight_scale is None: - assert marlin_kernels is not None - qweight, weight_scale, _ = marlin_kernels.scaled_int8_quant(self.weight) + assert quantization is not None + qweight, weight_scale, _ = quantization.scaled_int8_quant(self.weight) return W8A8IntLinear( bias=bias, input_symmetric=self.input_symmetric, @@ -204,9 +208,9 @@ class W8A8IntLinear(torch.nn.Module): ) def forward(self, input: torch.Tensor) -> torch.Tensor: - assert marlin_kernels is not None + assert quantization is not None - qinput, input_scale, input_zero_point = marlin_kernels.scaled_int8_quant( + qinput, input_scale, input_zero_point = quantization.scaled_int8_quant( input=input, scale=None, azp=None, @@ -214,7 +218,7 @@ class W8A8IntLinear(torch.nn.Module): ) if self.input_symmetric: - return marlin_kernels.cutlass_scaled_mm( + return quantization.cutlass_scaled_mm( a=qinput, b=self.weight, scale_a=input_scale, @@ -229,7 +233,7 @@ class W8A8IntLinear(torch.nn.Module): and (self.input_symmetric or input_zero_point is not None) ) - return marlin_kernels.cutlass_scaled_mm_azp( + return quantization.cutlass_scaled_mm_azp( a=qinput, b=self.weight, scale_a=input_scale, diff --git a/server/text_generation_server/layers/fp8.py b/server/text_generation_server/layers/fp8.py index ae20235dd..04689ed92 100644 --- a/server/text_generation_server/layers/fp8.py +++ b/server/text_generation_server/layers/fp8.py @@ -6,6 +6,7 @@ import torch from loguru import logger from text_generation_server.utils.import_utils import SYSTEM +from text_generation_server.utils.kernels import load_kernel from text_generation_server.utils.weights import ( Weight, WeightsLoader, @@ -14,10 +15,12 @@ from text_generation_server.utils.weights import ( ) from text_generation_server.utils.log import log_once -try: - import marlin_kernels -except ImportError: - marlin_kernels = None +if SYSTEM == "cuda": + quantization = load_kernel( + module="quantization", repo_id="kernels-community/quantization" + ) +else: + quantization = None 
try: from moe_kernels.fp8_utils import w8a8_block_fp8_matmul, per_token_group_quant_fp8 @@ -29,9 +32,9 @@ quant_dtype: torch.dtype = ( torch.float8_e4m3fnuz if SYSTEM == "rocm" else torch.float8_e4m3fn ) -if SYSTEM == "cuda" and marlin_kernels is not None: +if SYSTEM == "cuda" and quantization is not None: major, minor = torch.cuda.get_device_capability() - CUTLASS_FP8_AVAILABLE = marlin_kernels.cutlass_scaled_mm_supports_fp8( + CUTLASS_FP8_AVAILABLE = quantization.cutlass_scaled_mm_supports_fp8( major * 10 + minor ) else: @@ -143,11 +146,10 @@ def fp8_quantize( argument, it must also be a reciprocal (so that scales from an FP8 checkpoint can be used without modification). """ - if marlin_kernels is not None: + if quantization is not None: shape = weight.shape - qweight, scale = marlin_kernels.scaled_fp8_quant( + qweight, scale = quantization.scaled_fp8_quant( weight.reshape(-1, shape[-1]), - dtype=quant_dtype, scale=scale, scale_ub=scale_upper_bound, # TODO: don't do this when we have to use the Torch kernel. @@ -527,7 +529,7 @@ class Fp8Linear(torch.nn.Module): qinput, scale = fp8_quantize( input, scale_upper_bound=self.scale_upper_bound, scalar=False ) - return marlin_kernels.cutlass_scaled_mm( + return quantization.cutlass_scaled_mm( qinput, self.qweight.t(), scale, self.scale, input.dtype, self.bias ) diff --git a/server/text_generation_server/layers/marlin/fp8.py b/server/text_generation_server/layers/marlin/fp8.py index e07b9fc6c..10751a056 100644 --- a/server/text_generation_server/layers/marlin/fp8.py +++ b/server/text_generation_server/layers/marlin/fp8.py @@ -8,11 +8,15 @@ from text_generation_server.layers.marlin.util import ( _check_marlin_kernels, permute_scales, ) +from text_generation_server.utils.import_utils import SYSTEM +from text_generation_server.utils.kernels import load_kernel -try: - import marlin_kernels -except ImportError: - marlin_kernels = None +if SYSTEM == "cuda": + quantization = load_kernel( + module="quantization", repo_id="kernels-community/quantization" + ) +else: + quantization = None MARLIN_TILE_SIZE = 16 @@ -32,7 +36,7 @@ class GPTQMarlinFP8Linear(nn.Module): super().__init__() _check_marlin_kernels() - assert marlin_kernels is not None + assert quantization is not None scales = scales.unsqueeze(0) if scales.shape[1] == 1: @@ -69,10 +73,10 @@ class GPTQMarlinFP8Linear(nn.Module): return cls(qweight=weight, scales=scale.to(dtype), bias=bias) def forward(self, A: torch.Tensor) -> torch.Tensor: - assert marlin_kernels is not None + assert quantization is not None A_flat = A.view(-1, A.shape[-1]) - C = marlin_kernels.fp8_marlin_gemm( + C = quantization.fp8_marlin_gemm( A_flat, self.qweight, self.scales, @@ -134,7 +138,7 @@ def repack_fp8_for_marlin(weight: torch.Tensor, scales: torch.Tensor): qweight = pack_fp8_as_int32(weight.t()) perm = torch.empty(0, dtype=torch.int, device=qweight.device) - repacked = marlin_kernels.gptq_marlin_repack( + repacked = quantization.gptq_marlin_repack( qweight, perm, in_features, out_features, 8 ) diff --git a/server/text_generation_server/layers/marlin/gptq.py b/server/text_generation_server/layers/marlin/gptq.py index 5c1bb5496..e85c83331 100644 --- a/server/text_generation_server/layers/marlin/gptq.py +++ b/server/text_generation_server/layers/marlin/gptq.py @@ -12,13 +12,17 @@ from text_generation_server.layers.marlin.util import ( unpack_cols, ) from text_generation_server.utils.import_utils import SYSTEM +from text_generation_server.utils.kernels import load_kernel from text_generation_server.utils.log import 
log_once from text_generation_server.utils.weights import Weight, Weights, WeightsLoader -try: - import marlin_kernels -except ImportError: - marlin_kernels = None +if SYSTEM == "cuda": + quantization = load_kernel( + module="quantization", repo_id="kernels-community/quantization" + ) +else: + quantization = None + try: major, _minor = torch.cuda.get_device_capability() @@ -37,7 +41,7 @@ def can_use_gptq_marlin( ) -> bool: return ( SYSTEM == "cuda" - and marlin_kernels is not None + and quantization is not None and has_sm_8_0 and quantize in {"awq", "gptq"} and quant_method in {"awq", "gptq"} @@ -287,7 +291,7 @@ def repack_gptq_for_marlin( ) -> GPTQMarlinWeight: """Convert GPTQ weights to a layout that's compatible with GPTQ-Marlin kernels.""" _check_marlin_kernels() - assert marlin_kernels is not None + assert quantization is not None if bits not in GPTQ_MARLIN_BITS: supported_bits = ", ".join(str(b) for b in GPTQ_MARLIN_BITS) @@ -330,7 +334,7 @@ def repack_gptq_for_marlin( g_idx = torch.empty(0, dtype=torch.int, device=qweight.device) if quant_method == "awq": - repacked = marlin_kernels.awq_marlin_repack( + repacked = quantization.awq_marlin_repack( qweight, in_features, out_features, bits ) if qzeros is not None: @@ -342,7 +346,7 @@ def repack_gptq_for_marlin( ) else: - repacked = marlin_kernels.gptq_marlin_repack( + repacked = quantization.gptq_marlin_repack( qweight, perm, in_features, out_features, bits ) @@ -379,13 +383,26 @@ class GPTQMarlinLinear(nn.Module): super().__init__() _check_marlin_kernels() - assert marlin_kernels is not None + assert quantization is not None in_features = weight.qweight.shape[0] * MARLIN_TILE_SIZE out_features = weight.scales.shape[1] _check_valid_shape(in_features=in_features, out_features=out_features) - self.bits = weight.bits + if weight.bits not in (4, 8): + raise ValueError("GPTQMarlinLinear only supports 4 and 8-bit quantization") + + if weight.qzeros.numel() > 0: + if weight.bits == 4: + self.quant_type = quantization.scalar_types.uint4 + else: + self.quant_type = quantization.scalar_types.uint8 + else: + if weight.bits == 4: + self.quant_type = quantization.scalar_types.uint4b8 + else: + self.quant_type = quantization.scalar_types.uint8b128 + self.is_full_k = weight.is_full_k self.qweight = weight.qweight @@ -403,10 +420,10 @@ class GPTQMarlinLinear(nn.Module): ) def forward(self, A: torch.Tensor) -> torch.Tensor: - assert marlin_kernels is not None + assert quantization is not None A_flat = A.view(-1, A.shape[-1]) - C = marlin_kernels.gptq_marlin_gemm( + C = quantization.gptq_marlin_gemm( A_flat, self.qweight, self.scales, @@ -414,7 +431,7 @@ class GPTQMarlinLinear(nn.Module): self.g_idx, self.perm, self.workspace, - self.bits, + self.quant_type, A_flat.shape[0], self.scales.shape[1], A_flat.shape[1], diff --git a/server/text_generation_server/layers/marlin/marlin.py b/server/text_generation_server/layers/marlin/marlin.py index 1c80e31ec..48aedc72c 100644 --- a/server/text_generation_server/layers/marlin/marlin.py +++ b/server/text_generation_server/layers/marlin/marlin.py @@ -3,13 +3,18 @@ from typing import List, Optional, Union import torch import torch.nn as nn + from text_generation_server.layers.marlin.util import _check_marlin_kernels +from text_generation_server.utils.import_utils import SYSTEM +from text_generation_server.utils.kernels import load_kernel from text_generation_server.utils.weights import Weight, Weights, WeightsLoader -try: - import marlin_kernels -except ImportError: - marlin_kernels = None +if SYSTEM == "cuda": + 
quantization = load_kernel( + module="quantization", repo_id="kernels-community/quantization" + ) +else: + quantization = None class MarlinWeightsLoader(WeightsLoader): @@ -187,7 +192,7 @@ class MarlinLinear(nn.Module): super().__init__() _check_marlin_kernels() - assert marlin_kernels is not None + assert quantization is not None in_features = weight.B.shape[0] * MARLIN_TILE_SIZE out_features = weight.s.shape[1] @@ -216,9 +221,9 @@ class MarlinLinear(nn.Module): ) def forward(self, A: torch.Tensor) -> torch.Tensor: - assert marlin_kernels is not None + assert quantization is not None - C = marlin_kernels.marlin_gemm( + C = quantization.marlin_gemm( A.view(-1, A.shape[-1]), self.B, self.s, @@ -277,7 +282,7 @@ class GPTQMarlin24Linear(nn.Module): super().__init__() _check_marlin_kernels() - assert marlin_kernels is not None + assert quantization is not None if weight.bits not in GPTQ_MARLIN_24_SUPPORTED_NUM_BITS: supported_bits = ", ".join( @@ -303,8 +308,11 @@ class GPTQMarlin24Linear(nn.Module): f"Group size {groupsize} is not supported, must be one of: {supported_sizes}" ) - self.bits = weight.bits - weights_per_int32 = 32 // self.bits + if weight.bits == 4: + self.quant_type = quantization.scalar_types.uint4b8 + else: + self.quant_type = quantization.scalar_types.uint8b128 + weights_per_int32 = 32 // weight.bits assert ( out_features % GPTQ_MARLIN_24_MIN_THREAD_N == 0 @@ -336,15 +344,15 @@ class GPTQMarlin24Linear(nn.Module): ) def forward(self, A: torch.Tensor) -> torch.Tensor: - assert marlin_kernels is not None + assert quantization is not None - C = marlin_kernels.gptq_marlin_24_gemm( + C = quantization.gptq_marlin_24_gemm( A.view(-1, A.shape[-1]), self.weight_packed, self.meta, self.scale_packed, self.workspace, - self.bits, + self.quant_type, A.shape[0], self.scale_packed.shape[1], A.shape[1], diff --git a/server/text_generation_server/layers/marlin/util.py b/server/text_generation_server/layers/marlin/util.py index 250d17141..0c5d715fa 100644 --- a/server/text_generation_server/layers/marlin/util.py +++ b/server/text_generation_server/layers/marlin/util.py @@ -4,11 +4,14 @@ from typing import List, Tuple import numpy import torch from text_generation_server.utils.import_utils import SYSTEM +from text_generation_server.utils.kernels import load_kernel -try: - import marlin_kernels -except ImportError: - marlin_kernels = None +if SYSTEM == "cuda": + quantization = load_kernel( + module="quantization", repo_id="kernels-community/quantization" + ) +else: + quantization = None try: major, _minor = torch.cuda.get_device_capability() @@ -23,7 +26,7 @@ def _check_marlin_kernels(): "Using quantized Marlin models requires a GPU with CUDA capability 8.0 or later." 
) - if marlin_kernels is None: + if quantization is None: raise NotImplementedError( "marlin is not installed, install it with: pip install server/marlin" ) diff --git a/server/text_generation_server/layers/moe/__init__.py b/server/text_generation_server/layers/moe/__init__.py index 23d0d38c3..e148f1659 100644 --- a/server/text_generation_server/layers/moe/__init__.py +++ b/server/text_generation_server/layers/moe/__init__.py @@ -18,6 +18,7 @@ from text_generation_server.layers.moe.gptq_marlin import ( from text_generation_server.layers.moe.unquantized import UnquantizedSparseMoELayer from text_generation_server.layers.moe.fp8 import FP8SparseMoELayer from text_generation_server.utils.import_utils import SYSTEM +from text_generation_server.utils.kernels import load_kernel from text_generation_server.utils.log import log_once from text_generation_server.utils.weights import ( DefaultWeightsLoader, @@ -27,6 +28,10 @@ from text_generation_server.utils.weights import ( if SYSTEM == "ipex": from .fused_moe_ipex import fused_topk, grouped_topk +elif SYSTEM == "cuda": + moe_kernels = load_kernel(module="moe", repo_id="kernels-community/moe") + fused_topk = moe_kernels.fused_topk + grouped_topk = moe_kernels.grouped_topk else: from moe_kernels.fused_moe import fused_topk, grouped_topk diff --git a/server/text_generation_server/layers/moe/fp8.py b/server/text_generation_server/layers/moe/fp8.py index 3016c8a24..071b2abee 100644 --- a/server/text_generation_server/layers/moe/fp8.py +++ b/server/text_generation_server/layers/moe/fp8.py @@ -12,7 +12,7 @@ from text_generation_server.layers.fp8 import ( ) try: - from moe_kernels.fused_moe import fused_moe + from .unquantized import fused_moe except Exception: fused_moe = None diff --git a/server/text_generation_server/layers/moe/gptq_marlin.py b/server/text_generation_server/layers/moe/gptq_marlin.py index 014a90dcf..75c076ab2 100644 --- a/server/text_generation_server/layers/moe/gptq_marlin.py +++ b/server/text_generation_server/layers/moe/gptq_marlin.py @@ -1,10 +1,12 @@ from dataclasses import dataclass -from typing import List, Optional +from typing import Callable, List, Optional import torch import torch.nn as nn +from text_generation_server.layers import moe from text_generation_server.utils.import_utils import SYSTEM +from text_generation_server.utils.kernels import load_kernel from text_generation_server.utils.weights import Weights from text_generation_server.layers.marlin.gptq import ( GPTQMarlinWeight, @@ -12,9 +14,9 @@ from text_generation_server.layers.marlin.gptq import ( ) if SYSTEM == "cuda": - from moe_kernels.fused_marlin_moe import fused_marlin_moe + moe_kernels = load_kernel(module="moe", repo_id="kernels-community/moe") else: - fused_marlin_moe = None + moe_kernels = None try: @@ -32,7 +34,7 @@ def can_use_marlin_moe_gemm( ): return ( SYSTEM == "cuda" - and fused_marlin_moe is not None + and moe is not None and has_sm_8_0 and quantize in {"awq", "gptq"} and quant_method in {"awq", "gptq"} @@ -230,3 +232,111 @@ def _pack_weight( moe_weight.perm[expert] = weight.perm return moe_weight + + +def fused_marlin_moe( + *, + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + w1_scale: Optional[torch.Tensor] = None, + w2_scale: Optional[torch.Tensor] = None, + gating_output: torch.Tensor, + g_idx1: torch.Tensor, + g_idx2: torch.Tensor, + sort_indices1: torch.Tensor, + sort_indices2: torch.Tensor, + w1_zeros: Optional[torch.Tensor] = None, + w2_zeros: Optional[torch.Tensor] = None, + is_k_full: bool, + topk: int, + 
renormalize: bool, + num_bits: int = 8, + use_grouped_topk: bool = False, + num_expert_group: Optional[int] = None, + custom_routing_function: Optional[Callable] = None, + topk_group: Optional[int] = None, +) -> torch.Tensor: + """ + This function computes a Mixture of Experts (MoE) layer using two sets of + weights, w1 and w2, and top-k gating mechanism. + + Parameters: + - hidden_states (torch.Tensor): The input tensor to the MoE layer. + - w1 (torch.Tensor): The first set of expert weights. + - w2 (torch.Tensor): The second set of expert weights. + - w1_scale (Optional[torch.Tensor]): Optional scale to be used for + w1. + - w2_scale (Optional[torch.Tensor]): Optional scale to be used for + w2. + - gating_output (torch.Tensor): The output of the gating operation + (before softmax). + - g_idx1 (torch.Tensor): The first set of act_order indices. + - g_idx2 (torch.Tensor): The second set of act_order indices. + - sort_indices1 (torch.Tensor): The first act_order input permutation. + - sort_indices2 (torch.Tensor): The second act_order input permutation. + - w1_zeros (Optional[torch.Tensor]): Optional zero points to be used for w1. + - w2_zeros (Optional[torch.Tensor]): Optional zero points to be used for w2. + - renormalize (bool): If True, renormalize the top-k weights to sum to 1. + - num_bits (bool): The number of bits in expert weights quantization. + + Returns: + - torch.Tensor: The output tensor after applying the MoE layer. + """ + # Check constraints. + assert hidden_states.shape[0] == gating_output.shape[0], "Number of tokens mismatch" + assert hidden_states.shape[1] == w1.shape[1] * 16, "Hidden size mismatch w1" + assert hidden_states.shape[1] == w2.shape[2] // ( + num_bits // 2 + ), "Hidden size mismatch w2" + assert gating_output.shape[1] == w1.shape[0], "Number of experts mismatch" + assert hidden_states.is_contiguous(), "Hidden_states must be contiguous" + assert w1.is_contiguous(), "Expert weights1 must be contiguous" + assert w2.is_contiguous(), "Expert weights2 must be contiguous" + assert hidden_states.dtype == torch.float16 + assert num_bits in [4, 8] + + # DeekSeekv2 uses grouped_top_k + if use_grouped_topk: + assert topk_group is not None + assert num_expert_group is not None + topk_weights, topk_ids = moe_kernels.grouped_topk( + hidden_states=hidden_states, + gating_output=gating_output, + topk=topk, + renormalize=renormalize, + num_expert_group=num_expert_group, + topk_group=topk_group, + ) + elif custom_routing_function is None: + topk_weights, topk_ids = moe_kernels.fused_topk( + hidden_states=hidden_states, + gating_output=gating_output, + topk=topk, + renormalize=renormalize, + ) + else: + topk_weights, topk_ids = custom_routing_function( + hidden_states=hidden_states, + gating_output=gating_output, + topk=topk, + renormalize=renormalize, + ) + return moe_kernels.fused_marlin_moe( + hidden_states=hidden_states, + w1=w1, + w2=w2, + w1_scale=w1_scale, + w2_scale=w2_scale, + gating_output=gating_output, + topk_weights=topk_weights, + topk_ids=topk_ids, + g_idx1=g_idx1, + g_idx2=g_idx2, + sort_indices1=sort_indices1, + sort_indices2=sort_indices2, + w1_zeros=w1_zeros, + w2_zeros=w2_zeros, + num_bits=num_bits, + is_k_full=is_k_full, + ) diff --git a/server/text_generation_server/layers/moe/unquantized.py b/server/text_generation_server/layers/moe/unquantized.py index 9277384ae..772142861 100644 --- a/server/text_generation_server/layers/moe/unquantized.py +++ b/server/text_generation_server/layers/moe/unquantized.py @@ -1,15 +1,18 @@ -from typing import Optional +from 
typing import Callable, List, Optional import torch import torch.nn as nn from text_generation_server.utils.import_utils import SYSTEM +from text_generation_server.utils.kernels import load_kernel from text_generation_server.utils.weights import UnquantizedWeight, Weights if SYSTEM == "ipex": from intel_extension_for_pytorch.llm.modules import GatedMLPMOE +elif SYSTEM == "cuda": + moe_kernels = load_kernel(module="moe", repo_id="kernels-community/moe") else: - from moe_kernels.fused_moe import fused_moe + import moe_kernels class UnquantizedSparseMoELayer(nn.Module): @@ -63,7 +66,17 @@ class UnquantizedSparseMoELayer(nn.Module): ) def forward(self, x: torch.Tensor, *, gating_output: torch.Tensor) -> torch.Tensor: - if SYSTEM == "ipex": + if SYSTEM == "rocm": + return moe_kernels.fused_moe( + x, + self.gate_up_proj, + self.down_proj, + gating_output, + self.topk, + renormalize=self.renormalize, + inplace=True, + ) + elif SYSTEM == "ipex": return self.ipex_fused_moe( hidden_states=x, router_logits=gating_output, @@ -146,3 +159,110 @@ def _load_expert_weights_row( assert all_weight is not None return all_weight + + +def fused_moe( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + gating_output: torch.Tensor, + topk: int, + renormalize: bool, + inplace: bool = False, + use_grouped_topk: bool = False, + num_expert_group: Optional[int] = None, + topk_group: Optional[int] = None, + custom_routing_function: Optional[Callable] = None, + scoring_func: str = "softmax", + e_score_correction_bias: Optional[torch.Tensor] = None, + use_fp8_w8a8: bool = False, + use_int8_w8a16: bool = False, + use_int4_w4a16: bool = False, + w1_scale: Optional[torch.Tensor] = None, + w2_scale: Optional[torch.Tensor] = None, + a1_scale: Optional[torch.Tensor] = None, + a2_scale: Optional[torch.Tensor] = None, + block_shape: Optional[List[int]] = None, +) -> torch.Tensor: + """ + This function computes a Mixture of Experts (MoE) layer using two sets of + weights, w1 and w2, and top-k gating mechanism. + + Parameters: + - hidden_states (torch.Tensor): The input tensor to the MoE layer. + - w1 (torch.Tensor): The first set of expert weights. + - w2 (torch.Tensor): The second set of expert weights. + - gating_output (torch.Tensor): The output of the gating operation + (before softmax). + - topk (int): The number of top-k experts to select. + - renormalize (bool): If True, renormalize the top-k weights to sum to 1. + - inplace (bool): If True, perform the operation in-place. + Defaults to False. + - num_expert_group: Optional[int]: additional parameter for grouped_topk + - topk_group: Optional[int]: additional parameter for grouped_topk + - use_grouped_topk: If True, use grouped_topk instead of fused_topk + note: Deepseekv2 model uses grouped_topk + - use_fp8_w8a8 (bool): If True, use fp8 arithmetic to compute the inner + products for w1 and w2. Defaults to False. + - use_int8_w8a16 (bool): If True, use fp8 arithmetic to compute the inner + products for w1 and w2. Defaults to False. + - use_int4_w4a16 (bool): If True, use matmul of int4 weight and bf16/fp16 + activation to compute the inner products for w1 and w2. + Defaults to False. + - w1_scale (Optional[torch.Tensor]): Optional scale to be used for + w1. + - w2_scale (Optional[torch.Tensor]): Optional scale to be used for + w2. + - a1_scale (Optional[torch.Tensor]): Optional scale to be used for + a1. + - a2_scale (Optional[torch.Tensor]): Optional scale to be used for + a2. 
+ - block_shape: (Optional[List[int]]): Optional block size for block-wise + quantization. + Returns: + - torch.Tensor: The output tensor after applying the MoE layer. + """ + # Check constraints. + assert gating_output.shape[1] == w1.shape[0], "Number of experts mismatch" + + if use_grouped_topk: + assert num_expert_group is not None and topk_group is not None + from loguru import logger + import inspect + + logger.info(f"{inspect.signature(moe_kernels.grouped_topk)}") + topk_weights, topk_ids = moe_kernels.grouped_topk( + hidden_states, + gating_output, + topk, + renormalize, + num_expert_group, + topk_group, + scoring_func=scoring_func, + e_score_correction_bias=e_score_correction_bias, + ) + elif custom_routing_function is None: + topk_weights, topk_ids = moe_kernels.fused_topk( + hidden_states, gating_output, topk, renormalize + ) + else: + topk_weights, topk_ids = custom_routing_function( + hidden_states, gating_output, topk, renormalize + ) + + return moe_kernels.fused_experts( + hidden_states, + w1, + w2, + topk_weights, + topk_ids, + inplace=inplace, + use_fp8_w8a8=use_fp8_w8a8, + use_int8_w8a16=use_int8_w8a16, + use_int4_w4a16=use_int4_w4a16, + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a1_scale, + a2_scale=a2_scale, + block_shape=block_shape, + ) diff --git a/server/text_generation_server/models/custom_modeling/flash_dbrx_modeling.py b/server/text_generation_server/models/custom_modeling/flash_dbrx_modeling.py index aa0327825..225fae775 100644 --- a/server/text_generation_server/models/custom_modeling/flash_dbrx_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_dbrx_modeling.py @@ -22,11 +22,14 @@ from transformers.configuration_utils import PretrainedConfig from typing import Optional, List, Tuple, Any from text_generation_server.layers.attention.kv_cache import get_kv_scales from text_generation_server.utils.import_utils import SYSTEM +from text_generation_server.utils.kernels import load_kernel if SYSTEM == "ipex": from intel_extension_for_pytorch.llm.modules import GatedMLPMOE +elif SYSTEM == "cuda": + moe_kernels = load_kernel(module="moe", repo_id="kernels-community/moe") else: - from moe_kernels.fused_moe import fused_moe + import moe_kernels from text_generation_server.layers.attention import ( paged_attention, @@ -510,7 +513,7 @@ class BlockSparseMoE(nn.Module): topk_group=None, ) else: - out = fused_moe( + out = moe_kernels.fused_moe( x, self.wv1, self.w2, diff --git a/server/text_generation_server/utils/kernels.py b/server/text_generation_server/utils/kernels.py new file mode 100644 index 000000000..42745c716 --- /dev/null +++ b/server/text_generation_server/utils/kernels.py @@ -0,0 +1,22 @@ +import importlib + +from loguru import logger +from hf_kernels import load_kernel as hf_load_kernel + +from text_generation_server.utils.log import log_once + + +def load_kernel(*, module: str, repo_id: str): + """ + Load a kernel. First try to load it as the given module (e.g. for + local development), falling back to a locked Hub kernel. 
+ """ + try: + m = importlib.import_module(module) + log_once(logger.info, f"Using local module for `{module}`") + return m + except ModuleNotFoundError: + return hf_load_kernel(repo_id=repo_id) + + +__all__ = ["load_kernel"] diff --git a/server/uv.lock b/server/uv.lock index 5410a58cb..284def0bb 100644 --- a/server/uv.lock +++ b/server/uv.lock @@ -168,20 +168,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/fe/ba/e2081de779ca30d473f21f5b30e0e737c438205440784c7dfc81efc2b029/async_timeout-5.0.1-py3-none-any.whl", hash = "sha256:39e3809566ff85354557ec2398b55e096c8364bacac9405a7a1fa429e77fe76c", size = 6233 }, ] -[[package]] -name = "attention-kernels" -version = "0.2.0.post2" -source = { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.2.0.post2/attention_kernels-0.2.0.post2+cu123torch2.5-cp39-abi3-linux_x86_64.whl" } -dependencies = [ - { name = "torch" }, -] -wheels = [ - { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.2.0.post2/attention_kernels-0.2.0.post2+cu123torch2.5-cp39-abi3-linux_x86_64.whl", hash = "sha256:863e02dda4b30e9d04ef6cf4d17d16c154f54bdcb8a8b87b8b46075eabf62d25" }, -] - -[package.metadata] -requires-dist = [{ name = "torch" }] - [[package]] name = "attrs" version = "24.3.0" @@ -676,6 +662,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/64/51/f6b198152399d17247d962340947728fb1b06da6bc0c0a542446b2ffee49/grpcio_tools-1.69.0-cp39-cp39-win_amd64.whl", hash = "sha256:5d47abf7e0662dd5dbb9cc252c3616e5fbc5f71d34e3f6332cd24bcdf2940abd", size = 1114931 }, ] +[[package]] +name = "hf-kernels" +version = "0.1.6" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "huggingface-hub" }, + { name = "packaging" }, + { name = "tomli", marker = "python_full_version < '3.11'" }, + { name = "torch" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/01/fe/5aa3ea1b66bcc7d81aff19683ea04d4a9cd414c8d4ff05b150fc1f196ccd/hf_kernels-0.1.6.tar.gz", hash = "sha256:5effee5046552ce226ff86d3870a799f4ecae399bcb2beb4046c28c2dd736d2f", size = 8704 } + [[package]] name = "hf-transfer" version = "0.1.9" @@ -906,86 +904,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b3/73/085399401383ce949f727afec55ec3abd76648d04b9f22e1c0e99cb4bec3/MarkupSafe-3.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:6e296a513ca3d94054c2c881cc913116e90fd030ad1c656b3869762b754f5f8a", size = 15506 }, ] -[[package]] -name = "marlin-kernels" -version = "0.3.7" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.13'", -] -dependencies = [ - { name = "torch", marker = "python_full_version >= '3.13'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/b2/82/886d1eece474ef23668c4780f5053ea654999704a0195aadc651631b740d/marlin-kernels-0.3.7.tar.gz", hash = "sha256:8be8a65fd9ae21b2406afba9e460e3922582479b85a1372096e87e3a15684a77", size = 15662 } - -[[package]] -name = "marlin-kernels" -version = "0.3.7" -source = { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp310-cp310-linux_x86_64.whl" } -resolution-markers = [ - "python_full_version == '3.10.*'", -] -dependencies = [ - { name = "torch", marker = "python_full_version == '3.10.*'" }, -] -wheels = [ - { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp310-cp310-linux_x86_64.whl", hash = "sha256:dd91a4e2c3b5e954833c5c34b0322e4c02cd92a967eb94654b6bbcece131340b" }, -] - 
-[package.metadata] -requires-dist = [{ name = "torch" }] - -[[package]] -name = "marlin-kernels" -version = "0.3.7" -source = { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp311-cp311-linux_x86_64.whl" } -resolution-markers = [ - "python_full_version == '3.11.*'", -] -dependencies = [ - { name = "torch", marker = "python_full_version == '3.11.*'" }, -] -wheels = [ - { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp311-cp311-linux_x86_64.whl", hash = "sha256:b24d92135fbd156c55ce43158ab4a90fa880ba0df965528895cf1870b03a64bf" }, -] - -[package.metadata] -requires-dist = [{ name = "torch" }] - -[[package]] -name = "marlin-kernels" -version = "0.3.7" -source = { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp312-cp312-linux_x86_64.whl" } -resolution-markers = [ - "python_full_version == '3.12.*'", -] -dependencies = [ - { name = "torch", marker = "python_full_version == '3.12.*'" }, -] -wheels = [ - { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp312-cp312-linux_x86_64.whl", hash = "sha256:8a407f1435a571a8d4ca3b9f533da83fde323043a9836b739cf8018c77782d49" }, -] - -[package.metadata] -requires-dist = [{ name = "torch" }] - -[[package]] -name = "marlin-kernels" -version = "0.3.7" -source = { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp39-cp39-linux_x86_64.whl" } -resolution-markers = [ - "python_full_version < '3.10'", -] -dependencies = [ - { name = "torch", marker = "python_full_version < '3.10'" }, -] -wheels = [ - { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp39-cp39-linux_x86_64.whl", hash = "sha256:bf7003753c364c504b3998fffdfcf619a42ab04f908903dbad8d54347b6b142b" }, -] - -[package.metadata] -requires-dist = [{ name = "torch" }] - [[package]] name = "mdurl" version = "0.1.2" @@ -995,26 +913,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b3/38/89ba8ad64ae25be8de66a6d463314cf1eb366222074cfda9ee839c56a4b4/mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8", size = 9979 }, ] -[[package]] -name = "moe-kernels" -version = "0.8.2" -source = { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.8.2/moe_kernels-0.8.2+cu123torch2.5-cp39-abi3-linux_x86_64.whl" } -dependencies = [ - { name = "nvidia-ml-py" }, - { name = "torch" }, - { name = "triton" }, -] -wheels = [ - { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.8.2/moe_kernels-0.8.2+cu123torch2.5-cp39-abi3-linux_x86_64.whl", hash = "sha256:1ed5b26f52339d25ea2513e99e8b6239cf1921af3eac54e03a46bb8f8efb380b" }, -] - -[package.metadata] -requires-dist = [ - { name = "nvidia-ml-py" }, - { name = "torch" }, - { name = "triton" }, -] - [[package]] name = "mpmath" version = "1.3.0" @@ -1308,6 +1206,7 @@ name = "nvidia-cublas-cu12" version = "12.4.5.8" source = { registry = "https://pypi.org/simple" } wheels = [ + { url = "https://files.pythonhosted.org/packages/7f/7f/7fbae15a3982dc9595e49ce0f19332423b260045d0a6afe93cdbe2f1f624/nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_aarch64.whl", hash = "sha256:0f8aa1706812e00b9f19dfe0cdb3999b092ccb8ca168c0db5b8ea712456fd9b3", size = 363333771 }, { url = 
"https://files.pythonhosted.org/packages/ae/71/1c91302526c45ab494c23f61c7a84aa568b8c1f9d196efa5993957faf906/nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl", hash = "sha256:2fc8da60df463fdefa81e323eef2e36489e1c94335b5358bcb38360adf75ac9b", size = 363438805 }, ] @@ -1316,6 +1215,7 @@ name = "nvidia-cuda-cupti-cu12" version = "12.4.127" source = { registry = "https://pypi.org/simple" } wheels = [ + { url = "https://files.pythonhosted.org/packages/93/b5/9fb3d00386d3361b03874246190dfec7b206fd74e6e287b26a8fcb359d95/nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:79279b35cf6f91da114182a5ce1864997fd52294a87a16179ce275773799458a", size = 12354556 }, { url = "https://files.pythonhosted.org/packages/67/42/f4f60238e8194a3106d06a058d494b18e006c10bb2b915655bd9f6ea4cb1/nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:9dec60f5ac126f7bb551c055072b69d85392b13311fcc1bcda2202d172df30fb", size = 13813957 }, ] @@ -1324,6 +1224,7 @@ name = "nvidia-cuda-nvrtc-cu12" version = "12.4.127" source = { registry = "https://pypi.org/simple" } wheels = [ + { url = "https://files.pythonhosted.org/packages/77/aa/083b01c427e963ad0b314040565ea396f914349914c298556484f799e61b/nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:0eedf14185e04b76aa05b1fea04133e59f465b6f960c0cbf4e37c3cb6b0ea198", size = 24133372 }, { url = "https://files.pythonhosted.org/packages/2c/14/91ae57cd4db3f9ef7aa99f4019cfa8d54cb4caa7e00975df6467e9725a9f/nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:a178759ebb095827bd30ef56598ec182b85547f1508941a3d560eb7ea1fbf338", size = 24640306 }, ] @@ -1332,6 +1233,7 @@ name = "nvidia-cuda-runtime-cu12" version = "12.4.127" source = { registry = "https://pypi.org/simple" } wheels = [ + { url = "https://files.pythonhosted.org/packages/a1/aa/b656d755f474e2084971e9a297def515938d56b466ab39624012070cb773/nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:961fe0e2e716a2a1d967aab7caee97512f71767f852f67432d572e36cb3a11f3", size = 894177 }, { url = "https://files.pythonhosted.org/packages/ea/27/1795d86fe88ef397885f2e580ac37628ed058a92ed2c39dc8eac3adf0619/nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:64403288fa2136ee8e467cdc9c9427e0434110899d07c779f25b5c068934faa5", size = 883737 }, ] @@ -1354,6 +1256,7 @@ dependencies = [ { name = "nvidia-nvjitlink-cu12" }, ] wheels = [ + { url = "https://files.pythonhosted.org/packages/7a/8a/0e728f749baca3fbeffad762738276e5df60851958be7783af121a7221e7/nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_aarch64.whl", hash = "sha256:5dad8008fc7f92f5ddfa2101430917ce2ffacd86824914c82e28990ad7f00399", size = 211422548 }, { url = "https://files.pythonhosted.org/packages/27/94/3266821f65b92b3138631e9c8e7fe1fb513804ac934485a8d05776e1dd43/nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl", hash = "sha256:f083fc24912aa410be21fa16d157fed2055dab1cc4b6934a0e03cba69eb242b9", size = 211459117 }, ] @@ -1362,6 +1265,7 @@ name = "nvidia-curand-cu12" version = "10.3.5.147" source = { registry = "https://pypi.org/simple" } wheels = [ + { url = "https://files.pythonhosted.org/packages/80/9c/a79180e4d70995fdf030c6946991d0171555c6edf95c265c6b2bf7011112/nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_aarch64.whl", hash = "sha256:1f173f09e3e3c76ab084aba0de819c49e56614feae5c12f69883f4ae9bb5fad9", size = 56314811 }, { url = 
"https://files.pythonhosted.org/packages/8a/6d/44ad094874c6f1b9c654f8ed939590bdc408349f137f9b98a3a23ccec411/nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl", hash = "sha256:a88f583d4e0bb643c49743469964103aa59f7f708d862c3ddb0fc07f851e3b8b", size = 56305206 }, ] @@ -1375,6 +1279,7 @@ dependencies = [ { name = "nvidia-nvjitlink-cu12" }, ] wheels = [ + { url = "https://files.pythonhosted.org/packages/46/6b/a5c33cf16af09166845345275c34ad2190944bcc6026797a39f8e0a282e0/nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_aarch64.whl", hash = "sha256:d338f155f174f90724bbde3758b7ac375a70ce8e706d70b018dd3375545fc84e", size = 127634111 }, { url = "https://files.pythonhosted.org/packages/3a/e1/5b9089a4b2a4790dfdea8b3a006052cfecff58139d5a4e34cb1a51df8d6f/nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl", hash = "sha256:19e33fa442bcfd085b3086c4ebf7e8debc07cfe01e11513cc6d332fd918ac260", size = 127936057 }, ] @@ -1386,18 +1291,10 @@ dependencies = [ { name = "nvidia-nvjitlink-cu12" }, ] wheels = [ + { url = "https://files.pythonhosted.org/packages/96/a9/c0d2f83a53d40a4a41be14cea6a0bf9e668ffcf8b004bd65633f433050c0/nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_aarch64.whl", hash = "sha256:9d32f62896231ebe0480efd8a7f702e143c98cfaa0e8a76df3386c1ba2b54df3", size = 207381987 }, { url = "https://files.pythonhosted.org/packages/db/f7/97a9ea26ed4bbbfc2d470994b8b4f338ef663be97b8f677519ac195e113d/nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl", hash = "sha256:ea4f11a2904e2a8dc4b1833cc1b5181cde564edd0d5cd33e3c168eff2d1863f1", size = 207454763 }, ] -[[package]] -name = "nvidia-ml-py" -version = "12.560.30" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/53/10/5f34de4a71db8b2b7ec4269f4a33287f24c23e2857ea3187c977b7bc3604/nvidia-ml-py-12.560.30.tar.gz", hash = "sha256:f0254dc7400647680a072ee02509bfd46102b60bdfeca321576d4d4817e7fe97", size = 39194 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/b7/f3/a69ce0b1a1e12fbf6b2ad9f4c14c9999fdbdf15f2478d210f0fd501ddc98/nvidia_ml_py-12.560.30-py3-none-any.whl", hash = "sha256:fea371c94d63e38a611c17bbb85fe400e9c8ddb9e8684a9cd0e47786a4bc3c73", size = 40526 }, -] - [[package]] name = "nvidia-nccl-cu12" version = "2.21.5" @@ -1411,6 +1308,7 @@ name = "nvidia-nvjitlink-cu12" version = "12.4.127" source = { registry = "https://pypi.org/simple" } wheels = [ + { url = "https://files.pythonhosted.org/packages/02/45/239d52c05074898a80a900f49b1615d81c07fceadd5ad6c4f86a987c0bc4/nvidia_nvjitlink_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:4abe7fef64914ccfa909bc2ba39739670ecc9e820c83ccc7a6ed414122599b83", size = 20552510 }, { url = "https://files.pythonhosted.org/packages/ff/ff/847841bacfbefc97a00036e0fce5a0f086b640756dc38caea5e1bb002655/nvidia_nvjitlink_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:06b3b9b25bf3f8af351d664978ca26a16d2c5127dbd53c0497e28d1fb9611d57", size = 21066810 }, ] @@ -1419,6 +1317,7 @@ name = "nvidia-nvtx-cu12" version = "12.4.127" source = { registry = "https://pypi.org/simple" } wheels = [ + { url = "https://files.pythonhosted.org/packages/06/39/471f581edbb7804b39e8063d92fc8305bdc7a80ae5c07dbe6ea5c50d14a5/nvidia_nvtx_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:7959ad635db13edf4fc65c06a6e9f9e55fc2f92596db928d169c0bb031e88ef3", size = 100417 }, { url = 
"https://files.pythonhosted.org/packages/87/20/199b8713428322a2f22b722c62b8cc278cc53dffa9705d744484b5035ee9/nvidia_nvtx_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:781e950d9b9f60d8241ccea575b32f5105a5baf4c2351cab5256a24869f12a1a", size = 99144 }, ] @@ -2653,6 +2552,7 @@ dependencies = [ { name = "grpcio" }, { name = "grpcio-reflection" }, { name = "grpcio-status" }, + { name = "hf-kernels" }, { name = "hf-transfer" }, { name = "loguru" }, { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, @@ -2678,9 +2578,6 @@ dependencies = [ accelerate = [ { name = "accelerate" }, ] -attention = [ - { name = "attention-kernels" }, -] bnb = [ { name = "bitsandbytes" }, ] @@ -2695,16 +2592,6 @@ gen = [ { name = "grpcio-tools" }, { name = "mypy-protobuf" }, ] -marlin = [ - { name = "marlin-kernels", version = "0.3.7", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.13'" }, - { name = "marlin-kernels", version = "0.3.7", source = { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp310-cp310-linux_x86_64.whl" }, marker = "python_full_version == '3.10.*'" }, - { name = "marlin-kernels", version = "0.3.7", source = { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp311-cp311-linux_x86_64.whl" }, marker = "python_full_version == '3.11.*'" }, - { name = "marlin-kernels", version = "0.3.7", source = { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp312-cp312-linux_x86_64.whl" }, marker = "python_full_version == '3.12.*'" }, - { name = "marlin-kernels", version = "0.3.7", source = { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp39-cp39-linux_x86_64.whl" }, marker = "python_full_version < '3.10'" }, -] -moe = [ - { name = "moe-kernels" }, -] outlines = [ { name = "outlines" }, ] @@ -2719,7 +2606,6 @@ quantize = [ [package.metadata] requires-dist = [ { name = "accelerate", marker = "extra == 'accelerate'", specifier = ">=1.2.1,<2" }, - { name = "attention-kernels", marker = "extra == 'attention'", url = "https://github.com/danieldk/attention-kernels/releases/download/v0.2.0.post2/attention_kernels-0.2.0.post2+cu123torch2.5-cp39-abi3-linux_x86_64.whl" }, { name = "bitsandbytes", marker = "extra == 'bnb'", specifier = ">=0.45.0" }, { name = "compressed-tensors", marker = "extra == 'compressed-tensors'", specifier = ">=0.9.0" }, { name = "datasets", marker = "extra == 'quantize'", specifier = ">=2.21,<3" }, @@ -2730,14 +2616,9 @@ requires-dist = [ { name = "grpcio-status", specifier = ">=1.67.0" }, { name = "grpcio-tools", marker = "extra == 'dev'", specifier = ">=1.51.1,<2.0" }, { name = "grpcio-tools", marker = "extra == 'gen'", specifier = ">=1.69.0" }, + { name = "hf-kernels", specifier = ">=0.1.5" }, { name = "hf-transfer", specifier = ">=0.1.8" }, { name = "loguru", specifier = ">=0.7.3" }, - { name = "marlin-kernels", marker = "python_full_version == '3.9.*' and extra == 'marlin'", url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp39-cp39-linux_x86_64.whl" }, - { name = "marlin-kernels", marker = "(python_full_version < '3.9' and extra == 'marlin') or (python_full_version >= '3.13' and extra == 'marlin')" }, - { name = "marlin-kernels", marker = 
"python_full_version == '3.10.*' and extra == 'marlin'", url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp310-cp310-linux_x86_64.whl" }, - { name = "marlin-kernels", marker = "python_full_version == '3.11.*' and extra == 'marlin'", url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp311-cp311-linux_x86_64.whl" }, - { name = "marlin-kernels", marker = "python_full_version == '3.12.*' and extra == 'marlin'", url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.7/marlin_kernels-0.3.7+cu123torch2.5-cp312-cp312-linux_x86_64.whl" }, - { name = "moe-kernels", marker = "extra == 'moe'", url = "https://github.com/danieldk/moe-kernels/releases/download/v0.8.2/moe_kernels-0.8.2+cu123torch2.5-cp39-abi3-linux_x86_64.whl" }, { name = "mypy-protobuf", marker = "extra == 'gen'", specifier = ">=3.6.0" }, { name = "numpy", specifier = ">=1.26,<3" }, { name = "opentelemetry-api", specifier = ">=1.27.0" }, @@ -2919,7 +2800,7 @@ name = "triton" version = "3.1.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "filelock" }, + { name = "filelock", marker = "python_full_version < '3.13'" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/98/29/69aa56dc0b2eb2602b553881e34243475ea2afd9699be042316842788ff5/triton-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6b0dd10a925263abbe9fa37dcde67a5e9b2383fc269fdf59f5657cac38c5d1d8", size = 209460013 }, From 8a870b31b92bf3c1a07567435c874d6d37add1c8 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Tue, 11 Feb 2025 17:10:36 +0100 Subject: [PATCH 080/183] Update the flaky mllama test. --- .../test_mllama/test_mllama_load.json | 16 ++++++++-------- .../test_mllama/test_mllama_simpl.json | 8 ++++---- integration-tests/models/test_mllama.py | 13 +++++++------ 3 files changed, 19 insertions(+), 18 deletions(-) diff --git a/integration-tests/models/__snapshots__/test_mllama/test_mllama_load.json b/integration-tests/models/__snapshots__/test_mllama/test_mllama_load.json index 65154e89b..690e275f8 100644 --- a/integration-tests/models/__snapshots__/test_mllama/test_mllama_load.json +++ b/integration-tests/models/__snapshots__/test_mllama/test_mllama_load.json @@ -6,7 +6,7 @@ "index": 0, "logprobs": null, "message": { - "content": "In a small town, a chicken named Cluck", + "content": "A chicken sits on a pile of money, looking", "name": null, "role": "assistant", "tool_calls": null @@ -14,15 +14,15 @@ "usage": null } ], - "created": 1738753835, + "created": 1739290197, "id": "", "model": "meta-llama/Llama-3.2-11B-Vision-Instruct", "object": "chat.completion", "system_fingerprint": "3.1.1-dev0-native", "usage": { "completion_tokens": 10, - "prompt_tokens": 50, - "total_tokens": 60 + "prompt_tokens": 45, + "total_tokens": 55 } }, { @@ -32,7 +32,7 @@ "index": 0, "logprobs": null, "message": { - "content": "In a small town, a chicken named Cluck", + "content": "A chicken sits on a pile of money, looking", "name": null, "role": "assistant", "tool_calls": null @@ -40,15 +40,15 @@ "usage": null } ], - "created": 1738753835, + "created": 1739290197, "id": "", "model": "meta-llama/Llama-3.2-11B-Vision-Instruct", "object": "chat.completion", "system_fingerprint": "3.1.1-dev0-native", "usage": { "completion_tokens": 10, - "prompt_tokens": 50, - "total_tokens": 60 + "prompt_tokens": 45, + "total_tokens": 55 } } ] diff --git 
a/integration-tests/models/__snapshots__/test_mllama/test_mllama_simpl.json b/integration-tests/models/__snapshots__/test_mllama/test_mllama_simpl.json index 14ca3f4eb..c9c3c1b85 100644 --- a/integration-tests/models/__snapshots__/test_mllama/test_mllama_simpl.json +++ b/integration-tests/models/__snapshots__/test_mllama/test_mllama_simpl.json @@ -5,7 +5,7 @@ "index": 0, "logprobs": null, "message": { - "content": "In a small town, a chicken named Cluck", + "content": "A chicken sits on a pile of money, looking", "name": null, "role": "assistant", "tool_calls": null @@ -13,14 +13,14 @@ "usage": null } ], - "created": 1738753833, + "created": 1739290152, "id": "", "model": "meta-llama/Llama-3.2-11B-Vision-Instruct", "object": "chat.completion", "system_fingerprint": "3.1.1-dev0-native", "usage": { "completion_tokens": 10, - "prompt_tokens": 50, - "total_tokens": 60 + "prompt_tokens": 45, + "total_tokens": 55 } } diff --git a/integration-tests/models/test_mllama.py b/integration-tests/models/test_mllama.py index 90b2bff19..f023ad1db 100644 --- a/integration-tests/models/test_mllama.py +++ b/integration-tests/models/test_mllama.py @@ -28,7 +28,7 @@ async def test_mllama_simpl(mllama, response_snapshot): "content": [ { "type": "text", - "text": "Can you tell me a very short story based on the image?", + "text": "Describe the image in 10 words.", }, { "type": "image_url", @@ -43,11 +43,12 @@ async def test_mllama_simpl(mllama, response_snapshot): assert response.usage == { "completion_tokens": 10, - "prompt_tokens": 50, - "total_tokens": 60, + "prompt_tokens": 45, + "total_tokens": 55, } assert ( - response.choices[0].message.content == "In a small town, a chicken named Cluck" + response.choices[0].message.content + == "A chicken sits on a pile of money, looking" ) assert response == response_snapshot @@ -65,7 +66,7 @@ async def test_mllama_load(mllama, generate_load, response_snapshot): "content": [ { "type": "text", - "text": "Can you tell me a very short story based on the image?", + "text": "Describe the image in 10 words.", }, { "type": "image_url", @@ -86,7 +87,7 @@ async def test_mllama_load(mllama, generate_load, response_snapshot): generated_texts = [response.choices[0].message.content for response in responses] # XXX: TODO: Fix this test. - assert generated_texts[0] == "In a small town, a chicken named Cluck" + assert generated_texts[0] == "A chicken sits on a pile of money, looking" assert len(generated_texts) == 2 assert generated_texts, all( [text == generated_texts[0] for text in generated_texts] From b86c3947abff5a5955386a81c1486c574d9d68de Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Tue, 11 Feb 2025 17:13:06 +0100 Subject: [PATCH 081/183] Revert "Update the flaky mllama test." This reverts commit 8a870b31b92bf3c1a07567435c874d6d37add1c8. 
--- .../test_mllama/test_mllama_load.json | 16 ++++++++-------- .../test_mllama/test_mllama_simpl.json | 8 ++++---- integration-tests/models/test_mllama.py | 13 ++++++------- 3 files changed, 18 insertions(+), 19 deletions(-) diff --git a/integration-tests/models/__snapshots__/test_mllama/test_mllama_load.json b/integration-tests/models/__snapshots__/test_mllama/test_mllama_load.json index 690e275f8..65154e89b 100644 --- a/integration-tests/models/__snapshots__/test_mllama/test_mllama_load.json +++ b/integration-tests/models/__snapshots__/test_mllama/test_mllama_load.json @@ -6,7 +6,7 @@ "index": 0, "logprobs": null, "message": { - "content": "A chicken sits on a pile of money, looking", + "content": "In a small town, a chicken named Cluck", "name": null, "role": "assistant", "tool_calls": null @@ -14,15 +14,15 @@ "usage": null } ], - "created": 1739290197, + "created": 1738753835, "id": "", "model": "meta-llama/Llama-3.2-11B-Vision-Instruct", "object": "chat.completion", "system_fingerprint": "3.1.1-dev0-native", "usage": { "completion_tokens": 10, - "prompt_tokens": 45, - "total_tokens": 55 + "prompt_tokens": 50, + "total_tokens": 60 } }, { @@ -32,7 +32,7 @@ "index": 0, "logprobs": null, "message": { - "content": "A chicken sits on a pile of money, looking", + "content": "In a small town, a chicken named Cluck", "name": null, "role": "assistant", "tool_calls": null @@ -40,15 +40,15 @@ "usage": null } ], - "created": 1739290197, + "created": 1738753835, "id": "", "model": "meta-llama/Llama-3.2-11B-Vision-Instruct", "object": "chat.completion", "system_fingerprint": "3.1.1-dev0-native", "usage": { "completion_tokens": 10, - "prompt_tokens": 45, - "total_tokens": 55 + "prompt_tokens": 50, + "total_tokens": 60 } } ] diff --git a/integration-tests/models/__snapshots__/test_mllama/test_mllama_simpl.json b/integration-tests/models/__snapshots__/test_mllama/test_mllama_simpl.json index c9c3c1b85..14ca3f4eb 100644 --- a/integration-tests/models/__snapshots__/test_mllama/test_mllama_simpl.json +++ b/integration-tests/models/__snapshots__/test_mllama/test_mllama_simpl.json @@ -5,7 +5,7 @@ "index": 0, "logprobs": null, "message": { - "content": "A chicken sits on a pile of money, looking", + "content": "In a small town, a chicken named Cluck", "name": null, "role": "assistant", "tool_calls": null @@ -13,14 +13,14 @@ "usage": null } ], - "created": 1739290152, + "created": 1738753833, "id": "", "model": "meta-llama/Llama-3.2-11B-Vision-Instruct", "object": "chat.completion", "system_fingerprint": "3.1.1-dev0-native", "usage": { "completion_tokens": 10, - "prompt_tokens": 45, - "total_tokens": 55 + "prompt_tokens": 50, + "total_tokens": 60 } } diff --git a/integration-tests/models/test_mllama.py b/integration-tests/models/test_mllama.py index f023ad1db..90b2bff19 100644 --- a/integration-tests/models/test_mllama.py +++ b/integration-tests/models/test_mllama.py @@ -28,7 +28,7 @@ async def test_mllama_simpl(mllama, response_snapshot): "content": [ { "type": "text", - "text": "Describe the image in 10 words.", + "text": "Can you tell me a very short story based on the image?", }, { "type": "image_url", @@ -43,12 +43,11 @@ async def test_mllama_simpl(mllama, response_snapshot): assert response.usage == { "completion_tokens": 10, - "prompt_tokens": 45, - "total_tokens": 55, + "prompt_tokens": 50, + "total_tokens": 60, } assert ( - response.choices[0].message.content - == "A chicken sits on a pile of money, looking" + response.choices[0].message.content == "In a small town, a chicken named Cluck" ) assert 
response == response_snapshot @@ -66,7 +65,7 @@ async def test_mllama_load(mllama, generate_load, response_snapshot): "content": [ { "type": "text", - "text": "Describe the image in 10 words.", + "text": "Can you tell me a very short story based on the image?", }, { "type": "image_url", @@ -87,7 +86,7 @@ async def test_mllama_load(mllama, generate_load, response_snapshot): generated_texts = [response.choices[0].message.content for response in responses] # XXX: TODO: Fix this test. - assert generated_texts[0] == "A chicken sits on a pile of money, looking" + assert generated_texts[0] == "In a small town, a chicken named Cluck" assert len(generated_texts) == 2 assert generated_texts, all( [text == generated_texts[0] for text in generated_texts] From 76bcb4948dbfc2244d27ab76a9c759e52be2feec Mon Sep 17 00:00:00 2001 From: "Wang, Yi" Date: Wed, 12 Feb 2025 18:31:34 +0800 Subject: [PATCH 082/183] fix Qwen VL break in intel platform (#3002) * fix Qwen VL break in intel platform Signed-off-by: Wang, Yi A * could use PositionRotaryEmbedding impl so rocm and ipex could all work Signed-off-by: Wang, Yi A --------- Signed-off-by: Wang, Yi A --- .../text_generation_server/layers/rotary.py | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/server/text_generation_server/layers/rotary.py b/server/text_generation_server/layers/rotary.py index f38f68597..a7c7872ec 100644 --- a/server/text_generation_server/layers/rotary.py +++ b/server/text_generation_server/layers/rotary.py @@ -570,22 +570,6 @@ class RotaryPositionEmbeddingMultimodalSections(PositionRotaryEmbedding): .to(inv_freq.device) ) - def forward( - self, - query: torch.Tensor, - key: torch.Tensor, - cos: torch.Tensor, - sin: torch.Tensor, - ): - # rotate half the sequence length - rot = cos.shape[-1] // 2 - q2 = torch.cat([-query[..., rot:], query[..., :rot]], dim=-1) - k2 = torch.cat([-key[..., rot:], key[..., :rot]], dim=-1) - - # apply the rotation - rotary_emb.apply_rotary(query, q2, cos, sin, query, q2, True) - rotary_emb.apply_rotary(key, k2, cos, sin, key, k2, True) - def _update_cos_sin_cache( self, dtype: torch.dtype, device: torch.device, seqlen: int ): @@ -614,7 +598,4 @@ class RotaryPositionEmbeddingMultimodalSections(PositionRotaryEmbedding): cos = self._cos_cached[position_ids].gather(1, self._sections[:slen]) sin = self._sin_cached[position_ids].gather(1, self._sections[:slen]) - - cos = torch.cat([cos, cos], dim=-1) - sin = torch.cat([sin, sin], dim=-1) return cos, sin From 4cccce4b4442284d9566ab44e51028e29e07b422 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Wed, 12 Feb 2025 12:26:52 +0100 Subject: [PATCH 083/183] Update the flaky mllama test. 
(#3015) --- .../test_mllama/test_mllama_load.json | 16 ++++++++-------- .../test_mllama/test_mllama_simpl.json | 8 ++++---- integration-tests/models/test_mllama.py | 13 +++++++------ 3 files changed, 19 insertions(+), 18 deletions(-) diff --git a/integration-tests/models/__snapshots__/test_mllama/test_mllama_load.json b/integration-tests/models/__snapshots__/test_mllama/test_mllama_load.json index 65154e89b..690e275f8 100644 --- a/integration-tests/models/__snapshots__/test_mllama/test_mllama_load.json +++ b/integration-tests/models/__snapshots__/test_mllama/test_mllama_load.json @@ -6,7 +6,7 @@ "index": 0, "logprobs": null, "message": { - "content": "In a small town, a chicken named Cluck", + "content": "A chicken sits on a pile of money, looking", "name": null, "role": "assistant", "tool_calls": null @@ -14,15 +14,15 @@ "usage": null } ], - "created": 1738753835, + "created": 1739290197, "id": "", "model": "meta-llama/Llama-3.2-11B-Vision-Instruct", "object": "chat.completion", "system_fingerprint": "3.1.1-dev0-native", "usage": { "completion_tokens": 10, - "prompt_tokens": 50, - "total_tokens": 60 + "prompt_tokens": 45, + "total_tokens": 55 } }, { @@ -32,7 +32,7 @@ "index": 0, "logprobs": null, "message": { - "content": "In a small town, a chicken named Cluck", + "content": "A chicken sits on a pile of money, looking", "name": null, "role": "assistant", "tool_calls": null @@ -40,15 +40,15 @@ "usage": null } ], - "created": 1738753835, + "created": 1739290197, "id": "", "model": "meta-llama/Llama-3.2-11B-Vision-Instruct", "object": "chat.completion", "system_fingerprint": "3.1.1-dev0-native", "usage": { "completion_tokens": 10, - "prompt_tokens": 50, - "total_tokens": 60 + "prompt_tokens": 45, + "total_tokens": 55 } } ] diff --git a/integration-tests/models/__snapshots__/test_mllama/test_mllama_simpl.json b/integration-tests/models/__snapshots__/test_mllama/test_mllama_simpl.json index 14ca3f4eb..c9c3c1b85 100644 --- a/integration-tests/models/__snapshots__/test_mllama/test_mllama_simpl.json +++ b/integration-tests/models/__snapshots__/test_mllama/test_mllama_simpl.json @@ -5,7 +5,7 @@ "index": 0, "logprobs": null, "message": { - "content": "In a small town, a chicken named Cluck", + "content": "A chicken sits on a pile of money, looking", "name": null, "role": "assistant", "tool_calls": null @@ -13,14 +13,14 @@ "usage": null } ], - "created": 1738753833, + "created": 1739290152, "id": "", "model": "meta-llama/Llama-3.2-11B-Vision-Instruct", "object": "chat.completion", "system_fingerprint": "3.1.1-dev0-native", "usage": { "completion_tokens": 10, - "prompt_tokens": 50, - "total_tokens": 60 + "prompt_tokens": 45, + "total_tokens": 55 } } diff --git a/integration-tests/models/test_mllama.py b/integration-tests/models/test_mllama.py index 90b2bff19..f023ad1db 100644 --- a/integration-tests/models/test_mllama.py +++ b/integration-tests/models/test_mllama.py @@ -28,7 +28,7 @@ async def test_mllama_simpl(mllama, response_snapshot): "content": [ { "type": "text", - "text": "Can you tell me a very short story based on the image?", + "text": "Describe the image in 10 words.", }, { "type": "image_url", @@ -43,11 +43,12 @@ async def test_mllama_simpl(mllama, response_snapshot): assert response.usage == { "completion_tokens": 10, - "prompt_tokens": 50, - "total_tokens": 60, + "prompt_tokens": 45, + "total_tokens": 55, } assert ( - response.choices[0].message.content == "In a small town, a chicken named Cluck" + response.choices[0].message.content + == "A chicken sits on a pile of money, looking" ) 
assert response == response_snapshot @@ -65,7 +66,7 @@ async def test_mllama_load(mllama, generate_load, response_snapshot): "content": [ { "type": "text", - "text": "Can you tell me a very short story based on the image?", + "text": "Describe the image in 10 words.", }, { "type": "image_url", @@ -86,7 +87,7 @@ async def test_mllama_load(mllama, generate_load, response_snapshot): generated_texts = [response.choices[0].message.content for response in responses] # XXX: TODO: Fix this test. - assert generated_texts[0] == "In a small town, a chicken named Cluck" + assert generated_texts[0] == "A chicken sits on a pile of money, looking" assert len(generated_texts) == 2 assert generated_texts, all( [text == generated_texts[0] for text in generated_texts] From 8a211dc7fc278efaf4688c6566b6ba06ae028f39 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Thu, 13 Feb 2025 11:23:17 +0100 Subject: [PATCH 084/183] Preventing single user hugging the server to death by asking (#3016) for way too many tokens. --- router/src/validation.rs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/router/src/validation.rs b/router/src/validation.rs index 7ac05b21d..3b021b672 100644 --- a/router/src/validation.rs +++ b/router/src/validation.rs @@ -147,7 +147,13 @@ impl Validation { // Get total tokens let (max_new_tokens, max_total_new_tokens) = if let Some(max_new_tokens) = max_new_tokens { - (max_new_tokens, max_new_tokens) + // Do not accept humongous max_new_tokens queries. + // We preallocate the default but we prevent a single user + // from taking up all the slots in a handful of queries that consume little + // amount of tokens. (You can have 10 token long query that creates a handful of token + // but the requested amount to be 120k. + let chunk_size = min(max_new_tokens, DEFAULT_GENERATION_LENGTH); + (chunk_size, max_new_tokens) } else { // Use the maximum possible number of tokens as default // However, the system will re-queue the request everytime it completes From d6881c37ab06b989c007321d50e07294a6177111 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Fri, 14 Feb 2025 11:31:59 +0100 Subject: [PATCH 085/183] Putting back the NCCL forced upgrade. (#2999) * Putting back the NCCL forced upgrade. * . * ... * Ignoring conda. * Dropping conda from the buidl system + torch 2.6 * Cache min. * Rolling back torch version. * Reverting the EETQ modification. * Fix flash attention ? * Actually stay on flash v1. * Patching flash v1. * Torch 2.6, fork of rotary, eetq updated. * Put back nccl latest (override torch). * Slightly more reproducible build and not as scary. 
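A quick way to sanity-check the torch / NCCL pairing these bullets describe is to run a small script inside the resulting image. This is only a sketch: it assumes a Linux CUDA build of torch and that the nvidia-nccl-cu12==2.25.1 override from the Dockerfile change further down was applied; it is not part of the patch itself.

from importlib.metadata import PackageNotFoundError, version

import torch

print("torch:", torch.__version__)

try:
    # Wheel version pinned via `uv pip install nvidia-nccl-cu12==2.25.1` in the Dockerfile.
    print("nvidia-nccl-cu12 wheel:", version("nvidia-nccl-cu12"))
except PackageNotFoundError:
    print("nvidia-nccl-cu12 wheel: not installed")

# NCCL version torch actually links against (Linux CUDA builds only).
if torch.cuda.is_available() and hasattr(torch.cuda, "nccl"):
    print("NCCL seen by torch:", torch.cuda.nccl.version())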
--- .github/workflows/build.yaml | 2 +- Dockerfile | 110 +++-- server/Makefile-eetq | 2 +- server/Makefile-flash-att | 4 +- server/Makefile-flashinfer | 4 +- server/uv.lock | 749 ++++++++++++++++++----------------- tgi-entrypoint.sh | 2 +- 7 files changed, 432 insertions(+), 441 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 720a13cb3..7c91d7b0f 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -202,7 +202,7 @@ jobs: target: ${{ env.TARGET }} tags: ${{ steps.meta.outputs.tags || steps.meta-pr.outputs.tags }} labels: ${{ steps.meta.outputs.labels || steps.meta-pr.outputs.labels }} - cache-from: type=s3,region=us-east-1,bucket=ci-docker-buildx-cache,name=text-generation-inference-cache${{ env.LABEL }},mode=max,access_key_id=${{ secrets.S3_CI_DOCKER_BUILDX_CACHE_ACCESS_KEY_ID }},secret_access_key=${{ secrets.S3_CI_DOCKER_BUILDX_CACHE_SECRET_ACCESS_KEY }},mode=max + cache-from: type=s3,region=us-east-1,bucket=ci-docker-buildx-cache,name=text-generation-inference-cache${{ env.LABEL }},mode=max,access_key_id=${{ secrets.S3_CI_DOCKER_BUILDX_CACHE_ACCESS_KEY_ID }},secret_access_key=${{ secrets.S3_CI_DOCKER_BUILDX_CACHE_SECRET_ACCESS_KEY }},mode=min cache-to: type=s3,region=us-east-1,bucket=ci-docker-buildx-cache,name=text-generation-inference-cache${{ env.LABEL }},mode=min,access_key_id=${{ secrets.S3_CI_DOCKER_BUILDX_CACHE_ACCESS_KEY_ID }},secret_access_key=${{ secrets.S3_CI_DOCKER_BUILDX_CACHE_SECRET_ACCESS_KEY }},mode=min - name: Final id: final diff --git a/Dockerfile b/Dockerfile index ead6c8c5e..39c9f66b0 100644 --- a/Dockerfile +++ b/Dockerfile @@ -45,21 +45,16 @@ RUN cargo build --profile release-opt --frozen # Python builder # Adapted from: https://github.com/pytorch/pytorch/blob/master/Dockerfile FROM nvidia/cuda:12.4.1-devel-ubuntu22.04 AS pytorch-install +WORKDIR /usr/src/ # NOTE: When updating PyTorch version, beware to remove `pip install nvidia-nccl-cu12==2.22.3` below in the Dockerfile. Context: https://github.com/huggingface/text-generation-inference/pull/2099 -ARG PYTORCH_VERSION=2.5.1 - +ARG PYTORCH_VERSION=2.6 ARG PYTHON_VERSION=3.11 + # Keep in sync with `server/pyproject.toml -ARG CUDA_VERSION=12.4 -ARG MAMBA_VERSION=24.3.0-0 -ARG CUDA_CHANNEL=nvidia -ARG INSTALL_CHANNEL=pytorch # Automatically set by buildx ARG TARGETPLATFORM -ENV PATH=/opt/conda/bin:$PATH - RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ build-essential \ ca-certificates \ @@ -67,26 +62,12 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-ins curl \ git && \ rm -rf /var/lib/apt/lists/* - -# Install conda -# translating Docker's TARGETPLATFORM into mamba arches -RUN case ${TARGETPLATFORM} in \ - "linux/arm64") MAMBA_ARCH=aarch64 ;; \ - *) MAMBA_ARCH=x86_64 ;; \ - esac && \ - curl -fsSL -v -o ~/mambaforge.sh -O "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh" -RUN chmod +x ~/mambaforge.sh && \ - bash ~/mambaforge.sh -b -p /opt/conda && \ - rm ~/mambaforge.sh - -# Install pytorch -# On arm64 we exit with an error code -RUN case ${TARGETPLATFORM} in \ - "linux/arm64") exit 1 ;; \ - *) /opt/conda/bin/conda update -y conda && \ - /opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -c "${CUDA_CHANNEL}" -y "python=${PYTHON_VERSION}" "pytorch=$PYTORCH_VERSION" "pytorch-cuda=$(echo $CUDA_VERSION | cut -d'.' 
-f 1-2)" ;; \ - esac && \ - /opt/conda/bin/conda clean -ya +COPY --from=ghcr.io/astral-sh/uv:0.5.31 /uv /uvx /bin/ +ENV PATH="$PATH:/root/.local/bin" +RUN uv python install ${PYTHON_VERSION} +RUN uv venv --python ${PYTHON_VERSION} && uv pip install torch==${PYTORCH_VERSION} pip setuptools packaging +ENV VIRTUAL_ENV=/usr/src/.venv/ +ENV PATH="$PATH:/usr/src/.venv/bin/" # CUDA kernels builder image FROM pytorch-install AS kernel-builder @@ -106,7 +87,7 @@ WORKDIR /usr/src COPY server/Makefile-flash-att Makefile # Build specific version of flash attention -RUN make build-flash-attention +RUN . .venv/bin/activate && make build-flash-attention # Build Flash Attention v2 CUDA kernels FROM kernel-builder AS flash-att-v2-builder @@ -116,14 +97,14 @@ WORKDIR /usr/src COPY server/Makefile-flash-att-v2 Makefile # Build specific version of flash attention v2 -RUN make build-flash-attention-v2-cuda +RUN . .venv/bin/activate && make build-flash-attention-v2-cuda # Build Transformers exllama kernels FROM kernel-builder AS exllama-kernels-builder WORKDIR /usr/src COPY server/exllama_kernels/ . -RUN python setup.py build +RUN . .venv/bin/activate && python setup.py build # Build Transformers exllama kernels FROM kernel-builder AS exllamav2-kernels-builder @@ -131,54 +112,50 @@ WORKDIR /usr/src COPY server/Makefile-exllamav2/ Makefile # Build specific version of transformers -RUN make build-exllamav2 +RUN . .venv/bin/activate && make build-exllamav2 # Build Transformers awq kernels FROM kernel-builder AS awq-kernels-builder WORKDIR /usr/src COPY server/Makefile-awq Makefile # Build specific version of transformers -RUN make build-awq +RUN . .venv/bin/activate && make build-awq # Build eetq kernels FROM kernel-builder AS eetq-kernels-builder WORKDIR /usr/src COPY server/Makefile-eetq Makefile # Build specific version of transformers -RUN make build-eetq +RUN . .venv/bin/activate && make build-eetq # Build Lorax Punica kernels FROM kernel-builder AS lorax-punica-builder WORKDIR /usr/src COPY server/Makefile-lorax-punica Makefile # Build specific version of transformers -RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-lorax-punica +RUN . .venv/bin/activate && TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-lorax-punica # Build Transformers CUDA kernels FROM kernel-builder AS custom-kernels-builder WORKDIR /usr/src COPY server/custom_kernels/ . # Build specific version of transformers -RUN python setup.py build +RUN . .venv/bin/activate && python setup.py build # Build mamba kernels FROM kernel-builder AS mamba-builder WORKDIR /usr/src COPY server/Makefile-selective-scan Makefile -RUN make build-all +RUN . .venv/bin/activate && make build-all # Build flashinfer FROM kernel-builder AS flashinfer-builder WORKDIR /usr/src COPY server/Makefile-flashinfer Makefile -RUN make install-flashinfer +RUN . 
.venv/bin/activate && make install-flashinfer # Text Generation Inference base image -FROM nvidia/cuda:12.1.0-base-ubuntu22.04 AS base - -# Conda env -ENV PATH=/opt/conda/bin:$PATH \ - CONDA_PREFIX=/opt/conda +FROM nvidia/cuda:12.4.0-base-ubuntu22.04 AS base # Text Generation Inference base env ENV HF_HOME=/data \ @@ -195,60 +172,62 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-ins git \ && rm -rf /var/lib/apt/lists/* +RUN curl -LsSf https://astral.sh/uv/install.sh | sh +ENV PATH="$PATH:/root/.local/bin" # Install flash-attention dependencies # RUN pip install einops --no-cache-dir -# Copy conda with PyTorch installed -COPY --from=pytorch-install /opt/conda /opt/conda +# Copy env with PyTorch installed +COPY --from=pytorch-install /usr/src/.venv /usr/src/.venv +ENV PYTHON_VERSION=3.11 +RUN uv python install ${PYTHON_VERSION} +ENV VIRTUAL_ENV=/usr/src/.venv/ +ENV PATH="$PATH:/usr/src/.venv/bin/" # Install server COPY proto proto COPY server server COPY server/Makefile server/Makefile -ENV UV_SYSTEM_PYTHON=1 ENV HF_KERNELS_CACHE=/kernels RUN cd server && \ - pip install -U pip uv && \ - uv sync --frozen --extra gen --extra attention --extra bnb --extra accelerate --extra compressed-tensors --extra marlin --extra moe --extra quantize --extra peft --extra outlines --no-install-project && \ - . ./.venv/bin/activate && \ + uv sync --frozen --extra gen --extra attention --extra bnb --extra accelerate --extra compressed-tensors --extra marlin --extra moe --extra quantize --extra peft --extra outlines --no-install-project --active && \ make gen-server-raw && \ hf-kernels download . RUN cd server && \ - uv sync --frozen --extra gen --extra attention --extra bnb --extra accelerate --extra compressed-tensors --extra marlin --extra moe --extra quantize --extra peft --extra outlines && \ - . 
./.venv/bin/activate && \ + uv sync --frozen --extra gen --extra attention --extra bnb --extra accelerate --extra compressed-tensors --extra marlin --extra moe --extra quantize --extra peft --extra outlines --active --python=${PYTHON_VERSION} && \ + uv pip install nvidia-nccl-cu12==2.25.1 && \ pwd && \ text-generation-server --help # Copy build artifacts from flash attention builder -COPY --from=flash-att-builder /usr/src/flash-attention/build/lib.linux-x86_64-cpython-311 /usr/src/server/.venv/lib/python3.11/site-packages -COPY --from=flash-att-builder /usr/src/flash-attention/csrc/layer_norm/build/lib.linux-x86_64-cpython-311 /usr/src/server/.venv/lib/python3.11/site-packages -COPY --from=flash-att-builder /usr/src/flash-attention/csrc/rotary/build/lib.linux-x86_64-cpython-311 /usr/src/server/.venv/lib/python3.11/site-packages +COPY --from=flash-att-builder /usr/src/flash-attention/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages +COPY --from=flash-att-builder /usr/src/flash-attention/csrc/layer_norm/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages +COPY --from=flash-att-builder /usr/src/flash-attention/csrc/rotary/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages # Copy build artifacts from flash attention v2 builder -COPY --from=flash-att-v2-builder /opt/conda/lib/python3.11/site-packages/flash_attn_2_cuda.cpython-311-x86_64-linux-gnu.so /usr/src/server/.venv/lib/python3.11/site-packages +COPY --from=flash-att-v2-builder /usr/src/.venv/lib/python3.11/site-packages/flash_attn_2_cuda.cpython-311-x86_64-linux-gnu.so /usr/src/.venv/lib/python3.11/site-packages # Copy build artifacts from custom kernels builder -COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311 /usr/src/server/.venv/lib/python3.11/site-packages +COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages # Copy build artifacts from exllama kernels builder -COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311 /usr/src/server/.venv/lib/python3.11/site-packages +COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages # Copy build artifacts from exllamav2 kernels builder -COPY --from=exllamav2-kernels-builder /usr/src/exllamav2/build/lib.linux-x86_64-cpython-311 /usr/src/server/.venv/lib/python3.11/site-packages +COPY --from=exllamav2-kernels-builder /usr/src/exllamav2/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages # Copy build artifacts from awq kernels builder -COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86_64-cpython-311 /usr/src/server/.venv/lib/python3.11/site-packages +COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages # Copy build artifacts from eetq kernels builder -COPY --from=eetq-kernels-builder /usr/src/eetq/build/lib.linux-x86_64-cpython-311 /usr/src/server/.venv/lib/python3.11/site-packages +COPY --from=eetq-kernels-builder /usr/src/eetq/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages # Copy build artifacts from lorax punica kernels builder -COPY --from=lorax-punica-builder /usr/src/lorax-punica/server/punica_kernels/build/lib.linux-x86_64-cpython-311 /usr/src/server/.venv/lib/python3.11/site-packages +COPY --from=lorax-punica-builder 
/usr/src/lorax-punica/server/punica_kernels/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages # Copy build artifacts from mamba builder -COPY --from=mamba-builder /usr/src/mamba/build/lib.linux-x86_64-cpython-311/ /usr/src/server/.venv/lib/python3.11/site-packages -COPY --from=mamba-builder /usr/src/causal-conv1d/build/lib.linux-x86_64-cpython-311/ /usr/src/server/.venv/lib/python3.11/site-packages -COPY --from=flashinfer-builder /opt/conda/lib/python3.11/site-packages/flashinfer/ /usr/src/server/.venv/lib/python3.11/site-packages/flashinfer/ +COPY --from=mamba-builder /usr/src/mamba/build/lib.linux-x86_64-cpython-311/ /usr/src/.venv/lib/python3.11/site-packages +COPY --from=mamba-builder /usr/src/causal-conv1d/build/lib.linux-x86_64-cpython-311/ /usr/src/.venv/lib/python3.11/site-packages +COPY --from=flashinfer-builder /usr/src/.venv/lib/python3.11/site-packages/flashinfer/ /usr/src/.venv/lib/python3.11/site-packages/flashinfer/ # ENV LD_PRELOAD=/opt/conda/lib/python3.11/site-packages/nvidia/nccl/lib/libnccl.so.2 # Required to find libpython within the rust binaries -ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/conda/lib/" # This is needed because exl2 tries to load flash-attn # And fails with our builds. ENV EXLLAMA_NO_FLASH_ATTN=1 @@ -283,5 +262,6 @@ FROM base COPY ./tgi-entrypoint.sh /tgi-entrypoint.sh RUN chmod +x /tgi-entrypoint.sh +ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/root/.local/share/uv/python/cpython-3.11.11-linux-x86_64-gnu/lib/" ENTRYPOINT ["/tgi-entrypoint.sh"] # CMD ["--json-output"] diff --git a/server/Makefile-eetq b/server/Makefile-eetq index 5b53122b3..1b97f6fd7 100644 --- a/server/Makefile-eetq +++ b/server/Makefile-eetq @@ -1,4 +1,4 @@ -eetq_commit := 81e0b14d64088d58ef6acd2c8f3e788d59324407 +eetq_commit := 465e9726bf7ae30803a2d0dd9e5d4315aef17491 eetq: # Clone eetq diff --git a/server/Makefile-flash-att b/server/Makefile-flash-att index 29e75bc48..f1efc3789 100644 --- a/server/Makefile-flash-att +++ b/server/Makefile-flash-att @@ -1,9 +1,9 @@ -flash_att_commit := 3a9bfd076f98746c73362328958dbc68d145fbec +flash_att_commit := ceee0de88c037ee6eda5e75c813a8648e4bcb1c9 build-flash-attention: if [ ! -d 'flash-attention' ]; then \ pip install -U packaging ninja --no-cache-dir && \ - git clone https://github.com/HazyResearch/flash-attention.git; \ + git clone https://github.com/Narsil/flash-attention.git; \ fi cd flash-attention && git fetch && git checkout $(flash_att_commit) && \ MAX_JOBS=8 python setup.py build && cd csrc/layer_norm && python setup.py build && cd ../rotary && python setup.py build diff --git a/server/Makefile-flashinfer b/server/Makefile-flashinfer index 093b895bf..d095d841d 100644 --- a/server/Makefile-flashinfer +++ b/server/Makefile-flashinfer @@ -1,6 +1,6 @@ install-flashinfer: # We need fsspec as an additional dependency, but # `pip install flashinfer` cannot resolve it. 
- pip install fsspec sympy==1.13.1 numpy - pip install -U setuptools + uv pip install fsspec sympy==1.13.1 numpy + uv pip install -U setuptools TORCH_CUDA_ARCH_LIST="8.0;8.6;8.9;9.0+PTX" FLASHINFER_ENABLE_AOT=1 pip install git+https://github.com/flashinfer-ai/flashinfer.git@v0.2.0.post1#egg=flashinfer --no-build-isolation diff --git a/server/uv.lock b/server/uv.lock index 284def0bb..bbb2c9d81 100644 --- a/server/uv.lock +++ b/server/uv.lock @@ -1,8 +1,7 @@ version = 1 requires-python = ">=3.9" resolution-markers = [ - "python_full_version >= '3.13'", - "python_full_version == '3.12.*'", + "python_full_version >= '3.12'", "python_full_version == '3.11.*'", "python_full_version == '3.10.*'", "python_full_version < '3.10'", @@ -10,35 +9,35 @@ resolution-markers = [ [[package]] name = "accelerate" -version = "1.2.1" +version = "1.3.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "huggingface-hub" }, { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "numpy", version = "2.2.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "numpy", version = "2.2.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, { name = "packaging" }, { name = "psutil" }, { name = "pyyaml" }, { name = "safetensors" }, { name = "torch" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/42/09/7947691b7d44bfc739da4a44cc47d6a6d75e6fe9adf047c5234d7cb6be64/accelerate-1.2.1.tar.gz", hash = "sha256:03e161fc69d495daf2b9b5c8d5b43d06e2145520c04727b5bda56d49f1a43ab5", size = 341652 } +sdist = { url = "https://files.pythonhosted.org/packages/85/15/0fab0260ab4069e5224e637d2e400538bb27b0dfc36f17daf68db9770d78/accelerate-1.3.0.tar.gz", hash = "sha256:518631c0adb80bd3d42fb29e7e2dc2256bcd7c786b0ba9119bbaa08611b36d9c", size = 342758 } wheels = [ - { url = "https://files.pythonhosted.org/packages/c2/60/a585c806d6c0ec5f8149d44eb202714792802f484e6e2b1bf96b23bd2b00/accelerate-1.2.1-py3-none-any.whl", hash = "sha256:be1cbb958cf837e7cdfbde46b812964b1b8ae94c9c7d94d921540beafcee8ddf", size = 336355 }, + { url = "https://files.pythonhosted.org/packages/73/de/64508cb91af013aaba214752309c0967568a4219d50a4ea30e822af3c976/accelerate-1.3.0-py3-none-any.whl", hash = "sha256:5788d9e6a7a9f80fed665cf09681c4dddd9dc056bea656db4140ffc285ce423e", size = 336647 }, ] [[package]] name = "aiohappyeyeballs" -version = "2.4.4" +version = "2.4.6" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/7f/55/e4373e888fdacb15563ef6fa9fa8c8252476ea071e96fb46defac9f18bf2/aiohappyeyeballs-2.4.4.tar.gz", hash = "sha256:5fdd7d87889c63183afc18ce9271f9b0a7d32c2303e394468dd45d514a757745", size = 21977 } +sdist = { url = "https://files.pythonhosted.org/packages/08/07/508f9ebba367fc3370162e53a3cfd12f5652ad79f0e0bfdf9f9847c6f159/aiohappyeyeballs-2.4.6.tar.gz", hash = "sha256:9b05052f9042985d32ecbe4b59a77ae19c006a78f1344d7fdad69d28ded3d0b0", size = 21726 } wheels = [ - { url = "https://files.pythonhosted.org/packages/b9/74/fbb6559de3607b3300b9be3cc64e97548d55678e44623db17820dbd20002/aiohappyeyeballs-2.4.4-py3-none-any.whl", hash = "sha256:a980909d50efcd44795c4afeca523296716d50cd756ddca6af8c65b996e27de8", size = 14756 }, + { url = "https://files.pythonhosted.org/packages/44/4c/03fb05f56551828ec67ceb3665e5dc51638042d204983a03b0a1541475b6/aiohappyeyeballs-2.4.6-py3-none-any.whl", hash = 
"sha256:147ec992cf873d74f5062644332c539fcd42956dc69453fe5204195e560517e1", size = 14543 }, ] [[package]] name = "aiohttp" -version = "3.11.11" +version = "3.11.12" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "aiohappyeyeballs" }, @@ -50,83 +49,88 @@ dependencies = [ { name = "propcache" }, { name = "yarl" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/fe/ed/f26db39d29cd3cb2f5a3374304c713fe5ab5a0e4c8ee25a0c45cc6adf844/aiohttp-3.11.11.tar.gz", hash = "sha256:bb49c7f1e6ebf3821a42d81d494f538107610c3a705987f53068546b0e90303e", size = 7669618 } +sdist = { url = "https://files.pythonhosted.org/packages/37/4b/952d49c73084fb790cb5c6ead50848c8e96b4980ad806cf4d2ad341eaa03/aiohttp-3.11.12.tar.gz", hash = "sha256:7603ca26d75b1b86160ce1bbe2787a0b706e592af5b2504e12caa88a217767b0", size = 7673175 } wheels = [ - { url = "https://files.pythonhosted.org/packages/75/7d/ff2e314b8f9e0b1df833e2d4778eaf23eae6b8cc8f922495d110ddcbf9e1/aiohttp-3.11.11-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:a60804bff28662cbcf340a4d61598891f12eea3a66af48ecfdc975ceec21e3c8", size = 708550 }, - { url = "https://files.pythonhosted.org/packages/09/b8/aeb4975d5bba233d6f246941f5957a5ad4e3def8b0855a72742e391925f2/aiohttp-3.11.11-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:4b4fa1cb5f270fb3eab079536b764ad740bb749ce69a94d4ec30ceee1b5940d5", size = 468430 }, - { url = "https://files.pythonhosted.org/packages/9c/5b/5b620279b3df46e597008b09fa1e10027a39467387c2332657288e25811a/aiohttp-3.11.11-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:731468f555656767cda219ab42e033355fe48c85fbe3ba83a349631541715ba2", size = 455593 }, - { url = "https://files.pythonhosted.org/packages/d8/75/0cdf014b816867d86c0bc26f3d3e3f194198dbf33037890beed629cd4f8f/aiohttp-3.11.11-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cb23d8bb86282b342481cad4370ea0853a39e4a32a0042bb52ca6bdde132df43", size = 1584635 }, - { url = "https://files.pythonhosted.org/packages/df/2f/95b8f4e4dfeb57c1d9ad9fa911ede35a0249d75aa339edd2c2270dc539da/aiohttp-3.11.11-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f047569d655f81cb70ea5be942ee5d4421b6219c3f05d131f64088c73bb0917f", size = 1632363 }, - { url = "https://files.pythonhosted.org/packages/39/cb/70cf69ea7c50f5b0021a84f4c59c3622b2b3b81695f48a2f0e42ef7eba6e/aiohttp-3.11.11-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dd7659baae9ccf94ae5fe8bfaa2c7bc2e94d24611528395ce88d009107e00c6d", size = 1668315 }, - { url = "https://files.pythonhosted.org/packages/2f/cc/3a3fc7a290eabc59839a7e15289cd48f33dd9337d06e301064e1e7fb26c5/aiohttp-3.11.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:af01e42ad87ae24932138f154105e88da13ce7d202a6de93fafdafb2883a00ef", size = 1589546 }, - { url = "https://files.pythonhosted.org/packages/15/b4/0f7b0ed41ac6000e283e7332f0f608d734b675a8509763ca78e93714cfb0/aiohttp-3.11.11-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5854be2f3e5a729800bac57a8d76af464e160f19676ab6aea74bde18ad19d438", size = 1544581 }, - { url = "https://files.pythonhosted.org/packages/58/b9/4d06470fd85c687b6b0e31935ef73dde6e31767c9576d617309a2206556f/aiohttp-3.11.11-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:6526e5fb4e14f4bbf30411216780c9967c20c5a55f2f51d3abd6de68320cc2f3", size = 1529256 }, - { url = 
"https://files.pythonhosted.org/packages/61/a2/6958b1b880fc017fd35f5dfb2c26a9a50c755b75fd9ae001dc2236a4fb79/aiohttp-3.11.11-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:85992ee30a31835fc482468637b3e5bd085fa8fe9392ba0bdcbdc1ef5e9e3c55", size = 1536592 }, - { url = "https://files.pythonhosted.org/packages/0f/dd/b974012a9551fd654f5bb95a6dd3f03d6e6472a17e1a8216dd42e9638d6c/aiohttp-3.11.11-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:88a12ad8ccf325a8a5ed80e6d7c3bdc247d66175afedbe104ee2aaca72960d8e", size = 1607446 }, - { url = "https://files.pythonhosted.org/packages/e0/d3/6c98fd87e638e51f074a3f2061e81fcb92123bcaf1439ac1b4a896446e40/aiohttp-3.11.11-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:0a6d3fbf2232e3a08c41eca81ae4f1dff3d8f1a30bae415ebe0af2d2458b8a33", size = 1628809 }, - { url = "https://files.pythonhosted.org/packages/a8/2e/86e6f85cbca02be042c268c3d93e7f35977a0e127de56e319bdd1569eaa8/aiohttp-3.11.11-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:84a585799c58b795573c7fa9b84c455adf3e1d72f19a2bf498b54a95ae0d194c", size = 1564291 }, - { url = "https://files.pythonhosted.org/packages/0b/8d/1f4ef3503b767717f65e1f5178b0173ab03cba1a19997ebf7b052161189f/aiohttp-3.11.11-cp310-cp310-win32.whl", hash = "sha256:bfde76a8f430cf5c5584553adf9926534352251d379dcb266ad2b93c54a29745", size = 416601 }, - { url = "https://files.pythonhosted.org/packages/ad/86/81cb83691b5ace3d9aa148dc42bacc3450d749fc88c5ec1973573c1c1779/aiohttp-3.11.11-cp310-cp310-win_amd64.whl", hash = "sha256:0fd82b8e9c383af11d2b26f27a478640b6b83d669440c0a71481f7c865a51da9", size = 442007 }, - { url = "https://files.pythonhosted.org/packages/34/ae/e8806a9f054e15f1d18b04db75c23ec38ec954a10c0a68d3bd275d7e8be3/aiohttp-3.11.11-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:ba74ec819177af1ef7f59063c6d35a214a8fde6f987f7661f4f0eecc468a8f76", size = 708624 }, - { url = "https://files.pythonhosted.org/packages/c7/e0/313ef1a333fb4d58d0c55a6acb3cd772f5d7756604b455181049e222c020/aiohttp-3.11.11-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4af57160800b7a815f3fe0eba9b46bf28aafc195555f1824555fa2cfab6c1538", size = 468507 }, - { url = "https://files.pythonhosted.org/packages/a9/60/03455476bf1f467e5b4a32a465c450548b2ce724eec39d69f737191f936a/aiohttp-3.11.11-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ffa336210cf9cd8ed117011085817d00abe4c08f99968deef0013ea283547204", size = 455571 }, - { url = "https://files.pythonhosted.org/packages/be/f9/469588603bd75bf02c8ffb8c8a0d4b217eed446b49d4a767684685aa33fd/aiohttp-3.11.11-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:81b8fe282183e4a3c7a1b72f5ade1094ed1c6345a8f153506d114af5bf8accd9", size = 1685694 }, - { url = "https://files.pythonhosted.org/packages/88/b9/1b7fa43faf6c8616fa94c568dc1309ffee2b6b68b04ac268e5d64b738688/aiohttp-3.11.11-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3af41686ccec6a0f2bdc66686dc0f403c41ac2089f80e2214a0f82d001052c03", size = 1743660 }, - { url = "https://files.pythonhosted.org/packages/2a/8b/0248d19dbb16b67222e75f6aecedd014656225733157e5afaf6a6a07e2e8/aiohttp-3.11.11-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:70d1f9dde0e5dd9e292a6d4d00058737052b01f3532f69c0c65818dac26dc287", size = 1785421 }, - { url = "https://files.pythonhosted.org/packages/c4/11/f478e071815a46ca0a5ae974651ff0c7a35898c55063305a896e58aa1247/aiohttp-3.11.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:249cc6912405917344192b9f9ea5cd5b139d49e0d2f5c7f70bdfaf6b4dbf3a2e", size = 1675145 }, - { url = "https://files.pythonhosted.org/packages/26/5d/284d182fecbb5075ae10153ff7374f57314c93a8681666600e3a9e09c505/aiohttp-3.11.11-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0eb98d90b6690827dcc84c246811feeb4e1eea683c0eac6caed7549be9c84665", size = 1619804 }, - { url = "https://files.pythonhosted.org/packages/1b/78/980064c2ad685c64ce0e8aeeb7ef1e53f43c5b005edcd7d32e60809c4992/aiohttp-3.11.11-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:ec82bf1fda6cecce7f7b915f9196601a1bd1a3079796b76d16ae4cce6d0ef89b", size = 1654007 }, - { url = "https://files.pythonhosted.org/packages/21/8d/9e658d63b1438ad42b96f94da227f2e2c1d5c6001c9e8ffcc0bfb22e9105/aiohttp-3.11.11-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:9fd46ce0845cfe28f108888b3ab17abff84ff695e01e73657eec3f96d72eef34", size = 1650022 }, - { url = "https://files.pythonhosted.org/packages/85/fd/a032bf7f2755c2df4f87f9effa34ccc1ef5cea465377dbaeef93bb56bbd6/aiohttp-3.11.11-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:bd176afcf8f5d2aed50c3647d4925d0db0579d96f75a31e77cbaf67d8a87742d", size = 1732899 }, - { url = "https://files.pythonhosted.org/packages/c5/0c/c2b85fde167dd440c7ba50af2aac20b5a5666392b174df54c00f888c5a75/aiohttp-3.11.11-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:ec2aa89305006fba9ffb98970db6c8221541be7bee4c1d027421d6f6df7d1ce2", size = 1755142 }, - { url = "https://files.pythonhosted.org/packages/bc/78/91ae1a3b3b3bed8b893c5d69c07023e151b1c95d79544ad04cf68f596c2f/aiohttp-3.11.11-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:92cde43018a2e17d48bb09c79e4d4cb0e236de5063ce897a5e40ac7cb4878773", size = 1692736 }, - { url = "https://files.pythonhosted.org/packages/77/89/a7ef9c4b4cdb546fcc650ca7f7395aaffbd267f0e1f648a436bec33c9b95/aiohttp-3.11.11-cp311-cp311-win32.whl", hash = "sha256:aba807f9569455cba566882c8938f1a549f205ee43c27b126e5450dc9f83cc62", size = 416418 }, - { url = "https://files.pythonhosted.org/packages/fc/db/2192489a8a51b52e06627506f8ac8df69ee221de88ab9bdea77aa793aa6a/aiohttp-3.11.11-cp311-cp311-win_amd64.whl", hash = "sha256:ae545f31489548c87b0cced5755cfe5a5308d00407000e72c4fa30b19c3220ac", size = 442509 }, - { url = "https://files.pythonhosted.org/packages/69/cf/4bda538c502f9738d6b95ada11603c05ec260807246e15e869fc3ec5de97/aiohttp-3.11.11-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:e595c591a48bbc295ebf47cb91aebf9bd32f3ff76749ecf282ea7f9f6bb73886", size = 704666 }, - { url = "https://files.pythonhosted.org/packages/46/7b/87fcef2cad2fad420ca77bef981e815df6904047d0a1bd6aeded1b0d1d66/aiohttp-3.11.11-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:3ea1b59dc06396b0b424740a10a0a63974c725b1c64736ff788a3689d36c02d2", size = 464057 }, - { url = "https://files.pythonhosted.org/packages/5a/a6/789e1f17a1b6f4a38939fbc39d29e1d960d5f89f73d0629a939410171bc0/aiohttp-3.11.11-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8811f3f098a78ffa16e0ea36dffd577eb031aea797cbdba81be039a4169e242c", size = 455996 }, - { url = "https://files.pythonhosted.org/packages/b7/dd/485061fbfef33165ce7320db36e530cd7116ee1098e9c3774d15a732b3fd/aiohttp-3.11.11-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bd7227b87a355ce1f4bf83bfae4399b1f5bb42e0259cb9405824bd03d2f4336a", size = 1682367 }, - { url = 
"https://files.pythonhosted.org/packages/e9/d7/9ec5b3ea9ae215c311d88b2093e8da17e67b8856673e4166c994e117ee3e/aiohttp-3.11.11-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d40f9da8cabbf295d3a9dae1295c69975b86d941bc20f0a087f0477fa0a66231", size = 1736989 }, - { url = "https://files.pythonhosted.org/packages/d6/fb/ea94927f7bfe1d86178c9d3e0a8c54f651a0a655214cce930b3c679b8f64/aiohttp-3.11.11-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ffb3dc385f6bb1568aa974fe65da84723210e5d9707e360e9ecb51f59406cd2e", size = 1793265 }, - { url = "https://files.pythonhosted.org/packages/40/7f/6de218084f9b653026bd7063cd8045123a7ba90c25176465f266976d8c82/aiohttp-3.11.11-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a8f5f7515f3552d899c61202d99dcb17d6e3b0de777900405611cd747cecd1b8", size = 1691841 }, - { url = "https://files.pythonhosted.org/packages/77/e2/992f43d87831cbddb6b09c57ab55499332f60ad6fdbf438ff4419c2925fc/aiohttp-3.11.11-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3499c7ffbfd9c6a3d8d6a2b01c26639da7e43d47c7b4f788016226b1e711caa8", size = 1619317 }, - { url = "https://files.pythonhosted.org/packages/96/74/879b23cdd816db4133325a201287c95bef4ce669acde37f8f1b8669e1755/aiohttp-3.11.11-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8e2bf8029dbf0810c7bfbc3e594b51c4cc9101fbffb583a3923aea184724203c", size = 1641416 }, - { url = "https://files.pythonhosted.org/packages/30/98/b123f6b15d87c54e58fd7ae3558ff594f898d7f30a90899718f3215ad328/aiohttp-3.11.11-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:b6212a60e5c482ef90f2d788835387070a88d52cf6241d3916733c9176d39eab", size = 1646514 }, - { url = "https://files.pythonhosted.org/packages/d7/38/257fda3dc99d6978ab943141d5165ec74fd4b4164baa15e9c66fa21da86b/aiohttp-3.11.11-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:d119fafe7b634dbfa25a8c597718e69a930e4847f0b88e172744be24515140da", size = 1702095 }, - { url = "https://files.pythonhosted.org/packages/0c/f4/ddab089053f9fb96654df5505c0a69bde093214b3c3454f6bfdb1845f558/aiohttp-3.11.11-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:6fba278063559acc730abf49845d0e9a9e1ba74f85f0ee6efd5803f08b285853", size = 1734611 }, - { url = "https://files.pythonhosted.org/packages/c3/d6/f30b2bc520c38c8aa4657ed953186e535ae84abe55c08d0f70acd72ff577/aiohttp-3.11.11-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:92fc484e34b733704ad77210c7957679c5c3877bd1e6b6d74b185e9320cc716e", size = 1694576 }, - { url = "https://files.pythonhosted.org/packages/bc/97/b0a88c3f4c6d0020b34045ee6d954058abc870814f6e310c4c9b74254116/aiohttp-3.11.11-cp312-cp312-win32.whl", hash = "sha256:9f5b3c1ed63c8fa937a920b6c1bec78b74ee09593b3f5b979ab2ae5ef60d7600", size = 411363 }, - { url = "https://files.pythonhosted.org/packages/7f/23/cc36d9c398980acaeeb443100f0216f50a7cfe20c67a9fd0a2f1a5a846de/aiohttp-3.11.11-cp312-cp312-win_amd64.whl", hash = "sha256:1e69966ea6ef0c14ee53ef7a3d68b564cc408121ea56c0caa2dc918c1b2f553d", size = 437666 }, - { url = "https://files.pythonhosted.org/packages/49/d1/d8af164f400bad432b63e1ac857d74a09311a8334b0481f2f64b158b50eb/aiohttp-3.11.11-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:541d823548ab69d13d23730a06f97460f4238ad2e5ed966aaf850d7c369782d9", size = 697982 }, - { url = "https://files.pythonhosted.org/packages/92/d1/faad3bf9fa4bfd26b95c69fc2e98937d52b1ff44f7e28131855a98d23a17/aiohttp-3.11.11-cp313-cp313-macosx_10_13_x86_64.whl", hash = 
"sha256:929f3ed33743a49ab127c58c3e0a827de0664bfcda566108989a14068f820194", size = 460662 }, - { url = "https://files.pythonhosted.org/packages/db/61/0d71cc66d63909dabc4590f74eba71f91873a77ea52424401c2498d47536/aiohttp-3.11.11-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:0882c2820fd0132240edbb4a51eb8ceb6eef8181db9ad5291ab3332e0d71df5f", size = 452950 }, - { url = "https://files.pythonhosted.org/packages/07/db/6d04bc7fd92784900704e16b745484ef45b77bd04e25f58f6febaadf7983/aiohttp-3.11.11-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b63de12e44935d5aca7ed7ed98a255a11e5cb47f83a9fded7a5e41c40277d104", size = 1665178 }, - { url = "https://files.pythonhosted.org/packages/54/5c/e95ade9ae29f375411884d9fd98e50535bf9fe316c9feb0f30cd2ac8f508/aiohttp-3.11.11-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:aa54f8ef31d23c506910c21163f22b124facb573bff73930735cf9fe38bf7dff", size = 1717939 }, - { url = "https://files.pythonhosted.org/packages/6f/1c/1e7d5c5daea9e409ed70f7986001b8c9e3a49a50b28404498d30860edab6/aiohttp-3.11.11-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a344d5dc18074e3872777b62f5f7d584ae4344cd6006c17ba12103759d407af3", size = 1775125 }, - { url = "https://files.pythonhosted.org/packages/5d/66/890987e44f7d2f33a130e37e01a164168e6aff06fce15217b6eaf14df4f6/aiohttp-3.11.11-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b7fb429ab1aafa1f48578eb315ca45bd46e9c37de11fe45c7f5f4138091e2f1", size = 1677176 }, - { url = "https://files.pythonhosted.org/packages/8f/dc/e2ba57d7a52df6cdf1072fd5fa9c6301a68e1cd67415f189805d3eeb031d/aiohttp-3.11.11-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c341c7d868750e31961d6d8e60ff040fb9d3d3a46d77fd85e1ab8e76c3e9a5c4", size = 1603192 }, - { url = "https://files.pythonhosted.org/packages/6c/9e/8d08a57de79ca3a358da449405555e668f2c8871a7777ecd2f0e3912c272/aiohttp-3.11.11-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:ed9ee95614a71e87f1a70bc81603f6c6760128b140bc4030abe6abaa988f1c3d", size = 1618296 }, - { url = "https://files.pythonhosted.org/packages/56/51/89822e3ec72db352c32e7fc1c690370e24e231837d9abd056490f3a49886/aiohttp-3.11.11-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:de8d38f1c2810fa2a4f1d995a2e9c70bb8737b18da04ac2afbf3971f65781d87", size = 1616524 }, - { url = "https://files.pythonhosted.org/packages/2c/fa/e2e6d9398f462ffaa095e84717c1732916a57f1814502929ed67dd7568ef/aiohttp-3.11.11-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:a9b7371665d4f00deb8f32208c7c5e652059b0fda41cf6dbcac6114a041f1cc2", size = 1685471 }, - { url = "https://files.pythonhosted.org/packages/ae/5f/6bb976e619ca28a052e2c0ca7b0251ccd893f93d7c24a96abea38e332bf6/aiohttp-3.11.11-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:620598717fce1b3bd14dd09947ea53e1ad510317c85dda2c9c65b622edc96b12", size = 1715312 }, - { url = "https://files.pythonhosted.org/packages/79/c1/756a7e65aa087c7fac724d6c4c038f2faaa2a42fe56dbc1dd62a33ca7213/aiohttp-3.11.11-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:bf8d9bfee991d8acc72d060d53860f356e07a50f0e0d09a8dfedea1c554dd0d5", size = 1672783 }, - { url = "https://files.pythonhosted.org/packages/73/ba/a6190ebb02176c7f75e6308da31f5d49f6477b651a3dcfaaaca865a298e2/aiohttp-3.11.11-cp313-cp313-win32.whl", hash = "sha256:9d73ee3725b7a737ad86c2eac5c57a4a97793d9f442599bea5ec67ac9f4bdc3d", size = 410229 }, - { url = 
"https://files.pythonhosted.org/packages/b8/62/c9fa5bafe03186a0e4699150a7fed9b1e73240996d0d2f0e5f70f3fdf471/aiohttp-3.11.11-cp313-cp313-win_amd64.whl", hash = "sha256:c7a06301c2fb096bdb0bd25fe2011531c1453b9f2c163c8031600ec73af1cc99", size = 436081 }, - { url = "https://files.pythonhosted.org/packages/9f/37/326ee86b7640be6ca4493c8121cb9a4386e07cf1e5757ce6b7fa854d0a5f/aiohttp-3.11.11-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:3e23419d832d969f659c208557de4a123e30a10d26e1e14b73431d3c13444c2e", size = 709424 }, - { url = "https://files.pythonhosted.org/packages/9c/c5/a88ec2160b06c22e57e483a1f78f99f005fcd4e7d6855a2d3d6510881b65/aiohttp-3.11.11-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:21fef42317cf02e05d3b09c028712e1d73a9606f02467fd803f7c1f39cc59add", size = 468907 }, - { url = "https://files.pythonhosted.org/packages/b2/f0/02f03f818e91996161cce200241b631bb2b4a87e61acddb5b974e254a288/aiohttp-3.11.11-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:1f21bb8d0235fc10c09ce1d11ffbd40fc50d3f08a89e4cf3a0c503dc2562247a", size = 455981 }, - { url = "https://files.pythonhosted.org/packages/0e/17/c8be12436ec19915f67b1ab8240d4105aba0f7e0894a1f0d8939c3e79c70/aiohttp-3.11.11-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1642eceeaa5ab6c9b6dfeaaa626ae314d808188ab23ae196a34c9d97efb68350", size = 1587395 }, - { url = "https://files.pythonhosted.org/packages/43/c0/f4db1ac30ebe855b2fefd6fa98767862d88ac54ab08a6ad07d619146270c/aiohttp-3.11.11-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2170816e34e10f2fd120f603e951630f8a112e1be3b60963a1f159f5699059a6", size = 1636243 }, - { url = "https://files.pythonhosted.org/packages/ea/a7/9acf20e9a09b0d38b5b55691410500d051a9f4194692cac22b0d0fc92ad9/aiohttp-3.11.11-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8be8508d110d93061197fd2d6a74f7401f73b6d12f8822bbcd6d74f2b55d71b1", size = 1672323 }, - { url = "https://files.pythonhosted.org/packages/f7/5b/a27e8fe1a3b0e245ca80863eefd83fc00136752d27d2cf1afa0130a76f34/aiohttp-3.11.11-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4eed954b161e6b9b65f6be446ed448ed3921763cc432053ceb606f89d793927e", size = 1589521 }, - { url = "https://files.pythonhosted.org/packages/25/50/8bccd08004e15906791b46f0a908a8e7f5e0c5882b17da96d1933bd34ac0/aiohttp-3.11.11-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d6c9af134da4bc9b3bd3e6a70072509f295d10ee60c697826225b60b9959acdd", size = 1544059 }, - { url = "https://files.pythonhosted.org/packages/84/5a/42250b37b06ee0cb7a03dd1630243b1d739ca3edb5abd8b18f479a539900/aiohttp-3.11.11-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:44167fc6a763d534a6908bdb2592269b4bf30a03239bcb1654781adf5e49caf1", size = 1530217 }, - { url = "https://files.pythonhosted.org/packages/18/08/eb334da86cd2cdbd0621bb7039255b19ca74ce8b05e8fb61850e2589938c/aiohttp-3.11.11-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:479b8c6ebd12aedfe64563b85920525d05d394b85f166b7873c8bde6da612f9c", size = 1536081 }, - { url = "https://files.pythonhosted.org/packages/1a/a9/9d59958084d5bad7e77a44841013bd59768cda94f9f744769461b66038fc/aiohttp-3.11.11-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:10b4ff0ad793d98605958089fabfa350e8e62bd5d40aa65cdc69d6785859f94e", size = 1606918 }, - { url = "https://files.pythonhosted.org/packages/4f/e7/27feb1cff17dcddb7a5b703199106196718d622a3aa70f80a386d15361d7/aiohttp-3.11.11-cp39-cp39-musllinux_1_2_s390x.whl", hash = 
"sha256:b540bd67cfb54e6f0865ceccd9979687210d7ed1a1cc8c01f8e67e2f1e883d28", size = 1629101 }, - { url = "https://files.pythonhosted.org/packages/e8/29/49debcd858b997c655fca274c5247fcfe29bf31a4ddb1ce3f088539b14e4/aiohttp-3.11.11-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:1dac54e8ce2ed83b1f6b1a54005c87dfed139cf3f777fdc8afc76e7841101226", size = 1567338 }, - { url = "https://files.pythonhosted.org/packages/3b/34/33af1e97aba1862e1812e2e2b96a1e050c5a6e9cecd5a5370591122fb07b/aiohttp-3.11.11-cp39-cp39-win32.whl", hash = "sha256:568c1236b2fde93b7720f95a890741854c1200fba4a3471ff48b2934d2d93fd3", size = 416914 }, - { url = "https://files.pythonhosted.org/packages/2d/47/28b3fbd97026963af2774423c64341e0d4ec180ea3b79a2762a3c18d5d94/aiohttp-3.11.11-cp39-cp39-win_amd64.whl", hash = "sha256:943a8b052e54dfd6439fd7989f67fc6a7f2138d0a2cf0a7de5f18aa4fe7eb3b1", size = 442225 }, + { url = "https://files.pythonhosted.org/packages/65/42/3880e133590820aa7bc6d068eb7d8e0ad9fdce9b4663f92b821d3f6b5601/aiohttp-3.11.12-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:aa8a8caca81c0a3e765f19c6953416c58e2f4cc1b84829af01dd1c771bb2f91f", size = 708721 }, + { url = "https://files.pythonhosted.org/packages/d8/8c/04869803bed108b25afad75f94c651b287851843caacbec6677d8f2d572b/aiohttp-3.11.12-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:84ede78acde96ca57f6cf8ccb8a13fbaf569f6011b9a52f870c662d4dc8cd854", size = 468596 }, + { url = "https://files.pythonhosted.org/packages/4f/f4/9074011f0d1335b161c953fb32545b6667cf24465e1932b9767874995c7e/aiohttp-3.11.12-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:584096938a001378484aa4ee54e05dc79c7b9dd933e271c744a97b3b6f644957", size = 455758 }, + { url = "https://files.pythonhosted.org/packages/fd/68/06298c57ef8f534065930b805e6dbd83613f0534447922782fb9920fce28/aiohttp-3.11.12-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:392432a2dde22b86f70dd4a0e9671a349446c93965f261dbaecfaf28813e5c42", size = 1584797 }, + { url = "https://files.pythonhosted.org/packages/bd/1e/cee6b51fcb3b1c4185a7dc62b3113bc136fae07f39386c88c90b7f79f199/aiohttp-3.11.12-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:88d385b8e7f3a870146bf5ea31786ef7463e99eb59e31db56e2315535d811f55", size = 1632535 }, + { url = "https://files.pythonhosted.org/packages/71/1f/42424462b7a09da362e1711090db9f8d68a37a33f0aab51307335517c599/aiohttp-3.11.12-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b10a47e5390c4b30a0d58ee12581003be52eedd506862ab7f97da7a66805befb", size = 1668484 }, + { url = "https://files.pythonhosted.org/packages/f6/79/0e25542bbe3c2bfd7a12c7a49c7bce73b09a836f65079e4b77bc2bafc89e/aiohttp-3.11.12-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b5263dcede17b6b0c41ef0c3ccce847d82a7da98709e75cf7efde3e9e3b5cae", size = 1589708 }, + { url = "https://files.pythonhosted.org/packages/d1/13/93ae26b75e23f7d3a613872e472fae836ca100dc5bde5936ebc93ada8890/aiohttp-3.11.12-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:50c5c7b8aa5443304c55c262c5693b108c35a3b61ef961f1e782dd52a2f559c7", size = 1544752 }, + { url = "https://files.pythonhosted.org/packages/cf/5e/48847fad1b014ef92ef18ea1339a3b58eb81d3bc717b94c3627f5d2a42c5/aiohttp-3.11.12-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:d1c031a7572f62f66f1257db37ddab4cb98bfaf9b9434a3b4840bf3560f5e788", size = 1529417 }, + { url = 
"https://files.pythonhosted.org/packages/ae/56/fbd4ea019303f4877f0e0b8c9de92e9db24338e7545570d3f275f3c74c53/aiohttp-3.11.12-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:7e44eba534381dd2687be50cbd5f2daded21575242ecfdaf86bbeecbc38dae8e", size = 1557808 }, + { url = "https://files.pythonhosted.org/packages/f1/43/112189cf6b3c482ecdd6819b420eaa0c2033426f28d741bb7f19db5dd2bb/aiohttp-3.11.12-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:145a73850926018ec1681e734cedcf2716d6a8697d90da11284043b745c286d5", size = 1536765 }, + { url = "https://files.pythonhosted.org/packages/30/12/59986547de8306e06c7b30e547ccda02d29636e152366caba2dd8627bfe1/aiohttp-3.11.12-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:2c311e2f63e42c1bf86361d11e2c4a59f25d9e7aabdbdf53dc38b885c5435cdb", size = 1607621 }, + { url = "https://files.pythonhosted.org/packages/aa/9b/af3b323b20df3318ed20d701d8242e523d59c842ca93f23134b05c9d5054/aiohttp-3.11.12-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:ea756b5a7bac046d202a9a3889b9a92219f885481d78cd318db85b15cc0b7bcf", size = 1628977 }, + { url = "https://files.pythonhosted.org/packages/36/62/adf5a331a7bda475cc326dde393fa2bc5849060b1b37ac3d1bee1953f2cd/aiohttp-3.11.12-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:526c900397f3bbc2db9cb360ce9c35134c908961cdd0ac25b1ae6ffcaa2507ff", size = 1564455 }, + { url = "https://files.pythonhosted.org/packages/90/c4/4a24291f22f111a854dfdb54dc94d4e0a5229ccbb7bc7f0bed972aa50410/aiohttp-3.11.12-cp310-cp310-win32.whl", hash = "sha256:b8d3bb96c147b39c02d3db086899679f31958c5d81c494ef0fc9ef5bb1359b3d", size = 416768 }, + { url = "https://files.pythonhosted.org/packages/51/69/5221c8006acb7bb10d9e8e2238fb216571bddc2e00a8d95bcfbe2f579c57/aiohttp-3.11.12-cp310-cp310-win_amd64.whl", hash = "sha256:7fe3d65279bfbee8de0fb4f8c17fc4e893eed2dba21b2f680e930cc2b09075c5", size = 442170 }, + { url = "https://files.pythonhosted.org/packages/9c/38/35311e70196b6a63cfa033a7f741f800aa8a93f57442991cbe51da2394e7/aiohttp-3.11.12-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:87a2e00bf17da098d90d4145375f1d985a81605267e7f9377ff94e55c5d769eb", size = 708797 }, + { url = "https://files.pythonhosted.org/packages/44/3e/46c656e68cbfc4f3fc7cb5d2ba4da6e91607fe83428208028156688f6201/aiohttp-3.11.12-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b34508f1cd928ce915ed09682d11307ba4b37d0708d1f28e5774c07a7674cac9", size = 468669 }, + { url = "https://files.pythonhosted.org/packages/a0/d6/2088fb4fd1e3ac2bfb24bc172223babaa7cdbb2784d33c75ec09e66f62f8/aiohttp-3.11.12-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:936d8a4f0f7081327014742cd51d320296b56aa6d324461a13724ab05f4b2933", size = 455739 }, + { url = "https://files.pythonhosted.org/packages/e7/dc/c443a6954a56f4a58b5efbfdf23cc6f3f0235e3424faf5a0c56264d5c7bb/aiohttp-3.11.12-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2de1378f72def7dfb5dbd73d86c19eda0ea7b0a6873910cc37d57e80f10d64e1", size = 1685858 }, + { url = "https://files.pythonhosted.org/packages/25/67/2d5b3aaade1d5d01c3b109aa76e3aa9630531252cda10aa02fb99b0b11a1/aiohttp-3.11.12-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b9d45dbb3aaec05cf01525ee1a7ac72de46a8c425cb75c003acd29f76b1ffe94", size = 1743829 }, + { url = "https://files.pythonhosted.org/packages/90/9b/9728fe9a3e1b8521198455d027b0b4035522be18f504b24c5d38d59e7278/aiohttp-3.11.12-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:930ffa1925393381e1e0a9b82137fa7b34c92a019b521cf9f41263976666a0d6", 
size = 1785587 }, + { url = "https://files.pythonhosted.org/packages/ce/cf/28fbb43d4ebc1b4458374a3c7b6db3b556a90e358e9bbcfe6d9339c1e2b6/aiohttp-3.11.12-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8340def6737118f5429a5df4e88f440746b791f8f1c4ce4ad8a595f42c980bd5", size = 1675319 }, + { url = "https://files.pythonhosted.org/packages/e5/d2/006c459c11218cabaa7bca401f965c9cc828efbdea7e1615d4644eaf23f7/aiohttp-3.11.12-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4016e383f91f2814e48ed61e6bda7d24c4d7f2402c75dd28f7e1027ae44ea204", size = 1619982 }, + { url = "https://files.pythonhosted.org/packages/9d/83/ca425891ebd37bee5d837110f7fddc4d808a7c6c126a7d1b5c3ad72fc6ba/aiohttp-3.11.12-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:3c0600bcc1adfaaac321422d615939ef300df81e165f6522ad096b73439c0f58", size = 1654176 }, + { url = "https://files.pythonhosted.org/packages/25/df/047b1ce88514a1b4915d252513640184b63624e7914e41d846668b8edbda/aiohttp-3.11.12-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:0450ada317a65383b7cce9576096150fdb97396dcfe559109b403c7242faffef", size = 1660198 }, + { url = "https://files.pythonhosted.org/packages/d3/cc/6ecb8e343f0902528620b9dbd567028a936d5489bebd7dbb0dd0914f4fdb/aiohttp-3.11.12-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:850ff6155371fd802a280f8d369d4e15d69434651b844bde566ce97ee2277420", size = 1650186 }, + { url = "https://files.pythonhosted.org/packages/f8/f8/453df6dd69256ca8c06c53fc8803c9056e2b0b16509b070f9a3b4bdefd6c/aiohttp-3.11.12-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:8fd12d0f989c6099e7b0f30dc6e0d1e05499f3337461f0b2b0dadea6c64b89df", size = 1733063 }, + { url = "https://files.pythonhosted.org/packages/55/f8/540160787ff3000391de0e5d0d1d33be4c7972f933c21991e2ea105b2d5e/aiohttp-3.11.12-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:76719dd521c20a58a6c256d058547b3a9595d1d885b830013366e27011ffe804", size = 1755306 }, + { url = "https://files.pythonhosted.org/packages/30/7d/49f3bfdfefd741576157f8f91caa9ff61a6f3d620ca6339268327518221b/aiohttp-3.11.12-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:97fe431f2ed646a3b56142fc81d238abcbaff08548d6912acb0b19a0cadc146b", size = 1692909 }, + { url = "https://files.pythonhosted.org/packages/40/9c/8ce00afd6f6112ce9a2309dc490fea376ae824708b94b7b5ea9cba979d1d/aiohttp-3.11.12-cp311-cp311-win32.whl", hash = "sha256:e10c440d142fa8b32cfdb194caf60ceeceb3e49807072e0dc3a8887ea80e8c16", size = 416584 }, + { url = "https://files.pythonhosted.org/packages/35/97/4d3c5f562f15830de472eb10a7a222655d750839943e0e6d915ef7e26114/aiohttp-3.11.12-cp311-cp311-win_amd64.whl", hash = "sha256:246067ba0cf5560cf42e775069c5d80a8989d14a7ded21af529a4e10e3e0f0e6", size = 442674 }, + { url = "https://files.pythonhosted.org/packages/4d/d0/94346961acb476569fca9a644cc6f9a02f97ef75961a6b8d2b35279b8d1f/aiohttp-3.11.12-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:e392804a38353900c3fd8b7cacbea5132888f7129f8e241915e90b85f00e3250", size = 704837 }, + { url = "https://files.pythonhosted.org/packages/a9/af/05c503f1cc8f97621f199ef4b8db65fb88b8bc74a26ab2adb74789507ad3/aiohttp-3.11.12-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:8fa1510b96c08aaad49303ab11f8803787c99222288f310a62f493faf883ede1", size = 464218 }, + { url = "https://files.pythonhosted.org/packages/f2/48/b9949eb645b9bd699153a2ec48751b985e352ab3fed9d98c8115de305508/aiohttp-3.11.12-cp312-cp312-macosx_11_0_arm64.whl", hash = 
"sha256:dc065a4285307607df3f3686363e7f8bdd0d8ab35f12226362a847731516e42c", size = 456166 }, + { url = "https://files.pythonhosted.org/packages/14/fb/980981807baecb6f54bdd38beb1bd271d9a3a786e19a978871584d026dcf/aiohttp-3.11.12-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cddb31f8474695cd61fc9455c644fc1606c164b93bff2490390d90464b4655df", size = 1682528 }, + { url = "https://files.pythonhosted.org/packages/90/cb/77b1445e0a716914e6197b0698b7a3640590da6c692437920c586764d05b/aiohttp-3.11.12-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9dec0000d2d8621d8015c293e24589d46fa218637d820894cb7356c77eca3259", size = 1737154 }, + { url = "https://files.pythonhosted.org/packages/ff/24/d6fb1f4cede9ccbe98e4def6f3ed1e1efcb658871bbf29f4863ec646bf38/aiohttp-3.11.12-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e3552fe98e90fdf5918c04769f338a87fa4f00f3b28830ea9b78b1bdc6140e0d", size = 1793435 }, + { url = "https://files.pythonhosted.org/packages/17/e2/9f744cee0861af673dc271a3351f59ebd5415928e20080ab85be25641471/aiohttp-3.11.12-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6dfe7f984f28a8ae94ff3a7953cd9678550dbd2a1f9bda5dd9c5ae627744c78e", size = 1692010 }, + { url = "https://files.pythonhosted.org/packages/90/c4/4a1235c1df544223eb57ba553ce03bc706bdd065e53918767f7fa1ff99e0/aiohttp-3.11.12-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a481a574af914b6e84624412666cbfbe531a05667ca197804ecc19c97b8ab1b0", size = 1619481 }, + { url = "https://files.pythonhosted.org/packages/60/70/cf12d402a94a33abda86dd136eb749b14c8eb9fec1e16adc310e25b20033/aiohttp-3.11.12-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:1987770fb4887560363b0e1a9b75aa303e447433c41284d3af2840a2f226d6e0", size = 1641578 }, + { url = "https://files.pythonhosted.org/packages/1b/25/7211973fda1f5e833fcfd98ccb7f9ce4fbfc0074e3e70c0157a751d00db8/aiohttp-3.11.12-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:a4ac6a0f0f6402854adca4e3259a623f5c82ec3f0c049374133bcb243132baf9", size = 1684463 }, + { url = "https://files.pythonhosted.org/packages/93/60/b5905b4d0693f6018b26afa9f2221fefc0dcbd3773fe2dff1a20fb5727f1/aiohttp-3.11.12-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:c96a43822f1f9f69cc5c3706af33239489a6294be486a0447fb71380070d4d5f", size = 1646691 }, + { url = "https://files.pythonhosted.org/packages/b4/fc/ba1b14d6fdcd38df0b7c04640794b3683e949ea10937c8a58c14d697e93f/aiohttp-3.11.12-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:a5e69046f83c0d3cb8f0d5bd9b8838271b1bc898e01562a04398e160953e8eb9", size = 1702269 }, + { url = "https://files.pythonhosted.org/packages/5e/39/18c13c6f658b2ba9cc1e0c6fb2d02f98fd653ad2addcdf938193d51a9c53/aiohttp-3.11.12-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:68d54234c8d76d8ef74744f9f9fc6324f1508129e23da8883771cdbb5818cbef", size = 1734782 }, + { url = "https://files.pythonhosted.org/packages/9f/d2/ccc190023020e342419b265861877cd8ffb75bec37b7ddd8521dd2c6deb8/aiohttp-3.11.12-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:c9fd9dcf9c91affe71654ef77426f5cf8489305e1c66ed4816f5a21874b094b9", size = 1694740 }, + { url = "https://files.pythonhosted.org/packages/3f/54/186805bcada64ea90ea909311ffedcd74369bfc6e880d39d2473314daa36/aiohttp-3.11.12-cp312-cp312-win32.whl", hash = "sha256:0ed49efcd0dc1611378beadbd97beb5d9ca8fe48579fc04a6ed0844072261b6a", size = 411530 }, + { url = 
"https://files.pythonhosted.org/packages/3d/63/5eca549d34d141bcd9de50d4e59b913f3641559460c739d5e215693cb54a/aiohttp-3.11.12-cp312-cp312-win_amd64.whl", hash = "sha256:54775858c7f2f214476773ce785a19ee81d1294a6bedc5cc17225355aab74802", size = 437860 }, + { url = "https://files.pythonhosted.org/packages/c3/9b/cea185d4b543ae08ee478373e16653722c19fcda10d2d0646f300ce10791/aiohttp-3.11.12-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:413ad794dccb19453e2b97c2375f2ca3cdf34dc50d18cc2693bd5aed7d16f4b9", size = 698148 }, + { url = "https://files.pythonhosted.org/packages/91/5c/80d47fe7749fde584d1404a68ade29bcd7e58db8fa11fa38e8d90d77e447/aiohttp-3.11.12-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:4a93d28ed4b4b39e6f46fd240896c29b686b75e39cc6992692e3922ff6982b4c", size = 460831 }, + { url = "https://files.pythonhosted.org/packages/8e/f9/de568f8a8ca6b061d157c50272620c53168d6e3eeddae78dbb0f7db981eb/aiohttp-3.11.12-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:d589264dbba3b16e8951b6f145d1e6b883094075283dafcab4cdd564a9e353a0", size = 453122 }, + { url = "https://files.pythonhosted.org/packages/8b/fd/b775970a047543bbc1d0f66725ba72acef788028fce215dc959fd15a8200/aiohttp-3.11.12-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e5148ca8955affdfeb864aca158ecae11030e952b25b3ae15d4e2b5ba299bad2", size = 1665336 }, + { url = "https://files.pythonhosted.org/packages/82/9b/aff01d4f9716245a1b2965f02044e4474fadd2bcfe63cf249ca788541886/aiohttp-3.11.12-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:525410e0790aab036492eeea913858989c4cb070ff373ec3bc322d700bdf47c1", size = 1718111 }, + { url = "https://files.pythonhosted.org/packages/e0/a9/166fd2d8b2cc64f08104aa614fad30eee506b563154081bf88ce729bc665/aiohttp-3.11.12-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9bd8695be2c80b665ae3f05cb584093a1e59c35ecb7d794d1edd96e8cc9201d7", size = 1775293 }, + { url = "https://files.pythonhosted.org/packages/13/c5/0d3c89bd9e36288f10dc246f42518ce8e1c333f27636ac78df091c86bb4a/aiohttp-3.11.12-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f0203433121484b32646a5f5ea93ae86f3d9559d7243f07e8c0eab5ff8e3f70e", size = 1677338 }, + { url = "https://files.pythonhosted.org/packages/72/b2/017db2833ef537be284f64ead78725984db8a39276c1a9a07c5c7526e238/aiohttp-3.11.12-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:40cd36749a1035c34ba8d8aaf221b91ca3d111532e5ccb5fa8c3703ab1b967ed", size = 1603365 }, + { url = "https://files.pythonhosted.org/packages/fc/72/b66c96a106ec7e791e29988c222141dd1219d7793ffb01e72245399e08d2/aiohttp-3.11.12-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:a7442662afebbf7b4c6d28cb7aab9e9ce3a5df055fc4116cc7228192ad6cb484", size = 1618464 }, + { url = "https://files.pythonhosted.org/packages/3f/50/e68a40f267b46a603bab569d48d57f23508801614e05b3369898c5b2910a/aiohttp-3.11.12-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:8a2fb742ef378284a50766e985804bd6adb5adb5aa781100b09befdbfa757b65", size = 1657827 }, + { url = "https://files.pythonhosted.org/packages/c5/1d/aafbcdb1773d0ba7c20793ebeedfaba1f3f7462f6fc251f24983ed738aa7/aiohttp-3.11.12-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:2cee3b117a8d13ab98b38d5b6bdcd040cfb4181068d05ce0c474ec9db5f3c5bb", size = 1616700 }, + { url = 
"https://files.pythonhosted.org/packages/b0/5e/6cd9724a2932f36e2a6b742436a36d64784322cfb3406ca773f903bb9a70/aiohttp-3.11.12-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:f6a19bcab7fbd8f8649d6595624856635159a6527861b9cdc3447af288a00c00", size = 1685643 }, + { url = "https://files.pythonhosted.org/packages/8b/38/ea6c91d5c767fd45a18151675a07c710ca018b30aa876a9f35b32fa59761/aiohttp-3.11.12-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:e4cecdb52aaa9994fbed6b81d4568427b6002f0a91c322697a4bfcc2b2363f5a", size = 1715487 }, + { url = "https://files.pythonhosted.org/packages/8e/24/e9edbcb7d1d93c02e055490348df6f955d675e85a028c33babdcaeda0853/aiohttp-3.11.12-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:30f546358dfa0953db92ba620101fefc81574f87b2346556b90b5f3ef16e55ce", size = 1672948 }, + { url = "https://files.pythonhosted.org/packages/25/be/0b1fb737268e003198f25c3a68c2135e76e4754bf399a879b27bd508a003/aiohttp-3.11.12-cp313-cp313-win32.whl", hash = "sha256:ce1bb21fc7d753b5f8a5d5a4bae99566386b15e716ebdb410154c16c91494d7f", size = 410396 }, + { url = "https://files.pythonhosted.org/packages/68/fd/677def96a75057b0a26446b62f8fbb084435b20a7d270c99539c26573bfd/aiohttp-3.11.12-cp313-cp313-win_amd64.whl", hash = "sha256:f7914ab70d2ee8ab91c13e5402122edbc77821c66d2758abb53aabe87f013287", size = 436234 }, + { url = "https://files.pythonhosted.org/packages/a7/bd/358c7032c43d4875dcbedc9113b087ef8bc619bee034f9423335698631e3/aiohttp-3.11.12-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:7c3623053b85b4296cd3925eeb725e386644fd5bc67250b3bb08b0f144803e7b", size = 709588 }, + { url = "https://files.pythonhosted.org/packages/9f/87/9e4700a56722c139b6ed4ad9be926183545a1b55e82babd9b082be3ef4c5/aiohttp-3.11.12-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:67453e603cea8e85ed566b2700efa1f6916aefbc0c9fcb2e86aaffc08ec38e78", size = 469076 }, + { url = "https://files.pythonhosted.org/packages/c0/fa/585b66076795911800f8f16f0f93ea8fb9bfa5d8fd757bbf78f32d17c2d9/aiohttp-3.11.12-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:6130459189e61baac5a88c10019b21e1f0c6d00ebc770e9ce269475650ff7f73", size = 456148 }, + { url = "https://files.pythonhosted.org/packages/ba/6b/a1fe710860b10d83799af8c63cf2ffb63eac4edaa42d76e9540679545951/aiohttp-3.11.12-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9060addfa4ff753b09392efe41e6af06ea5dd257829199747b9f15bfad819460", size = 1587566 }, + { url = "https://files.pythonhosted.org/packages/31/78/ab78f36b44c7239c953afd9bb331edf2b3977925de2ce98545d62e415565/aiohttp-3.11.12-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:34245498eeb9ae54c687a07ad7f160053911b5745e186afe2d0c0f2898a1ab8a", size = 1636411 }, + { url = "https://files.pythonhosted.org/packages/e1/5c/b316b559dde4ae983e725132a2fa2518532ad56ca4698d4b71f42af48722/aiohttp-3.11.12-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8dc0fba9a74b471c45ca1a3cb6e6913ebfae416678d90529d188886278e7f3f6", size = 1672484 }, + { url = "https://files.pythonhosted.org/packages/90/08/8c409ab4040276a8c9944d5e444121a2f34151872440b3c69f31c35edf18/aiohttp-3.11.12-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a478aa11b328983c4444dacb947d4513cb371cd323f3845e53caeda6be5589d5", size = 1589689 }, + { url = "https://files.pythonhosted.org/packages/e0/25/53b4ceffaac5dcaf4772be41f4f06e7201be5407aa743758e1a37f7d1b63/aiohttp-3.11.12-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:c160a04283c8c6f55b5bf6d4cad59bb9c5b9c9cd08903841b25f1f7109ef1259", size = 1544225 }, + { url = "https://files.pythonhosted.org/packages/4a/40/769d221f4067a05974b3352ffa228041bcda72c487689ab4030791691861/aiohttp-3.11.12-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:edb69b9589324bdc40961cdf0657815df674f1743a8d5ad9ab56a99e4833cfdd", size = 1530391 }, + { url = "https://files.pythonhosted.org/packages/14/48/22527fadfdfca85fb585870ffd98aece982606775fd2f4ee80270f5c85a0/aiohttp-3.11.12-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:4ee84c2a22a809c4f868153b178fe59e71423e1f3d6a8cd416134bb231fbf6d3", size = 1559005 }, + { url = "https://files.pythonhosted.org/packages/fd/0e/72144954bae5d80a8857dca18b8ed8e2ef76acf557465545ad5b5b9bfb58/aiohttp-3.11.12-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:bf4480a5438f80e0f1539e15a7eb8b5f97a26fe087e9828e2c0ec2be119a9f72", size = 1536244 }, + { url = "https://files.pythonhosted.org/packages/60/db/a2cfb5565f5e5870757e2d3099f8e24640e746ff2ba9ea899b35b6acad3f/aiohttp-3.11.12-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:e6b2732ef3bafc759f653a98881b5b9cdef0716d98f013d376ee8dfd7285abf1", size = 1607092 }, + { url = "https://files.pythonhosted.org/packages/b0/31/87e869650c5532876e83c7c7d9d3f5505c5a738abe991f3ac2264070ee81/aiohttp-3.11.12-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:f752e80606b132140883bb262a457c475d219d7163d996dc9072434ffb0784c4", size = 1629268 }, + { url = "https://files.pythonhosted.org/packages/d2/73/25fb4d2d259caf4cf23035204315665976a66292a1055d0937c62273675a/aiohttp-3.11.12-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:ab3247d58b393bda5b1c8f31c9edece7162fc13265334217785518dd770792b8", size = 1567511 }, + { url = "https://files.pythonhosted.org/packages/a3/59/ef91da9971e187033476945cd18bebc3974930bde81cdf66099b318df7a6/aiohttp-3.11.12-cp39-cp39-win32.whl", hash = "sha256:0d5176f310a7fe6f65608213cc74f4228e4f4ce9fd10bcb2bb6da8fc66991462", size = 417082 }, + { url = "https://files.pythonhosted.org/packages/e0/fa/6cfc042c0f59d1fa6eaeeb678b9f13b2c0bf1d7803dae81b93ca55ac6288/aiohttp-3.11.12-cp39-cp39-win_amd64.whl", hash = "sha256:74bd573dde27e58c760d9ca8615c41a57e719bff315c9adb6f2a4281a28e8798", size = 442385 }, ] [[package]] @@ -170,35 +174,34 @@ wheels = [ [[package]] name = "attrs" -version = "24.3.0" +version = "25.1.0" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/48/c8/6260f8ccc11f0917360fc0da435c5c9c7504e3db174d5a12a1494887b045/attrs-24.3.0.tar.gz", hash = "sha256:8f5c07333d543103541ba7be0e2ce16eeee8130cb0b3f9238ab904ce1e85baff", size = 805984 } +sdist = { url = "https://files.pythonhosted.org/packages/49/7c/fdf464bcc51d23881d110abd74b512a42b3d5d376a55a831b44c603ae17f/attrs-25.1.0.tar.gz", hash = "sha256:1c97078a80c814273a76b2a298a932eb681c87415c11dee0a6921de7f1b02c3e", size = 810562 } wheels = [ - { url = "https://files.pythonhosted.org/packages/89/aa/ab0f7891a01eeb2d2e338ae8fecbe57fcebea1a24dbb64d45801bfab481d/attrs-24.3.0-py3-none-any.whl", hash = "sha256:ac96cd038792094f438ad1f6ff80837353805ac950cd2aa0e0625ef19850c308", size = 63397 }, + { url = "https://files.pythonhosted.org/packages/fc/30/d4986a882011f9df997a55e6becd864812ccfcd821d64aac8570ee39f719/attrs-25.1.0-py3-none-any.whl", hash = "sha256:c75a69e28a550a7e93789579c22aa26b0f5b83b75dc4e08fe092980051e1090a", size = 63152 }, ] [[package]] name = "bitsandbytes" -version = "0.45.0" +version = "0.45.2" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "numpy", 
version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "numpy", version = "2.2.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "numpy", version = "2.2.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, { name = "torch" }, - { name = "typing-extensions" }, ] wheels = [ - { url = "https://files.pythonhosted.org/packages/99/9a/f41d252bf8b0bc5969b4dce1274cd04b7ddc541de1060dd27eca680bc1b2/bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl", hash = "sha256:0f0323de1ff1fdf8383e79bdad1283516a4c05a6fd2b44a363bf4e059422305b", size = 69084267 }, - { url = "https://files.pythonhosted.org/packages/1e/90/a2bbb9b5f997b9c9aa9c15ee4adf553ee71053bb942f89fd48d920a1aa9d/bitsandbytes-0.45.0-py3-none-win_amd64.whl", hash = "sha256:ebbf96e0ecb466716a65ecdeaef3fa1983575447b9ab66b74e5211892507c6ff", size = 68520043 }, + { url = "https://files.pythonhosted.org/packages/db/9d/9382259196d7ad7f3550702390081224e673a705e75b5660ee377b592fc0/bitsandbytes-0.45.2-py3-none-manylinux_2_24_x86_64.whl", hash = "sha256:ba3a720187f518b172ebce4081049c682ae3fd8284947e22499b256ff99a2bc3", size = 69680042 }, + { url = "https://files.pythonhosted.org/packages/cb/33/550bcfe84f08ee20f3bcc1b129dcadaf7f2d1a065ce9812476fc7be2874a/bitsandbytes-0.45.2-py3-none-win_amd64.whl", hash = "sha256:e1893170455422924156760bae9e210ae26e8bd2ce7b21065d24b47decbe6963", size = 69124143 }, ] [[package]] name = "certifi" -version = "2024.12.14" +version = "2025.1.31" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/0f/bd/1d41ee578ce09523c81a15426705dd20969f5abf006d1afe8aeff0dd776a/certifi-2024.12.14.tar.gz", hash = "sha256:b650d30f370c2b724812bee08008be0c4163b163ddaec3f2546c1caf65f191db", size = 166010 } +sdist = { url = "https://files.pythonhosted.org/packages/1c/ab/c9f1e32b7b1bf505bf26f0ef697775960db7932abeb7b516de930ba2705f/certifi-2025.1.31.tar.gz", hash = "sha256:3d5da6925056f6f18f119200434a4780a94263f10d1c21d032a6f6b2baa20651", size = 167577 } wheels = [ - { url = "https://files.pythonhosted.org/packages/a5/32/8f6669fc4798494966bf446c8c4a162e0b5d893dff088afddf76414f70e1/certifi-2024.12.14-py3-none-any.whl", hash = "sha256:1275f7a45be9464efc1173084eaa30f866fe2e47d389406136d332ed4967ec56", size = 164927 }, + { url = "https://files.pythonhosted.org/packages/38/fc/bce832fd4fd99766c04d1ee0eead6b0ec6486fb100ae5e74c1d91292b982/certifi-2025.1.31-py3-none-any.whl", hash = "sha256:ca78db4565a652026a4db2bcdf68f2fb589ea80d0be70e03929ed730746b84fe", size = 166393 }, ] [[package]] @@ -307,16 +310,16 @@ wheels = [ [[package]] name = "compressed-tensors" -version = "0.9.0" +version = "0.9.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "pydantic" }, { name = "torch" }, { name = "transformers" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/9a/58/e7cab9a2c423066b139e6fd5bd2f1979c88fe0cf6dabbee631917288b79e/compressed-tensors-0.9.0.tar.gz", hash = "sha256:80f8b002309b5970493e578d58542a308111fbef26f96811fa2c5322b24e7f1f", size = 63058 } +sdist = { url = "https://files.pythonhosted.org/packages/40/e0/d9529aae2d2425d214e5a50497df4532d3f9e21c8d2023037c701f8a37d3/compressed-tensors-0.9.1.tar.gz", hash = "sha256:3cf5cd637f0186c184dd5bbbbf941356b1225199b49c6a45bf0909d65907f686", size = 63060 } wheels = [ - { url = 
"https://files.pythonhosted.org/packages/18/48/9a733034c68d22708f01094228f0642789ed94a2762612c227184cfd90e6/compressed_tensors-0.9.0-py3-none-any.whl", hash = "sha256:c4a0bccf2fd180a18a4bf0646f7746df4ea8ea537c06061f4b571662debf4424", size = 96438 }, + { url = "https://files.pythonhosted.org/packages/d0/6d/cfb16a141dfffc6721737cb5f26951c4e72eb338bace7d9fdfa880dfdda9/compressed_tensors-0.9.1-py3-none-any.whl", hash = "sha256:77385f879c5c092db777a7880851cd9f801bf2f9bb46bb4402f052d9e002975c", size = 96454 }, ] [[package]] @@ -331,7 +334,7 @@ dependencies = [ { name = "huggingface-hub" }, { name = "multiprocess" }, { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "numpy", version = "2.2.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "numpy", version = "2.2.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, { name = "packaging" }, { name = "pandas" }, { name = "pyarrow" }, @@ -347,14 +350,14 @@ wheels = [ [[package]] name = "deprecated" -version = "1.2.15" +version = "1.2.18" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "wrapt" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/2e/a3/53e7d78a6850ffdd394d7048a31a6f14e44900adedf190f9a165f6b69439/deprecated-1.2.15.tar.gz", hash = "sha256:683e561a90de76239796e6b6feac66b99030d2dd3fcf61ef996330f14bbb9b0d", size = 2977612 } +sdist = { url = "https://files.pythonhosted.org/packages/98/97/06afe62762c9a8a86af0cfb7bfdab22a43ad17138b07af5b1a58442690a2/deprecated-1.2.18.tar.gz", hash = "sha256:422b6f6d859da6f2ef57857761bfb392480502a64c3028ca9bbe86085d72115d", size = 2928744 } wheels = [ - { url = "https://files.pythonhosted.org/packages/1d/8f/c7f227eb42cfeaddce3eb0c96c60cbca37797fa7b34f8e1aeadf6c5c0983/Deprecated-1.2.15-py2.py3-none-any.whl", hash = "sha256:353bc4a8ac4bfc96800ddab349d89c25dec1079f65fd53acdcc1e0b975b21320", size = 9941 }, + { url = "https://files.pythonhosted.org/packages/6e/c6/ac0b6c1e2d138f1002bcf799d330bd6d85084fece321e662a14223794041/Deprecated-1.2.18-py2.py3-none-any.whl", hash = "sha256:bd5011788200372a32418f888e326a09ff80d0214bd961147cfed01b5c018eec", size = 9998 }, ] [[package]] @@ -377,11 +380,11 @@ wheels = [ [[package]] name = "einops" -version = "0.8.0" +version = "0.8.1" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/79/ca/9f5dcb8bead39959454c3912266bedc4c315839cee0e0ca9f4328f4588c1/einops-0.8.0.tar.gz", hash = "sha256:63486517fed345712a8385c100cb279108d9d47e6ae59099b07657e983deae85", size = 58861 } +sdist = { url = "https://files.pythonhosted.org/packages/e5/81/df4fbe24dff8ba3934af99044188e20a98ed441ad17a274539b74e82e126/einops-0.8.1.tar.gz", hash = "sha256:de5d960a7a761225532e0f1959e5315ebeafc0cd43394732f103ca44b9837e84", size = 54805 } wheels = [ - { url = "https://files.pythonhosted.org/packages/44/5a/f0b9ad6c0a9017e62d4735daaeb11ba3b6c009d69a26141b258cd37b5588/einops-0.8.0-py3-none-any.whl", hash = "sha256:9572fb63046264a862693b0a87088af3bdc8c068fde03de63453cbbde245465f", size = 43223 }, + { url = "https://files.pythonhosted.org/packages/87/62/9773de14fe6c45c23649e98b83231fffd7b9892b6cf863251dc2afa73643/einops-0.8.1-py3-none-any.whl", hash = "sha256:919387eb55330f5757c6bea9165c5ff5cfe63a642682ea788a6d472576d81737", size = 64359 }, ] [[package]] @@ -395,11 +398,11 @@ wheels = [ [[package]] name = "filelock" -version = 
"3.16.1" +version = "3.17.0" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/9d/db/3ef5bb276dae18d6ec2124224403d1d67bccdbefc17af4cc8f553e341ab1/filelock-3.16.1.tar.gz", hash = "sha256:c249fbfcd5db47e5e2d6d62198e565475ee65e4831e2561c8e313fa7eb961435", size = 18037 } +sdist = { url = "https://files.pythonhosted.org/packages/dc/9c/0b15fb47b464e1b663b1acd1253a062aa5feecb07d4e597daea542ebd2b5/filelock-3.17.0.tar.gz", hash = "sha256:ee4e77401ef576ebb38cd7f13b9b28893194acc20a8e68e18730ba9c0e54660e", size = 18027 } wheels = [ - { url = "https://files.pythonhosted.org/packages/b9/f8/feced7779d755758a52d1f6635d990b8d98dc0a29fa568bbe0625f18fdf3/filelock-3.16.1-py3-none-any.whl", hash = "sha256:2082e5703d51fbf98ea75855d9d5527e33d8ff23099bec374a134febee6946b0", size = 16163 }, + { url = "https://files.pythonhosted.org/packages/89/ec/00d68c4ddfedfe64159999e5f8a98fb8442729a63e2077eb9dcd89623d27/filelock-3.17.0-py3-none-any.whl", hash = "sha256:533dc2f7ba78dc2f0f531fc6c4940addf7b70a481e269a5a3b93be94ffbe8338", size = 16164 }, ] [[package]] @@ -500,6 +503,15 @@ http = [ { name = "aiohttp" }, ] +[[package]] +name = "genson" +version = "1.3.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/c5/cf/2303c8ad276dcf5ee2ad6cf69c4338fd86ef0f471a5207b069adf7a393cf/genson-1.3.0.tar.gz", hash = "sha256:e02db9ac2e3fd29e65b5286f7135762e2cd8a986537c075b06fc5f1517308e37", size = 34919 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f8/5c/e226de133afd8bb267ec27eead9ae3d784b95b39a287ed404caab39a5f50/genson-1.3.0-py3-none-any.whl", hash = "sha256:468feccd00274cc7e4c09e84b08704270ba8d95232aa280f65b986139cec67f7", size = 21470 }, +] + [[package]] name = "googleapis-common-protos" version = "1.66.0" @@ -526,140 +538,140 @@ wheels = [ [[package]] name = "grpcio" -version = "1.69.0" +version = "1.70.0" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/e4/87/06a145284cbe86c91ca517fe6b57be5efbb733c0d6374b407f0992054d18/grpcio-1.69.0.tar.gz", hash = "sha256:936fa44241b5379c5afc344e1260d467bee495747eaf478de825bab2791da6f5", size = 12738244 } +sdist = { url = "https://files.pythonhosted.org/packages/69/e1/4b21b5017c33f3600dcc32b802bb48fe44a4d36d6c066f52650c7c2690fa/grpcio-1.70.0.tar.gz", hash = "sha256:8d1584a68d5922330025881e63a6c1b54cc8117291d382e4fa69339b6d914c56", size = 12788932 } wheels = [ - { url = "https://files.pythonhosted.org/packages/b0/6e/2f8ee5fb65aef962d0bd7e46b815e7b52820687e29c138eaee207a688abc/grpcio-1.69.0-cp310-cp310-linux_armv7l.whl", hash = "sha256:2060ca95a8db295ae828d0fc1c7f38fb26ccd5edf9aa51a0f44251f5da332e97", size = 5190753 }, - { url = "https://files.pythonhosted.org/packages/89/07/028dcda44d40f9488f0a0de79c5ffc80e2c1bc5ed89da9483932e3ea67cf/grpcio-1.69.0-cp310-cp310-macosx_12_0_universal2.whl", hash = "sha256:2e52e107261fd8fa8fa457fe44bfadb904ae869d87c1280bf60f93ecd3e79278", size = 11096752 }, - { url = "https://files.pythonhosted.org/packages/99/a0/c727041b1410605ba38b585b6b52c1a289d7fcd70a41bccbc2c58fc643b2/grpcio-1.69.0-cp310-cp310-manylinux_2_17_aarch64.whl", hash = "sha256:316463c0832d5fcdb5e35ff2826d9aa3f26758d29cdfb59a368c1d6c39615a11", size = 5705442 }, - { url = "https://files.pythonhosted.org/packages/7a/2f/1c53f5d127ff882443b19c757d087da1908f41c58c4b098e8eaf6b2bb70a/grpcio-1.69.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:26c9a9c4ac917efab4704b18eed9082ed3b6ad19595f047e8173b5182fec0d5e", size = 6333796 }, - { url = "https://files.pythonhosted.org/packages/cc/f6/2017da2a1b64e896af710253e5bfbb4188605cdc18bce3930dae5cdbf502/grpcio-1.69.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:90b3646ced2eae3a0599658eeccc5ba7f303bf51b82514c50715bdd2b109e5ec", size = 5954245 }, - { url = "https://files.pythonhosted.org/packages/c1/65/1395bec928e99ba600464fb01b541e7e4cdd462e6db25259d755ef9f8d02/grpcio-1.69.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:3b75aea7c6cb91b341c85e7c1d9db1e09e1dd630b0717f836be94971e015031e", size = 6664854 }, - { url = "https://files.pythonhosted.org/packages/40/57/8b3389cfeb92056c8b44288c9c4ed1d331bcad0215c4eea9ae4629e156d9/grpcio-1.69.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:5cfd14175f9db33d4b74d63de87c64bb0ee29ce475ce3c00c01ad2a3dc2a9e51", size = 6226854 }, - { url = "https://files.pythonhosted.org/packages/cc/61/1f2bbeb7c15544dffc98b3f65c093e746019995e6f1e21dc3655eec3dc23/grpcio-1.69.0-cp310-cp310-win32.whl", hash = "sha256:9031069d36cb949205293cf0e243abd5e64d6c93e01b078c37921493a41b72dc", size = 3662734 }, - { url = "https://files.pythonhosted.org/packages/ef/ba/bf1a6d9f5c17d2da849793d72039776c56c98c889c9527f6721b6ee57e6e/grpcio-1.69.0-cp310-cp310-win_amd64.whl", hash = "sha256:cc89b6c29f3dccbe12d7a3b3f1b3999db4882ae076c1c1f6df231d55dbd767a5", size = 4410306 }, - { url = "https://files.pythonhosted.org/packages/8d/cd/ca256aeef64047881586331347cd5a68a4574ba1a236e293cd8eba34e355/grpcio-1.69.0-cp311-cp311-linux_armv7l.whl", hash = "sha256:8de1b192c29b8ce45ee26a700044717bcbbd21c697fa1124d440548964328561", size = 5198734 }, - { url = "https://files.pythonhosted.org/packages/37/3f/10c1e5e0150bf59aa08ea6aebf38f87622f95f7f33f98954b43d1b2a3200/grpcio-1.69.0-cp311-cp311-macosx_10_14_universal2.whl", hash = "sha256:7e76accf38808f5c5c752b0ab3fd919eb14ff8fafb8db520ad1cc12afff74de6", size = 11135285 }, - { url = "https://files.pythonhosted.org/packages/08/61/61cd116a572203a740684fcba3fef37a3524f1cf032b6568e1e639e59db0/grpcio-1.69.0-cp311-cp311-manylinux_2_17_aarch64.whl", hash = "sha256:d5658c3c2660417d82db51e168b277e0ff036d0b0f859fa7576c0ffd2aec1442", size = 5699468 }, - { url = "https://files.pythonhosted.org/packages/01/f1/a841662e8e2465ba171c973b77d18fa7438ced535519b3c53617b7e6e25c/grpcio-1.69.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5494d0e52bf77a2f7eb17c6da662886ca0a731e56c1c85b93505bece8dc6cf4c", size = 6332337 }, - { url = "https://files.pythonhosted.org/packages/62/b1/c30e932e02c2e0bfdb8df46fe3b0c47f518fb04158ebdc0eb96cc97d642f/grpcio-1.69.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4ed866f9edb574fd9be71bf64c954ce1b88fc93b2a4cbf94af221e9426eb14d6", size = 5949844 }, - { url = "https://files.pythonhosted.org/packages/5e/cb/55327d43b6286100ffae7d1791be6178d13c917382f3e9f43f82e8b393cf/grpcio-1.69.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:c5ba38aeac7a2fe353615c6b4213d1fbb3a3c34f86b4aaa8be08baaaee8cc56d", size = 6661828 }, - { url = "https://files.pythonhosted.org/packages/6f/e4/120d72ae982d51cb9cabcd9672f8a1c6d62011b493a4d049d2abdf564db0/grpcio-1.69.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:f79e05f5bbf551c4057c227d1b041ace0e78462ac8128e2ad39ec58a382536d2", size = 6226026 }, - { url = "https://files.pythonhosted.org/packages/96/e8/2cc15f11db506d7b1778f0587fa7bdd781602b05b3c4d75b7ca13de33d62/grpcio-1.69.0-cp311-cp311-win32.whl", hash = 
"sha256:bf1f8be0da3fcdb2c1e9f374f3c2d043d606d69f425cd685110dd6d0d2d61258", size = 3662653 }, - { url = "https://files.pythonhosted.org/packages/42/78/3c5216829a48237fcb71a077f891328a435e980d9757a9ebc49114d88768/grpcio-1.69.0-cp311-cp311-win_amd64.whl", hash = "sha256:fb9302afc3a0e4ba0b225cd651ef8e478bf0070cf11a529175caecd5ea2474e7", size = 4412824 }, - { url = "https://files.pythonhosted.org/packages/61/1d/8f28f147d7f3f5d6b6082f14e1e0f40d58e50bc2bd30d2377c730c57a286/grpcio-1.69.0-cp312-cp312-linux_armv7l.whl", hash = "sha256:fc18a4de8c33491ad6f70022af5c460b39611e39578a4d84de0fe92f12d5d47b", size = 5161414 }, - { url = "https://files.pythonhosted.org/packages/35/4b/9ab8ea65e515e1844feced1ef9e7a5d8359c48d986c93f3d2a2006fbdb63/grpcio-1.69.0-cp312-cp312-macosx_10_14_universal2.whl", hash = "sha256:0f0270bd9ffbff6961fe1da487bdcd594407ad390cc7960e738725d4807b18c4", size = 11108909 }, - { url = "https://files.pythonhosted.org/packages/99/68/1856fde2b3c3162bdfb9845978608deef3606e6907fdc2c87443fce6ecd0/grpcio-1.69.0-cp312-cp312-manylinux_2_17_aarch64.whl", hash = "sha256:dc48f99cc05e0698e689b51a05933253c69a8c8559a47f605cff83801b03af0e", size = 5658302 }, - { url = "https://files.pythonhosted.org/packages/3e/21/3fa78d38dc5080d0d677103fad3a8cd55091635cc2069a7c06c7a54e6c4d/grpcio-1.69.0-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1e925954b18d41aeb5ae250262116d0970893b38232689c4240024e4333ac084", size = 6306201 }, - { url = "https://files.pythonhosted.org/packages/f3/cb/5c47b82fd1baf43dba973ae399095d51aaf0085ab0439838b4cbb1e87e3c/grpcio-1.69.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:87d222569273720366f68a99cb62e6194681eb763ee1d3b1005840678d4884f9", size = 5919649 }, - { url = "https://files.pythonhosted.org/packages/c6/67/59d1a56a0f9508a29ea03e1ce800bdfacc1f32b4f6b15274b2e057bf8758/grpcio-1.69.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:b62b0f41e6e01a3e5082000b612064c87c93a49b05f7602fe1b7aa9fd5171a1d", size = 6648974 }, - { url = "https://files.pythonhosted.org/packages/f8/fe/ca70c14d98c6400095f19a0f4df8273d09c2106189751b564b26019f1dbe/grpcio-1.69.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:db6f9fd2578dbe37db4b2994c94a1d9c93552ed77dca80e1657bb8a05b898b55", size = 6215144 }, - { url = "https://files.pythonhosted.org/packages/b3/94/b2b0a9fd487fc8262e20e6dd0ec90d9fa462c82a43b4855285620f6e9d01/grpcio-1.69.0-cp312-cp312-win32.whl", hash = "sha256:b192b81076073ed46f4b4dd612b8897d9a1e39d4eabd822e5da7b38497ed77e1", size = 3644552 }, - { url = "https://files.pythonhosted.org/packages/93/99/81aec9f85412e3255a591ae2ccb799238e074be774e5f741abae08a23418/grpcio-1.69.0-cp312-cp312-win_amd64.whl", hash = "sha256:1227ff7836f7b3a4ab04e5754f1d001fa52a730685d3dc894ed8bc262cc96c01", size = 4399532 }, - { url = "https://files.pythonhosted.org/packages/54/47/3ff4501365f56b7cc16617695dbd4fd838c5e362bc7fa9fee09d592f7d78/grpcio-1.69.0-cp313-cp313-linux_armv7l.whl", hash = "sha256:a78a06911d4081a24a1761d16215a08e9b6d4d29cdbb7e427e6c7e17b06bcc5d", size = 5162928 }, - { url = "https://files.pythonhosted.org/packages/c0/63/437174c5fa951052c9ecc5f373f62af6f3baf25f3f5ef35cbf561806b371/grpcio-1.69.0-cp313-cp313-macosx_10_14_universal2.whl", hash = "sha256:dc5a351927d605b2721cbb46158e431dd49ce66ffbacb03e709dc07a491dde35", size = 11103027 }, - { url = "https://files.pythonhosted.org/packages/53/df/53566a6fdc26b6d1f0585896e1cc4825961039bca5a6a314ff29d79b5d5b/grpcio-1.69.0-cp313-cp313-manylinux_2_17_aarch64.whl", hash = 
"sha256:3629d8a8185f5139869a6a17865d03113a260e311e78fbe313f1a71603617589", size = 5659277 }, - { url = "https://files.pythonhosted.org/packages/e6/4c/b8a0c4f71498b6f9be5ca6d290d576cf2af9d95fd9827c47364f023969ad/grpcio-1.69.0-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c9a281878feeb9ae26db0622a19add03922a028d4db684658f16d546601a4870", size = 6305255 }, - { url = "https://files.pythonhosted.org/packages/ef/55/d9aa05eb3dfcf6aa946aaf986740ec07fc5189f20e2cbeb8c5d278ffd00f/grpcio-1.69.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8cc614e895177ab7e4b70f154d1a7c97e152577ea101d76026d132b7aaba003b", size = 5920240 }, - { url = "https://files.pythonhosted.org/packages/ea/eb/774b27c51e3e386dfe6c491a710f6f87ffdb20d88ec6c3581e047d9354a2/grpcio-1.69.0-cp313-cp313-musllinux_1_1_i686.whl", hash = "sha256:1ee76cd7e2e49cf9264f6812d8c9ac1b85dda0eaea063af07292400f9191750e", size = 6652974 }, - { url = "https://files.pythonhosted.org/packages/59/98/96de14e6e7d89123813d58c246d9b0f1fbd24f9277f5295264e60861d9d6/grpcio-1.69.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:0470fa911c503af59ec8bc4c82b371ee4303ececbbdc055f55ce48e38b20fd67", size = 6215757 }, - { url = "https://files.pythonhosted.org/packages/7d/5b/ce922e0785910b10756fabc51fd294260384a44bea41651dadc4e47ddc82/grpcio-1.69.0-cp313-cp313-win32.whl", hash = "sha256:b650f34aceac8b2d08a4c8d7dc3e8a593f4d9e26d86751ebf74ebf5107d927de", size = 3642488 }, - { url = "https://files.pythonhosted.org/packages/5d/04/11329e6ca1ceeb276df2d9c316b5e170835a687a4d0f778dba8294657e36/grpcio-1.69.0-cp313-cp313-win_amd64.whl", hash = "sha256:028337786f11fecb5d7b7fa660475a06aabf7e5e52b5ac2df47414878c0ce7ea", size = 4399968 }, - { url = "https://files.pythonhosted.org/packages/c6/e6/9c6448a9f2b192b4dab8ecba6a99d34aebfb3398da9f407eb8f5a14181d4/grpcio-1.69.0-cp39-cp39-linux_armv7l.whl", hash = "sha256:dd034d68a2905464c49479b0c209c773737a4245d616234c79c975c7c90eca03", size = 5190897 }, - { url = "https://files.pythonhosted.org/packages/4d/ce/fb54596867c813756c70266cb433e37619324c0f18ad917c2bbeeb6b5b21/grpcio-1.69.0-cp39-cp39-macosx_10_14_universal2.whl", hash = "sha256:01f834732c22a130bdf3dc154d1053bdbc887eb3ccb7f3e6285cfbfc33d9d5cc", size = 11124006 }, - { url = "https://files.pythonhosted.org/packages/af/c1/c314372f3b6605b3ef8c03bcecd3deef92a3a5817b26ca4c5a6d519bdf04/grpcio-1.69.0-cp39-cp39-manylinux_2_17_aarch64.whl", hash = "sha256:a7f4ed0dcf202a70fe661329f8874bc3775c14bb3911d020d07c82c766ce0eb1", size = 5703399 }, - { url = "https://files.pythonhosted.org/packages/c6/e4/d4a051b2e3752590e5a8fdfd3270045d8c0e49f0566fd9dacf30e3de1bc3/grpcio-1.69.0-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cd7ea241b10bc5f0bb0f82c0d7896822b7ed122b3ab35c9851b440c1ccf81588", size = 6333585 }, - { url = "https://files.pythonhosted.org/packages/9b/dd/3b0057863f27325ad9371e966684d2e287cdb4ee5861b4cd4fbbb1c7bf91/grpcio-1.69.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1f03dc9b4da4c0dc8a1db7a5420f575251d7319b7a839004d8916257ddbe4816", size = 5953919 }, - { url = "https://files.pythonhosted.org/packages/98/8a/5f782d5493e4c67c64389996d800a666987dc27ab5fbe093864e9fd66982/grpcio-1.69.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:ca71d73a270dff052fe4edf74fef142d6ddd1f84175d9ac4a14b7280572ac519", size = 6666357 }, - { url = "https://files.pythonhosted.org/packages/de/a4/d1a03913df292ba7322086c68301c66e14b3f8f9532e4c3854846442f0a0/grpcio-1.69.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash 
= "sha256:5ccbed100dc43704e94ccff9e07680b540d64e4cc89213ab2832b51b4f68a520", size = 6226574 }, - { url = "https://files.pythonhosted.org/packages/8d/fb/e104bc4296ee4991d803dd39b6c72ed247ba0e18a4e56fd895947aae1249/grpcio-1.69.0-cp39-cp39-win32.whl", hash = "sha256:1514341def9c6ec4b7f0b9628be95f620f9d4b99331b7ef0a1845fd33d9b579c", size = 3663452 }, - { url = "https://files.pythonhosted.org/packages/ad/39/12d48bccd429699a3c909173b395900eb64e4c6bc5eed34d7088e438bc4d/grpcio-1.69.0-cp39-cp39-win_amd64.whl", hash = "sha256:c1fea55d26d647346acb0069b08dca70984101f2dc95066e003019207212e303", size = 4411151 }, + { url = "https://files.pythonhosted.org/packages/10/e9/f72408bac1f7b05b25e4df569b02d6b200c8e7857193aa9f1df7a3744add/grpcio-1.70.0-cp310-cp310-linux_armv7l.whl", hash = "sha256:95469d1977429f45fe7df441f586521361e235982a0b39e33841549143ae2851", size = 5229736 }, + { url = "https://files.pythonhosted.org/packages/b3/17/e65139ea76dac7bcd8a3f17cbd37e3d1a070c44db3098d0be5e14c5bd6a1/grpcio-1.70.0-cp310-cp310-macosx_12_0_universal2.whl", hash = "sha256:ed9718f17fbdb472e33b869c77a16d0b55e166b100ec57b016dc7de9c8d236bf", size = 11432751 }, + { url = "https://files.pythonhosted.org/packages/a0/12/42de6082b4ab14a59d30b2fc7786882fdaa75813a4a4f3d4a8c4acd6ed59/grpcio-1.70.0-cp310-cp310-manylinux_2_17_aarch64.whl", hash = "sha256:374d014f29f9dfdb40510b041792e0e2828a1389281eb590df066e1cc2b404e5", size = 5711439 }, + { url = "https://files.pythonhosted.org/packages/34/f8/b5a19524d273cbd119274a387bb72d6fbb74578e13927a473bc34369f079/grpcio-1.70.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f2af68a6f5c8f78d56c145161544ad0febbd7479524a59c16b3e25053f39c87f", size = 6330777 }, + { url = "https://files.pythonhosted.org/packages/1a/67/3d6c0ad786238aac7fa93b79246fc452978fbfe9e5f86f70da8e8a2797d0/grpcio-1.70.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ce7df14b2dcd1102a2ec32f621cc9fab6695effef516efbc6b063ad749867295", size = 5944639 }, + { url = "https://files.pythonhosted.org/packages/76/0d/d9f7cbc41c2743cf18236a29b6a582f41bd65572a7144d92b80bc1e68479/grpcio-1.70.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:c78b339869f4dbf89881e0b6fbf376313e4f845a42840a7bdf42ee6caed4b11f", size = 6643543 }, + { url = "https://files.pythonhosted.org/packages/fc/24/bdd7e606b3400c14330e33a4698fa3a49e38a28c9e0a831441adbd3380d2/grpcio-1.70.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:58ad9ba575b39edef71f4798fdb5c7b6d02ad36d47949cd381d4392a5c9cbcd3", size = 6199897 }, + { url = "https://files.pythonhosted.org/packages/d1/33/8132eb370087960c82d01b89faeb28f3e58f5619ffe19889f57c58a19c18/grpcio-1.70.0-cp310-cp310-win32.whl", hash = "sha256:2b0d02e4b25a5c1f9b6c7745d4fa06efc9fd6a611af0fb38d3ba956786b95199", size = 3617513 }, + { url = "https://files.pythonhosted.org/packages/99/bc/0fce5cfc0ca969df66f5dca6cf8d2258abb88146bf9ab89d8cf48e970137/grpcio-1.70.0-cp310-cp310-win_amd64.whl", hash = "sha256:0de706c0a5bb9d841e353f6343a9defc9fc35ec61d6eb6111802f3aa9fef29e1", size = 4303342 }, + { url = "https://files.pythonhosted.org/packages/65/c4/1f67d23d6bcadd2fd61fb460e5969c52b3390b4a4e254b5e04a6d1009e5e/grpcio-1.70.0-cp311-cp311-linux_armv7l.whl", hash = "sha256:17325b0be0c068f35770f944124e8839ea3185d6d54862800fc28cc2ffad205a", size = 5229017 }, + { url = "https://files.pythonhosted.org/packages/e4/bd/cc36811c582d663a740fb45edf9f99ddbd99a10b6ba38267dc925e1e193a/grpcio-1.70.0-cp311-cp311-macosx_10_14_universal2.whl", hash = 
"sha256:dbe41ad140df911e796d4463168e33ef80a24f5d21ef4d1e310553fcd2c4a386", size = 11472027 }, + { url = "https://files.pythonhosted.org/packages/7e/32/8538bb2ace5cd72da7126d1c9804bf80b4fe3be70e53e2d55675c24961a8/grpcio-1.70.0-cp311-cp311-manylinux_2_17_aarch64.whl", hash = "sha256:5ea67c72101d687d44d9c56068328da39c9ccba634cabb336075fae2eab0d04b", size = 5707785 }, + { url = "https://files.pythonhosted.org/packages/ce/5c/a45f85f2a0dfe4a6429dee98717e0e8bd7bd3f604315493c39d9679ca065/grpcio-1.70.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cb5277db254ab7586769e490b7b22f4ddab3876c490da0a1a9d7c695ccf0bf77", size = 6331599 }, + { url = "https://files.pythonhosted.org/packages/9f/e5/5316b239380b8b2ad30373eb5bb25d9fd36c0375e94a98a0a60ea357d254/grpcio-1.70.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e7831a0fc1beeeb7759f737f5acd9fdcda520e955049512d68fda03d91186eea", size = 5940834 }, + { url = "https://files.pythonhosted.org/packages/05/33/dbf035bc6d167068b4a9f2929dfe0b03fb763f0f861ecb3bb1709a14cb65/grpcio-1.70.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:27cc75e22c5dba1fbaf5a66c778e36ca9b8ce850bf58a9db887754593080d839", size = 6641191 }, + { url = "https://files.pythonhosted.org/packages/4c/c4/684d877517e5bfd6232d79107e5a1151b835e9f99051faef51fed3359ec4/grpcio-1.70.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:d63764963412e22f0491d0d32833d71087288f4e24cbcddbae82476bfa1d81fd", size = 6198744 }, + { url = "https://files.pythonhosted.org/packages/e9/43/92fe5eeaf340650a7020cfb037402c7b9209e7a0f3011ea1626402219034/grpcio-1.70.0-cp311-cp311-win32.whl", hash = "sha256:bb491125103c800ec209d84c9b51f1c60ea456038e4734688004f377cfacc113", size = 3617111 }, + { url = "https://files.pythonhosted.org/packages/55/15/b6cf2c9515c028aff9da6984761a3ab484a472b0dc6435fcd07ced42127d/grpcio-1.70.0-cp311-cp311-win_amd64.whl", hash = "sha256:d24035d49e026353eb042bf7b058fb831db3e06d52bee75c5f2f3ab453e71aca", size = 4304604 }, + { url = "https://files.pythonhosted.org/packages/4c/a4/ddbda79dd176211b518f0f3795af78b38727a31ad32bc149d6a7b910a731/grpcio-1.70.0-cp312-cp312-linux_armv7l.whl", hash = "sha256:ef4c14508299b1406c32bdbb9fb7b47612ab979b04cf2b27686ea31882387cff", size = 5198135 }, + { url = "https://files.pythonhosted.org/packages/30/5c/60eb8a063ea4cb8d7670af8fac3f2033230fc4b75f62669d67c66ac4e4b0/grpcio-1.70.0-cp312-cp312-macosx_10_14_universal2.whl", hash = "sha256:aa47688a65643afd8b166928a1da6247d3f46a2784d301e48ca1cc394d2ffb40", size = 11447529 }, + { url = "https://files.pythonhosted.org/packages/fb/b9/1bf8ab66729f13b44e8f42c9de56417d3ee6ab2929591cfee78dce749b57/grpcio-1.70.0-cp312-cp312-manylinux_2_17_aarch64.whl", hash = "sha256:880bfb43b1bb8905701b926274eafce5c70a105bc6b99e25f62e98ad59cb278e", size = 5664484 }, + { url = "https://files.pythonhosted.org/packages/d1/06/2f377d6906289bee066d96e9bdb91e5e96d605d173df9bb9856095cccb57/grpcio-1.70.0-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9e654c4b17d07eab259d392e12b149c3a134ec52b11ecdc6a515b39aceeec898", size = 6303739 }, + { url = "https://files.pythonhosted.org/packages/ae/50/64c94cfc4db8d9ed07da71427a936b5a2bd2b27c66269b42fbda82c7c7a4/grpcio-1.70.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2394e3381071045a706ee2eeb6e08962dd87e8999b90ac15c55f56fa5a8c9597", size = 5910417 }, + { url = 
"https://files.pythonhosted.org/packages/53/89/8795dfc3db4389c15554eb1765e14cba8b4c88cc80ff828d02f5572965af/grpcio-1.70.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:b3c76701428d2df01964bc6479422f20e62fcbc0a37d82ebd58050b86926ef8c", size = 6626797 }, + { url = "https://files.pythonhosted.org/packages/9c/b2/6a97ac91042a2c59d18244c479ee3894e7fb6f8c3a90619bb5a7757fa30c/grpcio-1.70.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:ac073fe1c4cd856ebcf49e9ed6240f4f84d7a4e6ee95baa5d66ea05d3dd0df7f", size = 6190055 }, + { url = "https://files.pythonhosted.org/packages/86/2b/28db55c8c4d156053a8c6f4683e559cd0a6636f55a860f87afba1ac49a51/grpcio-1.70.0-cp312-cp312-win32.whl", hash = "sha256:cd24d2d9d380fbbee7a5ac86afe9787813f285e684b0271599f95a51bce33528", size = 3600214 }, + { url = "https://files.pythonhosted.org/packages/17/c3/a7a225645a965029ed432e5b5e9ed959a574e62100afab553eef58be0e37/grpcio-1.70.0-cp312-cp312-win_amd64.whl", hash = "sha256:0495c86a55a04a874c7627fd33e5beaee771917d92c0e6d9d797628ac40e7655", size = 4292538 }, + { url = "https://files.pythonhosted.org/packages/68/38/66d0f32f88feaf7d83f8559cd87d899c970f91b1b8a8819b58226de0a496/grpcio-1.70.0-cp313-cp313-linux_armv7l.whl", hash = "sha256:aa573896aeb7d7ce10b1fa425ba263e8dddd83d71530d1322fd3a16f31257b4a", size = 5199218 }, + { url = "https://files.pythonhosted.org/packages/c1/96/947df763a0b18efb5cc6c2ae348e56d97ca520dc5300c01617b234410173/grpcio-1.70.0-cp313-cp313-macosx_10_14_universal2.whl", hash = "sha256:d405b005018fd516c9ac529f4b4122342f60ec1cee181788249372524e6db429", size = 11445983 }, + { url = "https://files.pythonhosted.org/packages/fd/5b/f3d4b063e51b2454bedb828e41f3485800889a3609c49e60f2296cc8b8e5/grpcio-1.70.0-cp313-cp313-manylinux_2_17_aarch64.whl", hash = "sha256:f32090238b720eb585248654db8e3afc87b48d26ac423c8dde8334a232ff53c9", size = 5663954 }, + { url = "https://files.pythonhosted.org/packages/bd/0b/dab54365fcedf63e9f358c1431885478e77d6f190d65668936b12dd38057/grpcio-1.70.0-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dfa089a734f24ee5f6880c83d043e4f46bf812fcea5181dcb3a572db1e79e01c", size = 6304323 }, + { url = "https://files.pythonhosted.org/packages/76/a8/8f965a7171ddd336ce32946e22954aa1bbc6f23f095e15dadaa70604ba20/grpcio-1.70.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f19375f0300b96c0117aca118d400e76fede6db6e91f3c34b7b035822e06c35f", size = 5910939 }, + { url = "https://files.pythonhosted.org/packages/1b/05/0bbf68be8b17d1ed6f178435a3c0c12e665a1e6054470a64ce3cb7896596/grpcio-1.70.0-cp313-cp313-musllinux_1_1_i686.whl", hash = "sha256:7c73c42102e4a5ec76608d9b60227d917cea46dff4d11d372f64cbeb56d259d0", size = 6631405 }, + { url = "https://files.pythonhosted.org/packages/79/6a/5df64b6df405a1ed1482cb6c10044b06ec47fd28e87c2232dbcf435ecb33/grpcio-1.70.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:0a5c78d5198a1f0aa60006cd6eb1c912b4a1520b6a3968e677dbcba215fabb40", size = 6190982 }, + { url = "https://files.pythonhosted.org/packages/42/aa/aeaac87737e6d25d1048c53b8ec408c056d3ed0c922e7c5efad65384250c/grpcio-1.70.0-cp313-cp313-win32.whl", hash = "sha256:fe9dbd916df3b60e865258a8c72ac98f3ac9e2a9542dcb72b7a34d236242a5ce", size = 3598359 }, + { url = "https://files.pythonhosted.org/packages/1f/79/8edd2442d2de1431b4a3de84ef91c37002f12de0f9b577fb07b452989dbc/grpcio-1.70.0-cp313-cp313-win_amd64.whl", hash = "sha256:4119fed8abb7ff6c32e3d2255301e59c316c22d31ab812b3fbcbaf3d0d87cc68", size = 4293938 }, + { url = 
"https://files.pythonhosted.org/packages/9d/0e/64061c9746a2dd6e07cb0a0f3829f0a431344add77ec36397cc452541ff6/grpcio-1.70.0-cp39-cp39-linux_armv7l.whl", hash = "sha256:4f1937f47c77392ccd555728f564a49128b6a197a05a5cd527b796d36f3387d0", size = 5231123 }, + { url = "https://files.pythonhosted.org/packages/72/9f/c93501d5f361aecee0146ab19300d5acb1c2747b00217c641f06fffbcd62/grpcio-1.70.0-cp39-cp39-macosx_10_14_universal2.whl", hash = "sha256:0cd430b9215a15c10b0e7d78f51e8a39d6cf2ea819fd635a7214fae600b1da27", size = 11467217 }, + { url = "https://files.pythonhosted.org/packages/0a/1a/980d115b701023450a304881bf3f6309f6fb15787f9b78d2728074f3bf86/grpcio-1.70.0-cp39-cp39-manylinux_2_17_aarch64.whl", hash = "sha256:e27585831aa6b57b9250abaf147003e126cd3a6c6ca0c531a01996f31709bed1", size = 5710913 }, + { url = "https://files.pythonhosted.org/packages/a0/84/af420067029808f9790e98143b3dd0f943bebba434a4706755051a520c91/grpcio-1.70.0-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c1af8e15b0f0fe0eac75195992a63df17579553b0c4af9f8362cc7cc99ccddf4", size = 6330947 }, + { url = "https://files.pythonhosted.org/packages/24/1c/e1f06a7d29a1fa5053dcaf5352a50f8e1f04855fd194a65422a9d685d375/grpcio-1.70.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cbce24409beaee911c574a3d75d12ffb8c3e3dd1b813321b1d7a96bbcac46bf4", size = 5943913 }, + { url = "https://files.pythonhosted.org/packages/41/8f/de13838e4467519a50cd0693e98b0b2bcc81d656013c38a1dd7dcb801526/grpcio-1.70.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:ff4a8112a79464919bb21c18e956c54add43ec9a4850e3949da54f61c241a4a6", size = 6643236 }, + { url = "https://files.pythonhosted.org/packages/ac/73/d68c745d34e43a80440da4f3d79fa02c56cb118c2a26ba949f3cfd8316d7/grpcio-1.70.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5413549fdf0b14046c545e19cfc4eb1e37e9e1ebba0ca390a8d4e9963cab44d2", size = 6199038 }, + { url = "https://files.pythonhosted.org/packages/7e/dd/991f100b8c31636b4bb2a941dbbf54dbcc55d69c722cfa038c3d017eaa0c/grpcio-1.70.0-cp39-cp39-win32.whl", hash = "sha256:b745d2c41b27650095e81dea7091668c040457483c9bdb5d0d9de8f8eb25e59f", size = 3617512 }, + { url = "https://files.pythonhosted.org/packages/4d/80/1aa2ba791207a13e314067209b48e1a0893ed8d1f43ef012e194aaa6c2de/grpcio-1.70.0-cp39-cp39-win_amd64.whl", hash = "sha256:a31d7e3b529c94e930a117b2175b2efd179d96eb3c7a21ccb0289a8ab05b645c", size = 4303506 }, ] [[package]] name = "grpcio-reflection" -version = "1.69.0" +version = "1.70.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "grpcio" }, { name = "protobuf" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/bd/88/064538adae2e526f4626ad57f53a99fca5ae5013e627214093ef1d47cc35/grpcio_reflection-1.69.0.tar.gz", hash = "sha256:3474aaedfb10d8da5503135a2fb9fbb4e1f72044ad7ee2d0bf22090934e80908", size = 18824 } +sdist = { url = "https://files.pythonhosted.org/packages/42/98/f76681980ec7e9813e010d2778f4109816650760f6839661ae378286b3b5/grpcio_reflection-1.70.0.tar.gz", hash = "sha256:af46ce13e57fd7602deaef5b1d1903f6644f8982cc00f62fa9fa5b928acf6a49", size = 18829 } wheels = [ - { url = "https://files.pythonhosted.org/packages/4f/2a/15455e320839c7ded83cf347b7256c38b8a54352b46b402a3d20b05abd65/grpcio_reflection-1.69.0-py3-none-any.whl", hash = "sha256:6dc356cc24a324295550991b19cef1804c8d1fd9d85ea313809af4b03459d5b2", size = 22690 }, + { url = 
"https://files.pythonhosted.org/packages/ca/78/81684827a247071eec55da35e5fcf047d595c210bfae33a88c99bd1b8056/grpcio_reflection-1.70.0-py3-none-any.whl", hash = "sha256:5b8730dbb316ebc0b592863f391a6effa8261be1b74022facfc2aa28639ddbfd", size = 22691 }, ] [[package]] name = "grpcio-status" -version = "1.69.0" +version = "1.70.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "googleapis-common-protos" }, { name = "grpcio" }, { name = "protobuf" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/02/35/52dc0d8300f879dbf9cdc95764cee9f56d5a212998cfa1a8871b262df2a4/grpcio_status-1.69.0.tar.gz", hash = "sha256:595ef84e5178d6281caa732ccf68ff83259241608d26b0e9c40a5e66eee2a2d2", size = 13662 } +sdist = { url = "https://files.pythonhosted.org/packages/4c/d1/2397797c810020eac424e1aac10fbdc5edb6b9b4ad6617e0ed53ca907653/grpcio_status-1.70.0.tar.gz", hash = "sha256:0e7b42816512433b18b9d764285ff029bde059e9d41f8fe10a60631bd8348101", size = 13681 } wheels = [ - { url = "https://files.pythonhosted.org/packages/f6/e2/346a766a4232f74f45f8bc70e636fc3a6677e6bc3893382187829085f12e/grpcio_status-1.69.0-py3-none-any.whl", hash = "sha256:d6b2a3c9562c03a817c628d7ba9a925e209c228762d6d7677ae5c9401a542853", size = 14428 }, + { url = "https://files.pythonhosted.org/packages/e6/34/49e558040e069feebac70cdd1b605f38738c0277ac5d38e2ce3d03e1b1ec/grpcio_status-1.70.0-py3-none-any.whl", hash = "sha256:fc5a2ae2b9b1c1969cc49f3262676e6854aa2398ec69cb5bd6c47cd501904a85", size = 14429 }, ] [[package]] name = "grpcio-tools" -version = "1.69.0" +version = "1.70.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "grpcio" }, { name = "protobuf" }, { name = "setuptools" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/64/ec/1c25136ca1697eaa09a02effe3e74959fd9fb6aba9960d7340dd6341c5ce/grpcio_tools-1.69.0.tar.gz", hash = "sha256:3e1a98f4d9decb84979e1ddd3deb09c0a33a84b6e3c0776d5bde4097e3ab66dd", size = 5323319 } +sdist = { url = "https://files.pythonhosted.org/packages/c1/fe/3adf1035c1f9e9243516530beae67e197f2acc17562ec75f03a0ba77fc55/grpcio_tools-1.70.0.tar.gz", hash = "sha256:e578fee7c1c213c8e471750d92631d00f178a15479fb2cb3b939a07fc125ccd3", size = 5323149 } wheels = [ - { url = "https://files.pythonhosted.org/packages/00/90/7df7326552fec627adcf3880cf13e9a5b23c090bbcedba367f64fa2bb54b/grpcio_tools-1.69.0-cp310-cp310-linux_armv7l.whl", hash = "sha256:8c210630faa581c3bd08953dac4ad21a7f49862f3b92d69686e9b436d2f1265d", size = 2388795 }, - { url = "https://files.pythonhosted.org/packages/e2/03/6ccaa58b3ca1734d0868a389148e22ac15248a9be4c223805339f7904e31/grpcio_tools-1.69.0-cp310-cp310-macosx_12_0_universal2.whl", hash = "sha256:09b66ea279fcdaebae4ec34b1baf7577af3b14322738aa980c1c33cfea71f7d7", size = 5703156 }, - { url = "https://files.pythonhosted.org/packages/c9/f6/162b456684d2444b43e45ace4e889087301e5890bbfd16ee6b2aedf36219/grpcio_tools-1.69.0-cp310-cp310-manylinux_2_17_aarch64.whl", hash = "sha256:be94a4bfa56d356aae242cc54072c9ccc2704b659eaae2fd599a94afebf791ce", size = 2350725 }, - { url = "https://files.pythonhosted.org/packages/db/3a/2e83fea8c90b9902d68964491d014d688177a6ad0303dbbe6c2c16f25da6/grpcio_tools-1.69.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:28778debad73a8c8e0a0e07e6a2f76eecce43adbc205d17dd244d2d58bb0f0aa", size = 2727230 }, - { url = 
"https://files.pythonhosted.org/packages/63/06/be27b8f1811ff4cc556bdec64a9004755a929df035dc606466a75c9ac0fa/grpcio_tools-1.69.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:449308d93e4c97ae3a4503510c6d64978748ff5e21429c85da14fdc783c0f498", size = 2472752 }, - { url = "https://files.pythonhosted.org/packages/a3/43/f94578afa1535287b7b0ba39eeb23b2b8304a2a5b8e325ed7079d2ad9cba/grpcio_tools-1.69.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:b9343651e73bc6e0df6bb518c2638bf9cc2194b50d060cdbcf1b2121cd4e4ae3", size = 3344074 }, - { url = "https://files.pythonhosted.org/packages/13/d1/5f9030cbb6195f3bb182e740f349cdaa71d9c38c1b2572f401270709d7d2/grpcio_tools-1.69.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:2f08b063612553e726e328aef3a27adfaea8d92712b229012afc54d59da88a02", size = 2953778 }, - { url = "https://files.pythonhosted.org/packages/0c/cb/4812660e150d197de81296fa04ed6ad012d1aeac23bbe21be5f51493f455/grpcio_tools-1.69.0-cp310-cp310-win32.whl", hash = "sha256:599ffd39525e7bbb6412a63e56a2e6c1af8f3493fe4305260efd4a11d064cce0", size = 957556 }, - { url = "https://files.pythonhosted.org/packages/4e/c7/c7d5f5418909764e63208b9f76812db3287ece4f79500e815178194e1db9/grpcio_tools-1.69.0-cp310-cp310-win_amd64.whl", hash = "sha256:02f92e3c2bae67ece818787f8d3d89df0fa1e5e6bbb7c1493824fd5dfad886dd", size = 1114783 }, - { url = "https://files.pythonhosted.org/packages/7e/f4/575f536bada8d8f5f8943c317ae28faafe7b4aaf95ef84a599f4f3e67db3/grpcio_tools-1.69.0-cp311-cp311-linux_armv7l.whl", hash = "sha256:c18df5d1c8e163a29863583ec51237d08d7059ef8d4f7661ee6d6363d3e38fe3", size = 2388772 }, - { url = "https://files.pythonhosted.org/packages/87/94/1157342b046f51c4d076f21ef76da6d89323929b7e870389204fd49e3f09/grpcio_tools-1.69.0-cp311-cp311-macosx_10_14_universal2.whl", hash = "sha256:37876ae49235ef2e61e5059faf45dc5e7142ca54ae61aec378bb9483e0cd7e95", size = 5726348 }, - { url = "https://files.pythonhosted.org/packages/36/5c/cfd9160ef1867e025844b2695d436bb953c2d5f9c20eaaa7da6fd739ab0c/grpcio_tools-1.69.0-cp311-cp311-manylinux_2_17_aarch64.whl", hash = "sha256:33120920e29959eaa37a1268c6a22af243d086b1a5e5222b4203e29560ece9ce", size = 2350857 }, - { url = "https://files.pythonhosted.org/packages/61/70/10614b8bc39f06548a0586fdd5d97843da4789965e758fba87726bde8c2f/grpcio_tools-1.69.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:788bb3ecd1b44664d829d319b3c1ebc15c7d7b5e7d1f22706ab57d6acd2c6301", size = 2727157 }, - { url = "https://files.pythonhosted.org/packages/37/fb/33faedb3e991dceb7a2bf802d3875bff7d6a6b6a80d314197adc73739cae/grpcio_tools-1.69.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f453b11a112e3774c8957ec2570669f3da1f7fbc8ee242482c38981496e88da2", size = 2472882 }, - { url = "https://files.pythonhosted.org/packages/41/f7/abddc158919a982f6b8e61d4a5c72569b2963304c162c3ca53c6c14d23ee/grpcio_tools-1.69.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:7e5c5dc2b656755cb58b11a7e87b65258a4a8eaff01b6c30ffcb230dd447c03d", size = 3343987 }, - { url = "https://files.pythonhosted.org/packages/ba/46/e7219456aefe29137728246a67199fcbfdaa99ede93d2045a6406f0e4c0b/grpcio_tools-1.69.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:8eabf0a7a98c14322bc74f9910c96f98feebe311e085624b2d022924d4f652ca", size = 2953659 }, - { url = "https://files.pythonhosted.org/packages/74/be/262c5d2b681930f8c58012500741fe06cb40a770c9d395650efe9042467f/grpcio_tools-1.69.0-cp311-cp311-win32.whl", hash = 
"sha256:ad567bea43d018c2215e1db10316eda94ca19229a834a3221c15d132d24c1b8a", size = 957447 }, - { url = "https://files.pythonhosted.org/packages/8e/55/68153acca126dced35f888e708a65169df8fa8a4d5f0e78166a395e3fa9c/grpcio_tools-1.69.0-cp311-cp311-win_amd64.whl", hash = "sha256:3d64e801586dbea3530f245d48b9ed031738cc3eb099d5ce2fdb1b3dc2e1fb20", size = 1114753 }, - { url = "https://files.pythonhosted.org/packages/5b/f6/9cd1aa47556664564b873cd187d8dec978ff2f4a539d8c6d5d2f418d3d36/grpcio_tools-1.69.0-cp312-cp312-linux_armv7l.whl", hash = "sha256:8ef8efe8beac4cc1e30d41893e4096ca2601da61001897bd17441645de2d4d3c", size = 2388440 }, - { url = "https://files.pythonhosted.org/packages/62/37/0bcd8431e44b38f648f70368dd60542d10ffaffa109563349ee635013e10/grpcio_tools-1.69.0-cp312-cp312-macosx_10_14_universal2.whl", hash = "sha256:a00e87a0c5a294028115a098819899b08dd18449df5b2aac4a2b87ba865e8681", size = 5726135 }, - { url = "https://files.pythonhosted.org/packages/8b/f5/2ec994bbf522a231ce54c41a2d3621e77bece1240aafe31f12804052af0f/grpcio_tools-1.69.0-cp312-cp312-manylinux_2_17_aarch64.whl", hash = "sha256:7722700346d5b223159532e046e51f2ff743ed4342e5fe3e0457120a4199015e", size = 2350247 }, - { url = "https://files.pythonhosted.org/packages/a9/29/9ebf54315a499a766e4c3bd53124267491162e9049c2d9ed45f43222b98f/grpcio_tools-1.69.0-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a934116fdf202cb675246056ee54645c743e2240632f86a37e52f91a405c7143", size = 2727994 }, - { url = "https://files.pythonhosted.org/packages/f0/2a/1a031018660b5d95c1a4c587a0babd0d28f0aa0c9a40dbca330567049a3f/grpcio_tools-1.69.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7e6a6d44359ca836acfbc58103daf94b3bb8ac919d659bb348dcd7fbecedc293", size = 2472625 }, - { url = "https://files.pythonhosted.org/packages/74/bf/76d24078e1c76976a10760c3193b6c62685a7aed64b1cb0d8242afa16f1d/grpcio_tools-1.69.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:e27662c0597fd1ab5399a583d358b5203edcb6fc2b29d6245099dfacd51a6ddc", size = 3344290 }, - { url = "https://files.pythonhosted.org/packages/f1/f7/4ab645e4955ca1e5240b0bbd557662cec4838f0e21e072ff40f4e191b48d/grpcio_tools-1.69.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:7bbb2b2fb81d95bcdd1d8331defb5f5dc256dbe423bb98b682cf129cdd432366", size = 2953592 }, - { url = "https://files.pythonhosted.org/packages/8f/32/57e67b126f209f289fc32009309d155b8dbe9ac760c32733746e4dda7b51/grpcio_tools-1.69.0-cp312-cp312-win32.whl", hash = "sha256:e11accd10cf4af5031ac86c45f1a13fb08f55e005cea070917c12e78fe6d2aa2", size = 957042 }, - { url = "https://files.pythonhosted.org/packages/19/64/7bfcb4e50a0ce87690c24696cd666f528e672119966abead09ae65a2e1da/grpcio_tools-1.69.0-cp312-cp312-win_amd64.whl", hash = "sha256:6df4c6ac109af338a8ccde29d184e0b0bdab13d78490cb360ff9b192a1aec7e2", size = 1114248 }, - { url = "https://files.pythonhosted.org/packages/0c/ef/a9867f612e3aa5e69d299e47a72ea8dafa476b1f099462c9a1223cd6a83c/grpcio_tools-1.69.0-cp313-cp313-linux_armv7l.whl", hash = "sha256:8c320c4faa1431f2e1252ef2325a970ac23b2fd04ffef6c12f96dd4552c3445c", size = 2388281 }, - { url = "https://files.pythonhosted.org/packages/4b/53/b2752d8ec338778e48d76845d605a0f8bca9e43a5f09428e5ed1a76e4e1d/grpcio_tools-1.69.0-cp313-cp313-macosx_10_14_universal2.whl", hash = "sha256:5f1224596ad74dd14444b20c37122b361c5d203b67e14e018b995f3c5d76eede", size = 5725856 }, - { url = 
"https://files.pythonhosted.org/packages/83/dd/195d3639634c0c1d1e48b6693c074d66a64f16c748df2f40bcee74aa04e2/grpcio_tools-1.69.0-cp313-cp313-manylinux_2_17_aarch64.whl", hash = "sha256:965a0cf656a113bc32d15ac92ca51ed702a75d5370ae0afbdd36f818533a708a", size = 2350180 }, - { url = "https://files.pythonhosted.org/packages/8c/18/c412884fa0e888d8a271f3e31d23e3765cde0efe2404653ab67971c411c2/grpcio_tools-1.69.0-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:978835768c11a7f28778b3b7c40f839d8a57f765c315e80c4246c23900d56149", size = 2726724 }, - { url = "https://files.pythonhosted.org/packages/be/c7/dfb59b7e25d760bfdd93f0aef7dd0e2a37f8437ac3017b8b526c68764e2f/grpcio_tools-1.69.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:094c7cec9bd271a32dfb7c620d4a558c63fcb0122fd1651b9ed73d6afd4ae6fe", size = 2472127 }, - { url = "https://files.pythonhosted.org/packages/f2/b6/af4edf0a181fd7b148a83d491f5677d7d1c9f86f03282f8f0209d9dfb793/grpcio_tools-1.69.0-cp313-cp313-musllinux_1_1_i686.whl", hash = "sha256:b51bf4981b3d7e47c2569efadff08284787124eb3dea0f63f491d39703231d3c", size = 3344015 }, - { url = "https://files.pythonhosted.org/packages/0a/9f/4c2b5ae642f7d3df73c16df6c7d53e9443cb0e49e1dcf2c8d1a49058e0b5/grpcio_tools-1.69.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:ea7aaf0dc1a828e2133357a9e9553fd1bb4e766890d52a506cc132e40632acdc", size = 2952942 }, - { url = "https://files.pythonhosted.org/packages/97/8e/6b707871db5927a17ad7475c070916bff4f32463a51552b424779236ab65/grpcio_tools-1.69.0-cp313-cp313-win32.whl", hash = "sha256:4320f11b79d3a148cc23bad1b81719ce1197808dc2406caa8a8ba0a5cfb0260d", size = 956242 }, - { url = "https://files.pythonhosted.org/packages/27/e2/b419a02b50240143605f77cd50cb07f724caf0fd35a01540a4f044ae9f21/grpcio_tools-1.69.0-cp313-cp313-win_amd64.whl", hash = "sha256:e9bae733654e0eb8ca83aa1d0d6b6c2f4a3525ce70d5ffc07df68d28f6520137", size = 1113616 }, - { url = "https://files.pythonhosted.org/packages/43/22/b1154a8c019698aee265bfb7a3deba46925c4a7b0ce68a75e80dbb649514/grpcio_tools-1.69.0-cp39-cp39-linux_armv7l.whl", hash = "sha256:01121b6570932bfb7d8b2ce2c0055dba902a415477079e249d85fe4494f72db2", size = 2389129 }, - { url = "https://files.pythonhosted.org/packages/9b/11/35856fc16817feeba962e502d17d10543a5e1229b4b57a1761b8918cd2e1/grpcio_tools-1.69.0-cp39-cp39-macosx_10_14_universal2.whl", hash = "sha256:9861e282aa7b3656c67e84d0c25ee0e9210b955e0ec2c64699b8f80483f90853", size = 5726369 }, - { url = "https://files.pythonhosted.org/packages/a7/94/cfba957813713a9b42a2ea561e7faf24ff995c00acc9866576a62591abf4/grpcio_tools-1.69.0-cp39-cp39-manylinux_2_17_aarch64.whl", hash = "sha256:00adf628259e8c314a02ca1580d6a8b14eeef266f5dd5e15bf92c1efbbcf63c0", size = 2350830 }, - { url = "https://files.pythonhosted.org/packages/06/c0/c66345bee75406318536d2a3cbbb254290de1228c42ba4a34cb68d31c066/grpcio_tools-1.69.0-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:371d03ac31b76ba77d44bdba6a8560f344c6d1ed558babab64760da085e392b7", size = 2727551 }, - { url = "https://files.pythonhosted.org/packages/cd/c0/0dec90e2af8f59f20d3042ed219ab55f37f49859de53c425ad339793ff09/grpcio_tools-1.69.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b6730414c01fe9027ba12538fd6e192e1bea94d5b819a1e03d15e89aab1b4573", size = 2472947 }, - { url = "https://files.pythonhosted.org/packages/e0/e5/fba8010868909ab6301068b4c8615a372420d110e68cac7aae8085c52541/grpcio_tools-1.69.0-cp39-cp39-musllinux_1_1_i686.whl", hash = 
"sha256:5562a1b1b67deffd04fbb1bcf8f1634580538ce35895b77cdfaec1fb115efd95", size = 3344136 }, - { url = "https://files.pythonhosted.org/packages/9f/3b/c03c5919bc207c5e10723f1bdd0268475e756fa39d6f6c184aa3896b9ab1/grpcio_tools-1.69.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:9f8996efddc867134f22bbf8a368b1b2a018d0a9b0ac9d3185cfd81d1abd8066", size = 2953685 }, - { url = "https://files.pythonhosted.org/packages/d4/d7/9e00a22a0a64b9207a3ddf41aa50dd0d20df209056264098885a6a0d5c18/grpcio_tools-1.69.0-cp39-cp39-win32.whl", hash = "sha256:8f5959d8a453d613e7137831f6885b43b5c378ec317943b4ec599046baa97bfc", size = 957547 }, - { url = "https://files.pythonhosted.org/packages/64/51/f6b198152399d17247d962340947728fb1b06da6bc0c0a542446b2ffee49/grpcio_tools-1.69.0-cp39-cp39-win_amd64.whl", hash = "sha256:5d47abf7e0662dd5dbb9cc252c3616e5fbc5f71d34e3f6332cd24bcdf2940abd", size = 1114931 }, + { url = "https://files.pythonhosted.org/packages/69/4f/97343e9af496fde5fd141874cb075ad8f338a99b1bfc1aef1f1041887e31/grpcio_tools-1.70.0-cp310-cp310-linux_armv7l.whl", hash = "sha256:4d456521290e25b1091975af71604facc5c7db162abdca67e12a0207b8bbacbe", size = 2380731 }, + { url = "https://files.pythonhosted.org/packages/54/48/a43b5546eeacf3171d6789aae4d0ab1f2d4203e44eb07ffc60373ac90c26/grpcio_tools-1.70.0-cp310-cp310-macosx_12_0_universal2.whl", hash = "sha256:d50080bca84f53f3a05452e06e6251cbb4887f5a1d1321d1989e26d6e0dc398d", size = 5935297 }, + { url = "https://files.pythonhosted.org/packages/a8/63/6f1d3c4fe4342b82cf14fd4c04d762d3ece41e5c60ca53a7532f867c7fa8/grpcio_tools-1.70.0-cp310-cp310-manylinux_2_17_aarch64.whl", hash = "sha256:02e3bf55fb569fe21b54a32925979156e320f9249bb247094c4cbaa60c23a80d", size = 2336438 }, + { url = "https://files.pythonhosted.org/packages/d9/01/e1dff616f1d088b6024767c914d13fed5800e5cc02c6904396fd01cb41ad/grpcio_tools-1.70.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:88a3ec6fa2381f616d567f996503e12ca353777941b61030fd9733fd5772860e", size = 2729489 }, + { url = "https://files.pythonhosted.org/packages/3d/60/a7c493d5cb4962e88e04c4045282ab1c60cbe480fd8105e0472950d43c97/grpcio_tools-1.70.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6034a0579fab2aed8685fa1a558de084668b1e9b01a82a4ca7458b9bedf4654c", size = 2463411 }, + { url = "https://files.pythonhosted.org/packages/b7/1a/90c63bd2cc681936e3d8ff27f3b70a6ed7bf9f2fd40b51c18c81b0e167a3/grpcio_tools-1.70.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:701bbb1ff406a21a771f5b1df6be516c0a59236774b6836eaad7696b1d128ea8", size = 3341102 }, + { url = "https://files.pythonhosted.org/packages/9c/01/e70919607bbb77c087c7fd6a8dc8c21a3f575d0cf71ae19e7ca709a10abc/grpcio_tools-1.70.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:6eeb86864e1432fc1ab61e03395a2a4c04e9dd9c89db07e6fe68c7c2ac8ec24f", size = 2944181 }, + { url = "https://files.pythonhosted.org/packages/17/27/34d3903480e0cffb64a6002a0766784047cac0ba65bd9f2824a0c6c86111/grpcio_tools-1.70.0-cp310-cp310-win32.whl", hash = "sha256:d53c8c45e843b5836781ad6b82a607c72c2f9a3f556e23d703a0e099222421fa", size = 947441 }, + { url = "https://files.pythonhosted.org/packages/48/8a/b3b2fd2c8710837185b98abf06e3e775d101a09d2c2192f8f77b91c392b5/grpcio_tools-1.70.0-cp310-cp310-win_amd64.whl", hash = "sha256:22024caee36ab65c2489594d718921dcbb5bd18d61c5417a9ede94fd8dc8a589", size = 1119450 }, + { url = 
"https://files.pythonhosted.org/packages/ab/2b/446a63000acab303bbc1b84fa7dbfa4857d96e95ab53e85083ba16c60d4a/grpcio_tools-1.70.0-cp311-cp311-linux_armv7l.whl", hash = "sha256:5f5aba12d98d25c7ab2dd983939e2c21556a7d15f903b286f24d88d2c6e30c0a", size = 2380860 }, + { url = "https://files.pythonhosted.org/packages/0c/d2/48e82de83bf34f9a5207ea808a1c6e074bf657720664eb6c9f0bab38dbf2/grpcio_tools-1.70.0-cp311-cp311-macosx_10_14_universal2.whl", hash = "sha256:d47a6c6cfc526b290b7b53a37dd7e6932983f7a168b56aab760b4b597c47f30f", size = 5957716 }, + { url = "https://files.pythonhosted.org/packages/fa/f7/a735faa8fc96778aa54e321ac6820bab03ee4eea305cc1209b095dfdffee/grpcio_tools-1.70.0-cp311-cp311-manylinux_2_17_aarch64.whl", hash = "sha256:b5a9beadd1e24772ffa2c70f07d72f73330d356b78b246e424f4f2ed6c6713f3", size = 2336501 }, + { url = "https://files.pythonhosted.org/packages/47/ed/4bed599c061b65149b32569347a857098819d75c2419c4202f9de1e06250/grpcio_tools-1.70.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bb8135eef160a62505f074bf7a3d62f3b13911c3c14037c5392bf877114213b5", size = 2729638 }, + { url = "https://files.pythonhosted.org/packages/4f/43/d8850889a2041cf94e882712df0e323cd6bbf24f8f4c50e2f0d80c68da7d/grpcio_tools-1.70.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f7ac9b3e13ace8467a586c53580ee22f9732c355583f3c344ef8c6c0666219cc", size = 2463251 }, + { url = "https://files.pythonhosted.org/packages/a8/2e/2407641c70ca0afe03a04c3c29f0b51e1582759e3d5c995217b4ed0ce2bd/grpcio_tools-1.70.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:63f367363a4a1489a0046b19f9d561216ea0d206c40a6f1bf07a58ccfb7be480", size = 3340968 }, + { url = "https://files.pythonhosted.org/packages/de/bb/591799e6b0445028d74552964e47d7b0b23ff5ce9c377688b318de331f12/grpcio_tools-1.70.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:54ceffef59a059d2c7304554a8bbb20eedb05a3f937159ab1c332c1b28e12c9f", size = 2944466 }, + { url = "https://files.pythonhosted.org/packages/3f/90/b73293fff616574cbdf70437efb3b2ee6af3705c6b2cc19dd02dfb01708f/grpcio_tools-1.70.0-cp311-cp311-win32.whl", hash = "sha256:7a90a66a46821140a2a2b0be787dfabe42e22e9a5ba9cc70726b3e5c71a3b785", size = 947335 }, + { url = "https://files.pythonhosted.org/packages/88/cc/12ad066dc722285ee3f7d398d4272dc43857de6b7e6fa509a385ca4a857f/grpcio_tools-1.70.0-cp311-cp311-win_amd64.whl", hash = "sha256:4ebf09733545a69c166b02caa14c34451e38855544820dab7fdde5c28e2dbffe", size = 1119053 }, + { url = "https://files.pythonhosted.org/packages/58/8d/21f3f0c6e8ddc7ffd82873a6ff767a568a3384043adc034c49fd72020884/grpcio_tools-1.70.0-cp312-cp312-linux_armv7l.whl", hash = "sha256:ec5d6932c3173d7618267b3b3fd77b9243949c5ec04302b7338386d4f8544e0b", size = 2380552 }, + { url = "https://files.pythonhosted.org/packages/e1/10/def56ecb8e139a96aae9d408d891f32f24a066c57179ce5f78e7edf70a35/grpcio_tools-1.70.0-cp312-cp312-macosx_10_14_universal2.whl", hash = "sha256:f22852da12f53b02a3bdb29d0c32fcabab9c7c8f901389acffec8461083f110d", size = 5956826 }, + { url = "https://files.pythonhosted.org/packages/63/5e/f10375b90b7dc14d1b5095797d4f79b34e584fbc9bda06e093ad316a96dd/grpcio_tools-1.70.0-cp312-cp312-manylinux_2_17_aarch64.whl", hash = "sha256:7d45067e6efd20881e98a0e1d7edd7f207b1625ad7113321becbfe0a6ebee46c", size = 2335835 }, + { url = "https://files.pythonhosted.org/packages/ec/33/d770fbdf824edfc0f9297be046d4d48fbc81b2dbf802827ade65110f0a47/grpcio_tools-1.70.0-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:3020c97f03b30eee3c26aa2a55fbe003f1729c6f879a378507c2c78524db7c12", size = 2729501 }, + { url = "https://files.pythonhosted.org/packages/e2/fb/8442f386fa71056abe7ebbc153eaac8cbe32875ed659a641ca526ab9f341/grpcio_tools-1.70.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d7fd472fce3b33bdf7fbc24d40da7ab10d7a088bcaf59c37433c2c57330fbcb6", size = 2462824 }, + { url = "https://files.pythonhosted.org/packages/46/4e/1703d2586663078613baed553de052e029b3d7fe311e90d3f023c85e612a/grpcio_tools-1.70.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:3875543d74ce1a698a11f498f83795216ce929cb29afa5fac15672c7ba1d6dd2", size = 3340759 }, + { url = "https://files.pythonhosted.org/packages/59/d9/f61e427b0e1d7305396dacea65d1e0612eb2bc66b02328ef6bde117624fb/grpcio_tools-1.70.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:a130c24d617a3a57369da784080dfa8848444d41b7ae1250abc06e72e706a8d9", size = 2944463 }, + { url = "https://files.pythonhosted.org/packages/8d/8f/8f6f511ad90e12d7c2f396ad9efe46019c0a77a5f5f69e46998c834405e4/grpcio_tools-1.70.0-cp312-cp312-win32.whl", hash = "sha256:8eae17c920d14e2e451dbb18f5d8148f884e10228061941b33faa8fceee86e73", size = 946776 }, + { url = "https://files.pythonhosted.org/packages/83/0f/aff5d01ce9ae94ed02b79e033b0c469e560221340c09120270109de4986a/grpcio_tools-1.70.0-cp312-cp312-win_amd64.whl", hash = "sha256:99caa530242a0a832d8b6a6ab94b190c9b449d3e237f953911b4d56207569436", size = 1118594 }, + { url = "https://files.pythonhosted.org/packages/49/2a/bf442acb748b2a53281e5e7cc3fa36c25ae99436cd2f2cfe684096d4c39f/grpcio_tools-1.70.0-cp313-cp313-linux_armv7l.whl", hash = "sha256:f024688d04e7a9429489ed695b85628075c3c6d655198ba3c6ccbd1d8b7c333b", size = 2380142 }, + { url = "https://files.pythonhosted.org/packages/dc/a2/984dabaf1cdc41e267acdd37232026ede28f55bc6f9e932907bcbbb46773/grpcio_tools-1.70.0-cp313-cp313-macosx_10_14_universal2.whl", hash = "sha256:1fa9a81621d7178498dedcf94eb8f276a7594327faf3dd5fd1935ce2819a2bdb", size = 5955907 }, + { url = "https://files.pythonhosted.org/packages/cd/78/ebefc32418be93828b46eca5952ef1cb0400b33883bc20c22b1fc2a51f61/grpcio_tools-1.70.0-cp313-cp313-manylinux_2_17_aarch64.whl", hash = "sha256:c6da2585c0950cdb650df1ff6d85b3fe31e22f8370b9ee11f8fe641d5b4bf096", size = 2335428 }, + { url = "https://files.pythonhosted.org/packages/a0/f8/5d4b58dc846bf28b8b9abf07f5d091eb078fc4f01184adb3b374cf5119a4/grpcio_tools-1.70.0-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:70234b592af17050ec30cf35894790cef52aeae87639efe6db854a7fa783cc8c", size = 2728481 }, + { url = "https://files.pythonhosted.org/packages/b0/28/46833d415b2c2e3e0f36763c528da48785c94580240684e56410abd08aa0/grpcio_tools-1.70.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9c021b040d0a9f5bb96a725c4d2b95008aad127d6bed124a7bbe854973014f5b", size = 2462401 }, + { url = "https://files.pythonhosted.org/packages/fa/8a/c771a09aea58275106e08e7dd37470c6e8555dfcea9a7b44d1c5adc80370/grpcio_tools-1.70.0-cp313-cp313-musllinux_1_1_i686.whl", hash = "sha256:114a42e566e5b16a47e98f7910a6c0074b37e2d1faacaae13222e463d0d0d43c", size = 3340068 }, + { url = "https://files.pythonhosted.org/packages/3a/be/e3dfa73435c633859c4a045c299105e99a6c6a41cda524148bf9c8d4dc99/grpcio_tools-1.70.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:4cae365d7e3ba297256216a9a256458b286f75c64603f017972b3ad1ee374437", size = 2944317 }, + { url = 
"https://files.pythonhosted.org/packages/b6/bd/e30fb2b0ce2c0c48caf994b1ebedb56fc7103e26062dd31a41ad1e528eb7/grpcio_tools-1.70.0-cp313-cp313-win32.whl", hash = "sha256:ae139a8d3ddd8353f62af3af018e99ebcd2f4a237bd319cb4b6f58dd608aaa54", size = 946136 }, + { url = "https://files.pythonhosted.org/packages/0f/8a/92aba852bbe2ddf3e44c354b4162b3cf350b810523ffb2d0e5937bd3f249/grpcio_tools-1.70.0-cp313-cp313-win_amd64.whl", hash = "sha256:04bf30c0eb2741defe3ab6e0a6102b022d69cfd39d68fab9b954993ceca8d346", size = 1118147 }, + { url = "https://files.pythonhosted.org/packages/b5/a7/4f1823dae0dd3e9f12eaf33f0e505fb3f5c3c2ce4e4351045819a8c1862e/grpcio_tools-1.70.0-cp39-cp39-linux_armv7l.whl", hash = "sha256:52d7e7ef11867fe7de577076b1f2ac6bf106b2325130e3de66f8c364c96ff332", size = 2381002 }, + { url = "https://files.pythonhosted.org/packages/4a/6c/5186c4f8bbd0c29a983ee09b42f36310dc4b8d654b03a622eb93e29c98dc/grpcio_tools-1.70.0-cp39-cp39-macosx_10_14_universal2.whl", hash = "sha256:0f7ed0372afd9f5eb938334e84681396257015ab92e03de009aa3170e64b24d0", size = 5958443 }, + { url = "https://files.pythonhosted.org/packages/e2/32/f075b0619071f49103cedc1aa7db90d8dd76222dd97dcad757ecb9b541ee/grpcio_tools-1.70.0-cp39-cp39-manylinux_2_17_aarch64.whl", hash = "sha256:24a5b0328ffcfe0c4a9024f302545abdb8d6f24921409a5839f2879555b96fea", size = 2336520 }, + { url = "https://files.pythonhosted.org/packages/09/71/d66008aab2e44ff7aa1916cdfa6777fa823c665d0c9f5e163c056a8295d4/grpcio_tools-1.70.0-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9387b30f3b2f46942fb5718624d7421875a6ce458620d6e15817172d78db1e1a", size = 2729454 }, + { url = "https://files.pythonhosted.org/packages/ed/c7/42c807214318575afe9af36cea1fa2245851de21d86276283fe90afe6aa1/grpcio_tools-1.70.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4545264e06e1cd7fb21b9447bb5126330bececb4bc626c98f793fda2fd910bf8", size = 2463326 }, + { url = "https://files.pythonhosted.org/packages/6c/28/42436a16b038f16e3c9c688cdf1c87c7bf9f1b578d835ee96537bf8272cf/grpcio_tools-1.70.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:79b723ce30416e8e1d7ff271f97ade79aaf30309a595d80c377105c07f5b20fd", size = 3341064 }, + { url = "https://files.pythonhosted.org/packages/08/ef/69a670269b8575b77188e43553e812eefc7b17693881ca52ddf839652309/grpcio_tools-1.70.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:1c0917dce12af04529606d437def83962d51c59dcde905746134222e94a2ab1b", size = 2944096 }, + { url = "https://files.pythonhosted.org/packages/dd/ca/e3548728937558e1883c63b2cd90357bc7802b77481cbc77d3b72f3e27d1/grpcio_tools-1.70.0-cp39-cp39-win32.whl", hash = "sha256:5cb0baa52d4d44690fac6b1040197c694776a291a90e2d3c369064b4d5bc6642", size = 947346 }, + { url = "https://files.pythonhosted.org/packages/be/66/7c1a552545a9597fbd33d77c817f1f0cc56736ca64aa0821948f945118d6/grpcio_tools-1.70.0-cp39-cp39-win_amd64.whl", hash = "sha256:840ec536ab933db2ef8d5acaa6b712d0e9e8f397f62907c852ec50a3f69cdb78", size = 1119339 }, ] [[package]] @@ -708,7 +720,7 @@ wheels = [ [[package]] name = "huggingface-hub" -version = "0.27.1" +version = "0.28.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "filelock" }, @@ -719,9 +731,9 @@ dependencies = [ { name = "tqdm" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/e1/d2/d6976de7542792fc077b498d64af64882b6d8bb40679284ec0bff77d5929/huggingface_hub-0.27.1.tar.gz", hash = "sha256:c004463ca870283909d715d20f066ebd6968c2207dae9393fdffb3c1d4d8f98b", size = 379407 
} +sdist = { url = "https://files.pythonhosted.org/packages/e7/ce/a734204aaae6c35a22f9956ebcd8d8708ae5b842e15d6f42bd6f49e634a4/huggingface_hub-0.28.1.tar.gz", hash = "sha256:893471090c98e3b6efbdfdacafe4052b20b84d59866fb6f54c33d9af18c303ae", size = 387074 } wheels = [ - { url = "https://files.pythonhosted.org/packages/6c/3f/50f6b25fafdcfb1c089187a328c95081abf882309afd86f4053951507cd1/huggingface_hub-0.27.1-py3-none-any.whl", hash = "sha256:1c5155ca7d60b60c2e2fc38cbb3ffb7f7c3adf48f824015b219af9061771daec", size = 450658 }, + { url = "https://files.pythonhosted.org/packages/ea/da/6c2bea5327b640920267d3bf2c9fc114cfbd0a5de234d81cda80cc9e33c8/huggingface_hub-0.28.1-py3-none-any.whl", hash = "sha256:aa6b9a3ffdae939b72c464dbb0d7f99f56e649b55c3d52406f49e0a5a620c0a7", size = 464068 }, ] [[package]] @@ -1068,8 +1080,7 @@ name = "networkx" version = "3.4.2" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version >= '3.13'", - "python_full_version == '3.12.*'", + "python_full_version >= '3.12'", "python_full_version == '3.11.*'", "python_full_version == '3.10.*'", ] @@ -1135,70 +1146,69 @@ wheels = [ [[package]] name = "numpy" -version = "2.2.1" +version = "2.2.2" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version >= '3.13'", - "python_full_version == '3.12.*'", + "python_full_version >= '3.12'", "python_full_version == '3.11.*'", "python_full_version == '3.10.*'", ] -sdist = { url = "https://files.pythonhosted.org/packages/f2/a5/fdbf6a7871703df6160b5cf3dd774074b086d278172285c52c2758b76305/numpy-2.2.1.tar.gz", hash = "sha256:45681fd7128c8ad1c379f0ca0776a8b0c6583d2f69889ddac01559dfe4390918", size = 20227662 } +sdist = { url = "https://files.pythonhosted.org/packages/ec/d0/c12ddfd3a02274be06ffc71f3efc6d0e457b0409c4481596881e748cb264/numpy-2.2.2.tar.gz", hash = "sha256:ed6906f61834d687738d25988ae117683705636936cc605be0bb208b23df4d8f", size = 20233295 } wheels = [ - { url = "https://files.pythonhosted.org/packages/c7/c4/5588367dc9f91e1a813beb77de46ea8cab13f778e1b3a0e661ab031aba44/numpy-2.2.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:5edb4e4caf751c1518e6a26a83501fda79bff41cc59dac48d70e6d65d4ec4440", size = 21213214 }, - { url = "https://files.pythonhosted.org/packages/d8/8b/32dd9f08419023a4cf856c5ad0b4eba9b830da85eafdef841a104c4fc05a/numpy-2.2.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:aa3017c40d513ccac9621a2364f939d39e550c542eb2a894b4c8da92b38896ab", size = 14352248 }, - { url = "https://files.pythonhosted.org/packages/84/2d/0e895d02940ba6e12389f0ab5cac5afcf8dc2dc0ade4e8cad33288a721bd/numpy-2.2.1-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:61048b4a49b1c93fe13426e04e04fdf5a03f456616f6e98c7576144677598675", size = 5391007 }, - { url = "https://files.pythonhosted.org/packages/11/b9/7f1e64a0d46d9c2af6d17966f641fb12d5b8ea3003f31b2308f3e3b9a6aa/numpy-2.2.1-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:7671dc19c7019103ca44e8d94917eba8534c76133523ca8406822efdd19c9308", size = 6926174 }, - { url = "https://files.pythonhosted.org/packages/2e/8c/043fa4418bc9364e364ab7aba8ff6ef5f6b9171ade22de8fbcf0e2fa4165/numpy-2.2.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4250888bcb96617e00bfa28ac24850a83c9f3a16db471eca2ee1f1714df0f957", size = 14330914 }, - { url = "https://files.pythonhosted.org/packages/f7/b6/d8110985501ca8912dfc1c3bbef99d66e62d487f72e46b2337494df77364/numpy-2.2.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:a7746f235c47abc72b102d3bce9977714c2444bdfaea7888d241b4c4bb6a78bf", size = 16379607 }, - { url = "https://files.pythonhosted.org/packages/e2/57/bdca9fb8bdaa810c3a4ff2eb3231379b77f618a7c0d24be9f7070db50775/numpy-2.2.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:059e6a747ae84fce488c3ee397cee7e5f905fd1bda5fb18c66bc41807ff119b2", size = 15541760 }, - { url = "https://files.pythonhosted.org/packages/97/55/3b9147b3cbc3b6b1abc2a411dec5337a46c873deca0dd0bf5bef9d0579cc/numpy-2.2.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:f62aa6ee4eb43b024b0e5a01cf65a0bb078ef8c395e8713c6e8a12a697144528", size = 18168476 }, - { url = "https://files.pythonhosted.org/packages/00/e7/7c2cde16c9b87a8e14fdd262ca7849c4681cf48c8a774505f7e6f5e3b643/numpy-2.2.1-cp310-cp310-win32.whl", hash = "sha256:48fd472630715e1c1c89bf1feab55c29098cb403cc184b4859f9c86d4fcb6a95", size = 6570985 }, - { url = "https://files.pythonhosted.org/packages/a1/a8/554b0e99fc4ac11ec481254781a10da180d0559c2ebf2c324232317349ee/numpy-2.2.1-cp310-cp310-win_amd64.whl", hash = "sha256:b541032178a718c165a49638d28272b771053f628382d5e9d1c93df23ff58dbf", size = 12913384 }, - { url = "https://files.pythonhosted.org/packages/59/14/645887347124e101d983e1daf95b48dc3e136bf8525cb4257bf9eab1b768/numpy-2.2.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:40f9e544c1c56ba8f1cf7686a8c9b5bb249e665d40d626a23899ba6d5d9e1484", size = 21217379 }, - { url = "https://files.pythonhosted.org/packages/9f/fd/2279000cf29f58ccfd3778cbf4670dfe3f7ce772df5e198c5abe9e88b7d7/numpy-2.2.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f9b57eaa3b0cd8db52049ed0330747b0364e899e8a606a624813452b8203d5f7", size = 14388520 }, - { url = "https://files.pythonhosted.org/packages/58/b0/034eb5d5ba12d66ab658ff3455a31f20add0b78df8203c6a7451bd1bee21/numpy-2.2.1-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:bc8a37ad5b22c08e2dbd27df2b3ef7e5c0864235805b1e718a235bcb200cf1cb", size = 5389286 }, - { url = "https://files.pythonhosted.org/packages/5d/69/6f3cccde92e82e7835fdb475c2bf439761cbf8a1daa7c07338e1e132dfec/numpy-2.2.1-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:9036d6365d13b6cbe8f27a0eaf73ddcc070cae584e5ff94bb45e3e9d729feab5", size = 6930345 }, - { url = "https://files.pythonhosted.org/packages/d1/72/1cd38e91ab563e67f584293fcc6aca855c9ae46dba42e6b5ff4600022899/numpy-2.2.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:51faf345324db860b515d3f364eaa93d0e0551a88d6218a7d61286554d190d73", size = 14335748 }, - { url = "https://files.pythonhosted.org/packages/f2/d4/f999444e86986f3533e7151c272bd8186c55dda554284def18557e013a2a/numpy-2.2.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:38efc1e56b73cc9b182fe55e56e63b044dd26a72128fd2fbd502f75555d92591", size = 16391057 }, - { url = "https://files.pythonhosted.org/packages/99/7b/85cef6a3ae1b19542b7afd97d0b296526b6ef9e3c43ea0c4d9c4404fb2d0/numpy-2.2.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:31b89fa67a8042e96715c68e071a1200c4e172f93b0fbe01a14c0ff3ff820fc8", size = 15556943 }, - { url = "https://files.pythonhosted.org/packages/69/7e/b83cc884c3508e91af78760f6b17ab46ad649831b1fa35acb3eb26d9e6d2/numpy-2.2.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:4c86e2a209199ead7ee0af65e1d9992d1dce7e1f63c4b9a616500f93820658d0", size = 18180785 }, - { url = "https://files.pythonhosted.org/packages/b2/9f/eb4a9a38867de059dcd4b6e18d47c3867fbd3795d4c9557bb49278f94087/numpy-2.2.1-cp311-cp311-win32.whl", hash = 
"sha256:b34d87e8a3090ea626003f87f9392b3929a7bbf4104a05b6667348b6bd4bf1cd", size = 6568983 }, - { url = "https://files.pythonhosted.org/packages/6d/1e/be3b9f3073da2f8c7fa361fcdc231b548266b0781029fdbaf75eeab997fd/numpy-2.2.1-cp311-cp311-win_amd64.whl", hash = "sha256:360137f8fb1b753c5cde3ac388597ad680eccbbbb3865ab65efea062c4a1fd16", size = 12917260 }, - { url = "https://files.pythonhosted.org/packages/62/12/b928871c570d4a87ab13d2cc19f8817f17e340d5481621930e76b80ffb7d/numpy-2.2.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:694f9e921a0c8f252980e85bce61ebbd07ed2b7d4fa72d0e4246f2f8aa6642ab", size = 20909861 }, - { url = "https://files.pythonhosted.org/packages/3d/c3/59df91ae1d8ad7c5e03efd63fd785dec62d96b0fe56d1f9ab600b55009af/numpy-2.2.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:3683a8d166f2692664262fd4900f207791d005fb088d7fdb973cc8d663626faa", size = 14095776 }, - { url = "https://files.pythonhosted.org/packages/af/4e/8ed5868efc8e601fb69419644a280e9c482b75691466b73bfaab7d86922c/numpy-2.2.1-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:780077d95eafc2ccc3ced969db22377b3864e5b9a0ea5eb347cc93b3ea900315", size = 5126239 }, - { url = "https://files.pythonhosted.org/packages/1a/74/dd0bbe650d7bc0014b051f092f2de65e34a8155aabb1287698919d124d7f/numpy-2.2.1-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:55ba24ebe208344aa7a00e4482f65742969a039c2acfcb910bc6fcd776eb4355", size = 6659296 }, - { url = "https://files.pythonhosted.org/packages/7f/11/4ebd7a3f4a655764dc98481f97bd0a662fb340d1001be6050606be13e162/numpy-2.2.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9b1d07b53b78bf84a96898c1bc139ad7f10fda7423f5fd158fd0f47ec5e01ac7", size = 14047121 }, - { url = "https://files.pythonhosted.org/packages/7f/a7/c1f1d978166eb6b98ad009503e4d93a8c1962d0eb14a885c352ee0276a54/numpy-2.2.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5062dc1a4e32a10dc2b8b13cedd58988261416e811c1dc4dbdea4f57eea61b0d", size = 16096599 }, - { url = "https://files.pythonhosted.org/packages/3d/6d/0e22afd5fcbb4d8d0091f3f46bf4e8906399c458d4293da23292c0ba5022/numpy-2.2.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:fce4f615f8ca31b2e61aa0eb5865a21e14f5629515c9151850aa936c02a1ee51", size = 15243932 }, - { url = "https://files.pythonhosted.org/packages/03/39/e4e5832820131ba424092b9610d996b37e5557180f8e2d6aebb05c31ae54/numpy-2.2.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:67d4cda6fa6ffa073b08c8372aa5fa767ceb10c9a0587c707505a6d426f4e046", size = 17861032 }, - { url = "https://files.pythonhosted.org/packages/5f/8a/3794313acbf5e70df2d5c7d2aba8718676f8d054a05abe59e48417fb2981/numpy-2.2.1-cp312-cp312-win32.whl", hash = "sha256:32cb94448be47c500d2c7a95f93e2f21a01f1fd05dd2beea1ccd049bb6001cd2", size = 6274018 }, - { url = "https://files.pythonhosted.org/packages/17/c1/c31d3637f2641e25c7a19adf2ae822fdaf4ddd198b05d79a92a9ce7cb63e/numpy-2.2.1-cp312-cp312-win_amd64.whl", hash = "sha256:ba5511d8f31c033a5fcbda22dd5c813630af98c70b2661f2d2c654ae3cdfcfc8", size = 12613843 }, - { url = "https://files.pythonhosted.org/packages/20/d6/91a26e671c396e0c10e327b763485ee295f5a5a7a48c553f18417e5a0ed5/numpy-2.2.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:f1d09e520217618e76396377c81fba6f290d5f926f50c35f3a5f72b01a0da780", size = 20896464 }, - { url = "https://files.pythonhosted.org/packages/8c/40/5792ccccd91d45e87d9e00033abc4f6ca8a828467b193f711139ff1f1cd9/numpy-2.2.1-cp313-cp313-macosx_11_0_arm64.whl", hash = 
"sha256:3ecc47cd7f6ea0336042be87d9e7da378e5c7e9b3c8ad0f7c966f714fc10d821", size = 14111350 }, - { url = "https://files.pythonhosted.org/packages/c0/2a/fb0a27f846cb857cef0c4c92bef89f133a3a1abb4e16bba1c4dace2e9b49/numpy-2.2.1-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:f419290bc8968a46c4933158c91a0012b7a99bb2e465d5ef5293879742f8797e", size = 5111629 }, - { url = "https://files.pythonhosted.org/packages/eb/e5/8e81bb9d84db88b047baf4e8b681a3e48d6390bc4d4e4453eca428ecbb49/numpy-2.2.1-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:5b6c390bfaef8c45a260554888966618328d30e72173697e5cabe6b285fb2348", size = 6645865 }, - { url = "https://files.pythonhosted.org/packages/7a/1a/a90ceb191dd2f9e2897c69dde93ccc2d57dd21ce2acbd7b0333e8eea4e8d/numpy-2.2.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:526fc406ab991a340744aad7e25251dd47a6720a685fa3331e5c59fef5282a59", size = 14043508 }, - { url = "https://files.pythonhosted.org/packages/f1/5a/e572284c86a59dec0871a49cd4e5351e20b9c751399d5f1d79628c0542cb/numpy-2.2.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f74e6fdeb9a265624ec3a3918430205dff1df7e95a230779746a6af78bc615af", size = 16094100 }, - { url = "https://files.pythonhosted.org/packages/0c/2c/a79d24f364788386d85899dd280a94f30b0950be4b4a545f4fa4ed1d4ca7/numpy-2.2.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:53c09385ff0b72ba79d8715683c1168c12e0b6e84fb0372e97553d1ea91efe51", size = 15239691 }, - { url = "https://files.pythonhosted.org/packages/cf/79/1e20fd1c9ce5a932111f964b544facc5bb9bde7865f5b42f00b4a6a9192b/numpy-2.2.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:f3eac17d9ec51be534685ba877b6ab5edc3ab7ec95c8f163e5d7b39859524716", size = 17856571 }, - { url = "https://files.pythonhosted.org/packages/be/5b/cc155e107f75d694f562bdc84a26cc930569f3dfdfbccb3420b626065777/numpy-2.2.1-cp313-cp313-win32.whl", hash = "sha256:9ad014faa93dbb52c80d8f4d3dcf855865c876c9660cb9bd7553843dd03a4b1e", size = 6270841 }, - { url = "https://files.pythonhosted.org/packages/44/be/0e5cd009d2162e4138d79a5afb3b5d2341f0fe4777ab6e675aa3d4a42e21/numpy-2.2.1-cp313-cp313-win_amd64.whl", hash = "sha256:164a829b6aacf79ca47ba4814b130c4020b202522a93d7bff2202bfb33b61c60", size = 12606618 }, - { url = "https://files.pythonhosted.org/packages/a8/87/04ddf02dd86fb17c7485a5f87b605c4437966d53de1e3745d450343a6f56/numpy-2.2.1-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:4dfda918a13cc4f81e9118dea249e192ab167a0bb1966272d5503e39234d694e", size = 20921004 }, - { url = "https://files.pythonhosted.org/packages/6e/3e/d0e9e32ab14005425d180ef950badf31b862f3839c5b927796648b11f88a/numpy-2.2.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:733585f9f4b62e9b3528dd1070ec4f52b8acf64215b60a845fa13ebd73cd0712", size = 14119910 }, - { url = "https://files.pythonhosted.org/packages/b5/5b/aa2d1905b04a8fb681e08742bb79a7bddfc160c7ce8e1ff6d5c821be0236/numpy-2.2.1-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:89b16a18e7bba224ce5114db863e7029803c179979e1af6ad6a6b11f70545008", size = 5153612 }, - { url = "https://files.pythonhosted.org/packages/ce/35/6831808028df0648d9b43c5df7e1051129aa0d562525bacb70019c5f5030/numpy-2.2.1-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:676f4eebf6b2d430300f1f4f4c2461685f8269f94c89698d832cdf9277f30b84", size = 6668401 }, - { url = "https://files.pythonhosted.org/packages/b1/38/10ef509ad63a5946cc042f98d838daebfe7eaf45b9daaf13df2086b15ff9/numpy-2.2.1-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:27f5cdf9f493b35f7e41e8368e7d7b4bbafaf9660cba53fb21d2cd174ec09631", size = 14014198 }, - { url = "https://files.pythonhosted.org/packages/df/f8/c80968ae01df23e249ee0a4487fae55a4c0fe2f838dfe9cc907aa8aea0fa/numpy-2.2.1-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c1ad395cf254c4fbb5b2132fee391f361a6e8c1adbd28f2cd8e79308a615fe9d", size = 16076211 }, - { url = "https://files.pythonhosted.org/packages/09/69/05c169376016a0b614b432967ac46ff14269eaffab80040ec03ae1ae8e2c/numpy-2.2.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:08ef779aed40dbc52729d6ffe7dd51df85796a702afbf68a4f4e41fafdc8bda5", size = 15220266 }, - { url = "https://files.pythonhosted.org/packages/f1/ff/94a4ce67ea909f41cf7ea712aebbe832dc67decad22944a1020bb398a5ee/numpy-2.2.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:26c9c4382b19fcfbbed3238a14abf7ff223890ea1936b8890f058e7ba35e8d71", size = 17852844 }, - { url = "https://files.pythonhosted.org/packages/46/72/8a5dbce4020dfc595592333ef2fbb0a187d084ca243b67766d29d03e0096/numpy-2.2.1-cp313-cp313t-win32.whl", hash = "sha256:93cf4e045bae74c90ca833cba583c14b62cb4ba2cba0abd2b141ab52548247e2", size = 6326007 }, - { url = "https://files.pythonhosted.org/packages/7b/9c/4fce9cf39dde2562584e4cfd351a0140240f82c0e3569ce25a250f47037d/numpy-2.2.1-cp313-cp313t-win_amd64.whl", hash = "sha256:bff7d8ec20f5f42607599f9994770fa65d76edca264a87b5e4ea5629bce12268", size = 12693107 }, - { url = "https://files.pythonhosted.org/packages/f1/65/d36a76b811ffe0a4515e290cb05cb0e22171b1b0f0db6bee9141cf023545/numpy-2.2.1-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:7ba9cc93a91d86365a5d270dee221fdc04fb68d7478e6bf6af650de78a8339e3", size = 21044672 }, - { url = "https://files.pythonhosted.org/packages/aa/3f/b644199f165063154df486d95198d814578f13dd4d8c1651e075bf1cb8af/numpy-2.2.1-pp310-pypy310_pp73-macosx_14_0_x86_64.whl", hash = "sha256:3d03883435a19794e41f147612a77a8f56d4e52822337844fff3d4040a142964", size = 6789873 }, - { url = "https://files.pythonhosted.org/packages/d7/df/2adb0bb98a3cbe8a6c3c6d1019aede1f1d8b83927ced228a46cc56c7a206/numpy-2.2.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4511d9e6071452b944207c8ce46ad2f897307910b402ea5fa975da32e0102800", size = 16194933 }, - { url = "https://files.pythonhosted.org/packages/13/3e/1959d5219a9e6d200638d924cedda6a606392f7186a4ed56478252e70d55/numpy-2.2.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:5c5cc0cbabe9452038ed984d05ac87910f89370b9242371bd9079cb4af61811e", size = 12820057 }, + { url = "https://files.pythonhosted.org/packages/70/2a/69033dc22d981ad21325314f8357438078f5c28310a6d89fb3833030ec8a/numpy-2.2.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:7079129b64cb78bdc8d611d1fd7e8002c0a2565da6a47c4df8062349fee90e3e", size = 21215825 }, + { url = "https://files.pythonhosted.org/packages/31/2c/39f91e00bbd3d5639b027ac48c55dc5f2992bd2b305412d26be4c830862a/numpy-2.2.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2ec6c689c61df613b783aeb21f945c4cbe6c51c28cb70aae8430577ab39f163e", size = 14354996 }, + { url = "https://files.pythonhosted.org/packages/0a/2c/d468ebd253851af10de5b3e8f3418ebabfaab5f0337a75299fbeb8b8c17a/numpy-2.2.2-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:40c7ff5da22cd391944a28c6a9c638a5eef77fcf71d6e3a79e1d9d9e82752715", size = 5393621 }, + { url = "https://files.pythonhosted.org/packages/7f/f4/3d8a5a0da297034106c5de92be881aca7079cde6058934215a1de91334f6/numpy-2.2.2-cp310-cp310-macosx_14_0_x86_64.whl", hash = 
"sha256:995f9e8181723852ca458e22de5d9b7d3ba4da3f11cc1cb113f093b271d7965a", size = 6928931 }, + { url = "https://files.pythonhosted.org/packages/47/a7/029354ab56edd43dd3f5efbfad292b8844f98b93174f322f82353fa46efa/numpy-2.2.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b78ea78450fd96a498f50ee096f69c75379af5138f7881a51355ab0e11286c97", size = 14333157 }, + { url = "https://files.pythonhosted.org/packages/e3/d7/11fc594838d35c43519763310c316d4fd56f8600d3fc80a8e13e325b5c5c/numpy-2.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3fbe72d347fbc59f94124125e73fc4976a06927ebc503ec5afbfb35f193cd957", size = 16381794 }, + { url = "https://files.pythonhosted.org/packages/af/d4/dd9b19cd4aff9c79d3f54d17f8be815407520d3116004bc574948336981b/numpy-2.2.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:8e6da5cffbbe571f93588f562ed130ea63ee206d12851b60819512dd3e1ba50d", size = 15543990 }, + { url = "https://files.pythonhosted.org/packages/30/97/ab96b7650f27f684a9b1e46757a7294ecc50cab27701d05f146e9f779627/numpy-2.2.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:09d6a2032faf25e8d0cadde7fd6145118ac55d2740132c1d845f98721b5ebcfd", size = 18170896 }, + { url = "https://files.pythonhosted.org/packages/81/9b/bae9618cab20db67a2ca9d711795cad29b2ca4b73034dd3b5d05b962070a/numpy-2.2.2-cp310-cp310-win32.whl", hash = "sha256:159ff6ee4c4a36a23fe01b7c3d07bd8c14cc433d9720f977fcd52c13c0098160", size = 6573458 }, + { url = "https://files.pythonhosted.org/packages/92/9b/95678092febd14070cfb7906ea7932e71e9dd5a6ab3ee948f9ed975e905d/numpy-2.2.2-cp310-cp310-win_amd64.whl", hash = "sha256:64bd6e1762cd7f0986a740fee4dff927b9ec2c5e4d9a28d056eb17d332158014", size = 12915812 }, + { url = "https://files.pythonhosted.org/packages/21/67/32c68756eed84df181c06528ff57e09138f893c4653448c4967311e0f992/numpy-2.2.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:642199e98af1bd2b6aeb8ecf726972d238c9877b0f6e8221ee5ab945ec8a2189", size = 21220002 }, + { url = "https://files.pythonhosted.org/packages/3b/89/f43bcad18f2b2e5814457b1c7f7b0e671d0db12c8c0e43397ab8cb1831ed/numpy-2.2.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:6d9fc9d812c81e6168b6d405bf00b8d6739a7f72ef22a9214c4241e0dc70b323", size = 14391215 }, + { url = "https://files.pythonhosted.org/packages/9c/e6/efb8cd6122bf25e86e3dd89d9dbfec9e6861c50e8810eed77d4be59b51c6/numpy-2.2.2-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:c7d1fd447e33ee20c1f33f2c8e6634211124a9aabde3c617687d8b739aa69eac", size = 5391918 }, + { url = "https://files.pythonhosted.org/packages/47/e2/fccf89d64d9b47ffb242823d4e851fc9d36fa751908c9aac2807924d9b4e/numpy-2.2.2-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:451e854cfae0febe723077bd0cf0a4302a5d84ff25f0bfece8f29206c7bed02e", size = 6933133 }, + { url = "https://files.pythonhosted.org/packages/34/22/5ece749c0e5420a9380eef6fbf83d16a50010bd18fef77b9193d80a6760e/numpy-2.2.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bd249bc894af67cbd8bad2c22e7cbcd46cf87ddfca1f1289d1e7e54868cc785c", size = 14338187 }, + { url = "https://files.pythonhosted.org/packages/5b/86/caec78829311f62afa6fa334c8dfcd79cffb4d24bcf96ee02ae4840d462b/numpy-2.2.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:02935e2c3c0c6cbe9c7955a8efa8908dd4221d7755644c59d1bba28b94fd334f", size = 16393429 }, + { url = 
"https://files.pythonhosted.org/packages/c8/4e/0c25f74c88239a37924577d6ad780f3212a50f4b4b5f54f5e8c918d726bd/numpy-2.2.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:a972cec723e0563aa0823ee2ab1df0cb196ed0778f173b381c871a03719d4826", size = 15559103 }, + { url = "https://files.pythonhosted.org/packages/d4/bd/d557f10fa50dc4d5871fb9606af563249b66af2fc6f99041a10e8757c6f1/numpy-2.2.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:d6d6a0910c3b4368d89dde073e630882cdb266755565155bc33520283b2d9df8", size = 18182967 }, + { url = "https://files.pythonhosted.org/packages/30/e9/66cc0f66386d78ed89e45a56e2a1d051e177b6e04477c4a41cd590ef4017/numpy-2.2.2-cp311-cp311-win32.whl", hash = "sha256:860fd59990c37c3ef913c3ae390b3929d005243acca1a86facb0773e2d8d9e50", size = 6571499 }, + { url = "https://files.pythonhosted.org/packages/66/a3/4139296b481ae7304a43581046b8f0a20da6a0dfe0ee47a044cade796603/numpy-2.2.2-cp311-cp311-win_amd64.whl", hash = "sha256:da1eeb460ecce8d5b8608826595c777728cdf28ce7b5a5a8c8ac8d949beadcf2", size = 12919805 }, + { url = "https://files.pythonhosted.org/packages/0c/e6/847d15770ab7a01e807bdfcd4ead5bdae57c0092b7dc83878171b6af97bb/numpy-2.2.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:ac9bea18d6d58a995fac1b2cb4488e17eceeac413af014b1dd26170b766d8467", size = 20912636 }, + { url = "https://files.pythonhosted.org/packages/d1/af/f83580891577b13bd7e261416120e036d0d8fb508c8a43a73e38928b794b/numpy-2.2.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:23ae9f0c2d889b7b2d88a3791f6c09e2ef827c2446f1c4a3e3e76328ee4afd9a", size = 14098403 }, + { url = "https://files.pythonhosted.org/packages/2b/86/d019fb60a9d0f1d4cf04b014fe88a9135090adfadcc31c1fadbb071d7fa7/numpy-2.2.2-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:3074634ea4d6df66be04f6728ee1d173cfded75d002c75fac79503a880bf3825", size = 5128938 }, + { url = "https://files.pythonhosted.org/packages/7a/1b/50985edb6f1ec495a1c36452e860476f5b7ecdc3fc59ea89ccad3c4926c5/numpy-2.2.2-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:8ec0636d3f7d68520afc6ac2dc4b8341ddb725039de042faf0e311599f54eb37", size = 6661937 }, + { url = "https://files.pythonhosted.org/packages/f4/1b/17efd94cad1b9d605c3f8907fb06bcffc4ce4d1d14d46b95316cccccf2b9/numpy-2.2.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2ffbb1acd69fdf8e89dd60ef6182ca90a743620957afb7066385a7bbe88dc748", size = 14049518 }, + { url = "https://files.pythonhosted.org/packages/5b/73/65d2f0b698df1731e851e3295eb29a5ab8aa06f763f7e4188647a809578d/numpy-2.2.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0349b025e15ea9d05c3d63f9657707a4e1d471128a3b1d876c095f328f8ff7f0", size = 16099146 }, + { url = "https://files.pythonhosted.org/packages/d5/69/308f55c0e19d4b5057b5df286c5433822e3c8039ede06d4051d96f1c2c4e/numpy-2.2.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:463247edcee4a5537841d5350bc87fe8e92d7dd0e8c71c995d2c6eecb8208278", size = 15246336 }, + { url = "https://files.pythonhosted.org/packages/f0/d8/d8d333ad0d8518d077a21aeea7b7c826eff766a2b1ce1194dea95ca0bacf/numpy-2.2.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:9dd47ff0cb2a656ad69c38da850df3454da88ee9a6fde0ba79acceee0e79daba", size = 17863507 }, + { url = "https://files.pythonhosted.org/packages/82/6e/0b84ad3103ffc16d6673e63b5acbe7901b2af96c2837174c6318c98e27ab/numpy-2.2.2-cp312-cp312-win32.whl", hash = "sha256:4525b88c11906d5ab1b0ec1f290996c0020dd318af8b49acaa46f198b1ffc283", size = 6276491 }, + { url = 
"https://files.pythonhosted.org/packages/fc/84/7f801a42a67b9772a883223a0a1e12069a14626c81a732bd70aac57aebc1/numpy-2.2.2-cp312-cp312-win_amd64.whl", hash = "sha256:5acea83b801e98541619af398cc0109ff48016955cc0818f478ee9ef1c5c3dcb", size = 12616372 }, + { url = "https://files.pythonhosted.org/packages/e1/fe/df5624001f4f5c3e0b78e9017bfab7fdc18a8d3b3d3161da3d64924dd659/numpy-2.2.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:b208cfd4f5fe34e1535c08983a1a6803fdbc7a1e86cf13dd0c61de0b51a0aadc", size = 20899188 }, + { url = "https://files.pythonhosted.org/packages/a9/80/d349c3b5ed66bd3cb0214be60c27e32b90a506946857b866838adbe84040/numpy-2.2.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:d0bbe7dd86dca64854f4b6ce2ea5c60b51e36dfd597300057cf473d3615f2369", size = 14113972 }, + { url = "https://files.pythonhosted.org/packages/9d/50/949ec9cbb28c4b751edfa64503f0913cbfa8d795b4a251e7980f13a8a655/numpy-2.2.2-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:22ea3bb552ade325530e72a0c557cdf2dea8914d3a5e1fecf58fa5dbcc6f43cd", size = 5114294 }, + { url = "https://files.pythonhosted.org/packages/8d/f3/399c15629d5a0c68ef2aa7621d430b2be22034f01dd7f3c65a9c9666c445/numpy-2.2.2-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:128c41c085cab8a85dc29e66ed88c05613dccf6bc28b3866cd16050a2f5448be", size = 6648426 }, + { url = "https://files.pythonhosted.org/packages/2c/03/c72474c13772e30e1bc2e558cdffd9123c7872b731263d5648b5c49dd459/numpy-2.2.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:250c16b277e3b809ac20d1f590716597481061b514223c7badb7a0f9993c7f84", size = 14045990 }, + { url = "https://files.pythonhosted.org/packages/83/9c/96a9ab62274ffafb023f8ee08c88d3d31ee74ca58869f859db6845494fa6/numpy-2.2.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e0c8854b09bc4de7b041148d8550d3bd712b5c21ff6a8ed308085f190235d7ff", size = 16096614 }, + { url = "https://files.pythonhosted.org/packages/d5/34/cd0a735534c29bec7093544b3a509febc9b0df77718a9b41ffb0809c9f46/numpy-2.2.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:b6fb9c32a91ec32a689ec6410def76443e3c750e7cfc3fb2206b985ffb2b85f0", size = 15242123 }, + { url = "https://files.pythonhosted.org/packages/5e/6d/541717a554a8f56fa75e91886d9b79ade2e595918690eb5d0d3dbd3accb9/numpy-2.2.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:57b4012e04cc12b78590a334907e01b3a85efb2107df2b8733ff1ed05fce71de", size = 17859160 }, + { url = "https://files.pythonhosted.org/packages/b9/a5/fbf1f2b54adab31510728edd06a05c1b30839f37cf8c9747cb85831aaf1b/numpy-2.2.2-cp313-cp313-win32.whl", hash = "sha256:4dbd80e453bd34bd003b16bd802fac70ad76bd463f81f0c518d1245b1c55e3d9", size = 6273337 }, + { url = "https://files.pythonhosted.org/packages/56/e5/01106b9291ef1d680f82bc47d0c5b5e26dfed15b0754928e8f856c82c881/numpy-2.2.2-cp313-cp313-win_amd64.whl", hash = "sha256:5a8c863ceacae696aff37d1fd636121f1a512117652e5dfb86031c8d84836369", size = 12609010 }, + { url = "https://files.pythonhosted.org/packages/9f/30/f23d9876de0f08dceb707c4dcf7f8dd7588266745029debb12a3cdd40be6/numpy-2.2.2-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:b3482cb7b3325faa5f6bc179649406058253d91ceda359c104dac0ad320e1391", size = 20924451 }, + { url = "https://files.pythonhosted.org/packages/6a/ec/6ea85b2da9d5dfa1dbb4cb3c76587fc8ddcae580cb1262303ab21c0926c4/numpy-2.2.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:9491100aba630910489c1d0158034e1c9a6546f0b1340f716d522dc103788e39", size = 14122390 }, + { url = 
"https://files.pythonhosted.org/packages/68/05/bfbdf490414a7dbaf65b10c78bc243f312c4553234b6d91c94eb7c4b53c2/numpy-2.2.2-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:41184c416143defa34cc8eb9d070b0a5ba4f13a0fa96a709e20584638254b317", size = 5156590 }, + { url = "https://files.pythonhosted.org/packages/f7/ec/fe2e91b2642b9d6544518388a441bcd65c904cea38d9ff998e2e8ebf808e/numpy-2.2.2-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:7dca87ca328f5ea7dafc907c5ec100d187911f94825f8700caac0b3f4c384b49", size = 6671958 }, + { url = "https://files.pythonhosted.org/packages/b1/6f/6531a78e182f194d33ee17e59d67d03d0d5a1ce7f6be7343787828d1bd4a/numpy-2.2.2-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0bc61b307655d1a7f9f4b043628b9f2b721e80839914ede634e3d485913e1fb2", size = 14019950 }, + { url = "https://files.pythonhosted.org/packages/e1/fb/13c58591d0b6294a08cc40fcc6b9552d239d773d520858ae27f39997f2ae/numpy-2.2.2-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9fad446ad0bc886855ddf5909cbf8cb5d0faa637aaa6277fb4b19ade134ab3c7", size = 16079759 }, + { url = "https://files.pythonhosted.org/packages/2c/f2/f2f8edd62abb4b289f65a7f6d1f3650273af00b91b7267a2431be7f1aec6/numpy-2.2.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:149d1113ac15005652e8d0d3f6fd599360e1a708a4f98e43c9c77834a28238cb", size = 15226139 }, + { url = "https://files.pythonhosted.org/packages/aa/29/14a177f1a90b8ad8a592ca32124ac06af5eff32889874e53a308f850290f/numpy-2.2.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:106397dbbb1896f99e044efc90360d098b3335060375c26aa89c0d8a97c5f648", size = 17856316 }, + { url = "https://files.pythonhosted.org/packages/95/03/242ae8d7b97f4e0e4ab8dd51231465fb23ed5e802680d629149722e3faf1/numpy-2.2.2-cp313-cp313t-win32.whl", hash = "sha256:0eec19f8af947a61e968d5429f0bd92fec46d92b0008d0a6685b40d6adf8a4f4", size = 6329134 }, + { url = "https://files.pythonhosted.org/packages/80/94/cd9e9b04012c015cb6320ab3bf43bc615e248dddfeb163728e800a5d96f0/numpy-2.2.2-cp313-cp313t-win_amd64.whl", hash = "sha256:97b974d3ba0fb4612b77ed35d7627490e8e3dff56ab41454d9e8b23448940576", size = 12696208 }, + { url = "https://files.pythonhosted.org/packages/96/7e/1dd770ee68916ed358991ab62c2cc353ffd98d0b75b901d52183ca28e8bb/numpy-2.2.2-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:b0531f0b0e07643eb089df4c509d30d72c9ef40defa53e41363eca8a8cc61495", size = 21047291 }, + { url = "https://files.pythonhosted.org/packages/d1/3c/ccd08578dc532a8e6927952339d4a02682b776d5e85be49ed0760308433e/numpy-2.2.2-pp310-pypy310_pp73-macosx_14_0_x86_64.whl", hash = "sha256:e9e82dcb3f2ebbc8cb5ce1102d5f1c5ed236bf8a11730fb45ba82e2841ec21df", size = 6792494 }, + { url = "https://files.pythonhosted.org/packages/7c/28/8754b9aee4f97199f9a047f73bb644b5a2014994a6d7b061ba67134a42de/numpy-2.2.2-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e0d4142eb40ca6f94539e4db929410f2a46052a0fe7a2c1c59f6179c39938d2a", size = 16197312 }, + { url = "https://files.pythonhosted.org/packages/26/96/deb93f871f401045a684ca08a009382b247d14996d7a94fea6aa43c67b94/numpy-2.2.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:356ca982c188acbfa6af0d694284d8cf20e95b1c3d0aefa8929376fea9146f60", size = 12822674 }, ] [[package]] @@ -1206,7 +1216,6 @@ name = "nvidia-cublas-cu12" version = "12.4.5.8" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = 
"https://files.pythonhosted.org/packages/7f/7f/7fbae15a3982dc9595e49ce0f19332423b260045d0a6afe93cdbe2f1f624/nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_aarch64.whl", hash = "sha256:0f8aa1706812e00b9f19dfe0cdb3999b092ccb8ca168c0db5b8ea712456fd9b3", size = 363333771 }, { url = "https://files.pythonhosted.org/packages/ae/71/1c91302526c45ab494c23f61c7a84aa568b8c1f9d196efa5993957faf906/nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl", hash = "sha256:2fc8da60df463fdefa81e323eef2e36489e1c94335b5358bcb38360adf75ac9b", size = 363438805 }, ] @@ -1215,7 +1224,6 @@ name = "nvidia-cuda-cupti-cu12" version = "12.4.127" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/93/b5/9fb3d00386d3361b03874246190dfec7b206fd74e6e287b26a8fcb359d95/nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:79279b35cf6f91da114182a5ce1864997fd52294a87a16179ce275773799458a", size = 12354556 }, { url = "https://files.pythonhosted.org/packages/67/42/f4f60238e8194a3106d06a058d494b18e006c10bb2b915655bd9f6ea4cb1/nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:9dec60f5ac126f7bb551c055072b69d85392b13311fcc1bcda2202d172df30fb", size = 13813957 }, ] @@ -1224,7 +1232,6 @@ name = "nvidia-cuda-nvrtc-cu12" version = "12.4.127" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/77/aa/083b01c427e963ad0b314040565ea396f914349914c298556484f799e61b/nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:0eedf14185e04b76aa05b1fea04133e59f465b6f960c0cbf4e37c3cb6b0ea198", size = 24133372 }, { url = "https://files.pythonhosted.org/packages/2c/14/91ae57cd4db3f9ef7aa99f4019cfa8d54cb4caa7e00975df6467e9725a9f/nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:a178759ebb095827bd30ef56598ec182b85547f1508941a3d560eb7ea1fbf338", size = 24640306 }, ] @@ -1233,7 +1240,6 @@ name = "nvidia-cuda-runtime-cu12" version = "12.4.127" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/a1/aa/b656d755f474e2084971e9a297def515938d56b466ab39624012070cb773/nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:961fe0e2e716a2a1d967aab7caee97512f71767f852f67432d572e36cb3a11f3", size = 894177 }, { url = "https://files.pythonhosted.org/packages/ea/27/1795d86fe88ef397885f2e580ac37628ed058a92ed2c39dc8eac3adf0619/nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:64403288fa2136ee8e467cdc9c9427e0434110899d07c779f25b5c068934faa5", size = 883737 }, ] @@ -1256,7 +1262,6 @@ dependencies = [ { name = "nvidia-nvjitlink-cu12" }, ] wheels = [ - { url = "https://files.pythonhosted.org/packages/7a/8a/0e728f749baca3fbeffad762738276e5df60851958be7783af121a7221e7/nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_aarch64.whl", hash = "sha256:5dad8008fc7f92f5ddfa2101430917ce2ffacd86824914c82e28990ad7f00399", size = 211422548 }, { url = "https://files.pythonhosted.org/packages/27/94/3266821f65b92b3138631e9c8e7fe1fb513804ac934485a8d05776e1dd43/nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl", hash = "sha256:f083fc24912aa410be21fa16d157fed2055dab1cc4b6934a0e03cba69eb242b9", size = 211459117 }, ] @@ -1265,7 +1270,6 @@ name = "nvidia-curand-cu12" version = "10.3.5.147" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = 
"https://files.pythonhosted.org/packages/80/9c/a79180e4d70995fdf030c6946991d0171555c6edf95c265c6b2bf7011112/nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_aarch64.whl", hash = "sha256:1f173f09e3e3c76ab084aba0de819c49e56614feae5c12f69883f4ae9bb5fad9", size = 56314811 }, { url = "https://files.pythonhosted.org/packages/8a/6d/44ad094874c6f1b9c654f8ed939590bdc408349f137f9b98a3a23ccec411/nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl", hash = "sha256:a88f583d4e0bb643c49743469964103aa59f7f708d862c3ddb0fc07f851e3b8b", size = 56305206 }, ] @@ -1279,7 +1283,6 @@ dependencies = [ { name = "nvidia-nvjitlink-cu12" }, ] wheels = [ - { url = "https://files.pythonhosted.org/packages/46/6b/a5c33cf16af09166845345275c34ad2190944bcc6026797a39f8e0a282e0/nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_aarch64.whl", hash = "sha256:d338f155f174f90724bbde3758b7ac375a70ce8e706d70b018dd3375545fc84e", size = 127634111 }, { url = "https://files.pythonhosted.org/packages/3a/e1/5b9089a4b2a4790dfdea8b3a006052cfecff58139d5a4e34cb1a51df8d6f/nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl", hash = "sha256:19e33fa442bcfd085b3086c4ebf7e8debc07cfe01e11513cc6d332fd918ac260", size = 127936057 }, ] @@ -1291,10 +1294,17 @@ dependencies = [ { name = "nvidia-nvjitlink-cu12" }, ] wheels = [ - { url = "https://files.pythonhosted.org/packages/96/a9/c0d2f83a53d40a4a41be14cea6a0bf9e668ffcf8b004bd65633f433050c0/nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_aarch64.whl", hash = "sha256:9d32f62896231ebe0480efd8a7f702e143c98cfaa0e8a76df3386c1ba2b54df3", size = 207381987 }, { url = "https://files.pythonhosted.org/packages/db/f7/97a9ea26ed4bbbfc2d470994b8b4f338ef663be97b8f677519ac195e113d/nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl", hash = "sha256:ea4f11a2904e2a8dc4b1833cc1b5181cde564edd0d5cd33e3c168eff2d1863f1", size = 207454763 }, ] +[[package]] +name = "nvidia-cusparselt-cu12" +version = "0.6.2" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/78/a8/bcbb63b53a4b1234feeafb65544ee55495e1bb37ec31b999b963cbccfd1d/nvidia_cusparselt_cu12-0.6.2-py3-none-manylinux2014_x86_64.whl", hash = "sha256:df2c24502fd76ebafe7457dbc4716b2fec071aabaed4fb7691a201cde03704d9", size = 150057751 }, +] + [[package]] name = "nvidia-nccl-cu12" version = "2.21.5" @@ -1308,7 +1318,6 @@ name = "nvidia-nvjitlink-cu12" version = "12.4.127" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/02/45/239d52c05074898a80a900f49b1615d81c07fceadd5ad6c4f86a987c0bc4/nvidia_nvjitlink_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:4abe7fef64914ccfa909bc2ba39739670ecc9e820c83ccc7a6ed414122599b83", size = 20552510 }, { url = "https://files.pythonhosted.org/packages/ff/ff/847841bacfbefc97a00036e0fce5a0f086b640756dc38caea5e1bb002655/nvidia_nvjitlink_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:06b3b9b25bf3f8af351d664978ca26a16d2c5127dbd53c0497e28d1fb9611d57", size = 21066810 }, ] @@ -1317,51 +1326,50 @@ name = "nvidia-nvtx-cu12" version = "12.4.127" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/06/39/471f581edbb7804b39e8063d92fc8305bdc7a80ae5c07dbe6ea5c50d14a5/nvidia_nvtx_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:7959ad635db13edf4fc65c06a6e9f9e55fc2f92596db928d169c0bb031e88ef3", size = 100417 }, { url = 
"https://files.pythonhosted.org/packages/87/20/199b8713428322a2f22b722c62b8cc278cc53dffa9705d744484b5035ee9/nvidia_nvtx_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:781e950d9b9f60d8241ccea575b32f5105a5baf4c2351cab5256a24869f12a1a", size = 99144 }, ] [[package]] name = "opentelemetry-api" -version = "1.29.0" +version = "1.30.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "deprecated" }, { name = "importlib-metadata" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/bc/8e/b886a5e9861afa188d1fe671fb96ff9a1d90a23d57799331e137cc95d573/opentelemetry_api-1.29.0.tar.gz", hash = "sha256:d04a6cf78aad09614f52964ecb38021e248f5714dc32c2e0d8fd99517b4d69cf", size = 62900 } +sdist = { url = "https://files.pythonhosted.org/packages/2b/6d/bbbf879826b7f3c89a45252010b5796fb1f1a0d45d9dc4709db0ef9a06c8/opentelemetry_api-1.30.0.tar.gz", hash = "sha256:375893400c1435bf623f7dfb3bcd44825fe6b56c34d0667c542ea8257b1a1240", size = 63703 } wheels = [ - { url = "https://files.pythonhosted.org/packages/43/53/5249ea860d417a26a3a6f1bdedfc0748c4f081a3adaec3d398bc0f7c6a71/opentelemetry_api-1.29.0-py3-none-any.whl", hash = "sha256:5fcd94c4141cc49c736271f3e1efb777bebe9cc535759c54c936cca4f1b312b8", size = 64304 }, + { url = "https://files.pythonhosted.org/packages/36/0a/eea862fae6413d8181b23acf8e13489c90a45f17986ee9cf4eab8a0b9ad9/opentelemetry_api-1.30.0-py3-none-any.whl", hash = "sha256:d5f5284890d73fdf47f843dda3210edf37a38d66f44f2b5aedc1e89ed455dc09", size = 64955 }, ] [[package]] name = "opentelemetry-exporter-otlp" -version = "1.29.0" +version = "1.30.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "opentelemetry-exporter-otlp-proto-grpc" }, { name = "opentelemetry-exporter-otlp-proto-http" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/91/23/824e71822969cd3018897f5b0246baf8305bf7635f20df1ce5dfc423c32d/opentelemetry_exporter_otlp-1.29.0.tar.gz", hash = "sha256:ee7dfcccbb5e87ad9b389908452e10b7beeab55f70a83f41ce5b8c4efbde6544", size = 6159 } +sdist = { url = "https://files.pythonhosted.org/packages/a8/5f/612208105a64018bdf1eae09f2bdddb2fb1ab292bb5e5aae8a3f567ddd3f/opentelemetry_exporter_otlp-1.30.0.tar.gz", hash = "sha256:da7769f95cd5be5b09dd4188ac153a33709eda652217f2d10aed6518c8e60f0d", size = 6188 } wheels = [ - { url = "https://files.pythonhosted.org/packages/cf/54/2a84533f39bb240958d691bb3ddf1c3fb6a92356654fb2e02a210f65ce6b/opentelemetry_exporter_otlp-1.29.0-py3-none-any.whl", hash = "sha256:b8da6e20f5b0ffe604154b1e16a407eade17ce310c42fb85bb4e1246fc3688ad", size = 7011 }, + { url = "https://files.pythonhosted.org/packages/e2/9a/beb4d395cd581d3738c3b3942f2c2487991b6066a143e4e42b8e84e4ba3a/opentelemetry_exporter_otlp-1.30.0-py3-none-any.whl", hash = "sha256:44e11054ec571ccfed73a83c6429dee5d334d061d0e0572e3160d6de97156dbc", size = 7042 }, ] [[package]] name = "opentelemetry-exporter-otlp-proto-common" -version = "1.29.0" +version = "1.30.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "opentelemetry-proto" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/b1/58/f7fd7eaf592b2521999a4271ab3ce1c82fe37fe9b0dc25c348398d95d66a/opentelemetry_exporter_otlp_proto_common-1.29.0.tar.gz", hash = "sha256:e7c39b5dbd1b78fe199e40ddfe477e6983cb61aa74ba836df09c3869a3e3e163", size = 19133 } +sdist = { url = "https://files.pythonhosted.org/packages/a2/d7/44098bf1ef89fc5810cdbda05faa2ae9322a0dbda4921cdc965dc68a9856/opentelemetry_exporter_otlp_proto_common-1.30.0.tar.gz", hash = 
"sha256:ddbfbf797e518411857d0ca062c957080279320d6235a279f7b64ced73c13897", size = 19640 } wheels = [ - { url = "https://files.pythonhosted.org/packages/9e/75/7609bda3d72bf307839570b226180513e854c01443ebe265ed732a4980fc/opentelemetry_exporter_otlp_proto_common-1.29.0-py3-none-any.whl", hash = "sha256:a9d7376c06b4da9cf350677bcddb9618ed4b8255c3f6476975f5e38274ecd3aa", size = 18459 }, + { url = "https://files.pythonhosted.org/packages/ee/54/f4b3de49f8d7d3a78fd6e6e1a6fd27dd342eb4d82c088b9078c6a32c3808/opentelemetry_exporter_otlp_proto_common-1.30.0-py3-none-any.whl", hash = "sha256:5468007c81aa9c44dc961ab2cf368a29d3475977df83b4e30aeed42aa7bc3b38", size = 18747 }, ] [[package]] name = "opentelemetry-exporter-otlp-proto-grpc" -version = "1.29.0" +version = "1.30.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "deprecated" }, @@ -1372,14 +1380,14 @@ dependencies = [ { name = "opentelemetry-proto" }, { name = "opentelemetry-sdk" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/41/aa/b3f2190613141f35fe15145bf438334fdd1eac8aeeee4f7ecbc887999443/opentelemetry_exporter_otlp_proto_grpc-1.29.0.tar.gz", hash = "sha256:3d324d07d64574d72ed178698de3d717f62a059a93b6b7685ee3e303384e73ea", size = 26224 } +sdist = { url = "https://files.pythonhosted.org/packages/86/3e/c7246df92c25e6ce95c349ad21597b4471b01ec9471e95d5261f1629fe92/opentelemetry_exporter_otlp_proto_grpc-1.30.0.tar.gz", hash = "sha256:d0f10f0b9b9a383b7d04a144d01cb280e70362cccc613987e234183fd1f01177", size = 26256 } wheels = [ - { url = "https://files.pythonhosted.org/packages/f2/de/4b4127a25d1594851d99032f3a9acb09cb512d11edec713410fb906607f4/opentelemetry_exporter_otlp_proto_grpc-1.29.0-py3-none-any.whl", hash = "sha256:5a2a3a741a2543ed162676cf3eefc2b4150e6f4f0a193187afb0d0e65039c69c", size = 18520 }, + { url = "https://files.pythonhosted.org/packages/5e/35/d9f63fd84c2ed8dbd407bcbb933db4ed6e1b08e7fbdaca080b9ac309b927/opentelemetry_exporter_otlp_proto_grpc-1.30.0-py3-none-any.whl", hash = "sha256:2906bcae3d80acc54fd1ffcb9e44d324e8631058b502ebe4643ca71d1ff30830", size = 18550 }, ] [[package]] name = "opentelemetry-exporter-otlp-proto-http" -version = "1.29.0" +version = "1.30.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "deprecated" }, @@ -1390,14 +1398,14 @@ dependencies = [ { name = "opentelemetry-sdk" }, { name = "requests" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/ab/88/e70a2e9fbb1bddb1ab7b6d74fb02c68601bff5948292ce33464c84ee082e/opentelemetry_exporter_otlp_proto_http-1.29.0.tar.gz", hash = "sha256:b10d174e3189716f49d386d66361fbcf6f2b9ad81e05404acdee3f65c8214204", size = 15041 } +sdist = { url = "https://files.pythonhosted.org/packages/04/f9/abb9191d536e6a2e2b7903f8053bf859a76bf784e3ca19a5749550ef19e4/opentelemetry_exporter_otlp_proto_http-1.30.0.tar.gz", hash = "sha256:c3ae75d4181b1e34a60662a6814d0b94dd33b628bee5588a878bed92cee6abdc", size = 15073 } wheels = [ - { url = "https://files.pythonhosted.org/packages/31/49/a1c3d24e8fe73b5f422e21b46c24aed3db7fd9427371c06442e7bdfe4d3b/opentelemetry_exporter_otlp_proto_http-1.29.0-py3-none-any.whl", hash = "sha256:b228bdc0f0cfab82eeea834a7f0ffdd2a258b26aa33d89fb426c29e8e934d9d0", size = 17217 }, + { url = "https://files.pythonhosted.org/packages/e1/3c/cdf34bc459613f2275aff9b258f35acdc4c4938dad161d17437de5d4c034/opentelemetry_exporter_otlp_proto_http-1.30.0-py3-none-any.whl", hash = "sha256:9578e790e579931c5ffd50f1e6975cbdefb6a0a0a5dea127a6ae87df10e0a589", size = 17245 }, ] [[package]] name = 
"opentelemetry-instrumentation" -version = "0.50b0" +version = "0.51b0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "opentelemetry-api" }, @@ -1405,14 +1413,14 @@ dependencies = [ { name = "packaging" }, { name = "wrapt" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/79/2e/2e59a7cb636dc394bd7cf1758ada5e8ed87590458ca6bb2f9c26e0243847/opentelemetry_instrumentation-0.50b0.tar.gz", hash = "sha256:7d98af72de8dec5323e5202e46122e5f908592b22c6d24733aad619f07d82979", size = 26539 } +sdist = { url = "https://files.pythonhosted.org/packages/ec/5a/4c7f02235ac1269b48f3855f6be1afc641f31d4888d28b90b732fbce7141/opentelemetry_instrumentation-0.51b0.tar.gz", hash = "sha256:4ca266875e02f3988536982467f7ef8c32a38b8895490ddce9ad9604649424fa", size = 27760 } wheels = [ - { url = "https://files.pythonhosted.org/packages/ff/b1/55a77152a83ec8998e520a3a575f44af1020cfe4bdc000b7538583293b85/opentelemetry_instrumentation-0.50b0-py3-none-any.whl", hash = "sha256:b8f9fc8812de36e1c6dffa5bfc6224df258841fb387b6dfe5df15099daa10630", size = 30728 }, + { url = "https://files.pythonhosted.org/packages/40/2c/48fa93f1acca9f79a06da0df7bfe916632ecc7fce1971067b3e46bcae55b/opentelemetry_instrumentation-0.51b0-py3-none-any.whl", hash = "sha256:c6de8bd26b75ec8b0e54dff59e198946e29de6a10ec65488c357d4b34aa5bdcf", size = 30923 }, ] [[package]] name = "opentelemetry-instrumentation-grpc" -version = "0.50b0" +version = "0.51b0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "opentelemetry-api" }, @@ -1420,65 +1428,66 @@ dependencies = [ { name = "opentelemetry-semantic-conventions" }, { name = "wrapt" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/51/56/658110193718ddde6e8e68ef3ad3fee7850055820a9fc2bd7ec1347afeca/opentelemetry_instrumentation_grpc-0.50b0.tar.gz", hash = "sha256:12381fbc0a7a91410fb9dad5f26f6de5eb5c30cd19c840fa9bfee78b584af7e7", size = 30746 } +sdist = { url = "https://files.pythonhosted.org/packages/2c/9b/aeb09dfbe1503451922e1c9dfb1534bfec5002d1b9befa156068c88118a8/opentelemetry_instrumentation_grpc-0.51b0.tar.gz", hash = "sha256:70931c1a9f5fdde9d4498762fd98db41ad119a7e08e432052ff83a6495bf8ac5", size = 30777 } wheels = [ - { url = "https://files.pythonhosted.org/packages/85/94/94ea66294e0dca44e53267d68a7de68424c9120f142dbc2e7db503d85f3a/opentelemetry_instrumentation_grpc-0.50b0-py3-none-any.whl", hash = "sha256:d4a13cad1c7ce019429e0253a00bee8852314be90004d8cf0051df51d9da1ac3", size = 27069 }, + { url = "https://files.pythonhosted.org/packages/86/f6/964af7c1d9f14292be4786cc94b4869d7f3b261dff6cbf557a4c9db0e336/opentelemetry_instrumentation_grpc-0.51b0-py3-none-any.whl", hash = "sha256:cf54d77c706d7e85f293675d0990e97577b0d14ef92f4a30d55507f08f194c9b", size = 27104 }, ] [[package]] name = "opentelemetry-proto" -version = "1.29.0" +version = "1.30.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "protobuf" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/80/52/fd3b3d79e1b00ad2dcac92db6885e49bedbf7a6828647954e4952d653132/opentelemetry_proto-1.29.0.tar.gz", hash = "sha256:3c136aa293782e9b44978c738fff72877a4b78b5d21a64e879898db7b2d93e5d", size = 34320 } +sdist = { url = "https://files.pythonhosted.org/packages/31/6e/c1ff2e3b0cd3a189a6be03fd4d63441d73d7addd9117ab5454e667b9b6c7/opentelemetry_proto-1.30.0.tar.gz", hash = "sha256:afe5c9c15e8b68d7c469596e5b32e8fc085eb9febdd6fb4e20924a93a0389179", size = 34362 } wheels = [ - { url = 
"https://files.pythonhosted.org/packages/bd/66/a500e38ee322d89fce61c74bd7769c8ef3bebc6c2f43fda5f3fc3441286d/opentelemetry_proto-1.29.0-py3-none-any.whl", hash = "sha256:495069c6f5495cbf732501cdcd3b7f60fda2b9d3d4255706ca99b7ca8dec53ff", size = 55818 }, + { url = "https://files.pythonhosted.org/packages/56/d7/85de6501f7216995295f7ec11e470142e6a6e080baacec1753bbf272e007/opentelemetry_proto-1.30.0-py3-none-any.whl", hash = "sha256:c6290958ff3ddacc826ca5abbeb377a31c2334387352a259ba0df37c243adc11", size = 55854 }, ] [[package]] name = "opentelemetry-sdk" -version = "1.29.0" +version = "1.30.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "opentelemetry-api" }, { name = "opentelemetry-semantic-conventions" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/0c/5a/1ed4c3cf6c09f80565fc085f7e8efa0c222712fd2a9412d07424705dcf72/opentelemetry_sdk-1.29.0.tar.gz", hash = "sha256:b0787ce6aade6ab84315302e72bd7a7f2f014b0fb1b7c3295b88afe014ed0643", size = 157229 } +sdist = { url = "https://files.pythonhosted.org/packages/93/ee/d710062e8a862433d1be0b85920d0c653abe318878fef2d14dfe2c62ff7b/opentelemetry_sdk-1.30.0.tar.gz", hash = "sha256:c9287a9e4a7614b9946e933a67168450b9ab35f08797eb9bc77d998fa480fa18", size = 158633 } wheels = [ - { url = "https://files.pythonhosted.org/packages/d1/1d/512b86af21795fb463726665e2f61db77d384e8779fdcf4cb0ceec47866d/opentelemetry_sdk-1.29.0-py3-none-any.whl", hash = "sha256:173be3b5d3f8f7d671f20ea37056710217959e774e2749d984355d1f9391a30a", size = 118078 }, + { url = "https://files.pythonhosted.org/packages/97/28/64d781d6adc6bda2260067ce2902bd030cf45aec657e02e28c5b4480b976/opentelemetry_sdk-1.30.0-py3-none-any.whl", hash = "sha256:14fe7afc090caad881addb6926cec967129bd9260c4d33ae6a217359f6b61091", size = 118717 }, ] [[package]] name = "opentelemetry-semantic-conventions" -version = "0.50b0" +version = "0.51b0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "deprecated" }, { name = "opentelemetry-api" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/e7/4e/d7c7c91ff47cd96fe4095dd7231701aec7347426fd66872ff320d6cd1fcc/opentelemetry_semantic_conventions-0.50b0.tar.gz", hash = "sha256:02dc6dbcb62f082de9b877ff19a3f1ffaa3c306300fa53bfac761c4567c83d38", size = 100459 } +sdist = { url = "https://files.pythonhosted.org/packages/1e/c0/0f9ef4605fea7f2b83d55dd0b0d7aebe8feead247cd6facd232b30907b4f/opentelemetry_semantic_conventions-0.51b0.tar.gz", hash = "sha256:3fabf47f35d1fd9aebcdca7e6802d86bd5ebc3bc3408b7e3248dde6e87a18c47", size = 107191 } wheels = [ - { url = "https://files.pythonhosted.org/packages/da/fb/dc15fad105450a015e913cfa4f5c27b6a5f1bea8fb649f8cae11e699c8af/opentelemetry_semantic_conventions-0.50b0-py3-none-any.whl", hash = "sha256:e87efba8fdb67fb38113efea6a349531e75ed7ffc01562f65b802fcecb5e115e", size = 166602 }, + { url = "https://files.pythonhosted.org/packages/2e/75/d7bdbb6fd8630b4cafb883482b75c4fc276b6426619539d266e32ac53266/opentelemetry_semantic_conventions-0.51b0-py3-none-any.whl", hash = "sha256:fdc777359418e8d06c86012c3dc92c88a6453ba662e941593adb062e48c2eeae", size = 177416 }, ] [[package]] name = "outlines" -version = "0.1.13" +version = "0.1.14" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "airportsdata" }, { name = "cloudpickle" }, { name = "diskcache" }, + { name = "genson" }, { name = "interegular" }, { name = "jinja2" }, { name = "jsonschema" }, { name = "lark" }, { name = "nest-asyncio" }, { name = "numpy", version = 
"2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "numpy", version = "2.2.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "numpy", version = "2.2.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, { name = "outlines-core" }, { name = "pycountry" }, { name = "pydantic" }, @@ -1488,9 +1497,9 @@ dependencies = [ { name = "tqdm" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/35/bf/77024d3775cb4534cd26564f3a54c909a7c4d85bf59aab197ac97ce723f9/outlines-0.1.13.tar.gz", hash = "sha256:0233cb3ffae9cb6b01ad8d3c32b7d87e3f1cf7bdc7b28a0bc82cd3d277c09bca", size = 2490245 } +sdist = { url = "https://files.pythonhosted.org/packages/e9/54/dfa5fea9283174f5febbd0aae3f082c3e2c2888bedcc9fb6f07f174d9382/outlines-0.1.14.tar.gz", hash = "sha256:35f0c49fc7eedc64ec08e2d6fd434845cf63cc0c3fdeb5900ac7902d074e57be", size = 2492199 } wheels = [ - { url = "https://files.pythonhosted.org/packages/eb/2e/add66052a28ec22181560cab1dcaf82265a5f74533d76d336d1e0d7a17de/outlines-0.1.13-py3-none-any.whl", hash = "sha256:d34d36dd5001ffc24ae60588aa3afa27b58c8dfd3ba5f9b8ccb92298b91e5903", size = 87680 }, + { url = "https://files.pythonhosted.org/packages/bf/bb/a9639289a5d591a33585086aa3bf7722bc93e77769d050dd888db5d87395/outlines-0.1.14-py3-none-any.whl", hash = "sha256:a5090d50c368ed078051de25686a53032cd9f1702528afc646c3dae9482598ce", size = 88002 }, ] [[package]] @@ -1544,7 +1553,7 @@ version = "2.2.3" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "numpy", version = "2.2.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "numpy", version = "2.2.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, { name = "python-dateutil" }, { name = "pytz" }, { name = "tzdata" }, @@ -1602,7 +1611,7 @@ dependencies = [ { name = "accelerate" }, { name = "huggingface-hub" }, { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "numpy", version = "2.2.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "numpy", version = "2.2.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, { name = "packaging" }, { name = "psutil" }, { name = "pyyaml" }, @@ -1901,16 +1910,16 @@ wheels = [ [[package]] name = "pydantic" -version = "2.10.5" +version = "2.10.6" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "annotated-types" }, { name = "pydantic-core" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/6a/c7/ca334c2ef6f2e046b1144fe4bb2a5da8a4c574e7f2ebf7e16b34a6a2fa92/pydantic-2.10.5.tar.gz", hash = "sha256:278b38dbbaec562011d659ee05f63346951b3a248a6f3642e1bc68894ea2b4ff", size = 761287 } +sdist = { url = "https://files.pythonhosted.org/packages/b7/ae/d5220c5c52b158b1de7ca89fc5edb72f304a70a4c540c84c8844bf4008de/pydantic-2.10.6.tar.gz", hash = "sha256:ca5daa827cce33de7a42be142548b0096bf05a7e7b365aebfa5f8eeec7128236", size = 761681 } wheels = [ - { url = 
"https://files.pythonhosted.org/packages/58/26/82663c79010b28eddf29dcdd0ea723439535fa917fce5905885c0e9ba562/pydantic-2.10.5-py3-none-any.whl", hash = "sha256:4dd4e322dbe55472cb7ca7e73f4b63574eecccf2835ffa2af9021ce113c83c53", size = 431426 }, + { url = "https://files.pythonhosted.org/packages/f4/3c/8cc1cc84deffa6e25d2d0c688ebb80635dfdbf1dbea3e30c541c8cf4d860/pydantic-2.10.6-py3-none-any.whl", hash = "sha256:427d664bf0b8a2b34ff5dd0f5a18df00591adcee7198fbd71981054cef37b584", size = 431696 }, ] [[package]] @@ -2050,11 +2059,11 @@ wheels = [ [[package]] name = "pytz" -version = "2024.2" +version = "2025.1" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/3a/31/3c70bf7603cc2dca0f19bdc53b4537a797747a58875b552c8c413d963a3f/pytz-2024.2.tar.gz", hash = "sha256:2aa355083c50a0f93fa581709deac0c9ad65cca8a9e9beac660adcbd493c798a", size = 319692 } +sdist = { url = "https://files.pythonhosted.org/packages/5f/57/df1c9157c8d5a05117e455d66fd7cf6dbc46974f832b1058ed4856785d8a/pytz-2025.1.tar.gz", hash = "sha256:c2db42be2a2518b28e65f9207c4d05e6ff547d1efa4086469ef855e4ab70178e", size = 319617 } wheels = [ - { url = "https://files.pythonhosted.org/packages/11/c3/005fcca25ce078d2cc29fd559379817424e94885510568bc1bc53d7d5846/pytz-2024.2-py2.py3-none-any.whl", hash = "sha256:31c7c1817eb7fae7ca4b8c7ee50c72f93aa2dd863de768e1ef4245d426aa0725", size = 508002 }, + { url = "https://files.pythonhosted.org/packages/eb/38/ac33370d784287baa1c3d538978b5e2ea064d4c1b93ffbd12826c190dd10/pytz-2025.1-py2.py3-none-any.whl", hash = "sha256:89dd22dca55b46eac6eda23b2d72721bf1bdfef212645d81513ef5d03038de57", size = 507930 }, ] [[package]] @@ -2112,15 +2121,16 @@ wheels = [ [[package]] name = "referencing" -version = "0.35.1" +version = "0.36.2" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "attrs" }, { name = "rpds-py" }, + { name = "typing-extensions", marker = "python_full_version < '3.13'" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/99/5b/73ca1f8e72fff6fa52119dbd185f73a907b1989428917b24cff660129b6d/referencing-0.35.1.tar.gz", hash = "sha256:25b42124a6c8b632a425174f24087783efb348a6f1e0008e63cd4466fedf703c", size = 62991 } +sdist = { url = "https://files.pythonhosted.org/packages/2f/db/98b5c277be99dd18bfd91dd04e1b759cad18d1a338188c936e92f921c7e2/referencing-0.36.2.tar.gz", hash = "sha256:df2e89862cd09deabbdba16944cc3f10feb6b3e6f18e902f7cc25609a34775aa", size = 74744 } wheels = [ - { url = "https://files.pythonhosted.org/packages/b7/59/2056f61236782a2c86b33906c025d4f4a0b17be0161b63b70fd9e8775d36/referencing-0.35.1-py3-none-any.whl", hash = "sha256:eda6d3234d62814d1c64e305c1331c9a3a6132da475ab6382eaa997b21ee75de", size = 26684 }, + { url = "https://files.pythonhosted.org/packages/c1/b1/3baf80dc6d2b7bc27a95a67752d0208e410351e3feb4eb78de5f77454d8d/referencing-0.36.2-py3-none-any.whl", hash = "sha256:e8699adbbf8b5c7de96d8ffa0eb5c158b3beafce084968e2ea8bb08c6794dcd0", size = 26775 }, ] [[package]] @@ -2412,13 +2422,12 @@ name = "scipy" version = "1.15.1" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version >= '3.13'", - "python_full_version == '3.12.*'", + "python_full_version >= '3.12'", "python_full_version == '3.11.*'", "python_full_version == '3.10.*'", ] dependencies = [ - { name = "numpy", version = "2.2.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "numpy", version = "2.2.2", source = { registry = "https://pypi.org/simple" }, 
marker = "python_full_version >= '3.10'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/76/c6/8eb0654ba0c7d0bb1bf67bf8fbace101a8e4f250f7722371105e8b6f68fc/scipy-1.15.1.tar.gz", hash = "sha256:033a75ddad1463970c96a88063a1df87ccfddd526437136b6ee81ff0312ebdf6", size = 59407493 } wheels = [ @@ -2556,7 +2565,7 @@ dependencies = [ { name = "hf-transfer" }, { name = "loguru" }, { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "numpy", version = "2.2.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "numpy", version = "2.2.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, { name = "opentelemetry-api" }, { name = "opentelemetry-exporter-otlp" }, { name = "opentelemetry-instrumentation-grpc" }, @@ -2716,7 +2725,7 @@ wheels = [ [[package]] name = "torch" -version = "2.5.1" +version = "2.6.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "filelock" }, @@ -2733,32 +2742,36 @@ dependencies = [ { name = "nvidia-curand-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, { name = "nvidia-cusolver-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, { name = "nvidia-cusparse-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cusparselt-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, { name = "nvidia-nccl-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, { name = "nvidia-nvjitlink-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, { name = "nvidia-nvtx-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, { name = "setuptools", marker = "python_full_version >= '3.12'" }, { name = "sympy" }, - { name = "triton", marker = "python_full_version < '3.13' and platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "triton", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, { name = "typing-extensions" }, ] wheels = [ - { url = "https://files.pythonhosted.org/packages/2a/ef/834af4a885b31a0b32fff2d80e1e40f771e1566ea8ded55347502440786a/torch-2.5.1-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:71328e1bbe39d213b8721678f9dcac30dfc452a46d586f1d514a6aa0a99d4744", size = 906446312 }, - { url = "https://files.pythonhosted.org/packages/69/f0/46e74e0d145f43fa506cb336eaefb2d240547e4ce1f496e442711093ab25/torch-2.5.1-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:34bfa1a852e5714cbfa17f27c49d8ce35e1b7af5608c4bc6e81392c352dbc601", size = 91919522 }, - { url = "https://files.pythonhosted.org/packages/a5/13/1eb674c8efbd04d71e4a157ceba991904f633e009a584dd65dccbafbb648/torch-2.5.1-cp310-cp310-win_amd64.whl", hash = "sha256:32a037bd98a241df6c93e4c789b683335da76a2ac142c0973675b715102dc5fa", size = 203088048 }, - { url = "https://files.pythonhosted.org/packages/a9/9d/e0860474ee0ff8f6ef2c50ec8f71a250f38d78a9b9df9fd241ad3397a65b/torch-2.5.1-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:23d062bf70776a3d04dbe74db950db2a5245e1ba4f27208a87f0d743b0d06e86", size = 63877046 }, - { url = "https://files.pythonhosted.org/packages/d1/35/e8b2daf02ce933e4518e6f5682c72fd0ed66c15910ea1fb4168f442b71c4/torch-2.5.1-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:de5b7d6740c4b636ef4db92be922f0edc425b65ed78c5076c43c42d362a45457", size = 906474467 }, - { url = 
"https://files.pythonhosted.org/packages/40/04/bd91593a4ca178ece93ca55f27e2783aa524aaccbfda66831d59a054c31e/torch-2.5.1-cp311-cp311-manylinux2014_aarch64.whl", hash = "sha256:340ce0432cad0d37f5a31be666896e16788f1adf8ad7be481196b503dad675b9", size = 91919450 }, - { url = "https://files.pythonhosted.org/packages/0d/4a/e51420d46cfc90562e85af2fee912237c662ab31140ab179e49bd69401d6/torch-2.5.1-cp311-cp311-win_amd64.whl", hash = "sha256:603c52d2fe06433c18b747d25f5c333f9c1d58615620578c326d66f258686f9a", size = 203098237 }, - { url = "https://files.pythonhosted.org/packages/d0/db/5d9cbfbc7968d79c5c09a0bc0bc3735da079f2fd07cc10498a62b320a480/torch-2.5.1-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:31f8c39660962f9ae4eeec995e3049b5492eb7360dd4f07377658ef4d728fa4c", size = 63884466 }, - { url = "https://files.pythonhosted.org/packages/8b/5c/36c114d120bfe10f9323ed35061bc5878cc74f3f594003854b0ea298942f/torch-2.5.1-cp312-cp312-manylinux1_x86_64.whl", hash = "sha256:ed231a4b3a5952177fafb661213d690a72caaad97d5824dd4fc17ab9e15cec03", size = 906389343 }, - { url = "https://files.pythonhosted.org/packages/6d/69/d8ada8b6e0a4257556d5b4ddeb4345ea8eeaaef3c98b60d1cca197c7ad8e/torch-2.5.1-cp312-cp312-manylinux2014_aarch64.whl", hash = "sha256:3f4b7f10a247e0dcd7ea97dc2d3bfbfc90302ed36d7f3952b0008d0df264e697", size = 91811673 }, - { url = "https://files.pythonhosted.org/packages/5f/ba/607d013b55b9fd805db2a5c2662ec7551f1910b4eef39653eeaba182c5b2/torch-2.5.1-cp312-cp312-win_amd64.whl", hash = "sha256:73e58e78f7d220917c5dbfad1a40e09df9929d3b95d25e57d9f8558f84c9a11c", size = 203046841 }, - { url = "https://files.pythonhosted.org/packages/57/6c/bf52ff061da33deb9f94f4121fde7ff3058812cb7d2036c97bc167793bd1/torch-2.5.1-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:8c712df61101964eb11910a846514011f0b6f5920c55dbf567bff8a34163d5b1", size = 63858109 }, - { url = "https://files.pythonhosted.org/packages/69/72/20cb30f3b39a9face296491a86adb6ff8f1a47a897e4d14667e6cf89d5c3/torch-2.5.1-cp313-cp313-manylinux1_x86_64.whl", hash = "sha256:9b61edf3b4f6e3b0e0adda8b3960266b9009d02b37555971f4d1c8f7a05afed7", size = 906393265 }, - { url = "https://files.pythonhosted.org/packages/a9/18/81c399e8f4f1580d34bf99d827cb5fb5cf7a18a266bb5d30ca3ec2e89ba6/torch-2.5.1-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:1f3b7fb3cf7ab97fae52161423f81be8c6b8afac8d9760823fd623994581e1a3", size = 906479005 }, - { url = "https://files.pythonhosted.org/packages/5d/86/1c4b168d52cddb8d17952a7b5b25f69ef0da1fc34de1223d73d0d9db1801/torch-2.5.1-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:7974e3dce28b5a21fb554b73e1bc9072c25dde873fa00d54280861e7a009d7dc", size = 91846074 }, - { url = "https://files.pythonhosted.org/packages/76/49/4a0a8b19ce8f9bf32fcab4e863c7e2366f519f9826c84ca250567b11a014/torch-2.5.1-cp39-cp39-win_amd64.whl", hash = "sha256:46c817d3ea33696ad3b9df5e774dba2257e9a4cd3c4a3afbf92f6bb13ac5ce2d", size = 203000888 }, - { url = "https://files.pythonhosted.org/packages/25/07/3548a7cfcf69d0eccec2ee79ee3913f1cdaadb27b36946774db86729ee47/torch-2.5.1-cp39-none-macosx_11_0_arm64.whl", hash = "sha256:8046768b7f6d35b85d101b4b38cba8aa2f3cd51952bc4c06a49580f2ce682291", size = 63876023 }, + { url = "https://files.pythonhosted.org/packages/37/81/aa9ab58ec10264c1abe62c8b73f5086c3c558885d6beecebf699f0dbeaeb/torch-2.6.0-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:6860df13d9911ac158f4c44031609700e1eba07916fff62e21e6ffa0a9e01961", size = 766685561 }, + { url = 
"https://files.pythonhosted.org/packages/86/86/e661e229df2f5bfc6eab4c97deb1286d598bbeff31ab0cdb99b3c0d53c6f/torch-2.6.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:c4f103a49830ce4c7561ef4434cc7926e5a5fe4e5eb100c19ab36ea1e2b634ab", size = 95751887 }, + { url = "https://files.pythonhosted.org/packages/20/e0/5cb2f8493571f0a5a7273cd7078f191ac252a402b5fb9cb6091f14879109/torch-2.6.0-cp310-cp310-win_amd64.whl", hash = "sha256:56eeaf2ecac90da5d9e35f7f35eb286da82673ec3c582e310a8d1631a1c02341", size = 204165139 }, + { url = "https://files.pythonhosted.org/packages/e5/16/ea1b7842413a7b8a5aaa5e99e8eaf3da3183cc3ab345ad025a07ff636301/torch-2.6.0-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:09e06f9949e1a0518c5b09fe95295bc9661f219d9ecb6f9893e5123e10696628", size = 66520221 }, + { url = "https://files.pythonhosted.org/packages/78/a9/97cbbc97002fff0de394a2da2cdfa859481fdca36996d7bd845d50aa9d8d/torch-2.6.0-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:7979834102cd5b7a43cc64e87f2f3b14bd0e1458f06e9f88ffa386d07c7446e1", size = 766715424 }, + { url = "https://files.pythonhosted.org/packages/6d/fa/134ce8f8a7ea07f09588c9cc2cea0d69249efab977707cf67669431dcf5c/torch-2.6.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:ccbd0320411fe1a3b3fec7b4d3185aa7d0c52adac94480ab024b5c8f74a0bf1d", size = 95759416 }, + { url = "https://files.pythonhosted.org/packages/11/c5/2370d96b31eb1841c3a0883a492c15278a6718ccad61bb6a649c80d1d9eb/torch-2.6.0-cp311-cp311-win_amd64.whl", hash = "sha256:46763dcb051180ce1ed23d1891d9b1598e07d051ce4c9d14307029809c4d64f7", size = 204164970 }, + { url = "https://files.pythonhosted.org/packages/0b/fa/f33a4148c6fb46ca2a3f8de39c24d473822d5774d652b66ed9b1214da5f7/torch-2.6.0-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:94fc63b3b4bedd327af588696559f68c264440e2503cc9e6954019473d74ae21", size = 66530713 }, + { url = "https://files.pythonhosted.org/packages/e5/35/0c52d708144c2deb595cd22819a609f78fdd699b95ff6f0ebcd456e3c7c1/torch-2.6.0-cp312-cp312-manylinux1_x86_64.whl", hash = "sha256:2bb8987f3bb1ef2675897034402373ddfc8f5ef0e156e2d8cfc47cacafdda4a9", size = 766624563 }, + { url = "https://files.pythonhosted.org/packages/01/d6/455ab3fbb2c61c71c8842753b566012e1ed111e7a4c82e0e1c20d0c76b62/torch-2.6.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:b789069020c5588c70d5c2158ac0aa23fd24a028f34a8b4fcb8fcb4d7efcf5fb", size = 95607867 }, + { url = "https://files.pythonhosted.org/packages/18/cf/ae99bd066571656185be0d88ee70abc58467b76f2f7c8bfeb48735a71fe6/torch-2.6.0-cp312-cp312-win_amd64.whl", hash = "sha256:7e1448426d0ba3620408218b50aa6ada88aeae34f7a239ba5431f6c8774b1239", size = 204120469 }, + { url = "https://files.pythonhosted.org/packages/81/b4/605ae4173aa37fb5aa14605d100ff31f4f5d49f617928c9f486bb3aaec08/torch-2.6.0-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:9a610afe216a85a8b9bc9f8365ed561535c93e804c2a317ef7fabcc5deda0989", size = 66532538 }, + { url = "https://files.pythonhosted.org/packages/24/85/ead1349fc30fe5a32cadd947c91bda4a62fbfd7f8c34ee61f6398d38fb48/torch-2.6.0-cp313-cp313-manylinux1_x86_64.whl", hash = "sha256:4874a73507a300a5d089ceaff616a569e7bb7c613c56f37f63ec3ffac65259cf", size = 766626191 }, + { url = "https://files.pythonhosted.org/packages/dd/b0/26f06f9428b250d856f6d512413e9e800b78625f63801cbba13957432036/torch-2.6.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:a0d5e1b9874c1a6c25556840ab8920569a7a4137afa8a63a32cee0bc7d89bd4b", size = 95611439 }, + { url = 
"https://files.pythonhosted.org/packages/c2/9c/fc5224e9770c83faed3a087112d73147cd7c7bfb7557dcf9ad87e1dda163/torch-2.6.0-cp313-cp313-win_amd64.whl", hash = "sha256:510c73251bee9ba02ae1cb6c9d4ee0907b3ce6020e62784e2d7598e0cfa4d6cc", size = 204126475 }, + { url = "https://files.pythonhosted.org/packages/88/8b/d60c0491ab63634763be1537ad488694d316ddc4a20eaadd639cedc53971/torch-2.6.0-cp313-none-macosx_11_0_arm64.whl", hash = "sha256:ff96f4038f8af9f7ec4231710ed4549da1bdebad95923953a25045dcf6fd87e2", size = 66536783 }, + { url = "https://files.pythonhosted.org/packages/40/bb/feb5644baa621fd8e1e88bf51f6fa38ab3f985d472a764144ff4867ac1d6/torch-2.6.0-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:9ea955317cfcd3852b1402b62af258ce735c2edeee42ca9419b6bc889e5ae053", size = 766680961 }, + { url = "https://files.pythonhosted.org/packages/ee/11/08774a8198a33263947c59e04b8a0bf85a61a44e82100c46cf833bbce35e/torch-2.6.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:bb2c6c3e65049f081940f5ab15c9136c7de40d3f01192541c920a07c7c585b7e", size = 95782656 }, + { url = "https://files.pythonhosted.org/packages/c1/0d/56fb07032accbfebb4555638b6002ec5678d0942da85497e40f9405ab756/torch-2.6.0-cp39-cp39-win_amd64.whl", hash = "sha256:683410f97984103148e31b38a8631acf31c3034c020c0f4d26171e7626d8317a", size = 204061417 }, + { url = "https://files.pythonhosted.org/packages/b3/17/41f681b87290a1d2f1394f943e470f8b0b3c2987b7df8dc078d8831fce5b/torch-2.6.0-cp39-none-macosx_11_0_arm64.whl", hash = "sha256:265f70de5fd45b864d924b64be1797f86e76c8e48a02c2a3a6fc7ec247d2226c", size = 66520446 }, ] [[package]] @@ -2775,13 +2788,13 @@ wheels = [ [[package]] name = "transformers" -version = "4.48.2" +version = "4.48.3" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "filelock" }, { name = "huggingface-hub" }, { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "numpy", version = "2.2.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "numpy", version = "2.2.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, { name = "packaging" }, { name = "pyyaml" }, { name = "regex" }, @@ -2790,23 +2803,21 @@ dependencies = [ { name = "tokenizers" }, { name = "tqdm" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/c5/cf/1093586e09c8d889d2f6b8ffe6a1369e1e179eb7b8e732fc0f348a8fe58f/transformers-4.48.2.tar.gz", hash = "sha256:dcfb73473e61f22fb3366fe2471ed2e42779ecdd49527a1bdf1937574855d516", size = 8370945 } +sdist = { url = "https://files.pythonhosted.org/packages/e3/82/cebeb7af5e64440f1638f18c4ed0f89156d0eeaa6290d98da8ca93ac3872/transformers-4.48.3.tar.gz", hash = "sha256:a5e8f1e9a6430aa78215836be70cecd3f872d99eeda300f41ad6cc841724afdb", size = 8373458 } wheels = [ - { url = "https://files.pythonhosted.org/packages/bd/40/902c95a2a6f5d2d120c940ac4bd1f937c01035af529803c13d65ca33c2d1/transformers-4.48.2-py3-none-any.whl", hash = "sha256:493bc5b0268b116eff305edf6656367fc89cf570e7a9d5891369e04751db698a", size = 9667774 }, + { url = "https://files.pythonhosted.org/packages/b6/1a/efeecb8d83705f2f4beac98d46f2148c95ecd7babfb31b5c0f1e7017e83d/transformers-4.48.3-py3-none-any.whl", hash = "sha256:78697f990f5ef350c23b46bf86d5081ce96b49479ab180b2de7687267de8fd36", size = 9669412 }, ] [[package]] name = "triton" -version = "3.1.0" +version = "3.2.0" source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = 
"filelock", marker = "python_full_version < '3.13'" }, -] wheels = [ - { url = "https://files.pythonhosted.org/packages/98/29/69aa56dc0b2eb2602b553881e34243475ea2afd9699be042316842788ff5/triton-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6b0dd10a925263abbe9fa37dcde67a5e9b2383fc269fdf59f5657cac38c5d1d8", size = 209460013 }, - { url = "https://files.pythonhosted.org/packages/86/17/d9a5cf4fcf46291856d1e90762e36cbabd2a56c7265da0d1d9508c8e3943/triton-3.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0f34f6e7885d1bf0eaaf7ba875a5f0ce6f3c13ba98f9503651c1e6dc6757ed5c", size = 209506424 }, - { url = "https://files.pythonhosted.org/packages/78/eb/65f5ba83c2a123f6498a3097746607e5b2f16add29e36765305e4ac7fdd8/triton-3.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c8182f42fd8080a7d39d666814fa36c5e30cc00ea7eeeb1a2983dbb4c99a0fdc", size = 209551444 }, - { url = "https://files.pythonhosted.org/packages/c4/69/57e0fed438d547524e08bfedc587078314176ad1c15c8be904d3f03149ec/triton-3.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aafa9a20cd0d9fee523cd4504aa7131807a864cd77dcf6efe7e981f18b8c6c11", size = 209460480 }, + { url = "https://files.pythonhosted.org/packages/01/65/3ffa90e158a2c82f0716eee8d26a725d241549b7d7aaf7e4f44ac03ebd89/triton-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b3e54983cd51875855da7c68ec05c05cf8bb08df361b1d5b69e05e40b0c9bd62", size = 253090354 }, + { url = "https://files.pythonhosted.org/packages/a7/2e/757d2280d4fefe7d33af7615124e7e298ae7b8e3bc4446cdb8e88b0f9bab/triton-3.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8009a1fb093ee8546495e96731336a33fb8856a38e45bb4ab6affd6dbc3ba220", size = 253157636 }, + { url = "https://files.pythonhosted.org/packages/06/00/59500052cb1cf8cf5316be93598946bc451f14072c6ff256904428eaf03c/triton-3.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8d9b215efc1c26fa7eefb9a157915c92d52e000d2bf83e5f69704047e63f125c", size = 253159365 }, + { url = "https://files.pythonhosted.org/packages/c7/30/37a3384d1e2e9320331baca41e835e90a3767303642c7a80d4510152cbcf/triton-3.2.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e5dfa23ba84541d7c0a531dfce76d8bcd19159d50a4a8b14ad01e91734a5c1b0", size = 253154278 }, + { url = "https://files.pythonhosted.org/packages/bc/74/9f12bdedeb110242d8bb1bd621f6605e753ee0cbf73cf7f3a62b8173f190/triton-3.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:30ceed0eff2c4a73b14eb63e052992f44bbdf175f3fad21e1ac8097a772de7ee", size = 253057866 }, ] [[package]] @@ -2826,11 +2837,11 @@ wheels = [ [[package]] name = "types-protobuf" -version = "5.29.1.20241207" +version = "5.29.1.20250208" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/70/89/b661a447139f665ccea8e39bfdd52a92f803df4b5de0e6001a3537feaacb/types_protobuf-5.29.1.20241207.tar.gz", hash = "sha256:2ebcadb8ab3ef2e3e2f067e0882906d64ba0dc65fc5b0fd7a8b692315b4a0be9", size = 59190 } +sdist = { url = "https://files.pythonhosted.org/packages/ff/75/8effaebae86ae9e489d4d453b30c6da728e8a9267a261418d476c5c2fbe6/types_protobuf-5.29.1.20250208.tar.gz", hash = "sha256:c1acd6a59ab554dbe09b5d1fa7dd701e2fcfb2212937a3af1c03b736060b792a", size = 59330 } wheels = [ - { url = 
"https://files.pythonhosted.org/packages/7e/6e/cdf152187019d6f6d04066b23e48659d961b527e9c6d43b48459d160e332/types_protobuf-5.29.1.20241207-py3-none-any.whl", hash = "sha256:92893c42083e9b718c678badc0af7a9a1307b92afe1599e5cba5f3d35b668b2f", size = 73902 }, + { url = "https://files.pythonhosted.org/packages/96/e9/095c36efdc9dbb1df75ce42f37c0de6a6879aad937a08f0b97942b7c5968/types_protobuf-5.29.1.20250208-py3-none-any.whl", hash = "sha256:c5f8bfb4afdc1b5cbca1848f2c8b361a2090add7401f410b22b599ef647bf483", size = 73925 }, ] [[package]] @@ -2844,11 +2855,11 @@ wheels = [ [[package]] name = "tzdata" -version = "2024.2" +version = "2025.1" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/e1/34/943888654477a574a86a98e9896bae89c7aa15078ec29f490fef2f1e5384/tzdata-2024.2.tar.gz", hash = "sha256:7d85cc416e9382e69095b7bdf4afd9e3880418a2413feec7069d533d6b4e31cc", size = 193282 } +sdist = { url = "https://files.pythonhosted.org/packages/43/0f/fa4723f22942480be4ca9527bbde8d43f6c3f2fe8412f00e7f5f6746bc8b/tzdata-2025.1.tar.gz", hash = "sha256:24894909e88cdb28bd1636c6887801df64cb485bd593f2fd83ef29075a81d694", size = 194950 } wheels = [ - { url = "https://files.pythonhosted.org/packages/a6/ab/7e5f53c3b9d14972843a647d8d7a853969a58aecc7559cb3267302c94774/tzdata-2024.2-py2.py3-none-any.whl", hash = "sha256:a48093786cdcde33cad18c2555e8532f34422074448fbc874186f0abd79565cd", size = 346586 }, + { url = "https://files.pythonhosted.org/packages/0f/dd/84f10e23edd882c6f968c21c2434fe67bd4a528967067515feca9e611e5e/tzdata-2025.1-py2.py3-none-any.whl", hash = "sha256:7e127113816800496f027041c570f50bcd464a020098a3b6b199517772303639", size = 346762 }, ] [[package]] diff --git a/tgi-entrypoint.sh b/tgi-entrypoint.sh index 94ea9436c..cd551ed5e 100755 --- a/tgi-entrypoint.sh +++ b/tgi-entrypoint.sh @@ -2,5 +2,5 @@ ldconfig 2>/dev/null || echo 'unable to refresh ld cache, not a big deal in most cases' -source ./server/.venv/bin/activate +source ./.venv/bin/activate exec text-generation-launcher $@ From 6df0fc0b55cf9c8e8124e7a3e3d8ed96a9aeb5f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Fri, 14 Feb 2025 11:33:49 +0100 Subject: [PATCH 086/183] Support sigmoid scoring function in GPTQ-MoE (#3017) --- .../layers/moe/gptq_marlin.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/server/text_generation_server/layers/moe/gptq_marlin.py b/server/text_generation_server/layers/moe/gptq_marlin.py index 75c076ab2..d1ce4f3e0 100644 --- a/server/text_generation_server/layers/moe/gptq_marlin.py +++ b/server/text_generation_server/layers/moe/gptq_marlin.py @@ -74,8 +74,10 @@ class GPTQMarlinSparseMoELayer(nn.Module): scoring_func: Optional[str] = None, e_score_correction_bias: Optional[float] = None, ): - assert scoring_func == "softmax", f"scoring func {scoring_func} is not handled" - assert e_score_correction_bias is None, "scoring correction bias is not handled" + assert scoring_func in ( + "sigmoid", + "softmax", + ), f"scoring func {scoring_func} is not handled" super().__init__() if not ( @@ -98,6 +100,8 @@ class GPTQMarlinSparseMoELayer(nn.Module): self.topk = topk self.topk_group = topk_group self.renormalize = renormalize + self.scoring_func = scoring_func + self.e_score_correction_bias = e_score_correction_bias self.gate_up_proj = _load_expert_multi_weights_col( prefix=prefix, @@ -139,6 +143,8 @@ class GPTQMarlinSparseMoELayer(nn.Module): num_expert_group=self.n_expert_group, topk_group=self.topk_group, 
num_bits=self.bits, + scoring_func=self.scoring_func, + e_score_correction_bias=self.e_score_correction_bias, ) @@ -256,6 +262,8 @@ def fused_marlin_moe( num_expert_group: Optional[int] = None, custom_routing_function: Optional[Callable] = None, topk_group: Optional[int] = None, + scoring_func: Optional[str] = None, + e_score_correction_bias: Optional[float] = None, ) -> torch.Tensor: """ This function computes a Mixture of Experts (MoE) layer using two sets of @@ -307,6 +315,8 @@ def fused_marlin_moe( renormalize=renormalize, num_expert_group=num_expert_group, topk_group=topk_group, + scoring_func=scoring_func, + e_score_correction_bias=e_score_correction_bias, ) elif custom_routing_function is None: topk_weights, topk_ids = moe_kernels.fused_topk( From cfd4fbb479835e80ec06e37f3717b64a6792a222 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Fri, 14 Feb 2025 13:40:57 +0100 Subject: [PATCH 087/183] [Backend] Add Llamacpp backend (#2975) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add llamacpp backend Signed-off-by: Adrien Gallouët * Get rid of llama_batch_get_one() Signed-off-by: Adrien Gallouët * Use max_batch_total_tokens Signed-off-by: Adrien Gallouët * Handle max_batch_size Signed-off-by: Adrien Gallouët * Add some input validation checks Signed-off-by: Adrien Gallouët * Handle ctx args & fix sampling Signed-off-by: Adrien Gallouët * Add GPU args Signed-off-by: Adrien Gallouët * Add --defrag-threshold Signed-off-by: Adrien Gallouët * Add a stupid batch mechanism Signed-off-by: Adrien Gallouët * Cleanup Signed-off-by: Adrien Gallouët * Add --numa Signed-off-by: Adrien Gallouët * Fix args Signed-off-by: Adrien Gallouët * Enable flash attention by default Signed-off-by: Adrien Gallouët * Add --offload-kqv Signed-off-by: Adrien Gallouët * Fix batch_pos Signed-off-by: Adrien Gallouët * backend(llama): add CUDA Dockerfile_llamacpp for now * Only export the latest logits Signed-off-by: Adrien Gallouët * Output real logprobs Signed-off-by: Adrien Gallouët * Fix batching Signed-off-by: Adrien Gallouët * Fix seq iterations Signed-off-by: Adrien Gallouët * Auto-detect n_threads when not provided Signed-off-by: Adrien Gallouët * Clear request cache after completion Signed-off-by: Adrien Gallouët * Remove warmup Signed-off-by: Adrien Gallouët * Cleanup Signed-off-by: Adrien Gallouët * backend(llama): add CUDA architectures build argument for Dockerfile * Add specific args for batch Signed-off-by: Adrien Gallouët * Add --type-v & --type-k Signed-off-by: Adrien Gallouët * Bump llamacpp to b4623 Signed-off-by: Adrien Gallouët * Disable graceful shutdown in debug mode Signed-off-by: Adrien Gallouët * Update Dockerfile_llamacpp Signed-off-by: Adrien Gallouët * Cleanup Dockerfile Signed-off-by: Adrien Gallouët * Update Cargo.lock Signed-off-by: Adrien Gallouët * Update args Signed-off-by: Adrien Gallouët * Simplify batching logic Signed-off-by: Adrien Gallouët * Set TGI_LLAMA_PKG_CUDA from CUDA_VERSION Signed-off-by: Adrien Gallouët * Rename bindings Signed-off-by: Adrien Gallouët * Remove n_ctx Signed-off-by: Adrien Gallouët * Make max_batch_total_tokens optional Signed-off-by: Adrien Gallouët * Ensure all samplers are freed on error Signed-off-by: Adrien Gallouët * Initialize penalty_last_n with llamacpp default value Signed-off-by: Adrien Gallouët * Cleanup Signed-off-by: Adrien Gallouët * Improve default settings Signed-off-by: Adrien Gallouët * Add doc Signed-off-by: Adrien Gallouët * Update docs Signed-off-by: Adrien Gallouët * 
Thanks clippy Signed-off-by: Adrien Gallouët * Thanks cargo fmt Signed-off-by: Adrien Gallouët * Update docs Signed-off-by: Adrien Gallouët * Do not use HOSTNAME env Signed-off-by: Adrien Gallouët * Bump llama.cpp & cuda Signed-off-by: Adrien Gallouët * Fix requirements.txt Signed-off-by: Adrien Gallouët * Fix fmt Signed-off-by: Adrien Gallouët * Enable KQV offload by default Signed-off-by: Adrien Gallouët * Remove Ngrok tunneling Signed-off-by: Adrien Gallouët * Remove .cargo/config.toml Signed-off-by: Adrien Gallouët * Fix Dockerfile Signed-off-by: Adrien Gallouët * Add missing cuda prefix Signed-off-by: Adrien Gallouët * Handle custom llama.cpp dir Signed-off-by: Adrien Gallouët * Cleanup Signed-off-by: Adrien Gallouët * Add README.md Signed-off-by: Adrien Gallouët * Add HF transfer Signed-off-by: Adrien Gallouët * Fix bool args Signed-off-by: Adrien Gallouët * Update doc Signed-off-by: Adrien Gallouët * Update doc Signed-off-by: Adrien Gallouët --------- Signed-off-by: Adrien Gallouët Co-authored-by: Morgan Funtowicz --- Cargo.lock | 927 +++++++++++++++------------ Cargo.toml | 1 + Dockerfile_llamacpp | 76 +++ backends/llamacpp/Cargo.toml | 21 + backends/llamacpp/README.md | 24 + backends/llamacpp/build.rs | 48 ++ backends/llamacpp/requirements.txt | 3 + backends/llamacpp/src/backend.rs | 679 ++++++++++++++++++++ backends/llamacpp/src/main.rs | 284 ++++++++ docs/source/_toctree.yml | 2 + docs/source/backends/llamacpp.md | 120 ++++ docs/source/multi_backend_support.md | 2 + 12 files changed, 1765 insertions(+), 422 deletions(-) create mode 100644 Dockerfile_llamacpp create mode 100644 backends/llamacpp/Cargo.toml create mode 100644 backends/llamacpp/README.md create mode 100644 backends/llamacpp/build.rs create mode 100644 backends/llamacpp/requirements.txt create mode 100644 backends/llamacpp/src/backend.rs create mode 100644 backends/llamacpp/src/main.rs create mode 100644 docs/source/backends/llamacpp.md diff --git a/Cargo.lock b/Cargo.lock index 915de0d58..547cff9b7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -24,7 +24,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011" dependencies = [ "cfg-if", - "getrandom", + "getrandom 0.2.15", "once_cell", "serde", "version_check", @@ -48,9 +48,9 @@ checksum = "4aa90d7ce82d4be67b64039a3d588d38dbcc6736577de4a847025ce5b0c468d1" [[package]] name = "allocator-api2" -version = "0.2.20" +version = "0.2.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "45862d1c77f2228b9e10bc609d5bc203d86ebc9b87ad8d5d5167a6c9abf739d9" +checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923" [[package]] name = "android-tzdata" @@ -108,19 +108,20 @@ dependencies = [ [[package]] name = "anstyle-wincon" -version = "3.0.6" +version = "3.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2109dbce0e72be3ec00bed26e6a7479ca384ad226efdd66db8fa2e3a38c83125" +checksum = "ca3534e77181a9cc07539ad51f2141fe32f6c3ffd4df76db8ad92346b003ae4e" dependencies = [ "anstyle", + "once_cell", "windows-sys 0.59.0", ] [[package]] name = "anyhow" -version = "1.0.93" +version = "1.0.95" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c95c10ba0b00a02636238b814946408b1322d5ac4760326e6fb8ec956d85775" +checksum = "34ac096ce696dc2fcabef30516bb13c0a68a11d30131d3df6f04711467681b04" [[package]] name = "arbitrary" @@ -142,7 +143,7 @@ checksum = 
"0ae92a5119aa49cdbcf6b9f893fe4e1d98b04ccbf82ee0584ad948a44a734dea" dependencies = [ "proc-macro2", "quote", - "syn 2.0.89", + "syn 2.0.96", ] [[package]] @@ -181,18 +182,18 @@ checksum = "c7c24de15d275a1ecfd47a380fb4d5ec9bfe0933f309ed5e705b775596a3574d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.89", + "syn 2.0.96", ] [[package]] name = "async-trait" -version = "0.1.83" +version = "0.1.85" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "721cae7de5c34fbb2acd27e21e6d2cf7b886dce0c27388d46c4e6c47ea4318dd" +checksum = "3f934833b4b7233644e5848f235df3f57ed8c80f1528a26c3dfa13d2147fa056" dependencies = [ "proc-macro2", "quote", - "syn 2.0.89", + "syn 2.0.96", ] [[package]] @@ -266,28 +267,26 @@ dependencies = [ [[package]] name = "aws-lc-rs" -version = "1.11.0" +version = "1.12.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fe7c2840b66236045acd2607d5866e274380afd87ef99d6226e961e2cb47df45" +checksum = "4c2b7ddaa2c56a367ad27a094ad8ef4faacf8a617c2575acb2ba88949df999ca" dependencies = [ "aws-lc-sys", - "mirai-annotations", "paste", "zeroize", ] [[package]] name = "aws-lc-sys" -version = "0.23.0" +version = "0.25.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad3a619a9de81e1d7de1f1186dcba4506ed661a0e483d84410fdef0ee87b2f96" +checksum = "71b2ddd3ada61a305e1d8bb6c005d1eaa7d14d903681edfc400406d523a9b491" dependencies = [ - "bindgen", + "bindgen 0.69.5", "cc", "cmake", "dunce", "fs_extra", - "libc", "paste", ] @@ -304,7 +303,7 @@ dependencies = [ "futures-util", "http 0.2.12", "http-body 0.4.6", - "hyper 0.14.31", + "hyper 0.14.32", "itoa", "matchit", "memchr", @@ -333,10 +332,10 @@ dependencies = [ "axum-core 0.4.5", "bytes", "futures-util", - "http 1.1.0", + "http 1.2.0", "http-body 1.0.1", "http-body-util", - "hyper 1.5.1", + "hyper 1.6.0", "hyper-util", "itoa", "matchit", @@ -351,7 +350,7 @@ dependencies = [ "serde_urlencoded", "sync_wrapper 1.0.2", "tokio", - "tower 0.5.1", + "tower 0.5.2", "tower-layer", "tower-service", "tracing", @@ -383,7 +382,7 @@ dependencies = [ "async-trait", "bytes", "futures-util", - "http 1.1.0", + "http 1.2.0", "http-body 1.0.1", "http-body-util", "mime", @@ -404,7 +403,7 @@ dependencies = [ "axum 0.7.9", "futures-core", "futures-util", - "http 1.1.0", + "http 1.2.0", "opentelemetry 0.21.0", "pin-project-lite", "tower 0.4.13", @@ -452,7 +451,7 @@ version = "0.69.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "271383c67ccabffb7381723dea0672a673f292304fcb45c01cc648c7a8d58088" dependencies = [ - "bitflags 2.6.0", + "bitflags 2.8.0", "cexpr", "clang-sys", "itertools 0.12.1", @@ -463,12 +462,32 @@ dependencies = [ "proc-macro2", "quote", "regex", - "rustc-hash", + "rustc-hash 1.1.0", "shlex", - "syn 2.0.89", + "syn 2.0.96", "which", ] +[[package]] +name = "bindgen" +version = "0.71.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f58bf3d7db68cfbac37cfc485a8d711e87e064c3d0fe0435b92f7a407f9d6b3" +dependencies = [ + "bitflags 2.8.0", + "cexpr", + "clang-sys", + "itertools 0.13.0", + "log", + "prettyplease", + "proc-macro2", + "quote", + "regex", + "rustc-hash 2.1.0", + "shlex", + "syn 2.0.96", +] + [[package]] name = "bit-set" version = "0.8.0" @@ -498,9 +517,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "bitflags" -version = "2.6.0" +version = "2.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de" +checksum = "8f68f53c83ab957f72c32642f3868eec03eb974d1fb82e453128456482613d36" [[package]] name = "bitstream-io" @@ -531,9 +550,9 @@ checksum = "c360505aed52b7ec96a3636c3f039d99103c37d1d9b4f7a8c743d3ea9ffcd03b" [[package]] name = "bumpalo" -version = "3.16.0" +version = "3.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c" +checksum = "1628fb46dfa0b37568d12e5edd512553eccf6a22a78e8bde00bb4aed84d5bdbf" [[package]] name = "bytecount" @@ -543,9 +562,9 @@ checksum = "5ce89b21cab1437276d2650d57e971f9d548a2d9037cc231abdc0562b97498ce" [[package]] name = "bytemuck" -version = "1.20.0" +version = "1.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b37c88a63ffd85d15b406896cc343916d7cf57838a847b3a6f2ca5d39a5695a" +checksum = "ef657dfab802224e671f5818e9a4935f9b1957ed18e58292690cc39e7a4092a3" [[package]] name = "byteorder" @@ -561,9 +580,9 @@ checksum = "8f1fe948ff07f4bd06c30984e69f5b4899c516a3ef74f34df92a2df2ab535495" [[package]] name = "bytes" -version = "1.8.0" +version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ac0150caa2ae65ca5bd83f25c7de183dea78d4d366469f148435e2acfbad0da" +checksum = "325918d6fe32f23b19878fe4b34794ae41fc19ddbe53b10571a4874d44ffd39b" [[package]] name = "camino" @@ -576,9 +595,9 @@ dependencies = [ [[package]] name = "cargo-platform" -version = "0.1.8" +version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "24b1f0365a6c6bb4020cd05806fd0d33c44d38046b8bd7f0e40814b9763cabfc" +checksum = "e35af189006b9c0f00a064685c727031e3ed2d8020f7ba284d78cc2671bd36ea" dependencies = [ "serde", ] @@ -594,7 +613,7 @@ dependencies = [ "semver", "serde", "serde_json", - "thiserror", + "thiserror 1.0.69", ] [[package]] @@ -620,9 +639,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.2.1" +version = "1.2.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd9de9f2205d5ef3fd67e685b0df337994ddd4495e2a28d185500d0e1edfea47" +checksum = "13208fcbb66eaeffe09b99fffbe1af420f00a7b35aa99ad683dfc1aa76145229" dependencies = [ "jobserver", "libc", @@ -704,9 +723,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.21" +version = "4.5.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fb3b4b9e5a7c7514dfa52869339ee98b3156b0bfb4e8a77c4ff4babb64b1604f" +checksum = "769b0145982b4b48713e01ec42d61614425f27b7058bda7180a3a41f30104796" dependencies = [ "clap_builder", "clap_derive", @@ -714,9 +733,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.21" +version = "4.5.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b17a95aa67cc7b5ebd32aa5370189aa0d79069ef1c64ce893bd30fb24bff20ec" +checksum = "1b26884eb4b57140e4d2d93652abfa49498b938b3c9179f9fc487b0acc3edad7" dependencies = [ "anstream", "anstyle", @@ -726,27 +745,27 @@ dependencies = [ [[package]] name = "clap_derive" -version = "4.5.18" +version = "4.5.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ac6a0c7b1a9e9a5186361f67dfa1b88213572f427fb9ab038efb2bd8c582dab" +checksum = "54b755194d6389280185988721fffba69495eed5ee9feeee9a599b53db80318c" dependencies = [ "heck 0.5.0", "proc-macro2", "quote", - "syn 2.0.89", + "syn 2.0.96", ] [[package]] name = "clap_lex" -version = "0.7.3" +version = "0.7.4" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "afb84c814227b90d6895e01398aee0d8033c00e7466aca416fb6a8e0eb19d8a7" +checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6" [[package]] name = "cmake" -version = "0.1.51" +version = "0.1.53" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fb1e43aa7fd152b1f968787f7dbcdeb306d1867ff373c69955211876c053f91a" +checksum = "e24a03c8b52922d68a1589ad61032f2c1aa5a8158d2aa0d93c6e9534944bbad6" dependencies = [ "cc", ] @@ -775,9 +794,9 @@ checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990" [[package]] name = "compact_str" -version = "0.8.0" +version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6050c3a16ddab2e412160b31f2c871015704239bca62f72f6e5f0be631d3f644" +checksum = "3b79c4069c6cad78e2e0cdfcbd26275770669fb39fd308a752dc110e83b9af32" dependencies = [ "castaway", "cfg-if", @@ -789,15 +808,15 @@ dependencies = [ [[package]] name = "console" -version = "0.15.8" +version = "0.15.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0e1f83fc076bd6dd27517eacdf25fef6c4dfe5f1d7448bafaaf3a26f13b5e4eb" +checksum = "ea3c6ecd8059b57859df5c69830340ed3c41d30e3da0c1cbed90a96ac853041b" dependencies = [ "encode_unicode", - "lazy_static", "libc", - "unicode-width 0.1.14", - "windows-sys 0.52.0", + "once_cell", + "unicode-width 0.2.0", + "windows-sys 0.59.0", ] [[package]] @@ -828,9 +847,9 @@ checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" [[package]] name = "cpufeatures" -version = "0.2.16" +version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "16b80225097f2e5ae4e7179dd2266824648f3e2f49d9134d584b76389d31c4c3" +checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280" dependencies = [ "libc", ] @@ -882,18 +901,18 @@ dependencies = [ [[package]] name = "crossbeam-channel" -version = "0.5.13" +version = "0.5.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "33480d6946193aa8033910124896ca395333cae7e2d1113d1fef6c3272217df2" +checksum = "06ba6d68e24814cb8de6bb986db8222d3a027d15872cabc0d18817bc3c0e4471" dependencies = [ "crossbeam-utils", ] [[package]] name = "crossbeam-deque" -version = "0.8.5" +version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d" +checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" dependencies = [ "crossbeam-epoch", "crossbeam-utils", @@ -910,9 +929,9 @@ dependencies = [ [[package]] name = "crossbeam-utils" -version = "0.8.20" +version = "0.8.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22ec99545bb0ed0ea7bb9b8e1e9122ea386ff8a48c0922e43f36d45ab09e0e80" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" [[package]] name = "crossterm" @@ -920,7 +939,7 @@ version = "0.28.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "829d955a0bb380ef178a640b91779e3987da38c9aea133b20614cfed8cdea9c6" dependencies = [ - "bitflags 2.6.0", + "bitflags 2.8.0", "crossterm_winapi", "mio", "parking_lot", @@ -941,9 +960,9 @@ dependencies = [ [[package]] name = "crunchy" -version = "0.2.2" +version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" +checksum = 
"43da5946c66ffcc7745f48db692ffbb10a83bfe0afd96235c5c2a4fb23994929" [[package]] name = "crypto-common" @@ -988,46 +1007,61 @@ dependencies = [ [[package]] name = "cxx" -version = "1.0.130" +version = "1.0.137" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "23c042a0ba58aaff55299632834d1ea53ceff73d62373f62c9ae60890ad1b942" +checksum = "0fc894913dccfed0f84106062c284fa021c3ba70cb1d78797d6f5165d4492e45" dependencies = [ "cc", + "cxxbridge-cmd", "cxxbridge-flags", "cxxbridge-macro", + "foldhash", "link-cplusplus", ] [[package]] name = "cxx-build" -version = "1.0.130" +version = "1.0.137" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "45dc1c88d0fdac57518a9b1f6c4f4fb2aca8f3c30c0d03d7d8518b47ca0bcea6" +checksum = "503b2bfb6b3e8ce7f95d865a67419451832083d3186958290cee6c53e39dfcfe" dependencies = [ "cc", "codespan-reporting", "proc-macro2", "quote", "scratch", - "syn 2.0.89", + "syn 2.0.96", +] + +[[package]] +name = "cxxbridge-cmd" +version = "1.0.137" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e0d2cb64a95b4b5a381971482235c4db2e0208302a962acdbe314db03cbbe2fb" +dependencies = [ + "clap 4.5.27", + "codespan-reporting", + "proc-macro2", + "quote", + "syn 2.0.96", ] [[package]] name = "cxxbridge-flags" -version = "1.0.130" +version = "1.0.137" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aa7ed7d30b289e2592cc55bc2ccd89803a63c913e008e6eb59f06cddf45bb52f" +checksum = "5f797b0206463c9c2a68ed605ab28892cca784f1ef066050f4942e3de26ad885" [[package]] name = "cxxbridge-macro" -version = "1.0.130" +version = "1.0.137" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b8c465d22de46b851c04630a5fc749a26005b263632ed2e0d9cc81518ead78d" +checksum = "e79010a2093848e65a3e0f7062d3f02fb2ef27f866416dfe436fccfa73d3bb59" dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.89", + "syn 2.0.96", ] [[package]] @@ -1051,7 +1085,7 @@ dependencies = [ "proc-macro2", "quote", "strsim", - "syn 2.0.89", + "syn 2.0.96", ] [[package]] @@ -1062,7 +1096,7 @@ checksum = "d336a2a514f6ccccaa3e09b02d41d35330c07ddf03a62165fcec10bb561c7806" dependencies = [ "darling_core", "quote", - "syn 2.0.89", + "syn 2.0.96", ] [[package]] @@ -1092,7 +1126,7 @@ dependencies = [ "darling", "proc-macro2", "quote", - "syn 2.0.89", + "syn 2.0.96", ] [[package]] @@ -1102,15 +1136,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ab63b0e2bf4d5928aff72e83a7dace85d7bba5fe12dcc3c5a572d78caffd3f3c" dependencies = [ "derive_builder_core", - "syn 2.0.89", + "syn 2.0.96", ] -[[package]] -name = "diff" -version = "0.1.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "56254986775e3233ffa9c4d7d3faaf6d36a2c09d30b20687e9f88bc8bafc16c8" - [[package]] name = "digest" version = "0.10.7" @@ -1150,7 +1178,7 @@ checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.89", + "syn 2.0.96", ] [[package]] @@ -1161,9 +1189,9 @@ checksum = "92773504d58c093f6de2459af4af33faa518c13451eb8f2b5698ed3d36e7c813" [[package]] name = "easy-cast" -version = "0.5.2" +version = "0.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "10936778145f3bea71fd9bf61332cce28c28e96a380714f7ab34838b80733fd6" +checksum = "72852736692ec862655eca398c9bb1b476161b563c9f80f45f4808b9629750d6" dependencies = [ "libm", ] @@ -1185,9 +1213,9 @@ dependencies = [ [[package]] name = 
"encode_unicode" -version = "0.3.6" +version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a357d28ed41a50f9c765dbfe56cbc04a64e53e5fc58ba79fbc34c10ef3df831f" +checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0" [[package]] name = "encoding_rs" @@ -1206,12 +1234,12 @@ checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" [[package]] name = "errno" -version = "0.3.9" +version = "0.3.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "534c5cf6194dfab3db3242765c03bbe257cf92f22b38f6bc0c58d59108a820ba" +checksum = "33d852cb9b869c2a9b3df2f71a3074817f01e1844f839a144f5fcef059a4eb5d" dependencies = [ "libc", - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] @@ -1251,15 +1279,15 @@ dependencies = [ [[package]] name = "fastrand" -version = "2.2.0" +version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "486f806e73c5707928240ddc295403b1b93c96a02038563881c4a2fd84b81ac4" +checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" [[package]] name = "fdeflate" -version = "0.3.6" +version = "0.3.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07c6f4c64c1d33a3111c4466f7365ebdcc37c5bd1ea0d62aae2e3d722aacbedb" +checksum = "1e6853b52649d4ac5c0bd02320cddc5ba956bdb407c4b75a2c6b75bf51500f8c" dependencies = [ "simd-adler32", ] @@ -1311,9 +1339,9 @@ checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" [[package]] name = "foldhash" -version = "0.1.3" +version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f81ec6369c545a7d40e4589b5597581fa1c441fe1cce96dd1de43159910a36a2" +checksum = "a0d2fde1f7b3d48b8395d5f2de76c18a528bd6a9cdde438df747bfcba3e05d6f" [[package]] name = "foreign-types" @@ -1411,7 +1439,7 @@ checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" dependencies = [ "proc-macro2", "quote", - "syn 2.0.89", + "syn 2.0.96", ] [[package]] @@ -1471,7 +1499,19 @@ checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7" dependencies = [ "cfg-if", "libc", - "wasi", + "wasi 0.11.0+wasi-snapshot-preview1", +] + +[[package]] +name = "getrandom" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43a49c392881ce6d5c3b8cb70f98717b7c07aabbdff06687b9030dbfbe2725f8" +dependencies = [ + "cfg-if", + "libc", + "wasi 0.13.3+wasi-0.2.2", + "windows-targets 0.52.6", ] [[package]] @@ -1492,9 +1532,9 @@ checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f" [[package]] name = "glob" -version = "0.3.1" +version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" +checksum = "a8d1add55171497b4705a648c6b583acafb01d58050a51727785f0b2c8e0a2b2" [[package]] name = "grpc-metadata" @@ -1518,7 +1558,7 @@ dependencies = [ "futures-sink", "futures-util", "http 0.2.12", - "indexmap 2.6.0", + "indexmap 2.7.1", "slab", "tokio", "tokio-util", @@ -1536,8 +1576,8 @@ dependencies = [ "fnv", "futures-core", "futures-sink", - "http 1.1.0", - "indexmap 2.6.0", + "http 1.2.0", + "indexmap 2.7.1", "slab", "tokio", "tokio-util", @@ -1577,9 +1617,9 @@ dependencies = [ [[package]] name = "hashbrown" -version = "0.15.1" +version = "0.15.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"3a9bfc1af68b1726ea47d3d5109de126281def866b33970e10fbab11b5dafab3" +checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289" dependencies = [ "allocator-api2", "equivalent", @@ -1629,18 +1669,18 @@ dependencies = [ "reqwest 0.11.27", "serde", "serde_json", - "thiserror", + "thiserror 1.0.69", "tokio", "ureq", ] [[package]] name = "home" -version = "0.5.9" +version = "0.5.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3d1354bf6b7235cb4a0576c2619fd4ed18183f689b12b006a0ee7329eeff9a5" +checksum = "589533453244b0995c858700322199b2becb13b627df2851f64a2775d024abcf" dependencies = [ - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] @@ -1667,9 +1707,9 @@ dependencies = [ [[package]] name = "http" -version = "1.1.0" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "21b9ddb458710bc376481b842f5da65cdf31522de232c1ca8146abce2a358258" +checksum = "f16ca2af56261c99fba8bac40a10251ce8188205a4c448fbb745a2e4daa76fea" dependencies = [ "bytes", "fnv", @@ -1694,7 +1734,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184" dependencies = [ "bytes", - "http 1.1.0", + "http 1.2.0", ] [[package]] @@ -1705,16 +1745,16 @@ checksum = "793429d76616a256bcb62c2a2ec2bed781c8307e797e2598c50010f2bee2544f" dependencies = [ "bytes", "futures-util", - "http 1.1.0", + "http 1.2.0", "http-body 1.0.1", "pin-project-lite", ] [[package]] name = "httparse" -version = "1.9.5" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d71d3574edd2771538b901e6549113b4006ece66150fb69c0fb6d9a2adae946" +checksum = "f2d708df4e7140240a16cd6ab0ab65c972d7433ab77819ea693fde9c43811e2a" [[package]] name = "httpdate" @@ -1724,9 +1764,9 @@ checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" [[package]] name = "hyper" -version = "0.14.31" +version = "0.14.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c08302e8fa335b151b788c775ff56e7a03ae64ff85c548ee820fecb70356e85" +checksum = "41dfc780fdec9373c01bae43289ea34c972e40ee3c9f6b3c8801a35f35586ce7" dependencies = [ "bytes", "futures-channel", @@ -1748,15 +1788,15 @@ dependencies = [ [[package]] name = "hyper" -version = "1.5.1" +version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97818827ef4f364230e16705d4706e2897df2bb60617d6ca15d598025a3c481f" +checksum = "cc2b571658e38e0c01b1fdca3bbbe93c00d3d71693ff2770043f8c29bc7d6f80" dependencies = [ "bytes", "futures-channel", "futures-util", "h2 0.4.7", - "http 1.1.0", + "http 1.2.0", "http-body 1.0.1", "httparse", "httpdate", @@ -1769,16 +1809,16 @@ dependencies = [ [[package]] name = "hyper-rustls" -version = "0.27.3" +version = "0.27.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08afdbb5c31130e3034af566421053ab03787c640246a446327f550d11bcb333" +checksum = "2d191583f3da1305256f22463b9bb0471acad48a4e534a5218b9963e9c1f59b2" dependencies = [ "futures-util", - "http 1.1.0", - "hyper 1.5.1", + "http 1.2.0", + "hyper 1.6.0", "hyper-util", "log", - "rustls 0.23.17", + "rustls 0.23.21", "rustls-native-certs", "rustls-pki-types", "tokio", @@ -1792,7 +1832,7 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bbb958482e8c7be4bc3cf272a766a2b0bf1a6755e7a6ae777f017a31d11b13b1" dependencies = [ - "hyper 0.14.31", + "hyper 0.14.32", 
"pin-project-lite", "tokio", "tokio-io-timeout", @@ -1805,7 +1845,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d6183ddfa99b85da61a140bea0efc93fdf56ceaa041b37d553518030827f9905" dependencies = [ "bytes", - "hyper 0.14.31", + "hyper 0.14.32", "native-tls", "tokio", "tokio-native-tls", @@ -1820,9 +1860,9 @@ dependencies = [ "bytes", "futures-channel", "futures-util", - "http 1.1.0", + "http 1.2.0", "http-body 1.0.1", - "hyper 1.5.1", + "hyper 1.6.0", "pin-project-lite", "socket2", "tokio", @@ -1968,7 +2008,7 @@ checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.89", + "syn 2.0.96", ] [[package]] @@ -2023,9 +2063,9 @@ dependencies = [ [[package]] name = "image-webp" -version = "0.2.0" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e031e8e3d94711a9ccb5d6ea357439ef3dcbed361798bd4071dc4d9793fbe22f" +checksum = "b77d01e822461baa8409e156015a1d91735549f0f2c17691bd2d996bef238f7f" dependencies = [ "byteorder-lite", "quick-error", @@ -2049,20 +2089,20 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.6.0" +version = "2.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "707907fe3c25f5424cce2cb7e1cbcafee6bdbe735ca90ef77c29e84591e5b9da" +checksum = "8c9c992b02b5b4c94ea26e32fe5bccb7aa7d9f390ab5c1221ff895bc7ea8b652" dependencies = [ "equivalent", - "hashbrown 0.15.1", + "hashbrown 0.15.2", "serde", ] [[package]] name = "indicatif" -version = "0.17.9" +version = "0.17.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cbf675b85ed934d3c67b5c5469701eec7db22689d0a2139d856e0925fa28b281" +checksum = "183b3088984b400f4cfac3620d5e076c84da5364016b4f49473de574b2586235" dependencies = [ "console", "number_prefix", @@ -2085,23 +2125,22 @@ checksum = "94bd26b1b737bc11f183620072e188d1c6ede67e0e78682228d66b49ec510e17" dependencies = [ "opentelemetry 0.20.0", "opentelemetry-otlp", - "thiserror", + "thiserror 1.0.69", "tracing", "tracing-opentelemetry 0.21.0", ] [[package]] name = "instability" -version = "0.3.3" +version = "0.3.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b829f37dead9dc39df40c2d3376c179fdfd2ac771f53f55d3c30dc096a3c0c6e" +checksum = "0bf9fed6d91cfb734e7476a06bde8300a1b94e217e1b523b6f0cd1a01998c71d" dependencies = [ "darling", "indoc", - "pretty_assertions", "proc-macro2", "quote", - "syn 2.0.89", + "syn 2.0.96", ] [[package]] @@ -2112,14 +2151,14 @@ checksum = "c34819042dc3d3971c46c2190835914dfbe0c3c13f61449b2997f4e9722dfa60" dependencies = [ "proc-macro2", "quote", - "syn 2.0.89", + "syn 2.0.96", ] [[package]] name = "ipnet" -version = "2.10.1" +version = "2.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ddc24109865250148c2e0f3d25d4f0f479571723792d3802153c60922a4fb708" +checksum = "469fb0b9cefa57e3ef31275ee7cacb78f2fdca44e4765491884a2b119d4eb130" [[package]] name = "is_terminal_polyfill" @@ -2165,9 +2204,9 @@ dependencies = [ [[package]] name = "itoa" -version = "1.0.13" +version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "540654e97a3f4470a492cd30ff187bc95d89557a903a2bbf112e2fae98104ef2" +checksum = "d75a2a4b1b190afb6f5425f10f6a8f959d2ea0b9c2b1d79553551850539e4674" [[package]] name = "jobserver" @@ -2186,18 +2225,19 @@ checksum = "f5d4a7da358eff58addd2877a45865158f0d78c911d43a5784ceb7bbf52833b0" [[package]] name = "js-sys" -version = "0.3.72" +version 
= "0.3.77" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a88f1bda2bd75b0452a14784937d796722fdebfe50df998aeb3f0b7603019a9" +checksum = "1cfaf33c695fc6e08064efbc1f72ec937429614f25eef83af942d0e227c3a28f" dependencies = [ + "once_cell", "wasm-bindgen", ] [[package]] name = "jsonschema" -version = "0.28.0" +version = "0.28.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "74d8eb539cdb4222da29bb658cc9881aa2477b33fb1a74c5c31450395fc1a4b2" +checksum = "4b8f66fe41fa46a5c83ed1c717b7e0b4635988f427083108c8cf0a882cc13441" dependencies = [ "ahash", "base64 0.22.1", @@ -2212,7 +2252,7 @@ dependencies = [ "percent-encoding", "referencing", "regex-syntax 0.8.5", - "reqwest 0.12.9", + "reqwest 0.12.12", "serde", "serde_json", "uuid-simd", @@ -2244,9 +2284,9 @@ checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a" [[package]] name = "libfuzzer-sys" -version = "0.4.8" +version = "0.4.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b9569d2f74e257076d8c6bfa73fb505b46b851e51ddaecc825944aa3bed17fa" +checksum = "cf78f52d400cf2d84a3a973a78a592b4adc535739e0a5597a0da6f0c357adc75" dependencies = [ "arbitrary", "cc", @@ -2254,9 +2294,9 @@ dependencies = [ [[package]] name = "libloading" -version = "0.8.5" +version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4979f22fdb869068da03c9f7528f8297c6fd2606bc3a4affe42e6a823fdb8da4" +checksum = "fc2f4eb4bc735547cfed7c0a4922cbd04a4655978c09b54f1f7b228750664c34" dependencies = [ "cfg-if", "windows-targets 0.52.6", @@ -2274,7 +2314,7 @@ version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c0ff37bd590ca25063e35af745c343cb7a0271906fb7b37e4813e8f79f00268d" dependencies = [ - "bitflags 2.6.0", + "bitflags 2.8.0", "libc", ] @@ -2289,15 +2329,15 @@ dependencies = [ [[package]] name = "linux-raw-sys" -version = "0.4.14" +version = "0.4.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" +checksum = "d26c52dbd32dccf2d10cac7725f8eae5296885fb5703b261f7d0a0739ec807ab" [[package]] name = "litemap" -version = "0.7.3" +version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "643cb0b8d4fcc284004d5fd0d67ccf61dfffadb7f75e1e71bc420f4688a3a704" +checksum = "4ee93343901ab17bd981295f2cf0026d4ad018c7c31ba84549a4ddbb47a45104" [[package]] name = "lock_api" @@ -2311,9 +2351,9 @@ dependencies = [ [[package]] name = "log" -version = "0.4.22" +version = "0.4.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24" +checksum = "04cbf5b083de1c7e0222a7a51dbfdba1cbe1c6ab0b15e29fff3f6c077fd9cd9f" [[package]] name = "loop9" @@ -2330,7 +2370,7 @@ version = "0.12.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "234cf4f4a04dc1f57e24b96cc0cd600cf2af460d4161ac5ecdd0af8e1f3b2a38" dependencies = [ - "hashbrown 0.15.1", + "hashbrown 0.15.2", ] [[package]] @@ -2413,15 +2453,15 @@ checksum = "b4f0c8427b39666bf970460908b213ec09b3b350f20c0c2eabcbba51704a08e6" dependencies = [ "base64 0.22.1", "http-body-util", - "hyper 1.5.1", + "hyper 1.6.0", "hyper-rustls", "hyper-util", - "indexmap 2.6.0", + "indexmap 2.7.1", "ipnet", "metrics", "metrics-util", "quanta", - "thiserror", + "thiserror 1.0.69", "tokio", "tracing", ] @@ -2459,9 +2499,9 @@ dependencies = [ [[package]] name = 
"minijinja" -version = "2.5.0" +version = "2.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c37e1b517d1dcd0e51dc36c4567b9d5a29262b3ec8da6cb5d35e27a8fb529b5" +checksum = "cff7b8df5e85e30b87c2b0b3f58ba3a87b68e133738bf512a7713769326dbca9" dependencies = [ "serde", "serde_json", @@ -2469,9 +2509,9 @@ dependencies = [ [[package]] name = "minijinja-contrib" -version = "2.5.0" +version = "2.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7fe51f1a6a8285f03fcd1544d834234fe8db285f29e1c2253600c93b3ae19242" +checksum = "7ac3e47a9006ed0500425a092c9f8b2e56d10f8aeec8ce870c5e8a7c6ef2d7c3" dependencies = [ "minijinja", "serde", @@ -2485,9 +2525,9 @@ checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" [[package]] name = "miniz_oxide" -version = "0.8.0" +version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2d80299ef12ff69b16a84bb182e3b9df68b5a91574d3d4fa6e41b65deec4df1" +checksum = "b8402cab7aefae129c6977bb0ff1b8fd9a04eb5b51efc50a70bea51cda0c7924" dependencies = [ "adler2", "simd-adler32", @@ -2495,23 +2535,16 @@ dependencies = [ [[package]] name = "mio" -version = "1.0.2" +version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "80e04d1dcff3aae0704555fe5fee3bcfaf3d1fdf8a7e521d5b9d2b42acb52cec" +checksum = "2886843bf800fba2e3377cff24abf6379b4c4d5c6681eaf9ea5b0d15090450bd" dependencies = [ - "hermit-abi 0.3.9", "libc", "log", - "wasi", + "wasi 0.11.0+wasi-snapshot-preview1", "windows-sys 0.52.0", ] -[[package]] -name = "mirai-annotations" -version = "1.12.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c9be0862c1b3f26a88803c4a49de6889c10e608b3ee9344e6ef5b45fb37ad3d1" - [[package]] name = "monostate" version = "0.1.13" @@ -2530,7 +2563,7 @@ checksum = "a7ce64b975ed4f123575d11afd9491f2e37bbd5813fbfbc0f09ae1fbddea74e0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.89", + "syn 2.0.96", ] [[package]] @@ -2552,7 +2585,7 @@ dependencies = [ "futures", "pin-project", "rand", - "thiserror", + "thiserror 1.0.69", "tokio", "tokio-util", "tracing", @@ -2560,9 +2593,9 @@ dependencies = [ [[package]] name = "native-tls" -version = "0.2.12" +version = "0.2.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a8614eb2c83d59d1c8cc974dd3f920198647674a0a035e1af1fa58707e317466" +checksum = "0dab59f8e050d5df8e4dd87d9206fb6f65a483e20ac9fda365ade4fab353196c" dependencies = [ "libc", "log", @@ -2596,7 +2629,7 @@ dependencies = [ "bytes", "futures", "hostname", - "hyper 0.14.31", + "hyper 0.14.32", "muxado", "once_cell", "parking_lot", @@ -2604,7 +2637,7 @@ dependencies = [ "rustls-pemfile", "serde", "serde_json", - "thiserror", + "thiserror 1.0.69", "tokio", "tokio-retry", "tokio-util", @@ -2618,7 +2651,7 @@ version = "0.28.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ab2156c4fce2f8df6c499cc1c763e4394b7482525bf2a9701c9d79d215f519e4" dependencies = [ - "bitflags 2.6.0", + "bitflags 2.8.0", "cfg-if", "cfg_aliases 0.1.1", "libc", @@ -2630,7 +2663,7 @@ version = "0.29.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "71e2746dc3a24dd78b3cfcb7be93368c6de9963d30f43a6a73998a9cf4b17b46" dependencies = [ - "bitflags 2.6.0", + "bitflags 2.8.0", "cfg-if", "cfg_aliases 0.2.1", "libc", @@ -2730,7 +2763,7 @@ checksum = "ed3955f1a9c7c0c15e092f9c887db08b1fc683305fdf6eb6684f22555355e202" dependencies = [ "proc-macro2", "quote", - "syn 
2.0.89", + "syn 2.0.96", ] [[package]] @@ -2801,9 +2834,9 @@ checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3" [[package]] name = "object" -version = "0.36.5" +version = "0.36.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aedf0a2d09c573ed1d8d85b30c119153926a2b36dce0ab28322c09a117a4683e" +checksum = "62948e14d923ea95ea2c7c86c71013138b66525b86bdc08d2dcc262bdb497b87" dependencies = [ "memchr", ] @@ -2844,11 +2877,11 @@ checksum = "b410bbe7e14ab526a0e86877eb47c6996a2bd7746f027ba551028c925390e4e9" [[package]] name = "openssl" -version = "0.10.68" +version = "0.10.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6174bc48f102d208783c2c84bf931bb75927a617866870de8a4ea85597f871f5" +checksum = "f5e534d133a060a3c19daec1eb3e98ec6f4685978834f2dbadfe2ec215bab64e" dependencies = [ - "bitflags 2.6.0", + "bitflags 2.8.0", "cfg-if", "foreign-types", "libc", @@ -2865,14 +2898,14 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.89", + "syn 2.0.96", ] [[package]] name = "openssl-probe" -version = "0.1.5" +version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" +checksum = "d05e27ee213611ffe7d6348b942e8f942b37114c00cc03cec254295a4a17852e" [[package]] name = "openssl-sys" @@ -2904,11 +2937,11 @@ checksum = "1e32339a5dc40459130b3bd269e9892439f55b33e772d2a9d402a789baaf4e8a" dependencies = [ "futures-core", "futures-sink", - "indexmap 2.6.0", + "indexmap 2.7.1", "js-sys", "once_cell", "pin-project-lite", - "thiserror", + "thiserror 1.0.69", "urlencoding", ] @@ -2926,7 +2959,7 @@ dependencies = [ "opentelemetry_api", "opentelemetry_sdk 0.20.0", "prost 0.11.9", - "thiserror", + "thiserror 1.0.69", "tokio", "tonic 0.9.2", ] @@ -2964,7 +2997,7 @@ dependencies = [ "js-sys", "once_cell", "pin-project-lite", - "thiserror", + "thiserror 1.0.69", "urlencoding", ] @@ -2986,7 +3019,7 @@ dependencies = [ "rand", "regex", "serde_json", - "thiserror", + "thiserror 1.0.69", "tokio", "tokio-stream", ] @@ -3005,10 +3038,10 @@ dependencies = [ "glob", "once_cell", "opentelemetry 0.21.0", - "ordered-float 4.5.0", + "ordered-float 4.6.0", "percent-encoding", "rand", - "thiserror", + "thiserror 1.0.69", ] [[package]] @@ -3028,9 +3061,9 @@ dependencies = [ [[package]] name = "ordered-float" -version = "4.5.0" +version = "4.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c65ee1f9701bf938026630b455d5315f490640234259037edb259798b3bcf85e" +checksum = "7bb71e1b3fa6ca1c61f383464aaf2bb0e2f8e772a1f01d486832464de363b951" dependencies = [ "num-traits", ] @@ -3048,9 +3081,9 @@ dependencies = [ [[package]] name = "outref" -version = "0.5.1" +version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4030760ffd992bef45b0ae3f10ce1aba99e33464c90d14dd7c039884963ddc7a" +checksum = "1a80800c0488c3a21695ea981a54918fbb37abf04f4d0720c453632255e2ff0e" [[package]] name = "overload" @@ -3111,34 +3144,34 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b4c5cc86750666a3ed20bdaf5ca2a0344f9c67674cae0515bec2da16fbaa47db" dependencies = [ "fixedbitset", - "indexmap 2.6.0", + "indexmap 2.7.1", ] [[package]] name = "pin-project" -version = "1.1.7" +version = "1.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"be57f64e946e500c8ee36ef6331845d40a93055567ec57e8fae13efd33759b95" +checksum = "1e2ec53ad785f4d35dac0adea7f7dc6f1bb277ad84a680c7afefeae05d1f5916" dependencies = [ "pin-project-internal", ] [[package]] name = "pin-project-internal" -version = "1.1.7" +version = "1.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c0f5fad0874fc7abcd4d750e76917eaebbecaa2c20bde22e1dbeeba8beb758c" +checksum = "d56a66c0c55993aa927429d0f8a0abfd74f084e4d9c192cffed01e418d83eefb" dependencies = [ "proc-macro2", "quote", - "syn 2.0.89", + "syn 2.0.96", ] [[package]] name = "pin-project-lite" -version = "0.2.15" +version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "915a1e146535de9163f3987b8944ed8cf49a18bb0056bcebcdcece385cece4ff" +checksum = "3b3cff922bd51709b605d9ead9aa71031d81447142d828eb4a6eba76fe619f9b" [[package]] name = "pin-utils" @@ -3182,9 +3215,9 @@ dependencies = [ [[package]] name = "png" -version = "0.17.14" +version = "0.17.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "52f9d46a34a05a6a57566bc2bfae066ef07585a6e3fa30fbbdff5936380623f0" +checksum = "82151a2fc869e011c153adc57cf2789ccb8d9906ce52c0b39a6b5697749d7526" dependencies = [ "bitflags 1.3.2", "crc32fast", @@ -3195,9 +3228,9 @@ dependencies = [ [[package]] name = "portable-atomic" -version = "1.9.0" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc9c68a3f6da06753e9335d63e27f6b9754dd1920d941135b7ea8224f141adb2" +checksum = "280dc24453071f1b63954171985a0b0d30058d287960968b9b2aca264c8d4ee6" [[package]] name = "powerfmt" @@ -3214,24 +3247,14 @@ dependencies = [ "zerocopy", ] -[[package]] -name = "pretty_assertions" -version = "1.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3ae130e2f271fbc2ac3a40fb1d07180839cdbbe443c7a27e1e3c13c5cac0116d" -dependencies = [ - "diff", - "yansi", -] - [[package]] name = "prettyplease" -version = "0.2.25" +version = "0.2.29" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64d1ec885c64d0457d564db4ec299b2dae3f9c02808b8ad9c3a089c591b18033" +checksum = "6924ced06e1f7dfe3fa48d57b9f74f55d8915f5036121bef647ef4b204895fac" dependencies = [ "proc-macro2", - "syn 2.0.89", + "syn 2.0.96", ] [[package]] @@ -3260,9 +3283,9 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.92" +version = "1.0.93" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37d3544b3f2748c54e147655edb5025752e2303145b5aefb3c3ea2c78b973bb0" +checksum = "60946a68e5f9d28b0dc1c21bb8a97ee7d018a8b322fa57838ba31cc878e22d99" dependencies = [ "unicode-ident", ] @@ -3283,7 +3306,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a65f2e60fbf1063868558d69c6beacf412dc755f9fc020f514b7955fc914fe30" dependencies = [ "quote", - "syn 2.0.89", + "syn 2.0.96", ] [[package]] @@ -3323,7 +3346,7 @@ dependencies = [ "prost 0.12.6", "prost-types", "regex", - "syn 2.0.89", + "syn 2.0.96", "tempfile", ] @@ -3350,7 +3373,7 @@ dependencies = [ "itertools 0.12.1", "proc-macro2", "quote", - "syn 2.0.89", + "syn 2.0.96", ] [[package]] @@ -3409,7 +3432,7 @@ dependencies = [ "proc-macro2", "pyo3-macros-backend", "quote", - "syn 2.0.89", + "syn 2.0.96", ] [[package]] @@ -3422,7 +3445,7 @@ dependencies = [ "proc-macro2", "pyo3-build-config", "quote", - "syn 2.0.89", + "syn 2.0.96", ] [[package]] @@ -3436,15 +3459,15 @@ dependencies = [ [[package]] name = "quanta" -version = "0.12.3" +version = 
"0.12.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e5167a477619228a0b284fac2674e3c388cba90631d7b7de620e6f1fcd08da5" +checksum = "3bd1fe6824cea6538803de3ff1bc0cf3949024db3d43c9643024bfb33a807c0e" dependencies = [ "crossbeam-utils", "libc", "once_cell", "raw-cpuid", - "wasi", + "wasi 0.11.0+wasi-snapshot-preview1", "web-sys", "winapi", ] @@ -3457,9 +3480,9 @@ checksum = "a993555f31e5a609f617c12db6250dedcac1b0a85076912c436e6fc9b2c8e6a3" [[package]] name = "quote" -version = "1.0.37" +version = "1.0.38" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af" +checksum = "0e4dccaaaf89514f546c693ddc140f729f958c247918a13380cccc6078391acc" dependencies = [ "proc-macro2", ] @@ -3491,7 +3514,7 @@ version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" dependencies = [ - "getrandom", + "getrandom 0.2.15", ] [[package]] @@ -3500,7 +3523,7 @@ version = "0.28.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fdef7f9be5c0122f890d58bdf4d964349ba6a6161f705907526d891efabba57d" dependencies = [ - "bitflags 2.6.0", + "bitflags 2.8.0", "cassowary", "compact_str", "crossterm", @@ -3545,7 +3568,7 @@ dependencies = [ "rand_chacha", "simd_helpers", "system-deps", - "thiserror", + "thiserror 1.0.69", "v_frame", "wasm-bindgen", ] @@ -3567,11 +3590,11 @@ dependencies = [ [[package]] name = "raw-cpuid" -version = "11.2.0" +version = "11.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ab240315c661615f2ee9f0f2cd32d5a7343a84d5ebcccb99d46e6637565e7b0" +checksum = "c6928fa44c097620b706542d428957635951bade7143269085389d42c8a4927e" dependencies = [ - "bitflags 2.6.0", + "bitflags 2.8.0", ] [[package]] @@ -3607,11 +3630,11 @@ dependencies = [ [[package]] name = "redox_syscall" -version = "0.5.7" +version = "0.5.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b6dfecf2c74bce2466cabf93f6664d6998a69eb21e39f4207930065b27b771f" +checksum = "03a862b389f93e68874fbf580b9de08dd02facb9a788ebadaf4a3fd33cf58834" dependencies = [ - "bitflags 2.6.0", + "bitflags 2.8.0", ] [[package]] @@ -3620,9 +3643,9 @@ version = "0.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ba009ff324d1fc1b900bd1fdb31564febe58a8ccc8a6fdbb93b543d33b13ca43" dependencies = [ - "getrandom", + "getrandom 0.2.15", "libredox", - "thiserror", + "thiserror 1.0.69", ] [[package]] @@ -3642,14 +3665,14 @@ checksum = "bcc303e793d3734489387d205e9b186fac9c6cfacedd98cbb2e8a5943595f3e6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.89", + "syn 2.0.96", ] [[package]] name = "referencing" -version = "0.28.0" +version = "0.28.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "093a875008827c0ae15c746189966e162faa05bf347719d06302c548ac63630f" +checksum = "d0dcb5ab28989ad7c91eb1b9531a37a1a137cc69a0499aee4117cae4a107c464" dependencies = [ "ahash", "fluent-uri", @@ -3716,7 +3739,7 @@ dependencies = [ "h2 0.3.26", "http 0.2.12", "http-body 0.4.6", - "hyper 0.14.31", + "hyper 0.14.32", "hyper-tls", "ipnet", "js-sys", @@ -3744,19 +3767,19 @@ dependencies = [ [[package]] name = "reqwest" -version = "0.12.9" +version = "0.12.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a77c62af46e79de0a562e1a9849205ffcb7fc1238876e9bd743357570e04046f" +checksum = 
"43e734407157c3c2034e0258f5e4473ddb361b1e85f95a66690d67264d7cd1da" dependencies = [ "base64 0.22.1", "bytes", "futures-channel", "futures-core", "futures-util", - "http 1.1.0", + "http 1.2.0", "http-body 1.0.1", "http-body-util", - "hyper 1.5.1", + "hyper 1.6.0", "hyper-util", "ipnet", "js-sys", @@ -3770,6 +3793,7 @@ dependencies = [ "serde_urlencoded", "sync_wrapper 1.0.2", "tokio", + "tower 0.5.2", "tower-service", "url", "wasm-bindgen", @@ -3807,7 +3831,7 @@ checksum = "c17fa4cb658e3583423e915b9f3acc01cceaee1860e33d59ebae66adc3a2dc0d" dependencies = [ "cc", "cfg-if", - "getrandom", + "getrandom 0.2.15", "libc", "spin 0.9.8", "untrusted 0.9.0", @@ -3834,7 +3858,7 @@ dependencies = [ "proc-macro2", "quote", "rust-embed-utils", - "syn 2.0.89", + "syn 2.0.96", "walkdir", ] @@ -3860,6 +3884,12 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" +[[package]] +name = "rustc-hash" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7fb8039b3032c191086b10f11f319a6e99e1e82889c5cc6046f515c9db1d497" + [[package]] name = "rustc_version" version = "0.4.1" @@ -3871,15 +3901,15 @@ dependencies = [ [[package]] name = "rustix" -version = "0.38.41" +version = "0.38.44" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d7f649912bc1495e167a6edee79151c84b1bad49748cb4f1f1167f459f6224f6" +checksum = "fdb5bc1ae2baa591800df16c9ca78619bf65c0488b41b96ccec5d11220d8c154" dependencies = [ - "bitflags 2.6.0", + "bitflags 2.8.0", "errno", "libc", "linux-raw-sys", - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] @@ -3910,9 +3940,9 @@ dependencies = [ [[package]] name = "rustls" -version = "0.23.17" +version = "0.23.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f1a745511c54ba6d4465e8d5dfbd81b45791756de28d4981af70d6dca128f1e" +checksum = "8f287924602bf649d949c63dc8ac8b235fa5387d394020705b80c4eb597ce5b8" dependencies = [ "aws-lc-rs", "log", @@ -3932,7 +3962,7 @@ dependencies = [ "openssl-probe", "rustls-pki-types", "schannel", - "security-framework 3.0.1", + "security-framework 3.2.0", ] [[package]] @@ -3946,9 +3976,9 @@ dependencies = [ [[package]] name = "rustls-pki-types" -version = "1.10.0" +version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "16f1201b3c9a7ee8039bcadc17b7e605e2945b27eee7631788c1bd2b0643674b" +checksum = "917ce264624a4b4db1c364dcc35bfca9ded014d0a958cd47ad3e960e988ea51c" [[package]] name = "rustls-webpki" @@ -3964,15 +3994,15 @@ dependencies = [ [[package]] name = "rustversion" -version = "1.0.18" +version = "1.0.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0e819f2bc632f285be6d7cd36e25940d45b2391dd6d9b939e79de557f7014248" +checksum = "f7c45b9784283f1b2e7fb61b42047c2fd678ef0960d4f6f1eba131594cc369d4" [[package]] name = "ryu" -version = "1.0.18" +version = "1.0.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f" +checksum = "6ea1a2d0a644769cc99faa24c3ad26b379b786fe7c36fd3c546254801650e6dd" [[package]] name = "same-file" @@ -4020,7 +4050,7 @@ version = "2.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "897b2245f0b511c87893af39b033e5ca9cce68824c4d7e7630b5a1d339658d02" dependencies = [ - "bitflags 2.6.0", + "bitflags 2.8.0", "core-foundation 0.9.4", 
"core-foundation-sys", "libc", @@ -4029,11 +4059,11 @@ dependencies = [ [[package]] name = "security-framework" -version = "3.0.1" +version = "3.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e1415a607e92bec364ea2cf9264646dcce0f91e6d65281bd6f2819cca3bf39c8" +checksum = "271720403f46ca04f7ba6f55d438f8bd878d6b8ca0a1046e8228c4145bcbb316" dependencies = [ - "bitflags 2.6.0", + "bitflags 2.8.0", "core-foundation 0.10.0", "core-foundation-sys", "libc", @@ -4042,9 +4072,9 @@ dependencies = [ [[package]] name = "security-framework-sys" -version = "2.12.1" +version = "2.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa39c7303dc58b5543c94d22c1766b0d31f2ee58306363ea622b10bbc075eaa2" +checksum = "49db231d56a190491cb4aeda9527f1ad45345af50b0851622a7adb8c03b01c32" dependencies = [ "core-foundation-sys", "libc", @@ -4052,18 +4082,18 @@ dependencies = [ [[package]] name = "semver" -version = "1.0.23" +version = "1.0.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61697e0a1c7e512e84a621326239844a24d8207b4669b41bc18b32ea5cbf988b" +checksum = "f79dfe2d285b0488816f30e700a7438c5a73d816b5b7d3ac72fbc48b0d185e03" dependencies = [ "serde", ] [[package]] name = "serde" -version = "1.0.215" +version = "1.0.217" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6513c1ad0b11a9376da888e3e0baa0077f1aed55c17f50e7b2397136129fb88f" +checksum = "02fc4265df13d6fa1d00ecff087228cc0a2b5f3c0e87e258d8b94a156e984c70" dependencies = [ "serde_derive", ] @@ -4090,22 +4120,22 @@ dependencies = [ [[package]] name = "serde_derive" -version = "1.0.215" +version = "1.0.217" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad1e866f866923f252f05c889987993144fb74e722403468a4ebd70c3cd756c0" +checksum = "5a9bf7cf98d04a2b28aead066b7496853d4779c9cc183c440dbac457641e19a0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.89", + "syn 2.0.96", ] [[package]] name = "serde_json" -version = "1.0.133" +version = "1.0.138" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7fceb2473b9166b2294ef05efcb65a3db80803f0b03ef86a5fc88a2b85ee377" +checksum = "d434192e7da787e94a6ea7e9670b26a036d0ca41e0b7efb2676dd32bae872949" dependencies = [ - "indexmap 2.6.0", + "indexmap 2.7.1", "itoa", "memchr", "ryu", @@ -4246,9 +4276,9 @@ checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" [[package]] name = "socket2" -version = "0.5.7" +version = "0.5.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce305eb0b4296696835b71df73eb912e0f1ffd2556a501fcede6e0c50349191c" +checksum = "c970269d99b64e60ec3bd6ad27270092a5394c4e309314b18ae3fe575695fbe8" dependencies = [ "libc", "windows-sys 0.52.0", @@ -4315,7 +4345,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.89", + "syn 2.0.96", ] [[package]] @@ -4337,9 +4367,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.89" +version = "2.0.96" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "44d46482f1c1c87acd84dea20c1bf5ebff4c757009ed6bf19cfd36fb10e92c4e" +checksum = "d5d0adab1ae378d7f53bdebc67a39f1f151407ef230f0ce2883572f5d8985c80" dependencies = [ "proc-macro2", "quote", @@ -4369,7 +4399,7 @@ checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971" dependencies = [ "proc-macro2", "quote", - "syn 2.0.89", + "syn 2.0.96", ] [[package]] @@ -4453,12 +4483,13 @@ checksum = 
"61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1" [[package]] name = "tempfile" -version = "3.14.0" +version = "3.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "28cce251fcbc87fac86a866eeb0d6c2d536fc16d06f184bb61aeae11aa4cee0c" +checksum = "38c246215d7d24f48ae091a2902398798e05d978b24315d6efbc00ede9a8bb91" dependencies = [ "cfg-if", "fastrand", + "getrandom 0.3.1", "once_cell", "rustix", "windows-sys 0.59.0", @@ -4478,16 +4509,16 @@ name = "text-generation-backends-trtllm" version = "3.1.1-dev0" dependencies = [ "async-trait", - "clap 4.5.21", + "clap 4.5.27", "cmake", "cxx", "cxx-build", - "hashbrown 0.15.1", + "hashbrown 0.15.2", "hf-hub", "pkg-config", "pyo3", "text-generation-router", - "thiserror", + "thiserror 1.0.69", "tokenizers", "tokio", "tokio-stream", @@ -4499,7 +4530,7 @@ name = "text-generation-benchmark" version = "3.1.1-dev0" dependencies = [ "average", - "clap 4.5.21", + "clap 4.5.27", "float-ord", "hf-hub", "ratatui", @@ -4507,7 +4538,7 @@ dependencies = [ "serde_json", "tabled", "text-generation-client", - "thiserror", + "thiserror 1.0.69", "tokenizers", "tokio", "tracing", @@ -4524,7 +4555,7 @@ dependencies = [ "grpc-metadata", "prost 0.12.6", "prost-build", - "thiserror", + "thiserror 1.0.69", "tokio", "tonic 0.10.2", "tonic-build", @@ -4536,7 +4567,7 @@ dependencies = [ name = "text-generation-launcher" version = "3.1.1-dev0" dependencies = [ - "clap 4.5.21", + "clap 4.5.27", "ctrlc", "float_eq", "hf-hub", @@ -4547,7 +4578,7 @@ dependencies = [ "reqwest 0.11.27", "serde", "serde_json", - "thiserror", + "thiserror 1.0.69", "tracing", "tracing-subscriber", "vergen", @@ -4564,7 +4595,7 @@ dependencies = [ "axum-tracing-opentelemetry", "base64 0.22.1", "chrono", - "clap 4.5.21", + "clap 4.5.27", "csv", "futures", "futures-util", @@ -4590,7 +4621,7 @@ dependencies = [ "serde", "serde_json", "sysinfo", - "thiserror", + "thiserror 1.0.69", "tokenizers", "tokio", "tokio-stream", @@ -4605,6 +4636,23 @@ dependencies = [ "vergen", ] +[[package]] +name = "text-generation-router-llamacpp" +version = "3.1.1-dev0" +dependencies = [ + "async-trait", + "bindgen 0.71.1", + "clap 4.5.27", + "num_cpus", + "pkg-config", + "text-generation-router", + "thiserror 2.0.11", + "tokenizers", + "tokio", + "tokio-stream", + "tracing", +] + [[package]] name = "text-generation-router-v2" version = "3.1.1-dev0" @@ -4614,7 +4662,7 @@ dependencies = [ "axum 0.7.9", "axum-tracing-opentelemetry", "base64 0.22.1", - "clap 4.5.21", + "clap 4.5.27", "futures", "futures-util", "grpc-metadata", @@ -4639,7 +4687,7 @@ dependencies = [ "serde_json", "slotmap", "text-generation-router", - "thiserror", + "thiserror 1.0.69", "tokenizers", "tokio", "tokio-stream", @@ -4663,7 +4711,7 @@ dependencies = [ "axum 0.7.9", "axum-tracing-opentelemetry", "base64 0.22.1", - "clap 4.5.21", + "clap 4.5.27", "criterion", "futures", "futures-util", @@ -4690,7 +4738,7 @@ dependencies = [ "serde_json", "slotmap", "text-generation-router", - "thiserror", + "thiserror 1.0.69", "tokenizers", "tokio", "tokio-stream", @@ -4720,7 +4768,16 @@ version = "1.0.69" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" dependencies = [ - "thiserror-impl", + "thiserror-impl 1.0.69", +] + +[[package]] +name = "thiserror" +version = "2.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d452f284b73e6d76dd36758a0c8684b1d5be31f92b89d07fd5822175732206fc" +dependencies = [ + 
"thiserror-impl 2.0.11", ] [[package]] @@ -4731,7 +4788,18 @@ checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.89", + "syn 2.0.96", +] + +[[package]] +name = "thiserror-impl" +version = "2.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26afc1baea8a989337eeb52b6e72a039780ce45c3edfcc9c5b9d112feeb173c2" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.96", ] [[package]] @@ -4757,9 +4825,9 @@ dependencies = [ [[package]] name = "time" -version = "0.3.36" +version = "0.3.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5dfd88e563464686c916c7e46e623e520ddc6d79fa6641390f2e3fa86e83e885" +checksum = "35e7868883861bd0e56d9ac6efcaaca0d6d5d82a2a7ec8209ff492c07cf37b21" dependencies = [ "deranged", "itoa", @@ -4780,9 +4848,9 @@ checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3" [[package]] name = "time-macros" -version = "0.2.18" +version = "0.2.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f252a68540fde3a3877aeea552b832b40ab9a69e318efd078774a01ddee1ccf" +checksum = "2834e6017e3e5e4b9834939793b282bc03b37a3336245fa820e35e233e2a85de" dependencies = [ "num-conv", "time-core", @@ -4810,14 +4878,14 @@ dependencies = [ [[package]] name = "tokenizers" -version = "0.20.3" +version = "0.20.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67b67c92f6d705e2a1d106fb0b28c696f9074901a9c656ee5d9f5de204c39bf7" +checksum = "3b08cc37428a476fc9e20ac850132a513a2e1ce32b6a31addf2b74fa7033b905" dependencies = [ "aho-corasick", "derive_builder", "esaxx-rs", - "getrandom", + "getrandom 0.2.15", "hf-hub", "indicatif", "itertools 0.12.1", @@ -4835,7 +4903,7 @@ dependencies = [ "serde", "serde_json", "spm_precompiled", - "thiserror", + "thiserror 1.0.69", "unicode-normalization-alignments", "unicode-segmentation", "unicode_categories", @@ -4877,7 +4945,7 @@ checksum = "6e06d43f1345a3bcd39f6a56dbb7dcab2ba47e68e8ac134855e7e2bdbaf8cab8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.89", + "syn 2.0.96", ] [[package]] @@ -4903,12 +4971,11 @@ dependencies = [ [[package]] name = "tokio-rustls" -version = "0.26.0" +version = "0.26.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c7bc40d0e5a97695bb96e27995cd3a08538541b0a846f65bba7a359f36700d4" +checksum = "5f6d0975eaace0cf0fcadee4e4aaa5da15b5c079146f2cffb67c113be122bf37" dependencies = [ - "rustls 0.23.17", - "rustls-pki-types", + "rustls 0.23.21", "tokio", ] @@ -4925,9 +4992,9 @@ dependencies = [ [[package]] name = "tokio-util" -version = "0.7.12" +version = "0.7.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61e7c3654c13bcd040d4a03abee2c75b1d14a37b423cf5a813ceae1cc903ec6a" +checksum = "d7fcaa8d55a2bdd6b83ace262b016eca0d79ee02818c5c1bcdf0305114081078" dependencies = [ "bytes", "futures-core", @@ -4964,7 +5031,7 @@ version = "0.22.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4ae48d6208a266e853d946088ed816055e556cc6028c5e8e2b84d9fa5dd7c7f5" dependencies = [ - "indexmap 2.6.0", + "indexmap 2.7.1", "serde", "serde_spanned", "toml_datetime", @@ -4986,7 +5053,7 @@ dependencies = [ "h2 0.3.26", "http 0.2.12", "http-body 0.4.6", - "hyper 0.14.31", + "hyper 0.14.32", "hyper-timeout", "percent-encoding", "pin-project", @@ -5013,7 +5080,7 @@ dependencies = [ "h2 0.3.26", "http 0.2.12", "http-body 0.4.6", - "hyper 0.14.31", + "hyper 0.14.32", 
"hyper-timeout", "percent-encoding", "pin-project", @@ -5036,7 +5103,7 @@ dependencies = [ "proc-macro2", "prost-build", "quote", - "syn 2.0.89", + "syn 2.0.96", ] [[package]] @@ -5061,14 +5128,14 @@ dependencies = [ [[package]] name = "tower" -version = "0.5.1" +version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2873938d487c3cfb9aed7546dc9f2711d867c9f90c46b889989a2cb84eba6b4f" +checksum = "d039ad9159c98b70ecfd540b2573b97f7f52c3e8d9f8ad57a24b916a536975f9" dependencies = [ "futures-core", "futures-util", "pin-project-lite", - "sync_wrapper 0.1.2", + "sync_wrapper 1.0.2", "tokio", "tower-layer", "tower-service", @@ -5081,9 +5148,9 @@ version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e9cd434a998747dd2c4276bc96ee2e0c7a2eadf3cae88e52be55a05fa9053f5" dependencies = [ - "bitflags 2.6.0", + "bitflags 2.8.0", "bytes", - "http 1.1.0", + "http 1.2.0", "http-body 1.0.1", "http-body-util", "pin-project-lite", @@ -5105,9 +5172,9 @@ checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3" [[package]] name = "tracing" -version = "0.1.40" +version = "0.1.41" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef" +checksum = "784e0ac535deb450455cbfa28a6f0df145ea1bb7ae51b821cf5e7927fdcfbdd0" dependencies = [ "log", "pin-project-lite", @@ -5117,20 +5184,20 @@ dependencies = [ [[package]] name = "tracing-attributes" -version = "0.1.27" +version = "0.1.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" +checksum = "395ae124c09f9e6918a2310af6038fba074bcf474ac352496d5910dd59a2226d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.89", + "syn 2.0.96", ] [[package]] name = "tracing-core" -version = "0.1.32" +version = "0.1.33" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54" +checksum = "e672c95779cf947c5311f83787af4fa8fffd12fb27e4993211a84bdfd9610f9c" dependencies = [ "once_cell", "valuable", @@ -5198,7 +5265,7 @@ version = "0.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9920abb6a3ee3a2af7d30c9ff02900f8481935d36723c3da95cf807468218e8c" dependencies = [ - "http 1.1.0", + "http 1.2.0", "opentelemetry 0.21.0", "tracing", "tracing-opentelemetry 0.22.0", @@ -5206,9 +5273,9 @@ dependencies = [ [[package]] name = "tracing-serde" -version = "0.1.3" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc6b213177105856957181934e4920de57730fc69bf42c37ee5bb664d406d9e1" +checksum = "704b1aeb7be0d0a84fc9828cae51dab5970fee5088f83d1dd7ee6f6246fc6ff1" dependencies = [ "serde", "tracing-core", @@ -5216,9 +5283,9 @@ dependencies = [ [[package]] name = "tracing-subscriber" -version = "0.3.18" +version = "0.3.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad0f048c97dbd9faa9b7df56362b8ebcaa52adb06b498c050d2f4e32f90a7a8b" +checksum = "e8189decb5ac0fa7bc8b96b7cb9b2701d60d48805aca84a238004d665fcc4008" dependencies = [ "matchers", "nu-ansi-term", @@ -5249,15 +5316,15 @@ checksum = "42ff0bf0c66b8238c6f3b578df37d0b7848e55df8577b3f74f92a69acceeb825" [[package]] name = "unicase" -version = "2.8.0" +version = "2.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"7e51b68083f157f853b6379db119d1c1be0e6e4dec98101079dec41f6f5cf6df" +checksum = "75b844d17643ee918803943289730bec8aac480150456169e647ed0b576ba539" [[package]] name = "unicode-ident" -version = "1.0.14" +version = "1.0.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83" +checksum = "a210d160f08b701c8721ba1c726c11662f877ea6b7094007e1ca9a1041945034" [[package]] name = "unicode-normalization-alignments" @@ -5343,9 +5410,9 @@ dependencies = [ [[package]] name = "url" -version = "2.5.3" +version = "2.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8d157f1b96d14500ffdc1f10ba712e780825526c03d9a49b4d0324b0d9113ada" +checksum = "32f8b686cadd1473f4bd0117a5d28d36b1ade384ea9b5069a1c40aefed7fda60" dependencies = [ "form_urlencoded", "idna", @@ -5382,7 +5449,7 @@ version = "4.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c5afb1a60e207dca502682537fefcfd9921e71d0b83e9576060f09abc6efab23" dependencies = [ - "indexmap 2.6.0", + "indexmap 2.7.1", "serde", "serde_json", "utoipa-gen", @@ -5398,7 +5465,7 @@ dependencies = [ "proc-macro2", "quote", "regex", - "syn 2.0.89", + "syn 2.0.96", ] [[package]] @@ -5419,24 +5486,24 @@ dependencies = [ [[package]] name = "uuid" -version = "1.11.0" +version = "1.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8c5f0a0af699448548ad1a2fbf920fb4bee257eae39953ba95cb84891a0446a" +checksum = "b3758f5e68192bb96cc8f9b7e2c2cfdabb435499a28499a42f8f984092adad4b" dependencies = [ - "getrandom", + "getrandom 0.2.15", "rand", "uuid-macro-internal", ] [[package]] name = "uuid-macro-internal" -version = "1.11.0" +version = "1.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b91f57fe13a38d0ce9e28a03463d8d3c2468ed03d75375110ec71d93b449a08" +checksum = "f8a86d88347b61a0e17b9908a67efcc594130830bf1045653784358dd023e294" dependencies = [ "proc-macro2", "quote", - "syn 2.0.89", + "syn 2.0.96", ] [[package]] @@ -5463,9 +5530,9 @@ dependencies = [ [[package]] name = "valuable" -version = "0.1.0" +version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d" +checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65" [[package]] name = "vcpkg" @@ -5533,48 +5600,58 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] -name = "wasm-bindgen" -version = "0.2.95" +name = "wasi" +version = "0.13.3+wasi-0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "128d1e363af62632b8eb57219c8fd7877144af57558fb2ef0368d0087bddeb2e" +checksum = "26816d2e1a4a36a2940b96c5296ce403917633dff8f3440e9b236ed6f6bacad2" +dependencies = [ + "wit-bindgen-rt", +] + +[[package]] +name = "wasm-bindgen" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1edc8929d7499fc4e8f0be2262a241556cfc54a0bea223790e71446f2aab1ef5" dependencies = [ "cfg-if", "once_cell", + "rustversion", "wasm-bindgen-macro", ] [[package]] name = "wasm-bindgen-backend" -version = "0.2.95" +version = "0.2.100" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cb6dd4d3ca0ddffd1dd1c9c04f94b868c37ff5fac97c30b97cff2d74fce3a358" +checksum = "2f0a0651a5c2bc21487bde11ee802ccaf4c51935d0d3d42a6101f98161700bc6" 
dependencies = [ "bumpalo", "log", - "once_cell", "proc-macro2", "quote", - "syn 2.0.89", + "syn 2.0.96", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-futures" -version = "0.4.45" +version = "0.4.50" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc7ec4f8827a71586374db3e87abdb5a2bb3a15afed140221307c3ec06b1f63b" +checksum = "555d470ec0bc3bb57890405e5d4322cc9ea83cebb085523ced7be4144dac1e61" dependencies = [ "cfg-if", "js-sys", + "once_cell", "wasm-bindgen", "web-sys", ] [[package]] name = "wasm-bindgen-macro" -version = "0.2.95" +version = "0.2.100" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e79384be7f8f5a9dd5d7167216f022090cf1f9ec128e6e6a482a2cb5c5422c56" +checksum = "7fe63fc6d09ed3792bd0897b314f53de8e16568c2b3f7982f468c0bf9bd0b407" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -5582,28 +5659,31 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.95" +version = "0.2.100" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26c6ab57572f7a24a4985830b120de1594465e5d500f24afe89e16b4e833ef68" +checksum = "8ae87ea40c9f689fc23f209965b6fb8a99ad69aeeb0231408be24920604395de" dependencies = [ "proc-macro2", "quote", - "syn 2.0.89", + "syn 2.0.96", "wasm-bindgen-backend", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-shared" -version = "0.2.95" +version = "0.2.100" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "65fc09f10666a9f147042251e0dda9c18f166ff7de300607007e96bdebc1068d" +checksum = "1a05d73b933a847d6cccdda8f838a22ff101ad9bf93e33684f39c1f5f0eece3d" +dependencies = [ + "unicode-ident", +] [[package]] name = "web-sys" -version = "0.3.72" +version = "0.3.77" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6488b90108c040df0fe62fa815cbdee25124641df01814dd7282749234c6112" +checksum = "33b6dd2ef9186f1f2072e409e99cd22a975331a6b3591b12c764e0e55c60d5d2" dependencies = [ "js-sys", "wasm-bindgen", @@ -5962,9 +6042,9 @@ checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" [[package]] name = "winnow" -version = "0.6.20" +version = "0.6.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "36c1fec1a2bb5866f07c25f68c26e565c4c200aebb96d7e55710c19d3e8ac49b" +checksum = "ad699df48212c6cc6eb4435f35500ac6fd3b9913324f938aea302022ce19d310" dependencies = [ "memchr", ] @@ -5979,6 +6059,15 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "wit-bindgen-rt" +version = "0.33.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3268f3d866458b787f390cf61f4bbb563b922d091359f9608842999eaee3943c" +dependencies = [ + "bitflags 2.8.0", +] + [[package]] name = "write16" version = "1.0.0" @@ -5991,17 +6080,11 @@ version = "0.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e9df38ee2d2c3c5948ea468a8406ff0db0b29ae1ffde1bcf20ef305bcc95c51" -[[package]] -name = "yansi" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cfe53a6657fd280eaa890a3bc59152892ffa3e30101319d168b781ed6529b049" - [[package]] name = "yoke" -version = "0.7.4" +version = "0.7.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c5b1314b079b0930c31e3af543d8ee1757b1951ae1e1565ec704403a7240ca5" +checksum = "120e6aef9aa629e3d4f52dc8cc43a015c7724194c97dfaf45180d2daf2b77f40" dependencies = [ "serde", "stable_deref_trait", @@ -6011,13 +6094,13 @@ 
dependencies = [ [[package]] name = "yoke-derive" -version = "0.7.4" +version = "0.7.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "28cc31741b18cb6f1d5ff12f5b7523e3d6eb0852bbbad19d73905511d9849b95" +checksum = "2380878cad4ac9aac1e2435f3eb4020e8374b5f13c296cb75b4620ff8e229154" dependencies = [ "proc-macro2", "quote", - "syn 2.0.89", + "syn 2.0.96", "synstructure", ] @@ -6039,27 +6122,27 @@ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.89", + "syn 2.0.96", ] [[package]] name = "zerofrom" -version = "0.1.4" +version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "91ec111ce797d0e0784a1116d0ddcdbea84322cd79e5d5ad173daeba4f93ab55" +checksum = "cff3ee08c995dee1859d998dea82f7374f2826091dd9cd47def953cae446cd2e" dependencies = [ "zerofrom-derive", ] [[package]] name = "zerofrom-derive" -version = "0.1.4" +version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ea7b4a3637ea8669cedf0f1fd5c286a17f3de97b8dd5a70a6c167a1730e63a5" +checksum = "595eed982f7d355beb85837f651fa22e90b3c044842dc7f2c2842c086f295808" dependencies = [ "proc-macro2", "quote", - "syn 2.0.89", + "syn 2.0.96", "synstructure", ] @@ -6088,7 +6171,7 @@ checksum = "6eafa6dfb17584ea3e2bd6e76e0cc15ad7af12b09abdd1ca55961bed9b1063c6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.89", + "syn 2.0.96", ] [[package]] @@ -6120,9 +6203,9 @@ dependencies = [ [[package]] name = "zune-jpeg" -version = "0.4.13" +version = "0.4.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "16099418600b4d8f028622f73ff6e3deaabdff330fb9a2a131dea781ee8b0768" +checksum = "99a5bab8d7dedf81405c4bb1f2b83ea057643d9cb28778cea9eecddeedd2e028" dependencies = [ "zune-core", ] diff --git a/Cargo.toml b/Cargo.toml index 6fd4b51d7..df7f2a73e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,6 +5,7 @@ members = [ "backends/v3", "backends/grpc-metadata", "backends/trtllm", + "backends/llamacpp", "launcher", "router" ] diff --git a/Dockerfile_llamacpp b/Dockerfile_llamacpp new file mode 100644 index 000000000..2eb62a1f6 --- /dev/null +++ b/Dockerfile_llamacpp @@ -0,0 +1,76 @@ +FROM nvidia/cuda:12.8.0-cudnn-devel-ubuntu24.04 AS deps + +ARG llamacpp_version=b4651 +ARG llamacpp_cuda=OFF +ARG cuda_arch=75-real;80-real;86-real;89-real;90-real + +WORKDIR /opt/src + +ENV DEBIAN_FRONTEND=noninteractive +RUN apt update && apt install -y \ + clang \ + cmake \ + curl \ + git \ + python3-dev \ + libssl-dev \ + pkg-config \ + tar + +ADD https://github.com/ggerganov/llama.cpp/archive/refs/tags/${llamacpp_version}.tar.gz /opt/src/ +RUN tar -xzf ${llamacpp_version}.tar.gz \ + && cd llama.cpp-${llamacpp_version} \ + && cmake -B build \ + -DCMAKE_INSTALL_PREFIX=/usr \ + -DCMAKE_INSTALL_LIBDIR=/usr/lib \ + -DCMAKE_C_COMPILER=clang \ + -DCMAKE_CXX_COMPILER=clang++ \ + -DCMAKE_CUDA_ARCHITECTURES=${cuda_arch} \ + -DGGML_CUDA=${llamacpp_cuda} \ + -DLLAMA_BUILD_COMMON=OFF \ + -DLLAMA_BUILD_TESTS=OFF \ + -DLLAMA_BUILD_EXAMPLES=OFF \ + -DLLAMA_BUILD_SERVER=OFF \ + && cmake --build build --parallel --config Release \ + && cmake --install build + +WORKDIR /app +COPY rust-toolchain.toml rust-toolchain.toml +RUN curl -sSf https://sh.rustup.rs | sh -s -- -y --no-modify-path --default-toolchain none +ENV PATH="/root/.cargo/bin:$PATH" +RUN cargo install cargo-chef --locked + +FROM deps AS planner +COPY . . 
+RUN cargo chef prepare --recipe-path recipe.json + +FROM deps AS builder +COPY --from=planner /app/recipe.json recipe.json +RUN cargo chef cook \ + --recipe-path recipe.json \ + --profile release-opt \ + --package text-generation-router-llamacpp +COPY . . +RUN cargo build \ + --profile release-opt \ + --package text-generation-router-llamacpp --frozen + +FROM nvidia/cuda:12.8.0-cudnn-runtime-ubuntu24.04 + +RUN apt update && apt install -y \ + python3-venv \ + python3-pip + +RUN python3 -m venv /venv +ENV PATH="/venv/bin:$PATH" + +COPY backends/llamacpp/requirements.txt requirements.txt +RUN pip3 install --no-cache-dir -r requirements.txt + +COPY --from=builder /usr/lib/libllama.so /usr/lib/ +COPY --from=builder /usr/lib/libggml*.so /usr/lib/ +COPY --from=builder /app/target/release-opt/text-generation-router-llamacpp /usr/bin/ + +ENV HF_HUB_ENABLE_HF_TRANSFER=1 + +ENTRYPOINT ["text-generation-router-llamacpp"] diff --git a/backends/llamacpp/Cargo.toml b/backends/llamacpp/Cargo.toml new file mode 100644 index 000000000..18c2ed0a8 --- /dev/null +++ b/backends/llamacpp/Cargo.toml @@ -0,0 +1,21 @@ +[package] +name = "text-generation-router-llamacpp" +version.workspace = true +edition.workspace = true +authors.workspace = true +homepage.workspace = true + +[build-dependencies] +bindgen = "0.71.1" +pkg-config = "0.3.31" + +[dependencies] +async-trait = "0.1.85" +clap = "4.5.27" +num_cpus = "1.16.0" +text-generation-router = { path = "../../router" } +thiserror = "2.0.11" +tokenizers.workspace = true +tokio = "1.43.0" +tokio-stream = "0.1.17" +tracing = "0.1.41" diff --git a/backends/llamacpp/README.md b/backends/llamacpp/README.md new file mode 100644 index 000000000..0971efc5a --- /dev/null +++ b/backends/llamacpp/README.md @@ -0,0 +1,24 @@ +# Llamacpp backend + +If all your dependencies are installed at the system level, running +cargo build should be sufficient. However, if you want to experiment +with different versions of llama.cpp, some additional setup is required. 
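+
+For example, with a system-wide llama.cpp install, building the backend
+crate from the workspace root is expected to be enough (a sketch; the
+release profile and any extra flags are up to you):
+
+    cargo build --package text-generation-router-llamacpp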
+ +## Install llama.cpp + + LLAMACPP_PREFIX=$(pwd)/llama.cpp.out + + git clone https://github.com/ggerganov/llama.cpp + cd llama.cpp + cmake -B build \ + -DCMAKE_INSTALL_PREFIX="$LLAMACPP_PREFIX" \ + -DLLAMA_BUILD_COMMON=OFF \ + -DLLAMA_BUILD_TESTS=OFF \ + -DLLAMA_BUILD_EXAMPLES=OFF \ + -DLLAMA_BUILD_SERVER=OFF + cmake --build build --config Release -j + cmake --install build + +## Build TGI + + PKG_CONFIG_PATH="$LLAMACPP_PREFIX/lib/pkgconfig" cargo build diff --git a/backends/llamacpp/build.rs b/backends/llamacpp/build.rs new file mode 100644 index 000000000..499583cd4 --- /dev/null +++ b/backends/llamacpp/build.rs @@ -0,0 +1,48 @@ +use bindgen::callbacks::{ItemInfo, ParseCallbacks}; +use std::env; +use std::path::PathBuf; + +#[derive(Debug)] +struct PrefixStripper; + +impl ParseCallbacks for PrefixStripper { + fn generated_name_override(&self, item_info: ItemInfo<'_>) -> Option { + item_info.name.strip_prefix("llama_").map(str::to_string) + } +} + +fn main() { + if let Some(cuda_version) = option_env!("CUDA_VERSION") { + let mut version: Vec<&str> = cuda_version.split('.').collect(); + if version.len() > 2 { + version.pop(); + } + let cuda_version = format!("cuda-{}", version.join(".")); + pkg_config::Config::new().probe(&cuda_version).unwrap(); + } + let llama = pkg_config::Config::new().probe("llama").unwrap(); + + for path in &llama.link_paths { + println!("cargo:rustc-link-arg=-Wl,-rpath,{}", path.display()); + } + println!("cargo:rustc-link-arg=-Wl,--disable-new-dtags"); + + let bindings = bindgen::Builder::default() + .clang_args( + llama + .include_paths + .iter() + .map(|p| format!("-I{}", p.display())), + ) + .header_contents("llama_bindings.h", "#include ") + .prepend_enum_name(false) + .parse_callbacks(Box::new(PrefixStripper)) + .parse_callbacks(Box::new(bindgen::CargoCallbacks::new())) + .generate() + .expect("Unable to generate bindings"); + + let out_path = PathBuf::from(env::var("OUT_DIR").unwrap()); + bindings + .write_to_file(out_path.join("llamacpp.rs")) + .expect("Couldn't write bindings!"); +} diff --git a/backends/llamacpp/requirements.txt b/backends/llamacpp/requirements.txt new file mode 100644 index 000000000..5c5d0cc7f --- /dev/null +++ b/backends/llamacpp/requirements.txt @@ -0,0 +1,3 @@ +transformers==4.48.2 +huggingface-hub==0.28.1 +hf-transfer==0.1.9 diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs new file mode 100644 index 000000000..1566e1bf9 --- /dev/null +++ b/backends/llamacpp/src/backend.rs @@ -0,0 +1,679 @@ +mod llamacpp { + #![allow(non_upper_case_globals)] + #![allow(non_camel_case_types)] + #![allow(non_snake_case)] + #![allow(dead_code)] + include!(concat!(env!("OUT_DIR"), "/llamacpp.rs")); +} +use async_trait::async_trait; +use std::ffi::CString; +use std::mem::replace; +use std::str::FromStr; +use std::sync::{mpsc, Once}; +use text_generation_router::infer::{Backend, GeneratedText, InferError, InferStreamResponse}; +use text_generation_router::validation::ValidGenerateRequest; +use text_generation_router::{FinishReason, Token}; +use thiserror::Error; +use tokenizers::Tokenizer; +use tokio::sync::mpsc::{unbounded_channel, UnboundedSender}; +use tokio::sync::{oneshot, watch}; +use tokio::task::{spawn, spawn_blocking}; +use tokio::time::{timeout, Duration, Instant}; +use tokio_stream::wrappers::UnboundedReceiverStream; +use tracing::instrument; +use tracing::{debug, error, info, trace, warn}; + +#[derive(Debug, Clone, Copy)] +pub enum LlamacppSplitMode { + GPU(usize), + Layer, + Row, +} + +impl FromStr for 
LlamacppSplitMode { + type Err = String; + fn from_str(s: &str) -> Result { + match s.to_lowercase().as_str() { + "layer" => Ok(LlamacppSplitMode::Layer), + "row" => Ok(LlamacppSplitMode::Row), + _ => match s.parse::() { + Ok(n) => Ok(LlamacppSplitMode::GPU(n)), + Err(_) => Err("Choose a GPU number or `layer` or `row`".to_string()), + }, + } + } +} + +#[derive(Debug, Clone, Copy, clap::ValueEnum)] +pub enum LlamacppNuma { + Disabled, + Distribute, + Isolate, + Numactl, + Mirror, +} + +#[allow(non_camel_case_types)] +#[derive(Debug, Clone, Copy, clap::ValueEnum)] +pub enum LlamacppGGMLType { + F32, + F16, + Q4_0, + Q4_1, + Q5_0, + Q5_1, + Q8_0, + Q8_1, + Q2_K, + Q3_K, + Q4_K, + Q5_K, + Q6_K, + Q8_K, + IQ2_XXS, + IQ2_XS, + IQ3_XXS, + IQ1_S, + IQ4_NL, + IQ3_S, + IQ2_S, + IQ4_XS, + I8, + I16, + I32, + I64, + F64, + IQ1_M, + BF16, + TQ1_0, + TQ2_0, +} + +// TODO: macro +impl LlamacppGGMLType { + fn to_ggml_type(self) -> llamacpp::ggml_type { + match self { + LlamacppGGMLType::F32 => llamacpp::GGML_TYPE_F32, + LlamacppGGMLType::F16 => llamacpp::GGML_TYPE_F16, + LlamacppGGMLType::Q4_0 => llamacpp::GGML_TYPE_Q4_0, + LlamacppGGMLType::Q4_1 => llamacpp::GGML_TYPE_Q4_1, + LlamacppGGMLType::Q5_0 => llamacpp::GGML_TYPE_Q5_0, + LlamacppGGMLType::Q5_1 => llamacpp::GGML_TYPE_Q5_1, + LlamacppGGMLType::Q8_0 => llamacpp::GGML_TYPE_Q8_0, + LlamacppGGMLType::Q8_1 => llamacpp::GGML_TYPE_Q8_1, + LlamacppGGMLType::Q2_K => llamacpp::GGML_TYPE_Q2_K, + LlamacppGGMLType::Q3_K => llamacpp::GGML_TYPE_Q3_K, + LlamacppGGMLType::Q4_K => llamacpp::GGML_TYPE_Q4_K, + LlamacppGGMLType::Q5_K => llamacpp::GGML_TYPE_Q5_K, + LlamacppGGMLType::Q6_K => llamacpp::GGML_TYPE_Q6_K, + LlamacppGGMLType::Q8_K => llamacpp::GGML_TYPE_Q8_K, + LlamacppGGMLType::IQ2_XXS => llamacpp::GGML_TYPE_IQ2_XXS, + LlamacppGGMLType::IQ2_XS => llamacpp::GGML_TYPE_IQ2_XS, + LlamacppGGMLType::IQ3_XXS => llamacpp::GGML_TYPE_IQ3_XXS, + LlamacppGGMLType::IQ1_S => llamacpp::GGML_TYPE_IQ1_S, + LlamacppGGMLType::IQ4_NL => llamacpp::GGML_TYPE_IQ4_NL, + LlamacppGGMLType::IQ3_S => llamacpp::GGML_TYPE_IQ3_S, + LlamacppGGMLType::IQ2_S => llamacpp::GGML_TYPE_IQ2_S, + LlamacppGGMLType::IQ4_XS => llamacpp::GGML_TYPE_IQ4_XS, + LlamacppGGMLType::I8 => llamacpp::GGML_TYPE_I8, + LlamacppGGMLType::I16 => llamacpp::GGML_TYPE_I16, + LlamacppGGMLType::I32 => llamacpp::GGML_TYPE_I32, + LlamacppGGMLType::I64 => llamacpp::GGML_TYPE_I64, + LlamacppGGMLType::F64 => llamacpp::GGML_TYPE_F64, + LlamacppGGMLType::IQ1_M => llamacpp::GGML_TYPE_IQ1_M, + LlamacppGGMLType::BF16 => llamacpp::GGML_TYPE_BF16, + LlamacppGGMLType::TQ1_0 => llamacpp::GGML_TYPE_TQ1_0, + LlamacppGGMLType::TQ2_0 => llamacpp::GGML_TYPE_TQ2_0, + } + } +} + +pub struct LlamacppConfig { + pub model_gguf: String, + pub max_batch_total_tokens: usize, + pub max_physical_batch_total_tokens: usize, + pub max_batch_size: usize, + pub batch_timeout: Duration, + pub n_threads: usize, + pub n_threads_batch: usize, + pub n_gpu_layers: usize, + pub split_mode: LlamacppSplitMode, + pub numa: LlamacppNuma, + pub defrag_threshold: f32, + pub use_mmap: bool, + pub use_mlock: bool, + pub offload_kqv: bool, + pub flash_attention: bool, + pub type_k: LlamacppGGMLType, + pub type_v: LlamacppGGMLType, +} + +#[derive(Debug)] +struct LlamacppRequest { + input_ids: Vec, + top_k: i32, + top_p: f32, + typical_p: f32, + min_keep: usize, + temp: f32, + seed: u32, + penalty_last_n: i32, + penalty_repeat: f32, + penalty_freq: f32, + penalty_present: f32, + max_new_tokens: usize, + tx: UnboundedSender>, + time: Instant, +} + +pub struct LlamacppBackend 
{ + tx: UnboundedSender, + status: watch::Receiver, +} + +impl LlamacppRequest { + fn new( + from: &ValidGenerateRequest, + tx: UnboundedSender>, + ) -> Option { + from.input_ids.as_ref().map(|input_ids| LlamacppRequest { + input_ids: input_ids.iter().map(|&x| x as i32).collect(), + top_k: from.parameters.top_k as _, + top_p: from.parameters.top_p as _, + typical_p: from.parameters.typical_p as _, + min_keep: 0, // disabled + temp: from.parameters.temperature as _, + seed: from.parameters.seed as _, + penalty_last_n: 64, // 0 = disabled, -1 = context size + penalty_repeat: from.parameters.repetition_penalty as _, + penalty_freq: from.parameters.frequency_penalty as _, + penalty_present: 0.0, // disabled + max_new_tokens: from.stopping_parameters.max_new_tokens as _, + tx, + time: Instant::now(), + }) + } +} + +struct Llamacpp { + model: *mut llamacpp::llama_model, + ctx: *mut llamacpp::llama_context, + vocab: *const llamacpp::llama_vocab, + logprobs: Vec, + batch: llamacpp::llama_batch, +} + +extern "C" fn llamacpp_log_callback( + level: llamacpp::ggml_log_level, + msg: *const std::os::raw::c_char, + _user_data: *mut std::os::raw::c_void, +) { + let cmsg = unsafe { std::ffi::CStr::from_ptr(msg) }; + let rmsg = cmsg.to_string_lossy().trim_end_matches('\n').to_string(); + + match level { + llamacpp::GGML_LOG_LEVEL_DEBUG => debug!(target: "llamacpp", "{}", rmsg), + llamacpp::GGML_LOG_LEVEL_INFO => info!(target: "llamacpp", "{}", rmsg), + llamacpp::GGML_LOG_LEVEL_WARN => warn!(target: "llamacpp", "{}", rmsg), + llamacpp::GGML_LOG_LEVEL_ERROR => error!(target: "llamacpp", "{}", rmsg), + _ => trace!(target: "llamacpp", "{}", rmsg), + } +} + +impl Llamacpp { + fn new(conf: LlamacppConfig) -> Result { + let gguf = CString::new(conf.model_gguf)?; + + let model = unsafe { + let mut params = llamacpp::model_default_params(); + params.n_gpu_layers = conf.n_gpu_layers as _; + params.split_mode = match conf.split_mode { + LlamacppSplitMode::GPU(_) => llamacpp::LLAMA_SPLIT_MODE_NONE, + LlamacppSplitMode::Layer => llamacpp::LLAMA_SPLIT_MODE_LAYER, + LlamacppSplitMode::Row => llamacpp::LLAMA_SPLIT_MODE_ROW, + }; + params.main_gpu = match conf.split_mode { + LlamacppSplitMode::GPU(n) => n as _, + _ => 0, + }; + params.use_mmap = conf.use_mmap; + params.use_mlock = conf.use_mlock; + llamacpp::model_load_from_file(gguf.as_ptr(), params) + }; + if model.is_null() { + return Err(BackendError::Llamacpp("Failed to load model".to_string())); + } + let ctx = unsafe { + let mut params = llamacpp::context_default_params(); + params.n_ctx = conf.max_batch_total_tokens as _; + params.n_batch = conf.max_batch_total_tokens as _; + params.n_ubatch = conf.max_physical_batch_total_tokens as _; + params.n_seq_max = conf.max_batch_size as _; + params.n_threads = conf.n_threads as _; + params.n_threads_batch = conf.n_threads_batch as _; + params.defrag_thold = conf.defrag_threshold; + params.offload_kqv = conf.offload_kqv; + params.flash_attn = conf.flash_attention; + params.type_k = conf.type_k.to_ggml_type(); + params.type_v = conf.type_v.to_ggml_type(); + params.no_perf = true; + llamacpp::init_from_model(model, params) + }; + if ctx.is_null() { + return Err(BackendError::Llamacpp("Failed to init context".to_string())); + } + let vocab = unsafe { llamacpp::model_get_vocab(model) }; + if vocab.is_null() { + return Err(BackendError::Llamacpp("Failed to get vocab".to_string())); + } + let n_tokens = unsafe { llamacpp::vocab_n_tokens(vocab) }; + let mut logprobs = Vec::with_capacity(n_tokens as usize); + + for token in 
0..n_tokens { + logprobs.push(llamacpp::llama_token_data { + id: token, + logit: 0.0, + p: 0.0, + }); + } + let batch = unsafe { llamacpp::batch_init(conf.max_batch_total_tokens as _, 0, 1) }; + Ok(Llamacpp { + model, + ctx, + vocab, + logprobs, + batch, + }) + } + + fn decode(&mut self) -> i32 { + unsafe { llamacpp::decode(self.ctx, self.batch) } + } + + fn clear_kv_cache(&mut self, seq_id: llamacpp::llama_seq_id) { + unsafe { + llamacpp::kv_cache_seq_rm(self.ctx, seq_id, -1, -1); + } + } + + fn batch_push( + &mut self, + token: llamacpp::llama_token, + pos: llamacpp::llama_pos, + seq_id: llamacpp::llama_seq_id, + logits: bool, + ) -> usize { + let n = self.batch.n_tokens as usize; + unsafe { + *self.batch.token.add(n) = token; + *self.batch.pos.add(n) = pos; + *self.batch.n_seq_id.add(n) = 1; + *(*self.batch.seq_id.add(n)).add(0) = seq_id; + *self.batch.logits.add(n) = logits as i8; + } + self.batch.n_tokens += 1; + n + } +} + +impl Drop for Llamacpp { + fn drop(&mut self) { + if !self.ctx.is_null() { + unsafe { llamacpp::free(self.ctx) }; + } + if !self.model.is_null() { + unsafe { llamacpp::model_free(self.model) }; + } + unsafe { llamacpp::batch_free(self.batch) }; + } +} + +struct LlamacppSampler { + chain: *mut llamacpp::llama_sampler, +} + +impl LlamacppSampler { + fn new(req: &LlamacppRequest) -> Option { + let chain = unsafe { + let params = llamacpp::sampler_chain_default_params(); + llamacpp::sampler_chain_init(params) + }; + if chain.is_null() { + error!("Failed to init sampler"); + return None; + } + let (top_k, top_p, typical_p, temp, penalties, dist) = unsafe { + ( + llamacpp::sampler_init_top_k(req.top_k), + llamacpp::sampler_init_top_p(req.top_p, req.min_keep), + llamacpp::sampler_init_typical(req.typical_p, req.min_keep), + llamacpp::sampler_init_temp(req.temp), + llamacpp::sampler_init_penalties( + req.penalty_last_n, + req.penalty_repeat, + req.penalty_freq, + req.penalty_present, + ), + llamacpp::sampler_init_dist(req.seed), + ) + }; + let all = &[ + ("top_k", top_k), + ("top_p", top_p), + ("typical_p", typical_p), + ("temp", temp), + ("penalties", penalties), + ("dist", dist), + ]; + let mut failed = false; + + for (k, v) in all { + if v.is_null() { + error!("Failed to init {k} sampler"); + failed = true; + } else { + unsafe { llamacpp::sampler_chain_add(chain, *v) }; + } + } + if failed { + unsafe { llamacpp::sampler_free(chain) }; + None + } else { + Some(LlamacppSampler { chain }) + } + } + + fn sample(&self, llamacpp: &mut Llamacpp, idx: usize) -> (llamacpp::llama_token, f32) { + let logits = unsafe { llamacpp::get_logits_ith(llamacpp.ctx, idx as _) }; + for (token, logprob) in llamacpp.logprobs.iter_mut().enumerate() { + *logprob = llamacpp::llama_token_data { + id: token as _, + logit: unsafe { *logits.add(token) }, + p: 0.0, + }; + } + let mut view = llamacpp::llama_token_data_array { + data: llamacpp.logprobs.as_mut_ptr(), + size: llamacpp.logprobs.len(), + selected: -1, + sorted: false, + }; + unsafe { + llamacpp::sampler_apply(self.chain, &mut view); + let logprob = *view.data.offset(view.selected as _); + llamacpp::sampler_accept(self.chain, logprob.id); + (logprob.id, logprob.p.ln()) + } + } +} + +impl Drop for LlamacppSampler { + fn drop(&mut self) { + if !self.chain.is_null() { + unsafe { llamacpp::sampler_free(self.chain) }; + } + } +} + +struct LlamacppSeq { + id: usize, + batch_pos: usize, + token: llamacpp::llama_token, + pos: llamacpp::llama_pos, + sampler: LlamacppSampler, + text: String, + n_new_tokens: usize, + running: bool, +} + +static INIT: 
Once = Once::new(); + +impl LlamacppBackend { + pub fn new( + conf: LlamacppConfig, + tokenizer: Tokenizer, + ) -> ( + Self, + oneshot::Receiver>, + watch::Sender, + ) { + // Setup llama & export logs, once and for all + INIT.call_once(|| unsafe { + llamacpp::log_set(Some(llamacpp_log_callback), std::ptr::null_mut()); + llamacpp::backend_init(); + llamacpp::numa_init(match conf.numa { + LlamacppNuma::Disabled => llamacpp::GGML_NUMA_STRATEGY_DISABLED, + LlamacppNuma::Distribute => llamacpp::GGML_NUMA_STRATEGY_DISTRIBUTE, + LlamacppNuma::Isolate => llamacpp::GGML_NUMA_STRATEGY_ISOLATE, + LlamacppNuma::Numactl => llamacpp::GGML_NUMA_STRATEGY_NUMACTL, + LlamacppNuma::Mirror => llamacpp::GGML_NUMA_STRATEGY_MIRROR, + }); + }); + + let (status_tx, status_rx) = watch::channel(false); + let (shutdown_tx, shutdown_rx) = watch::channel(false); + let (ok_tx, ok_rx) = oneshot::channel(); + let (tx, mut rx) = unbounded_channel::(); + let (sync_tx, sync_rx) = mpsc::channel(); + + spawn(async move { + let mut n_tokens = 0; + let mut requests = Vec::with_capacity(conf.max_batch_size); + + let flush = |requests: &mut Vec<_>, n_tokens: &mut usize| { + if !requests.is_empty() { + let _ = + sync_tx.send(replace(requests, Vec::with_capacity(conf.max_batch_size))); + *n_tokens = 0; + } + }; + loop { + match timeout(conf.batch_timeout, rx.recv()).await { + Ok(Some(request)) => { + let n_tokens_to_add = request.input_ids.len(); + + if n_tokens + n_tokens_to_add > conf.max_batch_total_tokens { + flush(&mut requests, &mut n_tokens); + } + n_tokens += n_tokens_to_add; + requests.push(request); + + if requests.len() == conf.max_batch_size { + flush(&mut requests, &mut n_tokens); + } + } + Ok(None) => break, // closed + Err(_) => flush(&mut requests, &mut n_tokens), // timeout + } + } + }); + + spawn_blocking(move || { + let mut llamacpp = match Llamacpp::new(conf) { + Ok(v) => { + let _ = ok_tx.send(Ok(())); + v + } + Err(e) => { + let _ = ok_tx.send(Err(e)); + return; + } + }; + let vocab = tokenizer.get_added_vocabulary(); + + // health() returns true + let _ = status_tx.send(true); + + while let Ok(requests) = sync_rx.recv() { + if *shutdown_rx.borrow() { + break; + } + let start_time = Instant::now(); + let mut seqs: Vec = Vec::with_capacity(requests.len()); + llamacpp.batch.n_tokens = 0; + + for (seq_id, request) in requests.iter().enumerate() { + debug!("Request: {:?}", request); + // TODO remove this + let sampler = match LlamacppSampler::new(request) { + Some(sampler) => sampler, + _ => { + let _ = request.tx.send(Err(InferError::IncompleteGeneration)); + continue; + } + }; + let last_pos = request.input_ids.len() - 1; + + for (pos, &token_id) in request.input_ids.iter().enumerate() { + llamacpp.batch_push( + token_id as llamacpp::llama_token, + pos as llamacpp::llama_pos, + seq_id as llamacpp::llama_seq_id, + pos == last_pos, // check samplers + ); + } + seqs.push(LlamacppSeq { + id: seq_id, + batch_pos: llamacpp.batch.n_tokens as usize - 1, + token: llamacpp::LLAMA_TOKEN_NULL, + pos: last_pos as llamacpp::llama_pos + 1, + sampler, + text: String::with_capacity(1024), + n_new_tokens: 0, + running: true, + }); + } + while llamacpp.batch.n_tokens > 0 { + if llamacpp.decode() != 0 { + warn!("llama_decode failed, clearing kv cache"); + llamacpp.clear_kv_cache(-1); + for seq in seqs.iter_mut() { + let _ = requests[seq.id] + .tx + .send(Err(InferError::IncompleteGeneration)); + seq.running = false; + } + break; + } + for seq in seqs.iter_mut() { + if !seq.running { + continue; + } + let (next, logprob) = 
seq.sampler.sample(&mut llamacpp, seq.batch_pos); + seq.n_new_tokens += 1; + seq.token = next; + + let piece = match tokenizer.decode(&[next as u32], false) { + Ok(piece) => piece, + Err(e) => { + error!("Failed to decode token: {e}"); + let _ = requests[seq.id] + .tx + .send(Err(InferError::IncompleteGeneration)); + seq.running = false; + continue; + } + }; + let special = vocab.is_special_token(&piece); + + if !special { + seq.text.push_str(&piece); + } + let token = Token { + id: next as _, + text: piece, + logprob, + special, + }; + let finish: Option = { + if unsafe { llamacpp::vocab_is_eog(llamacpp.vocab, next) } { + Some(FinishReason::EndOfSequenceToken) + } else if seq.n_new_tokens == requests[seq.id].max_new_tokens { + Some(FinishReason::Length) + } else { + None + } + }; + if let Some(reason) = finish { + let _ = requests[seq.id].tx.send(Ok(InferStreamResponse::End { + token, + top_tokens: vec![], + generated_text: GeneratedText { + text: seq.text.clone(), + generated_tokens: seq.n_new_tokens as _, + finish_reason: reason, + seed: Some(requests[seq.id].seed as _), + }, + start: start_time, + queued: requests[seq.id].time, + })); + seq.running = false; + continue; + } + let _ = requests[seq.id] + .tx + .send(Ok(InferStreamResponse::Intermediate { + token, + top_tokens: vec![], + })); + } + // generate a new batch + llamacpp.batch.n_tokens = 0; + + for seq in seqs.iter_mut() { + if seq.running { + seq.batch_pos = + llamacpp.batch_push(seq.token, seq.pos, seq.id as _, true); + seq.pos += 1; + } else { + llamacpp.clear_kv_cache(seq.id as _); + } + } + } + } + }); + ( + Self { + tx, + status: status_rx, + }, + ok_rx, + shutdown_tx, + ) + } +} + +#[async_trait] +impl Backend for LlamacppBackend { + #[instrument(skip_all)] + fn schedule( + &self, + request: ValidGenerateRequest, + ) -> Result>, InferError> { + debug!(?request); + let (tx, rx) = unbounded_channel::>(); + match LlamacppRequest::new(&request, tx) { + Some(v) => match self.tx.send(v) { + Err(e) => Err(InferError::GenerationError(e.to_string())), + _ => Ok(UnboundedReceiverStream::new(rx)), + }, + _ => Err(InferError::GenerationError("Bad request".to_string())), + } + } + + async fn health(&self, _: bool) -> bool { + *self.status.borrow() + } + + fn name(&self) -> &'static str { + "llamacpp" + } +} + +#[derive(Debug, Error)] +pub enum BackendError { + #[error("CString error: {0}")] + CStringError(#[from] std::ffi::NulError), + #[error("Llamacpp error: {0}")] + Llamacpp(String), +} diff --git a/backends/llamacpp/src/main.rs b/backends/llamacpp/src/main.rs new file mode 100644 index 000000000..5a07acdcd --- /dev/null +++ b/backends/llamacpp/src/main.rs @@ -0,0 +1,284 @@ +mod backend; + +use backend::{ + BackendError, LlamacppBackend, LlamacppConfig, LlamacppGGMLType, LlamacppNuma, + LlamacppSplitMode, +}; +use clap::Parser; +use text_generation_router::{logging, server, usage_stats}; +use thiserror::Error; +use tokenizers::{FromPretrainedParameters, Tokenizer}; +use tokio::sync::oneshot::error::RecvError; +use tracing::{error, warn}; + +/// Backend Configuration +#[derive(Parser, Debug)] +#[clap(author, version, about, long_about = None)] +struct Args { + /// Name of the model to load. + #[clap(long, env)] + model_id: String, + + /// Revision of the model. + #[clap(default_value = "main", long, env)] + revision: String, + + /// Path to the GGUF model file for inference. + #[clap(long, env)] + model_gguf: String, // TODO Option() with hf->gguf & quantize + + /// Number of threads to use for generation. 
+ #[clap(long, env)] + n_threads: Option, + + /// Number of threads to use for batch processing. + #[clap(long, env)] + n_threads_batch: Option, + + /// Number of layers to store in VRAM. + #[clap(default_value = "0", long, env)] + n_gpu_layers: usize, + + /// Split the model across multiple GPUs. + #[clap(default_value = "layer", long, env)] + split_mode: LlamacppSplitMode, + + /// Defragment the KV cache if holes/size > threshold. + #[clap(default_value = "-1.0", long, env)] + defrag_threshold: f32, + + /// Enable NUMA optimizations. + #[clap(default_value = "disabled", value_enum, long, env)] + numa: LlamacppNuma, + + /// Use memory mapping for the model. + #[clap(long, env)] + use_mmap: bool, + + /// Use memory locking to prevent swapping. + #[clap(long, env)] + use_mlock: bool, + + /// Enable offloading of KQV operations to the GPU. + #[clap(long, env)] + offload_kqv: bool, + + /// Enable flash attention for faster inference. (EXPERIMENTAL) + #[clap(long, env)] + flash_attention: bool, + + /// Data type used for K cache. + #[clap(default_value = "f16", value_enum, long, env)] + type_k: LlamacppGGMLType, + + /// Data type used for V cache. + #[clap(default_value = "f16", value_enum, long, env)] + type_v: LlamacppGGMLType, + + /// Number of tokenizer workers used for payload validation and truncation. + #[clap(default_value = "2", long, env)] + validation_workers: usize, + + /// Maximum number of concurrent requests. + #[clap(long, env)] + max_concurrent_requests: Option, + + /// Maximum number of input tokens per request. + #[clap(default_value = "1024", long, env)] + max_input_tokens: usize, + + /// Maximum number of total tokens (input + output) per request. + #[clap(default_value = "2048", long, env)] + max_total_tokens: usize, + + /// Maximum number of tokens in a batch. + #[clap(long, env)] + max_batch_total_tokens: Option, + + /// Maximum number of tokens in a physical batch. + #[clap(long, env)] + max_physical_batch_total_tokens: Option, + + /// Maximum number of requests per batch. + #[clap(long, env)] + max_batch_size: Option, + + /// IP address to listen on. + #[clap(default_value = "0.0.0.0", long)] + hostname: String, + + /// Port to listen on. + #[clap(default_value = "3000", long, short, env)] + port: u16, + + /// Enable JSON output format. + #[clap(long, env)] + json_output: bool, + + /// OTLP endpoint for telemetry data. + #[clap(long, env)] + otlp_endpoint: Option, + + /// Service name for OTLP telemetry. + #[clap(default_value = "text-generation-inference.router", long, env)] + otlp_service_name: String, + + /// Allowed origins for CORS. + #[clap(long, env)] + cors_allow_origin: Option>, + + /// Path to the tokenizer configuration file. + #[clap(long, env)] + tokenizer_config_path: Option, + + /// Disable grammar support. + #[clap(long, env)] + disable_grammar_support: bool, + + /// Maximum number of inputs per request. + #[clap(default_value = "4", long, env)] + max_client_batch_size: usize, + + /// Level of usage statistics collection. + #[clap(default_value = "on", long, env)] + usage_stats: usage_stats::UsageStatsLevel, + + /// Maximum payload size in bytes. 
+ #[clap(default_value = "2000000", long, env)] + payload_limit: usize, +} + +#[tokio::main] +async fn main() -> Result<(), RouterError> { + let args = Args::parse(); + + logging::init_logging(args.otlp_endpoint, args.otlp_service_name, args.json_output); + + let n_threads = match args.n_threads { + Some(0) | None => num_cpus::get(), + Some(threads) => threads, + }; + let n_threads_batch = match args.n_threads_batch { + Some(0) | None => n_threads, + Some(threads) => threads, + }; + let max_batch_size = match args.max_batch_size { + Some(0) | None => n_threads_batch, + Some(threads) => threads, + }; + let max_batch_total_tokens = match args.max_batch_total_tokens { + None => max_batch_size * args.max_total_tokens, + Some(size) => size, + }; + let max_physical_batch_total_tokens = match args.max_physical_batch_total_tokens { + None => max_batch_total_tokens, + Some(size) => size, + }; + let max_concurrent_requests = match args.max_concurrent_requests { + None => max_batch_size * 2, + Some(size) => size, + }; + if args.max_input_tokens >= args.max_total_tokens { + return Err(RouterError::ArgumentValidation( + "`max_input_tokens` must be < `max_total_tokens`".to_string(), + )); + } + if args.max_total_tokens > max_batch_total_tokens { + return Err(RouterError::ArgumentValidation( + "`max_total_tokens` must be <= `max_batch_total_tokens`".to_string(), + )); + } + if max_batch_size * args.max_total_tokens > max_batch_total_tokens { + return Err(RouterError::ArgumentValidation( + "`max_batch_size` * `max_total_tokens` must be <= `max_batch_total_tokens`".to_string(), + )); + } + + // TODO: check if we use the same cache of Server + // check if llamacpp is faster + let tokenizer = { + let token = std::env::var("HF_TOKEN") + .or_else(|_| std::env::var("HUGGING_FACE_HUB_TOKEN")) + .ok(); + let params = FromPretrainedParameters { + revision: args.revision.clone(), + token, + ..Default::default() + }; + Tokenizer::from_pretrained(args.model_id.clone(), Some(params))? 
+ }; + + let (backend, ok, shutdown) = LlamacppBackend::new( + LlamacppConfig { + model_gguf: args.model_gguf, + n_threads, + n_threads_batch, + n_gpu_layers: args.n_gpu_layers, + split_mode: args.split_mode, + defrag_threshold: args.defrag_threshold, + numa: args.numa, + use_mmap: args.use_mmap, + use_mlock: args.use_mlock, + flash_attention: args.flash_attention, + type_k: args.type_k, + type_v: args.type_v, + offload_kqv: args.offload_kqv, + max_batch_total_tokens, + max_physical_batch_total_tokens, + max_batch_size, + batch_timeout: tokio::time::Duration::from_millis(5), + }, + tokenizer, + ); + ok.await??; + + if cfg!(debug_assertions) { + warn!("Graceful shutdown disabled!"); + let _ = tokio::task::spawn(async move { + let _ = tokio::signal::ctrl_c().await; + let _ = shutdown.send(true); + }); + } + + server::run( + backend, + max_concurrent_requests, + 0, // max_best_of + 0, // max_stop_sequences + 0, // max_top_n_tokens + args.max_input_tokens, + args.max_total_tokens, + args.validation_workers, + None, // api_key + args.model_id, // tokenizer_name + args.tokenizer_config_path, + Some(args.revision), + false, // trust_remote_code + args.hostname, + args.port, + args.cors_allow_origin, + false, // ngrok, + None, // ngrok_authtoken, + None, // ngrok_edge, + args.disable_grammar_support, + args.max_client_batch_size, + args.usage_stats, + args.payload_limit, + ) + .await?; + Ok(()) +} + +#[derive(Debug, Error)] +enum RouterError { + #[error("Argument validation error: {0}")] + ArgumentValidation(String), + #[error("Tokenizer error: {0}")] + Tokenizer(#[from] tokenizers::Error), + #[error("Backend error: {0}")] + Backend(#[from] BackendError), + #[error("WebServer error: {0}")] + WebServer(#[from] server::WebServerError), + #[error("Recv error: {0}")] + RecvError(#[from] RecvError), +} diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index 8fcba516b..e073353fc 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -52,6 +52,8 @@ - sections: - local: backends/trtllm title: TensorRT-LLM + - local: backends/llamacpp + title: Llamacpp title: Backends - sections: - local: reference/launcher diff --git a/docs/source/backends/llamacpp.md b/docs/source/backends/llamacpp.md new file mode 100644 index 000000000..dbd93e866 --- /dev/null +++ b/docs/source/backends/llamacpp.md @@ -0,0 +1,120 @@ +# Llamacpp Backend + +The llamacpp backend facilitates the deployment of large language models +(LLMs) by integrating [llama.cpp][llama.cpp], an advanced inference engine +optimized for both CPU and GPU computation. This backend is a component +of Hugging Face’s **Text Generation Inference (TGI)** suite, +specifically designed to streamline the deployment of LLMs in production +environments. + +## Key Capabilities + +- Full compatibility with GGUF format and all quantization formats + (GGUF-related constraints may be mitigated dynamically by on-the-fly + generation in future updates) +- Optimized inference on CPU and GPU architectures +- Containerized deployment, eliminating dependency complexity +- Seamless interoperability with the Hugging Face ecosystem + +## Model Compatibility + +This backend leverages models formatted in **GGUF**, providing an +optimized balance between computational efficiency and model accuracy. +You will find the best models on [Hugging Face][GGUF]. 
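If you prefer to fetch a GGUF file programmatically rather than with `curl` (see the
model preparation step below), a small sketch using the `hf-hub` crate (the same crate
TGI itself uses to talk to the Hub) could look like the following. It assumes the
crate's blocking (`ureq`-based) API is enabled; the repository and file names simply
mirror the Qwen example used later in this guide, and `HF_TOKEN` is only needed for
gated or private repositories.

```rust
use hf_hub::api::sync::ApiBuilder;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Reuse HF_TOKEN when it is set; anonymous access works for public repositories.
    let api = ApiBuilder::new()
        .with_token(std::env::var("HF_TOKEN").ok())
        .build()?;

    // Download the quantized GGUF file (or reuse it from the local cache).
    let gguf = api
        .model("Qwen/Qwen2.5-3B-Instruct-GGUF".to_string())
        .get("qwen2.5-3b-instruct-q4_0.gguf")?;

    println!("GGUF model available at {}", gguf.display());
    Ok(())
}
```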
+ +## Build Docker image + +For optimal performance, the Docker image is compiled with native CPU +instructions, thus it's highly recommended to execute the container on +the host used during the build process. Efforts are ongoing to enhance +portability while maintaining high computational efficiency. + +```bash +docker build \ + -t tgi-llamacpp \ + https://github.com/huggingface/text-generation-inference.git \ + -f Dockerfile_llamacpp +``` + +### Build parameters + +| Parameter | Description | +| ------------------------------------ | --------------------------------- | +| `--build-arg llamacpp_version=bXXXX` | Specific version of llama.cpp | +| `--build-arg llamacpp_cuda=ON` | Enables CUDA acceleration | +| `--build-arg cuda_arch=ARCH` | Defines target CUDA architecture | + +## Model preparation + +Retrieve a GGUF model and store it in a specific directory, for example: + +```bash +mkdir -p ~/models +cd ~/models +curl -LOJ "https://huggingface.co/Qwen/Qwen2.5-3B-Instruct-GGUF/resolve/main/qwen2.5-3b-instruct-q4_0.gguf?download=true" +``` + +## Run Docker image + +### CPU-based inference + +```bash +docker run \ + -p 3000:3000 \ + -e "HF_TOKEN=$HF_TOKEN" \ + -v "$HOME/models:/models" \ + tgi-llamacpp \ + --model-id "Qwen/Qwen2.5-3B-Instruct" \ + --model-gguf "/models/qwen2.5-3b-instruct-q4_0.gguf" +``` + +### GPU-Accelerated inference + +```bash +docker run \ + --gpus all \ + -p 3000:3000 \ + -e "HF_TOKEN=$HF_TOKEN" \ + -v "$HOME/models:/models" \ + tgi-llamacpp \ + --n-gpu-layers 99 + --model-id "Qwen/Qwen2.5-3B-Instruct" \ + --model-gguf "/models/qwen2.5-3b-instruct-q4_0.gguf" +``` + +## Advanced parameters + +A full listing of configurable parameters is available in the `--help`: + +```bash +docker run tgi-llamacpp --help + +``` + +The table below summarizes key options: + +| Parameter | Description | +|-------------------------------------|------------------------------------------------------------------------| +| `--n-threads` | Number of threads to use for generation | +| `--n-threads-batch` | Number of threads to use for batch processing | +| `--n-gpu-layers` | Number of layers to store in VRAM | +| `--split-mode` | Split the model across multiple GPUs | +| `--defrag-threshold` | Defragment the KV cache if holes/size > threshold | +| `--numa` | Enable NUMA optimizations | +| `--use-mmap` | Use memory mapping for the model | +| `--use-mlock` | Use memory locking to prevent swapping | +| `--offload-kqv` | Enable offloading of KQV operations to the GPU | +| `--flash-attention` | Enable flash attention for faster inference | +| `--type-k` | Data type used for K cache | +| `--type-v` | Data type used for V cache | +| `--validation-workers` | Number of tokenizer workers used for payload validation and truncation | +| `--max-concurrent-requests` | Maximum number of concurrent requests | +| `--max-input-tokens` | Maximum number of input tokens per request | +| `--max-total-tokens` | Maximum number of total tokens (input + output) per request | +| `--max-batch-total-tokens` | Maximum number of tokens in a batch | +| `--max-physical-batch-total-tokens` | Maximum number of tokens in a physical batch | +| `--max-batch-size` | Maximum number of requests per batch | + +--- +[llama.cpp]: https://github.com/ggerganov/llama.cpp +[GGUF]: https://huggingface.co/models?library=gguf&sort=trending diff --git a/docs/source/multi_backend_support.md b/docs/source/multi_backend_support.md index c4df15bc2..03d6d30be 100644 --- a/docs/source/multi_backend_support.md +++ 
b/docs/source/multi_backend_support.md @@ -11,3 +11,5 @@ TGI remains consistent across backends, allowing you to switch between them seam * **[TGI TRTLLM backend](./backends/trtllm)**: This backend leverages NVIDIA's TensorRT library to accelerate LLM inference. It utilizes specialized optimizations and custom kernels for enhanced performance. However, it requires a model-specific compilation step for each GPU architecture. +* **[TGI Llamacpp backend](./backends/llamacpp)**: This backend facilitates the deployment of large language models + (LLMs) by integrating [llama.cpp][llama.cpp], an advanced inference engine optimized for both CPU and GPU computation. From f0ed76583c80912072a8eeb8c244229ba46559f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Tue, 18 Feb 2025 10:03:53 +0100 Subject: [PATCH 088/183] Use eetq kernel from the hub (#3029) * Use eetq kernel from the hub * Fixing the CI. --------- Co-authored-by: Nicolas Patry --- Dockerfile | 9 - flake.lock | 7 +- flake.nix | 2 +- nix/server.nix | 4 +- server/Makefile | 1 - server/hf-kernels.lock | 198 +++++++++++++++++++ server/pyproject.toml | 1 + server/text_generation_server/layers/eetq.py | 10 +- 8 files changed, 213 insertions(+), 19 deletions(-) diff --git a/Dockerfile b/Dockerfile index 39c9f66b0..2d769c153 100644 --- a/Dockerfile +++ b/Dockerfile @@ -121,13 +121,6 @@ COPY server/Makefile-awq Makefile # Build specific version of transformers RUN . .venv/bin/activate && make build-awq -# Build eetq kernels -FROM kernel-builder AS eetq-kernels-builder -WORKDIR /usr/src -COPY server/Makefile-eetq Makefile -# Build specific version of transformers -RUN . .venv/bin/activate && make build-eetq - # Build Lorax Punica kernels FROM kernel-builder AS lorax-punica-builder WORKDIR /usr/src @@ -216,8 +209,6 @@ COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311 COPY --from=exllamav2-kernels-builder /usr/src/exllamav2/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages # Copy build artifacts from awq kernels builder COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages -# Copy build artifacts from eetq kernels builder -COPY --from=eetq-kernels-builder /usr/src/eetq/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages # Copy build artifacts from lorax punica kernels builder COPY --from=lorax-punica-builder /usr/src/lorax-punica/server/punica_kernels/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages # Copy build artifacts from mamba builder diff --git a/flake.lock b/flake.lock index 3a9d9c7cf..5d6ee4635 100644 --- a/flake.lock +++ b/flake.lock @@ -978,15 +978,16 @@ "nixpkgs": "nixpkgs_6" }, "locked": { - "lastModified": 1738769628, - "narHash": "sha256-hgHf1mscFbH9XtT3dYtFQcxRfict9N+Vi6QSW1c+FjU=", + "lastModified": 1739803255, + "narHash": "sha256-lreIfcjSt6D0wOuZ6jm3WEBYvYvED63T+pOKmOgBLi8=", "owner": "huggingface", "repo": "text-generation-inference-nix", - "rev": "9a5a58219dead9704d83d9d32f105b6b90bd31f2", + "rev": "30ab7423277fc93c8fc0ca4df737478ebfdb8eec", "type": "github" }, "original": { "owner": "huggingface", + "ref": "eetq-0.0.1", "repo": "text-generation-inference-nix", "type": "github" } diff --git a/flake.nix b/flake.nix index 83cedfa62..5ba501141 100644 --- a/flake.nix +++ b/flake.nix @@ -5,7 +5,7 @@ inputs.nixpkgs.follows = "tgi-nix/nixpkgs"; }; nix-filter.url = "github:numtide/nix-filter"; - tgi-nix.url = 
"github:huggingface/text-generation-inference-nix"; + tgi-nix.url = "github:huggingface/text-generation-inference-nix/eetq-0.0.1"; nixpkgs.follows = "tgi-nix/nixpkgs"; flake-utils.url = "github:numtide/flake-utils"; rust-overlay = { diff --git a/nix/server.nix b/nix/server.nix index b638449b8..98193cacb 100644 --- a/nix/server.nix +++ b/nix/server.nix @@ -6,7 +6,6 @@ awq-inference-engine, causal-conv1d, compressed-tensors, - eetq, einops, exllamav2, flashinfer, @@ -36,6 +35,7 @@ py-cpuinfo, pydantic, quantization, + quantization-eetq, safetensors, tokenizers, torch, @@ -80,7 +80,6 @@ buildPythonPackage { dependencies = [ awq-inference-engine - eetq causal-conv1d compressed-tensors einops @@ -111,6 +110,7 @@ buildPythonPackage { py-cpuinfo pydantic quantization + quantization-eetq safetensors sentencepiece tokenizers diff --git a/server/Makefile b/server/Makefile index 746b7faa2..0db6f89b4 100644 --- a/server/Makefile +++ b/server/Makefile @@ -2,7 +2,6 @@ include Makefile-flash-att include Makefile-flash-att-v2 include Makefile-vllm include Makefile-awq -include Makefile-eetq include Makefile-selective-scan include Makefile-lorax-punica include Makefile-exllamav2 diff --git a/server/hf-kernels.lock b/server/hf-kernels.lock index 43f7f17d6..5254cb0c5 100644 --- a/server/hf-kernels.lock +++ b/server/hf-kernels.lock @@ -6736,5 +6736,203 @@ "blob_id": "d97e03913fa5980e0be73b160088c8e4f5f49a52" } ] + }, + { + "repo_id": "kernels-community/quantization-eetq", + "sha": "a80ce846d6270ddddeee109523ed947f594f246b", + "files": [ + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization_eetq/__init__.py", + "blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization_eetq/_ops.py", + "blob_id": "9c191845fb7acbd7ea6bae36ce8c237b168557e1" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization_eetq/_quantization_eetq_v7rnpcck3kry4.abi3.so", + "blob_id": "9edc9126b9ec8ce4f47a8e6688a5f0329c905329" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization_eetq/custom_ops.py", + "blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization_eetq/__init__.py", + "blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization_eetq/_ops.py", + "blob_id": "ccec58b06a2282da51356fe5d04dd1e2757ce80c" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization_eetq/_quantization_eetq_zcfiojfkx55be.abi3.so", + "blob_id": "ea27fb040515267ec631cec5545b878da680e7cc" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization_eetq/custom_ops.py", + "blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization_eetq/__init__.py", + "blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization_eetq/_ops.py", + "blob_id": "bb409419898138ffa9ade9ba505a167a067ea378" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization_eetq/_quantization_eetq_btymam4x7xvs6.abi3.so", + "blob_id": "0395dd048ccf10ed020a77fa04bcb026ba369d73" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization_eetq/custom_ops.py", + "blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization_eetq/__init__.py", + "blob_id": 
"c65d0601c655d7acf1a12e61b6549618b46a70d7" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization_eetq/_ops.py", + "blob_id": "f250a00832d2044f7bbb87557a1c878d9c8dd24d" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization_eetq/_quantization_eetq_yy3p6bsf622sq.abi3.so", + "blob_id": "c98d156835e442b039d38a82e9f111036750329c" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization_eetq/custom_ops.py", + "blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization_eetq/__init__.py", + "blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization_eetq/_ops.py", + "blob_id": "b5259247e8fb3ed9429cf005a525edc8bcae4903" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization_eetq/_quantization_eetq_imijtykkseqze.abi3.so", + "blob_id": "c46908ce00d02376ae8e18efebb7fee55afbc3ac" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization_eetq/custom_ops.py", + "blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization_eetq/__init__.py", + "blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization_eetq/_ops.py", + "blob_id": "79f8d42700ad34b9b46e6e328f90885d1ee9beab" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization_eetq/_quantization_eetq_4qerj3t7ddiry.abi3.so", + "blob_id": "9ba519d2fd4e347b784c21f4c171cbbab57c7774" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization_eetq/custom_ops.py", + "blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/quantization_eetq/__init__.py", + "blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/quantization_eetq/_ops.py", + "blob_id": "805ec785b7f5196f78dfe77b6cd7c2603c02490e" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/quantization_eetq/_quantization_eetq_j23ltbqvrnixg.abi3.so", + "blob_id": "77d53c16e57c658e8f9caa37b0084c4a3a7ffda1" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/quantization_eetq/custom_ops.py", + "blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/quantization_eetq/__init__.py", + "blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/quantization_eetq/_ops.py", + "blob_id": "7b590a5a6ede67e0ae13f97dbd7a82a4674e1b23" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/quantization_eetq/_quantization_eetq_p5neqtnhdgxv2.abi3.so", + "blob_id": "e3e5fbd8ce3232b6e9a7c3077eab9665b95bef49" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/quantization_eetq/custom_ops.py", + "blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/quantization_eetq/__init__.py", + "blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/quantization_eetq/_ops.py", + "blob_id": "0be7ffcb2e9590899683a197b977ec0b39ca7cb7" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/quantization_eetq/_quantization_eetq_idk3dezy35dfk.abi3.so", + "blob_id": "61aa67cbe7ce810bf9792e6e8f19219c757ff181" + }, + { + "filename": 
"build/torch26-cxx11-cu126-x86_64-linux/quantization_eetq/custom_ops.py", + "blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/quantization_eetq/__init__.py", + "blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/quantization_eetq/_ops.py", + "blob_id": "998eba3eddd0520769a2b4ecb3402c024bde44ea" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/quantization_eetq/_quantization_eetq_fpjoxzd7nm2qa.abi3.so", + "blob_id": "31d835db1d0348e3f35c23e6a8f2532fd7e9fea7" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/quantization_eetq/custom_ops.py", + "blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/quantization_eetq/__init__.py", + "blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/quantization_eetq/_ops.py", + "blob_id": "6d5320b05b03f2f3ddfd299d6e2a72aa6116264f" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/quantization_eetq/_quantization_eetq_k7mlunxe2ye4s.abi3.so", + "blob_id": "1946e4c2fab63243d051012cb12e19895828145f" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/quantization_eetq/custom_ops.py", + "blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/quantization_eetq/__init__.py", + "blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/quantization_eetq/_ops.py", + "blob_id": "9b15d85f44e4223ce1f16df987feafd6640dcc62" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/quantization_eetq/_quantization_eetq_7m7hz3sbwkaio.abi3.so", + "blob_id": "eb1536ccd1dfa2655ea7de4445aa3c6790f3a0ae" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/quantization_eetq/custom_ops.py", + "blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88" + } + ] } ] diff --git a/server/pyproject.toml b/server/pyproject.toml index 3f02b4ec5..37cb6b1a9 100644 --- a/server/pyproject.toml +++ b/server/pyproject.toml @@ -42,6 +42,7 @@ build-backend = "setuptools.build_meta" "kernels-community/paged-attention" = ">=0.0.2" "kernels-community/moe" = ">=0.1.1" "kernels-community/quantization" = ">=0.0.3" +"kernels-community/quantization-eetq" = ">=0.0.1" [project.scripts] text-generation-server = "text_generation_server.cli:app" diff --git a/server/text_generation_server/layers/eetq.py b/server/text_generation_server/layers/eetq.py index b1e5235a0..da1e8f0e4 100644 --- a/server/text_generation_server/layers/eetq.py +++ b/server/text_generation_server/layers/eetq.py @@ -1,9 +1,13 @@ from dataclasses import dataclass import torch -from EETQ import quant_weights, w8_a16_gemm +from text_generation_server.utils.kernels import load_kernel from text_generation_server.utils.weights import UnquantizedWeight +quantization_eetq = load_kernel( + module="quantization_eetq", repo_id="kernels-community/quantization-eetq" +) + @dataclass class EETQWeight(UnquantizedWeight): @@ -31,13 +35,13 @@ class EETQLinear(torch.nn.Module): if weight.dtype != torch.float16: weight = weight.to(dtype=torch.float16) weight = torch.t(weight).contiguous().cpu() - weight, scale = quant_weights(weight, torch.int8, False) + weight, scale = quantization_eetq.quant_weights(weight, torch.int8, False) self.weight = weight.cuda(device) self.scale = scale.cuda(device) self.bias = bias.cuda(device) if bias 
is not None else None def forward(self, input: torch.Tensor) -> torch.Tensor: - output = w8_a16_gemm(input, self.weight, self.scale) + output = quantization_eetq.w8_a16_gemm(input, self.weight, self.scale) output = output + self.bias if self.bias is not None else output return output From 794ec58b75868924dea1c341c115a36b44cabf6e Mon Sep 17 00:00:00 2001 From: celsowm Date: Tue, 18 Feb 2025 06:08:28 -0300 Subject: [PATCH 089/183] Update README.md (#3024) only way to avoid: error: experimental Nix feature 'nix-command' is disabled; add '--extra-experimental-features nix-command' to enable it --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index e4c4486f9..b344892ef 100644 --- a/README.md +++ b/README.md @@ -263,7 +263,8 @@ locally, which can take hours. After that you can run TGI with `nix run`: ```shell -nix run . -- --model-id meta-llama/Llama-3.1-8B-Instruct +cd text-generation-inference +nix run --extra-experimental-features nix-command --extra-experimental-features flakes . -- --model-idmeta-llama/Llama-3.1-8B-Instruct ``` **Note:** when you are using Nix on a non-NixOS system, you have to [make some symlinks](https://danieldk.eu/Nix-CUDA-on-non-NixOS-systems#make-runopengl-driverlib-and-symlink-the-driver-library) From 8a1cfd6122449d7681e1ab27fafe1a7b999a7094 Mon Sep 17 00:00:00 2001 From: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com> Date: Tue, 18 Feb 2025 10:33:22 +0100 Subject: [PATCH 090/183] Add `loop_controls` feature to `minijinja` to handle `{% break %}` (#2998) * Add `loop_controls` feature to `minijinja` * Add `test_chat_template_loop_controls` to test `break` --- router/Cargo.toml | 2 +- router/src/infer/chat_template.rs | 66 +++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+), 1 deletion(-) diff --git a/router/Cargo.toml b/router/Cargo.toml index e4d0179a4..9326258da 100644 --- a/router/Cargo.toml +++ b/router/Cargo.toml @@ -48,7 +48,7 @@ ngrok = { version = "0.13.1", features = ["axum"], optional = true } init-tracing-opentelemetry = { version = "0.14.1", features = [ "opentelemetry-otlp", ] } -minijinja = { workspace = true } +minijinja = { workspace = true, features = ["loop_controls"] } minijinja-contrib = { workspace = true } futures-util = "0.3.30" regex = "1.10.3" diff --git a/router/src/infer/chat_template.rs b/router/src/infer/chat_template.rs index 8303ee768..2fef2dcb8 100644 --- a/router/src/infer/chat_template.rs +++ b/router/src/infer/chat_template.rs @@ -186,6 +186,72 @@ mod tests { ); } + #[test] + fn test_chat_template_loop_controls() { + // some chat templates as e.g. 
CohereForAI/c4ai-command-r7b-12-202 contain `break` + // statements in their chat templates, so the feature `loop_controls` has been included + // in `minijinja` + let env = Environment::new(); + + let source = r#" + {% set user_count = 0 %} + {% for message in messages %} + {% if message['role'] == 'user' %} + {{'### User:\n' + message['content']+'\n\n'}} + {% set user_count = user_count + 1 %} + {% if user_count >= 2 %} + {% break %} + {% endif %} + {% elif message['role'] == 'assistant' %} + {{'### Assistant:\n' + message['content']}} + {% endif %} + {% endfor %} + {% if add_generation_prompt %} + {{ '### Assistant:\n' }} + {% endif %}"#; + + // trim all the whitespace + let source = source + .lines() + .map(|line| line.trim()) + .collect::>() + .join(""); + + let tmpl = env.template_from_str(&source); + + let chat_template_inputs = ChatTemplateInputs { + messages: vec![ + TextMessage { + role: "user".to_string(), + content: "Hi!".to_string(), + }, + TextMessage { + role: "assistant".to_string(), + content: "Hello how can I help?".to_string(), + }, + TextMessage { + role: "user".to_string(), + content: "What is Deep Learning?".to_string(), + }, + TextMessage { + role: "assistant".to_string(), + content: "magic!".to_string(), + }, + ], + bos_token: Some("[BOS]"), + eos_token: Some("[EOS]"), + add_generation_prompt: true, + ..Default::default() + }; + + let result = tmpl.unwrap().render(chat_template_inputs).unwrap(); + + assert_eq!( + result, + "### User:\nHi!\n\n### Assistant:\nHello how can I help?### User:\nWhat is Deep Learning?\n\n### Assistant:\n" + ); + } + #[test] fn test_chat_template_invalid_with_raise() { let mut env = Environment::new(); From b8a4928d0e44d3bda29ea8f2cf6b96d29aba548f Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Tue, 18 Feb 2025 12:03:41 +0100 Subject: [PATCH 091/183] Pinning trufflehog. (#3032) --- .github/workflows/trufflehog.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/trufflehog.yaml b/.github/workflows/trufflehog.yaml index 7f6646e11..024702b60 100644 --- a/.github/workflows/trufflehog.yaml +++ b/.github/workflows/trufflehog.yaml @@ -15,6 +15,6 @@ jobs: with: fetch-depth: 0 - name: Secret Scanning - uses: trufflesecurity/trufflehog@main + uses: trufflesecurity/trufflehog@853e1e8d249fd1e29d0fcc7280d29b03df3d643d with: extra_args: --results=verified,unknown From 5543fdc76578ec521027c67bd664b4e9697991c8 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Tue, 18 Feb 2025 12:19:51 +0100 Subject: [PATCH 092/183] =?UTF-8?q?It's=20find=20in=20some=20machine.=20us?= =?UTF-8?q?ing=20hf=5Fhub::api::sync::Api=20to=20download=20c=E2=80=A6=20(?= =?UTF-8?q?#3030)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It's find in some machine. using hf_hub::api::sync::Api to download config is not successful which will make warmup fail since attribute like max_position_embeddings could not be got. 
update hf-hub to the latest version could fix it Signed-off-by: Wang, Yi A Co-authored-by: Wang, Yi A --- Cargo.lock | 493 ++++++++++++++++++++++++++++---------------- Dockerfile_intel | 2 +- launcher/Cargo.toml | 2 +- 3 files changed, 322 insertions(+), 175 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 547cff9b7..4603f77d2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -28,7 +28,7 @@ dependencies = [ "once_cell", "serde", "version_check", - "zerocopy", + "zerocopy 0.7.35", ] [[package]] @@ -143,7 +143,7 @@ checksum = "0ae92a5119aa49cdbcf6b9f893fe4e1d98b04ccbf82ee0584ad948a44a734dea" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -182,18 +182,18 @@ checksum = "c7c24de15d275a1ecfd47a380fb4d5ec9bfe0933f309ed5e705b775596a3574d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] name = "async-trait" -version = "0.1.85" +version = "0.1.86" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f934833b4b7233644e5848f235df3f57ed8c80f1528a26c3dfa13d2147fa056" +checksum = "644dd749086bf3771a2fbc5f256fdb982d53f011c7d5d560304eafeecebce79d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -278,9 +278,9 @@ dependencies = [ [[package]] name = "aws-lc-sys" -version = "0.25.0" +version = "0.25.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "71b2ddd3ada61a305e1d8bb6c005d1eaa7d14d903681edfc400406d523a9b491" +checksum = "54ac4f13dad353b209b34cbec082338202cbc01c8f00336b55c750c13ac91f8f" dependencies = [ "bindgen 0.69.5", "cc", @@ -464,7 +464,7 @@ dependencies = [ "regex", "rustc-hash 1.1.0", "shlex", - "syn 2.0.96", + "syn 2.0.98", "which", ] @@ -483,9 +483,9 @@ dependencies = [ "proc-macro2", "quote", "regex", - "rustc-hash 2.1.0", + "rustc-hash 2.1.1", "shlex", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -544,9 +544,9 @@ checksum = "3eeab4423108c5d7c744f4d234de88d18d636100093ae04caf4825134b9c3a32" [[package]] name = "built" -version = "0.7.5" +version = "0.7.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c360505aed52b7ec96a3636c3f039d99103c37d1d9b4f7a8c743d3ea9ffcd03b" +checksum = "56ed6191a7e78c36abdb16ab65341eefd73d64d303fffccdbb00d51e4205967b" [[package]] name = "bumpalo" @@ -580,9 +580,9 @@ checksum = "8f1fe948ff07f4bd06c30984e69f5b4899c516a3ef74f34df92a2df2ab535495" [[package]] name = "bytes" -version = "1.9.0" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "325918d6fe32f23b19878fe4b34794ae41fc19ddbe53b10571a4874d44ffd39b" +checksum = "f61dac84819c6588b558454b194026eb1f09c293b9036ae9b159e74e73ab6cf9" [[package]] name = "camino" @@ -639,9 +639,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.2.10" +version = "1.2.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "13208fcbb66eaeffe09b99fffbe1af420f00a7b35aa99ad683dfc1aa76145229" +checksum = "0c3d1b2e905a3a7b00a6141adb0e4c0bb941d11caf55349d863942a1cc44e3c9" dependencies = [ "jobserver", "libc", @@ -723,9 +723,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.27" +version = "4.5.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "769b0145982b4b48713e01ec42d61614425f27b7058bda7180a3a41f30104796" +checksum = "92b7b18d71fad5313a1e320fa9897994228ce274b60faa4d694fe0ea89cd9e6d" dependencies = [ "clap_builder", "clap_derive", @@ -733,9 +733,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = 
"4.5.27" +version = "4.5.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b26884eb4b57140e4d2d93652abfa49498b938b3c9179f9fc487b0acc3edad7" +checksum = "a35db2071778a7344791a4fb4f95308b5673d219dee3ae348b86642574ecc90c" dependencies = [ "anstream", "anstyle", @@ -745,14 +745,14 @@ dependencies = [ [[package]] name = "clap_derive" -version = "4.5.24" +version = "4.5.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "54b755194d6389280185988721fffba69495eed5ee9feeee9a599b53db80318c" +checksum = "bf4ced95c6f4a675af3da73304b9ac4ed991640c36374e4b46795c49e17cf1ed" dependencies = [ "heck 0.5.0", "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -763,9 +763,9 @@ checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6" [[package]] name = "cmake" -version = "0.1.53" +version = "0.1.54" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e24a03c8b52922d68a1589ad61032f2c1aa5a8158d2aa0d93c6e9534944bbad6" +checksum = "e7caa3f9de89ddbe2c607f4101924c5abec803763ae9534e4f4d7d8f84aa81f0" dependencies = [ "cc", ] @@ -988,9 +988,9 @@ dependencies = [ [[package]] name = "csv-core" -version = "0.1.11" +version = "0.1.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5efa2b3d7902f4b634a20cae3c9c4e6209dc4779feb6863329607560143efa70" +checksum = "7d02f3b0da4c6504f86e9cd789d8dbafab48c2321be74e9987593de5a894d93d" dependencies = [ "memchr", ] @@ -1007,9 +1007,9 @@ dependencies = [ [[package]] name = "cxx" -version = "1.0.137" +version = "1.0.140" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0fc894913dccfed0f84106062c284fa021c3ba70cb1d78797d6f5165d4492e45" +checksum = "dc49567e08c72902f4cbc7242ee8d874ec9cbe97fbabf77b4e0e1f447513e13a" dependencies = [ "cc", "cxxbridge-cmd", @@ -1021,47 +1021,47 @@ dependencies = [ [[package]] name = "cxx-build" -version = "1.0.137" +version = "1.0.140" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "503b2bfb6b3e8ce7f95d865a67419451832083d3186958290cee6c53e39dfcfe" +checksum = "fe46b5309c99e9775e7a338c98e4097455f52db5b684fd793ca22848fde6e371" dependencies = [ "cc", "codespan-reporting", "proc-macro2", "quote", "scratch", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] name = "cxxbridge-cmd" -version = "1.0.137" +version = "1.0.140" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e0d2cb64a95b4b5a381971482235c4db2e0208302a962acdbe314db03cbbe2fb" +checksum = "4315c4ce8d23c26d87f2f83698725fd5718d8e6ace4a9093da2664d23294d372" dependencies = [ - "clap 4.5.27", + "clap 4.5.30", "codespan-reporting", "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] name = "cxxbridge-flags" -version = "1.0.137" +version = "1.0.140" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f797b0206463c9c2a68ed605ab28892cca784f1ef066050f4942e3de26ad885" +checksum = "f55d69deb3a92f610a60ecc524a72c7374b6dc822f8fb7bb4e5d9473f10530c4" [[package]] name = "cxxbridge-macro" -version = "1.0.137" +version = "1.0.140" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e79010a2093848e65a3e0f7062d3f02fb2ef27f866416dfe436fccfa73d3bb59" +checksum = "5bee7a1d9b5091462002c2b8de2a4ed0f0fde011d503cc272633f66075bd5141" dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -1085,7 +1085,7 @@ dependencies = [ "proc-macro2", "quote", "strsim", - "syn 2.0.96", + "syn 
2.0.98", ] [[package]] @@ -1096,7 +1096,7 @@ checksum = "d336a2a514f6ccccaa3e09b02d41d35330c07ddf03a62165fcec10bb561c7806" dependencies = [ "darling_core", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -1126,7 +1126,7 @@ dependencies = [ "darling", "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -1136,7 +1136,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ab63b0e2bf4d5928aff72e83a7dace85d7bba5fe12dcc3c5a572d78caffd3f3c" dependencies = [ "derive_builder_core", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -1178,7 +1178,7 @@ checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -1228,9 +1228,9 @@ dependencies = [ [[package]] name = "equivalent" -version = "1.0.1" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" +checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" [[package]] name = "errno" @@ -1439,7 +1439,7 @@ checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -1665,7 +1665,7 @@ dependencies = [ "log", "native-tls", "num_cpus", - "rand", + "rand 0.8.5", "reqwest 0.11.27", "serde", "serde_json", @@ -1674,6 +1674,28 @@ dependencies = [ "ureq", ] +[[package]] +name = "hf-hub" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "112fa2f6ad4ab815b9e1b938b4b1e437032d055e2f92ed10fd6ab2e62d02c6b6" +dependencies = [ + "dirs", + "futures", + "http 1.2.0", + "indicatif", + "log", + "native-tls", + "num_cpus", + "rand 0.8.5", + "reqwest 0.12.12", + "serde", + "serde_json", + "thiserror 2.0.11", + "tokio", + "ureq", +] + [[package]] name = "home" version = "0.5.11" @@ -1818,7 +1840,7 @@ dependencies = [ "hyper 1.6.0", "hyper-util", "log", - "rustls 0.23.21", + "rustls 0.23.23", "rustls-native-certs", "rustls-pki-types", "tokio", @@ -1851,6 +1873,22 @@ dependencies = [ "tokio-native-tls", ] +[[package]] +name = "hyper-tls" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70206fc6890eaca9fde8a0bf71caa2ddfc9fe045ac9e5c70df101a7dbde866e0" +dependencies = [ + "bytes", + "http-body-util", + "hyper 1.6.0", + "hyper-util", + "native-tls", + "tokio", + "tokio-native-tls", + "tower-service", +] + [[package]] name = "hyper-util" version = "0.1.10" @@ -2008,7 +2046,7 @@ checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -2140,7 +2178,7 @@ dependencies = [ "indoc", "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -2151,7 +2189,7 @@ checksum = "c34819042dc3d3971c46c2190835914dfbe0c3c13f61449b2997f4e9722dfa60" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -2525,9 +2563,9 @@ checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" [[package]] name = "miniz_oxide" -version = "0.8.3" +version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8402cab7aefae129c6977bb0ff1b8fd9a04eb5b51efc50a70bea51cda0c7924" +checksum = "b3b1c9bd4fe1f0f8b387f6eb9eb3b4a1aa26185e5750efb9140301703f62cd1b" dependencies = [ "adler2", "simd-adler32", @@ -2563,7 +2601,7 @@ 
checksum = "a7ce64b975ed4f123575d11afd9491f2e37bbd5813fbfbc0f09ae1fbddea74e0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -2584,7 +2622,7 @@ dependencies = [ "bytes", "futures", "pin-project", - "rand", + "rand 0.8.5", "thiserror 1.0.69", "tokio", "tokio-util", @@ -2634,7 +2672,7 @@ dependencies = [ "once_cell", "parking_lot", "regex", - "rustls-pemfile", + "rustls-pemfile 1.0.4", "serde", "serde_json", "thiserror 1.0.69", @@ -2763,7 +2801,7 @@ checksum = "ed3955f1a9c7c0c15e092f9c887db08b1fc683305fdf6eb6684f22555355e202" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -2843,9 +2881,9 @@ dependencies = [ [[package]] name = "once_cell" -version = "1.20.2" +version = "1.20.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775" +checksum = "945462a4b81e43c4e3ba96bd7b49d834c6f61198356aa858733bc4acf3cbe62e" [[package]] name = "onig" @@ -2877,9 +2915,9 @@ checksum = "b410bbe7e14ab526a0e86877eb47c6996a2bd7746f027ba551028c925390e4e9" [[package]] name = "openssl" -version = "0.10.69" +version = "0.10.71" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f5e534d133a060a3c19daec1eb3e98ec6f4685978834f2dbadfe2ec215bab64e" +checksum = "5e14130c6a98cd258fdcb0fb6d744152343ff729cbfcb28c656a9d12b999fbcd" dependencies = [ "bitflags 2.8.0", "cfg-if", @@ -2898,7 +2936,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -2909,9 +2947,9 @@ checksum = "d05e27ee213611ffe7d6348b942e8f942b37114c00cc03cec254295a4a17852e" [[package]] name = "openssl-sys" -version = "0.9.104" +version = "0.9.106" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "45abf306cbf99debc8195b66b7346498d7b10c210de50418b5ccd7ceba08c741" +checksum = "8bb61ea9811cc39e3c2069f40b8b8e2e70d8569b361f879786cc7ed48b777cdd" dependencies = [ "cc", "libc", @@ -3016,7 +3054,7 @@ dependencies = [ "opentelemetry_api", "ordered-float 3.9.2", "percent-encoding", - "rand", + "rand 0.8.5", "regex", "serde_json", "thiserror 1.0.69", @@ -3040,7 +3078,7 @@ dependencies = [ "opentelemetry 0.21.0", "ordered-float 4.6.0", "percent-encoding", - "rand", + "rand 0.8.5", "thiserror 1.0.69", ] @@ -3149,22 +3187,22 @@ dependencies = [ [[package]] name = "pin-project" -version = "1.1.8" +version = "1.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e2ec53ad785f4d35dac0adea7f7dc6f1bb277ad84a680c7afefeae05d1f5916" +checksum = "dfe2e71e1471fe07709406bf725f710b02927c9c54b2b5b2ec0e8087d97c327d" dependencies = [ "pin-project-internal", ] [[package]] name = "pin-project-internal" -version = "1.1.8" +version = "1.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d56a66c0c55993aa927429d0f8a0abfd74f084e4d9c192cffed01e418d83eefb" +checksum = "f6e859e6e5bd50440ab63c47e3ebabc90f26251f7c73c3d3e837b74a1cc3fa67" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -3244,7 +3282,7 @@ version = "0.2.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "77957b295656769bb8ad2b6a6b09d897d94f05c41b069aede1fcdaa675eaea04" dependencies = [ - "zerocopy", + "zerocopy 0.7.35", ] [[package]] @@ -3254,7 +3292,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"6924ced06e1f7dfe3fa48d57b9f74f55d8915f5036121bef647ef4b204895fac" dependencies = [ "proc-macro2", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -3306,7 +3344,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a65f2e60fbf1063868558d69c6beacf412dc755f9fc020f514b7955fc914fe30" dependencies = [ "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -3346,7 +3384,7 @@ dependencies = [ "prost 0.12.6", "prost-types", "regex", - "syn 2.0.96", + "syn 2.0.98", "tempfile", ] @@ -3373,7 +3411,7 @@ dependencies = [ "itertools 0.12.1", "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -3432,7 +3470,7 @@ dependencies = [ "proc-macro2", "pyo3-macros-backend", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -3445,7 +3483,7 @@ dependencies = [ "proc-macro2", "pyo3-build-config", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -3494,8 +3532,19 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" dependencies = [ "libc", - "rand_chacha", - "rand_core", + "rand_chacha 0.3.1", + "rand_core 0.6.4", +] + +[[package]] +name = "rand" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3779b94aeb87e8bd4e834cee3650289ee9e0d5677f976ecdb6d219e5f4f6cd94" +dependencies = [ + "rand_chacha 0.9.0", + "rand_core 0.9.1", + "zerocopy 0.8.18", ] [[package]] @@ -3505,7 +3554,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" dependencies = [ "ppv-lite86", - "rand_core", + "rand_core 0.6.4", +] + +[[package]] +name = "rand_chacha" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" +dependencies = [ + "ppv-lite86", + "rand_core 0.9.1", ] [[package]] @@ -3517,6 +3576,16 @@ dependencies = [ "getrandom 0.2.15", ] +[[package]] +name = "rand_core" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a88e0da7a2c97baa202165137c158d0a2e824ac465d13d81046727b34cb247d3" +dependencies = [ + "getrandom 0.3.1", + "zerocopy 0.8.18", +] + [[package]] name = "ratatui" version = "0.28.1" @@ -3564,8 +3633,8 @@ dependencies = [ "once_cell", "paste", "profiling", - "rand", - "rand_chacha", + "rand 0.8.5", + "rand_chacha 0.3.1", "simd_helpers", "system-deps", "thiserror 1.0.69", @@ -3590,9 +3659,9 @@ dependencies = [ [[package]] name = "raw-cpuid" -version = "11.3.0" +version = "11.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c6928fa44c097620b706542d428957635951bade7143269085389d42c8a4927e" +checksum = "529468c1335c1c03919960dfefdb1b3648858c20d7ec2d0663e728e4a717efbc" dependencies = [ "bitflags 2.8.0", ] @@ -3665,7 +3734,7 @@ checksum = "bcc303e793d3734489387d205e9b186fac9c6cfacedd98cbb2e8a5943595f3e6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -3740,7 +3809,7 @@ dependencies = [ "http 0.2.12", "http-body 0.4.6", "hyper 0.14.32", - "hyper-tls", + "hyper-tls 0.5.0", "ipnet", "js-sys", "log", @@ -3749,12 +3818,12 @@ dependencies = [ "once_cell", "percent-encoding", "pin-project-lite", - "rustls-pemfile", + "rustls-pemfile 1.0.4", "serde", "serde_json", "serde_urlencoded", "sync_wrapper 0.1.2", - "system-configuration", + "system-configuration 0.5.1", "tokio", "tokio-native-tls", "tower-service", @@ 
-3773,31 +3842,41 @@ checksum = "43e734407157c3c2034e0258f5e4473ddb361b1e85f95a66690d67264d7cd1da" dependencies = [ "base64 0.22.1", "bytes", + "encoding_rs", "futures-channel", "futures-core", "futures-util", + "h2 0.4.7", "http 1.2.0", "http-body 1.0.1", "http-body-util", "hyper 1.6.0", + "hyper-rustls", + "hyper-tls 0.6.0", "hyper-util", "ipnet", "js-sys", "log", "mime", + "native-tls", "once_cell", "percent-encoding", "pin-project-lite", + "rustls-pemfile 2.2.0", "serde", "serde_json", "serde_urlencoded", "sync_wrapper 1.0.2", + "system-configuration 0.6.1", "tokio", + "tokio-native-tls", + "tokio-util", "tower 0.5.2", "tower-service", "url", "wasm-bindgen", "wasm-bindgen-futures", + "wasm-streams", "web-sys", "windows-registry", ] @@ -3817,7 +3896,7 @@ dependencies = [ "cc", "libc", "once_cell", - "spin 0.5.2", + "spin", "untrusted 0.7.1", "web-sys", "winapi", @@ -3825,15 +3904,14 @@ dependencies = [ [[package]] name = "ring" -version = "0.17.8" +version = "0.17.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c17fa4cb658e3583423e915b9f3acc01cceaee1860e33d59ebae66adc3a2dc0d" +checksum = "e75ec5e92c4d8aede845126adc388046234541629e76029599ed35a003c7ed24" dependencies = [ "cc", "cfg-if", "getrandom 0.2.15", "libc", - "spin 0.9.8", "untrusted 0.9.0", "windows-sys 0.52.0", ] @@ -3858,7 +3936,7 @@ dependencies = [ "proc-macro2", "quote", "rust-embed-utils", - "syn 2.0.96", + "syn 2.0.98", "walkdir", ] @@ -3886,9 +3964,9 @@ checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" [[package]] name = "rustc-hash" -version = "2.1.0" +version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7fb8039b3032c191086b10f11f319a6e99e1e82889c5cc6046f515c9db1d497" +checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d" [[package]] name = "rustc_version" @@ -3931,7 +4009,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bf4ef73721ac7bcd79b2b315da7779d8fc09718c6b3d2d1b2d94850eb8c18432" dependencies = [ "log", - "ring 0.17.8", + "ring 0.17.9", "rustls-pki-types", "rustls-webpki", "subtle", @@ -3940,9 +4018,9 @@ dependencies = [ [[package]] name = "rustls" -version = "0.23.21" +version = "0.23.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f287924602bf649d949c63dc8ac8b235fa5387d394020705b80c4eb597ce5b8" +checksum = "47796c98c480fce5406ef69d1c76378375492c3b0a0de587be0c1d9feb12f395" dependencies = [ "aws-lc-rs", "log", @@ -3974,6 +4052,15 @@ dependencies = [ "base64 0.21.7", ] +[[package]] +name = "rustls-pemfile" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dce314e5fee3f39953d46bb63bb8a46d40c2f8fb7cc5a3b6cab2bde9721d6e50" +dependencies = [ + "rustls-pki-types", +] + [[package]] name = "rustls-pki-types" version = "1.11.0" @@ -3987,7 +4074,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "64ca1bc8749bd4cf37b5ce386cc146580777b4e8572c7b97baf22c83f444bee9" dependencies = [ "aws-lc-rs", - "ring 0.17.8", + "ring 0.17.9", "rustls-pki-types", "untrusted 0.9.0", ] @@ -4040,7 +4127,7 @@ version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "da046153aa2352493d6cb7da4b6e5c0c057d8a1d0a9aa8560baffdd945acd414" dependencies = [ - "ring 0.17.8", + "ring 0.17.9", "untrusted 0.9.0", ] @@ -4126,7 +4213,7 @@ checksum = "5a9bf7cf98d04a2b28aead066b7496853d4779c9cc183c440dbac457641e19a0" dependencies = [ "proc-macro2", "quote", - 
"syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -4270,9 +4357,9 @@ dependencies = [ [[package]] name = "smallvec" -version = "1.13.2" +version = "1.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" +checksum = "7fcf8323ef1faaee30a44a340193b1ac6814fd9b7b4e88e9d4519a3e4abe1cfd" [[package]] name = "socket2" @@ -4284,18 +4371,23 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "socks" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0c3dbbd9ae980613c6dd8e28a9407b50509d3803b57624d5dfe8315218cd58b" +dependencies = [ + "byteorder", + "libc", + "winapi", +] + [[package]] name = "spin" version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d" -[[package]] -name = "spin" -version = "0.9.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" - [[package]] name = "spm_precompiled" version = "0.1.4" @@ -4345,7 +4437,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -4367,9 +4459,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.96" +version = "2.0.98" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d5d0adab1ae378d7f53bdebc67a39f1f151407ef230f0ce2883572f5d8985c80" +checksum = "36147f1a48ae0ec2b5b3bc5b537d267457555a10dc06f3dbc8cb11ba3006d3b1" dependencies = [ "proc-macro2", "quote", @@ -4399,7 +4491,7 @@ checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -4425,7 +4517,18 @@ checksum = "ba3a3adc5c275d719af8cb4272ea1c4a6d668a777f37e115f6d11ddbc1c8e0e7" dependencies = [ "bitflags 1.3.2", "core-foundation 0.9.4", - "system-configuration-sys", + "system-configuration-sys 0.5.0", +] + +[[package]] +name = "system-configuration" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c879d448e9d986b661742763247d3693ed13609438cf3d006f51f5368a5ba6b" +dependencies = [ + "bitflags 2.8.0", + "core-foundation 0.9.4", + "system-configuration-sys 0.6.0", ] [[package]] @@ -4438,6 +4541,16 @@ dependencies = [ "libc", ] +[[package]] +name = "system-configuration-sys" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e1d1b10ced5ca923a1fcb8d03e96b8d3268065d724548c0211415ff6ac6bac4" +dependencies = [ + "core-foundation-sys", + "libc", +] + [[package]] name = "system-deps" version = "6.2.2" @@ -4483,9 +4596,9 @@ checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1" [[package]] name = "tempfile" -version = "3.16.0" +version = "3.17.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38c246215d7d24f48ae091a2902398798e05d978b24315d6efbc00ede9a8bb91" +checksum = "22e5a0acb1f3f55f65cc4a866c361b2fb2a0ff6366785ae6fbb5f85df07ba230" dependencies = [ "cfg-if", "fastrand", @@ -4509,12 +4622,12 @@ name = "text-generation-backends-trtllm" version = "3.1.1-dev0" dependencies = [ "async-trait", - "clap 4.5.27", + "clap 4.5.30", "cmake", "cxx", "cxx-build", "hashbrown 0.15.2", - "hf-hub", + "hf-hub 0.3.2", "pkg-config", "pyo3", "text-generation-router", @@ -4530,9 +4643,9 @@ name = "text-generation-benchmark" version = 
"3.1.1-dev0" dependencies = [ "average", - "clap 4.5.27", + "clap 4.5.30", "float-ord", - "hf-hub", + "hf-hub 0.3.2", "ratatui", "serde", "serde_json", @@ -4567,10 +4680,10 @@ dependencies = [ name = "text-generation-launcher" version = "3.1.1-dev0" dependencies = [ - "clap 4.5.27", + "clap 4.5.30", "ctrlc", "float_eq", - "hf-hub", + "hf-hub 0.4.1", "nix 0.28.0", "once_cell", "pyo3", @@ -4595,11 +4708,11 @@ dependencies = [ "axum-tracing-opentelemetry", "base64 0.22.1", "chrono", - "clap 4.5.27", + "clap 4.5.30", "csv", "futures", "futures-util", - "hf-hub", + "hf-hub 0.3.2", "image", "init-tracing-opentelemetry", "itertools 0.10.5", @@ -4615,7 +4728,7 @@ dependencies = [ "opentelemetry-otlp", "outlines-core", "pyo3", - "rand", + "rand 0.8.5", "regex", "reqwest 0.11.27", "serde", @@ -4642,7 +4755,7 @@ version = "3.1.1-dev0" dependencies = [ "async-trait", "bindgen 0.71.1", - "clap 4.5.27", + "clap 4.5.30", "num_cpus", "pkg-config", "text-generation-router", @@ -4662,11 +4775,11 @@ dependencies = [ "axum 0.7.9", "axum-tracing-opentelemetry", "base64 0.22.1", - "clap 4.5.27", + "clap 4.5.30", "futures", "futures-util", "grpc-metadata", - "hf-hub", + "hf-hub 0.3.2", "image", "init-tracing-opentelemetry", "jsonschema", @@ -4680,7 +4793,7 @@ dependencies = [ "opentelemetry-otlp", "prost 0.12.6", "prost-build", - "rand", + "rand 0.8.5", "regex", "reqwest 0.11.27", "serde", @@ -4711,12 +4824,12 @@ dependencies = [ "axum 0.7.9", "axum-tracing-opentelemetry", "base64 0.22.1", - "clap 4.5.27", + "clap 4.5.30", "criterion", "futures", "futures-util", "grpc-metadata", - "hf-hub", + "hf-hub 0.3.2", "image", "init-tracing-opentelemetry", "itertools 0.13.0", @@ -4731,7 +4844,7 @@ dependencies = [ "opentelemetry-otlp", "prost 0.12.6", "prost-build", - "rand", + "rand 0.8.5", "regex", "reqwest 0.11.27", "serde", @@ -4788,7 +4901,7 @@ checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -4799,7 +4912,7 @@ checksum = "26afc1baea8a989337eeb52b6e72a039780ce45c3edfcc9c5b9d112feeb173c2" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -4886,7 +4999,7 @@ dependencies = [ "derive_builder", "esaxx-rs", "getrandom 0.2.15", - "hf-hub", + "hf-hub 0.3.2", "indicatif", "itertools 0.12.1", "lazy_static", @@ -4895,7 +5008,7 @@ dependencies = [ "monostate", "onig", "paste", - "rand", + "rand 0.8.5", "rayon", "rayon-cond", "regex", @@ -4945,7 +5058,7 @@ checksum = "6e06d43f1345a3bcd39f6a56dbb7dcab2ba47e68e8ac134855e7e2bdbaf8cab8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -4965,7 +5078,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7f57eb36ecbe0fc510036adff84824dd3c24bb781e21bfa67b69d556aa85214f" dependencies = [ "pin-project", - "rand", + "rand 0.8.5", "tokio", ] @@ -4975,7 +5088,7 @@ version = "0.26.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5f6d0975eaace0cf0fcadee4e4aaa5da15b5c079146f2cffb67c113be122bf37" dependencies = [ - "rustls 0.23.21", + "rustls 0.23.23", "tokio", ] @@ -5006,9 +5119,9 @@ dependencies = [ [[package]] name = "toml" -version = "0.8.19" +version = "0.8.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1ed1f98e3fdc28d6d910e6737ae6ab1a93bf1985935a1193e68f93eeb68d24e" +checksum = "cd87a5cdd6ffab733b2f74bc4fd7ee5fff6634124999ac278c35fc78c6120148" dependencies = [ "serde", "serde_spanned", @@ -5027,9 
+5140,9 @@ dependencies = [ [[package]] name = "toml_edit" -version = "0.22.22" +version = "0.22.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ae48d6208a266e853d946088ed816055e556cc6028c5e8e2b84d9fa5dd7c7f5" +checksum = "17b4795ff5edd201c7cd6dca065ae59972ce77d1b80fa0a84d94950ece7d1474" dependencies = [ "indexmap 2.7.1", "serde", @@ -5103,7 +5216,7 @@ dependencies = [ "proc-macro2", "prost-build", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -5117,7 +5230,7 @@ dependencies = [ "indexmap 1.9.3", "pin-project", "pin-project-lite", - "rand", + "rand 0.8.5", "slab", "tokio", "tokio-util", @@ -5190,7 +5303,7 @@ checksum = "395ae124c09f9e6918a2310af6038fba074bcf474ac352496d5910dd59a2226d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -5310,9 +5423,9 @@ checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" [[package]] name = "typenum" -version = "1.17.0" +version = "1.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42ff0bf0c66b8238c6f3b578df37d0b7848e55df8577b3f74f92a69acceeb825" +checksum = "1dccffe3ce07af9386bfd29e80c0ab1a8205a2fc34e4bcd40364df902cfa8f3f" [[package]] name = "unicase" @@ -5404,6 +5517,7 @@ dependencies = [ "rustls-webpki", "serde", "serde_json", + "socks", "url", "webpki-roots", ] @@ -5465,7 +5579,7 @@ dependencies = [ "proc-macro2", "quote", "regex", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -5486,24 +5600,24 @@ dependencies = [ [[package]] name = "uuid" -version = "1.12.1" +version = "1.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b3758f5e68192bb96cc8f9b7e2c2cfdabb435499a28499a42f8f984092adad4b" +checksum = "8c1f41ffb7cf259f1ecc2876861a17e7142e63ead296f671f81f6ae85903e0d6" dependencies = [ - "getrandom 0.2.15", - "rand", + "getrandom 0.3.1", + "rand 0.9.0", "uuid-macro-internal", ] [[package]] name = "uuid-macro-internal" -version = "1.12.1" +version = "1.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8a86d88347b61a0e17b9908a67efcc594130830bf1045653784358dd023e294" +checksum = "f02e7142329d47ebfcb5fc40e357b867f971230181e6f78744e52d9325d8f052" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -5630,7 +5744,7 @@ dependencies = [ "log", "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", "wasm-bindgen-shared", ] @@ -5665,7 +5779,7 @@ checksum = "8ae87ea40c9f689fc23f209965b6fb8a99ad69aeeb0231408be24920604395de" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -5679,6 +5793,19 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "wasm-streams" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "15053d8d85c7eccdbefef60f06769760a563c7f0a9d6902a13d35c7800b0ad65" +dependencies = [ + "futures-util", + "js-sys", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", +] + [[package]] name = "web-sys" version = "0.3.77" @@ -5715,15 +5842,15 @@ version = "0.22.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ed63aea5ce73d0ff405984102c42de94fc55a6b75765d621c65262469b3c9b53" dependencies = [ - "ring 0.17.8", + "ring 0.17.9", "untrusted 0.9.0", ] [[package]] name = "webpki-roots" -version = "0.26.7" +version = "0.26.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"5d642ff16b7e79272ae451b7322067cdc17cadf68c23264be9d94a32319efe7e" +checksum = "2210b291f7ea53617fbafcc4939f10914214ec15aace5ba62293a668f322c5c9" dependencies = [ "rustls-pki-types", ] @@ -6042,9 +6169,9 @@ checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" [[package]] name = "winnow" -version = "0.6.25" +version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad699df48212c6cc6eb4435f35500ac6fd3b9913324f938aea302022ce19d310" +checksum = "59690dea168f2198d1a3b0cac23b8063efcd11012f10ae4698f284808c8ef603" dependencies = [ "memchr", ] @@ -6100,7 +6227,7 @@ checksum = "2380878cad4ac9aac1e2435f3eb4020e8374b5f13c296cb75b4620ff8e229154" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", "synstructure", ] @@ -6111,7 +6238,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0" dependencies = [ "byteorder", - "zerocopy-derive", + "zerocopy-derive 0.7.35", +] + +[[package]] +name = "zerocopy" +version = "0.8.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79386d31a42a4996e3336b0919ddb90f81112af416270cff95b5f5af22b839c2" +dependencies = [ + "zerocopy-derive 0.8.18", ] [[package]] @@ -6122,7 +6258,18 @@ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", +] + +[[package]] +name = "zerocopy-derive" +version = "0.8.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76331675d372f91bf8d17e13afbd5fe639200b73d01f0fc748bb059f9cca2db7" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.98", ] [[package]] @@ -6142,7 +6289,7 @@ checksum = "595eed982f7d355beb85837f651fa22e90b3c044842dc7f2c2842c086f295808" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", "synstructure", ] @@ -6171,7 +6318,7 @@ checksum = "6eafa6dfb17584ea3e2bd6e76e0cc15ad7af12b09abdd1ca55961bed9b1063c6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] diff --git a/Dockerfile_intel b/Dockerfile_intel index be2488661..7f61d8e87 100644 --- a/Dockerfile_intel +++ b/Dockerfile_intel @@ -162,7 +162,7 @@ ARG MAMBA_VERSION=23.1.0-1 ARG PYTHON_VERSION='3.11.10' # Automatically set by buildx ARG TARGETPLATFORM -ENV PATH /opt/conda/bin:$PATH +ENV PATH=/opt/conda/bin:$PATH # TGI seem to require libssl.so.1.1 instead of libssl.so.3 so we can't use ubuntu 22.04. Ubuntu 20.04 has python==3.8, and TGI requires python>=3.9, hence the need for miniconda. 
# Install mamba diff --git a/launcher/Cargo.toml b/launcher/Cargo.toml index fdc3c02c1..fdbb59943 100644 --- a/launcher/Cargo.toml +++ b/launcher/Cargo.toml @@ -9,7 +9,7 @@ homepage.workspace = true [dependencies] clap = { version = "4.4.5", features = ["derive", "env"] } ctrlc = { version = "3.4.1", features = ["termination"] } -hf-hub = "0.3.2" +hf-hub = "0.4.1" nix = { version = "0.28.0", features = ["signal"] } once_cell = "1.19.0" pyo3 = { workspace = true } From a7448661f73b519d328e6f2a5fb671989c4d56c5 Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Tue, 18 Feb 2025 19:04:34 +0100 Subject: [PATCH 093/183] Improve Transformers support (#2970) * Much better support * add gpt neox * bump transformers version * bump version --- backends/llamacpp/requirements.txt | 2 +- server/requirements_cuda.txt | 2 +- server/requirements_gen.txt | 2 +- server/requirements_intel.txt | 2 +- server/requirements_rocm.txt | 2 +- .../text_generation_server/models/__init__.py | 126 +++++++++++------- .../models/transformers_flash_causal_lm.py | 54 ++++---- 7 files changed, 111 insertions(+), 79 deletions(-) diff --git a/backends/llamacpp/requirements.txt b/backends/llamacpp/requirements.txt index 5c5d0cc7f..d19c9e5bd 100644 --- a/backends/llamacpp/requirements.txt +++ b/backends/llamacpp/requirements.txt @@ -1,3 +1,3 @@ -transformers==4.48.2 +transformers==4.49 huggingface-hub==0.28.1 hf-transfer==0.1.9 diff --git a/server/requirements_cuda.txt b/server/requirements_cuda.txt index 051045bc9..0764bf924 100644 --- a/server/requirements_cuda.txt +++ b/server/requirements_cuda.txt @@ -346,7 +346,7 @@ tqdm==4.66.5 # outlines # peft # transformers -transformers==4.48.2 +transformers==4.49 # via # text-generation-server (pyproject.toml) # compressed-tensors diff --git a/server/requirements_gen.txt b/server/requirements_gen.txt index d9836ad71..6d64a34bd 100644 --- a/server/requirements_gen.txt +++ b/server/requirements_gen.txt @@ -158,7 +158,7 @@ tqdm==4.67.1 # via # huggingface-hub # transformers -transformers==4.48.2 +transformers==4.49 # via text-generation-server (pyproject.toml) typer==0.15.1 # via text-generation-server (pyproject.toml) diff --git a/server/requirements_intel.txt b/server/requirements_intel.txt index 778d892ec..c671199f4 100644 --- a/server/requirements_intel.txt +++ b/server/requirements_intel.txt @@ -331,7 +331,7 @@ tqdm==4.66.5 # outlines # peft # transformers -transformers==4.48.2 +transformers==4.49 # via # text-generation-server (pyproject.toml) # compressed-tensors diff --git a/server/requirements_rocm.txt b/server/requirements_rocm.txt index 65eb998b7..fe7ca572a 100644 --- a/server/requirements_rocm.txt +++ b/server/requirements_rocm.txt @@ -331,7 +331,7 @@ tqdm==4.66.5 # outlines # peft # transformers -transformers==4.48.2 +transformers==4.49 # via # text-generation-server (pyproject.toml) # compressed-tensors diff --git a/server/text_generation_server/models/__init__.py b/server/text_generation_server/models/__init__.py index f8150b5e6..6d53a72b5 100644 --- a/server/text_generation_server/models/__init__.py +++ b/server/text_generation_server/models/__init__.py @@ -6,17 +6,18 @@ from compressed_tensors.compressors.model_compressors.model_compressor import ( ) from compressed_tensors.quantization import QuantizationType from pydantic import ValidationError -import torch import enum import os - -from loguru import logger -from transformers.configuration_utils import PretrainedConfig -from transformers.models.auto import modeling_auto -from huggingface_hub import hf_hub_download, 
HfApi from typing import Optional, List, Dict from pathlib import Path +from loguru import logger + +import torch import transformers +from transformers.configuration_utils import PretrainedConfig +from transformers.models.auto import modeling_auto +from transformers.dynamic_module_utils import get_class_from_dynamic_module +from huggingface_hub import hf_hub_download, HfApi from text_generation_server.utils.speculate import get_speculate, set_speculate from text_generation_server.models.model import Model @@ -736,7 +737,7 @@ def get_model( FLASH_ATT_ERROR_MESSAGE.format("Sharded Santacoder") ) else: - return transformers_causal_lm_class.fallback( + return CausalLM.fallback( model_id=model_id, revision=revision, quantize=quantize, @@ -857,6 +858,15 @@ def get_model( lora_adapter_ids=lora_adapter_ids, config_class=GPTNeoXConfig, ) + elif FLASH_TRANSFORMERS_BACKEND: + return TransformersFlashCausalLM.fallback( + model_id, + revision, + quantize=quantize, + speculator=speculator, + dtype=dtype, + trust_remote_code=trust_remote_code, + ) elif sharded: return CausalLM( model_id=model_id, @@ -1054,6 +1064,15 @@ def get_model( trust_remote_code=trust_remote_code, lora_adapter_ids=lora_adapter_ids, ) + elif FLASH_TRANSFORMERS_BACKEND: + return TransformersFlashCausalLM.fallback( + model_id, + revision, + quantize=quantize, + speculator=speculator, + dtype=dtype, + trust_remote_code=trust_remote_code, + ) elif sharded: raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Sharded Gemma2")) else: @@ -1467,42 +1486,37 @@ def get_model( elif quantize == "exl2": raise NotImplementedError("exl2 quantization is not supported for AutoModel") - # Fast transformers if available - transformers_model_class = getattr( - transformers, - modeling_auto.MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.get(model_type, ""), - None, - ) - if ( - FLASH_TRANSFORMERS_BACKEND - and transformers_model_class is not None - and transformers_model_class._supports_flex_attn - ): - return TransformersFlashCausalLM.fallback( - model_id, - revision, - quantize=quantize, - speculator=speculator, - dtype=dtype, - trust_remote_code=trust_remote_code, - ) - - if sharded: - raise NotImplementedError("sharded is not supported for AutoModel") - - if model_type in modeling_auto.MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES: - return Seq2SeqLM.fallback( - model_id, - revision, - quantize=quantize, - speculator=speculator, - dtype=dtype, - trust_remote_code=trust_remote_code, - ) - auto_map = config_dict.get("auto_map", None) - if trust_remote_code and auto_map is not None: - if "AutoModelForCausalLM" in auto_map.keys(): + model_class = None + + # If the model is already in the library + if model_type in modeling_auto.MODEL_FOR_CAUSAL_LM_MAPPING_NAMES: + model_class = getattr( + transformers, modeling_auto.MODEL_FOR_CAUSAL_LM_MAPPING_NAMES[model_type] + ) + elif ( + trust_remote_code + and auto_map is not None + and "AutoModelForCausalLM" in auto_map.keys() + ): + model_class = get_class_from_dynamic_module( + config_dict["auto_map"]["AutoModelForCausalLM"], model_id + ) + + # This means the model is ForCausalLM + if model_class is not None: + if FLASH_TRANSFORMERS_BACKEND and model_class.is_backend_compatible(): + return TransformersFlashCausalLM.fallback( + model_id, + revision, + quantize=quantize, + speculator=speculator, + dtype=dtype, + trust_remote_code=trust_remote_code, + ) + elif sharded: + raise NotImplementedError("sharded is not supported for AutoModel") + else: return CausalLM.fallback( model_id, revision, @@ -1511,15 +1525,25 @@ def 
get_model( dtype=dtype, trust_remote_code=trust_remote_code, ) - if "AutoModelForSeq2SeqLM" in auto_map.keys(): - return Seq2SeqLM.fallback( - model_id, - revision, - quantize=quantize, - speculator=speculator, - dtype=dtype, - trust_remote_code=trust_remote_code, - ) + + # Not supported at this point + if sharded: + raise NotImplementedError("sharded is not supported for AutoModel") + + # This means it is a ForSeq2SeqLM model + if model_type in modeling_auto.MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES or ( + trust_remote_code + and auto_map is not None + and "AutoModelForSeq2SeqLM" in auto_map.keys() + ): + return Seq2SeqLM.fallback( + model_id, + revision, + quantize=quantize, + speculator=speculator, + dtype=dtype, + trust_remote_code=trust_remote_code, + ) raise ValueError(f"Unsupported model type {model_type}") diff --git a/server/text_generation_server/models/transformers_flash_causal_lm.py b/server/text_generation_server/models/transformers_flash_causal_lm.py index 36de89b4b..8773bfd3a 100644 --- a/server/text_generation_server/models/transformers_flash_causal_lm.py +++ b/server/text_generation_server/models/transformers_flash_causal_lm.py @@ -81,6 +81,15 @@ def tgi_flash_attention_forward( transformers.modeling_utils.ALL_ATTENTION_FUNCTIONS["tgi"] = tgi_flash_attention_forward +# The base TP plan of these models has replicated q/k/v. This means that each process will see the full states, +# hence we should not divide the number of heads by the world size. This is a known waste of VRAM (the cache +# will be fully replicated on each process) and GPU communication (additional all-gather operations), however due +# to internal constraints it was not (yet?) possible to circumvent +REPLICATED_ATTENTION_MODELS = [ + "olmo2", + "phi3", +] + class TransformersFlashCausalLM(FlashCausalLM): def __init__( @@ -119,6 +128,7 @@ class TransformersFlashCausalLM(FlashCausalLM): truncation_side="left", trust_remote_code=trust_remote_code, ) + model = AutoModelForCausalLM.from_pretrained( model_id, revision=revision, @@ -130,6 +140,8 @@ class TransformersFlashCausalLM(FlashCausalLM): tp_plan="auto" if world_size > 1 else None, ) + torch.distributed.barrier(group=self.process_group) + if tokenizer.pad_token_id is None: if model.config.pad_token_id is not None: tokenizer.pad_token_id = model.config.pad_token_id @@ -143,15 +155,19 @@ class TransformersFlashCausalLM(FlashCausalLM): tokenizer.add_special_tokens({"pad_token": "[PAD]"}) self.num_layers = model.config.num_hidden_layers - self.num_heads = model.config.num_attention_heads // self.process_group.size() + self.num_heads = model.config.num_attention_heads self.num_kv_heads = model.config.num_key_value_heads - self.num_kv_heads = ( - self.num_kv_heads // self.process_group.size() - if self.num_kv_heads > 1 - else self.num_kv_heads - ) self.head_size = model.config.hidden_size // model.config.num_attention_heads + # Skip it for models in the exception list + if model.config.model_type not in REPLICATED_ATTENTION_MODELS: + self.num_heads = self.num_heads // self.process_group.size() + self.num_kv_heads = ( + self.num_kv_heads // self.process_group.size() + if self.num_kv_heads > 1 + else self.num_kv_heads + ) + self.cuda_graphs = {} self.kv_cache = [] self.kv_cache_dtype = dtype if kv_cache_dtype is None else kv_cache_dtype @@ -186,7 +202,6 @@ class TransformersFlashCausalLM(FlashCausalLM): torch.tensor(1.0, device=device), ) - torch.distributed.barrier(group=self.process_group) # Skip FlashCausalLM init. 
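To make the head bookkeeping above concrete, here is a minimal sketch of the per-rank arithmetic, written as standalone Python with made-up numbers (the config values and the model type are hypothetical, not taken from this patch):

    # Illustrative only: how the per-rank head counts behave under tensor parallelism.
    REPLICATED_ATTENTION_MODELS = ["olmo2", "phi3"]

    world_size = 4                     # hypothetical number of tensor-parallel ranks
    num_heads, num_kv_heads = 32, 8    # hypothetical config values
    model_type = "llama"               # a model whose TP plan shards q/k/v

    if model_type not in REPLICATED_ATTENTION_MODELS:
        num_heads = num_heads // world_size                # 32 -> 8 heads per rank
        if num_kv_heads > 1:
            num_kv_heads = num_kv_heads // world_size      # 8 -> 2 KV heads per rank
    # For olmo2/phi3 the q/k/v projections stay replicated, so every rank keeps the
    # full 32/8 heads and a fully replicated KV cache, as the comment above explains.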
super(FlashCausalLM, self).__init__( model_id=model_id, @@ -204,6 +219,8 @@ class TransformersFlashCausalLM(FlashCausalLM): self.model.original_forward = self.model.forward self.model.forward = self._model_forward + torch.distributed.barrier(group=self.process_group) + @classmethod def fallback( cls, @@ -237,11 +254,16 @@ class TransformersFlashCausalLM(FlashCausalLM): prefill_cache_indices=None, # not used, but passed to match original signature adapter_data=None, # not supported, but passed to match original signature ): - hidden_states = self.model.model.forward( + # A value of `None` (i.e. no logit slicing) translates to `0` in Transformers + logits_to_keep = lm_head_indices if lm_head_indices is not None else 0 + + # This is equivalent to `self.model.forward`, see the monkey patch in __init__ + logits = self.model.original_forward( input_ids=input_ids.unsqueeze(0), # expand dim to fit Transformers position_ids=position_ids.unsqueeze(0), # expand dim to fit Transformers past_key_values=None, # we use self.kv_cache instead of transformers cache object use_cache=False, # we use self.kv_cache instead of transformers cache object + logits_to_keep=logits_to_keep, return_dict=True, cu_seqlen_prefill=cu_seqlen_prefill, kv_cache=kv_cache, @@ -251,20 +273,6 @@ class TransformersFlashCausalLM(FlashCausalLM): max_s=max_s, kv_head_mapping=self.kv_head_mapping, kv_scales=self.kv_scales, - )[0].squeeze(dim=0) - - # And compute logits from the lm_head, slicing correctly the indices - # NOTE: some logits post-processing (e.g. in gemma2) may be absent here with the split of the modules - # To update with full Transformers support asap - if lm_head_indices is not None: - hidden_states = hidden_states[lm_head_indices] - logits = self.model.lm_head(hidden_states) - - # For Granite while next transformers version is released and we can use `lm_head_indices` natively - if hasattr(self.model.config, "logits_scaling"): - logits = logits / self.model.config.logits_scaling - # For Cohere for similar reasons - elif hasattr(self.model, "logit_scale"): - logits = logits * self.model.logit_scale + ).logits.squeeze(dim=0) return logits, None From d6a0c67e2f63c5915af6ec1bb77e0e1d13734d32 Mon Sep 17 00:00:00 2001 From: drbh Date: Wed, 19 Feb 2025 06:38:20 -0500 Subject: [PATCH 094/183] feat: add initial qwen2.5-vl model and test (#2971) * feat: support qwen2.5 vl model * fix: bump support models doc * feat: check before rope type adjustment and small refactors * fix: add transformer overlay for processor support * fix: vendor processor and config from transformers * fix: refactor/simplify conditionals --- docs/source/supported_models.md | 1 + .../test_flash_qwen2_5_vl_bay.json | 26 + .../test_flash_qwen2_5_vl_inpaint.json | 26 + .../test_flash_qwen2_5_vl_simple.json | 26 + ...est_flash_qwen2_5_vl_simple_streaming.json | 20 + .../models/test_flash_qwen2_5_vl.py | 122 +++ launcher/src/main.rs | 2 +- nix/overlay.nix | 20 +- router/src/config.rs | 33 + router/src/validation.rs | 6 +- .../text_generation_server/layers/rotary.py | 1 - .../text_generation_server/models/__init__.py | 25 + .../models/custom_modeling/qwen2_5_vl.py | 974 ++++++++++++++++++ .../models/custom_modeling/qwen2_vl.py | 8 +- .../models/vlm_causal_lm.py | 9 +- 15 files changed, 1282 insertions(+), 17 deletions(-) create mode 100644 integration-tests/models/__snapshots__/test_flash_qwen2_5_vl/test_flash_qwen2_5_vl_bay.json create mode 100644 integration-tests/models/__snapshots__/test_flash_qwen2_5_vl/test_flash_qwen2_5_vl_inpaint.json create mode 100644 
integration-tests/models/__snapshots__/test_flash_qwen2_5_vl/test_flash_qwen2_5_vl_simple.json create mode 100644 integration-tests/models/__snapshots__/test_flash_qwen2_5_vl/test_flash_qwen2_5_vl_simple_streaming.json create mode 100644 integration-tests/models/test_flash_qwen2_5_vl.py create mode 100644 server/text_generation_server/models/custom_modeling/qwen2_5_vl.py diff --git a/docs/source/supported_models.md b/docs/source/supported_models.md index d6a8b7aed..1f804d5a5 100644 --- a/docs/source/supported_models.md +++ b/docs/source/supported_models.md @@ -27,6 +27,7 @@ Text Generation Inference enables serving optimized models. The following sectio - [StarCoder 2](https://huggingface.co/bigcode/starcoder2-15b-instruct-v0.1) - [Qwen 2](https://huggingface.co/collections/Qwen/qwen2-6659360b33528ced941e557f) - [Qwen 2 VL](https://huggingface.co/collections/Qwen/qwen2-vl-66cee7455501d7126940800d) +- [Qwen 2.5 VL](https://huggingface.co/collections/Qwen/qwen25-66e81a666513e518adb90d9e) - [Opt](https://huggingface.co/facebook/opt-6.7b) - [T5](https://huggingface.co/google/flan-t5-xxl) - [Galactica](https://huggingface.co/facebook/galactica-120b) diff --git a/integration-tests/models/__snapshots__/test_flash_qwen2_5_vl/test_flash_qwen2_5_vl_bay.json b/integration-tests/models/__snapshots__/test_flash_qwen2_5_vl/test_flash_qwen2_5_vl_bay.json new file mode 100644 index 000000000..739d53615 --- /dev/null +++ b/integration-tests/models/__snapshots__/test_flash_qwen2_5_vl/test_flash_qwen2_5_vl_bay.json @@ -0,0 +1,26 @@ +{ + "choices": [ + { + "finish_reason": "stop", + "index": 0, + "logprobs": null, + "message": { + "content": "The image showcases the Statue of Liberty, a colossal bronze statue located in New York Harbor, a heritage building in the United States. The statue has a majestic presence, with one arm raised towards the sun and the other hitched on her hip. It sits atop a keeper's walkway, observed from the water. Surrounding the statue is a lush green meadow, where picnic spots, walkways, and a visitor desk can be found. In front of the statue, a large marina can accommodate fourteen different kinds of boats. In the backdrop stands the Empire State Building, marking the crowded skyscrapers of New York City.", + "name": null, + "role": "assistant", + "tool_calls": null + }, + "usage": null + } + ], + "created": 1738342753, + "id": "", + "model": "Qwen/Qwen2.5-VL-3B-Instruct", + "object": "chat.completion", + "system_fingerprint": "3.0.2-dev0-native", + "usage": { + "completion_tokens": 128, + "prompt_tokens": 8736, + "total_tokens": 8864 + } +} diff --git a/integration-tests/models/__snapshots__/test_flash_qwen2_5_vl/test_flash_qwen2_5_vl_inpaint.json b/integration-tests/models/__snapshots__/test_flash_qwen2_5_vl/test_flash_qwen2_5_vl_inpaint.json new file mode 100644 index 000000000..00e1b041f --- /dev/null +++ b/integration-tests/models/__snapshots__/test_flash_qwen2_5_vl/test_flash_qwen2_5_vl_inpaint.json @@ -0,0 +1,26 @@ +{ + "choices": [ + { + "finish_reason": "stop", + "index": 0, + "logprobs": null, + "message": { + "content": "The image shows a whimsical scene set in what appears to be a fast-food restaurant. Dominating the foreground is a large, green, inflatable dinosaur with realistic textures, giving it a Jurassic Park-like appearance. 
The dinosaur is wearing a red Adult Swim logo hat, adding a humorous touch to its appearance.\n\nSurrounding the dinosaur are various food items typically found in a fast-food restaurant, including French fries in a plastic cup, a hamburger on a plate, and a beverage in another cup. The hamburger is detailed with lettuce, tomato, and other typical fast-food ingredients.\n\nAccompanying the dinosaur is a realistic-looking owl perched on the table, which adds to the surreal and playful atmosphere of the scene. The background features the interior of the restaurant with neon signs and other typical decor elements, enhancing the overall theme of a fun and fantastical fast-food experience.\n\nOverall, the image is a playful and imaginative blend of a standard fast-food setting with an unexpected and amusing twist provided by the dinosaur and owl characters.", + "name": null, + "role": "assistant", + "tool_calls": null + }, + "usage": null + } + ], + "created": 1738343775, + "id": "", + "model": "Qwen/Qwen2.5-VL-3B-Instruct", + "object": "chat.completion", + "system_fingerprint": "3.0.2-dev0-native", + "usage": { + "completion_tokens": 206, + "prompt_tokens": 5375, + "total_tokens": 5581 + } +} diff --git a/integration-tests/models/__snapshots__/test_flash_qwen2_5_vl/test_flash_qwen2_5_vl_simple.json b/integration-tests/models/__snapshots__/test_flash_qwen2_5_vl/test_flash_qwen2_5_vl_simple.json new file mode 100644 index 000000000..c498c36a3 --- /dev/null +++ b/integration-tests/models/__snapshots__/test_flash_qwen2_5_vl/test_flash_qwen2_5_vl_simple.json @@ -0,0 +1,26 @@ +{ + "choices": [ + { + "finish_reason": "stop", + "index": 0, + "logprobs": null, + "message": { + "content": "The image depicts an anthropomorphic rabbit character wearing an intricate space suit, which includes a helmet with a starry face pattern and multiple suitors. The rabbit's ears are significantly large and upright, and it has a hitchhiker-like star antennas on its chest. The background is a reddish-orange, rocky landscape, suggesting a Martian environment. The suit has various buttons, a red button on the chest, and a reflective or illuminated dome on the head. 
The overall color scheme is dominated by shades of red, orange, and gray, giving a sense of a rugged, otherworldly setting.", + "name": null, + "role": "assistant", + "tool_calls": null + }, + "usage": null + } + ], + "created": 1738342872, + "id": "", + "model": "Qwen/Qwen2.5-VL-3B-Instruct", + "object": "chat.completion", + "system_fingerprint": "3.0.2-dev0-native", + "usage": { + "completion_tokens": 121, + "prompt_tokens": 1363, + "total_tokens": 1484 + } +} diff --git a/integration-tests/models/__snapshots__/test_flash_qwen2_5_vl/test_flash_qwen2_5_vl_simple_streaming.json b/integration-tests/models/__snapshots__/test_flash_qwen2_5_vl/test_flash_qwen2_5_vl_simple_streaming.json new file mode 100644 index 000000000..0d2718a78 --- /dev/null +++ b/integration-tests/models/__snapshots__/test_flash_qwen2_5_vl/test_flash_qwen2_5_vl_simple_streaming.json @@ -0,0 +1,20 @@ +{ + "choices": [ + { + "delta": { + "content": "", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": "stop", + "index": 0, + "logprobs": null + } + ], + "created": 1738343559, + "id": "", + "model": "Qwen/Qwen2.5-VL-3B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.0.2-dev0-native", + "usage": null +} diff --git a/integration-tests/models/test_flash_qwen2_5_vl.py b/integration-tests/models/test_flash_qwen2_5_vl.py new file mode 100644 index 000000000..922068f4a --- /dev/null +++ b/integration-tests/models/test_flash_qwen2_5_vl.py @@ -0,0 +1,122 @@ +import pytest + + +@pytest.fixture(scope="module") +def flash_qwen2_5_vl_handle(launcher): + with launcher("Qwen/Qwen2.5-VL-3B-Instruct") as handle: + yield handle + + +@pytest.fixture(scope="module") +async def flash_qwen2_5(flash_qwen2_5_vl_handle): + await flash_qwen2_5_vl_handle.health(300) + return flash_qwen2_5_vl_handle.client + + +@pytest.mark.private +async def test_flash_qwen2_5_vl_simple(flash_qwen2_5, response_snapshot): + response = await flash_qwen2_5.chat( + seed=42, + messages=[ + { + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png" + }, + }, + {"type": "text", "text": "Describe the image"}, + ], + }, + ], + ) + + assert ( + response.choices[0].message.content + == "The image depicts an anthropomorphic rabbit character wearing an intricate space suit, which includes a helmet with a starry face pattern and multiple suitors. The rabbit's ears are significantly large and upright, and it has a hitchhiker-like star antennas on its chest. The background is a reddish-orange, rocky landscape, suggesting a Martian environment. The suit has various buttons, a red button on the chest, and a reflective or illuminated dome on the head. The overall color scheme is dominated by shades of red, orange, and gray, giving a sense of a rugged, otherworldly setting." 
+ ) + + assert response == response_snapshot + + +@pytest.mark.private +async def test_flash_qwen2_5_vl_simple_streaming(flash_qwen2_5, response_snapshot): + responses = await flash_qwen2_5.chat( + seed=42, + messages=[ + { + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png" + }, + }, + {"type": "text", "text": "Describe the image"}, + ], + }, + ], + stream=True, + ) + + count = 0 + generated = "" + last_response = None + async for response in responses: + count += 1 + generated += response.choices[0].delta.content + last_response = response + + assert ( + generated + == "The image depicts an anthropomorphic rabbit character wearing an intricate space suit, which includes a helmet with a starry face pattern and multiple suitors. The rabbit's ears are significantly large and upright, and it has a hitchhiker-like star antennas on its chest. The background is a reddish-orange, rocky landscape, suggesting a Martian environment. The suit has various buttons, a red button on the chest, and a reflective or illuminated dome on the head. The overall color scheme is dominated by shades of red, orange, and gray, giving a sense of a rugged, otherworldly setting." + ) + assert count == 121 + assert last_response == response_snapshot + + +@pytest.mark.private +async def test_flash_qwen2_5_vl_bay(flash_qwen2_5, response_snapshot): + response = await flash_qwen2_5.chat( + seed=42, + messages=[ + { + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg" + }, + }, + {"type": "text", "text": "Describe the image"}, + ], + }, + ], + ) + assert response == response_snapshot + + +@pytest.mark.private +async def test_flash_qwen2_5_vl_inpaint(flash_qwen2_5, response_snapshot): + response = await flash_qwen2_5.chat( + seed=42, + messages=[ + { + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/autopipeline-inpaint.png" + }, + }, + {"type": "text", "text": "Describe the image"}, + ], + }, + ], + ) + assert response == response_snapshot diff --git a/launcher/src/main.rs b/launcher/src/main.rs index 3c9ee8500..fbbe8a2d7 100644 --- a/launcher/src/main.rs +++ b/launcher/src/main.rs @@ -2053,7 +2053,7 @@ fn main() -> Result<(), LauncherError> { // this is a short term temporary fix to enable vlms to avoid rejecting images let default_optimal = match config { Some(ref config) => match config.model_type.as_deref() { - Some("qwen2_vl") => 10_000, + Some("qwen2_vl") | Some("qwen2_5_vl") => 10_000, _ => 4096, }, None => 4096, diff --git a/nix/overlay.nix b/nix/overlay.nix index f7b9b8c27..d90478192 100644 --- a/nix/overlay.nix +++ b/nix/overlay.nix @@ -13,16 +13,16 @@ final: prev: { ( python-self: python-super: with python-self; { # Python package override example: - # transformers = python-super.transformers.overrideAttrs ( - # _: _: { - # src = final.fetchFromGitHub { - # owner = "huggingface"; - # repo = "transformers"; - # rev = "2bd4d5897dc73e8b172832070a6f9e567a0df017"; - # hash = "sha256-JOIpKH9ssDEfI2Tf15e0iPKtThJwQ9GxMvRAnm+M2Pg="; - # }; - # } - # ); + transformers = python-super.transformers.overrideAttrs ( + _: _: { + src = final.fetchFromGitHub { + owner = "huggingface"; + repo = "transformers"; + rev = 
"8d73a38606bc342b370afe1f42718b4828d95aaa"; + hash = "sha256-MxroG6CWqrcmRS+eFt7Ej87TDOInN15aRPBUcaycKTI="; + }; + } + ); } ) ]; diff --git a/router/src/config.rs b/router/src/config.rs index a1ac107a2..a01359843 100644 --- a/router/src/config.rs +++ b/router/src/config.rs @@ -184,10 +184,43 @@ impl Qwen2Vl { } } +#[derive(Clone, Debug, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub struct Qwen2_5VlVisionConfig { + pub(crate) depth: usize, + pub(crate) hidden_act: String, + pub(crate) hidden_size: usize, + pub(crate) intermediate_size: usize, + pub(crate) num_heads: usize, + pub(crate) in_chans: usize, + pub(crate) out_hidden_size: usize, + pub(crate) patch_size: usize, + pub(crate) spatial_merge_size: usize, + pub(crate) spatial_patch_size: usize, + pub(crate) window_size: usize, + pub(crate) fullatt_block_indexes: Vec, + pub(crate) tokens_per_second: usize, + pub(crate) temporal_patch_size: usize, +} + +#[derive(Clone, Debug, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub struct Qwen2_5Vl { + pub(crate) vision_config: Qwen2_5VlVisionConfig, +} + +impl Qwen2_5Vl { + pub fn get_number_of_features(&self, height: usize, width: usize) -> usize { + let num_pixels = height * width; + num_pixels / self.vision_config.patch_size.pow(2) + } +} + #[derive(Clone, Debug, Serialize, Deserialize)] #[serde(tag = "model_type")] #[serde(rename_all = "snake_case")] pub enum Config { + Qwen2_5Vl(Qwen2_5Vl), Qwen2Vl(Qwen2Vl), LlavaNext(LlavaNext), ClipVisionModel(ClipVisionModel), diff --git a/router/src/validation.rs b/router/src/validation.rs index 3b021b672..320e7f03f 100644 --- a/router/src/validation.rs +++ b/router/src/validation.rs @@ -690,6 +690,10 @@ fn image_tokens( "<|vision_start|>{:?}<|vision_end|>", "<|image_pad|>".repeat(config.get_number_of_features(height, width)) ), + Qwen2_5Vl(config) => format!( + "<|vision_start|>{:?}<|vision_end|>", + "<|image_pad|>".repeat(config.get_number_of_features(height, width)) + ), _ => unimplemented!("Images tokens are not supported for this model configuration"), } } @@ -718,7 +722,7 @@ fn prepare_input( let (tokenizer_query, input_chunks) = match config { Some( config @ (Idefics | Mllama | Idefics2(_) | Idefics3(_) | Paligemma(_) | LlavaNext(_) - | Qwen2Vl(_)), + | Qwen2Vl(_) | Qwen2_5Vl(_)), ) => { let mut input_chunks = Vec::new(); let mut tokenizer_query = String::with_capacity(inputs.len()); diff --git a/server/text_generation_server/layers/rotary.py b/server/text_generation_server/layers/rotary.py index a7c7872ec..cd22a1f14 100644 --- a/server/text_generation_server/layers/rotary.py +++ b/server/text_generation_server/layers/rotary.py @@ -86,7 +86,6 @@ class PositionRotaryEmbedding(nn.Module): # `rope_type` is now standard in transformers, but some existing models # have `type` instead. 
rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None)) - mrope_section = rope_scaling.get("mrope_section", None) if rope_type == "linear": pass diff --git a/server/text_generation_server/models/__init__.py b/server/text_generation_server/models/__init__.py index 6d53a72b5..ab12429be 100644 --- a/server/text_generation_server/models/__init__.py +++ b/server/text_generation_server/models/__init__.py @@ -165,6 +165,11 @@ try: from text_generation_server.models.custom_modeling.qwen2_vl import ( Qwen2VLForConditionalGeneration, ) + from text_generation_server.models.custom_modeling.qwen2_5_vl import ( + Qwen2_5VLForConditionalGeneration, + Qwen2_5_VLConfig, + Qwen2_5_VLProcessor, + ) from text_generation_server.layers.attention import SUPPORTS_WINDOWING except ImportError as e: log_master(logger.warning, f"Could not import Flash Attention enabled models: {e}") @@ -318,6 +323,11 @@ class ModelType(enum.Enum): "name": "Qwen 2 VL", "url": "https://huggingface.co/collections/Qwen/qwen2-vl-66cee7455501d7126940800d", } + QWEN2_5_VL = { + "type": "qwen2_5_vl", + "name": "Qwen 2.5 VL", + "url": "https://huggingface.co/collections/Qwen/qwen25-66e81a666513e518adb90d9e", + } OPT = { "type": "opt", "name": "Opt", @@ -1387,6 +1397,21 @@ def get_model( trust_remote_code=trust_remote_code, lora_adapter_ids=lora_adapter_ids, ) + if model_type == QWEN2_5_VL: + return VlmCausalLM( + model_id=model_id, + model_class=Qwen2_5VLForConditionalGeneration, + revision=revision, + quantize=quantize, + speculator=speculator, + dtype=dtype, + default_dtype=torch.bfloat16, + kv_cache_dtype=kv_cache_dtype, + trust_remote_code=trust_remote_code, + lora_adapter_ids=lora_adapter_ids, + config_class=Qwen2_5_VLConfig, + processor_class=Qwen2_5_VLProcessor, + ) if model_type == MLLAMA: if FLASH_ATTENTION: return MllamaCausalLM( diff --git a/server/text_generation_server/models/custom_modeling/qwen2_5_vl.py b/server/text_generation_server/models/custom_modeling/qwen2_5_vl.py new file mode 100644 index 000000000..0ee3a8d97 --- /dev/null +++ b/server/text_generation_server/models/custom_modeling/qwen2_5_vl.py @@ -0,0 +1,974 @@ +# coding=utf-8 +# Copyright 2025 the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""PyTorch Qwen2.5 VL model.""" + +from typing import Optional, Tuple, List + +import torch +import torch.utils.checkpoint +from torch import nn +from text_generation_server.utils.import_utils import SYSTEM + +if SYSTEM == "ipex": + import intel_extension_for_pytorch as ipex +else: + import flash_attn_2_cuda + +import numpy as np + +from transformers.activations import ACT2FN +from transformers.configuration_utils import PretrainedConfig + +import torch.nn.functional as F + +from text_generation_server.layers.layernorm import FastRMSNorm +from text_generation_server.layers import ( + TensorParallelColumnLinear, + TensorParallelRowLinear, + TensorParallelEmbedding, + SpeculativeHead, +) +from text_generation_server.layers.attention import ( + Seqlen, +) +from text_generation_server.models.custom_modeling.flash_qwen2_modeling import ( + Qwen2Model, +) + +# Copied from: https://github.com/huggingface/transformers/blob/main/src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py +from typing import Union +from transformers.feature_extraction_utils import BatchFeature +from transformers.image_utils import ImageInput, VideoInput +from transformers.processing_utils import ( + ProcessingKwargs, + ProcessorMixin, + Unpack, + VideosKwargs, +) +from transformers.tokenization_utils_base import PreTokenizedInput, TextInput + + +class Qwen2_5_VLVideosProcessorKwargs(VideosKwargs, total=False): + fps: Union[List[float], float] + + +class Qwen2_5_VLProcessorKwargs(ProcessingKwargs, total=False): + videos_kwargs: Qwen2_5_VLVideosProcessorKwargs + _defaults = { + "text_kwargs": { + "padding": False, + }, + "videos_kwargs": {"fps": 2.0}, + } + + +class Qwen2_5_VLProcessor(ProcessorMixin): + r""" + Constructs a Qwen2.5-VL processor which wraps a Qwen2.5-VL image processor and a Qwen2 tokenizer into a single processor. + [`Qwen2_5_VLProcessor`] offers all the functionalities of [`Qwen2VLImageProcessor`] and [`Qwen2TokenizerFast`]. See the + [`~Qwen2_5_VLProcessor.__call__`] and [`~Qwen2_5_VLProcessor.decode`] for more information. + Args: + image_processor ([`Qwen2VLImageProcessor`], *optional*): + The image processor is a required input. + tokenizer ([`Qwen2TokenizerFast`], *optional*): + The tokenizer is a required input. + chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages + in a chat into a tokenizable string. + """ + + attributes = ["image_processor", "tokenizer"] + valid_kwargs = ["chat_template"] + + image_processor_class = "AutoImageProcessor" + tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast") + + def __init__( + self, image_processor=None, tokenizer=None, chat_template=None, **kwargs + ): + self.image_token = ( + "<|image_pad|>" + if not hasattr(tokenizer, "image_token") + else tokenizer.image_token + ) + self.video_token = ( + "<|video_pad|>" + if not hasattr(tokenizer, "video_token") + else tokenizer.video_token + ) + super().__init__(image_processor, tokenizer, chat_template=chat_template) + + def __call__( + self, + images: ImageInput = None, + text: Union[ + TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput] + ] = None, + videos: VideoInput = None, + **kwargs: Unpack[Qwen2_5_VLProcessorKwargs], + ) -> BatchFeature: + """ + Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` + and `kwargs` arguments to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`] if `text` is not `None` to encode + the text. 
To prepare the vision inputs, this method forwards the `vision_infos` and `kwrags` arguments to + Qwen2VLImageProcessor's [`~Qwen2VLImageProcessor.__call__`] if `vision_infos` is not `None`. + + Args: + images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch + tensor. Both channels-first and channels-last formats are supported. + text (`str`, `List[str]`, `List[List[str]]`): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + videos (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`): + The image or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch + tensor, or a nested list of 3D frames. Both channels-first and channels-last formats are supported. + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors of a particular framework. Acceptable values are: + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return NumPy `np.ndarray` objects. + - `'jax'`: Return JAX `jnp.ndarray` objects. + + Returns: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: + + - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. + - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when + `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not + `None`). + - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. + - **pixel_values_videos** -- Pixel values of videos to be fed to a model. Returned when `videos` is not `None`. + - **image_grid_thw** -- List of image 3D grid in LLM. Returned when `images` is not `None`. + - **video_grid_thw** -- List of video 3D grid in LLM. Returned when `videos` is not `None`. + - **second_per_grid_ts** -- List of video seconds per time grid. Returned when `videos` is not `None`. + """ + output_kwargs = self._merge_kwargs( + Qwen2_5_VLProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) + if images is not None: + image_inputs = self.image_processor( + images=images, videos=None, **output_kwargs["images_kwargs"] + ) + image_grid_thw = image_inputs["image_grid_thw"] + else: + image_inputs = {} + image_grid_thw = None + + if videos is not None: + videos_inputs = self.image_processor( + images=None, videos=videos, **output_kwargs["images_kwargs"] + ) + video_grid_thw = videos_inputs["video_grid_thw"] + + fps = output_kwargs["videos_kwargs"].pop("fps", 2.0) + if isinstance(fps, (int, float)): + second_per_grid_ts = [ + self.image_processor.temporal_patch_size / fps + ] * len(video_grid_thw) + elif hasattr(fps, "__len__") and len(fps) == len(video_grid_thw): + second_per_grid_ts = [ + self.image_processor.temporal_patch_size / tmp for tmp in fps + ] + else: + raise ValueError( + f"The length of fps ({len(fps) if hasattr(fps, '__len__') else fps}) must be equal to the length of video_grid_thw ({len(video_grid_thw)}) or fps should be a single number." 
+ ) + videos_inputs.update({"second_per_grid_ts": second_per_grid_ts}) + + else: + videos_inputs = {} + video_grid_thw = None + + if not isinstance(text, list): + text = [text] + + if image_grid_thw is not None: + merge_length = self.image_processor.merge_size**2 + index = 0 + for i in range(len(text)): + while self.image_token in text[i]: + text[i] = text[i].replace( + self.image_token, + "<|placeholder|>" + * (image_grid_thw[index].prod() // merge_length), + 1, + ) + index += 1 + text[i] = text[i].replace("<|placeholder|>", self.image_token) + + if video_grid_thw is not None: + merge_length = self.image_processor.merge_size**2 + index = 0 + for i in range(len(text)): + while self.video_token in text[i]: + text[i] = text[i].replace( + self.video_token, + "<|placeholder|>" + * (video_grid_thw[index].prod() // merge_length), + 1, + ) + index += 1 + text[i] = text[i].replace("<|placeholder|>", self.video_token) + + text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"]) + + return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs}) + + def batch_decode(self, *args, **kwargs): + """ + This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please + refer to the docstring of this method for more information. + """ + return self.tokenizer.batch_decode(*args, **kwargs) + + def decode(self, *args, **kwargs): + """ + This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to + the docstring of this method for more information. + """ + return self.tokenizer.decode(*args, **kwargs) + + def post_process_image_text_to_text(self, generated_outputs): + """ + Post-process the output of the model to decode the text. + + Args: + generated_outputs (`torch.Tensor` or `np.ndarray`): + The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)` + or `(sequence_length,)`. + + Returns: + `List[str]`: The decoded text. 
+ """ + return self.tokenizer.batch_decode( + generated_outputs, + skip_special_tokens=True, + clean_up_tokenization_spaces=False, + ) + + @property + def model_input_names(self): + tokenizer_input_names = self.tokenizer.model_input_names + image_processor_input_names = self.image_processor.model_input_names + names_from_processor = list( + dict.fromkeys(tokenizer_input_names + image_processor_input_names) + ) + return names_from_processor + ["second_per_grid_ts"] + + +# Copied from: https://github.com/huggingface/transformers/blob/main/src/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +class Qwen2_5_VLVisionConfig(PretrainedConfig): + model_type = "qwen2_5_vl" + base_config_key = "vision_config" + + def __init__( + self, + depth=32, + hidden_size=3584, + hidden_act="silu", + intermediate_size=3420, + num_heads=16, + in_channels=3, + patch_size=14, + spatial_merge_size=2, + spatial_patch_size=14, + temporal_patch_size=2, + tokens_per_second=4, + window_size=112, + out_hidden_size=3584, + fullatt_block_indexes=[7, 15, 23, 31], + **kwargs, + ): + super().__init__(**kwargs) + + self.depth = depth + self.hidden_size = hidden_size + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.num_heads = num_heads + self.in_channels = in_channels + self.patch_size = patch_size + self.spatial_patch_size = spatial_patch_size + self.spatial_merge_size = spatial_merge_size + self.temporal_patch_size = temporal_patch_size + self.tokens_per_second = tokens_per_second + self.window_size = window_size + self.fullatt_block_indexes = fullatt_block_indexes + self.out_hidden_size = out_hidden_size + + +class Qwen2_5_VLConfig(PretrainedConfig): + + def __init__( + self, + vocab_size=152064, + hidden_size=8192, + intermediate_size=29568, + num_hidden_layers=80, + num_attention_heads=64, + num_key_value_heads=8, + hidden_act="silu", + max_position_embeddings=32768, + initializer_range=0.02, + rms_norm_eps=1e-05, + use_cache=True, + tie_word_embeddings=False, + rope_theta=1000000.0, + use_sliding_window=False, + sliding_window=4096, + max_window_layers=80, + attention_dropout=0.0, + vision_config=None, + rope_scaling=None, + **kwargs, + ): + if vision_config is not None: + self.vision_config = Qwen2_5_VLVisionConfig(**vision_config) + + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.use_sliding_window = use_sliding_window + self.sliding_window = sliding_window + self.max_window_layers = max_window_layers + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + self.attention_dropout = attention_dropout + self.rope_scaling = rope_scaling + + # Validate the correctness of rotary position embeddings parameters + # BC: if there is a 'type' field, move it to 'rope_type'. + # and change type from 'mrope' to 'default' because `mrope` does defeault RoPE calculations + # one can set it to "linear"/"dynamic" etc. 
to have scaled RoPE + # TODO: @raushan update config in the hub + if self.rope_scaling is not None and "type" in self.rope_scaling: + if self.rope_scaling["type"] == "mrope": + self.rope_scaling["type"] = "default" + self.rope_scaling["rope_type"] = self.rope_scaling["type"] + + super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) + + +# Copied from transformers.models.llama.modeling_llama.rotate_half +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +def apply_rotary_pos_emb_vision( + tensor: torch.Tensor, freqs: torch.Tensor +) -> torch.Tensor: + orig_dtype = tensor.dtype + tensor = tensor.float() + cos = freqs.cos() + sin = freqs.sin() + cos = cos.unsqueeze(1).repeat(1, 1, 2).unsqueeze(0).float() + sin = sin.unsqueeze(1).repeat(1, 1, 2).unsqueeze(0).float() + output = (tensor * cos) + (rotate_half(tensor) * sin) + output = output.to(orig_dtype) + return output + + +class Qwen2_5VLAttention(nn.Module): + def __init__(self, *, prefix, config, weights): + super().__init__() + self.embed_dim = config.hidden_size // weights.process_group.size() + self.head_dim = config.hidden_size // config.num_heads + self.num_heads = config.num_heads // weights.process_group.size() + + self.qkv = TensorParallelColumnLinear.load_qkv( + config, + prefix=f"{prefix}.qkv", + weights=weights, + bias=False, + num_heads=self.num_heads, + num_key_value_heads=self.num_heads, + ) + self.qkv.linear.bias = weights.get_sharded(f"{prefix}.qkv.bias", dim=0) + + self.proj = TensorParallelRowLinear.load( + config, + prefix=f"{prefix}.proj", + weights=weights, + bias=True, + ) + self.softmax_scale = 1.0 / np.sqrt(self.embed_dim // self.num_heads) + + def forward( + self, + hidden_state: torch.Tensor, + cu_seqlens: torch.Tensor, + rotary_pos_emb: torch.Tensor, + max_seqlen: int, + ) -> torch.Tensor: + # apply the qkv linear layer to the hidden state + qkv = self.qkv(hidden_state) + query, key, value = qkv.split( + [self.embed_dim, self.embed_dim, self.embed_dim], dim=1 + ) + + # reshape the query, key, and value tensors + _shape = ( + hidden_state.shape[0], + self.num_heads, + self.embed_dim // self.num_heads, + ) + query = query.view(*_shape) + key = key.view(*_shape) + value = value.view(*_shape) + + # apply rotary positional embeddings + query = apply_rotary_pos_emb_vision(query.unsqueeze(0), rotary_pos_emb).squeeze( + 0 + ) + key = apply_rotary_pos_emb_vision(key.unsqueeze(0), rotary_pos_emb).squeeze(0) + + # calc maximum sequence length for any batch + query = query.contiguous() + key = key.contiguous() + value = value.contiguous() + causal = False + + # execute flash attention + if SYSTEM == "ipex": + attn_output = torch.empty_like(query) + ipex.llm.functional.varlen_attention( + (query.contiguous() if query.device.type == "xpu" else query), + (key.contiguous() if key.device.type == "xpu" else key), + (value.contiguous() if value.device.type == "xpu" else value), + attn_output, + cu_seqlens, + cu_seqlens, + max_seqlen, + max_seqlen, + 0.0, + self.softmax_scale, + False, + causal, + False, + None, + ) + else: + attn_output = flash_attn_2_cuda.varlen_fwd( + query, + key, + value, + None, # tmp buffer (auto-allocated) + cu_seqlens, # cu_seqlens_q + cu_seqlens, # cu_seqlens_k + None, # max_seqlen_q (auto-computed) + None, # max_seqlen_k (auto-computed) + None, # block_tables + None, # broadcast_mask + max_seqlen, # max_seqlen + max_seqlen, # max_seqlen + 0.0, # dropout_p + 
self.softmax_scale, + False, # zero_tensors + causal, # causal attention within each sequence + -1, # window_size_left + -1, # window_size_right + 0.0, # softmax_cap + False, # deterministic + None, # rng_state + )[0] + + # reshape output to original dimensions + attn_output = attn_output.reshape(hidden_state.shape[0], -1) + attn_output = self.proj(attn_output) + return attn_output + + +class Qwen2_5VLVisionMLP(nn.Module): + def __init__(self, *, prefix, config, weights): + super().__init__() + self.activation_fn = ACT2FN[config.hidden_act] + + self.intermediate_size = ( + config.intermediate_size // weights.process_group.size() + ) + + self.up = TensorParallelColumnLinear.load( + prefix=f"{prefix}.up_proj", weights=weights, config=config, bias=True + ) + self.gate = TensorParallelColumnLinear.load( + prefix=f"{prefix}.gate_proj", weights=weights, config=config, bias=True + ) + self.down = TensorParallelRowLinear.load( + prefix=f"{prefix}.down_proj", weights=weights, config=config, bias=True + ) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + gate_states = self.gate(hidden_states) + up_states = self.up(hidden_states) + activated_states = self.activation_fn(gate_states) * up_states + down_states = self.down(activated_states) + return down_states + + +class Qwen2_5VLVisionBlock(nn.Module): + def __init__(self, prefix, config, weights): + super().__init__() + self.attn = Qwen2_5VLAttention( + prefix=f"{prefix}.attn", + config=config, + weights=weights, + ) + self.norm1 = FastRMSNorm.load( + prefix=f"{prefix}.norm1", + weights=weights, + eps=1e-6, + ) + self.norm2 = FastRMSNorm.load( + prefix=f"{prefix}.norm2", + weights=weights, + eps=1e-6, + ) + self.mlp = Qwen2_5VLVisionMLP( + prefix=f"{prefix}.mlp", + config=config, + weights=weights, + ) + + def forward( + self, hidden_states, cu_seqlens, rotary_pos_emb, max_seqlen + ) -> torch.Tensor: + norm1_out, _ = self.norm1(hidden_states) + attn_out = self.attn(norm1_out, cu_seqlens, rotary_pos_emb, max_seqlen) + hidden_states = hidden_states + attn_out + norm2_out, _ = self.norm2(hidden_states) + mlp_out = self.mlp(norm2_out) + hidden_states = hidden_states + mlp_out + return hidden_states + + +class Qwen2_5VLPatchMerger(nn.Module): + def __init__(self, *, prefix, config, weights): + super().__init__() + self.hidden_size = config.hidden_size * (config.spatial_merge_size**2) + self.patch_merger_ln_q = FastRMSNorm.load( + prefix=f"{prefix}.ln_q", + weights=weights, + eps=1e-6, + ) + self.fc1 = TensorParallelColumnLinear.load( + prefix=f"{prefix}.mlp.0", weights=weights, config=config, bias=True + ) + self.fc2 = TensorParallelRowLinear.load( + prefix=f"{prefix}.mlp.2", weights=weights, config=config, bias=True + ) + + def forward(self, hidden_states) -> torch.Tensor: + hidden_states, _ = self.patch_merger_ln_q(hidden_states) + hidden_states = hidden_states.view(-1, self.hidden_size) + hidden_states = self.fc1(hidden_states) + hidden_states = F.gelu(hidden_states) + hidden_states = self.fc2(hidden_states) + return hidden_states + + +class Qwen2_5VisionModel(nn.Module): + def __init__(self, *, prefix, config, weights): + super().__init__() + + self.spatial_merge_size = config.spatial_merge_size + kernel_size = [config.temporal_patch_size, config.patch_size, config.patch_size] + self.patch_embedding = nn.Conv3d( + in_channels=config.in_channels, + out_channels=config.hidden_size, + kernel_size=kernel_size, + stride=kernel_size, + bias=False, + ) + self.patch_embedding.weight = nn.Parameter( + 
weights.get_tensor(f"{prefix}.patch_embed.proj.weight"), requires_grad=False + ) + head_dim = config.hidden_size // config.num_heads + + theta = 10000.0 + dim = head_dim // 2 + inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + self.blocks = nn.ModuleList( + [ + Qwen2_5VLVisionBlock( + prefix=f"{prefix}.blocks.{i}", + config=config, + weights=weights, + ) + for i in range(config.depth) + ] + ) + self.merger = Qwen2_5VLPatchMerger( + prefix=f"{prefix}.merger", + config=config, + weights=weights, + ) + # import ipdb; ipdb.set_trace() + self.temporal_patch_size = config.temporal_patch_size + self.spatial_patch_size = config.spatial_patch_size + self.in_channels = config.in_channels + self.embed_dim = config.hidden_size + self.window_size = config.window_size + self.patch_size = config.patch_size + self.spatial_merge_unit = config.spatial_merge_size * config.spatial_merge_size + self.fullatt_block_indexes = config.fullatt_block_indexes + + def apply_class_embedding(self, hidden_state: torch.Tensor) -> torch.Tensor: + batch_size, _, hidden_size = hidden_state.shape + class_embedding = self.class_embedding.expand(batch_size, 1, hidden_size) + hidden_state = torch.cat([class_embedding, hidden_state], dim=1) + return hidden_state + + def get_window_index(self, grid_thw): + window_index: list = [] + cu_window_seqlens: list = [0] + window_index_id = 0 + vit_merger_window_size = ( + self.window_size // self.spatial_merge_size // self.patch_size + ) + + for grid_t, grid_h, grid_w in grid_thw: + llm_grid_h, llm_grid_w = ( + grid_h // self.spatial_merge_size, + grid_w // self.spatial_merge_size, + ) + index = torch.arange(grid_t * llm_grid_h * llm_grid_w).reshape( + grid_t, llm_grid_h, llm_grid_w + ) + pad_h = vit_merger_window_size - llm_grid_h % vit_merger_window_size + pad_w = vit_merger_window_size - llm_grid_w % vit_merger_window_size + num_windows_h = (llm_grid_h + pad_h) // vit_merger_window_size + num_windows_w = (llm_grid_w + pad_w) // vit_merger_window_size + index_padded = F.pad(index, (0, pad_w, 0, pad_h), "constant", -100) + index_padded = index_padded.reshape( + grid_t, + num_windows_h, + vit_merger_window_size, + num_windows_w, + vit_merger_window_size, + ) + index_padded = index_padded.permute(0, 1, 3, 2, 4).reshape( + grid_t, + num_windows_h * num_windows_w, + vit_merger_window_size, + vit_merger_window_size, + ) + seqlens = (index_padded != -100).sum([2, 3]).reshape(-1) + index_padded = index_padded.reshape(-1) + index_new = index_padded[index_padded != -100] + window_index.append(index_new + window_index_id) + cu_seqlens_tmp = ( + seqlens.cumsum(0) * self.spatial_merge_unit + cu_window_seqlens[-1] + ) + cu_window_seqlens.extend(cu_seqlens_tmp.tolist()) + window_index_id += (grid_t * llm_grid_h * llm_grid_w).item() + window_index = torch.cat(window_index, dim=0) + + return window_index, cu_window_seqlens + + def forward( + self, + pixel_values: torch.Tensor, + grid_thw: Optional[torch.LongTensor] = None, + ) -> torch.Tensor: + + # reshape the input tensor for processing + shape = ( + -1, + self.in_channels, + self.temporal_patch_size, + self.spatial_patch_size, + self.spatial_patch_size, + ) + pixel_values = pixel_values.view(shape).to(self.patch_embedding.weight.dtype) + hidden_states = self.patch_embedding(pixel_values).view(-1, self.embed_dim) + # TODO: revisit to see if we can avoid some of these reshapes + + # find the position ids for the input tensor based on the grid_thw + pos_ids = 
[] + for t, h, w in grid_thw: + hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w) + hpos_ids = hpos_ids.reshape( + h // self.spatial_merge_size, + self.spatial_merge_size, + w // self.spatial_merge_size, + self.spatial_merge_size, + ) + hpos_ids = hpos_ids.permute(0, 2, 1, 3) + hpos_ids = hpos_ids.flatten() + + wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1) + wpos_ids = wpos_ids.reshape( + h // self.spatial_merge_size, + self.spatial_merge_size, + w // self.spatial_merge_size, + self.spatial_merge_size, + ) + wpos_ids = wpos_ids.permute(0, 2, 1, 3) + wpos_ids = wpos_ids.flatten() + pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1)) + + pos_ids = torch.cat(pos_ids, dim=0) + + max_grid_size = grid_thw[:, 1:].max() + + # apply the positional embeddings to the position ids + seq = torch.arange( + max_grid_size, device=self.inv_freq.device, dtype=self.inv_freq.dtype + ) + rotary_pos_emb_full = torch.outer(seq, self.inv_freq) + rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1) + window_index, cu_window_seqlens = self.get_window_index(grid_thw) + seq_len = hidden_states.shape[0] + patch_shape = (seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1) + og_shape = (seq_len, -1) + + hidden_states = hidden_states.view(patch_shape)[window_index, :, :].view( + og_shape + ) + rotary_pos_emb = rotary_pos_emb.view(patch_shape)[window_index, :, :].view( + og_shape + ) + + rotary_pos_emb = rotary_pos_emb.to(device=hidden_states.device) + + cu_window_seqlens = torch.tensor( + cu_window_seqlens, + device=hidden_states.device, + dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32, + ) + cu_window_seqlens = torch.unique_consecutive(cu_window_seqlens) + + # create a cu_seqlens tensor to be used in the attention mask + cu_seqlens = torch.repeat_interleave( + grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0] + ).cumsum(dim=0, dtype=torch.int32) + cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0) + max_seqlen = torch.max(cu_seqlens[1:] - cu_seqlens[:-1]) + + # iterately apply the blocks to the hidden states + for layer_num, block in enumerate(self.blocks): + # NOTE: qwen2_5_vl.py has a concept of full attention blocks + # that are applied at specific layers. 
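            # With the default fullatt_block_indexes = [7, 15, 23, 31] from the
            # vision config above, every eighth block attends over the full
            # per-image cu_seqlens, while all other blocks are restricted to the
            # local windows described by cu_window_seqlens (illustrative reading
            # of the defaults; the actual indexes come from the checkpoint config).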
+ if layer_num in self.fullatt_block_indexes: + cu_seqlens_now = cu_seqlens + else: + cu_seqlens_now = cu_window_seqlens + + hidden_states = block( + hidden_states, cu_seqlens_now, rotary_pos_emb, max_seqlen + ) + + # apply the final patch merger to the hidden states + hidden_states = self.merger(hidden_states) + reverse_indices = torch.argsort(window_index) + hidden_states = hidden_states[reverse_indices, :] + return hidden_states + + +class Qwen2_5VLForConditionalGeneration(nn.Module): + def __init__(self, prefix, config, weights): + super().__init__() + self.config = config + config.vision_config.quantize = None + config.vision_config.speculator = config.speculator + # set rope_scaling.type == "mrope" since AutoConfig.from_pretrained incorrectly + # returns rope_scaling.type == "default" for Qwen2_5-VL model at the moment + if ( + hasattr(config, "rope_scaling") + and config.rope_scaling is not None + and config.rope_scaling.get("type", None) == "default" + ): + config.rope_scaling.update({"rope_type": "mrope"}) + self.hidden_size = config.hidden_size + self.vision_start_token_id = config.vision_start_token_id + self.vision_end_token_id = config.vision_end_token_id + self.image_token_id = config.image_token_id + self.video_token_id = config.video_token_id + self.spatial_merge_size = config.vision_config.spatial_merge_size + self.embed_tokens = TensorParallelEmbedding( + prefix="model.embed_tokens", weights=weights + ) + self.visual = Qwen2_5VisionModel( + prefix="visual", config=config.vision_config, weights=weights + ) + self.text_model = Qwen2Model(prefix=None, config=config, weights=weights) + if config.tie_word_embeddings: + suffix = "model.embed_tokens" + else: + suffix = "lm_head" + + self.lm_head = SpeculativeHead.load( + config, + prefix=suffix if not prefix else f"{prefix}.{suffix}", + weights=weights, + ) + self.device = weights.device + + # based on https://github.com/huggingface/transformers/blob/e284c7e954abe12c34b50461c17f8115a0afe115/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py#L1391 + # modified to first find segments then initialize position ids for each segment + # Steps: + # locate all vision and text segments + # calculate `vision_segment_lengths` for each vision segment to be use as offset + # calculate `text_segment_lengths` for each text segment to be used as offset + # create position ids for each vision segment based on the image grid + # create position ids for each text segment + # combine all the position ids + # the final segment is the difference between the last vision segment and the end of the input + # combine all the position ids and reshape to (3, input_ids_len) then swap dimensions to (input_ids_len, 3) + def get_position_ids( + self, + input_ids: torch.Tensor, + image_grid_thw: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + if image_grid_thw is None: + return ( + torch.arange(input_ids.shape[0], device=input_ids.device) + .unsqueeze(1) + .repeat(1, 3) + ) + + spatial_merge_size = self.spatial_merge_size + vision_start_token_id = self.vision_start_token_id + vision_end_token_id = self.vision_end_token_id + device = input_ids.device + dtype = input_ids.dtype + input_ids_len = input_ids.shape[0] + + vision_starts = torch.where(input_ids == vision_start_token_id)[0] + vision_ends = torch.where(input_ids == vision_end_token_id)[0] + vision_segments = torch.stack((vision_starts, vision_ends), dim=1) + prev_vision_end = torch.cat( + [torch.zeros(1, device=vision_ends.device, dtype=dtype), vision_ends[:-1]] + ) + text_lengths_between_vision 
= vision_segments[:, 0] - prev_vision_end + 1 + vision_widths_max = torch.cat( + [ + torch.zeros(1, device=image_grid_thw.device, dtype=dtype), + image_grid_thw[:-1, 2] // spatial_merge_size, + ] + ) + vision_segment_lengths = vision_widths_max + text_lengths_between_vision + vision_segment_lengths = vision_segment_lengths.cumsum(dim=0) + text_segment_lengths = vision_segment_lengths - text_lengths_between_vision + + # create position ids for each vision segment based on the image grid + llm_pos_ids_list = [] + for i, _ in enumerate(vision_segments): + t, h, w = ( + image_grid_thw[i][0], + image_grid_thw[i][1] // spatial_merge_size, + image_grid_thw[i][2] // spatial_merge_size, + ) + t_indices = torch.arange(t, device=device).repeat_interleave(h * w) + h_indices = torch.arange(h, device=device).repeat_interleave(w).repeat(t) + w_indices = torch.arange(w, device=device).repeat(t * h) + image_position_ids = torch.stack([t_indices, h_indices, w_indices], dim=0) + + # offset by the position of the last vision segment + im = image_position_ids + vision_segment_lengths[i] + llm_pos_ids_list.append(im) + + # create position ids for each text segment + text_ranges = [ + torch.arange(seq_len, device=device).view(1, -1).expand(3, -1) + + text_segment_lengths[i] + for i, seq_len in enumerate(text_lengths_between_vision) + ] + + full_llm_pos_ids_list = [ + item for sublist in zip(text_ranges, llm_pos_ids_list) for item in sublist + ] + # import ipdb + + # ipdb.set_trace() + max_s = full_llm_pos_ids_list[-1].max() + 1 + final_text_len = input_ids_len - vision_ends[-1] + if final_text_len > 0: + m = torch.arange(final_text_len, device=device).view(1, -1).expand(3, -1) + full_llm_pos_ids_list.append(m + max_s) + + position_ids = ( + torch.cat(full_llm_pos_ids_list, dim=1).reshape(3, -1).transpose(0, 1) + ) + return position_ids + + def forward( + self, + input_ids: torch.Tensor, + position_ids: torch.Tensor, + cu_seqlen_prefill: Optional[torch.Tensor], + kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], + block_tables: torch.Tensor, + slots: torch.Tensor, + seqlen: Seqlen, + max_s: int, + prefill_cache_indices: Optional[torch.Tensor], + lm_head_indices: Optional[torch.Tensor], + pixel_values: torch.FloatTensor = None, + image_grid_thw: Optional[torch.LongTensor] = None, + # Unused in this model + video_grid_thw: Optional[torch.LongTensor] = None, + pixel_attention_mask=None, + image_sizes: Optional[torch.LongTensor] = None, + adapter_data: Optional[torch.Tensor] = None, + cross_attention_states: Optional[torch.Tensor] = None, + image_indices=None, + ): + inputs_embeds = self.embed_tokens(input_ids) + + # apply the visual model to the pixel values if they are provided + if pixel_values is not None and len(pixel_values) > 0: + if pixel_values is not None: + image_embeds = self.visual( + pixel_values, grid_thw=image_grid_thw + ).squeeze(0) + inputs_embeds[input_ids == self.image_token_id] = image_embeds + + hidden_states = self.text_model( + inputs_embeds=inputs_embeds, + position_ids=position_ids, + cu_seqlen_prefill=cu_seqlen_prefill, + kv_cache=kv_cache, + block_tables=block_tables, + slots=slots, + seqlen=seqlen, + max_s=max_s, + true_max_s=max_s, + prefill_cache_indices=prefill_cache_indices, + ) + if lm_head_indices is not None: + hidden_states = hidden_states[lm_head_indices] + logits, speculative_logits = self.lm_head(hidden_states) + return logits, speculative_logits diff --git a/server/text_generation_server/models/custom_modeling/qwen2_vl.py 
b/server/text_generation_server/models/custom_modeling/qwen2_vl.py index 2d017e382..a72e0e556 100644 --- a/server/text_generation_server/models/custom_modeling/qwen2_vl.py +++ b/server/text_generation_server/models/custom_modeling/qwen2_vl.py @@ -379,7 +379,12 @@ class Qwen2VLForConditionalGeneration(nn.Module): config.vision_config.speculator = config.speculator # set rope_scaling.type == "mrope" since AutoConfig.from_pretrained incorrectly # returns rope_scaling.type == "default" for Qwen2-VL model at the moment - config.rope_scaling.update({"rope_type": "mrope"}) + if ( + hasattr(config, "rope_scaling") + and config.rope_scaling is not None + and config.rope_scaling.get("type", None) == "default" + ): + config.rope_scaling.update({"rope_type": "mrope"}) self.hidden_size = config.hidden_size self.vision_start_token_id = config.vision_start_token_id self.vision_end_token_id = config.vision_end_token_id @@ -520,7 +525,6 @@ class Qwen2VLForConditionalGeneration(nn.Module): # apply the visual model to the pixel values if they are provided if pixel_values is not None and len(pixel_values) > 0: - pixel_values = pixel_values.to(inputs_embeds.dtype) if pixel_values is not None: image_embeds = self.visual( pixel_values, grid_thw=image_grid_thw diff --git a/server/text_generation_server/models/vlm_causal_lm.py b/server/text_generation_server/models/vlm_causal_lm.py index db78341d1..39046f2a8 100644 --- a/server/text_generation_server/models/vlm_causal_lm.py +++ b/server/text_generation_server/models/vlm_causal_lm.py @@ -123,6 +123,11 @@ def image_text_replacement(processor, image_input, config, image_id: int) -> str num_pads = grid_t * grid_h * grid_w // 4 padding = "<|image_pad|>" * num_pads return f"<|vision_start|>{padding}<|vision_end|>" + elif config.model_type == "qwen2_5_vl": + grid_t, grid_h, grid_w = image_input["image_grid_thw"][image_id] + num_pads = grid_t * grid_h * grid_w // 4 + padding = "<|image_pad|>" * num_pads + return f"<|vision_start|>{padding}<|vision_end|>" else: raise RuntimeError(f"Unknown config {config.model_type} for multimodal") @@ -231,7 +236,7 @@ class VlmCausalLMBatch(FlashCausalLMBatch): image = Image.open(BytesIO(chunk.image.data)) # qwen2_vl expects images to be greater than 20 pixels, this is for warmup since the # default warmup image is 20x20 - if config.model_type == "qwen2_vl": + if config.model_type in {"qwen2_vl", "qwen2_5_vl"}: if image.width <= 20: w = image.width * 2 h = image.height * 2 @@ -422,7 +427,7 @@ class VlmCausalLM(FlashCausalLM): max_s = batch.max_current_length lm_head_indices = batch.prefill_head_indices - if self.model.config.model_type == "qwen2_vl": + if self.model.config.model_type in {"qwen2_vl", "qwen2_5_vl"}: if position_ids.dim() == 1 and batch.prefilling: position_ids = self.model.get_position_ids( input_ids, batch.image_grid_thw From fde3234cbc0d2e838bdff9420ab8dc8200256ea8 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Wed, 19 Feb 2025 14:53:14 +0100 Subject: [PATCH 095/183] Using public external registry (to use external runners for CI). (#3031) * Using public external registry (to use external runners for CI). * Fix build. * Fixing the external registry. * Fixing trtllm tests. 
--- .github/workflows/build.yaml | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 7c91d7b0f..d9f586678 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -137,6 +137,7 @@ jobs: install: true buildkitd-config: /tmp/buildkitd.toml - name: Login to internal Container Registry + if: github.event_name != 'pull_request' uses: docker/login-action@v3 with: username: ${{ secrets.REGISTRY_USERNAME }} @@ -149,6 +150,12 @@ jobs: registry: ghcr.io username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} + - name: Login to Docker Hub Container Registry + uses: docker/login-action@v3 + with: + registry: docker.io + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_PASSWORD }} - name: Login to Azure Container Registry if: github.event_name != 'pull_request' uses: docker/login-action@v3 @@ -163,7 +170,7 @@ jobs: uses: docker/metadata-action@v5 with: images: | - registry.internal.huggingface.tech/api-inference/community/text-generation-inference + docker.io/huggingface/text-generation-inference-ci tags: | type=raw,value=sha-${{ env.GITHUB_SHA_SHORT }}${{ env.LABEL }} # If main, release or tag @@ -207,7 +214,7 @@ jobs: - name: Final id: final run: | - echo "docker_image=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:sha-${{ env.GITHUB_SHA_SHORT}}${{ env.LABEL }}" >> "$GITHUB_OUTPUT" + echo "docker_image=docker.io/huggingface/text-generation-inference-ci:sha-${{ env.GITHUB_SHA_SHORT}}${{ env.LABEL }}" >> "$GITHUB_OUTPUT" echo "docker_devices=${{ env.DOCKER_DEVICES }}" >> "$GITHUB_OUTPUT" echo "docker_volume=${{ env.DOCKER_VOLUME }}" >> "$GITHUB_OUTPUT" echo "runs_on=${{ env.RUNS_ON }}" >> "$GITHUB_OUTPUT" @@ -257,8 +264,8 @@ jobs: container: image: ${{ needs.build-and-push.outputs.docker_image }} credentials: - username: ${{ secrets.REGISTRY_USERNAME }} - password: ${{ secrets.REGISTRY_PASSWORD }} + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_PASSWORD }} options: --gpus all --shm-size=8g steps: From 9c89d0070e83e96aef522db887065cde8ff4cdec Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Wed, 19 Feb 2025 17:01:33 +0100 Subject: [PATCH 096/183] Having less logs in case of failure for checking CI more easily. (#3037) * Having less logs in case of failure for checking CI more easily. * Cleaning up the versions to uv for the client. * Ignore entirely the API. 
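In essence, the conftest changes below buffer every launcher/container log line
into a shared temporary file and only replay it to stderr once a test has
actually failed, so green CI runs stay quiet. A simplified sketch of that
pattern (the fixture names error_log and container_log come from the diff;
everything else is trimmed down for illustration):

    import sys
    import tempfile

    import pytest

    @pytest.fixture(scope="module")
    def error_log():
        # Shared buffer per test module; launcher stdout/stderr is redirected here.
        with tempfile.TemporaryFile("w+") as tmp:
            yield tmp

    @pytest.fixture(autouse=True)
    def container_log(request, error_log):
        yield
        # Replay the captured output only when something actually failed.
        if request.session.testsfailed:
            error_log.seek(0)
            print(error_log.read(), file=sys.stderr)
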
--- clients/python/tests/conftest.py | 2 +- clients/python/tests/test_client.py | 14 +- clients/python/tests/test_inference_api.py | 84 +- clients/python/text_generation/client.py | 2 +- integration-tests/conftest.py | 81 +- integration-tests/poetry.lock | 1174 -------------------- integration-tests/pyproject.toml | 21 +- integration-tests/requirements.txt | 121 +- integration-tests/uv.lock | 829 ++++++++++++++ 9 files changed, 1029 insertions(+), 1299 deletions(-) delete mode 100644 integration-tests/poetry.lock create mode 100644 integration-tests/uv.lock diff --git a/clients/python/tests/conftest.py b/clients/python/tests/conftest.py index 17bb73b5d..f3db6e68a 100644 --- a/clients/python/tests/conftest.py +++ b/clients/python/tests/conftest.py @@ -21,7 +21,7 @@ def fake_model(): @pytest.fixture def unsupported_model(): - return "gpt2" + return "google-bert/bert-base-uncased" @pytest.fixture diff --git a/clients/python/tests/test_client.py b/clients/python/tests/test_client.py index 8aed865b7..f6ffc89d0 100644 --- a/clients/python/tests/test_client.py +++ b/clients/python/tests/test_client.py @@ -13,8 +13,8 @@ def test_generate(llama_7b_url, hf_headers): assert response.details.finish_reason == FinishReason.Length assert response.details.generated_tokens == 1 assert response.details.seed is None - assert len(response.details.prefill) == 2 - assert response.details.prefill[0] == InputToken(id=1, text="", logprob=None) + assert len(response.details.prefill) == 0 + # assert response.details.prefill[0] == InputToken(id=1, text="", logprob=None) assert len(response.details.tokens) == 1 assert response.details.tokens[0].id == 29918 assert response.details.tokens[0].text == "_" @@ -83,11 +83,11 @@ async def test_generate_async(llama_7b_url, hf_headers): assert response.details.finish_reason == FinishReason.Length assert response.details.generated_tokens == 1 assert response.details.seed is None - assert len(response.details.prefill) == 2 - assert response.details.prefill[0] == InputToken(id=1, text="", logprob=None) - assert response.details.prefill[1] == InputToken( - id=1243, text="test", logprob=-10.96875 - ) + assert len(response.details.prefill) == 0 + # assert response.details.prefill[0] == InputToken(id=1, text="", logprob=None) + # assert response.details.prefill[1] == InputToken( + # id=1243, text="test", logprob=-10.96875 + # ) assert len(response.details.tokens) == 1 assert response.details.tokens[0].id == 29918 assert response.details.tokens[0].text == "_" diff --git a/clients/python/tests/test_inference_api.py b/clients/python/tests/test_inference_api.py index 59297c26e..5a2584059 100644 --- a/clients/python/tests/test_inference_api.py +++ b/clients/python/tests/test_inference_api.py @@ -1,42 +1,42 @@ -import pytest - -from text_generation import ( - InferenceAPIClient, - InferenceAPIAsyncClient, - Client, - AsyncClient, -) -from text_generation.errors import NotSupportedError, NotFoundError -from text_generation.inference_api import check_model_support, deployed_models - - -def test_check_model_support(flan_t5_xxl, unsupported_model, fake_model): - assert check_model_support(flan_t5_xxl) - assert not check_model_support(unsupported_model) - - with pytest.raises(NotFoundError): - check_model_support(fake_model) - - -def test_deployed_models(): - deployed_models() - - -def test_client(flan_t5_xxl): - client = InferenceAPIClient(flan_t5_xxl) - assert isinstance(client, Client) - - -def test_client_unsupported_model(unsupported_model): - with pytest.raises(NotSupportedError): - 
InferenceAPIClient(unsupported_model) - - -def test_async_client(flan_t5_xxl): - client = InferenceAPIAsyncClient(flan_t5_xxl) - assert isinstance(client, AsyncClient) - - -def test_async_client_unsupported_model(unsupported_model): - with pytest.raises(NotSupportedError): - InferenceAPIAsyncClient(unsupported_model) +# import pytest +# +# from text_generation import ( +# InferenceAPIClient, +# InferenceAPIAsyncClient, +# Client, +# AsyncClient, +# ) +# from text_generation.errors import NotSupportedError, NotFoundError +# from text_generation.inference_api import check_model_support, deployed_models +# +# +# def test_check_model_support(flan_t5_xxl, unsupported_model, fake_model): +# assert check_model_support(flan_t5_xxl) +# assert not check_model_support(unsupported_model) +# +# with pytest.raises(NotFoundError): +# check_model_support(fake_model) +# +# +# def test_deployed_models(): +# deployed_models() +# +# +# def test_client(flan_t5_xxl): +# client = InferenceAPIClient(flan_t5_xxl) +# assert isinstance(client, Client) +# +# +# def test_client_unsupported_model(unsupported_model): +# with pytest.raises(NotSupportedError): +# InferenceAPIClient(unsupported_model) +# +# +# def test_async_client(flan_t5_xxl): +# client = InferenceAPIAsyncClient(flan_t5_xxl) +# assert isinstance(client, AsyncClient) +# +# +# def test_async_client_unsupported_model(unsupported_model): +# with pytest.raises(NotSupportedError): +# InferenceAPIAsyncClient(unsupported_model) diff --git a/clients/python/text_generation/client.py b/clients/python/text_generation/client.py index 45301b632..0b60d93aa 100644 --- a/clients/python/text_generation/client.py +++ b/clients/python/text_generation/client.py @@ -867,7 +867,7 @@ class AsyncClient: async with ClientSession( headers=self.headers, cookies=self.cookies, timeout=self.timeout ) as session: - async with session.post(self.base_url, json=request.dict()) as resp: + async with session.post(self.base_url, json=request.model_dump()) as resp: payload = await resp.json() if resp.status != 200: diff --git a/integration-tests/conftest.py b/integration-tests/conftest.py index 6f8aa715c..2d3ae8a2c 100644 --- a/integration-tests/conftest.py +++ b/integration-tests/conftest.py @@ -1,4 +1,5 @@ # ruff: noqa: E402 +from _pytest.fixtures import SubRequest import requests @@ -10,13 +11,13 @@ class SessionTimeoutFix(requests.Session): requests.sessions.Session = SessionTimeoutFix +import warnings import asyncio import contextlib import json import math import os import random -import shutil import subprocess import sys import tempfile @@ -72,6 +73,16 @@ def pytest_collection_modifyitems(config, items): item.add_marker(skip_release) +@pytest.fixture(autouse=True) +def container_log(request: SubRequest): + error_log = request.getfixturevalue("error_log") + assert error_log is not None + yield + if request.session.testsfailed: + error_log.seek(0) + print(error_log.read(), file=sys.stderr) + + class ResponseComparator(JSONSnapshotExtension): rtol = 0.2 ignore_logprob = False @@ -278,8 +289,10 @@ class IgnoreLogProbResponseComparator(ResponseComparator): class LauncherHandle: - def __init__(self, port: int): - self.client = AsyncClient(f"http://localhost:{port}", timeout=30) + def __init__(self, port: int, error_log): + with warnings.catch_warnings(action="ignore"): + self.client = AsyncClient(f"http://localhost:{port}", timeout=30) + self.error_log = error_log def _inner_health(self): raise NotImplementedError @@ -288,6 +301,8 @@ class LauncherHandle: assert timeout > 0 for _ in 
range(timeout): if not self._inner_health(): + self.error_log.seek(0) + print(self.error_log.read(), file=sys.stderr) raise RuntimeError("Launcher crashed") try: @@ -295,12 +310,14 @@ class LauncherHandle: return except (ClientConnectorError, ClientOSError, ServerDisconnectedError): time.sleep(1) + self.error_log.seek(0) + print(self.error_log.read(), file=sys.stderr) raise RuntimeError("Health check failed") class ContainerLauncherHandle(LauncherHandle): - def __init__(self, docker_client, container_name, port: int): - super(ContainerLauncherHandle, self).__init__(port) + def __init__(self, docker_client, container_name, port: int, error_log): + super().__init__(port, error_log) self.docker_client = docker_client self.container_name = container_name @@ -310,8 +327,8 @@ class ContainerLauncherHandle(LauncherHandle): class ProcessLauncherHandle(LauncherHandle): - def __init__(self, process, port: int): - super(ProcessLauncherHandle, self).__init__(port) + def __init__(self, process, port: int, error_log): + super().__init__(port, error_log) self.process = process def _inner_health(self) -> bool: @@ -334,14 +351,13 @@ def ignore_logprob_response_snapshot(snapshot): @pytest.fixture(scope="module") -def event_loop(): - loop = asyncio.get_event_loop() - yield loop - loop.close() +def error_log(): + with tempfile.TemporaryFile("w+") as tmp: + yield tmp @pytest.fixture(scope="module") -def launcher(event_loop): +async def launcher(error_log): @contextlib.contextmanager def local_launcher( model_id: str, @@ -429,22 +445,19 @@ def launcher(event_loop): if attention is not None: env["ATTENTION"] = attention - with tempfile.TemporaryFile("w+") as tmp: + # with tempfile.TemporaryFile("w+") as tmp: # We'll output stdout/stderr to a temporary file. Using a pipe # cause the process to block until stdout is read. - with subprocess.Popen( - args, - stdout=tmp, - stderr=subprocess.STDOUT, - env=env, - ) as process: - yield ProcessLauncherHandle(process, port) + with subprocess.Popen( + args, + stdout=error_log, + stderr=subprocess.STDOUT, + env=env, + ) as process: + yield ProcessLauncherHandle(process, port, error_log=error_log) - process.terminate() - process.wait(60) - - tmp.seek(0) - shutil.copyfileobj(tmp, sys.stderr) + process.terminate() + process.wait(60) if not use_flash_attention: del env["USE_FLASH_ATTENTION"] @@ -578,8 +591,21 @@ def launcher(event_loop): shm_size="1G", ) + def pipe(): + for log in container.logs(stream=True): + log = log.decode("utf-8") + error_log.write(log) + + # Start looping to pipe the logs + import threading + + t = threading.Thread(target=pipe, args=()) + t.start() + try: - yield ContainerLauncherHandle(client, container.name, port) + yield ContainerLauncherHandle( + client, container.name, port, error_log=error_log + ) if not use_flash_attention: del env["USE_FLASH_ATTENTION"] @@ -590,9 +616,6 @@ def launcher(event_loop): except NotFound: pass - container_output = container.logs().decode("utf-8") - print(container_output, file=sys.stderr) - finally: try: container.remove() diff --git a/integration-tests/poetry.lock b/integration-tests/poetry.lock deleted file mode 100644 index 56b146bc3..000000000 --- a/integration-tests/poetry.lock +++ /dev/null @@ -1,1174 +0,0 @@ -# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. 
- -[[package]] -name = "aiohappyeyeballs" -version = "2.4.0" -description = "Happy Eyeballs for asyncio" -optional = false -python-versions = ">=3.8" -files = [ - {file = "aiohappyeyeballs-2.4.0-py3-none-any.whl", hash = "sha256:7ce92076e249169a13c2f49320d1967425eaf1f407522d707d59cac7628d62bd"}, - {file = "aiohappyeyeballs-2.4.0.tar.gz", hash = "sha256:55a1714f084e63d49639800f95716da97a1f173d46a16dfcfda0016abb93b6b2"}, -] - -[[package]] -name = "aiohttp" -version = "3.10.5" -description = "Async http client/server framework (asyncio)" -optional = false -python-versions = ">=3.8" -files = [ - {file = "aiohttp-3.10.5-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:18a01eba2574fb9edd5f6e5fb25f66e6ce061da5dab5db75e13fe1558142e0a3"}, - {file = "aiohttp-3.10.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:94fac7c6e77ccb1ca91e9eb4cb0ac0270b9fb9b289738654120ba8cebb1189c6"}, - {file = "aiohttp-3.10.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2f1f1c75c395991ce9c94d3e4aa96e5c59c8356a15b1c9231e783865e2772699"}, - {file = "aiohttp-3.10.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4f7acae3cf1a2a2361ec4c8e787eaaa86a94171d2417aae53c0cca6ca3118ff6"}, - {file = "aiohttp-3.10.5-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:94c4381ffba9cc508b37d2e536b418d5ea9cfdc2848b9a7fea6aebad4ec6aac1"}, - {file = "aiohttp-3.10.5-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c31ad0c0c507894e3eaa843415841995bf8de4d6b2d24c6e33099f4bc9fc0d4f"}, - {file = "aiohttp-3.10.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0912b8a8fadeb32ff67a3ed44249448c20148397c1ed905d5dac185b4ca547bb"}, - {file = "aiohttp-3.10.5-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0d93400c18596b7dc4794d48a63fb361b01a0d8eb39f28800dc900c8fbdaca91"}, - {file = "aiohttp-3.10.5-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:d00f3c5e0d764a5c9aa5a62d99728c56d455310bcc288a79cab10157b3af426f"}, - {file = "aiohttp-3.10.5-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:d742c36ed44f2798c8d3f4bc511f479b9ceef2b93f348671184139e7d708042c"}, - {file = "aiohttp-3.10.5-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:814375093edae5f1cb31e3407997cf3eacefb9010f96df10d64829362ae2df69"}, - {file = "aiohttp-3.10.5-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:8224f98be68a84b19f48e0bdc14224b5a71339aff3a27df69989fa47d01296f3"}, - {file = "aiohttp-3.10.5-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:d9a487ef090aea982d748b1b0d74fe7c3950b109df967630a20584f9a99c0683"}, - {file = "aiohttp-3.10.5-cp310-cp310-win32.whl", hash = "sha256:d9ef084e3dc690ad50137cc05831c52b6ca428096e6deb3c43e95827f531d5ef"}, - {file = "aiohttp-3.10.5-cp310-cp310-win_amd64.whl", hash = "sha256:66bf9234e08fe561dccd62083bf67400bdbf1c67ba9efdc3dac03650e97c6088"}, - {file = "aiohttp-3.10.5-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:8c6a4e5e40156d72a40241a25cc226051c0a8d816610097a8e8f517aeacd59a2"}, - {file = "aiohttp-3.10.5-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:2c634a3207a5445be65536d38c13791904fda0748b9eabf908d3fe86a52941cf"}, - {file = "aiohttp-3.10.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4aff049b5e629ef9b3e9e617fa6e2dfeda1bf87e01bcfecaf3949af9e210105e"}, - {file = "aiohttp-3.10.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1942244f00baaacaa8155eca94dbd9e8cc7017deb69b75ef67c78e89fdad3c77"}, - {file = 
"aiohttp-3.10.5-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e04a1f2a65ad2f93aa20f9ff9f1b672bf912413e5547f60749fa2ef8a644e061"}, - {file = "aiohttp-3.10.5-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7f2bfc0032a00405d4af2ba27f3c429e851d04fad1e5ceee4080a1c570476697"}, - {file = "aiohttp-3.10.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:424ae21498790e12eb759040bbb504e5e280cab64693d14775c54269fd1d2bb7"}, - {file = "aiohttp-3.10.5-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:975218eee0e6d24eb336d0328c768ebc5d617609affaca5dbbd6dd1984f16ed0"}, - {file = "aiohttp-3.10.5-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:4120d7fefa1e2d8fb6f650b11489710091788de554e2b6f8347c7a20ceb003f5"}, - {file = "aiohttp-3.10.5-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:b90078989ef3fc45cf9221d3859acd1108af7560c52397ff4ace8ad7052a132e"}, - {file = "aiohttp-3.10.5-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:ba5a8b74c2a8af7d862399cdedce1533642fa727def0b8c3e3e02fcb52dca1b1"}, - {file = "aiohttp-3.10.5-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:02594361128f780eecc2a29939d9dfc870e17b45178a867bf61a11b2a4367277"}, - {file = "aiohttp-3.10.5-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:8fb4fc029e135859f533025bc82047334e24b0d489e75513144f25408ecaf058"}, - {file = "aiohttp-3.10.5-cp311-cp311-win32.whl", hash = "sha256:e1ca1ef5ba129718a8fc827b0867f6aa4e893c56eb00003b7367f8a733a9b072"}, - {file = "aiohttp-3.10.5-cp311-cp311-win_amd64.whl", hash = "sha256:349ef8a73a7c5665cca65c88ab24abe75447e28aa3bc4c93ea5093474dfdf0ff"}, - {file = "aiohttp-3.10.5-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:305be5ff2081fa1d283a76113b8df7a14c10d75602a38d9f012935df20731487"}, - {file = "aiohttp-3.10.5-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:3a1c32a19ee6bbde02f1cb189e13a71b321256cc1d431196a9f824050b160d5a"}, - {file = "aiohttp-3.10.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:61645818edd40cc6f455b851277a21bf420ce347baa0b86eaa41d51ef58ba23d"}, - {file = "aiohttp-3.10.5-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6c225286f2b13bab5987425558baa5cbdb2bc925b2998038fa028245ef421e75"}, - {file = "aiohttp-3.10.5-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8ba01ebc6175e1e6b7275c907a3a36be48a2d487549b656aa90c8a910d9f3178"}, - {file = "aiohttp-3.10.5-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8eaf44ccbc4e35762683078b72bf293f476561d8b68ec8a64f98cf32811c323e"}, - {file = "aiohttp-3.10.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b1c43eb1ab7cbf411b8e387dc169acb31f0ca0d8c09ba63f9eac67829585b44f"}, - {file = "aiohttp-3.10.5-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:de7a5299827253023c55ea549444e058c0eb496931fa05d693b95140a947cb73"}, - {file = "aiohttp-3.10.5-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:4790f0e15f00058f7599dab2b206d3049d7ac464dc2e5eae0e93fa18aee9e7bf"}, - {file = "aiohttp-3.10.5-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:44b324a6b8376a23e6ba25d368726ee3bc281e6ab306db80b5819999c737d820"}, - {file = "aiohttp-3.10.5-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:0d277cfb304118079e7044aad0b76685d30ecb86f83a0711fc5fb257ffe832ca"}, - {file = "aiohttp-3.10.5-cp312-cp312-musllinux_1_2_s390x.whl", hash = 
"sha256:54d9ddea424cd19d3ff6128601a4a4d23d54a421f9b4c0fff740505813739a91"}, - {file = "aiohttp-3.10.5-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:4f1c9866ccf48a6df2b06823e6ae80573529f2af3a0992ec4fe75b1a510df8a6"}, - {file = "aiohttp-3.10.5-cp312-cp312-win32.whl", hash = "sha256:dc4826823121783dccc0871e3f405417ac116055bf184ac04c36f98b75aacd12"}, - {file = "aiohttp-3.10.5-cp312-cp312-win_amd64.whl", hash = "sha256:22c0a23a3b3138a6bf76fc553789cb1a703836da86b0f306b6f0dc1617398abc"}, - {file = "aiohttp-3.10.5-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:7f6b639c36734eaa80a6c152a238242bedcee9b953f23bb887e9102976343092"}, - {file = "aiohttp-3.10.5-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:f29930bc2921cef955ba39a3ff87d2c4398a0394ae217f41cb02d5c26c8b1b77"}, - {file = "aiohttp-3.10.5-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f489a2c9e6455d87eabf907ac0b7d230a9786be43fbe884ad184ddf9e9c1e385"}, - {file = "aiohttp-3.10.5-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:123dd5b16b75b2962d0fff566effb7a065e33cd4538c1692fb31c3bda2bfb972"}, - {file = "aiohttp-3.10.5-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b98e698dc34966e5976e10bbca6d26d6724e6bdea853c7c10162a3235aba6e16"}, - {file = "aiohttp-3.10.5-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c3b9162bab7e42f21243effc822652dc5bb5e8ff42a4eb62fe7782bcbcdfacf6"}, - {file = "aiohttp-3.10.5-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1923a5c44061bffd5eebeef58cecf68096e35003907d8201a4d0d6f6e387ccaa"}, - {file = "aiohttp-3.10.5-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d55f011da0a843c3d3df2c2cf4e537b8070a419f891c930245f05d329c4b0689"}, - {file = "aiohttp-3.10.5-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:afe16a84498441d05e9189a15900640a2d2b5e76cf4efe8cbb088ab4f112ee57"}, - {file = "aiohttp-3.10.5-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:f8112fb501b1e0567a1251a2fd0747baae60a4ab325a871e975b7bb67e59221f"}, - {file = "aiohttp-3.10.5-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:1e72589da4c90337837fdfe2026ae1952c0f4a6e793adbbfbdd40efed7c63599"}, - {file = "aiohttp-3.10.5-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:4d46c7b4173415d8e583045fbc4daa48b40e31b19ce595b8d92cf639396c15d5"}, - {file = "aiohttp-3.10.5-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:33e6bc4bab477c772a541f76cd91e11ccb6d2efa2b8d7d7883591dfb523e5987"}, - {file = "aiohttp-3.10.5-cp313-cp313-win32.whl", hash = "sha256:c58c6837a2c2a7cf3133983e64173aec11f9c2cd8e87ec2fdc16ce727bcf1a04"}, - {file = "aiohttp-3.10.5-cp313-cp313-win_amd64.whl", hash = "sha256:38172a70005252b6893088c0f5e8a47d173df7cc2b2bd88650957eb84fcf5022"}, - {file = "aiohttp-3.10.5-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:f6f18898ace4bcd2d41a122916475344a87f1dfdec626ecde9ee802a711bc569"}, - {file = "aiohttp-3.10.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:5ede29d91a40ba22ac1b922ef510aab871652f6c88ef60b9dcdf773c6d32ad7a"}, - {file = "aiohttp-3.10.5-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:673f988370f5954df96cc31fd99c7312a3af0a97f09e407399f61583f30da9bc"}, - {file = "aiohttp-3.10.5-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:58718e181c56a3c02d25b09d4115eb02aafe1a732ce5714ab70326d9776457c3"}, - {file = "aiohttp-3.10.5-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:4b38b1570242fbab8d86a84128fb5b5234a2f70c2e32f3070143a6d94bc854cf"}, - {file = "aiohttp-3.10.5-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:074d1bff0163e107e97bd48cad9f928fa5a3eb4b9d33366137ffce08a63e37fe"}, - {file = "aiohttp-3.10.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fd31f176429cecbc1ba499d4aba31aaccfea488f418d60376b911269d3b883c5"}, - {file = "aiohttp-3.10.5-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7384d0b87d4635ec38db9263e6a3f1eb609e2e06087f0aa7f63b76833737b471"}, - {file = "aiohttp-3.10.5-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:8989f46f3d7ef79585e98fa991e6ded55d2f48ae56d2c9fa5e491a6e4effb589"}, - {file = "aiohttp-3.10.5-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:c83f7a107abb89a227d6c454c613e7606c12a42b9a4ca9c5d7dad25d47c776ae"}, - {file = "aiohttp-3.10.5-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:cde98f323d6bf161041e7627a5fd763f9fd829bcfcd089804a5fdce7bb6e1b7d"}, - {file = "aiohttp-3.10.5-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:676f94c5480d8eefd97c0c7e3953315e4d8c2b71f3b49539beb2aa676c58272f"}, - {file = "aiohttp-3.10.5-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:2d21ac12dc943c68135ff858c3a989f2194a709e6e10b4c8977d7fcd67dfd511"}, - {file = "aiohttp-3.10.5-cp38-cp38-win32.whl", hash = "sha256:17e997105bd1a260850272bfb50e2a328e029c941c2708170d9d978d5a30ad9a"}, - {file = "aiohttp-3.10.5-cp38-cp38-win_amd64.whl", hash = "sha256:1c19de68896747a2aa6257ae4cf6ef59d73917a36a35ee9d0a6f48cff0f94db8"}, - {file = "aiohttp-3.10.5-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:7e2fe37ac654032db1f3499fe56e77190282534810e2a8e833141a021faaab0e"}, - {file = "aiohttp-3.10.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:f5bf3ead3cb66ab990ee2561373b009db5bc0e857549b6c9ba84b20bc462e172"}, - {file = "aiohttp-3.10.5-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:1b2c16a919d936ca87a3c5f0e43af12a89a3ce7ccbce59a2d6784caba945b68b"}, - {file = "aiohttp-3.10.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ad146dae5977c4dd435eb31373b3fe9b0b1bf26858c6fc452bf6af394067e10b"}, - {file = "aiohttp-3.10.5-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8c5c6fa16412b35999320f5c9690c0f554392dc222c04e559217e0f9ae244b92"}, - {file = "aiohttp-3.10.5-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:95c4dc6f61d610bc0ee1edc6f29d993f10febfe5b76bb470b486d90bbece6b22"}, - {file = "aiohttp-3.10.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:da452c2c322e9ce0cfef392e469a26d63d42860f829026a63374fde6b5c5876f"}, - {file = "aiohttp-3.10.5-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:898715cf566ec2869d5cb4d5fb4be408964704c46c96b4be267442d265390f32"}, - {file = "aiohttp-3.10.5-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:391cc3a9c1527e424c6865e087897e766a917f15dddb360174a70467572ac6ce"}, - {file = "aiohttp-3.10.5-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:380f926b51b92d02a34119d072f178d80bbda334d1a7e10fa22d467a66e494db"}, - {file = "aiohttp-3.10.5-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:ce91db90dbf37bb6fa0997f26574107e1b9d5ff939315247b7e615baa8ec313b"}, - {file = "aiohttp-3.10.5-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:9093a81e18c45227eebe4c16124ebf3e0d893830c6aca7cc310bfca8fe59d857"}, - {file = 
"aiohttp-3.10.5-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:ee40b40aa753d844162dcc80d0fe256b87cba48ca0054f64e68000453caead11"}, - {file = "aiohttp-3.10.5-cp39-cp39-win32.whl", hash = "sha256:03f2645adbe17f274444953bdea69f8327e9d278d961d85657cb0d06864814c1"}, - {file = "aiohttp-3.10.5-cp39-cp39-win_amd64.whl", hash = "sha256:d17920f18e6ee090bdd3d0bfffd769d9f2cb4c8ffde3eb203777a3895c128862"}, - {file = "aiohttp-3.10.5.tar.gz", hash = "sha256:f071854b47d39591ce9a17981c46790acb30518e2f83dfca8db2dfa091178691"}, -] - -[package.dependencies] -aiohappyeyeballs = ">=2.3.0" -aiosignal = ">=1.1.2" -async-timeout = {version = ">=4.0,<5.0", markers = "python_version < \"3.11\""} -attrs = ">=17.3.0" -frozenlist = ">=1.1.1" -multidict = ">=4.5,<7.0" -yarl = ">=1.0,<2.0" - -[package.extras] -speedups = ["Brotli", "aiodns (>=3.2.0)", "brotlicffi"] - -[[package]] -name = "aiosignal" -version = "1.3.1" -description = "aiosignal: a list of registered asynchronous callbacks" -optional = false -python-versions = ">=3.7" -files = [ - {file = "aiosignal-1.3.1-py3-none-any.whl", hash = "sha256:f8376fb07dd1e86a584e4fcdec80b36b7f81aac666ebc724e2c090300dd83b17"}, - {file = "aiosignal-1.3.1.tar.gz", hash = "sha256:54cd96e15e1649b75d6c87526a6ff0b6c1b0dd3459f43d9ca11d48c339b68cfc"}, -] - -[package.dependencies] -frozenlist = ">=1.1.0" - -[[package]] -name = "annotated-types" -version = "0.7.0" -description = "Reusable constraint types to use with typing.Annotated" -optional = false -python-versions = ">=3.8" -files = [ - {file = "annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53"}, - {file = "annotated_types-0.7.0.tar.gz", hash = "sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89"}, -] - -[[package]] -name = "async-timeout" -version = "4.0.3" -description = "Timeout context manager for asyncio programs" -optional = false -python-versions = ">=3.7" -files = [ - {file = "async-timeout-4.0.3.tar.gz", hash = "sha256:4640d96be84d82d02ed59ea2b7105a0f7b33abe8703703cd0ab0bf87c427522f"}, - {file = "async_timeout-4.0.3-py3-none-any.whl", hash = "sha256:7405140ff1230c310e51dc27b3145b9092d659ce68ff733fb0cefe3ee42be028"}, -] - -[[package]] -name = "attrs" -version = "24.2.0" -description = "Classes Without Boilerplate" -optional = false -python-versions = ">=3.7" -files = [ - {file = "attrs-24.2.0-py3-none-any.whl", hash = "sha256:81921eb96de3191c8258c199618104dd27ac608d9366f5e35d011eae1867ede2"}, - {file = "attrs-24.2.0.tar.gz", hash = "sha256:5cfb1b9148b5b086569baec03f20d7b6bf3bcacc9a42bebf87ffaaca362f6346"}, -] - -[package.extras] -benchmark = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-codspeed", "pytest-mypy-plugins", "pytest-xdist[psutil]"] -cov = ["cloudpickle", "coverage[toml] (>=5.3)", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] -dev = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pre-commit", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] -docs = ["cogapp", "furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib-towncrier", "towncrier (<24.7)"] -tests = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] -tests-mypy = ["mypy (>=1.11.1)", "pytest-mypy-plugins"] - -[[package]] -name = "certifi" -version = "2024.8.30" -description = "Python package for providing Mozilla's CA Bundle." 
-optional = false -python-versions = ">=3.6" -files = [ - {file = "certifi-2024.8.30-py3-none-any.whl", hash = "sha256:922820b53db7a7257ffbda3f597266d435245903d80737e34f8a45ff3e3230d8"}, - {file = "certifi-2024.8.30.tar.gz", hash = "sha256:bec941d2aa8195e248a60b31ff9f0558284cf01a52591ceda73ea9afffd69fd9"}, -] - -[[package]] -name = "charset-normalizer" -version = "3.3.2" -description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." -optional = false -python-versions = ">=3.7.0" -files = [ - {file = "charset-normalizer-3.3.2.tar.gz", hash = "sha256:f30c3cb33b24454a82faecaf01b19c18562b1e89558fb6c56de4d9118a032fd5"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:25baf083bf6f6b341f4121c2f3c548875ee6f5339300e08be3f2b2ba1721cdd3"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:06435b539f889b1f6f4ac1758871aae42dc3a8c0e24ac9e60c2384973ad73027"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9063e24fdb1e498ab71cb7419e24622516c4a04476b17a2dab57e8baa30d6e03"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6897af51655e3691ff853668779c7bad41579facacf5fd7253b0133308cf000d"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1d3193f4a680c64b4b6a9115943538edb896edc190f0b222e73761716519268e"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cd70574b12bb8a4d2aaa0094515df2463cb429d8536cfb6c7ce983246983e5a6"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8465322196c8b4d7ab6d1e049e4c5cb460d0394da4a27d23cc242fbf0034b6b5"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a9a8e9031d613fd2009c182b69c7b2c1ef8239a0efb1df3f7c8da66d5dd3d537"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:beb58fe5cdb101e3a055192ac291b7a21e3b7ef4f67fa1d74e331a7f2124341c"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:e06ed3eb3218bc64786f7db41917d4e686cc4856944f53d5bdf83a6884432e12"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:2e81c7b9c8979ce92ed306c249d46894776a909505d8f5a4ba55b14206e3222f"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:572c3763a264ba47b3cf708a44ce965d98555f618ca42c926a9c1616d8f34269"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:fd1abc0d89e30cc4e02e4064dc67fcc51bd941eb395c502aac3ec19fab46b519"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-win32.whl", hash = "sha256:3d47fa203a7bd9c5b6cee4736ee84ca03b8ef23193c0d1ca99b5089f72645c73"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-win_amd64.whl", hash = "sha256:10955842570876604d404661fbccbc9c7e684caf432c09c715ec38fbae45ae09"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:802fe99cca7457642125a8a88a084cef28ff0cf9407060f7b93dca5aa25480db"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:573f6eac48f4769d667c4442081b1794f52919e7edada77495aaed9236d13a96"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-macosx_11_0_arm64.whl", 
hash = "sha256:549a3a73da901d5bc3ce8d24e0600d1fa85524c10287f6004fbab87672bf3e1e"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f27273b60488abe721a075bcca6d7f3964f9f6f067c8c4c605743023d7d3944f"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1ceae2f17a9c33cb48e3263960dc5fc8005351ee19db217e9b1bb15d28c02574"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:65f6f63034100ead094b8744b3b97965785388f308a64cf8d7c34f2f2e5be0c4"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:753f10e867343b4511128c6ed8c82f7bec3bd026875576dfd88483c5c73b2fd8"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4a78b2b446bd7c934f5dcedc588903fb2f5eec172f3d29e52a9096a43722adfc"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:e537484df0d8f426ce2afb2d0f8e1c3d0b114b83f8850e5f2fbea0e797bd82ae"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:eb6904c354526e758fda7167b33005998fb68c46fbc10e013ca97f21ca5c8887"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:deb6be0ac38ece9ba87dea880e438f25ca3eddfac8b002a2ec3d9183a454e8ae"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:4ab2fe47fae9e0f9dee8c04187ce5d09f48eabe611be8259444906793ab7cbce"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:80402cd6ee291dcb72644d6eac93785fe2c8b9cb30893c1af5b8fdd753b9d40f"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-win32.whl", hash = "sha256:7cd13a2e3ddeed6913a65e66e94b51d80a041145a026c27e6bb76c31a853c6ab"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-win_amd64.whl", hash = "sha256:663946639d296df6a2bb2aa51b60a2454ca1cb29835324c640dafb5ff2131a77"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:0b2b64d2bb6d3fb9112bafa732def486049e63de9618b5843bcdd081d8144cd8"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:ddbb2551d7e0102e7252db79ba445cdab71b26640817ab1e3e3648dad515003b"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:55086ee1064215781fff39a1af09518bc9255b50d6333f2e4c74ca09fac6a8f6"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8f4a014bc36d3c57402e2977dada34f9c12300af536839dc38c0beab8878f38a"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a10af20b82360ab00827f916a6058451b723b4e65030c5a18577c8b2de5b3389"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8d756e44e94489e49571086ef83b2bb8ce311e730092d2c34ca8f7d925cb20aa"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:90d558489962fd4918143277a773316e56c72da56ec7aa3dc3dbbe20fdfed15b"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6ac7ffc7ad6d040517be39eb591cac5ff87416c2537df6ba3cba3bae290c0fed"}, - {file = 
"charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:7ed9e526742851e8d5cc9e6cf41427dfc6068d4f5a3bb03659444b4cabf6bc26"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:8bdb58ff7ba23002a4c5808d608e4e6c687175724f54a5dade5fa8c67b604e4d"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:6b3251890fff30ee142c44144871185dbe13b11bab478a88887a639655be1068"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:b4a23f61ce87adf89be746c8a8974fe1c823c891d8f86eb218bb957c924bb143"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:efcb3f6676480691518c177e3b465bcddf57cea040302f9f4e6e191af91174d4"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-win32.whl", hash = "sha256:d965bba47ddeec8cd560687584e88cf699fd28f192ceb452d1d7ee807c5597b7"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-win_amd64.whl", hash = "sha256:96b02a3dc4381e5494fad39be677abcb5e6634bf7b4fa83a6dd3112607547001"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:95f2a5796329323b8f0512e09dbb7a1860c46a39da62ecb2324f116fa8fdc85c"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c002b4ffc0be611f0d9da932eb0f704fe2602a9a949d1f738e4c34c75b0863d5"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a981a536974bbc7a512cf44ed14938cf01030a99e9b3a06dd59578882f06f985"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3287761bc4ee9e33561a7e058c72ac0938c4f57fe49a09eae428fd88aafe7bb6"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:42cb296636fcc8b0644486d15c12376cb9fa75443e00fb25de0b8602e64c1714"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0a55554a2fa0d408816b3b5cedf0045f4b8e1a6065aec45849de2d6f3f8e9786"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:c083af607d2515612056a31f0a8d9e0fcb5876b7bfc0abad3ecd275bc4ebc2d5"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:87d1351268731db79e0f8e745d92493ee2841c974128ef629dc518b937d9194c"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:bd8f7df7d12c2db9fab40bdd87a7c09b1530128315d047a086fa3ae3435cb3a8"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:c180f51afb394e165eafe4ac2936a14bee3eb10debc9d9e4db8958fe36afe711"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:8c622a5fe39a48f78944a87d4fb8a53ee07344641b0562c540d840748571b811"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-win32.whl", hash = "sha256:db364eca23f876da6f9e16c9da0df51aa4f104a972735574842618b8c6d999d4"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-win_amd64.whl", hash = "sha256:86216b5cee4b06df986d214f664305142d9c76df9b6512be2738aa72a2048f99"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:6463effa3186ea09411d50efc7d85360b38d5f09b870c48e4600f63af490e56a"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:6c4caeef8fa63d06bd437cd4bdcf3ffefe6738fb1b25951440d80dc7df8c03ac"}, - {file = 
"charset_normalizer-3.3.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:37e55c8e51c236f95b033f6fb391d7d7970ba5fe7ff453dad675e88cf303377a"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fb69256e180cb6c8a894fee62b3afebae785babc1ee98b81cdf68bbca1987f33"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ae5f4161f18c61806f411a13b0310bea87f987c7d2ecdbdaad0e94eb2e404238"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b2b0a0c0517616b6869869f8c581d4eb2dd83a4d79e0ebcb7d373ef9956aeb0a"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:45485e01ff4d3630ec0d9617310448a8702f70e9c01906b0d0118bdf9d124cf2"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:eb00ed941194665c332bf8e078baf037d6c35d7c4f3102ea2d4f16ca94a26dc8"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:2127566c664442652f024c837091890cb1942c30937add288223dc895793f898"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:a50aebfa173e157099939b17f18600f72f84eed3049e743b68ad15bd69b6bf99"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:4d0d1650369165a14e14e1e47b372cfcb31d6ab44e6e33cb2d4e57265290044d"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:923c0c831b7cfcb071580d3f46c4baf50f174be571576556269530f4bbd79d04"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:06a81e93cd441c56a9b65d8e1d043daeb97a3d0856d177d5c90ba85acb3db087"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-win32.whl", hash = "sha256:6ef1d82a3af9d3eecdba2321dc1b3c238245d890843e040e41e470ffa64c3e25"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-win_amd64.whl", hash = "sha256:eb8821e09e916165e160797a6c17edda0679379a4be5c716c260e836e122f54b"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:c235ebd9baae02f1b77bcea61bce332cb4331dc3617d254df3323aa01ab47bd4"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:5b4c145409bef602a690e7cfad0a15a55c13320ff7a3ad7ca59c13bb8ba4d45d"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:68d1f8a9e9e37c1223b656399be5d6b448dea850bed7d0f87a8311f1ff3dabb0"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:22afcb9f253dac0696b5a4be4a1c0f8762f8239e21b99680099abd9b2b1b2269"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e27ad930a842b4c5eb8ac0016b0a54f5aebbe679340c26101df33424142c143c"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1f79682fbe303db92bc2b1136016a38a42e835d932bab5b3b1bfcfbf0640e519"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b261ccdec7821281dade748d088bb6e9b69e6d15b30652b74cbbac25e280b796"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:122c7fa62b130ed55f8f285bfd56d5f4b4a5b503609d181f9ad85e55c89f4185"}, - {file = 
"charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:d0eccceffcb53201b5bfebb52600a5fb483a20b61da9dbc885f8b103cbe7598c"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:9f96df6923e21816da7e0ad3fd47dd8f94b2a5ce594e00677c0013018b813458"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:7f04c839ed0b6b98b1a7501a002144b76c18fb1c1850c8b98d458ac269e26ed2"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:34d1c8da1e78d2e001f363791c98a272bb734000fcef47a491c1e3b0505657a8"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:ff8fa367d09b717b2a17a052544193ad76cd49979c805768879cb63d9ca50561"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-win32.whl", hash = "sha256:aed38f6e4fb3f5d6bf81bfa990a07806be9d83cf7bacef998ab1a9bd660a581f"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-win_amd64.whl", hash = "sha256:b01b88d45a6fcb69667cd6d2f7a9aeb4bf53760d7fc536bf679ec94fe9f3ff3d"}, - {file = "charset_normalizer-3.3.2-py3-none-any.whl", hash = "sha256:3e4d1f6587322d2788836a99c69062fbb091331ec940e02d12d179c1d53e25fc"}, -] - -[[package]] -name = "colorama" -version = "0.4.6" -description = "Cross-platform colored terminal text." -optional = false -python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" -files = [ - {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, - {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, -] - -[[package]] -name = "docker" -version = "7.1.0" -description = "A Python library for the Docker Engine API." -optional = false -python-versions = ">=3.8" -files = [ - {file = "docker-7.1.0-py3-none-any.whl", hash = "sha256:c96b93b7f0a746f9e77d325bcfb87422a3d8bd4f03136ae8a85b37f1898d5fc0"}, - {file = "docker-7.1.0.tar.gz", hash = "sha256:ad8c70e6e3f8926cb8a92619b832b4ea5299e2831c14284663184e200546fa6c"}, -] - -[package.dependencies] -pywin32 = {version = ">=304", markers = "sys_platform == \"win32\""} -requests = ">=2.26.0" -urllib3 = ">=1.26.0" - -[package.extras] -dev = ["coverage (==7.2.7)", "pytest (==7.4.2)", "pytest-cov (==4.1.0)", "pytest-timeout (==2.1.0)", "ruff (==0.1.8)"] -docs = ["myst-parser (==0.18.0)", "sphinx (==5.1.1)"] -ssh = ["paramiko (>=2.4.3)"] -websockets = ["websocket-client (>=1.3.0)"] - -[[package]] -name = "exceptiongroup" -version = "1.2.2" -description = "Backport of PEP 654 (exception groups)" -optional = false -python-versions = ">=3.7" -files = [ - {file = "exceptiongroup-1.2.2-py3-none-any.whl", hash = "sha256:3111b9d131c238bec2f8f516e123e14ba243563fb135d3fe885990585aa7795b"}, - {file = "exceptiongroup-1.2.2.tar.gz", hash = "sha256:47c2edf7c6738fafb49fd34290706d1a1a2f4d1c6df275526b62cbb4aa5393cc"}, -] - -[package.extras] -test = ["pytest (>=6)"] - -[[package]] -name = "filelock" -version = "3.16.0" -description = "A platform independent file lock." 
-optional = false -python-versions = ">=3.8" -files = [ - {file = "filelock-3.16.0-py3-none-any.whl", hash = "sha256:f6ed4c963184f4c84dd5557ce8fece759a3724b37b80c6c4f20a2f63a4dc6609"}, - {file = "filelock-3.16.0.tar.gz", hash = "sha256:81de9eb8453c769b63369f87f11131a7ab04e367f8d97ad39dc230daa07e3bec"}, -] - -[package.extras] -docs = ["furo (>=2024.8.6)", "sphinx (>=8.0.2)", "sphinx-autodoc-typehints (>=2.4)"] -testing = ["covdefaults (>=2.3)", "coverage (>=7.6.1)", "diff-cover (>=9.1.1)", "pytest (>=8.3.2)", "pytest-asyncio (>=0.24)", "pytest-cov (>=5)", "pytest-mock (>=3.14)", "pytest-timeout (>=2.3.1)", "virtualenv (>=20.26.3)"] -typing = ["typing-extensions (>=4.12.2)"] - -[[package]] -name = "frozenlist" -version = "1.4.1" -description = "A list-like structure which implements collections.abc.MutableSequence" -optional = false -python-versions = ">=3.8" -files = [ - {file = "frozenlist-1.4.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:f9aa1878d1083b276b0196f2dfbe00c9b7e752475ed3b682025ff20c1c1f51ac"}, - {file = "frozenlist-1.4.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:29acab3f66f0f24674b7dc4736477bcd4bc3ad4b896f5f45379a67bce8b96868"}, - {file = "frozenlist-1.4.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:74fb4bee6880b529a0c6560885fce4dc95936920f9f20f53d99a213f7bf66776"}, - {file = "frozenlist-1.4.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:590344787a90ae57d62511dd7c736ed56b428f04cd8c161fcc5e7232c130c69a"}, - {file = "frozenlist-1.4.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:068b63f23b17df8569b7fdca5517edef76171cf3897eb68beb01341131fbd2ad"}, - {file = "frozenlist-1.4.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5c849d495bf5154cd8da18a9eb15db127d4dba2968d88831aff6f0331ea9bd4c"}, - {file = "frozenlist-1.4.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9750cc7fe1ae3b1611bb8cfc3f9ec11d532244235d75901fb6b8e42ce9229dfe"}, - {file = "frozenlist-1.4.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9b2de4cf0cdd5bd2dee4c4f63a653c61d2408055ab77b151c1957f221cabf2a"}, - {file = "frozenlist-1.4.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:0633c8d5337cb5c77acbccc6357ac49a1770b8c487e5b3505c57b949b4b82e98"}, - {file = "frozenlist-1.4.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:27657df69e8801be6c3638054e202a135c7f299267f1a55ed3a598934f6c0d75"}, - {file = "frozenlist-1.4.1-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:f9a3ea26252bd92f570600098783d1371354d89d5f6b7dfd87359d669f2109b5"}, - {file = "frozenlist-1.4.1-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:4f57dab5fe3407b6c0c1cc907ac98e8a189f9e418f3b6e54d65a718aaafe3950"}, - {file = "frozenlist-1.4.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:e02a0e11cf6597299b9f3bbd3f93d79217cb90cfd1411aec33848b13f5c656cc"}, - {file = "frozenlist-1.4.1-cp310-cp310-win32.whl", hash = "sha256:a828c57f00f729620a442881cc60e57cfcec6842ba38e1b19fd3e47ac0ff8dc1"}, - {file = "frozenlist-1.4.1-cp310-cp310-win_amd64.whl", hash = "sha256:f56e2333dda1fe0f909e7cc59f021eba0d2307bc6f012a1ccf2beca6ba362439"}, - {file = "frozenlist-1.4.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:a0cb6f11204443f27a1628b0e460f37fb30f624be6051d490fa7d7e26d4af3d0"}, - {file = "frozenlist-1.4.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = 
"sha256:b46c8ae3a8f1f41a0d2ef350c0b6e65822d80772fe46b653ab6b6274f61d4a49"}, - {file = "frozenlist-1.4.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:fde5bd59ab5357e3853313127f4d3565fc7dad314a74d7b5d43c22c6a5ed2ced"}, - {file = "frozenlist-1.4.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:722e1124aec435320ae01ee3ac7bec11a5d47f25d0ed6328f2273d287bc3abb0"}, - {file = "frozenlist-1.4.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2471c201b70d58a0f0c1f91261542a03d9a5e088ed3dc6c160d614c01649c106"}, - {file = "frozenlist-1.4.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c757a9dd70d72b076d6f68efdbb9bc943665ae954dad2801b874c8c69e185068"}, - {file = "frozenlist-1.4.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f146e0911cb2f1da549fc58fc7bcd2b836a44b79ef871980d605ec392ff6b0d2"}, - {file = "frozenlist-1.4.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4f9c515e7914626b2a2e1e311794b4c35720a0be87af52b79ff8e1429fc25f19"}, - {file = "frozenlist-1.4.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:c302220494f5c1ebeb0912ea782bcd5e2f8308037b3c7553fad0e48ebad6ad82"}, - {file = "frozenlist-1.4.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:442acde1e068288a4ba7acfe05f5f343e19fac87bfc96d89eb886b0363e977ec"}, - {file = "frozenlist-1.4.1-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:1b280e6507ea8a4fa0c0a7150b4e526a8d113989e28eaaef946cc77ffd7efc0a"}, - {file = "frozenlist-1.4.1-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:fe1a06da377e3a1062ae5fe0926e12b84eceb8a50b350ddca72dc85015873f74"}, - {file = "frozenlist-1.4.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:db9e724bebd621d9beca794f2a4ff1d26eed5965b004a97f1f1685a173b869c2"}, - {file = "frozenlist-1.4.1-cp311-cp311-win32.whl", hash = "sha256:e774d53b1a477a67838a904131c4b0eef6b3d8a651f8b138b04f748fccfefe17"}, - {file = "frozenlist-1.4.1-cp311-cp311-win_amd64.whl", hash = "sha256:fb3c2db03683b5767dedb5769b8a40ebb47d6f7f45b1b3e3b4b51ec8ad9d9825"}, - {file = "frozenlist-1.4.1-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:1979bc0aeb89b33b588c51c54ab0161791149f2461ea7c7c946d95d5f93b56ae"}, - {file = "frozenlist-1.4.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:cc7b01b3754ea68a62bd77ce6020afaffb44a590c2289089289363472d13aedb"}, - {file = "frozenlist-1.4.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:c9c92be9fd329ac801cc420e08452b70e7aeab94ea4233a4804f0915c14eba9b"}, - {file = "frozenlist-1.4.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5c3894db91f5a489fc8fa6a9991820f368f0b3cbdb9cd8849547ccfab3392d86"}, - {file = "frozenlist-1.4.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ba60bb19387e13597fb059f32cd4d59445d7b18b69a745b8f8e5db0346f33480"}, - {file = "frozenlist-1.4.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8aefbba5f69d42246543407ed2461db31006b0f76c4e32dfd6f42215a2c41d09"}, - {file = "frozenlist-1.4.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:780d3a35680ced9ce682fbcf4cb9c2bad3136eeff760ab33707b71db84664e3a"}, - {file = "frozenlist-1.4.1-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:9acbb16f06fe7f52f441bb6f413ebae6c37baa6ef9edd49cdd567216da8600cd"}, - {file = "frozenlist-1.4.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:23b701e65c7b36e4bf15546a89279bd4d8675faabc287d06bbcfac7d3c33e1e6"}, - {file = "frozenlist-1.4.1-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:3e0153a805a98f5ada7e09826255ba99fb4f7524bb81bf6b47fb702666484ae1"}, - {file = "frozenlist-1.4.1-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:dd9b1baec094d91bf36ec729445f7769d0d0cf6b64d04d86e45baf89e2b9059b"}, - {file = "frozenlist-1.4.1-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:1a4471094e146b6790f61b98616ab8e44f72661879cc63fa1049d13ef711e71e"}, - {file = "frozenlist-1.4.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:5667ed53d68d91920defdf4035d1cdaa3c3121dc0b113255124bcfada1cfa1b8"}, - {file = "frozenlist-1.4.1-cp312-cp312-win32.whl", hash = "sha256:beee944ae828747fd7cb216a70f120767fc9f4f00bacae8543c14a6831673f89"}, - {file = "frozenlist-1.4.1-cp312-cp312-win_amd64.whl", hash = "sha256:64536573d0a2cb6e625cf309984e2d873979709f2cf22839bf2d61790b448ad5"}, - {file = "frozenlist-1.4.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:20b51fa3f588ff2fe658663db52a41a4f7aa6c04f6201449c6c7c476bd255c0d"}, - {file = "frozenlist-1.4.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:410478a0c562d1a5bcc2f7ea448359fcb050ed48b3c6f6f4f18c313a9bdb1826"}, - {file = "frozenlist-1.4.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:c6321c9efe29975232da3bd0af0ad216800a47e93d763ce64f291917a381b8eb"}, - {file = "frozenlist-1.4.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:48f6a4533887e189dae092f1cf981f2e3885175f7a0f33c91fb5b7b682b6bab6"}, - {file = "frozenlist-1.4.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6eb73fa5426ea69ee0e012fb59cdc76a15b1283d6e32e4f8dc4482ec67d1194d"}, - {file = "frozenlist-1.4.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fbeb989b5cc29e8daf7f976b421c220f1b8c731cbf22b9130d8815418ea45887"}, - {file = "frozenlist-1.4.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:32453c1de775c889eb4e22f1197fe3bdfe457d16476ea407472b9442e6295f7a"}, - {file = "frozenlist-1.4.1-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:693945278a31f2086d9bf3df0fe8254bbeaef1fe71e1351c3bd730aa7d31c41b"}, - {file = "frozenlist-1.4.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:1d0ce09d36d53bbbe566fe296965b23b961764c0bcf3ce2fa45f463745c04701"}, - {file = "frozenlist-1.4.1-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:3a670dc61eb0d0eb7080890c13de3066790f9049b47b0de04007090807c776b0"}, - {file = "frozenlist-1.4.1-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:dca69045298ce5c11fd539682cff879cc1e664c245d1c64da929813e54241d11"}, - {file = "frozenlist-1.4.1-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:a06339f38e9ed3a64e4c4e43aec7f59084033647f908e4259d279a52d3757d09"}, - {file = "frozenlist-1.4.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:b7f2f9f912dca3934c1baec2e4585a674ef16fe00218d833856408c48d5beee7"}, - {file = "frozenlist-1.4.1-cp38-cp38-win32.whl", hash = "sha256:e7004be74cbb7d9f34553a5ce5fb08be14fb33bc86f332fb71cbe5216362a497"}, - {file = "frozenlist-1.4.1-cp38-cp38-win_amd64.whl", hash = "sha256:5a7d70357e7cee13f470c7883a063aae5fe209a493c57d86eb7f5a6f910fae09"}, - {file = "frozenlist-1.4.1-cp39-cp39-macosx_10_9_universal2.whl", hash = 
"sha256:bfa4a17e17ce9abf47a74ae02f32d014c5e9404b6d9ac7f729e01562bbee601e"}, - {file = "frozenlist-1.4.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b7e3ed87d4138356775346e6845cccbe66cd9e207f3cd11d2f0b9fd13681359d"}, - {file = "frozenlist-1.4.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c99169d4ff810155ca50b4da3b075cbde79752443117d89429595c2e8e37fed8"}, - {file = "frozenlist-1.4.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:edb678da49d9f72c9f6c609fbe41a5dfb9a9282f9e6a2253d5a91e0fc382d7c0"}, - {file = "frozenlist-1.4.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6db4667b187a6742b33afbbaf05a7bc551ffcf1ced0000a571aedbb4aa42fc7b"}, - {file = "frozenlist-1.4.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:55fdc093b5a3cb41d420884cdaf37a1e74c3c37a31f46e66286d9145d2063bd0"}, - {file = "frozenlist-1.4.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:82e8211d69a4f4bc360ea22cd6555f8e61a1bd211d1d5d39d3d228b48c83a897"}, - {file = "frozenlist-1.4.1-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:89aa2c2eeb20957be2d950b85974b30a01a762f3308cd02bb15e1ad632e22dc7"}, - {file = "frozenlist-1.4.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:9d3e0c25a2350080e9319724dede4f31f43a6c9779be48021a7f4ebde8b2d742"}, - {file = "frozenlist-1.4.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:7268252af60904bf52c26173cbadc3a071cece75f873705419c8681f24d3edea"}, - {file = "frozenlist-1.4.1-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:0c250a29735d4f15321007fb02865f0e6b6a41a6b88f1f523ca1596ab5f50bd5"}, - {file = "frozenlist-1.4.1-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:96ec70beabbd3b10e8bfe52616a13561e58fe84c0101dd031dc78f250d5128b9"}, - {file = "frozenlist-1.4.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:23b2d7679b73fe0e5a4560b672a39f98dfc6f60df63823b0a9970525325b95f6"}, - {file = "frozenlist-1.4.1-cp39-cp39-win32.whl", hash = "sha256:a7496bfe1da7fb1a4e1cc23bb67c58fab69311cc7d32b5a99c2007b4b2a0e932"}, - {file = "frozenlist-1.4.1-cp39-cp39-win_amd64.whl", hash = "sha256:e6a20a581f9ce92d389a8c7d7c3dd47c81fd5d6e655c8dddf341e14aa48659d0"}, - {file = "frozenlist-1.4.1-py3-none-any.whl", hash = "sha256:04ced3e6a46b4cfffe20f9ae482818e34eba9b5fb0ce4056e4cc9b6e212d09b7"}, - {file = "frozenlist-1.4.1.tar.gz", hash = "sha256:c037a86e8513059a2613aaba4d817bb90b9d9b6b69aace3ce9c877e8c8ed402b"}, -] - -[[package]] -name = "fsspec" -version = "2024.9.0" -description = "File-system specification" -optional = false -python-versions = ">=3.8" -files = [ - {file = "fsspec-2024.9.0-py3-none-any.whl", hash = "sha256:a0947d552d8a6efa72cc2c730b12c41d043509156966cca4fb157b0f2a0c574b"}, - {file = "fsspec-2024.9.0.tar.gz", hash = "sha256:4b0afb90c2f21832df142f292649035d80b421f60a9e1c027802e5a0da2b04e8"}, -] - -[package.extras] -abfs = ["adlfs"] -adl = ["adlfs"] -arrow = ["pyarrow (>=1)"] -dask = ["dask", "distributed"] -dev = ["pre-commit", "ruff"] -doc = ["numpydoc", "sphinx", "sphinx-design", "sphinx-rtd-theme", "yarl"] -dropbox = ["dropbox", "dropboxdrivefs", "requests"] -full = ["adlfs", "aiohttp (!=4.0.0a0,!=4.0.0a1)", "dask", "distributed", "dropbox", "dropboxdrivefs", "fusepy", "gcsfs", "libarchive-c", "ocifs", "panel", "paramiko", "pyarrow (>=1)", "pygit2", "requests", "s3fs", "smbprotocol", "tqdm"] -fuse = ["fusepy"] -gcs = ["gcsfs"] -git = ["pygit2"] -github = ["requests"] -gs = 
["gcsfs"] -gui = ["panel"] -hdfs = ["pyarrow (>=1)"] -http = ["aiohttp (!=4.0.0a0,!=4.0.0a1)"] -libarchive = ["libarchive-c"] -oci = ["ocifs"] -s3 = ["s3fs"] -sftp = ["paramiko"] -smb = ["smbprotocol"] -ssh = ["paramiko"] -test = ["aiohttp (!=4.0.0a0,!=4.0.0a1)", "numpy", "pytest", "pytest-asyncio (!=0.22.0)", "pytest-benchmark", "pytest-cov", "pytest-mock", "pytest-recording", "pytest-rerunfailures", "requests"] -test-downstream = ["aiobotocore (>=2.5.4,<3.0.0)", "dask-expr", "dask[dataframe,test]", "moto[server] (>4,<5)", "pytest-timeout", "xarray"] -test-full = ["adlfs", "aiohttp (!=4.0.0a0,!=4.0.0a1)", "cloudpickle", "dask", "distributed", "dropbox", "dropboxdrivefs", "fastparquet", "fusepy", "gcsfs", "jinja2", "kerchunk", "libarchive-c", "lz4", "notebook", "numpy", "ocifs", "pandas", "panel", "paramiko", "pyarrow", "pyarrow (>=1)", "pyftpdlib", "pygit2", "pytest", "pytest-asyncio (!=0.22.0)", "pytest-benchmark", "pytest-cov", "pytest-mock", "pytest-recording", "pytest-rerunfailures", "python-snappy", "requests", "smbprotocol", "tqdm", "urllib3", "zarr", "zstandard"] -tqdm = ["tqdm"] - -[[package]] -name = "huggingface-hub" -version = "0.24.6" -description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" -optional = false -python-versions = ">=3.8.0" -files = [ - {file = "huggingface_hub-0.24.6-py3-none-any.whl", hash = "sha256:a990f3232aa985fe749bc9474060cbad75e8b2f115f6665a9fda5b9c97818970"}, - {file = "huggingface_hub-0.24.6.tar.gz", hash = "sha256:cc2579e761d070713eaa9c323e3debe39d5b464ae3a7261c39a9195b27bb8000"}, -] - -[package.dependencies] -filelock = "*" -fsspec = ">=2023.5.0" -packaging = ">=20.9" -pyyaml = ">=5.1" -requests = "*" -tqdm = ">=4.42.1" -typing-extensions = ">=3.7.4.3" - -[package.extras] -all = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "fastapi", "gradio", "jedi", "minijinja (>=1.0)", "mypy (==1.5.1)", "numpy", "pytest (>=8.1.1,<8.2.2)", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-mock", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "ruff (>=0.5.0)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)", "urllib3 (<2.0)"] -cli = ["InquirerPy (==0.3.4)"] -dev = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "fastapi", "gradio", "jedi", "minijinja (>=1.0)", "mypy (==1.5.1)", "numpy", "pytest (>=8.1.1,<8.2.2)", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-mock", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "ruff (>=0.5.0)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)", "urllib3 (<2.0)"] -fastai = ["fastai (>=2.4)", "fastcore (>=1.3.27)", "toml"] -hf-transfer = ["hf-transfer (>=0.1.4)"] -inference = ["aiohttp", "minijinja (>=1.0)"] -quality = ["mypy (==1.5.1)", "ruff (>=0.5.0)"] -tensorflow = ["graphviz", "pydot", "tensorflow"] -tensorflow-testing = ["keras (<3.0)", "tensorflow"] -testing = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "fastapi", "gradio", "jedi", "minijinja (>=1.0)", "numpy", "pytest (>=8.1.1,<8.2.2)", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-mock", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "soundfile", "urllib3 (<2.0)"] -torch = ["safetensors[torch]", "torch"] -typing = ["types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)"] - -[[package]] -name = 
"idna" -version = "3.8" -description = "Internationalized Domain Names in Applications (IDNA)" -optional = false -python-versions = ">=3.6" -files = [ - {file = "idna-3.8-py3-none-any.whl", hash = "sha256:050b4e5baadcd44d760cedbd2b8e639f2ff89bbc7a5730fcc662954303377aac"}, - {file = "idna-3.8.tar.gz", hash = "sha256:d838c2c0ed6fced7693d5e8ab8e734d5f8fda53a039c0164afb0b82e771e3603"}, -] - -[[package]] -name = "iniconfig" -version = "2.0.0" -description = "brain-dead simple config-ini parsing" -optional = false -python-versions = ">=3.7" -files = [ - {file = "iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374"}, - {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, -] - -[[package]] -name = "multidict" -version = "6.1.0" -description = "multidict implementation" -optional = false -python-versions = ">=3.8" -files = [ - {file = "multidict-6.1.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:3380252550e372e8511d49481bd836264c009adb826b23fefcc5dd3c69692f60"}, - {file = "multidict-6.1.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:99f826cbf970077383d7de805c0681799491cb939c25450b9b5b3ced03ca99f1"}, - {file = "multidict-6.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:a114d03b938376557927ab23f1e950827c3b893ccb94b62fd95d430fd0e5cf53"}, - {file = "multidict-6.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b1c416351ee6271b2f49b56ad7f308072f6f44b37118d69c2cad94f3fa8a40d5"}, - {file = "multidict-6.1.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6b5d83030255983181005e6cfbac1617ce9746b219bc2aad52201ad121226581"}, - {file = "multidict-6.1.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3e97b5e938051226dc025ec80980c285b053ffb1e25a3db2a3aa3bc046bf7f56"}, - {file = "multidict-6.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d618649d4e70ac6efcbba75be98b26ef5078faad23592f9b51ca492953012429"}, - {file = "multidict-6.1.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:10524ebd769727ac77ef2278390fb0068d83f3acb7773792a5080f2b0abf7748"}, - {file = "multidict-6.1.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:ff3827aef427c89a25cc96ded1759271a93603aba9fb977a6d264648ebf989db"}, - {file = "multidict-6.1.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:06809f4f0f7ab7ea2cabf9caca7d79c22c0758b58a71f9d32943ae13c7ace056"}, - {file = "multidict-6.1.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:f179dee3b863ab1c59580ff60f9d99f632f34ccb38bf67a33ec6b3ecadd0fd76"}, - {file = "multidict-6.1.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:aaed8b0562be4a0876ee3b6946f6869b7bcdb571a5d1496683505944e268b160"}, - {file = "multidict-6.1.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:3c8b88a2ccf5493b6c8da9076fb151ba106960a2df90c2633f342f120751a9e7"}, - {file = "multidict-6.1.0-cp310-cp310-win32.whl", hash = "sha256:4a9cb68166a34117d6646c0023c7b759bf197bee5ad4272f420a0141d7eb03a0"}, - {file = "multidict-6.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:20b9b5fbe0b88d0bdef2012ef7dee867f874b72528cf1d08f1d59b0e3850129d"}, - {file = "multidict-6.1.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:3efe2c2cb5763f2f1b275ad2bf7a287d3f7ebbef35648a9726e3b69284a4f3d6"}, - {file = "multidict-6.1.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = 
"sha256:c7053d3b0353a8b9de430a4f4b4268ac9a4fb3481af37dfe49825bf45ca24156"}, - {file = "multidict-6.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:27e5fc84ccef8dfaabb09d82b7d179c7cf1a3fbc8a966f8274fcb4ab2eb4cadb"}, - {file = "multidict-6.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0e2b90b43e696f25c62656389d32236e049568b39320e2735d51f08fd362761b"}, - {file = "multidict-6.1.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d83a047959d38a7ff552ff94be767b7fd79b831ad1cd9920662db05fec24fe72"}, - {file = "multidict-6.1.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d1a9dd711d0877a1ece3d2e4fea11a8e75741ca21954c919406b44e7cf971304"}, - {file = "multidict-6.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ec2abea24d98246b94913b76a125e855eb5c434f7c46546046372fe60f666351"}, - {file = "multidict-6.1.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4867cafcbc6585e4b678876c489b9273b13e9fff9f6d6d66add5e15d11d926cb"}, - {file = "multidict-6.1.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:5b48204e8d955c47c55b72779802b219a39acc3ee3d0116d5080c388970b76e3"}, - {file = "multidict-6.1.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:d8fff389528cad1618fb4b26b95550327495462cd745d879a8c7c2115248e399"}, - {file = "multidict-6.1.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:a7a9541cd308eed5e30318430a9c74d2132e9a8cb46b901326272d780bf2d423"}, - {file = "multidict-6.1.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:da1758c76f50c39a2efd5e9859ce7d776317eb1dd34317c8152ac9251fc574a3"}, - {file = "multidict-6.1.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:c943a53e9186688b45b323602298ab727d8865d8c9ee0b17f8d62d14b56f0753"}, - {file = "multidict-6.1.0-cp311-cp311-win32.whl", hash = "sha256:90f8717cb649eea3504091e640a1b8568faad18bd4b9fcd692853a04475a4b80"}, - {file = "multidict-6.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:82176036e65644a6cc5bd619f65f6f19781e8ec2e5330f51aa9ada7504cc1926"}, - {file = "multidict-6.1.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:b04772ed465fa3cc947db808fa306d79b43e896beb677a56fb2347ca1a49c1fa"}, - {file = "multidict-6.1.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:6180c0ae073bddeb5a97a38c03f30c233e0a4d39cd86166251617d1bbd0af436"}, - {file = "multidict-6.1.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:071120490b47aa997cca00666923a83f02c7fbb44f71cf7f136df753f7fa8761"}, - {file = "multidict-6.1.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:50b3a2710631848991d0bf7de077502e8994c804bb805aeb2925a981de58ec2e"}, - {file = "multidict-6.1.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b58c621844d55e71c1b7f7c498ce5aa6985d743a1a59034c57a905b3f153c1ef"}, - {file = "multidict-6.1.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:55b6d90641869892caa9ca42ff913f7ff1c5ece06474fbd32fb2cf6834726c95"}, - {file = "multidict-6.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4b820514bfc0b98a30e3d85462084779900347e4d49267f747ff54060cc33925"}, - {file = "multidict-6.1.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:10a9b09aba0c5b48c53761b7c720aaaf7cf236d5fe394cd399c7ba662d5f9966"}, - {file = "multidict-6.1.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = 
"sha256:1e16bf3e5fc9f44632affb159d30a437bfe286ce9e02754759be5536b169b305"}, - {file = "multidict-6.1.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:76f364861c3bfc98cbbcbd402d83454ed9e01a5224bb3a28bf70002a230f73e2"}, - {file = "multidict-6.1.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:820c661588bd01a0aa62a1283f20d2be4281b086f80dad9e955e690c75fb54a2"}, - {file = "multidict-6.1.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:0e5f362e895bc5b9e67fe6e4ded2492d8124bdf817827f33c5b46c2fe3ffaca6"}, - {file = "multidict-6.1.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:3ec660d19bbc671e3a6443325f07263be452c453ac9e512f5eb935e7d4ac28b3"}, - {file = "multidict-6.1.0-cp312-cp312-win32.whl", hash = "sha256:58130ecf8f7b8112cdb841486404f1282b9c86ccb30d3519faf301b2e5659133"}, - {file = "multidict-6.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:188215fc0aafb8e03341995e7c4797860181562380f81ed0a87ff455b70bf1f1"}, - {file = "multidict-6.1.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:d569388c381b24671589335a3be6e1d45546c2988c2ebe30fdcada8457a31008"}, - {file = "multidict-6.1.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:052e10d2d37810b99cc170b785945421141bf7bb7d2f8799d431e7db229c385f"}, - {file = "multidict-6.1.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f90c822a402cb865e396a504f9fc8173ef34212a342d92e362ca498cad308e28"}, - {file = "multidict-6.1.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b225d95519a5bf73860323e633a664b0d85ad3d5bede6d30d95b35d4dfe8805b"}, - {file = "multidict-6.1.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:23bfd518810af7de1116313ebd9092cb9aa629beb12f6ed631ad53356ed6b86c"}, - {file = "multidict-6.1.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5c09fcfdccdd0b57867577b719c69e347a436b86cd83747f179dbf0cc0d4c1f3"}, - {file = "multidict-6.1.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bf6bea52ec97e95560af5ae576bdac3aa3aae0b6758c6efa115236d9e07dae44"}, - {file = "multidict-6.1.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:57feec87371dbb3520da6192213c7d6fc892d5589a93db548331954de8248fd2"}, - {file = "multidict-6.1.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:0c3f390dc53279cbc8ba976e5f8035eab997829066756d811616b652b00a23a3"}, - {file = "multidict-6.1.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:59bfeae4b25ec05b34f1956eaa1cb38032282cd4dfabc5056d0a1ec4d696d3aa"}, - {file = "multidict-6.1.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:b2f59caeaf7632cc633b5cf6fc449372b83bbdf0da4ae04d5be36118e46cc0aa"}, - {file = "multidict-6.1.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:37bb93b2178e02b7b618893990941900fd25b6b9ac0fa49931a40aecdf083fe4"}, - {file = "multidict-6.1.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4e9f48f58c2c523d5a06faea47866cd35b32655c46b443f163d08c6d0ddb17d6"}, - {file = "multidict-6.1.0-cp313-cp313-win32.whl", hash = "sha256:3a37ffb35399029b45c6cc33640a92bef403c9fd388acce75cdc88f58bd19a81"}, - {file = "multidict-6.1.0-cp313-cp313-win_amd64.whl", hash = "sha256:e9aa71e15d9d9beaad2c6b9319edcdc0a49a43ef5c0a4c8265ca9ee7d6c67774"}, - {file = "multidict-6.1.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:db7457bac39421addd0c8449933ac32d8042aae84a14911a757ae6ca3eef1392"}, - {file = "multidict-6.1.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = 
"sha256:d094ddec350a2fb899fec68d8353c78233debde9b7d8b4beeafa70825f1c281a"}, - {file = "multidict-6.1.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:5845c1fd4866bb5dd3125d89b90e57ed3138241540897de748cdf19de8a2fca2"}, - {file = "multidict-6.1.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9079dfc6a70abe341f521f78405b8949f96db48da98aeb43f9907f342f627cdc"}, - {file = "multidict-6.1.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3914f5aaa0f36d5d60e8ece6a308ee1c9784cd75ec8151062614657a114c4478"}, - {file = "multidict-6.1.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c08be4f460903e5a9d0f76818db3250f12e9c344e79314d1d570fc69d7f4eae4"}, - {file = "multidict-6.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d093be959277cb7dee84b801eb1af388b6ad3ca6a6b6bf1ed7585895789d027d"}, - {file = "multidict-6.1.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3702ea6872c5a2a4eeefa6ffd36b042e9773f05b1f37ae3ef7264b1163c2dcf6"}, - {file = "multidict-6.1.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:2090f6a85cafc5b2db085124d752757c9d251548cedabe9bd31afe6363e0aff2"}, - {file = "multidict-6.1.0-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:f67f217af4b1ff66c68a87318012de788dd95fcfeb24cc889011f4e1c7454dfd"}, - {file = "multidict-6.1.0-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:189f652a87e876098bbc67b4da1049afb5f5dfbaa310dd67c594b01c10388db6"}, - {file = "multidict-6.1.0-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:6bb5992037f7a9eff7991ebe4273ea7f51f1c1c511e6a2ce511d0e7bdb754492"}, - {file = "multidict-6.1.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:ac10f4c2b9e770c4e393876e35a7046879d195cd123b4f116d299d442b335bcd"}, - {file = "multidict-6.1.0-cp38-cp38-win32.whl", hash = "sha256:e27bbb6d14416713a8bd7aaa1313c0fc8d44ee48d74497a0ff4c3a1b6ccb5167"}, - {file = "multidict-6.1.0-cp38-cp38-win_amd64.whl", hash = "sha256:22f3105d4fb15c8f57ff3959a58fcab6ce36814486500cd7485651230ad4d4ef"}, - {file = "multidict-6.1.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:4e18b656c5e844539d506a0a06432274d7bd52a7487e6828c63a63d69185626c"}, - {file = "multidict-6.1.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:a185f876e69897a6f3325c3f19f26a297fa058c5e456bfcff8015e9a27e83ae1"}, - {file = "multidict-6.1.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ab7c4ceb38d91570a650dba194e1ca87c2b543488fe9309b4212694174fd539c"}, - {file = "multidict-6.1.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e617fb6b0b6953fffd762669610c1c4ffd05632c138d61ac7e14ad187870669c"}, - {file = "multidict-6.1.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:16e5f4bf4e603eb1fdd5d8180f1a25f30056f22e55ce51fb3d6ad4ab29f7d96f"}, - {file = "multidict-6.1.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f4c035da3f544b1882bac24115f3e2e8760f10a0107614fc9839fd232200b875"}, - {file = "multidict-6.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:957cf8e4b6e123a9eea554fa7ebc85674674b713551de587eb318a2df3e00255"}, - {file = "multidict-6.1.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:483a6aea59cb89904e1ceabd2b47368b5600fb7de78a6e4a2c2987b2d256cf30"}, - {file = "multidict-6.1.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = 
"sha256:87701f25a2352e5bf7454caa64757642734da9f6b11384c1f9d1a8e699758057"}, - {file = "multidict-6.1.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:682b987361e5fd7a139ed565e30d81fd81e9629acc7d925a205366877d8c8657"}, - {file = "multidict-6.1.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:ce2186a7df133a9c895dea3331ddc5ddad42cdd0d1ea2f0a51e5d161e4762f28"}, - {file = "multidict-6.1.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:9f636b730f7e8cb19feb87094949ba54ee5357440b9658b2a32a5ce4bce53972"}, - {file = "multidict-6.1.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:73eae06aa53af2ea5270cc066dcaf02cc60d2994bbb2c4ef5764949257d10f43"}, - {file = "multidict-6.1.0-cp39-cp39-win32.whl", hash = "sha256:1ca0083e80e791cffc6efce7660ad24af66c8d4079d2a750b29001b53ff59ada"}, - {file = "multidict-6.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:aa466da5b15ccea564bdab9c89175c762bc12825f4659c11227f515cee76fa4a"}, - {file = "multidict-6.1.0-py3-none-any.whl", hash = "sha256:48e171e52d1c4d33888e529b999e5900356b9ae588c2f09a52dcefb158b27506"}, - {file = "multidict-6.1.0.tar.gz", hash = "sha256:22ae2ebf9b0c69d206c003e2f6a914ea33f0a932d4aa16f236afc049d9958f4a"}, -] - -[package.dependencies] -typing-extensions = {version = ">=4.1.0", markers = "python_version < \"3.11\""} - -[[package]] -name = "numpy" -version = "1.26.4" -description = "Fundamental package for array computing in Python" -optional = false -python-versions = ">=3.9" -files = [ - {file = "numpy-1.26.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:9ff0f4f29c51e2803569d7a51c2304de5554655a60c5d776e35b4a41413830d0"}, - {file = "numpy-1.26.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2e4ee3380d6de9c9ec04745830fd9e2eccb3e6cf790d39d7b98ffd19b0dd754a"}, - {file = "numpy-1.26.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d209d8969599b27ad20994c8e41936ee0964e6da07478d6c35016bc386b66ad4"}, - {file = "numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ffa75af20b44f8dba823498024771d5ac50620e6915abac414251bd971b4529f"}, - {file = "numpy-1.26.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:62b8e4b1e28009ef2846b4c7852046736bab361f7aeadeb6a5b89ebec3c7055a"}, - {file = "numpy-1.26.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a4abb4f9001ad2858e7ac189089c42178fcce737e4169dc61321660f1a96c7d2"}, - {file = "numpy-1.26.4-cp310-cp310-win32.whl", hash = "sha256:bfe25acf8b437eb2a8b2d49d443800a5f18508cd811fea3181723922a8a82b07"}, - {file = "numpy-1.26.4-cp310-cp310-win_amd64.whl", hash = "sha256:b97fe8060236edf3662adfc2c633f56a08ae30560c56310562cb4f95500022d5"}, - {file = "numpy-1.26.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4c66707fabe114439db9068ee468c26bbdf909cac0fb58686a42a24de1760c71"}, - {file = "numpy-1.26.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:edd8b5fe47dab091176d21bb6de568acdd906d1887a4584a15a9a96a1dca06ef"}, - {file = "numpy-1.26.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7ab55401287bfec946ced39700c053796e7cc0e3acbef09993a9ad2adba6ca6e"}, - {file = "numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:666dbfb6ec68962c033a450943ded891bed2d54e6755e35e5835d63f4f6931d5"}, - {file = "numpy-1.26.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:96ff0b2ad353d8f990b63294c8986f1ec3cb19d749234014f4e7eb0112ceba5a"}, - {file = "numpy-1.26.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = 
"sha256:60dedbb91afcbfdc9bc0b1f3f402804070deed7392c23eb7a7f07fa857868e8a"}, - {file = "numpy-1.26.4-cp311-cp311-win32.whl", hash = "sha256:1af303d6b2210eb850fcf03064d364652b7120803a0b872f5211f5234b399f20"}, - {file = "numpy-1.26.4-cp311-cp311-win_amd64.whl", hash = "sha256:cd25bcecc4974d09257ffcd1f098ee778f7834c3ad767fe5db785be9a4aa9cb2"}, - {file = "numpy-1.26.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b3ce300f3644fb06443ee2222c2201dd3a89ea6040541412b8fa189341847218"}, - {file = "numpy-1.26.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:03a8c78d01d9781b28a6989f6fa1bb2c4f2d51201cf99d3dd875df6fbd96b23b"}, - {file = "numpy-1.26.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9fad7dcb1aac3c7f0584a5a8133e3a43eeb2fe127f47e3632d43d677c66c102b"}, - {file = "numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:675d61ffbfa78604709862923189bad94014bef562cc35cf61d3a07bba02a7ed"}, - {file = "numpy-1.26.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:ab47dbe5cc8210f55aa58e4805fe224dac469cde56b9f731a4c098b91917159a"}, - {file = "numpy-1.26.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:1dda2e7b4ec9dd512f84935c5f126c8bd8b9f2fc001e9f54af255e8c5f16b0e0"}, - {file = "numpy-1.26.4-cp312-cp312-win32.whl", hash = "sha256:50193e430acfc1346175fcbdaa28ffec49947a06918b7b92130744e81e640110"}, - {file = "numpy-1.26.4-cp312-cp312-win_amd64.whl", hash = "sha256:08beddf13648eb95f8d867350f6a018a4be2e5ad54c8d8caed89ebca558b2818"}, - {file = "numpy-1.26.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7349ab0fa0c429c82442a27a9673fc802ffdb7c7775fad780226cb234965e53c"}, - {file = "numpy-1.26.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:52b8b60467cd7dd1e9ed082188b4e6bb35aa5cdd01777621a1658910745b90be"}, - {file = "numpy-1.26.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d5241e0a80d808d70546c697135da2c613f30e28251ff8307eb72ba696945764"}, - {file = "numpy-1.26.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f870204a840a60da0b12273ef34f7051e98c3b5961b61b0c2c1be6dfd64fbcd3"}, - {file = "numpy-1.26.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:679b0076f67ecc0138fd2ede3a8fd196dddc2ad3254069bcb9faf9a79b1cebcd"}, - {file = "numpy-1.26.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:47711010ad8555514b434df65f7d7b076bb8261df1ca9bb78f53d3b2db02e95c"}, - {file = "numpy-1.26.4-cp39-cp39-win32.whl", hash = "sha256:a354325ee03388678242a4d7ebcd08b5c727033fcff3b2f536aea978e15ee9e6"}, - {file = "numpy-1.26.4-cp39-cp39-win_amd64.whl", hash = "sha256:3373d5d70a5fe74a2c1bb6d2cfd9609ecf686d47a2d7b1d37a8f3b6bf6003aea"}, - {file = "numpy-1.26.4-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:afedb719a9dcfc7eaf2287b839d8198e06dcd4cb5d276a3df279231138e83d30"}, - {file = "numpy-1.26.4-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95a7476c59002f2f6c590b9b7b998306fba6a5aa646b1e22ddfeaf8f78c3a29c"}, - {file = "numpy-1.26.4-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:7e50d0a0cc3189f9cb0aeb3a6a6af18c16f59f004b866cd2be1c14b36134a4a0"}, - {file = "numpy-1.26.4.tar.gz", hash = "sha256:2a02aba9ed12e4ac4eb3ea9421c420301a0c6460d9830d74a9df87efa4912010"}, -] - -[[package]] -name = "packaging" -version = "24.1" -description = "Core utilities for Python packages" -optional = false -python-versions = ">=3.8" -files = [ - {file = "packaging-24.1-py3-none-any.whl", hash = 
"sha256:5b8f2217dbdbd2f7f384c41c628544e6d52f2d0f53c6d0c3ea61aa5d1d7ff124"}, - {file = "packaging-24.1.tar.gz", hash = "sha256:026ed72c8ed3fcce5bf8950572258698927fd1dbda10a5e981cdf0ac37f4f002"}, -] - -[[package]] -name = "pluggy" -version = "1.5.0" -description = "plugin and hook calling mechanisms for python" -optional = false -python-versions = ">=3.8" -files = [ - {file = "pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669"}, - {file = "pluggy-1.5.0.tar.gz", hash = "sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1"}, -] - -[package.extras] -dev = ["pre-commit", "tox"] -testing = ["pytest", "pytest-benchmark"] - -[[package]] -name = "pydantic" -version = "2.9.1" -description = "Data validation using Python type hints" -optional = false -python-versions = ">=3.8" -files = [ - {file = "pydantic-2.9.1-py3-none-any.whl", hash = "sha256:7aff4db5fdf3cf573d4b3c30926a510a10e19a0774d38fc4967f78beb6deb612"}, - {file = "pydantic-2.9.1.tar.gz", hash = "sha256:1363c7d975c7036df0db2b4a61f2e062fbc0aa5ab5f2772e0ffc7191a4f4bce2"}, -] - -[package.dependencies] -annotated-types = ">=0.6.0" -pydantic-core = "2.23.3" -typing-extensions = {version = ">=4.6.1", markers = "python_version < \"3.13\""} - -[package.extras] -email = ["email-validator (>=2.0.0)"] -timezone = ["tzdata"] - -[[package]] -name = "pydantic-core" -version = "2.23.3" -description = "Core functionality for Pydantic validation and serialization" -optional = false -python-versions = ">=3.8" -files = [ - {file = "pydantic_core-2.23.3-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:7f10a5d1b9281392f1bf507d16ac720e78285dfd635b05737c3911637601bae6"}, - {file = "pydantic_core-2.23.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3c09a7885dd33ee8c65266e5aa7fb7e2f23d49d8043f089989726391dd7350c5"}, - {file = "pydantic_core-2.23.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6470b5a1ec4d1c2e9afe928c6cb37eb33381cab99292a708b8cb9aa89e62429b"}, - {file = "pydantic_core-2.23.3-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:9172d2088e27d9a185ea0a6c8cebe227a9139fd90295221d7d495944d2367700"}, - {file = "pydantic_core-2.23.3-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:86fc6c762ca7ac8fbbdff80d61b2c59fb6b7d144aa46e2d54d9e1b7b0e780e01"}, - {file = "pydantic_core-2.23.3-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f0cb80fd5c2df4898693aa841425ea1727b1b6d2167448253077d2a49003e0ed"}, - {file = "pydantic_core-2.23.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:03667cec5daf43ac4995cefa8aaf58f99de036204a37b889c24a80927b629cec"}, - {file = "pydantic_core-2.23.3-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:047531242f8e9c2db733599f1c612925de095e93c9cc0e599e96cf536aaf56ba"}, - {file = "pydantic_core-2.23.3-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:5499798317fff7f25dbef9347f4451b91ac2a4330c6669821c8202fd354c7bee"}, - {file = "pydantic_core-2.23.3-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:bbb5e45eab7624440516ee3722a3044b83fff4c0372efe183fd6ba678ff681fe"}, - {file = "pydantic_core-2.23.3-cp310-none-win32.whl", hash = "sha256:8b5b3ed73abb147704a6e9f556d8c5cb078f8c095be4588e669d315e0d11893b"}, - {file = "pydantic_core-2.23.3-cp310-none-win_amd64.whl", hash = "sha256:2b603cde285322758a0279995b5796d64b63060bfbe214b50a3ca23b5cee3e83"}, - {file = 
"pydantic_core-2.23.3-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:c889fd87e1f1bbeb877c2ee56b63bb297de4636661cc9bbfcf4b34e5e925bc27"}, - {file = "pydantic_core-2.23.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ea85bda3189fb27503af4c45273735bcde3dd31c1ab17d11f37b04877859ef45"}, - {file = "pydantic_core-2.23.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a7f7f72f721223f33d3dc98a791666ebc6a91fa023ce63733709f4894a7dc611"}, - {file = "pydantic_core-2.23.3-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2b2b55b0448e9da68f56b696f313949cda1039e8ec7b5d294285335b53104b61"}, - {file = "pydantic_core-2.23.3-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c24574c7e92e2c56379706b9a3f07c1e0c7f2f87a41b6ee86653100c4ce343e5"}, - {file = "pydantic_core-2.23.3-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f2b05e6ccbee333a8f4b8f4d7c244fdb7a979e90977ad9c51ea31261e2085ce0"}, - {file = "pydantic_core-2.23.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e2c409ce1c219c091e47cb03feb3c4ed8c2b8e004efc940da0166aaee8f9d6c8"}, - {file = "pydantic_core-2.23.3-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d965e8b325f443ed3196db890d85dfebbb09f7384486a77461347f4adb1fa7f8"}, - {file = "pydantic_core-2.23.3-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:f56af3a420fb1ffaf43ece3ea09c2d27c444e7c40dcb7c6e7cf57aae764f2b48"}, - {file = "pydantic_core-2.23.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:5b01a078dd4f9a52494370af21aa52964e0a96d4862ac64ff7cea06e0f12d2c5"}, - {file = "pydantic_core-2.23.3-cp311-none-win32.whl", hash = "sha256:560e32f0df04ac69b3dd818f71339983f6d1f70eb99d4d1f8e9705fb6c34a5c1"}, - {file = "pydantic_core-2.23.3-cp311-none-win_amd64.whl", hash = "sha256:c744fa100fdea0d000d8bcddee95213d2de2e95b9c12be083370b2072333a0fa"}, - {file = "pydantic_core-2.23.3-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:e0ec50663feedf64d21bad0809f5857bac1ce91deded203efc4a84b31b2e4305"}, - {file = "pydantic_core-2.23.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:db6e6afcb95edbe6b357786684b71008499836e91f2a4a1e55b840955b341dbb"}, - {file = "pydantic_core-2.23.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:98ccd69edcf49f0875d86942f4418a4e83eb3047f20eb897bffa62a5d419c8fa"}, - {file = "pydantic_core-2.23.3-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:a678c1ac5c5ec5685af0133262103defb427114e62eafeda12f1357a12140162"}, - {file = "pydantic_core-2.23.3-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:01491d8b4d8db9f3391d93b0df60701e644ff0894352947f31fff3e52bd5c801"}, - {file = "pydantic_core-2.23.3-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fcf31facf2796a2d3b7fe338fe8640aa0166e4e55b4cb108dbfd1058049bf4cb"}, - {file = "pydantic_core-2.23.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7200fd561fb3be06827340da066df4311d0b6b8eb0c2116a110be5245dceb326"}, - {file = "pydantic_core-2.23.3-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:dc1636770a809dee2bd44dd74b89cc80eb41172bcad8af75dd0bc182c2666d4c"}, - {file = "pydantic_core-2.23.3-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:67a5def279309f2e23014b608c4150b0c2d323bd7bccd27ff07b001c12c2415c"}, - {file = "pydantic_core-2.23.3-cp312-cp312-musllinux_1_1_x86_64.whl", hash = 
"sha256:748bdf985014c6dd3e1e4cc3db90f1c3ecc7246ff5a3cd4ddab20c768b2f1dab"}, - {file = "pydantic_core-2.23.3-cp312-none-win32.whl", hash = "sha256:255ec6dcb899c115f1e2a64bc9ebc24cc0e3ab097775755244f77360d1f3c06c"}, - {file = "pydantic_core-2.23.3-cp312-none-win_amd64.whl", hash = "sha256:40b8441be16c1e940abebed83cd006ddb9e3737a279e339dbd6d31578b802f7b"}, - {file = "pydantic_core-2.23.3-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:6daaf5b1ba1369a22c8b050b643250e3e5efc6a78366d323294aee54953a4d5f"}, - {file = "pydantic_core-2.23.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:d015e63b985a78a3d4ccffd3bdf22b7c20b3bbd4b8227809b3e8e75bc37f9cb2"}, - {file = "pydantic_core-2.23.3-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a3fc572d9b5b5cfe13f8e8a6e26271d5d13f80173724b738557a8c7f3a8a3791"}, - {file = "pydantic_core-2.23.3-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f6bd91345b5163ee7448bee201ed7dd601ca24f43f439109b0212e296eb5b423"}, - {file = "pydantic_core-2.23.3-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:fc379c73fd66606628b866f661e8785088afe2adaba78e6bbe80796baf708a63"}, - {file = "pydantic_core-2.23.3-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fbdce4b47592f9e296e19ac31667daed8753c8367ebb34b9a9bd89dacaa299c9"}, - {file = "pydantic_core-2.23.3-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fc3cf31edf405a161a0adad83246568647c54404739b614b1ff43dad2b02e6d5"}, - {file = "pydantic_core-2.23.3-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:8e22b477bf90db71c156f89a55bfe4d25177b81fce4aa09294d9e805eec13855"}, - {file = "pydantic_core-2.23.3-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:0a0137ddf462575d9bce863c4c95bac3493ba8e22f8c28ca94634b4a1d3e2bb4"}, - {file = "pydantic_core-2.23.3-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:203171e48946c3164fe7691fc349c79241ff8f28306abd4cad5f4f75ed80bc8d"}, - {file = "pydantic_core-2.23.3-cp313-none-win32.whl", hash = "sha256:76bdab0de4acb3f119c2a4bff740e0c7dc2e6de7692774620f7452ce11ca76c8"}, - {file = "pydantic_core-2.23.3-cp313-none-win_amd64.whl", hash = "sha256:37ba321ac2a46100c578a92e9a6aa33afe9ec99ffa084424291d84e456f490c1"}, - {file = "pydantic_core-2.23.3-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:d063c6b9fed7d992bcbebfc9133f4c24b7a7f215d6b102f3e082b1117cddb72c"}, - {file = "pydantic_core-2.23.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:6cb968da9a0746a0cf521b2b5ef25fc5a0bee9b9a1a8214e0a1cfaea5be7e8a4"}, - {file = "pydantic_core-2.23.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:edbefe079a520c5984e30e1f1f29325054b59534729c25b874a16a5048028d16"}, - {file = "pydantic_core-2.23.3-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:cbaaf2ef20d282659093913da9d402108203f7cb5955020bd8d1ae5a2325d1c4"}, - {file = "pydantic_core-2.23.3-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:fb539d7e5dc4aac345846f290cf504d2fd3c1be26ac4e8b5e4c2b688069ff4cf"}, - {file = "pydantic_core-2.23.3-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7e6f33503c5495059148cc486867e1d24ca35df5fc064686e631e314d959ad5b"}, - {file = "pydantic_core-2.23.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:04b07490bc2f6f2717b10c3969e1b830f5720b632f8ae2f3b8b1542394c47a8e"}, - {file = "pydantic_core-2.23.3-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", 
hash = "sha256:03795b9e8a5d7fda05f3873efc3f59105e2dcff14231680296b87b80bb327295"}, - {file = "pydantic_core-2.23.3-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:c483dab0f14b8d3f0df0c6c18d70b21b086f74c87ab03c59250dbf6d3c89baba"}, - {file = "pydantic_core-2.23.3-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:8b2682038e255e94baf2c473dca914a7460069171ff5cdd4080be18ab8a7fd6e"}, - {file = "pydantic_core-2.23.3-cp38-none-win32.whl", hash = "sha256:f4a57db8966b3a1d1a350012839c6a0099f0898c56512dfade8a1fe5fb278710"}, - {file = "pydantic_core-2.23.3-cp38-none-win_amd64.whl", hash = "sha256:13dd45ba2561603681a2676ca56006d6dee94493f03d5cadc055d2055615c3ea"}, - {file = "pydantic_core-2.23.3-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:82da2f4703894134a9f000e24965df73cc103e31e8c31906cc1ee89fde72cbd8"}, - {file = "pydantic_core-2.23.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:dd9be0a42de08f4b58a3cc73a123f124f65c24698b95a54c1543065baca8cf0e"}, - {file = "pydantic_core-2.23.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:89b731f25c80830c76fdb13705c68fef6a2b6dc494402987c7ea9584fe189f5d"}, - {file = "pydantic_core-2.23.3-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c6de1ec30c4bb94f3a69c9f5f2182baeda5b809f806676675e9ef6b8dc936f28"}, - {file = "pydantic_core-2.23.3-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bb68b41c3fa64587412b104294b9cbb027509dc2f6958446c502638d481525ef"}, - {file = "pydantic_core-2.23.3-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1c3980f2843de5184656aab58698011b42763ccba11c4a8c35936c8dd6c7068c"}, - {file = "pydantic_core-2.23.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:94f85614f2cba13f62c3c6481716e4adeae48e1eaa7e8bac379b9d177d93947a"}, - {file = "pydantic_core-2.23.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:510b7fb0a86dc8f10a8bb43bd2f97beb63cffad1203071dc434dac26453955cd"}, - {file = "pydantic_core-2.23.3-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:1eba2f7ce3e30ee2170410e2171867ea73dbd692433b81a93758ab2de6c64835"}, - {file = "pydantic_core-2.23.3-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:4b259fd8409ab84b4041b7b3f24dcc41e4696f180b775961ca8142b5b21d0e70"}, - {file = "pydantic_core-2.23.3-cp39-none-win32.whl", hash = "sha256:40d9bd259538dba2f40963286009bf7caf18b5112b19d2b55b09c14dde6db6a7"}, - {file = "pydantic_core-2.23.3-cp39-none-win_amd64.whl", hash = "sha256:5a8cd3074a98ee70173a8633ad3c10e00dcb991ecec57263aacb4095c5efb958"}, - {file = "pydantic_core-2.23.3-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:f399e8657c67313476a121a6944311fab377085ca7f490648c9af97fc732732d"}, - {file = "pydantic_core-2.23.3-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:6b5547d098c76e1694ba85f05b595720d7c60d342f24d5aad32c3049131fa5c4"}, - {file = "pydantic_core-2.23.3-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0dda0290a6f608504882d9f7650975b4651ff91c85673341789a476b1159f211"}, - {file = "pydantic_core-2.23.3-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:65b6e5da855e9c55a0c67f4db8a492bf13d8d3316a59999cfbaf98cc6e401961"}, - {file = "pydantic_core-2.23.3-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:09e926397f392059ce0afdcac920df29d9c833256354d0c55f1584b0b70cf07e"}, - {file = "pydantic_core-2.23.3-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = 
"sha256:87cfa0ed6b8c5bd6ae8b66de941cece179281239d482f363814d2b986b79cedc"}, - {file = "pydantic_core-2.23.3-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:e61328920154b6a44d98cabcb709f10e8b74276bc709c9a513a8c37a18786cc4"}, - {file = "pydantic_core-2.23.3-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:ce3317d155628301d649fe5e16a99528d5680af4ec7aa70b90b8dacd2d725c9b"}, - {file = "pydantic_core-2.23.3-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:e89513f014c6be0d17b00a9a7c81b1c426f4eb9224b15433f3d98c1a071f8433"}, - {file = "pydantic_core-2.23.3-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:4f62c1c953d7ee375df5eb2e44ad50ce2f5aff931723b398b8bc6f0ac159791a"}, - {file = "pydantic_core-2.23.3-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2718443bc671c7ac331de4eef9b673063b10af32a0bb385019ad61dcf2cc8f6c"}, - {file = "pydantic_core-2.23.3-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a0d90e08b2727c5d01af1b5ef4121d2f0c99fbee692c762f4d9d0409c9da6541"}, - {file = "pydantic_core-2.23.3-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2b676583fc459c64146debea14ba3af54e540b61762dfc0613dc4e98c3f66eeb"}, - {file = "pydantic_core-2.23.3-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:50e4661f3337977740fdbfbae084ae5693e505ca2b3130a6d4eb0f2281dc43b8"}, - {file = "pydantic_core-2.23.3-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:68f4cf373f0de6abfe599a38307f4417c1c867ca381c03df27c873a9069cda25"}, - {file = "pydantic_core-2.23.3-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:59d52cf01854cb26c46958552a21acb10dd78a52aa34c86f284e66b209db8cab"}, - {file = "pydantic_core-2.23.3.tar.gz", hash = "sha256:3cb0f65d8b4121c1b015c60104a685feb929a29d7cf204387c7f2688c7974690"}, -] - -[package.dependencies] -typing-extensions = ">=4.6.0,<4.7.0 || >4.7.0" - -[[package]] -name = "pytest" -version = "7.4.4" -description = "pytest: simple powerful testing with Python" -optional = false -python-versions = ">=3.7" -files = [ - {file = "pytest-7.4.4-py3-none-any.whl", hash = "sha256:b090cdf5ed60bf4c45261be03239c2c1c22df034fbffe691abe93cd80cea01d8"}, - {file = "pytest-7.4.4.tar.gz", hash = "sha256:2cf0005922c6ace4a3e2ec8b4080eb0d9753fdc93107415332f50ce9e7994280"}, -] - -[package.dependencies] -colorama = {version = "*", markers = "sys_platform == \"win32\""} -exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""} -iniconfig = "*" -packaging = "*" -pluggy = ">=0.12,<2.0" -tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} - -[package.extras] -testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] - -[[package]] -name = "pytest-asyncio" -version = "0.21.2" -description = "Pytest support for asyncio" -optional = false -python-versions = ">=3.7" -files = [ - {file = "pytest_asyncio-0.21.2-py3-none-any.whl", hash = "sha256:ab664c88bb7998f711d8039cacd4884da6430886ae8bbd4eded552ed2004f16b"}, - {file = "pytest_asyncio-0.21.2.tar.gz", hash = "sha256:d67738fc232b94b326b9d060750beb16e0074210b98dd8b58a5239fa2a154f45"}, -] - -[package.dependencies] -pytest = ">=7.0.0" - -[package.extras] -docs = ["sphinx (>=5.3)", "sphinx-rtd-theme (>=1.0)"] -testing = ["coverage (>=6.2)", "flaky (>=3.5.0)", "hypothesis (>=5.7.1)", "mypy (>=0.931)", "pytest-trio (>=0.7.0)"] - -[[package]] -name = "pywin32" -version = "306" -description = "Python for Window Extensions" 
-optional = false -python-versions = "*" -files = [ - {file = "pywin32-306-cp310-cp310-win32.whl", hash = "sha256:06d3420a5155ba65f0b72f2699b5bacf3109f36acbe8923765c22938a69dfc8d"}, - {file = "pywin32-306-cp310-cp310-win_amd64.whl", hash = "sha256:84f4471dbca1887ea3803d8848a1616429ac94a4a8d05f4bc9c5dcfd42ca99c8"}, - {file = "pywin32-306-cp311-cp311-win32.whl", hash = "sha256:e65028133d15b64d2ed8f06dd9fbc268352478d4f9289e69c190ecd6818b6407"}, - {file = "pywin32-306-cp311-cp311-win_amd64.whl", hash = "sha256:a7639f51c184c0272e93f244eb24dafca9b1855707d94c192d4a0b4c01e1100e"}, - {file = "pywin32-306-cp311-cp311-win_arm64.whl", hash = "sha256:70dba0c913d19f942a2db25217d9a1b726c278f483a919f1abfed79c9cf64d3a"}, - {file = "pywin32-306-cp312-cp312-win32.whl", hash = "sha256:383229d515657f4e3ed1343da8be101000562bf514591ff383ae940cad65458b"}, - {file = "pywin32-306-cp312-cp312-win_amd64.whl", hash = "sha256:37257794c1ad39ee9be652da0462dc2e394c8159dfd913a8a4e8eb6fd346da0e"}, - {file = "pywin32-306-cp312-cp312-win_arm64.whl", hash = "sha256:5821ec52f6d321aa59e2db7e0a35b997de60c201943557d108af9d4ae1ec7040"}, - {file = "pywin32-306-cp37-cp37m-win32.whl", hash = "sha256:1c73ea9a0d2283d889001998059f5eaaba3b6238f767c9cf2833b13e6a685f65"}, - {file = "pywin32-306-cp37-cp37m-win_amd64.whl", hash = "sha256:72c5f621542d7bdd4fdb716227be0dd3f8565c11b280be6315b06ace35487d36"}, - {file = "pywin32-306-cp38-cp38-win32.whl", hash = "sha256:e4c092e2589b5cf0d365849e73e02c391c1349958c5ac3e9d5ccb9a28e017b3a"}, - {file = "pywin32-306-cp38-cp38-win_amd64.whl", hash = "sha256:e8ac1ae3601bee6ca9f7cb4b5363bf1c0badb935ef243c4733ff9a393b1690c0"}, - {file = "pywin32-306-cp39-cp39-win32.whl", hash = "sha256:e25fd5b485b55ac9c057f67d94bc203f3f6595078d1fb3b458c9c28b7153a802"}, - {file = "pywin32-306-cp39-cp39-win_amd64.whl", hash = "sha256:39b61c15272833b5c329a2989999dcae836b1eed650252ab1b7bfbe1d59f30f4"}, -] - -[[package]] -name = "pyyaml" -version = "6.0.2" -description = "YAML parser and emitter for Python" -optional = false -python-versions = ">=3.8" -files = [ - {file = "PyYAML-6.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0a9a2848a5b7feac301353437eb7d5957887edbf81d56e903999a75a3d743086"}, - {file = "PyYAML-6.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:29717114e51c84ddfba879543fb232a6ed60086602313ca38cce623c1d62cfbf"}, - {file = "PyYAML-6.0.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8824b5a04a04a047e72eea5cec3bc266db09e35de6bdfe34c9436ac5ee27d237"}, - {file = "PyYAML-6.0.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7c36280e6fb8385e520936c3cb3b8042851904eba0e58d277dca80a5cfed590b"}, - {file = "PyYAML-6.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ec031d5d2feb36d1d1a24380e4db6d43695f3748343d99434e6f5f9156aaa2ed"}, - {file = "PyYAML-6.0.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:936d68689298c36b53b29f23c6dbb74de12b4ac12ca6cfe0e047bedceea56180"}, - {file = "PyYAML-6.0.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:23502f431948090f597378482b4812b0caae32c22213aecf3b55325e049a6c68"}, - {file = "PyYAML-6.0.2-cp310-cp310-win32.whl", hash = "sha256:2e99c6826ffa974fe6e27cdb5ed0021786b03fc98e5ee3c5bfe1fd5015f42b99"}, - {file = "PyYAML-6.0.2-cp310-cp310-win_amd64.whl", hash = "sha256:a4d3091415f010369ae4ed1fc6b79def9416358877534caf6a0fdd2146c87a3e"}, - {file = "PyYAML-6.0.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:cc1c1159b3d456576af7a3e4d1ba7e6924cb39de8f67111c735f6fc832082774"}, - 
{file = "PyYAML-6.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1e2120ef853f59c7419231f3bf4e7021f1b936f6ebd222406c3b60212205d2ee"}, - {file = "PyYAML-6.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5d225db5a45f21e78dd9358e58a98702a0302f2659a3c6cd320564b75b86f47c"}, - {file = "PyYAML-6.0.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5ac9328ec4831237bec75defaf839f7d4564be1e6b25ac710bd1a96321cc8317"}, - {file = "PyYAML-6.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ad2a3decf9aaba3d29c8f537ac4b243e36bef957511b4766cb0057d32b0be85"}, - {file = "PyYAML-6.0.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:ff3824dc5261f50c9b0dfb3be22b4567a6f938ccce4587b38952d85fd9e9afe4"}, - {file = "PyYAML-6.0.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:797b4f722ffa07cc8d62053e4cff1486fa6dc094105d13fea7b1de7d8bf71c9e"}, - {file = "PyYAML-6.0.2-cp311-cp311-win32.whl", hash = "sha256:11d8f3dd2b9c1207dcaf2ee0bbbfd5991f571186ec9cc78427ba5bd32afae4b5"}, - {file = "PyYAML-6.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:e10ce637b18caea04431ce14fabcf5c64a1c61ec9c56b071a4b7ca131ca52d44"}, - {file = "PyYAML-6.0.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:c70c95198c015b85feafc136515252a261a84561b7b1d51e3384e0655ddf25ab"}, - {file = "PyYAML-6.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ce826d6ef20b1bc864f0a68340c8b3287705cae2f8b4b1d932177dcc76721725"}, - {file = "PyYAML-6.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1f71ea527786de97d1a0cc0eacd1defc0985dcf6b3f17bb77dcfc8c34bec4dc5"}, - {file = "PyYAML-6.0.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9b22676e8097e9e22e36d6b7bda33190d0d400f345f23d4065d48f4ca7ae0425"}, - {file = "PyYAML-6.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:80bab7bfc629882493af4aa31a4cfa43a4c57c83813253626916b8c7ada83476"}, - {file = "PyYAML-6.0.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:0833f8694549e586547b576dcfaba4a6b55b9e96098b36cdc7ebefe667dfed48"}, - {file = "PyYAML-6.0.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8b9c7197f7cb2738065c481a0461e50ad02f18c78cd75775628afb4d7137fb3b"}, - {file = "PyYAML-6.0.2-cp312-cp312-win32.whl", hash = "sha256:ef6107725bd54b262d6dedcc2af448a266975032bc85ef0172c5f059da6325b4"}, - {file = "PyYAML-6.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:7e7401d0de89a9a855c839bc697c079a4af81cf878373abd7dc625847d25cbd8"}, - {file = "PyYAML-6.0.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:efdca5630322a10774e8e98e1af481aad470dd62c3170801852d752aa7a783ba"}, - {file = "PyYAML-6.0.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:50187695423ffe49e2deacb8cd10510bc361faac997de9efef88badc3bb9e2d1"}, - {file = "PyYAML-6.0.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0ffe8360bab4910ef1b9e87fb812d8bc0a308b0d0eef8c8f44e0254ab3b07133"}, - {file = "PyYAML-6.0.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:17e311b6c678207928d649faa7cb0d7b4c26a0ba73d41e99c4fff6b6c3276484"}, - {file = "PyYAML-6.0.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:70b189594dbe54f75ab3a1acec5f1e3faa7e8cf2f1e08d9b561cb41b845f69d5"}, - {file = "PyYAML-6.0.2-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:41e4e3953a79407c794916fa277a82531dd93aad34e29c2a514c2c0c5fe971cc"}, - {file = 
"PyYAML-6.0.2-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:68ccc6023a3400877818152ad9a1033e3db8625d899c72eacb5a668902e4d652"}, - {file = "PyYAML-6.0.2-cp313-cp313-win32.whl", hash = "sha256:bc2fa7c6b47d6bc618dd7fb02ef6fdedb1090ec036abab80d4681424b84c1183"}, - {file = "PyYAML-6.0.2-cp313-cp313-win_amd64.whl", hash = "sha256:8388ee1976c416731879ac16da0aff3f63b286ffdd57cdeb95f3f2e085687563"}, - {file = "PyYAML-6.0.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:24471b829b3bf607e04e88d79542a9d48bb037c2267d7927a874e6c205ca7e9a"}, - {file = "PyYAML-6.0.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d7fded462629cfa4b685c5416b949ebad6cec74af5e2d42905d41e257e0869f5"}, - {file = "PyYAML-6.0.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d84a1718ee396f54f3a086ea0a66d8e552b2ab2017ef8b420e92edbc841c352d"}, - {file = "PyYAML-6.0.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9056c1ecd25795207ad294bcf39f2db3d845767be0ea6e6a34d856f006006083"}, - {file = "PyYAML-6.0.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:82d09873e40955485746739bcb8b4586983670466c23382c19cffecbf1fd8706"}, - {file = "PyYAML-6.0.2-cp38-cp38-win32.whl", hash = "sha256:43fa96a3ca0d6b1812e01ced1044a003533c47f6ee8aca31724f78e93ccc089a"}, - {file = "PyYAML-6.0.2-cp38-cp38-win_amd64.whl", hash = "sha256:01179a4a8559ab5de078078f37e5c1a30d76bb88519906844fd7bdea1b7729ff"}, - {file = "PyYAML-6.0.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:688ba32a1cffef67fd2e9398a2efebaea461578b0923624778664cc1c914db5d"}, - {file = "PyYAML-6.0.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a8786accb172bd8afb8be14490a16625cbc387036876ab6ba70912730faf8e1f"}, - {file = "PyYAML-6.0.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d8e03406cac8513435335dbab54c0d385e4a49e4945d2909a581c83647ca0290"}, - {file = "PyYAML-6.0.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f753120cb8181e736c57ef7636e83f31b9c0d1722c516f7e86cf15b7aa57ff12"}, - {file = "PyYAML-6.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3b1fdb9dc17f5a7677423d508ab4f243a726dea51fa5e70992e59a7411c89d19"}, - {file = "PyYAML-6.0.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:0b69e4ce7a131fe56b7e4d770c67429700908fc0752af059838b1cfb41960e4e"}, - {file = "PyYAML-6.0.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:a9f8c2e67970f13b16084e04f134610fd1d374bf477b17ec1599185cf611d725"}, - {file = "PyYAML-6.0.2-cp39-cp39-win32.whl", hash = "sha256:6395c297d42274772abc367baaa79683958044e5d3835486c16da75d2a694631"}, - {file = "PyYAML-6.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:39693e1f8320ae4f43943590b49779ffb98acb81f788220ea932a6b6c51004d8"}, - {file = "pyyaml-6.0.2.tar.gz", hash = "sha256:d584d9ec91ad65861cc08d42e834324ef890a082e591037abe114850ff7bbc3e"}, -] - -[[package]] -name = "requests" -version = "2.32.3" -description = "Python HTTP for Humans." 
-optional = false -python-versions = ">=3.8" -files = [ - {file = "requests-2.32.3-py3-none-any.whl", hash = "sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6"}, - {file = "requests-2.32.3.tar.gz", hash = "sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760"}, -] - -[package.dependencies] -certifi = ">=2017.4.17" -charset-normalizer = ">=2,<4" -idna = ">=2.5,<4" -urllib3 = ">=1.21.1,<3" - -[package.extras] -socks = ["PySocks (>=1.5.6,!=1.5.7)"] -use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] - -[[package]] -name = "syrupy" -version = "4.7.1" -description = "Pytest Snapshot Test Utility" -optional = false -python-versions = ">=3.8.1" -files = [ - {file = "syrupy-4.7.1-py3-none-any.whl", hash = "sha256:be002267a512a4bedddfae2e026c93df1ea928ae10baadc09640516923376d41"}, - {file = "syrupy-4.7.1.tar.gz", hash = "sha256:f9d4485f3f27d0e5df6ed299cac6fa32eb40a441915d988e82be5a4bdda335c8"}, -] - -[package.dependencies] -pytest = ">=7.0.0,<9.0.0" - -[[package]] -name = "text-generation" -version = "0.6.1" -description = "Hugging Face Text Generation Python Client" -optional = false -python-versions = ">=3.7,<4.0" -files = [ - {file = "text_generation-0.6.1-py3-none-any.whl", hash = "sha256:ebca00587eeabc0f5118f66ee1048bf690bd7735a9a10361c533c31c8c0bf994"}, - {file = "text_generation-0.6.1.tar.gz", hash = "sha256:730e662aa7812f73c08ab953e008e90455f3d046f81efa0ef3de462bd4cf63d9"}, -] - -[package.dependencies] -aiohttp = ">=3.8,<4.0" -huggingface-hub = ">=0.12,<1.0" -pydantic = ">1.10,<3" - -[[package]] -name = "tomli" -version = "2.0.1" -description = "A lil' TOML parser" -optional = false -python-versions = ">=3.7" -files = [ - {file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"}, - {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, -] - -[[package]] -name = "tqdm" -version = "4.66.5" -description = "Fast, Extensible Progress Meter" -optional = false -python-versions = ">=3.7" -files = [ - {file = "tqdm-4.66.5-py3-none-any.whl", hash = "sha256:90279a3770753eafc9194a0364852159802111925aa30eb3f9d85b0e805ac7cd"}, - {file = "tqdm-4.66.5.tar.gz", hash = "sha256:e1020aef2e5096702d8a025ac7d16b1577279c9d63f8375b63083e9a5f0fcbad"}, -] - -[package.dependencies] -colorama = {version = "*", markers = "platform_system == \"Windows\""} - -[package.extras] -dev = ["pytest (>=6)", "pytest-cov", "pytest-timeout", "pytest-xdist"] -notebook = ["ipywidgets (>=6)"] -slack = ["slack-sdk"] -telegram = ["requests"] - -[[package]] -name = "typing-extensions" -version = "4.12.2" -description = "Backported and Experimental Type Hints for Python 3.8+" -optional = false -python-versions = ">=3.8" -files = [ - {file = "typing_extensions-4.12.2-py3-none-any.whl", hash = "sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d"}, - {file = "typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8"}, -] - -[[package]] -name = "urllib3" -version = "2.2.2" -description = "HTTP library with thread-safe connection pooling, file post, and more." 
-optional = false -python-versions = ">=3.8" -files = [ - {file = "urllib3-2.2.2-py3-none-any.whl", hash = "sha256:a448b2f64d686155468037e1ace9f2d2199776e17f0a46610480d311f73e3472"}, - {file = "urllib3-2.2.2.tar.gz", hash = "sha256:dd505485549a7a552833da5e6063639d0d177c04f23bc3864e41e5dc5f612168"}, -] - -[package.extras] -brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)"] -h2 = ["h2 (>=4,<5)"] -socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"] -zstd = ["zstandard (>=0.18.0)"] - -[[package]] -name = "yarl" -version = "1.11.1" -description = "Yet another URL library" -optional = false -python-versions = ">=3.8" -files = [ - {file = "yarl-1.11.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:400cd42185f92de559d29eeb529e71d80dfbd2f45c36844914a4a34297ca6f00"}, - {file = "yarl-1.11.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:8258c86f47e080a258993eed877d579c71da7bda26af86ce6c2d2d072c11320d"}, - {file = "yarl-1.11.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2164cd9725092761fed26f299e3f276bb4b537ca58e6ff6b252eae9631b5c96e"}, - {file = "yarl-1.11.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08ea567c16f140af8ddc7cb58e27e9138a1386e3e6e53982abaa6f2377b38cc"}, - {file = "yarl-1.11.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:768ecc550096b028754ea28bf90fde071c379c62c43afa574edc6f33ee5daaec"}, - {file = "yarl-1.11.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2909fa3a7d249ef64eeb2faa04b7957e34fefb6ec9966506312349ed8a7e77bf"}, - {file = "yarl-1.11.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:01a8697ec24f17c349c4f655763c4db70eebc56a5f82995e5e26e837c6eb0e49"}, - {file = "yarl-1.11.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e286580b6511aac7c3268a78cdb861ec739d3e5a2a53b4809faef6b49778eaff"}, - {file = "yarl-1.11.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:4179522dc0305c3fc9782549175c8e8849252fefeb077c92a73889ccbcd508ad"}, - {file = "yarl-1.11.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:27fcb271a41b746bd0e2a92182df507e1c204759f460ff784ca614e12dd85145"}, - {file = "yarl-1.11.1-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:f61db3b7e870914dbd9434b560075e0366771eecbe6d2b5561f5bc7485f39efd"}, - {file = "yarl-1.11.1-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:c92261eb2ad367629dc437536463dc934030c9e7caca861cc51990fe6c565f26"}, - {file = "yarl-1.11.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:d95b52fbef190ca87d8c42f49e314eace4fc52070f3dfa5f87a6594b0c1c6e46"}, - {file = "yarl-1.11.1-cp310-cp310-win32.whl", hash = "sha256:489fa8bde4f1244ad6c5f6d11bb33e09cf0d1d0367edb197619c3e3fc06f3d91"}, - {file = "yarl-1.11.1-cp310-cp310-win_amd64.whl", hash = "sha256:476e20c433b356e16e9a141449f25161e6b69984fb4cdbd7cd4bd54c17844998"}, - {file = "yarl-1.11.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:946eedc12895873891aaceb39bceb484b4977f70373e0122da483f6c38faaa68"}, - {file = "yarl-1.11.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:21a7c12321436b066c11ec19c7e3cb9aec18884fe0d5b25d03d756a9e654edfe"}, - {file = "yarl-1.11.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c35f493b867912f6fda721a59cc7c4766d382040bdf1ddaeeaa7fa4d072f4675"}, - {file = "yarl-1.11.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:25861303e0be76b60fddc1250ec5986c42f0a5c0c50ff57cc30b1be199c00e63"}, - {file = 
"yarl-1.11.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e4b53f73077e839b3f89c992223f15b1d2ab314bdbdf502afdc7bb18e95eae27"}, - {file = "yarl-1.11.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:327c724b01b8641a1bf1ab3b232fb638706e50f76c0b5bf16051ab65c868fac5"}, - {file = "yarl-1.11.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4307d9a3417eea87715c9736d050c83e8c1904e9b7aada6ce61b46361b733d92"}, - {file = "yarl-1.11.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:48a28bed68ab8fb7e380775f0029a079f08a17799cb3387a65d14ace16c12e2b"}, - {file = "yarl-1.11.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:067b961853c8e62725ff2893226fef3d0da060656a9827f3f520fb1d19b2b68a"}, - {file = "yarl-1.11.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:8215f6f21394d1f46e222abeb06316e77ef328d628f593502d8fc2a9117bde83"}, - {file = "yarl-1.11.1-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:498442e3af2a860a663baa14fbf23fb04b0dd758039c0e7c8f91cb9279799bff"}, - {file = "yarl-1.11.1-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:69721b8effdb588cb055cc22f7c5105ca6fdaa5aeb3ea09021d517882c4a904c"}, - {file = "yarl-1.11.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:1e969fa4c1e0b1a391f3fcbcb9ec31e84440253325b534519be0d28f4b6b533e"}, - {file = "yarl-1.11.1-cp311-cp311-win32.whl", hash = "sha256:7d51324a04fc4b0e097ff8a153e9276c2593106a811704025bbc1d6916f45ca6"}, - {file = "yarl-1.11.1-cp311-cp311-win_amd64.whl", hash = "sha256:15061ce6584ece023457fb8b7a7a69ec40bf7114d781a8c4f5dcd68e28b5c53b"}, - {file = "yarl-1.11.1-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:a4264515f9117be204935cd230fb2a052dd3792789cc94c101c535d349b3dab0"}, - {file = "yarl-1.11.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:f41fa79114a1d2eddb5eea7b912d6160508f57440bd302ce96eaa384914cd265"}, - {file = "yarl-1.11.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:02da8759b47d964f9173c8675710720b468aa1c1693be0c9c64abb9d8d9a4867"}, - {file = "yarl-1.11.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9361628f28f48dcf8b2f528420d4d68102f593f9c2e592bfc842f5fb337e44fd"}, - {file = "yarl-1.11.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b91044952da03b6f95fdba398d7993dd983b64d3c31c358a4c89e3c19b6f7aef"}, - {file = "yarl-1.11.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:74db2ef03b442276d25951749a803ddb6e270d02dda1d1c556f6ae595a0d76a8"}, - {file = "yarl-1.11.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7e975a2211952a8a083d1b9d9ba26472981ae338e720b419eb50535de3c02870"}, - {file = "yarl-1.11.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8aef97ba1dd2138112890ef848e17d8526fe80b21f743b4ee65947ea184f07a2"}, - {file = "yarl-1.11.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:a7915ea49b0c113641dc4d9338efa9bd66b6a9a485ffe75b9907e8573ca94b84"}, - {file = "yarl-1.11.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:504cf0d4c5e4579a51261d6091267f9fd997ef58558c4ffa7a3e1460bd2336fa"}, - {file = "yarl-1.11.1-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:3de5292f9f0ee285e6bd168b2a77b2a00d74cbcfa420ed078456d3023d2f6dff"}, - {file = "yarl-1.11.1-cp312-cp312-musllinux_1_2_s390x.whl", hash = 
"sha256:a34e1e30f1774fa35d37202bbeae62423e9a79d78d0874e5556a593479fdf239"}, - {file = "yarl-1.11.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:66b63c504d2ca43bf7221a1f72fbe981ff56ecb39004c70a94485d13e37ebf45"}, - {file = "yarl-1.11.1-cp312-cp312-win32.whl", hash = "sha256:a28b70c9e2213de425d9cba5ab2e7f7a1c8ca23a99c4b5159bf77b9c31251447"}, - {file = "yarl-1.11.1-cp312-cp312-win_amd64.whl", hash = "sha256:17b5a386d0d36fb828e2fb3ef08c8829c1ebf977eef88e5367d1c8c94b454639"}, - {file = "yarl-1.11.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:1fa2e7a406fbd45b61b4433e3aa254a2c3e14c4b3186f6e952d08a730807fa0c"}, - {file = "yarl-1.11.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:750f656832d7d3cb0c76be137ee79405cc17e792f31e0a01eee390e383b2936e"}, - {file = "yarl-1.11.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:0b8486f322d8f6a38539136a22c55f94d269addb24db5cb6f61adc61eabc9d93"}, - {file = "yarl-1.11.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3fce4da3703ee6048ad4138fe74619c50874afe98b1ad87b2698ef95bf92c96d"}, - {file = "yarl-1.11.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8ed653638ef669e0efc6fe2acb792275cb419bf9cb5c5049399f3556995f23c7"}, - {file = "yarl-1.11.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:18ac56c9dd70941ecad42b5a906820824ca72ff84ad6fa18db33c2537ae2e089"}, - {file = "yarl-1.11.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:688654f8507464745ab563b041d1fb7dab5d9912ca6b06e61d1c4708366832f5"}, - {file = "yarl-1.11.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4973eac1e2ff63cf187073cd4e1f1148dcd119314ab79b88e1b3fad74a18c9d5"}, - {file = "yarl-1.11.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:964a428132227edff96d6f3cf261573cb0f1a60c9a764ce28cda9525f18f7786"}, - {file = "yarl-1.11.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:6d23754b9939cbab02c63434776df1170e43b09c6a517585c7ce2b3d449b7318"}, - {file = "yarl-1.11.1-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:c2dc4250fe94d8cd864d66018f8344d4af50e3758e9d725e94fecfa27588ff82"}, - {file = "yarl-1.11.1-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:09696438cb43ea6f9492ef237761b043f9179f455f405279e609f2bc9100212a"}, - {file = "yarl-1.11.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:999bfee0a5b7385a0af5ffb606393509cfde70ecca4f01c36985be6d33e336da"}, - {file = "yarl-1.11.1-cp313-cp313-win32.whl", hash = "sha256:ce928c9c6409c79e10f39604a7e214b3cb69552952fbda8d836c052832e6a979"}, - {file = "yarl-1.11.1-cp313-cp313-win_amd64.whl", hash = "sha256:501c503eed2bb306638ccb60c174f856cc3246c861829ff40eaa80e2f0330367"}, - {file = "yarl-1.11.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:dae7bd0daeb33aa3e79e72877d3d51052e8b19c9025ecf0374f542ea8ec120e4"}, - {file = "yarl-1.11.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:3ff6b1617aa39279fe18a76c8d165469c48b159931d9b48239065767ee455b2b"}, - {file = "yarl-1.11.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:3257978c870728a52dcce8c2902bf01f6c53b65094b457bf87b2644ee6238ddc"}, - {file = "yarl-1.11.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0f351fa31234699d6084ff98283cb1e852270fe9e250a3b3bf7804eb493bd937"}, - {file = "yarl-1.11.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8aef1b64da41d18026632d99a06b3fefe1d08e85dd81d849fa7c96301ed22f1b"}, - {file = 
"yarl-1.11.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7175a87ab8f7fbde37160a15e58e138ba3b2b0e05492d7351314a250d61b1591"}, - {file = "yarl-1.11.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba444bdd4caa2a94456ef67a2f383710928820dd0117aae6650a4d17029fa25e"}, - {file = "yarl-1.11.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0ea9682124fc062e3d931c6911934a678cb28453f957ddccf51f568c2f2b5e05"}, - {file = "yarl-1.11.1-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:8418c053aeb236b20b0ab8fa6bacfc2feaaf7d4683dd96528610989c99723d5f"}, - {file = "yarl-1.11.1-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:61a5f2c14d0a1adfdd82258f756b23a550c13ba4c86c84106be4c111a3a4e413"}, - {file = "yarl-1.11.1-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:f3a6d90cab0bdf07df8f176eae3a07127daafcf7457b997b2bf46776da2c7eb7"}, - {file = "yarl-1.11.1-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:077da604852be488c9a05a524068cdae1e972b7dc02438161c32420fb4ec5e14"}, - {file = "yarl-1.11.1-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:15439f3c5c72686b6c3ff235279630d08936ace67d0fe5c8d5bbc3ef06f5a420"}, - {file = "yarl-1.11.1-cp38-cp38-win32.whl", hash = "sha256:238a21849dd7554cb4d25a14ffbfa0ef380bb7ba201f45b144a14454a72ffa5a"}, - {file = "yarl-1.11.1-cp38-cp38-win_amd64.whl", hash = "sha256:67459cf8cf31da0e2cbdb4b040507e535d25cfbb1604ca76396a3a66b8ba37a6"}, - {file = "yarl-1.11.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:884eab2ce97cbaf89f264372eae58388862c33c4f551c15680dd80f53c89a269"}, - {file = "yarl-1.11.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:8a336eaa7ee7e87cdece3cedb395c9657d227bfceb6781295cf56abcd3386a26"}, - {file = "yarl-1.11.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:87f020d010ba80a247c4abc335fc13421037800ca20b42af5ae40e5fd75e7909"}, - {file = "yarl-1.11.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:637c7ddb585a62d4469f843dac221f23eec3cbad31693b23abbc2c366ad41ff4"}, - {file = "yarl-1.11.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:48dfd117ab93f0129084577a07287376cc69c08138694396f305636e229caa1a"}, - {file = "yarl-1.11.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:75e0ae31fb5ccab6eda09ba1494e87eb226dcbd2372dae96b87800e1dcc98804"}, - {file = "yarl-1.11.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f46f81501160c28d0c0b7333b4f7be8983dbbc161983b6fb814024d1b4952f79"}, - {file = "yarl-1.11.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:04293941646647b3bfb1719d1d11ff1028e9c30199509a844da3c0f5919dc520"}, - {file = "yarl-1.11.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:250e888fa62d73e721f3041e3a9abf427788a1934b426b45e1b92f62c1f68366"}, - {file = "yarl-1.11.1-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:e8f63904df26d1a66aabc141bfd258bf738b9bc7bc6bdef22713b4f5ef789a4c"}, - {file = "yarl-1.11.1-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:aac44097d838dda26526cffb63bdd8737a2dbdf5f2c68efb72ad83aec6673c7e"}, - {file = "yarl-1.11.1-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:267b24f891e74eccbdff42241c5fb4f974de2d6271dcc7d7e0c9ae1079a560d9"}, - {file = "yarl-1.11.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:6907daa4b9d7a688063ed098c472f96e8181733c525e03e866fb5db480a424df"}, - {file = "yarl-1.11.1-cp39-cp39-win32.whl", hash = 
"sha256:14438dfc5015661f75f85bc5adad0743678eefee266ff0c9a8e32969d5d69f74"}, - {file = "yarl-1.11.1-cp39-cp39-win_amd64.whl", hash = "sha256:94d0caaa912bfcdc702a4204cd5e2bb01eb917fc4f5ea2315aa23962549561b0"}, - {file = "yarl-1.11.1-py3-none-any.whl", hash = "sha256:72bf26f66456baa0584eff63e44545c9f0eaed9b73cb6601b647c91f14c11f38"}, - {file = "yarl-1.11.1.tar.gz", hash = "sha256:1bb2d9e212fb7449b8fb73bc461b51eaa17cc8430b4a87d87be7b25052d92f53"}, -] - -[package.dependencies] -idna = ">=2.0" -multidict = ">=4.0" - -[metadata] -lock-version = "2.0" -python-versions = ">=3.10,<3.13" -content-hash = "310c0a2349bc0a0713b50f8482b4df4d07c48e0ce2c1bd91c30872f52de3fe1c" diff --git a/integration-tests/pyproject.toml b/integration-tests/pyproject.toml index e49b98fc9..1838995ed 100644 --- a/integration-tests/pyproject.toml +++ b/integration-tests/pyproject.toml @@ -1,18 +1,19 @@ -[tool.poetry] +[project] name = "text-generation-integration-tests" version = "2.0.1" description = "Text Generation Inference integration tests" authors = ["Nicolas Patry "] +requires-python = ">=3.10,<3.13" -[tool.poetry.dependencies] -pydantic = "> 2, < 3" -python = ">=3.10,<3.13" -syrupy = "^4.7.1" -text-generation = "^0.6.0" -pytest = "^7.4.0" -pytest-asyncio = "^0.21.1" -docker = "^7" -numpy = "^1.20" +dependencies = [ + "pydantic>2,< 3", + "syrupy>=4.8.0", + "text-generation>=0.6.0", + "pytest>=8.3.0", + "pytest-asyncio>=0.23.1", + "docker>=7", + "numpy>=2.0", +] [tool.isort] profile = "black" diff --git a/integration-tests/requirements.txt b/integration-tests/requirements.txt index 8bf6ba078..c5a918259 100644 --- a/integration-tests/requirements.txt +++ b/integration-tests/requirements.txt @@ -1,35 +1,86 @@ -aiohappyeyeballs==2.4.0 ; python_version >= "3.10" and python_version < "3.13" -aiohttp==3.10.5 ; python_version >= "3.10" and python_version < "3.13" -aiosignal==1.3.1 ; python_version >= "3.10" and python_version < "3.13" -annotated-types==0.7.0 ; python_version >= "3.10" and python_version < "3.13" -async-timeout==4.0.3 ; python_version >= "3.10" and python_version < "3.11" -attrs==24.2.0 ; python_version >= "3.10" and python_version < "3.13" -certifi==2024.8.30 ; python_version >= "3.10" and python_version < "3.13" -charset-normalizer==3.3.2 ; python_version >= "3.10" and python_version < "3.13" -colorama==0.4.6 ; python_version >= "3.10" and python_version < "3.13" and (sys_platform == "win32" or platform_system == "Windows") -docker==7.1.0 ; python_version >= "3.10" and python_version < "3.13" -exceptiongroup==1.2.2 ; python_version >= "3.10" and python_version < "3.11" -filelock==3.16.0 ; python_version >= "3.10" and python_version < "3.13" -frozenlist==1.4.1 ; python_version >= "3.10" and python_version < "3.13" -fsspec==2024.9.0 ; python_version >= "3.10" and python_version < "3.13" -huggingface-hub==0.24.6 ; python_version >= "3.10" and python_version < "3.13" -idna==3.8 ; python_version >= "3.10" and python_version < "3.13" -iniconfig==2.0.0 ; python_version >= "3.10" and python_version < "3.13" -multidict==6.1.0 ; python_version >= "3.10" and python_version < "3.13" -numpy==1.26.4 ; python_version >= "3.10" and python_version < "3.13" -packaging==24.1 ; python_version >= "3.10" and python_version < "3.13" -pluggy==1.5.0 ; python_version >= "3.10" and python_version < "3.13" -pydantic-core==2.23.3 ; python_version >= "3.10" and python_version < "3.13" -pydantic==2.9.1 ; python_version >= "3.10" and python_version < "3.13" -pytest-asyncio==0.21.2 ; python_version >= "3.10" and python_version < "3.13" 
-pytest==7.4.4 ; python_version >= "3.10" and python_version < "3.13"
-pywin32==306 ; python_version >= "3.10" and python_version < "3.13" and sys_platform == "win32"
-pyyaml==6.0.2 ; python_version >= "3.10" and python_version < "3.13"
-requests==2.32.3 ; python_version >= "3.10" and python_version < "3.13"
-syrupy==4.7.1 ; python_version >= "3.10" and python_version < "3.13"
-text-generation==0.6.1 ; python_version >= "3.10" and python_version < "3.13"
-tomli==2.0.1 ; python_version >= "3.10" and python_version < "3.11"
-tqdm==4.66.5 ; python_version >= "3.10" and python_version < "3.13"
-typing-extensions==4.12.2 ; python_version >= "3.10" and python_version < "3.13"
-urllib3==2.2.2 ; python_version >= "3.10" and python_version < "3.13"
-yarl==1.11.1 ; python_version >= "3.10" and python_version < "3.13"
+# This file was autogenerated by uv via the following command:
+#    uv pip compile pyproject.toml --output-file requirements.txt
+aiohappyeyeballs==2.4.6
+    # via aiohttp
+aiohttp==3.11.12
+    # via text-generation
+aiosignal==1.3.2
+    # via aiohttp
+annotated-types==0.7.0
+    # via pydantic
+attrs==25.1.0
+    # via aiohttp
+certifi==2025.1.31
+    # via requests
+charset-normalizer==3.4.1
+    # via requests
+docker==7.1.0
+    # via text-generation-integration-tests (pyproject.toml)
+filelock==3.17.0
+    # via huggingface-hub
+frozenlist==1.5.0
+    # via
+    #   aiohttp
+    #   aiosignal
+fsspec==2025.2.0
+    # via huggingface-hub
+huggingface-hub==0.29.0
+    # via text-generation
+idna==3.10
+    # via
+    #   requests
+    #   yarl
+iniconfig==2.0.0
+    # via pytest
+multidict==6.1.0
+    # via
+    #   aiohttp
+    #   yarl
+numpy==2.2.3
+    # via text-generation-integration-tests (pyproject.toml)
+packaging==24.2
+    # via
+    #   huggingface-hub
+    #   pytest
+pluggy==1.5.0
+    # via pytest
+propcache==0.2.1
+    # via
+    #   aiohttp
+    #   yarl
+pydantic==2.10.6
+    # via
+    #   text-generation-integration-tests (pyproject.toml)
+    #   text-generation
+pydantic-core==2.27.2
+    # via pydantic
+pytest==8.3.4
+    # via
+    #   text-generation-integration-tests (pyproject.toml)
+    #   pytest-asyncio
+    #   syrupy
+pytest-asyncio==0.25.3
+    # via text-generation-integration-tests (pyproject.toml)
+pyyaml==6.0.2
+    # via huggingface-hub
+requests==2.32.3
+    # via
+    #   docker
+    #   huggingface-hub
+syrupy==4.8.1
+    # via text-generation-integration-tests (pyproject.toml)
+text-generation==0.7.0
+    # via text-generation-integration-tests (pyproject.toml)
+tqdm==4.67.1
+    # via huggingface-hub
+typing-extensions==4.12.2
+    # via
+    #   huggingface-hub
+    #   pydantic
+    #   pydantic-core
+urllib3==2.3.0
+    # via
+    #   docker
+    #   requests
+yarl==1.18.3
+    # via aiohttp
diff --git a/integration-tests/uv.lock b/integration-tests/uv.lock
new file mode 100644
index 000000000..9f3765b87
--- /dev/null
+++ b/integration-tests/uv.lock
@@ -0,0 +1,829 @@
+version = 1
+requires-python = ">=3.10, <3.13"
+
+[[package]]
+name = "aiohappyeyeballs"
+version = "2.4.6"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/08/07/508f9ebba367fc3370162e53a3cfd12f5652ad79f0e0bfdf9f9847c6f159/aiohappyeyeballs-2.4.6.tar.gz", hash = "sha256:9b05052f9042985d32ecbe4b59a77ae19c006a78f1344d7fdad69d28ded3d0b0", size = 21726 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/44/4c/03fb05f56551828ec67ceb3665e5dc51638042d204983a03b0a1541475b6/aiohappyeyeballs-2.4.6-py3-none-any.whl", hash = "sha256:147ec992cf873d74f5062644332c539fcd42956dc69453fe5204195e560517e1", size = 14543 },
+]
+
+[[package]]
+name = "aiohttp"
+version = "3.11.12"
+source = { registry =
"https://pypi.org/simple" } +dependencies = [ + { name = "aiohappyeyeballs" }, + { name = "aiosignal" }, + { name = "async-timeout", marker = "python_full_version < '3.11'" }, + { name = "attrs" }, + { name = "frozenlist" }, + { name = "multidict" }, + { name = "propcache" }, + { name = "yarl" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/37/4b/952d49c73084fb790cb5c6ead50848c8e96b4980ad806cf4d2ad341eaa03/aiohttp-3.11.12.tar.gz", hash = "sha256:7603ca26d75b1b86160ce1bbe2787a0b706e592af5b2504e12caa88a217767b0", size = 7673175 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/65/42/3880e133590820aa7bc6d068eb7d8e0ad9fdce9b4663f92b821d3f6b5601/aiohttp-3.11.12-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:aa8a8caca81c0a3e765f19c6953416c58e2f4cc1b84829af01dd1c771bb2f91f", size = 708721 }, + { url = "https://files.pythonhosted.org/packages/d8/8c/04869803bed108b25afad75f94c651b287851843caacbec6677d8f2d572b/aiohttp-3.11.12-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:84ede78acde96ca57f6cf8ccb8a13fbaf569f6011b9a52f870c662d4dc8cd854", size = 468596 }, + { url = "https://files.pythonhosted.org/packages/4f/f4/9074011f0d1335b161c953fb32545b6667cf24465e1932b9767874995c7e/aiohttp-3.11.12-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:584096938a001378484aa4ee54e05dc79c7b9dd933e271c744a97b3b6f644957", size = 455758 }, + { url = "https://files.pythonhosted.org/packages/fd/68/06298c57ef8f534065930b805e6dbd83613f0534447922782fb9920fce28/aiohttp-3.11.12-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:392432a2dde22b86f70dd4a0e9671a349446c93965f261dbaecfaf28813e5c42", size = 1584797 }, + { url = "https://files.pythonhosted.org/packages/bd/1e/cee6b51fcb3b1c4185a7dc62b3113bc136fae07f39386c88c90b7f79f199/aiohttp-3.11.12-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:88d385b8e7f3a870146bf5ea31786ef7463e99eb59e31db56e2315535d811f55", size = 1632535 }, + { url = "https://files.pythonhosted.org/packages/71/1f/42424462b7a09da362e1711090db9f8d68a37a33f0aab51307335517c599/aiohttp-3.11.12-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b10a47e5390c4b30a0d58ee12581003be52eedd506862ab7f97da7a66805befb", size = 1668484 }, + { url = "https://files.pythonhosted.org/packages/f6/79/0e25542bbe3c2bfd7a12c7a49c7bce73b09a836f65079e4b77bc2bafc89e/aiohttp-3.11.12-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b5263dcede17b6b0c41ef0c3ccce847d82a7da98709e75cf7efde3e9e3b5cae", size = 1589708 }, + { url = "https://files.pythonhosted.org/packages/d1/13/93ae26b75e23f7d3a613872e472fae836ca100dc5bde5936ebc93ada8890/aiohttp-3.11.12-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:50c5c7b8aa5443304c55c262c5693b108c35a3b61ef961f1e782dd52a2f559c7", size = 1544752 }, + { url = "https://files.pythonhosted.org/packages/cf/5e/48847fad1b014ef92ef18ea1339a3b58eb81d3bc717b94c3627f5d2a42c5/aiohttp-3.11.12-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:d1c031a7572f62f66f1257db37ddab4cb98bfaf9b9434a3b4840bf3560f5e788", size = 1529417 }, + { url = "https://files.pythonhosted.org/packages/ae/56/fbd4ea019303f4877f0e0b8c9de92e9db24338e7545570d3f275f3c74c53/aiohttp-3.11.12-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:7e44eba534381dd2687be50cbd5f2daded21575242ecfdaf86bbeecbc38dae8e", size = 1557808 }, + { url = 
"https://files.pythonhosted.org/packages/f1/43/112189cf6b3c482ecdd6819b420eaa0c2033426f28d741bb7f19db5dd2bb/aiohttp-3.11.12-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:145a73850926018ec1681e734cedcf2716d6a8697d90da11284043b745c286d5", size = 1536765 }, + { url = "https://files.pythonhosted.org/packages/30/12/59986547de8306e06c7b30e547ccda02d29636e152366caba2dd8627bfe1/aiohttp-3.11.12-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:2c311e2f63e42c1bf86361d11e2c4a59f25d9e7aabdbdf53dc38b885c5435cdb", size = 1607621 }, + { url = "https://files.pythonhosted.org/packages/aa/9b/af3b323b20df3318ed20d701d8242e523d59c842ca93f23134b05c9d5054/aiohttp-3.11.12-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:ea756b5a7bac046d202a9a3889b9a92219f885481d78cd318db85b15cc0b7bcf", size = 1628977 }, + { url = "https://files.pythonhosted.org/packages/36/62/adf5a331a7bda475cc326dde393fa2bc5849060b1b37ac3d1bee1953f2cd/aiohttp-3.11.12-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:526c900397f3bbc2db9cb360ce9c35134c908961cdd0ac25b1ae6ffcaa2507ff", size = 1564455 }, + { url = "https://files.pythonhosted.org/packages/90/c4/4a24291f22f111a854dfdb54dc94d4e0a5229ccbb7bc7f0bed972aa50410/aiohttp-3.11.12-cp310-cp310-win32.whl", hash = "sha256:b8d3bb96c147b39c02d3db086899679f31958c5d81c494ef0fc9ef5bb1359b3d", size = 416768 }, + { url = "https://files.pythonhosted.org/packages/51/69/5221c8006acb7bb10d9e8e2238fb216571bddc2e00a8d95bcfbe2f579c57/aiohttp-3.11.12-cp310-cp310-win_amd64.whl", hash = "sha256:7fe3d65279bfbee8de0fb4f8c17fc4e893eed2dba21b2f680e930cc2b09075c5", size = 442170 }, + { url = "https://files.pythonhosted.org/packages/9c/38/35311e70196b6a63cfa033a7f741f800aa8a93f57442991cbe51da2394e7/aiohttp-3.11.12-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:87a2e00bf17da098d90d4145375f1d985a81605267e7f9377ff94e55c5d769eb", size = 708797 }, + { url = "https://files.pythonhosted.org/packages/44/3e/46c656e68cbfc4f3fc7cb5d2ba4da6e91607fe83428208028156688f6201/aiohttp-3.11.12-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b34508f1cd928ce915ed09682d11307ba4b37d0708d1f28e5774c07a7674cac9", size = 468669 }, + { url = "https://files.pythonhosted.org/packages/a0/d6/2088fb4fd1e3ac2bfb24bc172223babaa7cdbb2784d33c75ec09e66f62f8/aiohttp-3.11.12-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:936d8a4f0f7081327014742cd51d320296b56aa6d324461a13724ab05f4b2933", size = 455739 }, + { url = "https://files.pythonhosted.org/packages/e7/dc/c443a6954a56f4a58b5efbfdf23cc6f3f0235e3424faf5a0c56264d5c7bb/aiohttp-3.11.12-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2de1378f72def7dfb5dbd73d86c19eda0ea7b0a6873910cc37d57e80f10d64e1", size = 1685858 }, + { url = "https://files.pythonhosted.org/packages/25/67/2d5b3aaade1d5d01c3b109aa76e3aa9630531252cda10aa02fb99b0b11a1/aiohttp-3.11.12-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b9d45dbb3aaec05cf01525ee1a7ac72de46a8c425cb75c003acd29f76b1ffe94", size = 1743829 }, + { url = "https://files.pythonhosted.org/packages/90/9b/9728fe9a3e1b8521198455d027b0b4035522be18f504b24c5d38d59e7278/aiohttp-3.11.12-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:930ffa1925393381e1e0a9b82137fa7b34c92a019b521cf9f41263976666a0d6", size = 1785587 }, + { url = "https://files.pythonhosted.org/packages/ce/cf/28fbb43d4ebc1b4458374a3c7b6db3b556a90e358e9bbcfe6d9339c1e2b6/aiohttp-3.11.12-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:8340def6737118f5429a5df4e88f440746b791f8f1c4ce4ad8a595f42c980bd5", size = 1675319 }, + { url = "https://files.pythonhosted.org/packages/e5/d2/006c459c11218cabaa7bca401f965c9cc828efbdea7e1615d4644eaf23f7/aiohttp-3.11.12-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4016e383f91f2814e48ed61e6bda7d24c4d7f2402c75dd28f7e1027ae44ea204", size = 1619982 }, + { url = "https://files.pythonhosted.org/packages/9d/83/ca425891ebd37bee5d837110f7fddc4d808a7c6c126a7d1b5c3ad72fc6ba/aiohttp-3.11.12-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:3c0600bcc1adfaaac321422d615939ef300df81e165f6522ad096b73439c0f58", size = 1654176 }, + { url = "https://files.pythonhosted.org/packages/25/df/047b1ce88514a1b4915d252513640184b63624e7914e41d846668b8edbda/aiohttp-3.11.12-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:0450ada317a65383b7cce9576096150fdb97396dcfe559109b403c7242faffef", size = 1660198 }, + { url = "https://files.pythonhosted.org/packages/d3/cc/6ecb8e343f0902528620b9dbd567028a936d5489bebd7dbb0dd0914f4fdb/aiohttp-3.11.12-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:850ff6155371fd802a280f8d369d4e15d69434651b844bde566ce97ee2277420", size = 1650186 }, + { url = "https://files.pythonhosted.org/packages/f8/f8/453df6dd69256ca8c06c53fc8803c9056e2b0b16509b070f9a3b4bdefd6c/aiohttp-3.11.12-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:8fd12d0f989c6099e7b0f30dc6e0d1e05499f3337461f0b2b0dadea6c64b89df", size = 1733063 }, + { url = "https://files.pythonhosted.org/packages/55/f8/540160787ff3000391de0e5d0d1d33be4c7972f933c21991e2ea105b2d5e/aiohttp-3.11.12-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:76719dd521c20a58a6c256d058547b3a9595d1d885b830013366e27011ffe804", size = 1755306 }, + { url = "https://files.pythonhosted.org/packages/30/7d/49f3bfdfefd741576157f8f91caa9ff61a6f3d620ca6339268327518221b/aiohttp-3.11.12-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:97fe431f2ed646a3b56142fc81d238abcbaff08548d6912acb0b19a0cadc146b", size = 1692909 }, + { url = "https://files.pythonhosted.org/packages/40/9c/8ce00afd6f6112ce9a2309dc490fea376ae824708b94b7b5ea9cba979d1d/aiohttp-3.11.12-cp311-cp311-win32.whl", hash = "sha256:e10c440d142fa8b32cfdb194caf60ceeceb3e49807072e0dc3a8887ea80e8c16", size = 416584 }, + { url = "https://files.pythonhosted.org/packages/35/97/4d3c5f562f15830de472eb10a7a222655d750839943e0e6d915ef7e26114/aiohttp-3.11.12-cp311-cp311-win_amd64.whl", hash = "sha256:246067ba0cf5560cf42e775069c5d80a8989d14a7ded21af529a4e10e3e0f0e6", size = 442674 }, + { url = "https://files.pythonhosted.org/packages/4d/d0/94346961acb476569fca9a644cc6f9a02f97ef75961a6b8d2b35279b8d1f/aiohttp-3.11.12-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:e392804a38353900c3fd8b7cacbea5132888f7129f8e241915e90b85f00e3250", size = 704837 }, + { url = "https://files.pythonhosted.org/packages/a9/af/05c503f1cc8f97621f199ef4b8db65fb88b8bc74a26ab2adb74789507ad3/aiohttp-3.11.12-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:8fa1510b96c08aaad49303ab11f8803787c99222288f310a62f493faf883ede1", size = 464218 }, + { url = "https://files.pythonhosted.org/packages/f2/48/b9949eb645b9bd699153a2ec48751b985e352ab3fed9d98c8115de305508/aiohttp-3.11.12-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:dc065a4285307607df3f3686363e7f8bdd0d8ab35f12226362a847731516e42c", size = 456166 }, + { url = 
"https://files.pythonhosted.org/packages/14/fb/980981807baecb6f54bdd38beb1bd271d9a3a786e19a978871584d026dcf/aiohttp-3.11.12-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cddb31f8474695cd61fc9455c644fc1606c164b93bff2490390d90464b4655df", size = 1682528 }, + { url = "https://files.pythonhosted.org/packages/90/cb/77b1445e0a716914e6197b0698b7a3640590da6c692437920c586764d05b/aiohttp-3.11.12-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9dec0000d2d8621d8015c293e24589d46fa218637d820894cb7356c77eca3259", size = 1737154 }, + { url = "https://files.pythonhosted.org/packages/ff/24/d6fb1f4cede9ccbe98e4def6f3ed1e1efcb658871bbf29f4863ec646bf38/aiohttp-3.11.12-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e3552fe98e90fdf5918c04769f338a87fa4f00f3b28830ea9b78b1bdc6140e0d", size = 1793435 }, + { url = "https://files.pythonhosted.org/packages/17/e2/9f744cee0861af673dc271a3351f59ebd5415928e20080ab85be25641471/aiohttp-3.11.12-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6dfe7f984f28a8ae94ff3a7953cd9678550dbd2a1f9bda5dd9c5ae627744c78e", size = 1692010 }, + { url = "https://files.pythonhosted.org/packages/90/c4/4a1235c1df544223eb57ba553ce03bc706bdd065e53918767f7fa1ff99e0/aiohttp-3.11.12-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a481a574af914b6e84624412666cbfbe531a05667ca197804ecc19c97b8ab1b0", size = 1619481 }, + { url = "https://files.pythonhosted.org/packages/60/70/cf12d402a94a33abda86dd136eb749b14c8eb9fec1e16adc310e25b20033/aiohttp-3.11.12-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:1987770fb4887560363b0e1a9b75aa303e447433c41284d3af2840a2f226d6e0", size = 1641578 }, + { url = "https://files.pythonhosted.org/packages/1b/25/7211973fda1f5e833fcfd98ccb7f9ce4fbfc0074e3e70c0157a751d00db8/aiohttp-3.11.12-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:a4ac6a0f0f6402854adca4e3259a623f5c82ec3f0c049374133bcb243132baf9", size = 1684463 }, + { url = "https://files.pythonhosted.org/packages/93/60/b5905b4d0693f6018b26afa9f2221fefc0dcbd3773fe2dff1a20fb5727f1/aiohttp-3.11.12-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:c96a43822f1f9f69cc5c3706af33239489a6294be486a0447fb71380070d4d5f", size = 1646691 }, + { url = "https://files.pythonhosted.org/packages/b4/fc/ba1b14d6fdcd38df0b7c04640794b3683e949ea10937c8a58c14d697e93f/aiohttp-3.11.12-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:a5e69046f83c0d3cb8f0d5bd9b8838271b1bc898e01562a04398e160953e8eb9", size = 1702269 }, + { url = "https://files.pythonhosted.org/packages/5e/39/18c13c6f658b2ba9cc1e0c6fb2d02f98fd653ad2addcdf938193d51a9c53/aiohttp-3.11.12-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:68d54234c8d76d8ef74744f9f9fc6324f1508129e23da8883771cdbb5818cbef", size = 1734782 }, + { url = "https://files.pythonhosted.org/packages/9f/d2/ccc190023020e342419b265861877cd8ffb75bec37b7ddd8521dd2c6deb8/aiohttp-3.11.12-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:c9fd9dcf9c91affe71654ef77426f5cf8489305e1c66ed4816f5a21874b094b9", size = 1694740 }, + { url = "https://files.pythonhosted.org/packages/3f/54/186805bcada64ea90ea909311ffedcd74369bfc6e880d39d2473314daa36/aiohttp-3.11.12-cp312-cp312-win32.whl", hash = "sha256:0ed49efcd0dc1611378beadbd97beb5d9ca8fe48579fc04a6ed0844072261b6a", size = 411530 }, + { url = "https://files.pythonhosted.org/packages/3d/63/5eca549d34d141bcd9de50d4e59b913f3641559460c739d5e215693cb54a/aiohttp-3.11.12-cp312-cp312-win_amd64.whl", 
hash = "sha256:54775858c7f2f214476773ce785a19ee81d1294a6bedc5cc17225355aab74802", size = 437860 }, +] + +[[package]] +name = "aiosignal" +version = "1.3.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "frozenlist" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ba/b5/6d55e80f6d8a08ce22b982eafa278d823b541c925f11ee774b0b9c43473d/aiosignal-1.3.2.tar.gz", hash = "sha256:a8c255c66fafb1e499c9351d0bf32ff2d8a0321595ebac3b93713656d2436f54", size = 19424 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ec/6a/bc7e17a3e87a2985d3e8f4da4cd0f481060eb78fb08596c42be62c90a4d9/aiosignal-1.3.2-py2.py3-none-any.whl", hash = "sha256:45cde58e409a301715980c2b01d0c28bdde3770d8290b5eb2173759d9acb31a5", size = 7597 }, +] + +[[package]] +name = "annotated-types" +version = "0.7.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ee/67/531ea369ba64dcff5ec9c3402f9f51bf748cec26dde048a2f973a4eea7f5/annotated_types-0.7.0.tar.gz", hash = "sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89", size = 16081 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643 }, +] + +[[package]] +name = "async-timeout" +version = "5.0.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a5/ae/136395dfbfe00dfc94da3f3e136d0b13f394cba8f4841120e34226265780/async_timeout-5.0.1.tar.gz", hash = "sha256:d9321a7a3d5a6a5e187e824d2fa0793ce379a202935782d555d6e9d2735677d3", size = 9274 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fe/ba/e2081de779ca30d473f21f5b30e0e737c438205440784c7dfc81efc2b029/async_timeout-5.0.1-py3-none-any.whl", hash = "sha256:39e3809566ff85354557ec2398b55e096c8364bacac9405a7a1fa429e77fe76c", size = 6233 }, +] + +[[package]] +name = "attrs" +version = "25.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/49/7c/fdf464bcc51d23881d110abd74b512a42b3d5d376a55a831b44c603ae17f/attrs-25.1.0.tar.gz", hash = "sha256:1c97078a80c814273a76b2a298a932eb681c87415c11dee0a6921de7f1b02c3e", size = 810562 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fc/30/d4986a882011f9df997a55e6becd864812ccfcd821d64aac8570ee39f719/attrs-25.1.0-py3-none-any.whl", hash = "sha256:c75a69e28a550a7e93789579c22aa26b0f5b83b75dc4e08fe092980051e1090a", size = 63152 }, +] + +[[package]] +name = "certifi" +version = "2025.1.31" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/1c/ab/c9f1e32b7b1bf505bf26f0ef697775960db7932abeb7b516de930ba2705f/certifi-2025.1.31.tar.gz", hash = "sha256:3d5da6925056f6f18f119200434a4780a94263f10d1c21d032a6f6b2baa20651", size = 167577 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/38/fc/bce832fd4fd99766c04d1ee0eead6b0ec6486fb100ae5e74c1d91292b982/certifi-2025.1.31-py3-none-any.whl", hash = "sha256:ca78db4565a652026a4db2bcdf68f2fb589ea80d0be70e03929ed730746b84fe", size = 166393 }, +] + +[[package]] +name = "charset-normalizer" +version = "3.4.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/16/b0/572805e227f01586461c80e0fd25d65a2115599cc9dad142fee4b747c357/charset_normalizer-3.4.1.tar.gz", hash = 
"sha256:44251f18cd68a75b56585dd00dae26183e102cd5e0f9f1466e6df5da2ed64ea3", size = 123188 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0d/58/5580c1716040bc89206c77d8f74418caf82ce519aae06450393ca73475d1/charset_normalizer-3.4.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:91b36a978b5ae0ee86c394f5a54d6ef44db1de0815eb43de826d41d21e4af3de", size = 198013 }, + { url = "https://files.pythonhosted.org/packages/d0/11/00341177ae71c6f5159a08168bcb98c6e6d196d372c94511f9f6c9afe0c6/charset_normalizer-3.4.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7461baadb4dc00fd9e0acbe254e3d7d2112e7f92ced2adc96e54ef6501c5f176", size = 141285 }, + { url = "https://files.pythonhosted.org/packages/01/09/11d684ea5819e5a8f5100fb0b38cf8d02b514746607934134d31233e02c8/charset_normalizer-3.4.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e218488cd232553829be0664c2292d3af2eeeb94b32bea483cf79ac6a694e037", size = 151449 }, + { url = "https://files.pythonhosted.org/packages/08/06/9f5a12939db324d905dc1f70591ae7d7898d030d7662f0d426e2286f68c9/charset_normalizer-3.4.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:80ed5e856eb7f30115aaf94e4a08114ccc8813e6ed1b5efa74f9f82e8509858f", size = 143892 }, + { url = "https://files.pythonhosted.org/packages/93/62/5e89cdfe04584cb7f4d36003ffa2936681b03ecc0754f8e969c2becb7e24/charset_normalizer-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b010a7a4fd316c3c484d482922d13044979e78d1861f0e0650423144c616a46a", size = 146123 }, + { url = "https://files.pythonhosted.org/packages/a9/ac/ab729a15c516da2ab70a05f8722ecfccc3f04ed7a18e45c75bbbaa347d61/charset_normalizer-3.4.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4532bff1b8421fd0a320463030c7520f56a79c9024a4e88f01c537316019005a", size = 147943 }, + { url = "https://files.pythonhosted.org/packages/03/d2/3f392f23f042615689456e9a274640c1d2e5dd1d52de36ab8f7955f8f050/charset_normalizer-3.4.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:d973f03c0cb71c5ed99037b870f2be986c3c05e63622c017ea9816881d2dd247", size = 142063 }, + { url = "https://files.pythonhosted.org/packages/f2/e3/e20aae5e1039a2cd9b08d9205f52142329f887f8cf70da3650326670bddf/charset_normalizer-3.4.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:3a3bd0dcd373514dcec91c411ddb9632c0d7d92aed7093b8c3bbb6d69ca74408", size = 150578 }, + { url = "https://files.pythonhosted.org/packages/8d/af/779ad72a4da0aed925e1139d458adc486e61076d7ecdcc09e610ea8678db/charset_normalizer-3.4.1-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:d9c3cdf5390dcd29aa8056d13e8e99526cda0305acc038b96b30352aff5ff2bb", size = 153629 }, + { url = "https://files.pythonhosted.org/packages/c2/b6/7aa450b278e7aa92cf7732140bfd8be21f5f29d5bf334ae987c945276639/charset_normalizer-3.4.1-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:2bdfe3ac2e1bbe5b59a1a63721eb3b95fc9b6817ae4a46debbb4e11f6232428d", size = 150778 }, + { url = "https://files.pythonhosted.org/packages/39/f4/d9f4f712d0951dcbfd42920d3db81b00dd23b6ab520419626f4023334056/charset_normalizer-3.4.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:eab677309cdb30d047996b36d34caeda1dc91149e4fdca0b1a039b3f79d9a807", size = 146453 }, + { url = "https://files.pythonhosted.org/packages/49/2b/999d0314e4ee0cff3cb83e6bc9aeddd397eeed693edb4facb901eb8fbb69/charset_normalizer-3.4.1-cp310-cp310-win32.whl", hash = 
"sha256:c0429126cf75e16c4f0ad00ee0eae4242dc652290f940152ca8c75c3a4b6ee8f", size = 95479 }, + { url = "https://files.pythonhosted.org/packages/2d/ce/3cbed41cff67e455a386fb5e5dd8906cdda2ed92fbc6297921f2e4419309/charset_normalizer-3.4.1-cp310-cp310-win_amd64.whl", hash = "sha256:9f0b8b1c6d84c8034a44893aba5e767bf9c7a211e313a9605d9c617d7083829f", size = 102790 }, + { url = "https://files.pythonhosted.org/packages/72/80/41ef5d5a7935d2d3a773e3eaebf0a9350542f2cab4eac59a7a4741fbbbbe/charset_normalizer-3.4.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:8bfa33f4f2672964266e940dd22a195989ba31669bd84629f05fab3ef4e2d125", size = 194995 }, + { url = "https://files.pythonhosted.org/packages/7a/28/0b9fefa7b8b080ec492110af6d88aa3dea91c464b17d53474b6e9ba5d2c5/charset_normalizer-3.4.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:28bf57629c75e810b6ae989f03c0828d64d6b26a5e205535585f96093e405ed1", size = 139471 }, + { url = "https://files.pythonhosted.org/packages/71/64/d24ab1a997efb06402e3fc07317e94da358e2585165930d9d59ad45fcae2/charset_normalizer-3.4.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f08ff5e948271dc7e18a35641d2f11a4cd8dfd5634f55228b691e62b37125eb3", size = 149831 }, + { url = "https://files.pythonhosted.org/packages/37/ed/be39e5258e198655240db5e19e0b11379163ad7070962d6b0c87ed2c4d39/charset_normalizer-3.4.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:234ac59ea147c59ee4da87a0c0f098e9c8d169f4dc2a159ef720f1a61bbe27cd", size = 142335 }, + { url = "https://files.pythonhosted.org/packages/88/83/489e9504711fa05d8dde1574996408026bdbdbd938f23be67deebb5eca92/charset_normalizer-3.4.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fd4ec41f914fa74ad1b8304bbc634b3de73d2a0889bd32076342a573e0779e00", size = 143862 }, + { url = "https://files.pythonhosted.org/packages/c6/c7/32da20821cf387b759ad24627a9aca289d2822de929b8a41b6241767b461/charset_normalizer-3.4.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:eea6ee1db730b3483adf394ea72f808b6e18cf3cb6454b4d86e04fa8c4327a12", size = 145673 }, + { url = "https://files.pythonhosted.org/packages/68/85/f4288e96039abdd5aeb5c546fa20a37b50da71b5cf01e75e87f16cd43304/charset_normalizer-3.4.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c96836c97b1238e9c9e3fe90844c947d5afbf4f4c92762679acfe19927d81d77", size = 140211 }, + { url = "https://files.pythonhosted.org/packages/28/a3/a42e70d03cbdabc18997baf4f0227c73591a08041c149e710045c281f97b/charset_normalizer-3.4.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:4d86f7aff21ee58f26dcf5ae81a9addbd914115cdebcbb2217e4f0ed8982e146", size = 148039 }, + { url = "https://files.pythonhosted.org/packages/85/e4/65699e8ab3014ecbe6f5c71d1a55d810fb716bbfd74f6283d5c2aa87febf/charset_normalizer-3.4.1-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:09b5e6733cbd160dcc09589227187e242a30a49ca5cefa5a7edd3f9d19ed53fd", size = 151939 }, + { url = "https://files.pythonhosted.org/packages/b1/82/8e9fe624cc5374193de6860aba3ea8070f584c8565ee77c168ec13274bd2/charset_normalizer-3.4.1-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:5777ee0881f9499ed0f71cc82cf873d9a0ca8af166dfa0af8ec4e675b7df48e6", size = 149075 }, + { url = "https://files.pythonhosted.org/packages/3d/7b/82865ba54c765560c8433f65e8acb9217cb839a9e32b42af4aa8e945870f/charset_normalizer-3.4.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = 
"sha256:237bdbe6159cff53b4f24f397d43c6336c6b0b42affbe857970cefbb620911c8", size = 144340 }, + { url = "https://files.pythonhosted.org/packages/b5/b6/9674a4b7d4d99a0d2df9b215da766ee682718f88055751e1e5e753c82db0/charset_normalizer-3.4.1-cp311-cp311-win32.whl", hash = "sha256:8417cb1f36cc0bc7eaba8ccb0e04d55f0ee52df06df3ad55259b9a323555fc8b", size = 95205 }, + { url = "https://files.pythonhosted.org/packages/1e/ab/45b180e175de4402dcf7547e4fb617283bae54ce35c27930a6f35b6bef15/charset_normalizer-3.4.1-cp311-cp311-win_amd64.whl", hash = "sha256:d7f50a1f8c450f3925cb367d011448c39239bb3eb4117c36a6d354794de4ce76", size = 102441 }, + { url = "https://files.pythonhosted.org/packages/0a/9a/dd1e1cdceb841925b7798369a09279bd1cf183cef0f9ddf15a3a6502ee45/charset_normalizer-3.4.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:73d94b58ec7fecbc7366247d3b0b10a21681004153238750bb67bd9012414545", size = 196105 }, + { url = "https://files.pythonhosted.org/packages/d3/8c/90bfabf8c4809ecb648f39794cf2a84ff2e7d2a6cf159fe68d9a26160467/charset_normalizer-3.4.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dad3e487649f498dd991eeb901125411559b22e8d7ab25d3aeb1af367df5efd7", size = 140404 }, + { url = "https://files.pythonhosted.org/packages/ad/8f/e410d57c721945ea3b4f1a04b74f70ce8fa800d393d72899f0a40526401f/charset_normalizer-3.4.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c30197aa96e8eed02200a83fba2657b4c3acd0f0aa4bdc9f6c1af8e8962e0757", size = 150423 }, + { url = "https://files.pythonhosted.org/packages/f0/b8/e6825e25deb691ff98cf5c9072ee0605dc2acfca98af70c2d1b1bc75190d/charset_normalizer-3.4.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2369eea1ee4a7610a860d88f268eb39b95cb588acd7235e02fd5a5601773d4fa", size = 143184 }, + { url = "https://files.pythonhosted.org/packages/3e/a2/513f6cbe752421f16d969e32f3583762bfd583848b763913ddab8d9bfd4f/charset_normalizer-3.4.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc2722592d8998c870fa4e290c2eec2c1569b87fe58618e67d38b4665dfa680d", size = 145268 }, + { url = "https://files.pythonhosted.org/packages/74/94/8a5277664f27c3c438546f3eb53b33f5b19568eb7424736bdc440a88a31f/charset_normalizer-3.4.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ffc9202a29ab3920fa812879e95a9e78b2465fd10be7fcbd042899695d75e616", size = 147601 }, + { url = "https://files.pythonhosted.org/packages/7c/5f/6d352c51ee763623a98e31194823518e09bfa48be2a7e8383cf691bbb3d0/charset_normalizer-3.4.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:804a4d582ba6e5b747c625bf1255e6b1507465494a40a2130978bda7b932c90b", size = 141098 }, + { url = "https://files.pythonhosted.org/packages/78/d4/f5704cb629ba5ab16d1d3d741396aec6dc3ca2b67757c45b0599bb010478/charset_normalizer-3.4.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:0f55e69f030f7163dffe9fd0752b32f070566451afe180f99dbeeb81f511ad8d", size = 149520 }, + { url = "https://files.pythonhosted.org/packages/c5/96/64120b1d02b81785f222b976c0fb79a35875457fa9bb40827678e54d1bc8/charset_normalizer-3.4.1-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:c4c3e6da02df6fa1410a7680bd3f63d4f710232d3139089536310d027950696a", size = 152852 }, + { url = "https://files.pythonhosted.org/packages/84/c9/98e3732278a99f47d487fd3468bc60b882920cef29d1fa6ca460a1fdf4e6/charset_normalizer-3.4.1-cp312-cp312-musllinux_1_2_s390x.whl", hash = 
"sha256:5df196eb874dae23dcfb968c83d4f8fdccb333330fe1fc278ac5ceeb101003a9", size = 150488 }, + { url = "https://files.pythonhosted.org/packages/13/0e/9c8d4cb99c98c1007cc11eda969ebfe837bbbd0acdb4736d228ccaabcd22/charset_normalizer-3.4.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:e358e64305fe12299a08e08978f51fc21fac060dcfcddd95453eabe5b93ed0e1", size = 146192 }, + { url = "https://files.pythonhosted.org/packages/b2/21/2b6b5b860781a0b49427309cb8670785aa543fb2178de875b87b9cc97746/charset_normalizer-3.4.1-cp312-cp312-win32.whl", hash = "sha256:9b23ca7ef998bc739bf6ffc077c2116917eabcc901f88da1b9856b210ef63f35", size = 95550 }, + { url = "https://files.pythonhosted.org/packages/21/5b/1b390b03b1d16c7e382b561c5329f83cc06623916aab983e8ab9239c7d5c/charset_normalizer-3.4.1-cp312-cp312-win_amd64.whl", hash = "sha256:6ff8a4a60c227ad87030d76e99cd1698345d4491638dfa6673027c48b3cd395f", size = 102785 }, + { url = "https://files.pythonhosted.org/packages/0e/f6/65ecc6878a89bb1c23a086ea335ad4bf21a588990c3f535a227b9eea9108/charset_normalizer-3.4.1-py3-none-any.whl", hash = "sha256:d98b1668f06378c6dbefec3b92299716b931cd4e6061f3c875a71ced1780ab85", size = 49767 }, +] + +[[package]] +name = "colorama" +version = "0.4.6" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335 }, +] + +[[package]] +name = "docker" +version = "7.1.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pywin32", marker = "sys_platform == 'win32'" }, + { name = "requests" }, + { name = "urllib3" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/91/9b/4a2ea29aeba62471211598dac5d96825bb49348fa07e906ea930394a83ce/docker-7.1.0.tar.gz", hash = "sha256:ad8c70e6e3f8926cb8a92619b832b4ea5299e2831c14284663184e200546fa6c", size = 117834 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e3/26/57c6fb270950d476074c087527a558ccb6f4436657314bfb6cdf484114c4/docker-7.1.0-py3-none-any.whl", hash = "sha256:c96b93b7f0a746f9e77d325bcfb87422a3d8bd4f03136ae8a85b37f1898d5fc0", size = 147774 }, +] + +[[package]] +name = "exceptiongroup" +version = "1.2.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/09/35/2495c4ac46b980e4ca1f6ad6db102322ef3ad2410b79fdde159a4b0f3b92/exceptiongroup-1.2.2.tar.gz", hash = "sha256:47c2edf7c6738fafb49fd34290706d1a1a2f4d1c6df275526b62cbb4aa5393cc", size = 28883 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/02/cc/b7e31358aac6ed1ef2bb790a9746ac2c69bcb3c8588b41616914eb106eaf/exceptiongroup-1.2.2-py3-none-any.whl", hash = "sha256:3111b9d131c238bec2f8f516e123e14ba243563fb135d3fe885990585aa7795b", size = 16453 }, +] + +[[package]] +name = "filelock" +version = "3.17.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/dc/9c/0b15fb47b464e1b663b1acd1253a062aa5feecb07d4e597daea542ebd2b5/filelock-3.17.0.tar.gz", hash = "sha256:ee4e77401ef576ebb38cd7f13b9b28893194acc20a8e68e18730ba9c0e54660e", size = 18027 } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/89/ec/00d68c4ddfedfe64159999e5f8a98fb8442729a63e2077eb9dcd89623d27/filelock-3.17.0-py3-none-any.whl", hash = "sha256:533dc2f7ba78dc2f0f531fc6c4940addf7b70a481e269a5a3b93be94ffbe8338", size = 16164 }, +] + +[[package]] +name = "frozenlist" +version = "1.5.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/8f/ed/0f4cec13a93c02c47ec32d81d11c0c1efbadf4a471e3f3ce7cad366cbbd3/frozenlist-1.5.0.tar.gz", hash = "sha256:81d5af29e61b9c8348e876d442253723928dce6433e0e76cd925cd83f1b4b817", size = 39930 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/54/79/29d44c4af36b2b240725dce566b20f63f9b36ef267aaaa64ee7466f4f2f8/frozenlist-1.5.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:5b6a66c18b5b9dd261ca98dffcb826a525334b2f29e7caa54e182255c5f6a65a", size = 94451 }, + { url = "https://files.pythonhosted.org/packages/47/47/0c999aeace6ead8a44441b4f4173e2261b18219e4ad1fe9a479871ca02fc/frozenlist-1.5.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d1b3eb7b05ea246510b43a7e53ed1653e55c2121019a97e60cad7efb881a97bb", size = 54301 }, + { url = "https://files.pythonhosted.org/packages/8d/60/107a38c1e54176d12e06e9d4b5d755b677d71d1219217cee063911b1384f/frozenlist-1.5.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:15538c0cbf0e4fa11d1e3a71f823524b0c46299aed6e10ebb4c2089abd8c3bec", size = 52213 }, + { url = "https://files.pythonhosted.org/packages/17/62/594a6829ac5679c25755362a9dc93486a8a45241394564309641425d3ff6/frozenlist-1.5.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e79225373c317ff1e35f210dd5f1344ff31066ba8067c307ab60254cd3a78ad5", size = 240946 }, + { url = "https://files.pythonhosted.org/packages/7e/75/6c8419d8f92c80dd0ee3f63bdde2702ce6398b0ac8410ff459f9b6f2f9cb/frozenlist-1.5.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9272fa73ca71266702c4c3e2d4a28553ea03418e591e377a03b8e3659d94fa76", size = 264608 }, + { url = "https://files.pythonhosted.org/packages/88/3e/82a6f0b84bc6fb7e0be240e52863c6d4ab6098cd62e4f5b972cd31e002e8/frozenlist-1.5.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:498524025a5b8ba81695761d78c8dd7382ac0b052f34e66939c42df860b8ff17", size = 261361 }, + { url = "https://files.pythonhosted.org/packages/fd/85/14e5f9ccac1b64ff2f10c927b3ffdf88772aea875882406f9ba0cec8ad84/frozenlist-1.5.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:92b5278ed9d50fe610185ecd23c55d8b307d75ca18e94c0e7de328089ac5dcba", size = 231649 }, + { url = "https://files.pythonhosted.org/packages/ee/59/928322800306f6529d1852323014ee9008551e9bb027cc38d276cbc0b0e7/frozenlist-1.5.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7f3c8c1dacd037df16e85227bac13cca58c30da836c6f936ba1df0c05d046d8d", size = 241853 }, + { url = "https://files.pythonhosted.org/packages/7d/bd/e01fa4f146a6f6c18c5d34cab8abdc4013774a26c4ff851128cd1bd3008e/frozenlist-1.5.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:f2ac49a9bedb996086057b75bf93538240538c6d9b38e57c82d51f75a73409d2", size = 243652 }, + { url = "https://files.pythonhosted.org/packages/a5/bd/e4771fd18a8ec6757033f0fa903e447aecc3fbba54e3630397b61596acf0/frozenlist-1.5.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:e66cc454f97053b79c2ab09c17fbe3c825ea6b4de20baf1be28919460dd7877f", size = 241734 }, + { url = 
"https://files.pythonhosted.org/packages/21/13/c83821fa5544af4f60c5d3a65d054af3213c26b14d3f5f48e43e5fb48556/frozenlist-1.5.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:5a3ba5f9a0dfed20337d3e966dc359784c9f96503674c2faf015f7fe8e96798c", size = 260959 }, + { url = "https://files.pythonhosted.org/packages/71/f3/1f91c9a9bf7ed0e8edcf52698d23f3c211d8d00291a53c9f115ceb977ab1/frozenlist-1.5.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:6321899477db90bdeb9299ac3627a6a53c7399c8cd58d25da094007402b039ab", size = 262706 }, + { url = "https://files.pythonhosted.org/packages/4c/22/4a256fdf5d9bcb3ae32622c796ee5ff9451b3a13a68cfe3f68e2c95588ce/frozenlist-1.5.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:76e4753701248476e6286f2ef492af900ea67d9706a0155335a40ea21bf3b2f5", size = 250401 }, + { url = "https://files.pythonhosted.org/packages/af/89/c48ebe1f7991bd2be6d5f4ed202d94960c01b3017a03d6954dd5fa9ea1e8/frozenlist-1.5.0-cp310-cp310-win32.whl", hash = "sha256:977701c081c0241d0955c9586ffdd9ce44f7a7795df39b9151cd9a6fd0ce4cfb", size = 45498 }, + { url = "https://files.pythonhosted.org/packages/28/2f/cc27d5f43e023d21fe5c19538e08894db3d7e081cbf582ad5ed366c24446/frozenlist-1.5.0-cp310-cp310-win_amd64.whl", hash = "sha256:189f03b53e64144f90990d29a27ec4f7997d91ed3d01b51fa39d2dbe77540fd4", size = 51622 }, + { url = "https://files.pythonhosted.org/packages/79/43/0bed28bf5eb1c9e4301003b74453b8e7aa85fb293b31dde352aac528dafc/frozenlist-1.5.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:fd74520371c3c4175142d02a976aee0b4cb4a7cc912a60586ffd8d5929979b30", size = 94987 }, + { url = "https://files.pythonhosted.org/packages/bb/bf/b74e38f09a246e8abbe1e90eb65787ed745ccab6eaa58b9c9308e052323d/frozenlist-1.5.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:2f3f7a0fbc219fb4455264cae4d9f01ad41ae6ee8524500f381de64ffaa077d5", size = 54584 }, + { url = "https://files.pythonhosted.org/packages/2c/31/ab01375682f14f7613a1ade30149f684c84f9b8823a4391ed950c8285656/frozenlist-1.5.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f47c9c9028f55a04ac254346e92977bf0f166c483c74b4232bee19a6697e4778", size = 52499 }, + { url = "https://files.pythonhosted.org/packages/98/a8/d0ac0b9276e1404f58fec3ab6e90a4f76b778a49373ccaf6a563f100dfbc/frozenlist-1.5.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0996c66760924da6e88922756d99b47512a71cfd45215f3570bf1e0b694c206a", size = 276357 }, + { url = "https://files.pythonhosted.org/packages/ad/c9/c7761084fa822f07dac38ac29f841d4587570dd211e2262544aa0b791d21/frozenlist-1.5.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a2fe128eb4edeabe11896cb6af88fca5346059f6c8d807e3b910069f39157869", size = 287516 }, + { url = "https://files.pythonhosted.org/packages/a1/ff/cd7479e703c39df7bdab431798cef89dc75010d8aa0ca2514c5b9321db27/frozenlist-1.5.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1a8ea951bbb6cacd492e3948b8da8c502a3f814f5d20935aae74b5df2b19cf3d", size = 283131 }, + { url = "https://files.pythonhosted.org/packages/59/a0/370941beb47d237eca4fbf27e4e91389fd68699e6f4b0ebcc95da463835b/frozenlist-1.5.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:de537c11e4aa01d37db0d403b57bd6f0546e71a82347a97c6a9f0dcc532b3a45", size = 261320 }, + { url = 
"https://files.pythonhosted.org/packages/b8/5f/c10123e8d64867bc9b4f2f510a32042a306ff5fcd7e2e09e5ae5100ee333/frozenlist-1.5.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9c2623347b933fcb9095841f1cc5d4ff0b278addd743e0e966cb3d460278840d", size = 274877 }, + { url = "https://files.pythonhosted.org/packages/fa/79/38c505601ae29d4348f21706c5d89755ceded02a745016ba2f58bd5f1ea6/frozenlist-1.5.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:cee6798eaf8b1416ef6909b06f7dc04b60755206bddc599f52232606e18179d3", size = 269592 }, + { url = "https://files.pythonhosted.org/packages/19/e2/39f3a53191b8204ba9f0bb574b926b73dd2efba2a2b9d2d730517e8f7622/frozenlist-1.5.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:f5f9da7f5dbc00a604fe74aa02ae7c98bcede8a3b8b9666f9f86fc13993bc71a", size = 265934 }, + { url = "https://files.pythonhosted.org/packages/d5/c9/3075eb7f7f3a91f1a6b00284af4de0a65a9ae47084930916f5528144c9dd/frozenlist-1.5.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:90646abbc7a5d5c7c19461d2e3eeb76eb0b204919e6ece342feb6032c9325ae9", size = 283859 }, + { url = "https://files.pythonhosted.org/packages/05/f5/549f44d314c29408b962fa2b0e69a1a67c59379fb143b92a0a065ffd1f0f/frozenlist-1.5.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:bdac3c7d9b705d253b2ce370fde941836a5f8b3c5c2b8fd70940a3ea3af7f4f2", size = 287560 }, + { url = "https://files.pythonhosted.org/packages/9d/f8/cb09b3c24a3eac02c4c07a9558e11e9e244fb02bf62c85ac2106d1eb0c0b/frozenlist-1.5.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:03d33c2ddbc1816237a67f66336616416e2bbb6beb306e5f890f2eb22b959cdf", size = 277150 }, + { url = "https://files.pythonhosted.org/packages/37/48/38c2db3f54d1501e692d6fe058f45b6ad1b358d82cd19436efab80cfc965/frozenlist-1.5.0-cp311-cp311-win32.whl", hash = "sha256:237f6b23ee0f44066219dae14c70ae38a63f0440ce6750f868ee08775073f942", size = 45244 }, + { url = "https://files.pythonhosted.org/packages/ca/8c/2ddffeb8b60a4bce3b196c32fcc30d8830d4615e7b492ec2071da801b8ad/frozenlist-1.5.0-cp311-cp311-win_amd64.whl", hash = "sha256:0cc974cc93d32c42e7b0f6cf242a6bd941c57c61b618e78b6c0a96cb72788c1d", size = 51634 }, + { url = "https://files.pythonhosted.org/packages/79/73/fa6d1a96ab7fd6e6d1c3500700963eab46813847f01ef0ccbaa726181dd5/frozenlist-1.5.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:31115ba75889723431aa9a4e77d5f398f5cf976eea3bdf61749731f62d4a4a21", size = 94026 }, + { url = "https://files.pythonhosted.org/packages/ab/04/ea8bf62c8868b8eada363f20ff1b647cf2e93377a7b284d36062d21d81d1/frozenlist-1.5.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:7437601c4d89d070eac8323f121fcf25f88674627505334654fd027b091db09d", size = 54150 }, + { url = "https://files.pythonhosted.org/packages/d0/9a/8e479b482a6f2070b26bda572c5e6889bb3ba48977e81beea35b5ae13ece/frozenlist-1.5.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:7948140d9f8ece1745be806f2bfdf390127cf1a763b925c4a805c603df5e697e", size = 51927 }, + { url = "https://files.pythonhosted.org/packages/e3/12/2aad87deb08a4e7ccfb33600871bbe8f0e08cb6d8224371387f3303654d7/frozenlist-1.5.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:feeb64bc9bcc6b45c6311c9e9b99406660a9c05ca8a5b30d14a78555088b0b3a", size = 282647 }, + { url = "https://files.pythonhosted.org/packages/77/f2/07f06b05d8a427ea0060a9cef6e63405ea9e0d761846b95ef3fb3be57111/frozenlist-1.5.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:683173d371daad49cffb8309779e886e59c2f369430ad28fe715f66d08d4ab1a", size = 289052 }, + { url = "https://files.pythonhosted.org/packages/bd/9f/8bf45a2f1cd4aa401acd271b077989c9267ae8463e7c8b1eb0d3f561b65e/frozenlist-1.5.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7d57d8f702221405a9d9b40f9da8ac2e4a1a8b5285aac6100f3393675f0a85ee", size = 291719 }, + { url = "https://files.pythonhosted.org/packages/41/d1/1f20fd05a6c42d3868709b7604c9f15538a29e4f734c694c6bcfc3d3b935/frozenlist-1.5.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:30c72000fbcc35b129cb09956836c7d7abf78ab5416595e4857d1cae8d6251a6", size = 267433 }, + { url = "https://files.pythonhosted.org/packages/af/f2/64b73a9bb86f5a89fb55450e97cd5c1f84a862d4ff90d9fd1a73ab0f64a5/frozenlist-1.5.0-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:000a77d6034fbad9b6bb880f7ec073027908f1b40254b5d6f26210d2dab1240e", size = 283591 }, + { url = "https://files.pythonhosted.org/packages/29/e2/ffbb1fae55a791fd6c2938dd9ea779509c977435ba3940b9f2e8dc9d5316/frozenlist-1.5.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:5d7f5a50342475962eb18b740f3beecc685a15b52c91f7d975257e13e029eca9", size = 273249 }, + { url = "https://files.pythonhosted.org/packages/2e/6e/008136a30798bb63618a114b9321b5971172a5abddff44a100c7edc5ad4f/frozenlist-1.5.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:87f724d055eb4785d9be84e9ebf0f24e392ddfad00b3fe036e43f489fafc9039", size = 271075 }, + { url = "https://files.pythonhosted.org/packages/ae/f0/4e71e54a026b06724cec9b6c54f0b13a4e9e298cc8db0f82ec70e151f5ce/frozenlist-1.5.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:6e9080bb2fb195a046e5177f10d9d82b8a204c0736a97a153c2466127de87784", size = 285398 }, + { url = "https://files.pythonhosted.org/packages/4d/36/70ec246851478b1c0b59f11ef8ade9c482ff447c1363c2bd5fad45098b12/frozenlist-1.5.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:9b93d7aaa36c966fa42efcaf716e6b3900438632a626fb09c049f6a2f09fc631", size = 294445 }, + { url = "https://files.pythonhosted.org/packages/37/e0/47f87544055b3349b633a03c4d94b405956cf2437f4ab46d0928b74b7526/frozenlist-1.5.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:52ef692a4bc60a6dd57f507429636c2af8b6046db8b31b18dac02cbc8f507f7f", size = 280569 }, + { url = "https://files.pythonhosted.org/packages/f9/7c/490133c160fb6b84ed374c266f42800e33b50c3bbab1652764e6e1fc498a/frozenlist-1.5.0-cp312-cp312-win32.whl", hash = "sha256:29d94c256679247b33a3dc96cce0f93cbc69c23bf75ff715919332fdbb6a32b8", size = 44721 }, + { url = "https://files.pythonhosted.org/packages/b1/56/4e45136ffc6bdbfa68c29ca56ef53783ef4c2fd395f7cbf99a2624aa9aaa/frozenlist-1.5.0-cp312-cp312-win_amd64.whl", hash = "sha256:8969190d709e7c48ea386db202d708eb94bdb29207a1f269bab1196ce0dcca1f", size = 51329 }, + { url = "https://files.pythonhosted.org/packages/c6/c8/a5be5b7550c10858fcf9b0ea054baccab474da77d37f1e828ce043a3a5d4/frozenlist-1.5.0-py3-none-any.whl", hash = "sha256:d994863bba198a4a518b467bb971c56e1db3f180a25c6cf7bb1949c267f748c3", size = 11901 }, +] + +[[package]] +name = "fsspec" +version = "2025.2.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/b5/79/68612ed99700e6413de42895aa725463e821a6b3be75c87fcce1b4af4c70/fsspec-2025.2.0.tar.gz", hash = "sha256:1c24b16eaa0a1798afa0337aa0db9b256718ab2a89c425371f5628d22c3b6afd", size = 292283 } +wheels = [ + { url 
= "https://files.pythonhosted.org/packages/e2/94/758680531a00d06e471ef649e4ec2ed6bf185356a7f9fbfbb7368a40bd49/fsspec-2025.2.0-py3-none-any.whl", hash = "sha256:9de2ad9ce1f85e1931858535bc882543171d197001a0a5eb2ddc04f1781ab95b", size = 184484 }, +] + +[[package]] +name = "huggingface-hub" +version = "0.29.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "filelock" }, + { name = "fsspec" }, + { name = "packaging" }, + { name = "pyyaml" }, + { name = "requests" }, + { name = "tqdm" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/e2/ac/9f7010c8b050d80b64bfddcc09ef4a4450ae4369940d1b01fa13f5d083de/huggingface_hub-0.29.0.tar.gz", hash = "sha256:64034c852be270cac16c5743fe1f659b14515a9de6342d6f42cbb2ede191fc80", size = 389753 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2a/4d/8092df2cb0cafa9fcaf691db851b2fccfe9cad4048e081436bbbdf56e4e1/huggingface_hub-0.29.0-py3-none-any.whl", hash = "sha256:c02daa0b6bafbdacb1320fdfd1dc7151d0940825c88c4ef89837fdb1f6ea0afe", size = 468012 }, +] + +[[package]] +name = "idna" +version = "3.10" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f1/70/7703c29685631f5a7590aa73f1f1d3fa9a380e654b86af429e0934a32f7d/idna-3.10.tar.gz", hash = "sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9", size = 190490 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3", size = 70442 }, +] + +[[package]] +name = "iniconfig" +version = "2.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d7/4b/cbd8e699e64a6f16ca3a8220661b5f83792b3017d0f79807cb8708d33913/iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3", size = 4646 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ef/a6/62565a6e1cf69e10f5727360368e451d4b7f58beeac6173dc9db836a5b46/iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374", size = 5892 }, +] + +[[package]] +name = "multidict" +version = "6.1.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "typing-extensions", marker = "python_full_version < '3.11'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/d6/be/504b89a5e9ca731cd47487e91c469064f8ae5af93b7259758dcfc2b9c848/multidict-6.1.0.tar.gz", hash = "sha256:22ae2ebf9b0c69d206c003e2f6a914ea33f0a932d4aa16f236afc049d9958f4a", size = 64002 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/29/68/259dee7fd14cf56a17c554125e534f6274c2860159692a414d0b402b9a6d/multidict-6.1.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:3380252550e372e8511d49481bd836264c009adb826b23fefcc5dd3c69692f60", size = 48628 }, + { url = "https://files.pythonhosted.org/packages/50/79/53ba256069fe5386a4a9e80d4e12857ced9de295baf3e20c68cdda746e04/multidict-6.1.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:99f826cbf970077383d7de805c0681799491cb939c25450b9b5b3ced03ca99f1", size = 29327 }, + { url = "https://files.pythonhosted.org/packages/ff/10/71f1379b05b196dae749b5ac062e87273e3f11634f447ebac12a571d90ae/multidict-6.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = 
"sha256:a114d03b938376557927ab23f1e950827c3b893ccb94b62fd95d430fd0e5cf53", size = 29689 }, + { url = "https://files.pythonhosted.org/packages/71/45/70bac4f87438ded36ad4793793c0095de6572d433d98575a5752629ef549/multidict-6.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b1c416351ee6271b2f49b56ad7f308072f6f44b37118d69c2cad94f3fa8a40d5", size = 126639 }, + { url = "https://files.pythonhosted.org/packages/80/cf/17f35b3b9509b4959303c05379c4bfb0d7dd05c3306039fc79cf035bbac0/multidict-6.1.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6b5d83030255983181005e6cfbac1617ce9746b219bc2aad52201ad121226581", size = 134315 }, + { url = "https://files.pythonhosted.org/packages/ef/1f/652d70ab5effb33c031510a3503d4d6efc5ec93153562f1ee0acdc895a57/multidict-6.1.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3e97b5e938051226dc025ec80980c285b053ffb1e25a3db2a3aa3bc046bf7f56", size = 129471 }, + { url = "https://files.pythonhosted.org/packages/a6/64/2dd6c4c681688c0165dea3975a6a4eab4944ea30f35000f8b8af1df3148c/multidict-6.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d618649d4e70ac6efcbba75be98b26ef5078faad23592f9b51ca492953012429", size = 124585 }, + { url = "https://files.pythonhosted.org/packages/87/56/e6ee5459894c7e554b57ba88f7257dc3c3d2d379cb15baaa1e265b8c6165/multidict-6.1.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:10524ebd769727ac77ef2278390fb0068d83f3acb7773792a5080f2b0abf7748", size = 116957 }, + { url = "https://files.pythonhosted.org/packages/36/9e/616ce5e8d375c24b84f14fc263c7ef1d8d5e8ef529dbc0f1df8ce71bb5b8/multidict-6.1.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:ff3827aef427c89a25cc96ded1759271a93603aba9fb977a6d264648ebf989db", size = 128609 }, + { url = "https://files.pythonhosted.org/packages/8c/4f/4783e48a38495d000f2124020dc96bacc806a4340345211b1ab6175a6cb4/multidict-6.1.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:06809f4f0f7ab7ea2cabf9caca7d79c22c0758b58a71f9d32943ae13c7ace056", size = 123016 }, + { url = "https://files.pythonhosted.org/packages/3e/b3/4950551ab8fc39862ba5e9907dc821f896aa829b4524b4deefd3e12945ab/multidict-6.1.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:f179dee3b863ab1c59580ff60f9d99f632f34ccb38bf67a33ec6b3ecadd0fd76", size = 133542 }, + { url = "https://files.pythonhosted.org/packages/96/4d/f0ce6ac9914168a2a71df117935bb1f1781916acdecbb43285e225b484b8/multidict-6.1.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:aaed8b0562be4a0876ee3b6946f6869b7bcdb571a5d1496683505944e268b160", size = 130163 }, + { url = "https://files.pythonhosted.org/packages/be/72/17c9f67e7542a49dd252c5ae50248607dfb780bcc03035907dafefb067e3/multidict-6.1.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:3c8b88a2ccf5493b6c8da9076fb151ba106960a2df90c2633f342f120751a9e7", size = 126832 }, + { url = "https://files.pythonhosted.org/packages/71/9f/72d719e248cbd755c8736c6d14780533a1606ffb3fbb0fbd77da9f0372da/multidict-6.1.0-cp310-cp310-win32.whl", hash = "sha256:4a9cb68166a34117d6646c0023c7b759bf197bee5ad4272f420a0141d7eb03a0", size = 26402 }, + { url = "https://files.pythonhosted.org/packages/04/5a/d88cd5d00a184e1ddffc82aa2e6e915164a6d2641ed3606e766b5d2f275a/multidict-6.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:20b9b5fbe0b88d0bdef2012ef7dee867f874b72528cf1d08f1d59b0e3850129d", size = 28800 }, + { url = 
"https://files.pythonhosted.org/packages/93/13/df3505a46d0cd08428e4c8169a196131d1b0c4b515c3649829258843dde6/multidict-6.1.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:3efe2c2cb5763f2f1b275ad2bf7a287d3f7ebbef35648a9726e3b69284a4f3d6", size = 48570 }, + { url = "https://files.pythonhosted.org/packages/f0/e1/a215908bfae1343cdb72f805366592bdd60487b4232d039c437fe8f5013d/multidict-6.1.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c7053d3b0353a8b9de430a4f4b4268ac9a4fb3481af37dfe49825bf45ca24156", size = 29316 }, + { url = "https://files.pythonhosted.org/packages/70/0f/6dc70ddf5d442702ed74f298d69977f904960b82368532c88e854b79f72b/multidict-6.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:27e5fc84ccef8dfaabb09d82b7d179c7cf1a3fbc8a966f8274fcb4ab2eb4cadb", size = 29640 }, + { url = "https://files.pythonhosted.org/packages/d8/6d/9c87b73a13d1cdea30b321ef4b3824449866bd7f7127eceed066ccb9b9ff/multidict-6.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0e2b90b43e696f25c62656389d32236e049568b39320e2735d51f08fd362761b", size = 131067 }, + { url = "https://files.pythonhosted.org/packages/cc/1e/1b34154fef373371fd6c65125b3d42ff5f56c7ccc6bfff91b9b3c60ae9e0/multidict-6.1.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d83a047959d38a7ff552ff94be767b7fd79b831ad1cd9920662db05fec24fe72", size = 138507 }, + { url = "https://files.pythonhosted.org/packages/fb/e0/0bc6b2bac6e461822b5f575eae85da6aae76d0e2a79b6665d6206b8e2e48/multidict-6.1.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d1a9dd711d0877a1ece3d2e4fea11a8e75741ca21954c919406b44e7cf971304", size = 133905 }, + { url = "https://files.pythonhosted.org/packages/ba/af/73d13b918071ff9b2205fcf773d316e0f8fefb4ec65354bbcf0b10908cc6/multidict-6.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ec2abea24d98246b94913b76a125e855eb5c434f7c46546046372fe60f666351", size = 129004 }, + { url = "https://files.pythonhosted.org/packages/74/21/23960627b00ed39643302d81bcda44c9444ebcdc04ee5bedd0757513f259/multidict-6.1.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4867cafcbc6585e4b678876c489b9273b13e9fff9f6d6d66add5e15d11d926cb", size = 121308 }, + { url = "https://files.pythonhosted.org/packages/8b/5c/cf282263ffce4a596ed0bb2aa1a1dddfe1996d6a62d08842a8d4b33dca13/multidict-6.1.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:5b48204e8d955c47c55b72779802b219a39acc3ee3d0116d5080c388970b76e3", size = 132608 }, + { url = "https://files.pythonhosted.org/packages/d7/3e/97e778c041c72063f42b290888daff008d3ab1427f5b09b714f5a8eff294/multidict-6.1.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:d8fff389528cad1618fb4b26b95550327495462cd745d879a8c7c2115248e399", size = 127029 }, + { url = "https://files.pythonhosted.org/packages/47/ac/3efb7bfe2f3aefcf8d103e9a7162572f01936155ab2f7ebcc7c255a23212/multidict-6.1.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:a7a9541cd308eed5e30318430a9c74d2132e9a8cb46b901326272d780bf2d423", size = 137594 }, + { url = "https://files.pythonhosted.org/packages/42/9b/6c6e9e8dc4f915fc90a9b7798c44a30773dea2995fdcb619870e705afe2b/multidict-6.1.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:da1758c76f50c39a2efd5e9859ce7d776317eb1dd34317c8152ac9251fc574a3", size = 134556 }, + { url = 
"https://files.pythonhosted.org/packages/1d/10/8e881743b26aaf718379a14ac58572a240e8293a1c9d68e1418fb11c0f90/multidict-6.1.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:c943a53e9186688b45b323602298ab727d8865d8c9ee0b17f8d62d14b56f0753", size = 130993 }, + { url = "https://files.pythonhosted.org/packages/45/84/3eb91b4b557442802d058a7579e864b329968c8d0ea57d907e7023c677f2/multidict-6.1.0-cp311-cp311-win32.whl", hash = "sha256:90f8717cb649eea3504091e640a1b8568faad18bd4b9fcd692853a04475a4b80", size = 26405 }, + { url = "https://files.pythonhosted.org/packages/9f/0b/ad879847ecbf6d27e90a6eabb7eff6b62c129eefe617ea45eae7c1f0aead/multidict-6.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:82176036e65644a6cc5bd619f65f6f19781e8ec2e5330f51aa9ada7504cc1926", size = 28795 }, + { url = "https://files.pythonhosted.org/packages/fd/16/92057c74ba3b96d5e211b553895cd6dc7cc4d1e43d9ab8fafc727681ef71/multidict-6.1.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:b04772ed465fa3cc947db808fa306d79b43e896beb677a56fb2347ca1a49c1fa", size = 48713 }, + { url = "https://files.pythonhosted.org/packages/94/3d/37d1b8893ae79716179540b89fc6a0ee56b4a65fcc0d63535c6f5d96f217/multidict-6.1.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:6180c0ae073bddeb5a97a38c03f30c233e0a4d39cd86166251617d1bbd0af436", size = 29516 }, + { url = "https://files.pythonhosted.org/packages/a2/12/adb6b3200c363062f805275b4c1e656be2b3681aada66c80129932ff0bae/multidict-6.1.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:071120490b47aa997cca00666923a83f02c7fbb44f71cf7f136df753f7fa8761", size = 29557 }, + { url = "https://files.pythonhosted.org/packages/47/e9/604bb05e6e5bce1e6a5cf80a474e0f072e80d8ac105f1b994a53e0b28c42/multidict-6.1.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:50b3a2710631848991d0bf7de077502e8994c804bb805aeb2925a981de58ec2e", size = 130170 }, + { url = "https://files.pythonhosted.org/packages/7e/13/9efa50801785eccbf7086b3c83b71a4fb501a4d43549c2f2f80b8787d69f/multidict-6.1.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b58c621844d55e71c1b7f7c498ce5aa6985d743a1a59034c57a905b3f153c1ef", size = 134836 }, + { url = "https://files.pythonhosted.org/packages/bf/0f/93808b765192780d117814a6dfcc2e75de6dcc610009ad408b8814dca3ba/multidict-6.1.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:55b6d90641869892caa9ca42ff913f7ff1c5ece06474fbd32fb2cf6834726c95", size = 133475 }, + { url = "https://files.pythonhosted.org/packages/d3/c8/529101d7176fe7dfe1d99604e48d69c5dfdcadb4f06561f465c8ef12b4df/multidict-6.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4b820514bfc0b98a30e3d85462084779900347e4d49267f747ff54060cc33925", size = 131049 }, + { url = "https://files.pythonhosted.org/packages/ca/0c/fc85b439014d5a58063e19c3a158a889deec399d47b5269a0f3b6a2e28bc/multidict-6.1.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:10a9b09aba0c5b48c53761b7c720aaaf7cf236d5fe394cd399c7ba662d5f9966", size = 120370 }, + { url = "https://files.pythonhosted.org/packages/db/46/d4416eb20176492d2258fbd47b4abe729ff3b6e9c829ea4236f93c865089/multidict-6.1.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:1e16bf3e5fc9f44632affb159d30a437bfe286ce9e02754759be5536b169b305", size = 125178 }, + { url = "https://files.pythonhosted.org/packages/5b/46/73697ad7ec521df7de5531a32780bbfd908ded0643cbe457f981a701457c/multidict-6.1.0-cp312-cp312-musllinux_1_2_i686.whl", hash = 
"sha256:76f364861c3bfc98cbbcbd402d83454ed9e01a5224bb3a28bf70002a230f73e2", size = 119567 }, + { url = "https://files.pythonhosted.org/packages/cd/ed/51f060e2cb0e7635329fa6ff930aa5cffa17f4c7f5c6c3ddc3500708e2f2/multidict-6.1.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:820c661588bd01a0aa62a1283f20d2be4281b086f80dad9e955e690c75fb54a2", size = 129822 }, + { url = "https://files.pythonhosted.org/packages/df/9e/ee7d1954b1331da3eddea0c4e08d9142da5f14b1321c7301f5014f49d492/multidict-6.1.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:0e5f362e895bc5b9e67fe6e4ded2492d8124bdf817827f33c5b46c2fe3ffaca6", size = 128656 }, + { url = "https://files.pythonhosted.org/packages/77/00/8538f11e3356b5d95fa4b024aa566cde7a38aa7a5f08f4912b32a037c5dc/multidict-6.1.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:3ec660d19bbc671e3a6443325f07263be452c453ac9e512f5eb935e7d4ac28b3", size = 125360 }, + { url = "https://files.pythonhosted.org/packages/be/05/5d334c1f2462d43fec2363cd00b1c44c93a78c3925d952e9a71caf662e96/multidict-6.1.0-cp312-cp312-win32.whl", hash = "sha256:58130ecf8f7b8112cdb841486404f1282b9c86ccb30d3519faf301b2e5659133", size = 26382 }, + { url = "https://files.pythonhosted.org/packages/a3/bf/f332a13486b1ed0496d624bcc7e8357bb8053823e8cd4b9a18edc1d97e73/multidict-6.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:188215fc0aafb8e03341995e7c4797860181562380f81ed0a87ff455b70bf1f1", size = 28529 }, + { url = "https://files.pythonhosted.org/packages/99/b7/b9e70fde2c0f0c9af4cc5277782a89b66d35948ea3369ec9f598358c3ac5/multidict-6.1.0-py3-none-any.whl", hash = "sha256:48e171e52d1c4d33888e529b999e5900356b9ae588c2f09a52dcefb158b27506", size = 10051 }, +] + +[[package]] +name = "numpy" +version = "2.2.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/fb/90/8956572f5c4ae52201fdec7ba2044b2c882832dcec7d5d0922c9e9acf2de/numpy-2.2.3.tar.gz", hash = "sha256:dbdc15f0c81611925f382dfa97b3bd0bc2c1ce19d4fe50482cb0ddc12ba30020", size = 20262700 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/5e/e1/1816d5d527fa870b260a1c2c5904d060caad7515637bd54f495a5ce13ccd/numpy-2.2.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:cbc6472e01952d3d1b2772b720428f8b90e2deea8344e854df22b0618e9cce71", size = 21232911 }, + { url = "https://files.pythonhosted.org/packages/29/46/9f25dc19b359f10c0e52b6bac25d3181eb1f4b4d04c9846a32cf5ea52762/numpy-2.2.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:cdfe0c22692a30cd830c0755746473ae66c4a8f2e7bd508b35fb3b6a0813d787", size = 14371955 }, + { url = "https://files.pythonhosted.org/packages/72/d7/de941296e6b09a5c81d3664ad912f1496a0ecdd2f403318e5e35604ff70f/numpy-2.2.3-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:e37242f5324ffd9f7ba5acf96d774f9276aa62a966c0bad8dae692deebec7716", size = 5410476 }, + { url = "https://files.pythonhosted.org/packages/36/ce/55f685995110f8a268fdca0f198c9a84fa87b39512830965cc1087af6391/numpy-2.2.3-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:95172a21038c9b423e68be78fd0be6e1b97674cde269b76fe269a5dfa6fadf0b", size = 6945730 }, + { url = "https://files.pythonhosted.org/packages/4f/84/abdb9f6e22576d89c259401c3234d4755b322539491bbcffadc8bcb120d3/numpy-2.2.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d5b47c440210c5d1d67e1cf434124e0b5c395eee1f5806fdd89b553ed1acd0a3", size = 14350752 }, + { url = 
"https://files.pythonhosted.org/packages/e9/88/3870cfa9bef4dffb3a326507f430e6007eeac258ebeef6b76fc542aef66d/numpy-2.2.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0391ea3622f5c51a2e29708877d56e3d276827ac5447d7f45e9bc4ade8923c52", size = 16399386 }, + { url = "https://files.pythonhosted.org/packages/02/10/3f629682dd0b457525c131945329c4e81e2dadeb11256e6ce4c9a1a6fb41/numpy-2.2.3-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:f6b3dfc7661f8842babd8ea07e9897fe3d9b69a1d7e5fbb743e4160f9387833b", size = 15561826 }, + { url = "https://files.pythonhosted.org/packages/da/18/fd35673ba9751eba449d4ce5d24d94e3b612cdbfba79348da71488c0b7ac/numpy-2.2.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:1ad78ce7f18ce4e7df1b2ea4019b5817a2f6a8a16e34ff2775f646adce0a5027", size = 18188593 }, + { url = "https://files.pythonhosted.org/packages/ce/4c/c0f897b580ea59484b4cc96a441fea50333b26675a60a1421bc912268b5f/numpy-2.2.3-cp310-cp310-win32.whl", hash = "sha256:5ebeb7ef54a7be11044c33a17b2624abe4307a75893c001a4800857956b41094", size = 6590421 }, + { url = "https://files.pythonhosted.org/packages/e5/5b/aaabbfc7060c5c8f0124c5deb5e114a3b413a548bbc64e372c5b5db36165/numpy-2.2.3-cp310-cp310-win_amd64.whl", hash = "sha256:596140185c7fa113563c67c2e894eabe0daea18cf8e33851738c19f70ce86aeb", size = 12925667 }, + { url = "https://files.pythonhosted.org/packages/96/86/453aa3949eab6ff54e2405f9cb0c01f756f031c3dc2a6d60a1d40cba5488/numpy-2.2.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:16372619ee728ed67a2a606a614f56d3eabc5b86f8b615c79d01957062826ca8", size = 21237256 }, + { url = "https://files.pythonhosted.org/packages/20/c3/93ecceadf3e155d6a9e4464dd2392d8d80cf436084c714dc8535121c83e8/numpy-2.2.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:5521a06a3148686d9269c53b09f7d399a5725c47bbb5b35747e1cb76326b714b", size = 14408049 }, + { url = "https://files.pythonhosted.org/packages/8d/29/076999b69bd9264b8df5e56f2be18da2de6b2a2d0e10737e5307592e01de/numpy-2.2.3-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:7c8dde0ca2f77828815fd1aedfdf52e59071a5bae30dac3b4da2a335c672149a", size = 5408655 }, + { url = "https://files.pythonhosted.org/packages/e2/a7/b14f0a73eb0fe77cb9bd5b44534c183b23d4229c099e339c522724b02678/numpy-2.2.3-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:77974aba6c1bc26e3c205c2214f0d5b4305bdc719268b93e768ddb17e3fdd636", size = 6949996 }, + { url = "https://files.pythonhosted.org/packages/72/2f/8063da0616bb0f414b66dccead503bd96e33e43685c820e78a61a214c098/numpy-2.2.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d42f9c36d06440e34226e8bd65ff065ca0963aeecada587b937011efa02cdc9d", size = 14355789 }, + { url = "https://files.pythonhosted.org/packages/e6/d7/3cd47b00b8ea95ab358c376cf5602ad21871410950bc754cf3284771f8b6/numpy-2.2.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f2712c5179f40af9ddc8f6727f2bd910ea0eb50206daea75f58ddd9fa3f715bb", size = 16411356 }, + { url = "https://files.pythonhosted.org/packages/27/c0/a2379e202acbb70b85b41483a422c1e697ff7eee74db642ca478de4ba89f/numpy-2.2.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c8b0451d2ec95010d1db8ca733afc41f659f425b7f608af569711097fd6014e2", size = 15576770 }, + { url = "https://files.pythonhosted.org/packages/bc/63/a13ee650f27b7999e5b9e1964ae942af50bb25606d088df4229283eda779/numpy-2.2.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:d9b4a8148c57ecac25a16b0e11798cbe88edf5237b0df99973687dd866f05e1b", size = 18200483 }, + { url = 
"https://files.pythonhosted.org/packages/4c/87/e71f89935e09e8161ac9c590c82f66d2321eb163893a94af749dfa8a3cf8/numpy-2.2.3-cp311-cp311-win32.whl", hash = "sha256:1f45315b2dc58d8a3e7754fe4e38b6fce132dab284a92851e41b2b344f6441c5", size = 6588415 }, + { url = "https://files.pythonhosted.org/packages/b9/c6/cd4298729826af9979c5f9ab02fcaa344b82621e7c49322cd2d210483d3f/numpy-2.2.3-cp311-cp311-win_amd64.whl", hash = "sha256:9f48ba6f6c13e5e49f3d3efb1b51c8193215c42ac82610a04624906a9270be6f", size = 12929604 }, + { url = "https://files.pythonhosted.org/packages/43/ec/43628dcf98466e087812142eec6d1c1a6c6bdfdad30a0aa07b872dc01f6f/numpy-2.2.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:12c045f43b1d2915eca6b880a7f4a256f59d62df4f044788c8ba67709412128d", size = 20929458 }, + { url = "https://files.pythonhosted.org/packages/9b/c0/2f4225073e99a5c12350954949ed19b5d4a738f541d33e6f7439e33e98e4/numpy-2.2.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:87eed225fd415bbae787f93a457af7f5990b92a334e346f72070bf569b9c9c95", size = 14115299 }, + { url = "https://files.pythonhosted.org/packages/ca/fa/d2c5575d9c734a7376cc1592fae50257ec95d061b27ee3dbdb0b3b551eb2/numpy-2.2.3-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:712a64103d97c404e87d4d7c47fb0c7ff9acccc625ca2002848e0d53288b90ea", size = 5145723 }, + { url = "https://files.pythonhosted.org/packages/eb/dc/023dad5b268a7895e58e791f28dc1c60eb7b6c06fcbc2af8538ad069d5f3/numpy-2.2.3-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:a5ae282abe60a2db0fd407072aff4599c279bcd6e9a2475500fc35b00a57c532", size = 6678797 }, + { url = "https://files.pythonhosted.org/packages/3f/19/bcd641ccf19ac25abb6fb1dcd7744840c11f9d62519d7057b6ab2096eb60/numpy-2.2.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5266de33d4c3420973cf9ae3b98b54a2a6d53a559310e3236c4b2b06b9c07d4e", size = 14067362 }, + { url = "https://files.pythonhosted.org/packages/39/04/78d2e7402fb479d893953fb78fa7045f7deb635ec095b6b4f0260223091a/numpy-2.2.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3b787adbf04b0db1967798dba8da1af07e387908ed1553a0d6e74c084d1ceafe", size = 16116679 }, + { url = "https://files.pythonhosted.org/packages/d0/a1/e90f7aa66512be3150cb9d27f3d9995db330ad1b2046474a13b7040dfd92/numpy-2.2.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:34c1b7e83f94f3b564b35f480f5652a47007dd91f7c839f404d03279cc8dd021", size = 15264272 }, + { url = "https://files.pythonhosted.org/packages/dc/b6/50bd027cca494de4fa1fc7bf1662983d0ba5f256fa0ece2c376b5eb9b3f0/numpy-2.2.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:4d8335b5f1b6e2bce120d55fb17064b0262ff29b459e8493d1785c18ae2553b8", size = 17880549 }, + { url = "https://files.pythonhosted.org/packages/96/30/f7bf4acb5f8db10a96f73896bdeed7a63373137b131ca18bd3dab889db3b/numpy-2.2.3-cp312-cp312-win32.whl", hash = "sha256:4d9828d25fb246bedd31e04c9e75714a4087211ac348cb39c8c5f99dbb6683fe", size = 6293394 }, + { url = "https://files.pythonhosted.org/packages/42/6e/55580a538116d16ae7c9aa17d4edd56e83f42126cb1dfe7a684da7925d2c/numpy-2.2.3-cp312-cp312-win_amd64.whl", hash = "sha256:83807d445817326b4bcdaaaf8e8e9f1753da04341eceec705c001ff342002e5d", size = 12626357 }, + { url = "https://files.pythonhosted.org/packages/0a/b5/a7839f5478be8f859cb880f13d90fcfe4b0ec7a9ebaff2bcc30d96760596/numpy-2.2.3-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:3c2ec8a0f51d60f1e9c0c5ab116b7fc104b165ada3f6c58abf881cb2eb16044d", size = 21064244 }, + { url = 
"https://files.pythonhosted.org/packages/29/e8/5da32ffcaa7a72f7ecd82f90c062140a061eb823cb88e90279424e515cf4/numpy-2.2.3-pp310-pypy310_pp73-macosx_14_0_x86_64.whl", hash = "sha256:ed2cf9ed4e8ebc3b754d398cba12f24359f018b416c380f577bbae112ca52fc9", size = 6809418 }, + { url = "https://files.pythonhosted.org/packages/a8/a9/68aa7076c7656a7308a0f73d0a2ced8c03f282c9fd98fa7ce21c12634087/numpy-2.2.3-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:39261798d208c3095ae4f7bc8eaeb3481ea8c6e03dc48028057d3cbdbdb8937e", size = 16215461 }, + { url = "https://files.pythonhosted.org/packages/17/7f/d322a4125405920401450118dbdc52e0384026bd669939484670ce8b2ab9/numpy-2.2.3-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:783145835458e60fa97afac25d511d00a1eca94d4a8f3ace9fe2043003c678e4", size = 12839607 }, +] + +[[package]] +name = "packaging" +version = "24.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d0/63/68dbb6eb2de9cb10ee4c9c14a0148804425e13c4fb20d61cce69f53106da/packaging-24.2.tar.gz", hash = "sha256:c228a6dc5e932d346bc5739379109d49e8853dd8223571c7c5b55260edc0b97f", size = 163950 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/88/ef/eb23f262cca3c0c4eb7ab1933c3b1f03d021f2c48f54763065b6f0e321be/packaging-24.2-py3-none-any.whl", hash = "sha256:09abb1bccd265c01f4a3aa3f7a7db064b36514d2cba19a2f694fe6150451a759", size = 65451 }, +] + +[[package]] +name = "pluggy" +version = "1.5.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/96/2d/02d4312c973c6050a18b314a5ad0b3210edb65a906f868e31c111dede4a6/pluggy-1.5.0.tar.gz", hash = "sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1", size = 67955 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/88/5f/e351af9a41f866ac3f1fac4ca0613908d9a41741cfcf2228f4ad853b697d/pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669", size = 20556 }, +] + +[[package]] +name = "propcache" +version = "0.2.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/20/c8/2a13f78d82211490855b2fb303b6721348d0787fdd9a12ac46d99d3acde1/propcache-0.2.1.tar.gz", hash = "sha256:3f77ce728b19cb537714499928fe800c3dda29e8d9428778fc7c186da4c09a64", size = 41735 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a7/a5/0ea64c9426959ef145a938e38c832fc551843481d356713ececa9a8a64e8/propcache-0.2.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:6b3f39a85d671436ee3d12c017f8fdea38509e4f25b28eb25877293c98c243f6", size = 79296 }, + { url = "https://files.pythonhosted.org/packages/76/5a/916db1aba735f55e5eca4733eea4d1973845cf77dfe67c2381a2ca3ce52d/propcache-0.2.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:39d51fbe4285d5db5d92a929e3e21536ea3dd43732c5b177c7ef03f918dff9f2", size = 45622 }, + { url = "https://files.pythonhosted.org/packages/2d/62/685d3cf268b8401ec12b250b925b21d152b9d193b7bffa5fdc4815c392c2/propcache-0.2.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:6445804cf4ec763dc70de65a3b0d9954e868609e83850a47ca4f0cb64bd79fea", size = 45133 }, + { url = "https://files.pythonhosted.org/packages/4d/3d/31c9c29ee7192defc05aa4d01624fd85a41cf98e5922aaed206017329944/propcache-0.2.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f9479aa06a793c5aeba49ce5c5692ffb51fcd9a7016e017d555d5e2b0045d212", size = 204809 }, + { url = 
"https://files.pythonhosted.org/packages/10/a1/e4050776f4797fc86140ac9a480d5dc069fbfa9d499fe5c5d2fa1ae71f07/propcache-0.2.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d9631c5e8b5b3a0fda99cb0d29c18133bca1e18aea9effe55adb3da1adef80d3", size = 219109 }, + { url = "https://files.pythonhosted.org/packages/c9/c0/e7ae0df76343d5e107d81e59acc085cea5fd36a48aa53ef09add7503e888/propcache-0.2.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3156628250f46a0895f1f36e1d4fbe062a1af8718ec3ebeb746f1d23f0c5dc4d", size = 217368 }, + { url = "https://files.pythonhosted.org/packages/fc/e1/e0a2ed6394b5772508868a977d3238f4afb2eebaf9976f0b44a8d347ad63/propcache-0.2.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6b6fb63ae352e13748289f04f37868099e69dba4c2b3e271c46061e82c745634", size = 205124 }, + { url = "https://files.pythonhosted.org/packages/50/c1/e388c232d15ca10f233c778bbdc1034ba53ede14c207a72008de45b2db2e/propcache-0.2.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:887d9b0a65404929641a9fabb6452b07fe4572b269d901d622d8a34a4e9043b2", size = 195463 }, + { url = "https://files.pythonhosted.org/packages/0a/fd/71b349b9def426cc73813dbd0f33e266de77305e337c8c12bfb0a2a82bfb/propcache-0.2.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:a96dc1fa45bd8c407a0af03b2d5218392729e1822b0c32e62c5bf7eeb5fb3958", size = 198358 }, + { url = "https://files.pythonhosted.org/packages/02/f2/d7c497cd148ebfc5b0ae32808e6c1af5922215fe38c7a06e4e722fe937c8/propcache-0.2.1-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:a7e65eb5c003a303b94aa2c3852ef130230ec79e349632d030e9571b87c4698c", size = 195560 }, + { url = "https://files.pythonhosted.org/packages/bb/57/f37041bbe5e0dfed80a3f6be2612a3a75b9cfe2652abf2c99bef3455bbad/propcache-0.2.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:999779addc413181912e984b942fbcc951be1f5b3663cd80b2687758f434c583", size = 196895 }, + { url = "https://files.pythonhosted.org/packages/83/36/ae3cc3e4f310bff2f064e3d2ed5558935cc7778d6f827dce74dcfa125304/propcache-0.2.1-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:19a0f89a7bb9d8048d9c4370c9c543c396e894c76be5525f5e1ad287f1750ddf", size = 207124 }, + { url = "https://files.pythonhosted.org/packages/8c/c4/811b9f311f10ce9d31a32ff14ce58500458443627e4df4ae9c264defba7f/propcache-0.2.1-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:1ac2f5fe02fa75f56e1ad473f1175e11f475606ec9bd0be2e78e4734ad575034", size = 210442 }, + { url = "https://files.pythonhosted.org/packages/18/dd/a1670d483a61ecac0d7fc4305d91caaac7a8fc1b200ea3965a01cf03bced/propcache-0.2.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:574faa3b79e8ebac7cb1d7930f51184ba1ccf69adfdec53a12f319a06030a68b", size = 203219 }, + { url = "https://files.pythonhosted.org/packages/f9/2d/30ced5afde41b099b2dc0c6573b66b45d16d73090e85655f1a30c5a24e07/propcache-0.2.1-cp310-cp310-win32.whl", hash = "sha256:03ff9d3f665769b2a85e6157ac8b439644f2d7fd17615a82fa55739bc97863f4", size = 40313 }, + { url = "https://files.pythonhosted.org/packages/23/84/bd9b207ac80da237af77aa6e153b08ffa83264b1c7882495984fcbfcf85c/propcache-0.2.1-cp310-cp310-win_amd64.whl", hash = "sha256:2d3af2e79991102678f53e0dbf4c35de99b6b8b58f29a27ca0325816364caaba", size = 44428 }, + { url = "https://files.pythonhosted.org/packages/bc/0f/2913b6791ebefb2b25b4efd4bb2299c985e09786b9f5b19184a88e5778dd/propcache-0.2.1-cp311-cp311-macosx_10_9_universal2.whl", hash = 
"sha256:1ffc3cca89bb438fb9c95c13fc874012f7b9466b89328c3c8b1aa93cdcfadd16", size = 79297 }, + { url = "https://files.pythonhosted.org/packages/cf/73/af2053aeccd40b05d6e19058419ac77674daecdd32478088b79375b9ab54/propcache-0.2.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f174bbd484294ed9fdf09437f889f95807e5f229d5d93588d34e92106fbf6717", size = 45611 }, + { url = "https://files.pythonhosted.org/packages/3c/09/8386115ba7775ea3b9537730e8cf718d83bbf95bffe30757ccf37ec4e5da/propcache-0.2.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:70693319e0b8fd35dd863e3e29513875eb15c51945bf32519ef52927ca883bc3", size = 45146 }, + { url = "https://files.pythonhosted.org/packages/03/7a/793aa12f0537b2e520bf09f4c6833706b63170a211ad042ca71cbf79d9cb/propcache-0.2.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b480c6a4e1138e1aa137c0079b9b6305ec6dcc1098a8ca5196283e8a49df95a9", size = 232136 }, + { url = "https://files.pythonhosted.org/packages/f1/38/b921b3168d72111769f648314100558c2ea1d52eb3d1ba7ea5c4aa6f9848/propcache-0.2.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d27b84d5880f6d8aa9ae3edb253c59d9f6642ffbb2c889b78b60361eed449787", size = 239706 }, + { url = "https://files.pythonhosted.org/packages/14/29/4636f500c69b5edea7786db3c34eb6166f3384b905665ce312a6e42c720c/propcache-0.2.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:857112b22acd417c40fa4595db2fe28ab900c8c5fe4670c7989b1c0230955465", size = 238531 }, + { url = "https://files.pythonhosted.org/packages/85/14/01fe53580a8e1734ebb704a3482b7829a0ef4ea68d356141cf0994d9659b/propcache-0.2.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cf6c4150f8c0e32d241436526f3c3f9cbd34429492abddbada2ffcff506c51af", size = 231063 }, + { url = "https://files.pythonhosted.org/packages/33/5c/1d961299f3c3b8438301ccfbff0143b69afcc30c05fa28673cface692305/propcache-0.2.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:66d4cfda1d8ed687daa4bc0274fcfd5267873db9a5bc0418c2da19273040eeb7", size = 220134 }, + { url = "https://files.pythonhosted.org/packages/00/d0/ed735e76db279ba67a7d3b45ba4c654e7b02bc2f8050671ec365d8665e21/propcache-0.2.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c2f992c07c0fca81655066705beae35fc95a2fa7366467366db627d9f2ee097f", size = 220009 }, + { url = "https://files.pythonhosted.org/packages/75/90/ee8fab7304ad6533872fee982cfff5a53b63d095d78140827d93de22e2d4/propcache-0.2.1-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:4a571d97dbe66ef38e472703067021b1467025ec85707d57e78711c085984e54", size = 212199 }, + { url = "https://files.pythonhosted.org/packages/eb/ec/977ffaf1664f82e90737275873461695d4c9407d52abc2f3c3e24716da13/propcache-0.2.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:bb6178c241278d5fe853b3de743087be7f5f4c6f7d6d22a3b524d323eecec505", size = 214827 }, + { url = "https://files.pythonhosted.org/packages/57/48/031fb87ab6081764054821a71b71942161619549396224cbb242922525e8/propcache-0.2.1-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:ad1af54a62ffe39cf34db1aa6ed1a1873bd548f6401db39d8e7cd060b9211f82", size = 228009 }, + { url = "https://files.pythonhosted.org/packages/1a/06/ef1390f2524850838f2390421b23a8b298f6ce3396a7cc6d39dedd4047b0/propcache-0.2.1-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:e7048abd75fe40712005bcfc06bb44b9dfcd8e101dda2ecf2f5aa46115ad07ca", size = 231638 }, + { url = 
"https://files.pythonhosted.org/packages/38/2a/101e6386d5a93358395da1d41642b79c1ee0f3b12e31727932b069282b1d/propcache-0.2.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:160291c60081f23ee43d44b08a7e5fb76681221a8e10b3139618c5a9a291b84e", size = 222788 }, + { url = "https://files.pythonhosted.org/packages/db/81/786f687951d0979007e05ad9346cd357e50e3d0b0f1a1d6074df334b1bbb/propcache-0.2.1-cp311-cp311-win32.whl", hash = "sha256:819ce3b883b7576ca28da3861c7e1a88afd08cc8c96908e08a3f4dd64a228034", size = 40170 }, + { url = "https://files.pythonhosted.org/packages/cf/59/7cc7037b295d5772eceb426358bb1b86e6cab4616d971bd74275395d100d/propcache-0.2.1-cp311-cp311-win_amd64.whl", hash = "sha256:edc9fc7051e3350643ad929df55c451899bb9ae6d24998a949d2e4c87fb596d3", size = 44404 }, + { url = "https://files.pythonhosted.org/packages/4c/28/1d205fe49be8b1b4df4c50024e62480a442b1a7b818e734308bb0d17e7fb/propcache-0.2.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:081a430aa8d5e8876c6909b67bd2d937bfd531b0382d3fdedb82612c618bc41a", size = 79588 }, + { url = "https://files.pythonhosted.org/packages/21/ee/fc4d893f8d81cd4971affef2a6cb542b36617cd1d8ce56b406112cb80bf7/propcache-0.2.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:d2ccec9ac47cf4e04897619c0e0c1a48c54a71bdf045117d3a26f80d38ab1fb0", size = 45825 }, + { url = "https://files.pythonhosted.org/packages/4a/de/bbe712f94d088da1d237c35d735f675e494a816fd6f54e9db2f61ef4d03f/propcache-0.2.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:14d86fe14b7e04fa306e0c43cdbeebe6b2c2156a0c9ce56b815faacc193e320d", size = 45357 }, + { url = "https://files.pythonhosted.org/packages/7f/14/7ae06a6cf2a2f1cb382586d5a99efe66b0b3d0c6f9ac2f759e6f7af9d7cf/propcache-0.2.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:049324ee97bb67285b49632132db351b41e77833678432be52bdd0289c0e05e4", size = 241869 }, + { url = "https://files.pythonhosted.org/packages/cc/59/227a78be960b54a41124e639e2c39e8807ac0c751c735a900e21315f8c2b/propcache-0.2.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1cd9a1d071158de1cc1c71a26014dcdfa7dd3d5f4f88c298c7f90ad6f27bb46d", size = 247884 }, + { url = "https://files.pythonhosted.org/packages/84/58/f62b4ffaedf88dc1b17f04d57d8536601e4e030feb26617228ef930c3279/propcache-0.2.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:98110aa363f1bb4c073e8dcfaefd3a5cea0f0834c2aab23dda657e4dab2f53b5", size = 248486 }, + { url = "https://files.pythonhosted.org/packages/1c/07/ebe102777a830bca91bbb93e3479cd34c2ca5d0361b83be9dbd93104865e/propcache-0.2.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:647894f5ae99c4cf6bb82a1bb3a796f6e06af3caa3d32e26d2350d0e3e3faf24", size = 243649 }, + { url = "https://files.pythonhosted.org/packages/ed/bc/4f7aba7f08f520376c4bb6a20b9a981a581b7f2e385fa0ec9f789bb2d362/propcache-0.2.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bfd3223c15bebe26518d58ccf9a39b93948d3dcb3e57a20480dfdd315356baff", size = 229103 }, + { url = "https://files.pythonhosted.org/packages/fe/d5/04ac9cd4e51a57a96f78795e03c5a0ddb8f23ec098b86f92de028d7f2a6b/propcache-0.2.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:d71264a80f3fcf512eb4f18f59423fe82d6e346ee97b90625f283df56aee103f", size = 226607 }, + { url = "https://files.pythonhosted.org/packages/e3/f0/24060d959ea41d7a7cc7fdbf68b31852331aabda914a0c63bdb0e22e96d6/propcache-0.2.1-cp312-cp312-musllinux_1_2_armv7l.whl", hash = 
"sha256:e73091191e4280403bde6c9a52a6999d69cdfde498f1fdf629105247599b57ec", size = 221153 }, + { url = "https://files.pythonhosted.org/packages/77/a7/3ac76045a077b3e4de4859a0753010765e45749bdf53bd02bc4d372da1a0/propcache-0.2.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:3935bfa5fede35fb202c4b569bb9c042f337ca4ff7bd540a0aa5e37131659348", size = 222151 }, + { url = "https://files.pythonhosted.org/packages/e7/af/5e29da6f80cebab3f5a4dcd2a3240e7f56f2c4abf51cbfcc99be34e17f0b/propcache-0.2.1-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:f508b0491767bb1f2b87fdfacaba5f7eddc2f867740ec69ece6d1946d29029a6", size = 233812 }, + { url = "https://files.pythonhosted.org/packages/8c/89/ebe3ad52642cc5509eaa453e9f4b94b374d81bae3265c59d5c2d98efa1b4/propcache-0.2.1-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:1672137af7c46662a1c2be1e8dc78cb6d224319aaa40271c9257d886be4363a6", size = 238829 }, + { url = "https://files.pythonhosted.org/packages/e9/2f/6b32f273fa02e978b7577159eae7471b3cfb88b48563b1c2578b2d7ca0bb/propcache-0.2.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:b74c261802d3d2b85c9df2dfb2fa81b6f90deeef63c2db9f0e029a3cac50b518", size = 230704 }, + { url = "https://files.pythonhosted.org/packages/5c/2e/f40ae6ff5624a5f77edd7b8359b208b5455ea113f68309e2b00a2e1426b6/propcache-0.2.1-cp312-cp312-win32.whl", hash = "sha256:d09c333d36c1409d56a9d29b3a1b800a42c76a57a5a8907eacdbce3f18768246", size = 40050 }, + { url = "https://files.pythonhosted.org/packages/3b/77/a92c3ef994e47180862b9d7d11e37624fb1c00a16d61faf55115d970628b/propcache-0.2.1-cp312-cp312-win_amd64.whl", hash = "sha256:c214999039d4f2a5b2073ac506bba279945233da8c786e490d411dfc30f855c1", size = 44117 }, + { url = "https://files.pythonhosted.org/packages/41/b6/c5319caea262f4821995dca2107483b94a3345d4607ad797c76cb9c36bcc/propcache-0.2.1-py3-none-any.whl", hash = "sha256:52277518d6aae65536e9cea52d4e7fd2f7a66f4aa2d30ed3f2fcea620ace3c54", size = 11818 }, +] + +[[package]] +name = "pydantic" +version = "2.10.6" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "annotated-types" }, + { name = "pydantic-core" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b7/ae/d5220c5c52b158b1de7ca89fc5edb72f304a70a4c540c84c8844bf4008de/pydantic-2.10.6.tar.gz", hash = "sha256:ca5daa827cce33de7a42be142548b0096bf05a7e7b365aebfa5f8eeec7128236", size = 761681 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f4/3c/8cc1cc84deffa6e25d2d0c688ebb80635dfdbf1dbea3e30c541c8cf4d860/pydantic-2.10.6-py3-none-any.whl", hash = "sha256:427d664bf0b8a2b34ff5dd0f5a18df00591adcee7198fbd71981054cef37b584", size = 431696 }, +] + +[[package]] +name = "pydantic-core" +version = "2.27.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/fc/01/f3e5ac5e7c25833db5eb555f7b7ab24cd6f8c322d3a3ad2d67a952dc0abc/pydantic_core-2.27.2.tar.gz", hash = "sha256:eb026e5a4c1fee05726072337ff51d1efb6f59090b7da90d30ea58625b1ffb39", size = 413443 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3a/bc/fed5f74b5d802cf9a03e83f60f18864e90e3aed7223adaca5ffb7a8d8d64/pydantic_core-2.27.2-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:2d367ca20b2f14095a8f4fa1210f5a7b78b8a20009ecced6b12818f455b1e9fa", size = 1895938 }, + { url = 
"https://files.pythonhosted.org/packages/71/2a/185aff24ce844e39abb8dd680f4e959f0006944f4a8a0ea372d9f9ae2e53/pydantic_core-2.27.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:491a2b73db93fab69731eaee494f320faa4e093dbed776be1a829c2eb222c34c", size = 1815684 }, + { url = "https://files.pythonhosted.org/packages/c3/43/fafabd3d94d159d4f1ed62e383e264f146a17dd4d48453319fd782e7979e/pydantic_core-2.27.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7969e133a6f183be60e9f6f56bfae753585680f3b7307a8e555a948d443cc05a", size = 1829169 }, + { url = "https://files.pythonhosted.org/packages/a2/d1/f2dfe1a2a637ce6800b799aa086d079998959f6f1215eb4497966efd2274/pydantic_core-2.27.2-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3de9961f2a346257caf0aa508a4da705467f53778e9ef6fe744c038119737ef5", size = 1867227 }, + { url = "https://files.pythonhosted.org/packages/7d/39/e06fcbcc1c785daa3160ccf6c1c38fea31f5754b756e34b65f74e99780b5/pydantic_core-2.27.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e2bb4d3e5873c37bb3dd58714d4cd0b0e6238cebc4177ac8fe878f8b3aa8e74c", size = 2037695 }, + { url = "https://files.pythonhosted.org/packages/7a/67/61291ee98e07f0650eb756d44998214231f50751ba7e13f4f325d95249ab/pydantic_core-2.27.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:280d219beebb0752699480fe8f1dc61ab6615c2046d76b7ab7ee38858de0a4e7", size = 2741662 }, + { url = "https://files.pythonhosted.org/packages/32/90/3b15e31b88ca39e9e626630b4c4a1f5a0dfd09076366f4219429e6786076/pydantic_core-2.27.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:47956ae78b6422cbd46f772f1746799cbb862de838fd8d1fbd34a82e05b0983a", size = 1993370 }, + { url = "https://files.pythonhosted.org/packages/ff/83/c06d333ee3a67e2e13e07794995c1535565132940715931c1c43bfc85b11/pydantic_core-2.27.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:14d4a5c49d2f009d62a2a7140d3064f686d17a5d1a268bc641954ba181880236", size = 1996813 }, + { url = "https://files.pythonhosted.org/packages/7c/f7/89be1c8deb6e22618a74f0ca0d933fdcb8baa254753b26b25ad3acff8f74/pydantic_core-2.27.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:337b443af21d488716f8d0b6164de833e788aa6bd7e3a39c005febc1284f4962", size = 2005287 }, + { url = "https://files.pythonhosted.org/packages/b7/7d/8eb3e23206c00ef7feee17b83a4ffa0a623eb1a9d382e56e4aa46fd15ff2/pydantic_core-2.27.2-cp310-cp310-musllinux_1_1_armv7l.whl", hash = "sha256:03d0f86ea3184a12f41a2d23f7ccb79cdb5a18e06993f8a45baa8dfec746f0e9", size = 2128414 }, + { url = "https://files.pythonhosted.org/packages/4e/99/fe80f3ff8dd71a3ea15763878d464476e6cb0a2db95ff1c5c554133b6b83/pydantic_core-2.27.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:7041c36f5680c6e0f08d922aed302e98b3745d97fe1589db0a3eebf6624523af", size = 2155301 }, + { url = "https://files.pythonhosted.org/packages/2b/a3/e50460b9a5789ca1451b70d4f52546fa9e2b420ba3bfa6100105c0559238/pydantic_core-2.27.2-cp310-cp310-win32.whl", hash = "sha256:50a68f3e3819077be2c98110c1f9dcb3817e93f267ba80a2c05bb4f8799e2ff4", size = 1816685 }, + { url = "https://files.pythonhosted.org/packages/57/4c/a8838731cb0f2c2a39d3535376466de6049034d7b239c0202a64aaa05533/pydantic_core-2.27.2-cp310-cp310-win_amd64.whl", hash = "sha256:e0fd26b16394ead34a424eecf8a31a1f5137094cabe84a1bcb10fa6ba39d3d31", size = 1982876 }, + { url = 
"https://files.pythonhosted.org/packages/c2/89/f3450af9d09d44eea1f2c369f49e8f181d742f28220f88cc4dfaae91ea6e/pydantic_core-2.27.2-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:8e10c99ef58cfdf2a66fc15d66b16c4a04f62bca39db589ae8cba08bc55331bc", size = 1893421 }, + { url = "https://files.pythonhosted.org/packages/9e/e3/71fe85af2021f3f386da42d291412e5baf6ce7716bd7101ea49c810eda90/pydantic_core-2.27.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:26f32e0adf166a84d0cb63be85c562ca8a6fa8de28e5f0d92250c6b7e9e2aff7", size = 1814998 }, + { url = "https://files.pythonhosted.org/packages/a6/3c/724039e0d848fd69dbf5806894e26479577316c6f0f112bacaf67aa889ac/pydantic_core-2.27.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8c19d1ea0673cd13cc2f872f6c9ab42acc4e4f492a7ca9d3795ce2b112dd7e15", size = 1826167 }, + { url = "https://files.pythonhosted.org/packages/2b/5b/1b29e8c1fb5f3199a9a57c1452004ff39f494bbe9bdbe9a81e18172e40d3/pydantic_core-2.27.2-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5e68c4446fe0810e959cdff46ab0a41ce2f2c86d227d96dc3847af0ba7def306", size = 1865071 }, + { url = "https://files.pythonhosted.org/packages/89/6c/3985203863d76bb7d7266e36970d7e3b6385148c18a68cc8915fd8c84d57/pydantic_core-2.27.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d9640b0059ff4f14d1f37321b94061c6db164fbe49b334b31643e0528d100d99", size = 2036244 }, + { url = "https://files.pythonhosted.org/packages/0e/41/f15316858a246b5d723f7d7f599f79e37493b2e84bfc789e58d88c209f8a/pydantic_core-2.27.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:40d02e7d45c9f8af700f3452f329ead92da4c5f4317ca9b896de7ce7199ea459", size = 2737470 }, + { url = "https://files.pythonhosted.org/packages/a8/7c/b860618c25678bbd6d1d99dbdfdf0510ccb50790099b963ff78a124b754f/pydantic_core-2.27.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1c1fd185014191700554795c99b347d64f2bb637966c4cfc16998a0ca700d048", size = 1992291 }, + { url = "https://files.pythonhosted.org/packages/bf/73/42c3742a391eccbeab39f15213ecda3104ae8682ba3c0c28069fbcb8c10d/pydantic_core-2.27.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d81d2068e1c1228a565af076598f9e7451712700b673de8f502f0334f281387d", size = 1994613 }, + { url = "https://files.pythonhosted.org/packages/94/7a/941e89096d1175d56f59340f3a8ebaf20762fef222c298ea96d36a6328c5/pydantic_core-2.27.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:1a4207639fb02ec2dbb76227d7c751a20b1a6b4bc52850568e52260cae64ca3b", size = 2002355 }, + { url = "https://files.pythonhosted.org/packages/6e/95/2359937a73d49e336a5a19848713555605d4d8d6940c3ec6c6c0ca4dcf25/pydantic_core-2.27.2-cp311-cp311-musllinux_1_1_armv7l.whl", hash = "sha256:3de3ce3c9ddc8bbd88f6e0e304dea0e66d843ec9de1b0042b0911c1663ffd474", size = 2126661 }, + { url = "https://files.pythonhosted.org/packages/2b/4c/ca02b7bdb6012a1adef21a50625b14f43ed4d11f1fc237f9d7490aa5078c/pydantic_core-2.27.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:30c5f68ded0c36466acede341551106821043e9afaad516adfb6e8fa80a4e6a6", size = 2153261 }, + { url = "https://files.pythonhosted.org/packages/72/9d/a241db83f973049a1092a079272ffe2e3e82e98561ef6214ab53fe53b1c7/pydantic_core-2.27.2-cp311-cp311-win32.whl", hash = "sha256:c70c26d2c99f78b125a3459f8afe1aed4d9687c24fd677c6a4436bc042e50d6c", size = 1812361 }, + { url = 
"https://files.pythonhosted.org/packages/e8/ef/013f07248041b74abd48a385e2110aa3a9bbfef0fbd97d4e6d07d2f5b89a/pydantic_core-2.27.2-cp311-cp311-win_amd64.whl", hash = "sha256:08e125dbdc505fa69ca7d9c499639ab6407cfa909214d500897d02afb816e7cc", size = 1982484 }, + { url = "https://files.pythonhosted.org/packages/10/1c/16b3a3e3398fd29dca77cea0a1d998d6bde3902fa2706985191e2313cc76/pydantic_core-2.27.2-cp311-cp311-win_arm64.whl", hash = "sha256:26f0d68d4b235a2bae0c3fc585c585b4ecc51382db0e3ba402a22cbc440915e4", size = 1867102 }, + { url = "https://files.pythonhosted.org/packages/d6/74/51c8a5482ca447871c93e142d9d4a92ead74de6c8dc5e66733e22c9bba89/pydantic_core-2.27.2-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:9e0c8cfefa0ef83b4da9588448b6d8d2a2bf1a53c3f1ae5fca39eb3061e2f0b0", size = 1893127 }, + { url = "https://files.pythonhosted.org/packages/d3/f3/c97e80721735868313c58b89d2de85fa80fe8dfeeed84dc51598b92a135e/pydantic_core-2.27.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:83097677b8e3bd7eaa6775720ec8e0405f1575015a463285a92bfdfe254529ef", size = 1811340 }, + { url = "https://files.pythonhosted.org/packages/9e/91/840ec1375e686dbae1bd80a9e46c26a1e0083e1186abc610efa3d9a36180/pydantic_core-2.27.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:172fce187655fece0c90d90a678424b013f8fbb0ca8b036ac266749c09438cb7", size = 1822900 }, + { url = "https://files.pythonhosted.org/packages/f6/31/4240bc96025035500c18adc149aa6ffdf1a0062a4b525c932065ceb4d868/pydantic_core-2.27.2-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:519f29f5213271eeeeb3093f662ba2fd512b91c5f188f3bb7b27bc5973816934", size = 1869177 }, + { url = "https://files.pythonhosted.org/packages/fa/20/02fbaadb7808be578317015c462655c317a77a7c8f0ef274bc016a784c54/pydantic_core-2.27.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:05e3a55d124407fffba0dd6b0c0cd056d10e983ceb4e5dbd10dda135c31071d6", size = 2038046 }, + { url = "https://files.pythonhosted.org/packages/06/86/7f306b904e6c9eccf0668248b3f272090e49c275bc488a7b88b0823444a4/pydantic_core-2.27.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9c3ed807c7b91de05e63930188f19e921d1fe90de6b4f5cd43ee7fcc3525cb8c", size = 2685386 }, + { url = "https://files.pythonhosted.org/packages/8d/f0/49129b27c43396581a635d8710dae54a791b17dfc50c70164866bbf865e3/pydantic_core-2.27.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6fb4aadc0b9a0c063206846d603b92030eb6f03069151a625667f982887153e2", size = 1997060 }, + { url = "https://files.pythonhosted.org/packages/0d/0f/943b4af7cd416c477fd40b187036c4f89b416a33d3cc0ab7b82708a667aa/pydantic_core-2.27.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:28ccb213807e037460326424ceb8b5245acb88f32f3d2777427476e1b32c48c4", size = 2004870 }, + { url = "https://files.pythonhosted.org/packages/35/40/aea70b5b1a63911c53a4c8117c0a828d6790483f858041f47bab0b779f44/pydantic_core-2.27.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:de3cd1899e2c279b140adde9357c4495ed9d47131b4a4eaff9052f23398076b3", size = 1999822 }, + { url = "https://files.pythonhosted.org/packages/f2/b3/807b94fd337d58effc5498fd1a7a4d9d59af4133e83e32ae39a96fddec9d/pydantic_core-2.27.2-cp312-cp312-musllinux_1_1_armv7l.whl", hash = "sha256:220f892729375e2d736b97d0e51466252ad84c51857d4d15f5e9692f9ef12be4", size = 2130364 }, + { url = 
"https://files.pythonhosted.org/packages/fc/df/791c827cd4ee6efd59248dca9369fb35e80a9484462c33c6649a8d02b565/pydantic_core-2.27.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:a0fcd29cd6b4e74fe8ddd2c90330fd8edf2e30cb52acda47f06dd615ae72da57", size = 2158303 }, + { url = "https://files.pythonhosted.org/packages/9b/67/4e197c300976af185b7cef4c02203e175fb127e414125916bf1128b639a9/pydantic_core-2.27.2-cp312-cp312-win32.whl", hash = "sha256:1e2cb691ed9834cd6a8be61228471d0a503731abfb42f82458ff27be7b2186fc", size = 1834064 }, + { url = "https://files.pythonhosted.org/packages/1f/ea/cd7209a889163b8dcca139fe32b9687dd05249161a3edda62860430457a5/pydantic_core-2.27.2-cp312-cp312-win_amd64.whl", hash = "sha256:cc3f1a99a4f4f9dd1de4fe0312c114e740b5ddead65bb4102884b384c15d8bc9", size = 1989046 }, + { url = "https://files.pythonhosted.org/packages/bc/49/c54baab2f4658c26ac633d798dab66b4c3a9bbf47cff5284e9c182f4137a/pydantic_core-2.27.2-cp312-cp312-win_arm64.whl", hash = "sha256:3911ac9284cd8a1792d3cb26a2da18f3ca26c6908cc434a18f730dc0db7bfa3b", size = 1885092 }, + { url = "https://files.pythonhosted.org/packages/46/72/af70981a341500419e67d5cb45abe552a7c74b66326ac8877588488da1ac/pydantic_core-2.27.2-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:2bf14caea37e91198329b828eae1618c068dfb8ef17bb33287a7ad4b61ac314e", size = 1891159 }, + { url = "https://files.pythonhosted.org/packages/ad/3d/c5913cccdef93e0a6a95c2d057d2c2cba347815c845cda79ddd3c0f5e17d/pydantic_core-2.27.2-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:b0cb791f5b45307caae8810c2023a184c74605ec3bcbb67d13846c28ff731ff8", size = 1768331 }, + { url = "https://files.pythonhosted.org/packages/f6/f0/a3ae8fbee269e4934f14e2e0e00928f9346c5943174f2811193113e58252/pydantic_core-2.27.2-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:688d3fd9fcb71f41c4c015c023d12a79d1c4c0732ec9eb35d96e3388a120dcf3", size = 1822467 }, + { url = "https://files.pythonhosted.org/packages/d7/7a/7bbf241a04e9f9ea24cd5874354a83526d639b02674648af3f350554276c/pydantic_core-2.27.2-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3d591580c34f4d731592f0e9fe40f9cc1b430d297eecc70b962e93c5c668f15f", size = 1979797 }, + { url = "https://files.pythonhosted.org/packages/4f/5f/4784c6107731f89e0005a92ecb8a2efeafdb55eb992b8e9d0a2be5199335/pydantic_core-2.27.2-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:82f986faf4e644ffc189a7f1aafc86e46ef70372bb153e7001e8afccc6e54133", size = 1987839 }, + { url = "https://files.pythonhosted.org/packages/6d/a7/61246562b651dff00de86a5f01b6e4befb518df314c54dec187a78d81c84/pydantic_core-2.27.2-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:bec317a27290e2537f922639cafd54990551725fc844249e64c523301d0822fc", size = 1998861 }, + { url = "https://files.pythonhosted.org/packages/86/aa/837821ecf0c022bbb74ca132e117c358321e72e7f9702d1b6a03758545e2/pydantic_core-2.27.2-pp310-pypy310_pp73-musllinux_1_1_armv7l.whl", hash = "sha256:0296abcb83a797db256b773f45773da397da75a08f5fcaef41f2044adec05f50", size = 2116582 }, + { url = "https://files.pythonhosted.org/packages/81/b0/5e74656e95623cbaa0a6278d16cf15e10a51f6002e3ec126541e95c29ea3/pydantic_core-2.27.2-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:0d75070718e369e452075a6017fbf187f788e17ed67a3abd47fa934d001863d9", size = 2151985 }, + { url = 
"https://files.pythonhosted.org/packages/63/37/3e32eeb2a451fddaa3898e2163746b0cffbbdbb4740d38372db0490d67f3/pydantic_core-2.27.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:7e17b560be3c98a8e3aa66ce828bdebb9e9ac6ad5466fba92eb74c4c95cb1151", size = 2004715 }, +] + +[[package]] +name = "pytest" +version = "8.3.4" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, + { name = "exceptiongroup", marker = "python_full_version < '3.11'" }, + { name = "iniconfig" }, + { name = "packaging" }, + { name = "pluggy" }, + { name = "tomli", marker = "python_full_version < '3.11'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/05/35/30e0d83068951d90a01852cb1cef56e5d8a09d20c7f511634cc2f7e0372a/pytest-8.3.4.tar.gz", hash = "sha256:965370d062bce11e73868e0335abac31b4d3de0e82f4007408d242b4f8610761", size = 1445919 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/11/92/76a1c94d3afee238333bc0a42b82935dd8f9cf8ce9e336ff87ee14d9e1cf/pytest-8.3.4-py3-none-any.whl", hash = "sha256:50e16d954148559c9a74109af1eaf0c945ba2d8f30f0a3d3335edde19788b6f6", size = 343083 }, +] + +[[package]] +name = "pytest-asyncio" +version = "0.25.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pytest" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/f2/a8/ecbc8ede70921dd2f544ab1cadd3ff3bf842af27f87bbdea774c7baa1d38/pytest_asyncio-0.25.3.tar.gz", hash = "sha256:fc1da2cf9f125ada7e710b4ddad05518d4cee187ae9412e9ac9271003497f07a", size = 54239 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/67/17/3493c5624e48fd97156ebaec380dcaafee9506d7e2c46218ceebbb57d7de/pytest_asyncio-0.25.3-py3-none-any.whl", hash = "sha256:9e89518e0f9bd08928f97a3482fdc4e244df17529460bc038291ccaf8f85c7c3", size = 19467 }, +] + +[[package]] +name = "pywin32" +version = "308" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/72/a6/3e9f2c474895c1bb61b11fa9640be00067b5c5b363c501ee9c3fa53aec01/pywin32-308-cp310-cp310-win32.whl", hash = "sha256:796ff4426437896550d2981b9c2ac0ffd75238ad9ea2d3bfa67a1abd546d262e", size = 5927028 }, + { url = "https://files.pythonhosted.org/packages/d9/b4/84e2463422f869b4b718f79eb7530a4c1693e96b8a4e5e968de38be4d2ba/pywin32-308-cp310-cp310-win_amd64.whl", hash = "sha256:4fc888c59b3c0bef905ce7eb7e2106a07712015ea1c8234b703a088d46110e8e", size = 6558484 }, + { url = "https://files.pythonhosted.org/packages/9f/8f/fb84ab789713f7c6feacaa08dad3ec8105b88ade8d1c4f0f0dfcaaa017d6/pywin32-308-cp310-cp310-win_arm64.whl", hash = "sha256:a5ab5381813b40f264fa3495b98af850098f814a25a63589a8e9eb12560f450c", size = 7971454 }, + { url = "https://files.pythonhosted.org/packages/eb/e2/02652007469263fe1466e98439831d65d4ca80ea1a2df29abecedf7e47b7/pywin32-308-cp311-cp311-win32.whl", hash = "sha256:5d8c8015b24a7d6855b1550d8e660d8daa09983c80e5daf89a273e5c6fb5095a", size = 5928156 }, + { url = "https://files.pythonhosted.org/packages/48/ef/f4fb45e2196bc7ffe09cad0542d9aff66b0e33f6c0954b43e49c33cad7bd/pywin32-308-cp311-cp311-win_amd64.whl", hash = "sha256:575621b90f0dc2695fec346b2d6302faebd4f0f45c05ea29404cefe35d89442b", size = 6559559 }, + { url = "https://files.pythonhosted.org/packages/79/ef/68bb6aa865c5c9b11a35771329e95917b5559845bd75b65549407f9fc6b4/pywin32-308-cp311-cp311-win_arm64.whl", hash = "sha256:100a5442b7332070983c4cd03f2e906a5648a5104b8a7f50175f7906efd16bb6", size = 7972495 }, + { url = 
"https://files.pythonhosted.org/packages/00/7c/d00d6bdd96de4344e06c4afbf218bc86b54436a94c01c71a8701f613aa56/pywin32-308-cp312-cp312-win32.whl", hash = "sha256:587f3e19696f4bf96fde9d8a57cec74a57021ad5f204c9e627e15c33ff568897", size = 5939729 }, + { url = "https://files.pythonhosted.org/packages/21/27/0c8811fbc3ca188f93b5354e7c286eb91f80a53afa4e11007ef661afa746/pywin32-308-cp312-cp312-win_amd64.whl", hash = "sha256:00b3e11ef09ede56c6a43c71f2d31857cf7c54b0ab6e78ac659497abd2834f47", size = 6543015 }, + { url = "https://files.pythonhosted.org/packages/9d/0f/d40f8373608caed2255781a3ad9a51d03a594a1248cd632d6a298daca693/pywin32-308-cp312-cp312-win_arm64.whl", hash = "sha256:9b4de86c8d909aed15b7011182c8cab38c8850de36e6afb1f0db22b8959e3091", size = 7976033 }, +] + +[[package]] +name = "pyyaml" +version = "6.0.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/54/ed/79a089b6be93607fa5cdaedf301d7dfb23af5f25c398d5ead2525b063e17/pyyaml-6.0.2.tar.gz", hash = "sha256:d584d9ec91ad65861cc08d42e834324ef890a082e591037abe114850ff7bbc3e", size = 130631 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9b/95/a3fac87cb7158e231b5a6012e438c647e1a87f09f8e0d123acec8ab8bf71/PyYAML-6.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0a9a2848a5b7feac301353437eb7d5957887edbf81d56e903999a75a3d743086", size = 184199 }, + { url = "https://files.pythonhosted.org/packages/c7/7a/68bd47624dab8fd4afbfd3c48e3b79efe09098ae941de5b58abcbadff5cb/PyYAML-6.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:29717114e51c84ddfba879543fb232a6ed60086602313ca38cce623c1d62cfbf", size = 171758 }, + { url = "https://files.pythonhosted.org/packages/49/ee/14c54df452143b9ee9f0f29074d7ca5516a36edb0b4cc40c3f280131656f/PyYAML-6.0.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8824b5a04a04a047e72eea5cec3bc266db09e35de6bdfe34c9436ac5ee27d237", size = 718463 }, + { url = "https://files.pythonhosted.org/packages/4d/61/de363a97476e766574650d742205be468921a7b532aa2499fcd886b62530/PyYAML-6.0.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7c36280e6fb8385e520936c3cb3b8042851904eba0e58d277dca80a5cfed590b", size = 719280 }, + { url = "https://files.pythonhosted.org/packages/6b/4e/1523cb902fd98355e2e9ea5e5eb237cbc5f3ad5f3075fa65087aa0ecb669/PyYAML-6.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ec031d5d2feb36d1d1a24380e4db6d43695f3748343d99434e6f5f9156aaa2ed", size = 751239 }, + { url = "https://files.pythonhosted.org/packages/b7/33/5504b3a9a4464893c32f118a9cc045190a91637b119a9c881da1cf6b7a72/PyYAML-6.0.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:936d68689298c36b53b29f23c6dbb74de12b4ac12ca6cfe0e047bedceea56180", size = 695802 }, + { url = "https://files.pythonhosted.org/packages/5c/20/8347dcabd41ef3a3cdc4f7b7a2aff3d06598c8779faa189cdbf878b626a4/PyYAML-6.0.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:23502f431948090f597378482b4812b0caae32c22213aecf3b55325e049a6c68", size = 720527 }, + { url = "https://files.pythonhosted.org/packages/be/aa/5afe99233fb360d0ff37377145a949ae258aaab831bde4792b32650a4378/PyYAML-6.0.2-cp310-cp310-win32.whl", hash = "sha256:2e99c6826ffa974fe6e27cdb5ed0021786b03fc98e5ee3c5bfe1fd5015f42b99", size = 144052 }, + { url = "https://files.pythonhosted.org/packages/b5/84/0fa4b06f6d6c958d207620fc60005e241ecedceee58931bb20138e1e5776/PyYAML-6.0.2-cp310-cp310-win_amd64.whl", hash = 
"sha256:a4d3091415f010369ae4ed1fc6b79def9416358877534caf6a0fdd2146c87a3e", size = 161774 }, + { url = "https://files.pythonhosted.org/packages/f8/aa/7af4e81f7acba21a4c6be026da38fd2b872ca46226673c89a758ebdc4fd2/PyYAML-6.0.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:cc1c1159b3d456576af7a3e4d1ba7e6924cb39de8f67111c735f6fc832082774", size = 184612 }, + { url = "https://files.pythonhosted.org/packages/8b/62/b9faa998fd185f65c1371643678e4d58254add437edb764a08c5a98fb986/PyYAML-6.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1e2120ef853f59c7419231f3bf4e7021f1b936f6ebd222406c3b60212205d2ee", size = 172040 }, + { url = "https://files.pythonhosted.org/packages/ad/0c/c804f5f922a9a6563bab712d8dcc70251e8af811fce4524d57c2c0fd49a4/PyYAML-6.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5d225db5a45f21e78dd9358e58a98702a0302f2659a3c6cd320564b75b86f47c", size = 736829 }, + { url = "https://files.pythonhosted.org/packages/51/16/6af8d6a6b210c8e54f1406a6b9481febf9c64a3109c541567e35a49aa2e7/PyYAML-6.0.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5ac9328ec4831237bec75defaf839f7d4564be1e6b25ac710bd1a96321cc8317", size = 764167 }, + { url = "https://files.pythonhosted.org/packages/75/e4/2c27590dfc9992f73aabbeb9241ae20220bd9452df27483b6e56d3975cc5/PyYAML-6.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ad2a3decf9aaba3d29c8f537ac4b243e36bef957511b4766cb0057d32b0be85", size = 762952 }, + { url = "https://files.pythonhosted.org/packages/9b/97/ecc1abf4a823f5ac61941a9c00fe501b02ac3ab0e373c3857f7d4b83e2b6/PyYAML-6.0.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:ff3824dc5261f50c9b0dfb3be22b4567a6f938ccce4587b38952d85fd9e9afe4", size = 735301 }, + { url = "https://files.pythonhosted.org/packages/45/73/0f49dacd6e82c9430e46f4a027baa4ca205e8b0a9dce1397f44edc23559d/PyYAML-6.0.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:797b4f722ffa07cc8d62053e4cff1486fa6dc094105d13fea7b1de7d8bf71c9e", size = 756638 }, + { url = "https://files.pythonhosted.org/packages/22/5f/956f0f9fc65223a58fbc14459bf34b4cc48dec52e00535c79b8db361aabd/PyYAML-6.0.2-cp311-cp311-win32.whl", hash = "sha256:11d8f3dd2b9c1207dcaf2ee0bbbfd5991f571186ec9cc78427ba5bd32afae4b5", size = 143850 }, + { url = "https://files.pythonhosted.org/packages/ed/23/8da0bbe2ab9dcdd11f4f4557ccaf95c10b9811b13ecced089d43ce59c3c8/PyYAML-6.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:e10ce637b18caea04431ce14fabcf5c64a1c61ec9c56b071a4b7ca131ca52d44", size = 161980 }, + { url = "https://files.pythonhosted.org/packages/86/0c/c581167fc46d6d6d7ddcfb8c843a4de25bdd27e4466938109ca68492292c/PyYAML-6.0.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:c70c95198c015b85feafc136515252a261a84561b7b1d51e3384e0655ddf25ab", size = 183873 }, + { url = "https://files.pythonhosted.org/packages/a8/0c/38374f5bb272c051e2a69281d71cba6fdb983413e6758b84482905e29a5d/PyYAML-6.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ce826d6ef20b1bc864f0a68340c8b3287705cae2f8b4b1d932177dcc76721725", size = 173302 }, + { url = "https://files.pythonhosted.org/packages/c3/93/9916574aa8c00aa06bbac729972eb1071d002b8e158bd0e83a3b9a20a1f7/PyYAML-6.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1f71ea527786de97d1a0cc0eacd1defc0985dcf6b3f17bb77dcfc8c34bec4dc5", size = 739154 }, + { url = 
"https://files.pythonhosted.org/packages/95/0f/b8938f1cbd09739c6da569d172531567dbcc9789e0029aa070856f123984/PyYAML-6.0.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9b22676e8097e9e22e36d6b7bda33190d0d400f345f23d4065d48f4ca7ae0425", size = 766223 }, + { url = "https://files.pythonhosted.org/packages/b9/2b/614b4752f2e127db5cc206abc23a8c19678e92b23c3db30fc86ab731d3bd/PyYAML-6.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:80bab7bfc629882493af4aa31a4cfa43a4c57c83813253626916b8c7ada83476", size = 767542 }, + { url = "https://files.pythonhosted.org/packages/d4/00/dd137d5bcc7efea1836d6264f049359861cf548469d18da90cd8216cf05f/PyYAML-6.0.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:0833f8694549e586547b576dcfaba4a6b55b9e96098b36cdc7ebefe667dfed48", size = 731164 }, + { url = "https://files.pythonhosted.org/packages/c9/1f/4f998c900485e5c0ef43838363ba4a9723ac0ad73a9dc42068b12aaba4e4/PyYAML-6.0.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8b9c7197f7cb2738065c481a0461e50ad02f18c78cd75775628afb4d7137fb3b", size = 756611 }, + { url = "https://files.pythonhosted.org/packages/df/d1/f5a275fdb252768b7a11ec63585bc38d0e87c9e05668a139fea92b80634c/PyYAML-6.0.2-cp312-cp312-win32.whl", hash = "sha256:ef6107725bd54b262d6dedcc2af448a266975032bc85ef0172c5f059da6325b4", size = 140591 }, + { url = "https://files.pythonhosted.org/packages/0c/e8/4f648c598b17c3d06e8753d7d13d57542b30d56e6c2dedf9c331ae56312e/PyYAML-6.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:7e7401d0de89a9a855c839bc697c079a4af81cf878373abd7dc625847d25cbd8", size = 156338 }, +] + +[[package]] +name = "requests" +version = "2.32.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "certifi" }, + { name = "charset-normalizer" }, + { name = "idna" }, + { name = "urllib3" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/63/70/2bf7780ad2d390a8d301ad0b550f1581eadbd9a20f896afe06353c2a2913/requests-2.32.3.tar.gz", hash = "sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760", size = 131218 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f9/9b/335f9764261e915ed497fcdeb11df5dfd6f7bf257d4a6a2a686d80da4d54/requests-2.32.3-py3-none-any.whl", hash = "sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6", size = 64928 }, +] + +[[package]] +name = "syrupy" +version = "4.8.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pytest" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c4/32/8b56491ed50ae103c2db14885c29fe765170bdf044fe5868548113da35ef/syrupy-4.8.1.tar.gz", hash = "sha256:8da8c0311e6d92de0b15767768c6ab98982b7b4a4c67083c08fbac3fbad4d44c", size = 50192 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/80/47/5e8f44ec0f287b08e8c1f3fc63fe1fbe182f07bf606eec903d7827b95e51/syrupy-4.8.1-py3-none-any.whl", hash = "sha256:274f97cbaf44175f5e478a2f3a53559d31f41c66c6bf28131695f94ac893ea00", size = 50326 }, +] + +[[package]] +name = "text-generation" +version = "0.7.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "aiohttp" }, + { name = "huggingface-hub" }, + { name = "pydantic" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ef/53/1b2dc20686079464ae381f230a9fc412984a4255cea73c21afb6a46bc21f/text_generation-0.7.0.tar.gz", hash = "sha256:689200cd1f0d4141562af2515393c2c21cdbd9fac21c8398bf3043cdcc14184e", size = 10373 } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/7b/79/8fc351fd919a41287243c998a47692c7eb0fa5acded13db0080f2c6f1852/text_generation-0.7.0-py3-none-any.whl", hash = "sha256:02ab337a0ee0e7c70e04a607b311c261caae74bde46a7d837c6fdd150108f4d8", size = 12718 }, +] + +[[package]] +name = "text-generation-integration-tests" +version = "2.0.1" +source = { virtual = "." } +dependencies = [ + { name = "docker" }, + { name = "numpy" }, + { name = "pydantic" }, + { name = "pytest" }, + { name = "pytest-asyncio" }, + { name = "syrupy" }, + { name = "text-generation" }, +] + +[package.metadata] +requires-dist = [ + { name = "docker", specifier = ">=7" }, + { name = "numpy", specifier = ">=2.0" }, + { name = "pydantic", specifier = ">2,<3" }, + { name = "pytest", specifier = ">=8.3.0" }, + { name = "pytest-asyncio", specifier = ">=0.23.1" }, + { name = "syrupy", specifier = ">=4.8.0" }, + { name = "text-generation", specifier = ">=0.6.0" }, +] + +[[package]] +name = "tomli" +version = "2.2.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/18/87/302344fed471e44a87289cf4967697d07e532f2421fdaf868a303cbae4ff/tomli-2.2.1.tar.gz", hash = "sha256:cd45e1dc79c835ce60f7404ec8119f2eb06d38b1deba146f07ced3bbc44505ff", size = 17175 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/43/ca/75707e6efa2b37c77dadb324ae7d9571cb424e61ea73fad7c56c2d14527f/tomli-2.2.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:678e4fa69e4575eb77d103de3df8a895e1591b48e740211bd1067378c69e8249", size = 131077 }, + { url = "https://files.pythonhosted.org/packages/c7/16/51ae563a8615d472fdbffc43a3f3d46588c264ac4f024f63f01283becfbb/tomli-2.2.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:023aa114dd824ade0100497eb2318602af309e5a55595f76b626d6d9f3b7b0a6", size = 123429 }, + { url = "https://files.pythonhosted.org/packages/f1/dd/4f6cd1e7b160041db83c694abc78e100473c15d54620083dbd5aae7b990e/tomli-2.2.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ece47d672db52ac607a3d9599a9d48dcb2f2f735c6c2d1f34130085bb12b112a", size = 226067 }, + { url = "https://files.pythonhosted.org/packages/a9/6b/c54ede5dc70d648cc6361eaf429304b02f2871a345bbdd51e993d6cdf550/tomli-2.2.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6972ca9c9cc9f0acaa56a8ca1ff51e7af152a9f87fb64623e31d5c83700080ee", size = 236030 }, + { url = "https://files.pythonhosted.org/packages/1f/47/999514fa49cfaf7a92c805a86c3c43f4215621855d151b61c602abb38091/tomli-2.2.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c954d2250168d28797dd4e3ac5cf812a406cd5a92674ee4c8f123c889786aa8e", size = 240898 }, + { url = "https://files.pythonhosted.org/packages/73/41/0a01279a7ae09ee1573b423318e7934674ce06eb33f50936655071d81a24/tomli-2.2.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:8dd28b3e155b80f4d54beb40a441d366adcfe740969820caf156c019fb5c7ec4", size = 229894 }, + { url = "https://files.pythonhosted.org/packages/55/18/5d8bc5b0a0362311ce4d18830a5d28943667599a60d20118074ea1b01bb7/tomli-2.2.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:e59e304978767a54663af13c07b3d1af22ddee3bb2fb0618ca1593e4f593a106", size = 245319 }, + { url = "https://files.pythonhosted.org/packages/92/a3/7ade0576d17f3cdf5ff44d61390d4b3febb8a9fc2b480c75c47ea048c646/tomli-2.2.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:33580bccab0338d00994d7f16f4c4ec25b776af3ffaac1ed74e0b3fc95e885a8", size = 238273 }, + { url = 
"https://files.pythonhosted.org/packages/72/6f/fa64ef058ac1446a1e51110c375339b3ec6be245af9d14c87c4a6412dd32/tomli-2.2.1-cp311-cp311-win32.whl", hash = "sha256:465af0e0875402f1d226519c9904f37254b3045fc5084697cefb9bdde1ff99ff", size = 98310 }, + { url = "https://files.pythonhosted.org/packages/6a/1c/4a2dcde4a51b81be3530565e92eda625d94dafb46dbeb15069df4caffc34/tomli-2.2.1-cp311-cp311-win_amd64.whl", hash = "sha256:2d0f2fdd22b02c6d81637a3c95f8cd77f995846af7414c5c4b8d0545afa1bc4b", size = 108309 }, + { url = "https://files.pythonhosted.org/packages/52/e1/f8af4c2fcde17500422858155aeb0d7e93477a0d59a98e56cbfe75070fd0/tomli-2.2.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:4a8f6e44de52d5e6c657c9fe83b562f5f4256d8ebbfe4ff922c495620a7f6cea", size = 132762 }, + { url = "https://files.pythonhosted.org/packages/03/b8/152c68bb84fc00396b83e7bbddd5ec0bd3dd409db4195e2a9b3e398ad2e3/tomli-2.2.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8d57ca8095a641b8237d5b079147646153d22552f1c637fd3ba7f4b0b29167a8", size = 123453 }, + { url = "https://files.pythonhosted.org/packages/c8/d6/fc9267af9166f79ac528ff7e8c55c8181ded34eb4b0e93daa767b8841573/tomli-2.2.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4e340144ad7ae1533cb897d406382b4b6fede8890a03738ff1683af800d54192", size = 233486 }, + { url = "https://files.pythonhosted.org/packages/5c/51/51c3f2884d7bab89af25f678447ea7d297b53b5a3b5730a7cb2ef6069f07/tomli-2.2.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:db2b95f9de79181805df90bedc5a5ab4c165e6ec3fe99f970d0e302f384ad222", size = 242349 }, + { url = "https://files.pythonhosted.org/packages/ab/df/bfa89627d13a5cc22402e441e8a931ef2108403db390ff3345c05253935e/tomli-2.2.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:40741994320b232529c802f8bc86da4e1aa9f413db394617b9a256ae0f9a7f77", size = 252159 }, + { url = "https://files.pythonhosted.org/packages/9e/6e/fa2b916dced65763a5168c6ccb91066f7639bdc88b48adda990db10c8c0b/tomli-2.2.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:400e720fe168c0f8521520190686ef8ef033fb19fc493da09779e592861b78c6", size = 237243 }, + { url = "https://files.pythonhosted.org/packages/b4/04/885d3b1f650e1153cbb93a6a9782c58a972b94ea4483ae4ac5cedd5e4a09/tomli-2.2.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:02abe224de6ae62c19f090f68da4e27b10af2b93213d36cf44e6e1c5abd19fdd", size = 259645 }, + { url = "https://files.pythonhosted.org/packages/9c/de/6b432d66e986e501586da298e28ebeefd3edc2c780f3ad73d22566034239/tomli-2.2.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:b82ebccc8c8a36f2094e969560a1b836758481f3dc360ce9a3277c65f374285e", size = 244584 }, + { url = "https://files.pythonhosted.org/packages/1c/9a/47c0449b98e6e7d1be6cbac02f93dd79003234ddc4aaab6ba07a9a7482e2/tomli-2.2.1-cp312-cp312-win32.whl", hash = "sha256:889f80ef92701b9dbb224e49ec87c645ce5df3fa2cc548664eb8a25e03127a98", size = 98875 }, + { url = "https://files.pythonhosted.org/packages/ef/60/9b9638f081c6f1261e2688bd487625cd1e660d0a85bd469e91d8db969734/tomli-2.2.1-cp312-cp312-win_amd64.whl", hash = "sha256:7fc04e92e1d624a4a63c76474610238576942d6b8950a2d7f908a340494e67e4", size = 109418 }, + { url = "https://files.pythonhosted.org/packages/6e/c2/61d3e0f47e2b74ef40a68b9e6ad5984f6241a942f7cd3bbfbdbd03861ea9/tomli-2.2.1-py3-none-any.whl", hash = "sha256:cb55c73c5f4408779d0cf3eef9f762b9c9f147a77de7b258bef0a5628adc85cc", size = 14257 }, +] + +[[package]] +name = "tqdm" +version = "4.67.1" +source 
= { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a8/4b/29b4ef32e036bb34e4ab51796dd745cdba7ed47ad142a9f4a1eb8e0c744d/tqdm-4.67.1.tar.gz", hash = "sha256:f8aef9c52c08c13a65f30ea34f4e5aac3fd1a34959879d7e59e63027286627f2", size = 169737 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d0/30/dc54f88dd4a2b5dc8a0279bdd7270e735851848b762aeb1c1184ed1f6b14/tqdm-4.67.1-py3-none-any.whl", hash = "sha256:26445eca388f82e72884e0d580d5464cd801a3ea01e63e5601bdff9ba6a48de2", size = 78540 }, +] + +[[package]] +name = "typing-extensions" +version = "4.12.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/df/db/f35a00659bc03fec321ba8bce9420de607a1d37f8342eee1863174c69557/typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8", size = 85321 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/26/9f/ad63fc0248c5379346306f8668cda6e2e2e9c95e01216d2b8ffd9ff037d0/typing_extensions-4.12.2-py3-none-any.whl", hash = "sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d", size = 37438 }, +] + +[[package]] +name = "urllib3" +version = "2.3.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/aa/63/e53da845320b757bf29ef6a9062f5c669fe997973f966045cb019c3f4b66/urllib3-2.3.0.tar.gz", hash = "sha256:f8c5449b3cf0861679ce7e0503c7b44b5ec981bec0d1d3795a07f1ba96f0204d", size = 307268 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c8/19/4ec628951a74043532ca2cf5d97b7b14863931476d117c471e8e2b1eb39f/urllib3-2.3.0-py3-none-any.whl", hash = "sha256:1cee9ad369867bfdbbb48b7dd50374c0967a0bb7710050facf0dd6911440e3df", size = 128369 }, +] + +[[package]] +name = "yarl" +version = "1.18.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "idna" }, + { name = "multidict" }, + { name = "propcache" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b7/9d/4b94a8e6d2b51b599516a5cb88e5bc99b4d8d4583e468057eaa29d5f0918/yarl-1.18.3.tar.gz", hash = "sha256:ac1801c45cbf77b6c99242eeff4fffb5e4e73a800b5c4ad4fc0be5def634d2e1", size = 181062 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d2/98/e005bc608765a8a5569f58e650961314873c8469c333616eb40bff19ae97/yarl-1.18.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:7df647e8edd71f000a5208fe6ff8c382a1de8edfbccdbbfe649d263de07d8c34", size = 141458 }, + { url = "https://files.pythonhosted.org/packages/df/5d/f8106b263b8ae8a866b46d9be869ac01f9b3fb7f2325f3ecb3df8003f796/yarl-1.18.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c69697d3adff5aa4f874b19c0e4ed65180ceed6318ec856ebc423aa5850d84f7", size = 94365 }, + { url = "https://files.pythonhosted.org/packages/56/3e/d8637ddb9ba69bf851f765a3ee288676f7cf64fb3be13760c18cbc9d10bd/yarl-1.18.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:602d98f2c2d929f8e697ed274fbadc09902c4025c5a9963bf4e9edfc3ab6f7ed", size = 92181 }, + { url = "https://files.pythonhosted.org/packages/76/f9/d616a5c2daae281171de10fba41e1c0e2d8207166fc3547252f7d469b4e1/yarl-1.18.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c654d5207c78e0bd6d749f6dae1dcbbfde3403ad3a4b11f3c5544d9906969dde", size = 315349 }, + { url = 
"https://files.pythonhosted.org/packages/bb/b4/3ea5e7b6f08f698b3769a06054783e434f6d59857181b5c4e145de83f59b/yarl-1.18.3-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5094d9206c64181d0f6e76ebd8fb2f8fe274950a63890ee9e0ebfd58bf9d787b", size = 330494 }, + { url = "https://files.pythonhosted.org/packages/55/f1/e0fc810554877b1b67420568afff51b967baed5b53bcc983ab164eebf9c9/yarl-1.18.3-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:35098b24e0327fc4ebdc8ffe336cee0a87a700c24ffed13161af80124b7dc8e5", size = 326927 }, + { url = "https://files.pythonhosted.org/packages/a9/42/b1753949b327b36f210899f2dd0a0947c0c74e42a32de3f8eb5c7d93edca/yarl-1.18.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3236da9272872443f81fedc389bace88408f64f89f75d1bdb2256069a8730ccc", size = 319703 }, + { url = "https://files.pythonhosted.org/packages/f0/6d/e87c62dc9635daefb064b56f5c97df55a2e9cc947a2b3afd4fd2f3b841c7/yarl-1.18.3-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e2c08cc9b16f4f4bc522771d96734c7901e7ebef70c6c5c35dd0f10845270bcd", size = 310246 }, + { url = "https://files.pythonhosted.org/packages/e3/ef/e2e8d1785cdcbd986f7622d7f0098205f3644546da7919c24b95790ec65a/yarl-1.18.3-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:80316a8bd5109320d38eef8833ccf5f89608c9107d02d2a7f985f98ed6876990", size = 319730 }, + { url = "https://files.pythonhosted.org/packages/fc/15/8723e22345bc160dfde68c4b3ae8b236e868f9963c74015f1bc8a614101c/yarl-1.18.3-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:c1e1cc06da1491e6734f0ea1e6294ce00792193c463350626571c287c9a704db", size = 321681 }, + { url = "https://files.pythonhosted.org/packages/86/09/bf764e974f1516efa0ae2801494a5951e959f1610dd41edbfc07e5e0f978/yarl-1.18.3-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:fea09ca13323376a2fdfb353a5fa2e59f90cd18d7ca4eaa1fd31f0a8b4f91e62", size = 324812 }, + { url = "https://files.pythonhosted.org/packages/f6/4c/20a0187e3b903c97d857cf0272d687c1b08b03438968ae8ffc50fe78b0d6/yarl-1.18.3-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:e3b9fd71836999aad54084906f8663dffcd2a7fb5cdafd6c37713b2e72be1760", size = 337011 }, + { url = "https://files.pythonhosted.org/packages/c9/71/6244599a6e1cc4c9f73254a627234e0dad3883ece40cc33dce6265977461/yarl-1.18.3-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:757e81cae69244257d125ff31663249b3013b5dc0a8520d73694aed497fb195b", size = 338132 }, + { url = "https://files.pythonhosted.org/packages/af/f5/e0c3efaf74566c4b4a41cb76d27097df424052a064216beccae8d303c90f/yarl-1.18.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:b1771de9944d875f1b98a745bc547e684b863abf8f8287da8466cf470ef52690", size = 331849 }, + { url = "https://files.pythonhosted.org/packages/8a/b8/3d16209c2014c2f98a8f658850a57b716efb97930aebf1ca0d9325933731/yarl-1.18.3-cp310-cp310-win32.whl", hash = "sha256:8874027a53e3aea659a6d62751800cf6e63314c160fd607489ba5c2edd753cf6", size = 84309 }, + { url = "https://files.pythonhosted.org/packages/fd/b7/2e9a5b18eb0fe24c3a0e8bae994e812ed9852ab4fd067c0107fadde0d5f0/yarl-1.18.3-cp310-cp310-win_amd64.whl", hash = "sha256:93b2e109287f93db79210f86deb6b9bbb81ac32fc97236b16f7433db7fc437d8", size = 90484 }, + { url = "https://files.pythonhosted.org/packages/40/93/282b5f4898d8e8efaf0790ba6d10e2245d2c9f30e199d1a85cae9356098c/yarl-1.18.3-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:8503ad47387b8ebd39cbbbdf0bf113e17330ffd339ba1144074da24c545f0069", size 
= 141555 }, + { url = "https://files.pythonhosted.org/packages/6d/9c/0a49af78df099c283ca3444560f10718fadb8a18dc8b3edf8c7bd9fd7d89/yarl-1.18.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:02ddb6756f8f4517a2d5e99d8b2f272488e18dd0bfbc802f31c16c6c20f22193", size = 94351 }, + { url = "https://files.pythonhosted.org/packages/5a/a1/205ab51e148fdcedad189ca8dd587794c6f119882437d04c33c01a75dece/yarl-1.18.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:67a283dd2882ac98cc6318384f565bffc751ab564605959df4752d42483ad889", size = 92286 }, + { url = "https://files.pythonhosted.org/packages/ed/fe/88b690b30f3f59275fb674f5f93ddd4a3ae796c2b62e5bb9ece8a4914b83/yarl-1.18.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d980e0325b6eddc81331d3f4551e2a333999fb176fd153e075c6d1c2530aa8a8", size = 340649 }, + { url = "https://files.pythonhosted.org/packages/07/eb/3b65499b568e01f36e847cebdc8d7ccb51fff716dbda1ae83c3cbb8ca1c9/yarl-1.18.3-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b643562c12680b01e17239be267bc306bbc6aac1f34f6444d1bded0c5ce438ca", size = 356623 }, + { url = "https://files.pythonhosted.org/packages/33/46/f559dc184280b745fc76ec6b1954de2c55595f0ec0a7614238b9ebf69618/yarl-1.18.3-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c017a3b6df3a1bd45b9fa49a0f54005e53fbcad16633870104b66fa1a30a29d8", size = 354007 }, + { url = "https://files.pythonhosted.org/packages/af/ba/1865d85212351ad160f19fb99808acf23aab9a0f8ff31c8c9f1b4d671fc9/yarl-1.18.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:75674776d96d7b851b6498f17824ba17849d790a44d282929c42dbb77d4f17ae", size = 344145 }, + { url = "https://files.pythonhosted.org/packages/94/cb/5c3e975d77755d7b3d5193e92056b19d83752ea2da7ab394e22260a7b824/yarl-1.18.3-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ccaa3a4b521b780a7e771cc336a2dba389a0861592bbce09a476190bb0c8b4b3", size = 336133 }, + { url = "https://files.pythonhosted.org/packages/19/89/b77d3fd249ab52a5c40859815765d35c91425b6bb82e7427ab2f78f5ff55/yarl-1.18.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:2d06d3005e668744e11ed80812e61efd77d70bb7f03e33c1598c301eea20efbb", size = 347967 }, + { url = "https://files.pythonhosted.org/packages/35/bd/f6b7630ba2cc06c319c3235634c582a6ab014d52311e7d7c22f9518189b5/yarl-1.18.3-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:9d41beda9dc97ca9ab0b9888cb71f7539124bc05df02c0cff6e5acc5a19dcc6e", size = 346397 }, + { url = "https://files.pythonhosted.org/packages/18/1a/0b4e367d5a72d1f095318344848e93ea70da728118221f84f1bf6c1e39e7/yarl-1.18.3-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:ba23302c0c61a9999784e73809427c9dbedd79f66a13d84ad1b1943802eaaf59", size = 350206 }, + { url = "https://files.pythonhosted.org/packages/b5/cf/320fff4367341fb77809a2d8d7fe75b5d323a8e1b35710aafe41fdbf327b/yarl-1.18.3-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:6748dbf9bfa5ba1afcc7556b71cda0d7ce5f24768043a02a58846e4a443d808d", size = 362089 }, + { url = "https://files.pythonhosted.org/packages/57/cf/aadba261d8b920253204085268bad5e8cdd86b50162fcb1b10c10834885a/yarl-1.18.3-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:0b0cad37311123211dc91eadcb322ef4d4a66008d3e1bdc404808992260e1a0e", size = 366267 }, + { url = "https://files.pythonhosted.org/packages/54/58/fb4cadd81acdee6dafe14abeb258f876e4dd410518099ae9a35c88d8097c/yarl-1.18.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = 
"sha256:0fb2171a4486bb075316ee754c6d8382ea6eb8b399d4ec62fde2b591f879778a", size = 359141 }, + { url = "https://files.pythonhosted.org/packages/9a/7a/4c571597589da4cd5c14ed2a0b17ac56ec9ee7ee615013f74653169e702d/yarl-1.18.3-cp311-cp311-win32.whl", hash = "sha256:61b1a825a13bef4a5f10b1885245377d3cd0bf87cba068e1d9a88c2ae36880e1", size = 84402 }, + { url = "https://files.pythonhosted.org/packages/ae/7b/8600250b3d89b625f1121d897062f629883c2f45339623b69b1747ec65fa/yarl-1.18.3-cp311-cp311-win_amd64.whl", hash = "sha256:b9d60031cf568c627d028239693fd718025719c02c9f55df0a53e587aab951b5", size = 91030 }, + { url = "https://files.pythonhosted.org/packages/33/85/bd2e2729752ff4c77338e0102914897512e92496375e079ce0150a6dc306/yarl-1.18.3-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:1dd4bdd05407ced96fed3d7f25dbbf88d2ffb045a0db60dbc247f5b3c5c25d50", size = 142644 }, + { url = "https://files.pythonhosted.org/packages/ff/74/1178322cc0f10288d7eefa6e4a85d8d2e28187ccab13d5b844e8b5d7c88d/yarl-1.18.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:7c33dd1931a95e5d9a772d0ac5e44cac8957eaf58e3c8da8c1414de7dd27c576", size = 94962 }, + { url = "https://files.pythonhosted.org/packages/be/75/79c6acc0261e2c2ae8a1c41cf12265e91628c8c58ae91f5ff59e29c0787f/yarl-1.18.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:25b411eddcfd56a2f0cd6a384e9f4f7aa3efee14b188de13048c25b5e91f1640", size = 92795 }, + { url = "https://files.pythonhosted.org/packages/6b/32/927b2d67a412c31199e83fefdce6e645247b4fb164aa1ecb35a0f9eb2058/yarl-1.18.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:436c4fc0a4d66b2badc6c5fc5ef4e47bb10e4fd9bf0c79524ac719a01f3607c2", size = 332368 }, + { url = "https://files.pythonhosted.org/packages/19/e5/859fca07169d6eceeaa4fde1997c91d8abde4e9a7c018e371640c2da2b71/yarl-1.18.3-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e35ef8683211db69ffe129a25d5634319a677570ab6b2eba4afa860f54eeaf75", size = 342314 }, + { url = "https://files.pythonhosted.org/packages/08/75/76b63ccd91c9e03ab213ef27ae6add2e3400e77e5cdddf8ed2dbc36e3f21/yarl-1.18.3-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:84b2deecba4a3f1a398df819151eb72d29bfeb3b69abb145a00ddc8d30094512", size = 341987 }, + { url = "https://files.pythonhosted.org/packages/1a/e1/a097d5755d3ea8479a42856f51d97eeff7a3a7160593332d98f2709b3580/yarl-1.18.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:00e5a1fea0fd4f5bfa7440a47eff01d9822a65b4488f7cff83155a0f31a2ecba", size = 336914 }, + { url = "https://files.pythonhosted.org/packages/0b/42/e1b4d0e396b7987feceebe565286c27bc085bf07d61a59508cdaf2d45e63/yarl-1.18.3-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d0e883008013c0e4aef84dcfe2a0b172c4d23c2669412cf5b3371003941f72bb", size = 325765 }, + { url = "https://files.pythonhosted.org/packages/7e/18/03a5834ccc9177f97ca1bbb245b93c13e58e8225276f01eedc4cc98ab820/yarl-1.18.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:5a3f356548e34a70b0172d8890006c37be92995f62d95a07b4a42e90fba54272", size = 344444 }, + { url = "https://files.pythonhosted.org/packages/c8/03/a713633bdde0640b0472aa197b5b86e90fbc4c5bc05b727b714cd8a40e6d/yarl-1.18.3-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:ccd17349166b1bee6e529b4add61727d3f55edb7babbe4069b5764c9587a8cc6", size = 340760 }, + { url = 
"https://files.pythonhosted.org/packages/eb/99/f6567e3f3bbad8fd101886ea0276c68ecb86a2b58be0f64077396cd4b95e/yarl-1.18.3-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:b958ddd075ddba5b09bb0be8a6d9906d2ce933aee81100db289badbeb966f54e", size = 346484 }, + { url = "https://files.pythonhosted.org/packages/8e/a9/84717c896b2fc6cb15bd4eecd64e34a2f0a9fd6669e69170c73a8b46795a/yarl-1.18.3-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:c7d79f7d9aabd6011004e33b22bc13056a3e3fb54794d138af57f5ee9d9032cb", size = 359864 }, + { url = "https://files.pythonhosted.org/packages/1e/2e/d0f5f1bef7ee93ed17e739ec8dbcb47794af891f7d165fa6014517b48169/yarl-1.18.3-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:4891ed92157e5430874dad17b15eb1fda57627710756c27422200c52d8a4e393", size = 364537 }, + { url = "https://files.pythonhosted.org/packages/97/8a/568d07c5d4964da5b02621a517532adb8ec5ba181ad1687191fffeda0ab6/yarl-1.18.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:ce1af883b94304f493698b00d0f006d56aea98aeb49d75ec7d98cd4a777e9285", size = 357861 }, + { url = "https://files.pythonhosted.org/packages/7d/e3/924c3f64b6b3077889df9a1ece1ed8947e7b61b0a933f2ec93041990a677/yarl-1.18.3-cp312-cp312-win32.whl", hash = "sha256:f91c4803173928a25e1a55b943c81f55b8872f0018be83e3ad4938adffb77dd2", size = 84097 }, + { url = "https://files.pythonhosted.org/packages/34/45/0e055320daaabfc169b21ff6174567b2c910c45617b0d79c68d7ab349b02/yarl-1.18.3-cp312-cp312-win_amd64.whl", hash = "sha256:7e2ee16578af3b52ac2f334c3b1f92262f47e02cc6193c598502bd46f5cd1477", size = 90399 }, + { url = "https://files.pythonhosted.org/packages/f5/4b/a06e0ec3d155924f77835ed2d167ebd3b211a7b0853da1cf8d8414d784ef/yarl-1.18.3-py3-none-any.whl", hash = "sha256:b57f4f58099328dfb26c6a771d09fb20dbbae81d20cfb66141251ea063bd101b", size = 45109 }, +] From 230aa256417e0d22b7e11a295c4674aaeba3a00d Mon Sep 17 00:00:00 2001 From: Hugo Larcher Date: Wed, 19 Feb 2025 21:09:12 +0100 Subject: [PATCH 097/183] feat: Add the parsing of HF_HUB_USER_AGENT_ORIGIN environment variable for telemetry (#3027) * feat: Add the parsing of HF_HUB_USER_AGENT_ORIGIN environment variable to add info about the environment running TGI. That is useful to track usage in case of collaborations for example. 
* fix: trufflehog --- .github/workflows/trufflehog.yaml | 3 ++- router/src/server.rs | 3 +++ router/src/usage_stats.rs | 3 +++ 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/.github/workflows/trufflehog.yaml b/.github/workflows/trufflehog.yaml index 024702b60..9f1c5f36d 100644 --- a/.github/workflows/trufflehog.yaml +++ b/.github/workflows/trufflehog.yaml @@ -17,4 +17,5 @@ jobs: - name: Secret Scanning uses: trufflesecurity/trufflehog@853e1e8d249fd1e29d0fcc7280d29b03df3d643d with: - extra_args: --results=verified,unknown + # exclude buggy postgres detector that is causing false positives and not relevant to our codebase + extra_args: --results=verified,unknown --exclude-detectors=postgres diff --git a/router/src/server.rs b/router/src/server.rs index 9e57af275..e9d2fcf47 100644 --- a/router/src/server.rs +++ b/router/src/server.rs @@ -1877,6 +1877,8 @@ pub async fn run( // Only send usage stats when TGI is run in container and the function returns Some let is_container = matches!(usage_stats::is_container(), Ok(true)); + // retrieve the huggingface_hub user agent origin if set, and add the origin to telemetry + let origin = std::env::var("HF_HUB_USER_AGENT_ORIGIN").ok(); let user_agent = match (usage_stats_level, is_container) { (usage_stats::UsageStatsLevel::On | usage_stats::UsageStatsLevel::NoStack, true) => { let reduced_args = usage_stats::Args::new( @@ -1899,6 +1901,7 @@ pub async fn run( max_client_batch_size, usage_stats_level, backend.name(), + origin, ); Some(usage_stats::UserAgent::new(reduced_args)) } diff --git a/router/src/usage_stats.rs b/router/src/usage_stats.rs index c3df0c80a..353e9e378 100644 --- a/router/src/usage_stats.rs +++ b/router/src/usage_stats.rs @@ -98,6 +98,7 @@ pub struct Args { max_client_batch_size: usize, usage_stats_level: UsageStatsLevel, backend_name: &'static str, + origin: Option, } impl Args { @@ -122,6 +123,7 @@ impl Args { max_client_batch_size: usize, usage_stats_level: UsageStatsLevel, backend_name: &'static str, + origin: Option, ) -> Self { Self { model_config, @@ -143,6 +145,7 @@ impl Args { max_client_batch_size, usage_stats_level, backend_name, + origin, } } } From feaa2477b74d3abc03fae7c7251cc94a46d10bd8 Mon Sep 17 00:00:00 2001 From: "Wang, Yi" Date: Thu, 20 Feb 2025 16:12:28 +0800 Subject: [PATCH 098/183] update ipex and torch to 2.6 for cpu (#3039) ipex cpu 2.6 support topk_group in moe fusion ops Signed-off-by: Wang, Yi A --- Dockerfile_intel | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/Dockerfile_intel b/Dockerfile_intel index 7f61d8e87..134a6f661 100644 --- a/Dockerfile_intel +++ b/Dockerfile_intel @@ -185,22 +185,14 @@ RUN case ${TARGETPLATFORM} in \ RUN conda install -c conda-forge gperftools mkl - -RUN pip install https://download.pytorch.org/whl/nightly/cpu/torch-2.5.0.dev20240815%2Bcpu-cp311-cp311-linux_x86_64.whl -RUN pip install https://download.pytorch.org/whl/nightly/cpu/torchvision-0.20.0.dev20240815%2Bcpu-cp311-cp311-linux_x86_64.whl -RUN pip install https://download.pytorch.org/whl/nightly/cpu/torchaudio-2.4.0.dev20240815%2Bcpu-cp311-cp311-linux_x86_64.whl - +RUN pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cpu RUN pip install triton==3.1.0 py-libnuma WORKDIR /usr/src -RUN git clone https://github.com/intel/intel-extension-for-pytorch && cd intel-extension-for-pytorch && git checkout b7b552baf64283b594665b8687430fe92990e497 -RUN git clone https://github.com/intel/torch-ccl.git && cd torch-ccl && git checkout 
v2.4.0+cpu+rc0 +RUN pip install https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_stable/cpu/intel_extension_for_pytorch-2.6.0%2Bcpu-cp311-cp311-linux_x86_64.whl +RUN pip install https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_stable/cpu/oneccl_bind_pt-2.6.0%2Bcpu-cp311-cp311-linux_x86_64.whl -RUN sed -i 's/VERSION_MINOR 6/VERSION_MINOR 5/' intel-extension-for-pytorch/version.txt -RUN cd intel-extension-for-pytorch && git submodule sync && git submodule update --init --recursive && python setup.py install - -RUN cd torch-ccl && git submodule sync && git submodule update --init --recursive && pip install . ENV LD_PRELOAD=/opt/conda/lib/libtcmalloc.so ENV CCL_ROOT=/opt/conda/lib/python3.11/site-packages/oneccl_bindings_for_pytorch From ed96ba650389c2a12bdc26541ecc514a21fa57d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Thu, 20 Feb 2025 12:34:20 +0100 Subject: [PATCH 099/183] flashinfer 0.2.0.post1 -> post2 (#3040) * flashinfer 0.2.0.post1 -> post2 * Fix ruff stuff. --------- Co-authored-by: Nicolas Patry --- clients/python/tests/test_client.py | 2 +- flake.lock | 8 ++++---- flake.nix | 2 +- server/Makefile-flashinfer | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/clients/python/tests/test_client.py b/clients/python/tests/test_client.py index f6ffc89d0..0c702c636 100644 --- a/clients/python/tests/test_client.py +++ b/clients/python/tests/test_client.py @@ -2,7 +2,7 @@ import pytest from text_generation import Client, AsyncClient from text_generation.errors import NotFoundError, ValidationError -from text_generation.types import FinishReason, InputToken +from text_generation.types import FinishReason def test_generate(llama_7b_url, hf_headers): diff --git a/flake.lock b/flake.lock index 5d6ee4635..512625de4 100644 --- a/flake.lock +++ b/flake.lock @@ -978,16 +978,16 @@ "nixpkgs": "nixpkgs_6" }, "locked": { - "lastModified": 1739803255, - "narHash": "sha256-lreIfcjSt6D0wOuZ6jm3WEBYvYvED63T+pOKmOgBLi8=", + "lastModified": 1740036032, + "narHash": "sha256-nqo3U8uNlFIgrOz8wCfgk08Oi+RzQxxFDPipeVMyM/E=", "owner": "huggingface", "repo": "text-generation-inference-nix", - "rev": "30ab7423277fc93c8fc0ca4df737478ebfdb8eec", + "rev": "e9fb0e818a7e9a54cdab8d9c7c0cef5037fe084a", "type": "github" }, "original": { "owner": "huggingface", - "ref": "eetq-0.0.1", + "ref": "flashinfer-0.2.0.post2", "repo": "text-generation-inference-nix", "type": "github" } diff --git a/flake.nix b/flake.nix index 5ba501141..6068dc5f7 100644 --- a/flake.nix +++ b/flake.nix @@ -5,7 +5,7 @@ inputs.nixpkgs.follows = "tgi-nix/nixpkgs"; }; nix-filter.url = "github:numtide/nix-filter"; - tgi-nix.url = "github:huggingface/text-generation-inference-nix/eetq-0.0.1"; + tgi-nix.url = "github:huggingface/text-generation-inference-nix/flashinfer-0.2.0.post2"; nixpkgs.follows = "tgi-nix/nixpkgs"; flake-utils.url = "github:numtide/flake-utils"; rust-overlay = { diff --git a/server/Makefile-flashinfer b/server/Makefile-flashinfer index d095d841d..f311a6569 100644 --- a/server/Makefile-flashinfer +++ b/server/Makefile-flashinfer @@ -3,4 +3,4 @@ install-flashinfer: # `pip install flashinfer` cannot resolve it. 
uv pip install fsspec sympy==1.13.1 numpy uv pip install -U setuptools - TORCH_CUDA_ARCH_LIST="8.0;8.6;8.9;9.0+PTX" FLASHINFER_ENABLE_AOT=1 pip install git+https://github.com/flashinfer-ai/flashinfer.git@v0.2.0.post1#egg=flashinfer --no-build-isolation + TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;8.9;9.0+PTX" FLASHINFER_ENABLE_AOT=1 pip install git+https://github.com/flashinfer-ai/flashinfer.git@v0.2.0.post2#egg=flashinfer-python --no-build-isolation From 06dfe9abfe958d87a1f4aa1fe960c765e2deb900 Mon Sep 17 00:00:00 2001 From: "Wang, Yi" Date: Fri, 21 Feb 2025 07:36:45 +0800 Subject: [PATCH 100/183] fix qwen2 vl crash in continuous batching (#3004) Signed-off-by: Wang, Yi A --- .../models/flash_causal_lm.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/server/text_generation_server/models/flash_causal_lm.py b/server/text_generation_server/models/flash_causal_lm.py index f268e4995..e268af8b4 100644 --- a/server/text_generation_server/models/flash_causal_lm.py +++ b/server/text_generation_server/models/flash_causal_lm.py @@ -750,7 +750,16 @@ class FlashCausalLMBatch(Batch): adapter_segment_builder = None else: input_ids = batches[0].input_ids.new_empty(total_batch_size) - position_ids = batches[0].position_ids.new_empty(total_batch_size) + if ( + batches[0].position_ids is not None + and batches[0].position_ids.dim() == 2 + ): + # Qwen2_vl case: + position_ids = batches[0].position_ids.new_empty( + (total_batch_size, batches[0].position_ids.shape[-1]) + ) + else: + position_ids = batches[0].position_ids.new_empty(total_batch_size) slot_indices = batches[0].slot_indices.new_empty(total_batch_size) input_lengths_tensor = batches[0].input_lengths_tensor.new_empty( total_batch_size @@ -2133,7 +2142,11 @@ class FlashCausalLM(Model): if not prefill or (prefill and finished_prefilling): batch.input_ids = next_input_ids[cu_accepted_ids[1:] - 1] batch.speculative_ids = speculative_ids - batch.position_ids += accepted_ids + if batch.position_ids.dim() == 2: + # Qwen2_vl case: + batch.position_ids += accepted_ids.unsqueeze(-1) + else: + batch.position_ids += accepted_ids batch.cache_lengths_tensor += batch.input_lengths_tensor + accepted_ids - 1 batch.input_lengths_tensor = torch.ones_like(batch.input_lengths_tensor) batch.slot_indices += accepted_ids From 142a49a80d72bc2982835c9bb834d4fa40e75a3c Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Fri, 21 Feb 2025 10:03:40 +0100 Subject: [PATCH 101/183] Simplify logs2. (#3045) * Simplify logs2. * Changing the scope from module to session to fix the event_loop issue.
--- integration-tests/conftest.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/integration-tests/conftest.py b/integration-tests/conftest.py index 2d3ae8a2c..8440df34b 100644 --- a/integration-tests/conftest.py +++ b/integration-tests/conftest.py @@ -81,6 +81,8 @@ def container_log(request: SubRequest): if request.session.testsfailed: error_log.seek(0) print(error_log.read(), file=sys.stderr) + else: + error_log.truncate(0) class ResponseComparator(JSONSnapshotExtension): @@ -350,13 +352,13 @@ def ignore_logprob_response_snapshot(snapshot): return snapshot.use_extension(IgnoreLogProbResponseComparator) -@pytest.fixture(scope="module") +@pytest.fixture(scope="session") def error_log(): with tempfile.TemporaryFile("w+") as tmp: yield tmp -@pytest.fixture(scope="module") +@pytest.fixture(scope="session") async def launcher(error_log): @contextlib.contextmanager def local_launcher( From 3498f6085edee54d8e7d659ccfc2f784cd0fe547 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Fri, 21 Feb 2025 10:11:28 +0100 Subject: [PATCH 102/183] Update Gradio ChatInterface configuration in consuming_tgi.md (#3042) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The current code does not work and gives the following message: UserWarning: You have not specified a value for the `type` parameter. Defaulting to the 'tuples' format for chatbot messages, but this is deprecated and will be removed in a future version of Gradio. Please set type='messages' instead, which uses openai-style dictionaries with 'role' and 'content' keys. warnings.warn( Traceback (most recent call last): File "/Users/angt/hf/tgi/test-gradio.py", line 22, in gr.ChatInterface( TypeError: ChatInterface.__init__() got an unexpected keyword argument 'retry_btn' Signed-off-by: Adrien Gallouët --- docs/source/basic_tutorials/consuming_tgi.md | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/docs/source/basic_tutorials/consuming_tgi.md b/docs/source/basic_tutorials/consuming_tgi.md index b07e7219d..56237f9ad 100644 --- a/docs/source/basic_tutorials/consuming_tgi.md +++ b/docs/source/basic_tutorials/consuming_tgi.md @@ -152,14 +152,10 @@ def inference(message, history): gr.ChatInterface( inference, - chatbot=gr.Chatbot(height=300), - textbox=gr.Textbox(placeholder="Chat with me!", container=False, scale=7), + type="messages", description="This is the demo for Gradio UI consuming TGI endpoint.", title="Gradio 🤝 TGI", examples=["Are tomatoes vegetables?"], - retry_btn="Retry", - undo_btn="Undo", - clear_btn="Clear", ).queue().launch() ``` From 1cae3197c448baf6118c6bb3ea63af02f61db2ea Mon Sep 17 00:00:00 2001 From: drbh Date: Fri, 21 Feb 2025 04:30:29 -0500 Subject: [PATCH 103/183] Improve tool call message processing (#3036) * make content field optional in chat request * add tool_calls field to Message struct * feat: add test and serialize tool messages * fix: bump utopia, openapi doc version and improve test * fix: rerun update docs * fix: suppoer tool call id in template and remove unnecessary changes * fix: ruff lint remove unused import * fix: adjust message types in tests --------- Co-authored-by: sailesh duddupudi --- docs/openapi.json | 70 +++-- .../test_flash_llama_tool_reply_response.json | 26 ++ integration-tests/models/test_tools_llama.py | 38 +++ router/src/infer/chat_template.rs | 254 ++++++++++++++++-- router/src/lib.rs | 89 ++++-- router/src/server.rs | 3 +- router/src/vertex.rs | 10 +- 7 files changed, 429 insertions(+), 61 
deletions(-) create mode 100644 integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_tool_reply_response.json diff --git a/docs/openapi.json b/docs/openapi.json index a1df080b4..9de76e470 100644 --- a/docs/openapi.json +++ b/docs/openapi.json @@ -1865,25 +1865,57 @@ } }, "Message": { - "type": "object", - "required": [ - "role", - "content" - ], - "properties": { - "content": { - "$ref": "#/components/schemas/MessageContent" + "allOf": [ + { + "$ref": "#/components/schemas/MessageBody" }, - "name": { - "type": "string", - "example": "\"David\"", - "nullable": true - }, - "role": { - "type": "string", - "example": "user" + { + "type": "object", + "required": [ + "role" + ], + "properties": { + "name": { + "type": "string", + "example": "\"David\"", + "nullable": true + }, + "role": { + "type": "string", + "example": "user" + } + } } - } + ] + }, + "MessageBody": { + "oneOf": [ + { + "type": "object", + "required": [ + "content" + ], + "properties": { + "content": { + "$ref": "#/components/schemas/MessageContent" + } + } + }, + { + "type": "object", + "required": [ + "tool_calls" + ], + "properties": { + "tool_calls": { + "type": "array", + "items": { + "$ref": "#/components/schemas/ToolCall" + } + } + } + } + ] }, "MessageChunk": { "oneOf": [ @@ -2179,6 +2211,10 @@ "role": { "type": "string", "example": "user" + }, + "tool_call_id": { + "type": "string", + "nullable": true } } }, diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_tool_reply_response.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_tool_reply_response.json new file mode 100644 index 000000000..4f10aa3b9 --- /dev/null +++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_tool_reply_response.json @@ -0,0 +1,26 @@ +{ + "choices": [ + { + "finish_reason": "stop", + "index": 0, + "logprobs": null, + "message": { + "content": "I can't access real-time data, but I can provide you with current conditions and forecast for Paris, France:\n\nThe current conditions in Paris are mostly cloudy with a temperature of 6.7°C (44.1°F). 
\n\nPlease note that the actual weather may differ from this information, and I recommend checking the forecast on a reliable weather website for the most up-to-date information.", + "name": null, + "role": "assistant", + "tool_calls": null + }, + "usage": null + } + ], + "created": 1739932427, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion", + "system_fingerprint": "3.1.1-dev0-native", + "usage": { + "completion_tokens": 79, + "prompt_tokens": 103, + "total_tokens": 182 + } +} diff --git a/integration-tests/models/test_tools_llama.py b/integration-tests/models/test_tools_llama.py index 70c3aff0c..b8a90cff9 100644 --- a/integration-tests/models/test_tools_llama.py +++ b/integration-tests/models/test_tools_llama.py @@ -468,3 +468,41 @@ async def test_flash_llama_grammar_tools_sea_creatures_stream_function_object( == '{"function": {"_name": "get_n_day_weather_forecast", "location": "San Francisco, CA", "format": "celsius", "num_days":3}}<|eot_id|>' ) assert last_response == response_snapshot + + +@pytest.mark.asyncio +@pytest.mark.private +async def test_flash_llama_tool_reply_response( + flash_llama_grammar_tools, response_snapshot +): + responses = await flash_llama_grammar_tools.chat( + max_tokens=100, + seed=42, + messages=[ + {"role": "user", "content": "What's the weather like in Paris today?"}, + { + "role": "assistant", + "tool_calls": [ + { + "id": "0", + "function": { + "arguments": '{"longitude": 2.2945, "latitude": 48.8567}', + "name": "get_weather", + "description": None, + }, + "type": "function", + } + ], + }, + {"role": "tool", "tool_call_id": "0", "content": "6.7"}, + ], + stream=False, + ) + + assert responses.choices[0].message.tool_calls is None + assert ( + responses.choices[0].message.content + == "I can't access real-time data, but I can provide you with current conditions and forecast for Paris, France:\n\nThe current conditions in Paris are mostly cloudy with a temperature of 6.7°C (44.1°F). \n\nPlease note that the actual weather may differ from this information, and I recommend checking the forecast on a reliable weather website for the most up-to-date information." 
+ ) + + assert responses == response_snapshot diff --git a/router/src/infer/chat_template.rs b/router/src/infer/chat_template.rs index 2fef2dcb8..e660cc74d 100644 --- a/router/src/infer/chat_template.rs +++ b/router/src/infer/chat_template.rs @@ -1,5 +1,7 @@ use crate::infer::InferError; -use crate::{ChatTemplateInputs, Message, MessageChunk, TextMessage, TokenizerConfigToken, Tool}; +use crate::{ + ChatTemplateInputs, Message, MessageBody, MessageChunk, TextMessage, TokenizerConfigToken, Tool, +}; use chrono::Local; use minijinja::{Environment, ErrorKind, Template}; use minijinja_contrib::pycompat; @@ -74,7 +76,9 @@ impl ChatTemplate { format!("\n---\n{}", tool_prompt) }; if let Some(last_message) = messages.last_mut() { - last_message.content.push(MessageChunk::Text { text }); + if let MessageBody::Content { content } = &mut last_message.body { + content.push(MessageChunk::Text { text }); + } } Some(tools) } @@ -119,7 +123,8 @@ mod tests { use crate::infer::chat_template::{raise_exception, strftime_now}; use crate::infer::ChatTemplate; use crate::{ - ChatTemplateInputs, Message, MessageContent, TextMessage, TokenizerConfigToken, Tool, + ChatTemplateInputs, Message, MessageBody, MessageContent, TextMessage, + TokenizerConfigToken, Tool, }; use chrono::Local; use minijinja::Environment; @@ -158,18 +163,22 @@ mod tests { TextMessage { role: "user".to_string(), content: "Hi!".to_string(), + ..Default::default() }, TextMessage { role: "assistant".to_string(), content: "Hello how can I help?".to_string(), + ..Default::default() }, TextMessage { role: "user".to_string(), content: "What is Deep Learning?".to_string(), + ..Default::default() }, TextMessage { role: "assistant".to_string(), content: "magic!".to_string(), + ..Default::default() }, ], bos_token: Some("[BOS]"), @@ -186,6 +195,182 @@ mod tests { ); } + #[test] + fn test_chat_template_with_tool_response() { + let env = Environment::new(); + + // template modified from Llama-3.1-8B-Instruct + // https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct/blob/0e9e39f249a16976918f6564b8830bc894c89659/tokenizer_config.json#L2053 + // the main change is accesing `message.tool_call_id` from the messages + let source = r#" + {{- bos_token }} + {%- if custom_tools is defined %} + {%- set tools = custom_tools %} + {%- endif %} + {%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} + {%- endif %} + {%- if not date_string is defined %} + {%- set date_string = "26 Jul 2024" %} + {%- endif %} + {%- if not tools is defined %} + {%- set tools = none %} + {%- endif %} + + {#- This block extracts the system message, so we can slot it into the right place. #} + {%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {%- set system_message = "" %} + {%- endif %} + + {#- System message + builtin tools #} + {{- "<|start_header_id|>system<|end_header_id|>\n\n" }} + {%- if builtin_tools is defined or tools is not none %} + {{- "Environment: ipython\n" }} + {%- endif %} + {%- if builtin_tools is defined %} + {{- "Tools: " + builtin_tools | reject('equalto', 'code_interpreter') | join(", ") + "\n\n"}} + {%- endif %} + {{- "Cutting Knowledge Date: December 2023\n" }} + {{- "Today Date: " + date_string + "\n\n" }} + {%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." 
}} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {%- endif %} + {{- system_message }} + {{- "<|eot_id|>" }} + + {#- Custom tools are passed in a user message with some extra guidance #} + {%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} + {%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} + {%- endif %} + + {%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {%- if builtin_tools is defined and tool_call.name in builtin_tools %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- "<|python_tag|>" + tool_call.name + ".call(" }} + {%- for arg_name, arg_val in tool_call.arguments | items %} + {{- arg_name + '="' + arg_val + '"' }} + {%- if not loop.last %} + {{- ", " }} + {%- endif %} + {%- endfor %} + {{- ")" }} + {%- else %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {%- endif %} + {%- if builtin_tools is defined %} + {#- This means we're in ipython mode #} + {{- "<|eom_id|>" }} + {%- else %} + {{- "<|eot_id|>" }} + {%- endif %} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {{- "TOOL CALL ID: " + message.tool_call_id + "\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} + {%- endfor %} + {%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} + {%- endif %} + "#; + + // trim all the whitespace + let source = source + .lines() + .map(|line| line.trim()) + .collect::>() + .join(""); + + let tmpl = env.template_from_str(&source); + + let chat_template_inputs = ChatTemplateInputs { + messages: vec![ + TextMessage { + role: "user".to_string(), + content: "Hi!".to_string(), + ..Default::default() + }, + TextMessage { + role: "assistant".to_string(), + content: r#"[ { "id": "0", "function": { "arguments": '{"longitude": 2.2945, "latitude": 48.8567}', "name": "get_weather", 
"description": None, }, "type": "function", } ]"#.to_string(), + ..Default::default() + }, + TextMessage { + role: "tool".to_string(), + content: "6.7".to_string(), + tool_call_id: Some("0".to_string()), + }, + ], + bos_token: Some("[BOS]"), + eos_token: Some("[EOS]"), + add_generation_prompt: true, + ..Default::default() + }; + + let result = tmpl.unwrap().render(chat_template_inputs).unwrap(); + + assert_eq!( + result, + r#"[BOS]<|start_header_id|>system<|end_header_id|> + +Cutting Knowledge Date: December 2023 +Today Date: 26 Jul 2024 + +<|eot_id|><|start_header_id|>user<|end_header_id|> + +Hi!<|eot_id|><|start_header_id|>assistant<|end_header_id|> + +[ { "id": "0", "function": { "arguments": '{"longitude": 2.2945, "latitude": 48.8567}', "name": "get_weather", "description": None, }, "type": "function", } ]<|eot_id|><|start_header_id|>ipython<|end_header_id|> + +TOOL CALL ID: 0 + +"6.7"<|eot_id|><|start_header_id|>assistant<|end_header_id|> + +"# + ); + } + #[test] fn test_chat_template_loop_controls() { // some chat templates as e.g. CohereForAI/c4ai-command-r7b-12-202 contain `break` @@ -224,18 +409,22 @@ mod tests { TextMessage { role: "user".to_string(), content: "Hi!".to_string(), + ..Default::default() }, TextMessage { role: "assistant".to_string(), content: "Hello how can I help?".to_string(), + ..Default::default() }, TextMessage { role: "user".to_string(), content: "What is Deep Learning?".to_string(), + ..Default::default() }, TextMessage { role: "assistant".to_string(), content: "magic!".to_string(), + ..Default::default() }, ], bos_token: Some("[BOS]"), @@ -287,22 +476,27 @@ mod tests { TextMessage { role: "user".to_string(), content: "Hi!".to_string(), + ..Default::default() }, TextMessage { role: "user".to_string(), content: "Hi again!".to_string(), + ..Default::default() }, TextMessage { role: "assistant".to_string(), content: "Hello how can I help?".to_string(), + ..Default::default() }, TextMessage { role: "user".to_string(), content: "What is Deep Learning?".to_string(), + ..Default::default() }, TextMessage { role: "assistant".to_string(), content: "magic!".to_string(), + ..Default::default() }, ], bos_token: Some("[BOS]"), @@ -359,18 +553,22 @@ mod tests { TextMessage { role: "user".to_string(), content: "Hi!".to_string(), + ..Default::default() }, TextMessage { role: "assistant".to_string(), content: "Hello how can I help?".to_string(), + ..Default::default() }, TextMessage { role: "user".to_string(), content: "What is Deep Learning?".to_string(), + ..Default::default() }, TextMessage { role: "assistant".to_string(), content: "magic!".to_string(), + ..Default::default() }, ], bos_token: Some("[BOS]"), @@ -426,18 +624,22 @@ mod tests { TextMessage { role: "user".to_string(), content: "Hi!".to_string(), + ..Default::default() }, TextMessage { role: "assistant".to_string(), content: "Hello how can I help?".to_string(), + ..Default::default() }, TextMessage { role: "user".to_string(), content: "What is Deep Learning?".to_string(), + ..Default::default() }, TextMessage { role: "assistant".to_string(), content: "magic!".to_string(), + ..Default::default() }, ], bos_token: Some("[BOS]"), @@ -479,18 +681,22 @@ mod tests { TextMessage { role: "user".to_string(), content: "Hi!".to_string(), + ..Default::default() }, TextMessage { role: "assistant".to_string(), content: "Hello how can I help?".to_string(), + ..Default::default() }, TextMessage { role: "user".to_string(), content: "What is Deep Learning?".to_string(), + ..Default::default() }, TextMessage { role: 
"assistant".to_string(), content: "magic!".to_string(), + ..Default::default() }, ], bos_token: Some("[BOS]"), @@ -516,14 +722,17 @@ mod tests { TextMessage { role: "user".to_string(), content: "Hello, how are you?".to_string(), + ..Default::default() }, TextMessage { role: "assistant".to_string(), content: "I'm doing great. How can I help you today?".to_string(), + ..Default::default() }, TextMessage { role: "user".to_string(), content: "I'd like to show off how chat templating works!".to_string(), + ..Default::default() }, ]; @@ -531,6 +740,7 @@ mod tests { role: "system".to_string(), content: "You are a friendly chatbot who always responds in the style of a pirate" .to_string(), + ..Default::default() }] .iter() .chain(&example_chat) @@ -674,10 +884,12 @@ mod tests { TextMessage { role: "system".to_string(), content: "You are a friendly chatbot who always responds in the style of a pirate".to_string(), + ..Default::default() }, TextMessage { role: "user".to_string(), content: "How many helicopters can a human eat in one sitting?".to_string(), + ..Default::default() }, ], add_generation_prompt: true, @@ -949,19 +1161,27 @@ mod tests { Message { name: None, role: "user".to_string(), - content: MessageContent::SingleText( - "I'd like to show off how chat templating works!".to_string(), - ), + body: MessageBody::Content { + content: MessageContent::SingleText( + "I'd like to show off how chat templating works!".to_string(), + ), + }, }, Message { name: None, role: "assistant".to_string(), - content: MessageContent::SingleText("Great! How can I help you today?".to_string()), + body: MessageBody::Content { + content: MessageContent::SingleText( + "Great! How can I help you today?".to_string(), + ), + }, }, Message { name: None, role: "user".to_string(), - content: MessageContent::SingleText("Just testing".to_string()), + body: MessageBody::Content { + content: MessageContent::SingleText("Just testing".to_string()), + }, }, ]; let tools_string = r#"[{"type": "function","function": {"name": "get_current_weather","description": "Get the current weather","parameters": {"type": "object","properties": {"location": {"type": "string","description": "The city and state, e.g. San Francisco, CA"},"format": {"type": "string","enum": ["celsius", "fahrenheit"],"description": "The temperature unit to use. Infer this from the users location."}},"required": ["location", "format"]}}}]"#.to_string(); @@ -985,17 +1205,21 @@ mod tests { Message { name: None, role: "system".to_string(), - content: MessageContent::SingleText( - "Youre a helpful assistant! Answer the users question best you can." - .to_string(), - ), + body: MessageBody::Content { + content: MessageContent::SingleText( + "Youre a helpful assistant! Answer the users question best you can." + .to_string(), + ), + }, }, Message { name: None, role: "user".to_string(), - content: MessageContent::SingleText( - "What is the weather like in Brooklyn, New York?".to_string(), - ), + body: MessageBody::Content { + content: MessageContent::SingleText( + "What is the weather like in Brooklyn, New York?".to_string(), + ), + }, }, ]; let tools_string = r#"[{"type": "function","function": {"name": "get_current_weather","description": "Get the current weather","parameters": {"type": "object","properties": {"location": {"type": "string","description": "The city and state, e.g. San Francisco, CA"},"format": {"type": "string","enum": ["celsius", "fahrenheit"],"description": "The temperature unit to use. 
Infer this from the users location."}},"required": ["location", "format"]}}}]"#.to_string(); diff --git a/router/src/lib.rs b/router/src/lib.rs index 414d38ed6..e8c875a8f 100644 --- a/router/src/lib.rs +++ b/router/src/lib.rs @@ -663,6 +663,7 @@ impl ChatCompletion { (Some(content), None) => OutputMessage::ChatMessage(TextMessage { role: "assistant".into(), content, + ..Default::default() }), (None, Some(tool_calls)) => OutputMessage::ToolCall(ToolCallMessage { role: "assistant".to_string(), @@ -673,6 +674,7 @@ impl ChatCompletion { OutputMessage::ChatMessage(TextMessage { role: "assistant".into(), content: output, + ..Default::default() }) } (None, None) => { @@ -680,6 +682,7 @@ impl ChatCompletion { OutputMessage::ChatMessage(TextMessage { role: "assistant".into(), content: "".to_string(), + ..Default::default() }) } }; @@ -767,6 +770,7 @@ impl ChatCompletionChunk { (Some(delta), _) => ChatCompletionDelta::Chat(TextMessage { role: "assistant".to_string(), content: delta, + ..Default::default() }), (None, Some(tool_calls)) => ChatCompletionDelta::Tool(ToolCallDelta { role: "assistant".to_string(), @@ -783,6 +787,7 @@ impl ChatCompletionChunk { (None, None) => ChatCompletionDelta::Chat(TextMessage { role: "assistant".to_string(), content: "".to_string(), + ..Default::default() }), }; Self { @@ -1025,7 +1030,7 @@ pub fn default_tool_prompt() -> String { "\nGiven the functions available, please respond with a JSON for a function call with its proper arguments that best answers the given prompt. Respond in the format {name: function name, parameters: dictionary of argument name and its value}.Do not use variables.\n".to_string() } -#[derive(Clone, Debug, Deserialize, PartialEq, Serialize)] +#[derive(Clone, Debug, Deserialize, ToSchema, PartialEq, Serialize)] #[serde(tag = "type")] pub enum TypedChoice { #[serde(rename = "function")] @@ -1100,19 +1105,19 @@ pub struct JsonSchemaTool { properties: Properties, } -#[derive(Debug, Serialize, Deserialize, PartialEq)] +#[derive(Debug, Serialize, Deserialize, ToSchema, PartialEq)] struct FunctionsMap { #[serde(rename = "$functions")] functions: std::collections::HashMap, } -#[derive(Debug, Serialize, Deserialize, PartialEq)] +#[derive(Debug, Serialize, Deserialize, ToSchema, PartialEq)] struct FunctionRef { #[serde(rename = "$ref")] ref_path: String, } -#[derive(Debug, Serialize, Deserialize, PartialEq)] +#[derive(Debug, Serialize, Deserialize, ToSchema, PartialEq)] struct Properties { #[serde(serialize_with = "serialize_function")] function: Vec, @@ -1129,7 +1134,7 @@ where } #[derive(Clone, Debug, Deserialize, Serialize, ToSchema, Default, PartialEq)] -pub(crate) struct FunctionDefinition { +pub struct FunctionDefinition { #[serde(default)] pub description: Option, pub name: String, @@ -1157,7 +1162,7 @@ pub(crate) struct ChatTemplateInputs<'a> { } #[derive(Clone, Deserialize, Serialize, ToSchema, Default, Debug, PartialEq)] -pub(crate) struct ToolCall { +pub struct ToolCall { pub id: String, pub r#type: String, pub function: FunctionDefinition, @@ -1176,15 +1181,31 @@ pub enum MessageChunk { ImageUrl { image_url: Url }, } -#[derive(Clone, Deserialize, ToSchema, Serialize, Debug, PartialEq)] +#[derive(Clone, Deserialize, Serialize, ToSchema, Debug, PartialEq)] pub struct Message { #[schema(example = "user")] - role: String, + pub role: String, + #[serde(flatten)] #[schema(example = "My name is David and I")] - pub content: MessageContent, + pub body: MessageBody, #[serde(default, skip_serializing_if = "Option::is_none")] #[schema(example = 
"\"David\"")] - name: Option, + pub name: Option, +} + +#[derive(Clone, Deserialize, Serialize, ToSchema, Debug, PartialEq)] +#[serde(untagged)] +pub enum MessageBody { + // When a regular text message is provided. + Content { + #[serde(rename = "content")] + content: MessageContent, + }, + // When tool calls are provided. + Tool { + #[serde(rename = "tool_calls")] + tool_calls: Vec, + }, } #[derive(Clone, Deserialize, Serialize, ToSchema, Debug, PartialEq)] @@ -1211,19 +1232,28 @@ impl MessageContent { } } -#[derive(Clone, Deserialize, ToSchema, Serialize, Debug, PartialEq)] +#[derive(Clone, Deserialize, ToSchema, Serialize, Debug, PartialEq, Default)] pub struct TextMessage { #[schema(example = "user")] pub role: String, #[schema(example = "My name is David and I")] pub content: String, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub tool_call_id: Option, } impl From for TextMessage { fn from(value: Message) -> Self { + let content = match value.body { + MessageBody::Content { content } => content, + MessageBody::Tool { tool_calls } => { + let content = serde_json::to_string(&tool_calls).unwrap_or_default(); + MessageContent::SingleText(content) + } + }; TextMessage { role: value.role, - content: match value.content { + content: match content { MessageContent::SingleText(text) => text, MessageContent::MultipleChunks(chunks) => chunks .into_iter() @@ -1234,6 +1264,7 @@ impl From for TextMessage { .collect::>() .join(""), }, + ..Default::default() } } } @@ -1565,9 +1596,11 @@ mod tests { assert_eq!( request.messages[0], Message { + name: None, role: "user".to_string(), - content: MessageContent::SingleText("What is Deep Learning?".to_string()), - name: None + body: MessageBody::Content { + content: MessageContent::SingleText("What is Deep Learning?".to_string()) + }, } ); } @@ -1617,13 +1650,16 @@ mod tests { assert_eq!( request.messages[0], - Message{ + Message { + name: None, role: "user".to_string(), - content: MessageContent::MultipleChunks(vec![ - MessageChunk::Text { text: "Whats in this image?".to_string() }, - MessageChunk::ImageUrl { image_url: Url { url: "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png".to_string() }}, - ]), - name: None + + body: MessageBody::Content { + content: MessageContent::MultipleChunks(vec![ + MessageChunk::Text { text: "Whats in this image?".to_string() }, + MessageChunk::ImageUrl { image_url: Url { url: "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png".to_string() }}, + ]), + }, } ); } @@ -1631,12 +1667,14 @@ mod tests { #[test] fn text_message_convert() { let message = Message{ + name: None, role: "user".to_string(), - content: MessageContent::MultipleChunks(vec![ - MessageChunk::Text { text: "Whats in this image?".to_string() }, - MessageChunk::ImageUrl { image_url: Url { url: "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png".to_string() } } - ]), - name: None + body: MessageBody::Content { + content: MessageContent::MultipleChunks(vec![ + MessageChunk::Text { text: "Whats in this image?".to_string() }, + MessageChunk::ImageUrl { image_url: Url { url: "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png".to_string() } } + ]), + } }; let textmsg: TextMessage = message.into(); assert_eq!(textmsg.content, "Whats in this 
image?![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png)"); @@ -1667,6 +1705,7 @@ mod tests { let message = OutputMessage::ChatMessage(TextMessage { role: "assistant".to_string(), content: "This is the answer".to_string(), + ..Default::default() }); let serialized = serde_json::to_string(&message).unwrap(); assert_eq!( diff --git a/router/src/server.rs b/router/src/server.rs index e9d2fcf47..e9aa4612b 100644 --- a/router/src/server.rs +++ b/router/src/server.rs @@ -28,7 +28,7 @@ use crate::{ CompletionRequest, CompletionType, DeltaToolCall, Function, Prompt, Tool, }; use crate::{FunctionDefinition, HubPreprocessorConfig, ToolCall, ToolChoice}; -use crate::{ModelInfo, ModelsInfo}; +use crate::{MessageBody, ModelInfo, ModelsInfo}; use async_stream::__private::AsyncStream; use axum::extract::{DefaultBodyLimit, Extension}; use axum::http::{HeaderMap, HeaderValue, Method, StatusCode}; @@ -1577,6 +1577,7 @@ FunctionDefinition, ToolChoice, ModelInfo, ChatTokenizeResponse, +MessageBody, ) ), tags( diff --git a/router/src/vertex.rs b/router/src/vertex.rs index 0a8c2278a..38695532c 100644 --- a/router/src/vertex.rs +++ b/router/src/vertex.rs @@ -147,7 +147,7 @@ pub(crate) async fn vertex_compatibility( #[cfg(test)] mod tests { use super::*; - use crate::{Message, MessageContent}; + use crate::{Message, MessageBody, MessageContent}; #[test] fn vertex_deserialization() { @@ -169,9 +169,13 @@ mod tests { VertexRequest { instances: vec![VertexInstance::Chat(ChatRequest { messages: vec![Message { - role: "user".to_string(), - content: MessageContent::SingleText("What's Deep Learning?".to_string()), name: None, + role: "user".to_string(), + body: MessageBody::Content { + content: MessageContent::SingleText( + "What's Deep Learning?".to_string() + ) + }, },], max_tokens: Some(128), top_p: Some(0.95), From 97c5f7e685501201c637cf1848482d98a2c1697f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Fri, 21 Feb 2025 13:55:31 +0100 Subject: [PATCH 104/183] Use `rotary` kernel from the Hub (#3041) --- flake.lock | 123 +++----------- flake.nix | 2 +- nix/server.nix | 4 +- server/hf-kernels.lock | 150 ++++++++++++++++++ server/pyproject.toml | 1 + .../text_generation_server/layers/rotary.py | 8 +- .../custom_modeling/flash_cohere_modeling.py | 8 +- .../custom_modeling/flash_gptj_modeling.py | 8 +- 8 files changed, 195 insertions(+), 109 deletions(-) diff --git a/flake.lock b/flake.lock index 512625de4..b6cf7e530 100644 --- a/flake.lock +++ b/flake.lock @@ -2,16 +2,10 @@ "nodes": { "cachix": { "inputs": { - "devenv": [ - "crate2nix" - ], - "flake-compat": [ - "crate2nix" - ], + "devenv": ["crate2nix"], + "flake-compat": ["crate2nix"], "nixpkgs": "nixpkgs", - "pre-commit-hooks": [ - "crate2nix" - ] + "pre-commit-hooks": ["crate2nix"] }, "locked": { "lastModified": 1709700175, @@ -30,19 +24,10 @@ }, "cachix_2": { "inputs": { - "devenv": [ - "crate2nix", - "crate2nix_stable" - ], - "flake-compat": [ - "crate2nix", - "crate2nix_stable" - ], + "devenv": ["crate2nix", "crate2nix_stable"], + "flake-compat": ["crate2nix", "crate2nix_stable"], "nixpkgs": "nixpkgs_2", - "pre-commit-hooks": [ - "crate2nix", - "crate2nix_stable" - ] + "pre-commit-hooks": ["crate2nix", "crate2nix_stable"] }, "locked": { "lastModified": 1716549461, @@ -61,16 +46,8 @@ }, "cachix_3": { "inputs": { - "devenv": [ - "crate2nix", - "crate2nix_stable", - "crate2nix_stable" - ], - "flake-compat": [ - "crate2nix", - "crate2nix_stable", - "crate2nix_stable" - ], + "devenv": 
["crate2nix", "crate2nix_stable", "crate2nix_stable"], + "flake-compat": ["crate2nix", "crate2nix_stable", "crate2nix_stable"], "nixpkgs": "nixpkgs_3", "pre-commit-hooks": [ "crate2nix", @@ -101,10 +78,7 @@ "flake-compat": "flake-compat_3", "flake-parts": "flake-parts_3", "nix-test-runner": "nix-test-runner_3", - "nixpkgs": [ - "tgi-nix", - "nixpkgs" - ], + "nixpkgs": ["tgi-nix", "nixpkgs"], "pre-commit-hooks": "pre-commit-hooks_3" }, "locked": { @@ -219,11 +193,7 @@ "devshell_2": { "inputs": { "flake-utils": "flake-utils_3", - "nixpkgs": [ - "crate2nix", - "crate2nix_stable", - "nixpkgs" - ] + "nixpkgs": ["crate2nix", "crate2nix_stable", "nixpkgs"] }, "locked": { "lastModified": 1717408969, @@ -242,10 +212,7 @@ "devshell_3": { "inputs": { "flake-utils": "flake-utils_4", - "nixpkgs": [ - "crate2nix", - "nixpkgs" - ] + "nixpkgs": ["crate2nix", "nixpkgs"] }, "locked": { "lastModified": 1711099426, @@ -343,11 +310,7 @@ }, "flake-parts_2": { "inputs": { - "nixpkgs-lib": [ - "crate2nix", - "crate2nix_stable", - "nixpkgs" - ] + "nixpkgs-lib": ["crate2nix", "crate2nix_stable", "nixpkgs"] }, "locked": { "lastModified": 1719745305, @@ -365,10 +328,7 @@ }, "flake-parts_3": { "inputs": { - "nixpkgs-lib": [ - "crate2nix", - "nixpkgs" - ] + "nixpkgs-lib": ["crate2nix", "nixpkgs"] }, "locked": { "lastModified": 1712014858, @@ -559,11 +519,7 @@ }, "gitignore_3": { "inputs": { - "nixpkgs": [ - "crate2nix", - "pre-commit-hooks", - "nixpkgs" - ] + "nixpkgs": ["crate2nix", "pre-commit-hooks", "nixpkgs"] }, "locked": { "lastModified": 1709087332, @@ -770,22 +726,10 @@ }, "pre-commit-hooks_2": { "inputs": { - "flake-compat": [ - "crate2nix", - "crate2nix_stable", - "flake-compat" - ], + "flake-compat": ["crate2nix", "crate2nix_stable", "flake-compat"], "gitignore": "gitignore_2", - "nixpkgs": [ - "crate2nix", - "crate2nix_stable", - "nixpkgs" - ], - "nixpkgs-stable": [ - "crate2nix", - "crate2nix_stable", - "nixpkgs" - ] + "nixpkgs": ["crate2nix", "crate2nix_stable", "nixpkgs"], + "nixpkgs-stable": ["crate2nix", "crate2nix_stable", "nixpkgs"] }, "locked": { "lastModified": 1719259945, @@ -803,20 +747,11 @@ }, "pre-commit-hooks_3": { "inputs": { - "flake-compat": [ - "crate2nix", - "flake-compat" - ], + "flake-compat": ["crate2nix", "flake-compat"], "flake-utils": "flake-utils_5", "gitignore": "gitignore_3", - "nixpkgs": [ - "crate2nix", - "nixpkgs" - ], - "nixpkgs-stable": [ - "crate2nix", - "nixpkgs" - ] + "nixpkgs": ["crate2nix", "nixpkgs"], + "nixpkgs-stable": ["crate2nix", "nixpkgs"] }, "locked": { "lastModified": 1712055707, @@ -837,20 +772,14 @@ "crate2nix": "crate2nix", "flake-utils": "flake-utils_6", "nix-filter": "nix-filter", - "nixpkgs": [ - "tgi-nix", - "nixpkgs" - ], + "nixpkgs": ["tgi-nix", "nixpkgs"], "rust-overlay": "rust-overlay", "tgi-nix": "tgi-nix" } }, "rust-overlay": { "inputs": { - "nixpkgs": [ - "tgi-nix", - "nixpkgs" - ] + "nixpkgs": ["tgi-nix", "nixpkgs"] }, "locked": { "lastModified": 1738549608, @@ -978,16 +907,16 @@ "nixpkgs": "nixpkgs_6" }, "locked": { - "lastModified": 1740036032, - "narHash": "sha256-nqo3U8uNlFIgrOz8wCfgk08Oi+RzQxxFDPipeVMyM/E=", + "lastModified": 1740049068, + "narHash": "sha256-heYzYOt+TSnRKHIV24s74yEjLkTbBfjNCWHdQEX++eI=", "owner": "huggingface", "repo": "text-generation-inference-nix", - "rev": "e9fb0e818a7e9a54cdab8d9c7c0cef5037fe084a", + "rev": "143e8451efa22b120f97e6698508e9a0aed82769", "type": "github" }, "original": { "owner": "huggingface", - "ref": "flashinfer-0.2.0.post2", + "ref": "hub-rotary", "repo": "text-generation-inference-nix", "type": 
"github" } diff --git a/flake.nix b/flake.nix index 6068dc5f7..943bf7366 100644 --- a/flake.nix +++ b/flake.nix @@ -5,7 +5,7 @@ inputs.nixpkgs.follows = "tgi-nix/nixpkgs"; }; nix-filter.url = "github:numtide/nix-filter"; - tgi-nix.url = "github:huggingface/text-generation-inference-nix/flashinfer-0.2.0.post2"; + tgi-nix.url = "github:huggingface/text-generation-inference-nix/hub-rotary"; nixpkgs.follows = "tgi-nix/nixpkgs"; flake-utils.url = "github:numtide/flake-utils"; rust-overlay = { diff --git a/nix/server.nix b/nix/server.nix index 98193cacb..0640fe3a1 100644 --- a/nix/server.nix +++ b/nix/server.nix @@ -11,7 +11,6 @@ flashinfer, flash-attn, flash-attn-layer-norm, - flash-attn-rotary, flash-attn-v1, grpc-interceptor, grpcio-reflection, @@ -36,6 +35,7 @@ pydantic, quantization, quantization-eetq, + rotary, safetensors, tokenizers, torch, @@ -87,7 +87,6 @@ buildPythonPackage { flashinfer flash-attn flash-attn-layer-norm - flash-attn-rotary grpc-interceptor grpcio-reflection grpcio-status @@ -111,6 +110,7 @@ buildPythonPackage { pydantic quantization quantization-eetq + rotary safetensors sentencepiece tokenizers diff --git a/server/hf-kernels.lock b/server/hf-kernels.lock index 5254cb0c5..7dc759436 100644 --- a/server/hf-kernels.lock +++ b/server/hf-kernels.lock @@ -6934,5 +6934,155 @@ "blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88" } ] + }, + { + "repo_id": "kernels-community/rotary", + "sha": "4db658e027ec752840bb3f557ee076413b8db03f", + "files": [ + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/rotary/__init__.py", + "blob_id": "eba8039e210c8b710c5c663ef4e7930757f271be" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/rotary/_ops.py", + "blob_id": "4fe035c87ea1300ffedcfce17338167dd946e0e8" + }, + { + "filename": "build/torch25-cxx11-cu118-x86_64-linux/rotary/_rotary_5yzc45v7kk3yu.abi3.so", + "blob_id": "f315754ccb3e8b9dfb4d8954aefaac61b2a4e8bc" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/rotary/__init__.py", + "blob_id": "eba8039e210c8b710c5c663ef4e7930757f271be" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/rotary/_ops.py", + "blob_id": "d45359065a7cdc43e2d512d38fc0bfcd88138835" + }, + { + "filename": "build/torch25-cxx11-cu121-x86_64-linux/rotary/_rotary_tbiepw2a2ep3e.abi3.so", + "blob_id": "9bc986ca760b6a57d05e891c3def1769341a2c29" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/rotary/__init__.py", + "blob_id": "eba8039e210c8b710c5c663ef4e7930757f271be" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/rotary/_ops.py", + "blob_id": "7421978682125139f1169f3f71789e0cb44d3b45" + }, + { + "filename": "build/torch25-cxx11-cu124-x86_64-linux/rotary/_rotary_6w5syhrhmerj6.abi3.so", + "blob_id": "35caf7d755000ca2afac33cc7d4f344112751f9f" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/rotary/__init__.py", + "blob_id": "eba8039e210c8b710c5c663ef4e7930757f271be" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/rotary/_ops.py", + "blob_id": "d6f569b471eae628e738b3504c8a9a18b4973d97" + }, + { + "filename": "build/torch25-cxx98-cu118-x86_64-linux/rotary/_rotary_joujmbgvsytzg.abi3.so", + "blob_id": "30c4aaa83f2549f7363631a51b3341cdf0612f15" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/rotary/__init__.py", + "blob_id": "eba8039e210c8b710c5c663ef4e7930757f271be" + }, + { + "filename": "build/torch25-cxx98-cu121-x86_64-linux/rotary/_ops.py", + "blob_id": "f1f71f34bf0f3c5dffb7c147b48f6396f8054310" + }, + { + "filename": 
"build/torch25-cxx98-cu121-x86_64-linux/rotary/_rotary_mi2o7e7sishyw.abi3.so", + "blob_id": "e022e9c1101bdb89d43913979107f7a56717ea6d" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/rotary/__init__.py", + "blob_id": "eba8039e210c8b710c5c663ef4e7930757f271be" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/rotary/_ops.py", + "blob_id": "a46c19bd5adfb85d5b7795b3b9277e416f31d8ce" + }, + { + "filename": "build/torch25-cxx98-cu124-x86_64-linux/rotary/_rotary_rngiohfhfwuge.abi3.so", + "blob_id": "1621cba0150465f67aa931fe3a55e38928b48bcb" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/rotary/__init__.py", + "blob_id": "eba8039e210c8b710c5c663ef4e7930757f271be" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/rotary/_ops.py", + "blob_id": "3296d23431d1ec084e8644ff5d3d203a74d82ea1" + }, + { + "filename": "build/torch26-cxx11-cu118-x86_64-linux/rotary/_rotary_alv7mzltcxxpq.abi3.so", + "blob_id": "2f8b3b93bb7c8fae22c8e08c67771683f549f170" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/rotary/__init__.py", + "blob_id": "eba8039e210c8b710c5c663ef4e7930757f271be" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/rotary/_ops.py", + "blob_id": "0bae33b64c71d6a6ad748be66a410a662bb5b28a" + }, + { + "filename": "build/torch26-cxx11-cu124-x86_64-linux/rotary/_rotary_c4eyapeep6gty.abi3.so", + "blob_id": "3a15076dbd1f9a05f1089cc2cb13c03e5838f2b9" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/rotary/__init__.py", + "blob_id": "eba8039e210c8b710c5c663ef4e7930757f271be" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/rotary/_ops.py", + "blob_id": "5c5d7e4497962e0a3e9531ec6a5fdb18e995e0f8" + }, + { + "filename": "build/torch26-cxx11-cu126-x86_64-linux/rotary/_rotary_lodp6xeztste6.abi3.so", + "blob_id": "efc4a4a0001fba7b0743d1d4a9774d1fc9089ee5" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/rotary/__init__.py", + "blob_id": "eba8039e210c8b710c5c663ef4e7930757f271be" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/rotary/_ops.py", + "blob_id": "3cdda1ceda90e76b30b08cae6ad718aa2c2ec3ef" + }, + { + "filename": "build/torch26-cxx98-cu118-x86_64-linux/rotary/_rotary_z27mls7mz4e7m.abi3.so", + "blob_id": "43b0f08ea035cd2fadb6e119802e7d841a523246" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/rotary/__init__.py", + "blob_id": "eba8039e210c8b710c5c663ef4e7930757f271be" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/rotary/_ops.py", + "blob_id": "914f1bb6f9499d0245c3f47345f3b95582be28b2" + }, + { + "filename": "build/torch26-cxx98-cu124-x86_64-linux/rotary/_rotary_3bktke4p3hz3a.abi3.so", + "blob_id": "29c0bba399e6f348ad8baf93daa5166a6ec6994a" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/rotary/__init__.py", + "blob_id": "eba8039e210c8b710c5c663ef4e7930757f271be" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/rotary/_ops.py", + "blob_id": "11056cd0ed09530830b614b8207cbb7fa7ef3288" + }, + { + "filename": "build/torch26-cxx98-cu126-x86_64-linux/rotary/_rotary_fvednlzeqgg5s.abi3.so", + "blob_id": "8c7c34d9e603640576ba522dcbed341c0d780a9c" + } + ] } ] diff --git a/server/pyproject.toml b/server/pyproject.toml index 37cb6b1a9..bda9df1b0 100644 --- a/server/pyproject.toml +++ b/server/pyproject.toml @@ -43,6 +43,7 @@ build-backend = "setuptools.build_meta" "kernels-community/moe" = ">=0.1.1" "kernels-community/quantization" = ">=0.0.3" "kernels-community/quantization-eetq" = ">=0.0.1" +"kernels-community/rotary" = 
">=0.0.1" [project.scripts] text-generation-server = "text_generation_server.cli:app" diff --git a/server/text_generation_server/layers/rotary.py b/server/text_generation_server/layers/rotary.py index cd22a1f14..d312a8b82 100644 --- a/server/text_generation_server/layers/rotary.py +++ b/server/text_generation_server/layers/rotary.py @@ -5,7 +5,9 @@ from torch import nn from text_generation_server.utils.import_utils import SYSTEM if SYSTEM == "cuda": - import rotary_emb + from text_generation_server.utils.kernels import load_kernel + + rotary = load_kernel(module="rotary", repo_id="kernels-community/rotary") elif SYSTEM == "rocm": import vllm._custom_ops as ops elif SYSTEM == "ipex": @@ -54,12 +56,12 @@ class PositionRotaryEmbedding(nn.Module): q1 = query[..., :rotary_dim] q2 = query[..., rotary_dim : 2 * rotary_dim] - rotary_emb.apply_rotary(q1, q2, cos, sin, q1, q2, False) + rotary.apply_rotary(q1, q2, cos, sin, q1, q2, False) k1 = key[..., :rotary_dim] k2 = key[..., rotary_dim : 2 * rotary_dim] - rotary_emb.apply_rotary(k1, k2, cos, sin, k1, k2, False) + rotary.apply_rotary(k1, k2, cos, sin, k1, k2, False) elif SYSTEM == "rocm": # NOTE: On RoCm systems, we use a ROPE implementatation adapted from VLLM which launches a single kernel for both query/key, contrary to flash-attn implementation used on NVIDIA systems. # Compiling flash-attn rotary on RoCm, it appears hipcc is unable to unroll loops, resulting in an even slower inference compared to eager: https://github.com/pytorch/pytorch/issues/113773 diff --git a/server/text_generation_server/models/custom_modeling/flash_cohere_modeling.py b/server/text_generation_server/models/custom_modeling/flash_cohere_modeling.py index ece15e942..ce68fc12c 100644 --- a/server/text_generation_server/models/custom_modeling/flash_cohere_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_cohere_modeling.py @@ -63,17 +63,19 @@ class CohereRotary(PositionRotaryEmbedding): ): # Such controlflows may add some overhead. if SYSTEM == "cuda": - import rotary_emb + from text_generation_server.utils.kernels import load_kernel + + rotary = load_kernel(module="rotary", repo_id="kernels-community/rotary") q1 = query[..., ::2] q2 = query[..., 1::2] - rotary_emb.apply_rotary(q1, q2, cos, sin, q1, q2, False) + rotary.apply_rotary(q1, q2, cos, sin, q1, q2, False) k1 = key[..., ::2] k2 = key[..., 1::2] - rotary_emb.apply_rotary(k1, k2, cos, sin, k1, k2, False) + rotary.apply_rotary(k1, k2, cos, sin, k1, k2, False) elif SYSTEM == "rocm": import vllm._custom_ops as ops diff --git a/server/text_generation_server/models/custom_modeling/flash_gptj_modeling.py b/server/text_generation_server/models/custom_modeling/flash_gptj_modeling.py index 45b90679d..79f5ccb68 100644 --- a/server/text_generation_server/models/custom_modeling/flash_gptj_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_gptj_modeling.py @@ -79,17 +79,19 @@ class GPTJRotary(PositionRotaryEmbedding): ): # Such controlflows may add some overhead. 
if SYSTEM == "cuda": - import rotary_emb + from text_generation_server.utils.kernels import load_kernel + + rotary = load_kernel(module="rotary", repo_id="kernels-community/rotary") q1 = query[..., ::2] q2 = query[..., 1::2] - rotary_emb.apply_rotary(q1, q2, cos, sin, q1, q2, False) + rotary.apply_rotary(q1, q2, cos, sin, q1, q2, False) k1 = key[..., ::2] k2 = key[..., 1::2] - rotary_emb.apply_rotary(k1, k2, cos, sin, k1, k2, False) + rotary.apply_rotary(k1, k2, cos, sin, k1, k2, False) elif SYSTEM == "rocm": import vllm._custom_ops as ops From c00add9c036491bbd64811b1c5509a823f496d7a Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Mon, 24 Feb 2025 09:10:05 +0100 Subject: [PATCH 105/183] Add Neuron backend (#3033) * feat: add neuron backend * feat(neuron): add server standalone installation * feat(neuron): add server and integration tests * fix(neuron): increase ulimit when building image The base image used to compile the rust components seems to have a low ulimit for opened files, which leads to errors during compilation. * test(neuron): merge integration tests and fixtures * test: add --neuron option * review: do not use latest tag * review: remove ureq pinned version * review: --privileged should be the exception * feat: add neuron case to build ci * fix(neuron): export models from container in test fixtures The neuron tests require models to have been previously exported and cached on the hub. This is done automatically by the neuron.model fixture the first time the tests are ran for a specific version. This fixture used to export the models using optimum-neuron directly, but this package is not necessarily present on the system. Instead, it is now done through the neuron TGI itself, since it contains all the tools required to export the models. Note that since the CI runs docker in docker (dind) it does not seem possible to share a volume between the CI container and the container used to export the model. For that reason, a specific image with a modified entrypoint is built on-the-fly when a model export is required. * refactor: remove sagemaker entry-point The SageMaker image is built differently anyway. * fix(neuron): avoid using Levenshtein * test(neuron): use smaller llama model * feat(neuron): avoid installing CUDA in image * test(neuron): no error anymore when requesting too many tokens * ci: doing a precompilation step (with a different token). * test(neuron): avoid using image sha when exporting models We now manually evaluate the apparent hash of the neuron backend by combining the hash of the neuron backend directory and Dockerfile. This new hash is used to identify exported neuron models instead of the image sha. This has two benefits: - it changes less frequently (only hwen the neuron backend changes), which means less neuron models being pushed to the hub, - it can be evaluated locally, meaning that running the tests once locally will export the models before the CI uses them. 
* test(neuron): added a small script to prune test models --------- Co-authored-by: drbh Co-authored-by: Nicolas Patry --- .github/workflows/build.yaml | 76 ++- .github/workflows/ci_build.yaml | 2 +- Dockerfile.neuron | 169 +++++ backends/neuron/Cargo.toml | 47 ++ backends/neuron/Makefile | 34 + backends/neuron/README.md | 25 + backends/neuron/server/.gitignore | 1 + backends/neuron/server/Makefile | 74 ++ backends/neuron/server/build-requirements.txt | 3 + backends/neuron/server/pyproject.toml | 26 + .../server/text_generation_server/cli.py | 111 +++ .../text_generation_server/generator.py | 636 ++++++++++++++++++ .../text_generation_server/interceptor.py | 27 + .../server/text_generation_server/model.py | 118 ++++ .../server/text_generation_server/server.py | 89 +++ backends/neuron/tests/conftest.py | 1 + backends/neuron/tests/fixtures/model.py | 129 ++++ backends/neuron/tests/prune_test_models.py | 25 + backends/neuron/tests/pytest.ini | 2 + backends/neuron/tests/requirements.txt | 19 + backends/neuron/tests/server/helpers.py | 149 ++++ .../tests/server/test_continuous_batching.py | 74 ++ backends/neuron/tests/server/test_decode.py | 55 ++ .../tests/server/test_generator_slot.py | 61 ++ backends/neuron/tests/server/test_info.py | 10 + backends/neuron/tests/server/test_prefill.py | 89 +++ backends/neuron/tgi-entrypoint.sh | 16 + backends/neuron/tgi_env.py | 226 +++++++ docs/source/_toctree.yml | 2 + docs/source/backends/neuron.md | 179 +++++ integration-tests/conftest.py | 30 +- integration-tests/fixtures/neuron/model.py | 223 ++++++ integration-tests/fixtures/neuron/service.py | 252 +++++++ .../neuron/integration/test_generate.py | 93 +++ .../neuron/integration/test_implicit_env.py | 65 ++ 35 files changed, 3114 insertions(+), 24 deletions(-) create mode 100644 Dockerfile.neuron create mode 100644 backends/neuron/Cargo.toml create mode 100644 backends/neuron/Makefile create mode 100644 backends/neuron/README.md create mode 100644 backends/neuron/server/.gitignore create mode 100644 backends/neuron/server/Makefile create mode 100644 backends/neuron/server/build-requirements.txt create mode 100644 backends/neuron/server/pyproject.toml create mode 100644 backends/neuron/server/text_generation_server/cli.py create mode 100644 backends/neuron/server/text_generation_server/generator.py create mode 100644 backends/neuron/server/text_generation_server/interceptor.py create mode 100644 backends/neuron/server/text_generation_server/model.py create mode 100644 backends/neuron/server/text_generation_server/server.py create mode 100644 backends/neuron/tests/conftest.py create mode 100644 backends/neuron/tests/fixtures/model.py create mode 100644 backends/neuron/tests/prune_test_models.py create mode 100644 backends/neuron/tests/pytest.ini create mode 100644 backends/neuron/tests/requirements.txt create mode 100644 backends/neuron/tests/server/helpers.py create mode 100644 backends/neuron/tests/server/test_continuous_batching.py create mode 100644 backends/neuron/tests/server/test_decode.py create mode 100644 backends/neuron/tests/server/test_generator_slot.py create mode 100644 backends/neuron/tests/server/test_info.py create mode 100644 backends/neuron/tests/server/test_prefill.py create mode 100755 backends/neuron/tgi-entrypoint.sh create mode 100755 backends/neuron/tgi_env.py create mode 100644 docs/source/backends/neuron.md create mode 100644 integration-tests/fixtures/neuron/model.py create mode 100644 integration-tests/fixtures/neuron/service.py create mode 100644 
integration-tests/neuron/integration/test_generate.py create mode 100644 integration-tests/neuron/integration/test_implicit_env.py diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index d9f586678..824a5a284 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -25,7 +25,7 @@ jobs: docker_volume: ${{ steps.final.outputs.docker_volume }} docker_devices: ${{ steps.final.outputs.docker_devices }} runs_on: ${{ steps.final.outputs.runs_on }} - label: ${{ steps.final.outputs.label }} + label_extension: ${{ steps.final.outputs.label_extension }} extra_pytest: ${{ steps.final.outputs.extra_pytest }} concurrency: group: ${{ github.workflow }}-build-and-push-image-${{ inputs.hardware }}-${{ github.head_ref || github.run_id }} @@ -114,6 +114,16 @@ jobs: export extra_pytest="-k test_flash_gemma_simple" export target="" ;; + neuron) + export dockerfile="Dockerfile.neuron" + export label_extension="-neuron" + export docker_devices="/dev/neuron0" + export docker_volume="/mnt/cache" + export runs_on="aws-inf2-8xlarge" + export platform="cpu" + export extra_pytest="--neuron" + export target="" + ;; esac echo $dockerfile echo "Dockerfile=${dockerfile}" @@ -122,7 +132,7 @@ jobs: echo $runs_on echo $platform echo "DOCKERFILE=${dockerfile}" >> $GITHUB_ENV - echo "LABEL=${label_extension}" >> $GITHUB_ENV + echo "LABEL_EXTENSION=${label_extension}" >> $GITHUB_ENV echo "PLATFORM=${platform}" >> $GITHUB_ENV echo "DOCKER_VOLUME=${docker_volume}" >> $GITHUB_ENV echo "DOCKER_DEVICES=${docker_devices}" >> $GITHUB_ENV @@ -172,7 +182,7 @@ jobs: images: | docker.io/huggingface/text-generation-inference-ci tags: | - type=raw,value=sha-${{ env.GITHUB_SHA_SHORT }}${{ env.LABEL }} + type=raw,value=sha-${{ env.GITHUB_SHA_SHORT }}${{ env.LABEL_EXTENSION }} # If main, release or tag - name: Extract metadata (tags, labels) for Docker if: ${{ github.event_name != 'pull_request' }} @@ -186,10 +196,10 @@ jobs: ghcr.io/huggingface/text-generation-inference db4c2190dd824d1f950f5d1555fbadf0.azurecr.io/text-generation-inference tags: | - type=semver,pattern={{version}}${{ env.LABEL }} - type=semver,pattern={{major}}.{{minor}}${{ env.LABEL }} - type=raw,value=latest${{ env.LABEL }},enable=${{ github.ref == format('refs/heads/{0}', github.event.repository.default_branch) }} - type=raw,value=sha-${{ env.GITHUB_SHA_SHORT }}${{ env.LABEL }} + type=semver,pattern={{version}}${{ env.LABEL_EXTENSION }} + type=semver,pattern={{major}}.{{minor}}${{ env.LABEL_EXTENSION }} + type=raw,value=latest${{ env.LABEL_EXTENSION }},enable=${{ github.ref == format('refs/heads/{0}', github.event.repository.default_branch) }} + type=raw,value=sha-${{ env.GITHUB_SHA_SHORT }}${{ env.LABEL_EXTENSION }} - name: Build and push Docker image id: build-and-push uses: docker/build-push-action@v4 @@ -200,7 +210,7 @@ jobs: platforms: 'linux/amd64' build-args: | GIT_SHA=${{ env.GITHUB_SHA }} - DOCKER_LABEL=sha-${{ env.GITHUB_SHA_SHORT }}${{ env.LABEL }} + DOCKER_LABEL=sha-${{ env.GITHUB_SHA_SHORT }}${{ env.LABEL_EXTENSION }} PLATFORM=${{ env.PLATFORM }} build_type=${{ env.BUILD_TYPE }} sccache_gha_enabled=on @@ -209,23 +219,55 @@ jobs: target: ${{ env.TARGET }} tags: ${{ steps.meta.outputs.tags || steps.meta-pr.outputs.tags }} labels: ${{ steps.meta.outputs.labels || steps.meta-pr.outputs.labels }} - cache-from: type=s3,region=us-east-1,bucket=ci-docker-buildx-cache,name=text-generation-inference-cache${{ env.LABEL }},mode=max,access_key_id=${{ secrets.S3_CI_DOCKER_BUILDX_CACHE_ACCESS_KEY_ID }},secret_access_key=${{ 
secrets.S3_CI_DOCKER_BUILDX_CACHE_SECRET_ACCESS_KEY }},mode=min - cache-to: type=s3,region=us-east-1,bucket=ci-docker-buildx-cache,name=text-generation-inference-cache${{ env.LABEL }},mode=min,access_key_id=${{ secrets.S3_CI_DOCKER_BUILDX_CACHE_ACCESS_KEY_ID }},secret_access_key=${{ secrets.S3_CI_DOCKER_BUILDX_CACHE_SECRET_ACCESS_KEY }},mode=min + cache-from: type=s3,region=us-east-1,bucket=ci-docker-buildx-cache,name=text-generation-inference-cache${{ env.LABEL_EXTENSION }},mode=max,access_key_id=${{ secrets.S3_CI_DOCKER_BUILDX_CACHE_ACCESS_KEY_ID }},secret_access_key=${{ secrets.S3_CI_DOCKER_BUILDX_CACHE_SECRET_ACCESS_KEY }},mode=min + cache-to: type=s3,region=us-east-1,bucket=ci-docker-buildx-cache,name=text-generation-inference-cache${{ env.LABEL_EXTENSION }},mode=min,access_key_id=${{ secrets.S3_CI_DOCKER_BUILDX_CACHE_ACCESS_KEY_ID }},secret_access_key=${{ secrets.S3_CI_DOCKER_BUILDX_CACHE_SECRET_ACCESS_KEY }},mode=min - name: Final id: final run: | - echo "docker_image=docker.io/huggingface/text-generation-inference-ci:sha-${{ env.GITHUB_SHA_SHORT}}${{ env.LABEL }}" >> "$GITHUB_OUTPUT" + echo "docker_image=docker.io/huggingface/text-generation-inference-ci:sha-${{ env.GITHUB_SHA_SHORT}}${{ env.LABEL_EXTENSION }}" >> "$GITHUB_OUTPUT" echo "docker_devices=${{ env.DOCKER_DEVICES }}" >> "$GITHUB_OUTPUT" echo "docker_volume=${{ env.DOCKER_VOLUME }}" >> "$GITHUB_OUTPUT" echo "runs_on=${{ env.RUNS_ON }}" >> "$GITHUB_OUTPUT" - echo "label=${{ env.LABEL }}" >> "$GITHUB_OUTPUT" + echo "label_extension=${{ env.LABEL_EXTENSION }}" >> "$GITHUB_OUTPUT" echo "extra_pytest=${{ env.EXTRA_PYTEST }}" >> "$GITHUB_OUTPUT" - integration_tests: + precompile_static_models: concurrency: - group: ${{ github.workflow }}-${{ github.job }}-${{ needs.build-and-push.outputs.label }}-${{ github.head_ref || github.run_id }} + group: ${{ github.workflow }}-${{ github.job }}-${{ needs.build-and-push.outputs.label_extension }}-${{ github.head_ref || github.run_id }} cancel-in-progress: true needs: build-and-push - if: needs.build-and-push.outputs.runs_on != 'ubuntu-latest' + if: needs.build-and-push.outputs.label_extension == '-neuron' + runs-on: + group: ${{ needs.build-and-push.outputs.runs_on }} + env: + PYTEST_FLAGS: ${{ (startsWith(github.ref, 'refs/tags/') || github.ref == 'refs/heads/main' || inputs.release-tests == true) && '--release' || '--release' }} + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - name: Inject slug/short variables + uses: rlespinasse/github-slug-action@v4.4.1 + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: "3.11" + - name: Install + run: | + make install-integration-tests + - name: Run tests + run: | + export DOCKER_VOLUME=${{ needs.build-and-push.outputs.docker_volume }} + export DOCKER_IMAGE=${{ needs.build-and-push.outputs.docker_image }} + export DOCKER_DEVICES=${{ needs.build-and-push.outputs.docker_devices }} + export EXTRA_PYTEST="${{ needs.build-and-push.outputs.extra_pytest }}" + export HF_TOKEN=${{ secrets.HF_TOKEN_NEURON }} + echo $DOCKER_IMAGE + docker pull $DOCKER_IMAGE + pytest -s -vv integration-tests ${PYTEST_FLAGS} ${EXTRA_PYTEST} + integration_tests: + concurrency: + group: ${{ github.workflow }}-${{ github.job }}-${{ needs.build-and-push.outputs.label_extension }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + needs: [precompile_static_models, build-and-push] + if: ${{ always() && !contains(needs.*.result, 'failure') && !contains(needs.*.result, 'cancelled') && 
needs.build-and-push.outputs.runs_on != 'ubuntu-latest' }} runs-on: group: ${{ needs.build-and-push.outputs.runs_on }} env: @@ -255,7 +297,7 @@ jobs: backend_trtllm_cxx_tests: needs: build-and-push - if: needs.build-and-push.outputs.label == '-trtllm' + if: needs.build-and-push.outputs.label_extension == '-trtllm' concurrency: group: ${{ github.workflow }}-${{ github.job }}-trtllm-${{ github.head_ref || github.run_id }} cancel-in-progress: true @@ -270,5 +312,5 @@ jobs: steps: - name: Run C++/CUDA tests - if: ${{ env.LABEL == 'ci-runtime' }} + if: ${{ env.LABEL_EXTENSION == 'ci-runtime' }} run: /usr/local/tgi/bin/tgi_trtllm_backend_tests diff --git a/.github/workflows/ci_build.yaml b/.github/workflows/ci_build.yaml index d8746b65a..b5487191a 100644 --- a/.github/workflows/ci_build.yaml +++ b/.github/workflows/ci_build.yaml @@ -37,7 +37,7 @@ jobs: # fail-fast is true by default fail-fast: false matrix: - hardware: ["cuda", "cuda-trtllm", "rocm", "intel-xpu", "intel-cpu"] + hardware: ["cuda", "cuda-trtllm", "rocm", "intel-xpu", "intel-cpu", "neuron"] uses: ./.github/workflows/build.yaml # calls the one above ^ permissions: contents: write diff --git a/Dockerfile.neuron b/Dockerfile.neuron new file mode 100644 index 000000000..17d256915 --- /dev/null +++ b/Dockerfile.neuron @@ -0,0 +1,169 @@ +# Fetch and extract the TGI sources +FROM alpine AS tgi +RUN mkdir -p /tgi + +# Fetch the optimum-neuron sources directly to avoid relying on pypi deployments +FROM alpine AS optimum-neuron +RUN mkdir -p /optimum-neuron +ADD https://github.com/huggingface/optimum-neuron/archive/refs/tags/v0.0.28.tar.gz /optimum-neuron/sources.tar.gz +RUN tar -C /optimum-neuron -xf /optimum-neuron/sources.tar.gz --strip-components=1 + +# Build cargo components (adapted from TGI original Dockerfile) +# Note: we cannot use the cargo-chef base image as it uses python 3.11 +FROM ubuntu:22.04 AS chef + +RUN apt-get update -y \ + && apt-get install -y --no-install-recommends \ + curl ca-certificates build-essential \ + && rm -rf /var/lib/apt/lists/* \ + && apt-get clean + +RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- --default-toolchain 1.80.1 --profile minimal -y +ENV PATH="/root/.cargo/bin:${PATH}" +RUN cargo install cargo-chef --locked + +WORKDIR /usr/src + +ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse + +FROM chef AS planner +COPY backends/neuron/Cargo.toml Cargo.toml +COPY Cargo.lock Cargo.lock +COPY rust-toolchain.toml rust-toolchain.toml +COPY proto proto +COPY router router +COPY backends backends +COPY launcher launcher +RUN cargo chef prepare --recipe-path recipe.json + +FROM chef AS builder + +RUN apt-get update -y \ + && apt-get install -y --no-install-recommends \ + unzip python3-dev libssl-dev pkg-config \ + && rm -rf /var/lib/apt/lists/* \ + && apt-get clean + +RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \ + curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \ + unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \ + unzip -o $PROTOC_ZIP -d /usr/local 'include/*' && \ + rm -f $PROTOC_ZIP + +COPY backends/neuron/Cargo.toml Cargo.toml +COPY --from=planner /usr/src/recipe.json recipe.json +RUN cargo chef cook --release --recipe-path recipe.json + +COPY Cargo.lock Cargo.lock +COPY rust-toolchain.toml rust-toolchain.toml +COPY proto proto +COPY router router +COPY backends backends +COPY launcher launcher +RUN cargo build --release + +# Python base image +FROM ubuntu:22.04 AS base + +RUN apt-get update -y \ + && apt-get install -y 
--no-install-recommends \ + python3-pip \ + python3-setuptools \ + python-is-python3 \ + && rm -rf /var/lib/apt/lists/* \ + && apt-get clean +RUN pip3 --no-cache-dir install --upgrade pip + +# Python server build image +FROM base AS pyserver + +RUN apt-get update -y \ + && apt-get install -y --no-install-recommends \ + make \ + python3-venv \ + && rm -rf /var/lib/apt/lists/* \ + && apt-get clean + +RUN install -d /pyserver +WORKDIR /pyserver +COPY backends/neuron/server server +COPY proto proto +RUN pip3 install -r server/build-requirements.txt +RUN VERBOSE=1 BUILDDIR=/pyserver/build PROTODIR=/pyserver/proto make -C server package + +# Neuron base image (used for deployment) +FROM base AS neuron + +# Install system prerequisites +RUN apt-get update -y \ + && apt-get install -y --no-install-recommends \ + gnupg2 \ + wget \ + python3-dev \ + libexpat1 \ + && rm -rf /var/lib/apt/lists/* \ + && apt-get clean + +RUN echo "deb https://apt.repos.neuron.amazonaws.com jammy main" > /etc/apt/sources.list.d/neuron.list +RUN wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | apt-key add - + +# Install neuronx packages +RUN apt-get update -y \ + && apt-get install -y --no-install-recommends \ + aws-neuronx-dkms=2.18.20.0 \ + aws-neuronx-collectives=2.22.33.0-d2128d1aa \ + aws-neuronx-runtime-lib=2.22.19.0-5856c0b42 \ + aws-neuronx-tools=2.19.0.0 \ + libxml2 \ + && rm -rf /var/lib/apt/lists/* \ + && apt-get clean + +ENV PATH="/opt/bin/:/opt/aws/neuron/bin:${PATH}" + +# Install manually torch CPU version to avoid pulling CUDA +RUN pip3 install \ + torch==2.1.2 \ + torchvision==0.16.2 \ + --index-url https://download.pytorch.org/whl/cpu + +RUN pip3 install \ + neuronx-cc==2.15.143.0 \ + torch-neuronx==2.1.2.2.3.2 \ + transformers-neuronx==0.12.313 \ + neuronx-distributed==0.9.0 \ + libneuronxla==2.0.5347.0 \ + --extra-index-url=https://pip.repos.neuron.amazonaws.com + +# Install HuggingFace packages +RUN pip3 install \ + hf_transfer huggingface_hub + +# Install optimum-neuron +COPY --from=optimum-neuron /optimum-neuron optimum-neuron +RUN pip3 install ./optimum-neuron + +# TGI base env +ENV HUGGINGFACE_HUB_CACHE=/tmp \ + HF_HUB_ENABLE_HF_TRANSFER=1 \ + PORT=80 + +# Disable color logs as they are not supported by CloudWatch +ENV LOGURU_COLORIZE=NO +ENV LOG_COLORIZE=0 + +# Install router +COPY --from=builder /usr/src/target/release/text-generation-router-v2 /usr/local/bin/text-generation-router +# Install launcher +COPY --from=builder /usr/src/target/release/text-generation-launcher /usr/local/bin/text-generation-launcher +# Install python server +COPY --from=pyserver /pyserver/build/dist dist +RUN pip install dist/text_generation_server*.tar.gz + +# Final image +FROM neuron + +COPY backends/neuron/tgi_env.py /tgi_env.py +COPY backends/neuron/tgi-entrypoint.sh /tgi-entrypoint.sh +RUN chmod +x /tgi-entrypoint.sh + +ENTRYPOINT ["/tgi-entrypoint.sh"] diff --git a/backends/neuron/Cargo.toml b/backends/neuron/Cargo.toml new file mode 100644 index 000000000..3b237eda6 --- /dev/null +++ b/backends/neuron/Cargo.toml @@ -0,0 +1,47 @@ +[workspace] +members = [ + "backends/v2", + "backends/grpc-metadata", + "launcher", + "router" +] +default-members = [ + "backends/v2", + "backends/grpc-metadata", + "launcher", + "router" +] +resolver = "2" + +[workspace.package] +version = "3.0.0" +edition = "2021" +authors = ["Olivier Dehaene"] +homepage = "https://github.com/huggingface/text-generation-inference" + +[workspace.dependencies] +base64 = "0.22.0" +tokenizers = { version = "0.20.0", 
features = ["http"] } +hf-hub = { version = "0.3.1", features = ["tokio"] } +metrics = { version = "0.23.0" } +metrics-exporter-prometheus = { version = "0.15.1", features = [] } +minijinja = { version = "2.2.0", features = ["json"] } +minijinja-contrib = { version = "2.0.2", features = ["pycompat"] } +pyo3 = { version = "0.22.2", features = ["auto-initialize"] } + +[profile.release] +incremental = true + +[profile.release-binary] +inherits = "release" +debug = 1 +incremental = true +panic = "abort" + +[profile.release-opt] +inherits = "release" +debug = 0 +incremental = false +lto = "fat" +opt-level = 3 +codegen-units = 1 diff --git a/backends/neuron/Makefile b/backends/neuron/Makefile new file mode 100644 index 000000000..6c5002ce8 --- /dev/null +++ b/backends/neuron/Makefile @@ -0,0 +1,34 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +mkfile_path := $(abspath $(lastword $(MAKEFILE_LIST))) +mkfile_dir := $(dir $(mkfile_path)) +root_dir := "${mkfile_dir}/../.." + +.PHONY: image install_server test_server test_integration + +VERSION := $(shell gawk 'match($$0, /^version = "(.*)"/, a) {print a[1]}' ${root_dir}/Cargo.toml) + +image: + docker build --rm -f ${root_dir}/Dockerfile.neuron \ + --ulimit nofile=100000:100000 \ + --build-arg VERSION=$(VERSION) \ + -t text-generation-inference:$(VERSION)-neuron ${root_dir} + +install_server: + make -C ${mkfile_dir}/server install VERSION:=${VERSION} + +test_server: install_server + python -m pip install -r ${mkfile_dir}/tests/requirements.txt + python -m pytest -sv ${mkfile_dir}/tests/server diff --git a/backends/neuron/README.md b/backends/neuron/README.md new file mode 100644 index 000000000..55722c3bd --- /dev/null +++ b/backends/neuron/README.md @@ -0,0 +1,25 @@ +# Text-generation-inference - Neuron backend for AWS Trainium and inferentia2 + +## Description + +This is the TGI backend for AWS Neuron Trainium and Inferentia family of chips. + +This backend is composed of: +- the AWS Neuron SDK, +- the legacy v2 TGI launcher and router, +- a neuron specific inference server for text-generation. + +## Usage + +Please refer to the official [documentation](https://huggingface.co/docs/text-generation-inference/backends/neuron). + +## Build your own image + +The simplest way to build TGI with the neuron backend is to use the provided `Makefile`: + +```shell +$ make -C backends/neuron image +``` + +Alternatively, you can build the image directly from the top directory using a command similar to the one defined +in the `Makefile` under the `image` target. 
diff --git a/backends/neuron/server/.gitignore b/backends/neuron/server/.gitignore new file mode 100644 index 000000000..378eac25d --- /dev/null +++ b/backends/neuron/server/.gitignore @@ -0,0 +1 @@ +build diff --git a/backends/neuron/server/Makefile b/backends/neuron/server/Makefile new file mode 100644 index 000000000..efe34bd0d --- /dev/null +++ b/backends/neuron/server/Makefile @@ -0,0 +1,74 @@ +# Initialize base variables +SHELL := /bin/bash +pkg_name := text_generation_server +BUILDDIR ?= $(CURDIR)/build +VERSION ?= 0.0.1 +mkfile_path := $(abspath $(lastword $(MAKEFILE_LIST))) +mkfile_dir := $(dir $(mkfile_path)) +pkg_dir := $(BUILDDIR)/$(pkg_name) +py_version := $(subst -,.,${VERSION}) +pkg_dist := ${BUILDDIR}/dist/${pkg_name}-$(py_version).tar.gz + +clean: + rm -rf $(BUILDDIR)/* + +${BUILDDIR}: + install -d $@ + +# List static sources to be deployed in the package +src_dir := $(mkfile_dir)/$(pkg_name) +sources := $(wildcard $(src_dir)/*.py) +deployed_sources := $(subst $(src_dir), $(pkg_dir), $(sources)) + +# Static files are just copied + +define COPY + cp -f $< $@ +endef + +# We use a PHONY target to represent the VERSION +.PHONY: VERSION + +VERSION: ${BUILDDIR} + # The trick is to compare the value of the variable with the content of a file in the build directory + @if [[ `cat ${BUILDDIR}/VERSION 2>&1` != '$(VERSION)' ]]; then echo -n $(VERSION) >${BUILDDIR}/VERSION; fi + +# Depending on the PHONY VERSION target makes sure the pyproject.toml is regenerated if the version changes +$(BUILDDIR)/pyproject.toml: $(mkfile_dir)/pyproject.toml VERSION + mkdir -p $(BUILDDIR) + $(COPY) + sed -i -e 's/version = "VERSION"/version = \"${VERSION}\"/' $@ + +$(pkg_dir)/%.py: $(src_dir)/%.py + mkdir -p $(pkg_dir) + $(COPY) + +# Generated files are produced by grpcio tools + +# If not provided, get local proto files +ifndef PROTODIR +PROTODIR := $(mkfile_dir)/../../../proto +endif + +# Three python files are generated for each protobuf +protobufs := $(PROTODIR)/generate.proto +pkg_pb_dir := $(pkg_dir)/pb +generated_sources_base := $(foreach proto, $(protobufs), $(proto:.proto=_pb2.py)) +generated_sources := $(subst $(PROTODIR), $(pkg_pb_dir), $(generated_sources_base)) +generated_sources += $(subst $(PROTODIR), $(pkg_pb_dir), $(generated_sources_base:.py=.pyi)) +generated_sources += $(subst $(PROTODIR), $(pkg_pb_dir), $(generated_sources_base:.py=_grpc.py)) + +$(pkg_pb_dir)/%_pb2.py $(pkg_pb_dir)/%_pb2.pyi $(pkg_pb_dir)/%_pb2_grpc.py: $(PROTODIR)/%.proto + mkdir -p $(pkg_pb_dir) + python -m grpc_tools.protoc -I$(PROTODIR) --python_out=$(pkg_pb_dir) \ + --grpc_python_out=$(pkg_pb_dir) --mypy_out=$(pkg_pb_dir) $^ + sed -i -e 's/^\(import.*pb2\)/from . 
\1/g' $(pkg_pb_dir)/$*_pb2_grpc.py + +${pkg_dist}: $(BUILDDIR)/pyproject.toml $(deployed_sources) $(generated_sources) + python -m build $(BUILDDIR) + +package: ${pkg_dist} + +install: ${pkg_dist} + python3 -m pip uninstall -y ${pkg_name} + python3 -m pip install ${pkg_dist} diff --git a/backends/neuron/server/build-requirements.txt b/backends/neuron/server/build-requirements.txt new file mode 100644 index 000000000..2083bd73f --- /dev/null +++ b/backends/neuron/server/build-requirements.txt @@ -0,0 +1,3 @@ +build +grpcio-tools==1.53.0 +mypy-protobuf diff --git a/backends/neuron/server/pyproject.toml b/backends/neuron/server/pyproject.toml new file mode 100644 index 000000000..ed790fdb8 --- /dev/null +++ b/backends/neuron/server/pyproject.toml @@ -0,0 +1,26 @@ +[build-system] +requires = ["setuptools>=61.0"] +build-backend = "setuptools.build_meta" + +[project] +name = "text-generation-server" +version = "VERSION" +authors = [{name="David Corvoysier", email="david@huggingface.co" }] +description = "TGI compatible inference server for AWS Neuronx platforms" +dependencies = [ + 'protobuf > 3.20.1, < 4', + 'grpcio == 1.57.0', + 'grpcio-status == 1.48.2', + 'grpcio-reflection == 1.48.2', + 'grpc-interceptor == 0.15.2', + 'typer == 0.6.1', + 'safetensors', + 'loguru == 0.6.0', + 'optimum-neuron[neuronx] >= 0.0.28', +] + +[tool.setuptools] +packages = ["text_generation_server", "text_generation_server.pb"] + +[project.scripts] +text-generation-server = 'text_generation_server.cli:app' diff --git a/backends/neuron/server/text_generation_server/cli.py b/backends/neuron/server/text_generation_server/cli.py new file mode 100644 index 000000000..409143a91 --- /dev/null +++ b/backends/neuron/server/text_generation_server/cli.py @@ -0,0 +1,111 @@ +import sys +from typing import Optional + +import typer +from loguru import logger + + +app = typer.Typer() + + +@app.command() +def serve( + model_id: str, + revision: Optional[str] = None, + sharded: bool = False, + trust_remote_code: bool = None, + uds_path: str = "/tmp/text-generation-server", + logger_level: str = "INFO", + json_output: bool = False, + otlp_endpoint: Optional[str] = None, + otlp_service_name: str = "text-generation-inference.server", + max_input_tokens: Optional[int] = None, +): + """This is the main entry-point for the server CLI. + + Args: + model_id (`str`): + The *model_id* of a model on the HuggingFace hub or the path to a local model. + revision (`Optional[str]`, defaults to `None`): + The revision of the model on the HuggingFace hub. + sharded (`bool`): + Whether the model must be sharded or not. Kept for compatibility with the + text-generation-launcher, but must be set to False. + trust-remote-code (`bool`): + Kept for compatibility with text-generation-launcher. Ignored. + uds_path (`Union[Path, str]`): + The local path on which the server will expose its google RPC services. + logger_level (`str`): + The server logger level. Defaults to *INFO*. + json_output (`bool`): + Use JSON format for log serialization. + otlp_endpoint (`Optional[str]`, defaults to `None`): + The Open Telemetry endpoint to use. + otlp_service_name (`Optional[str]`, defaults to `None`): + The name to use when pushing data to the Open Telemetry endpoint. + max_input_tokens (`Optional[int]`, defaults to `None`): + The maximum number of input tokens each request should contain. 
+ """ + if sharded: + raise ValueError("Sharding is not supported.") + # Remove default handler + logger.remove() + logger.add( + sys.stdout, + format="{message}", + filter="text_generation_server", + level=logger_level, + serialize=json_output, + backtrace=True, + diagnose=False, + ) + + if trust_remote_code is not None: + logger.warning("'trust_remote_code' argument is not supported and will be ignored.") + + # Import here after the logger is added to log potential import exceptions + from .server import serve + + serve(model_id, revision, uds_path) + + +@app.command() +def download_weights( + model_id: str, + revision: Optional[str] = None, + logger_level: str = "INFO", + json_output: bool = False, + auto_convert: Optional[bool] = None, + extension: Optional[str] = None, + trust_remote_code: Optional[bool] = None, + merge_lora: Optional[bool] = None, +): + """Download the model weights. + + This command will be called by text-generation-launcher before serving the model. + """ + # Remove default handler + logger.remove() + logger.add( + sys.stdout, + format="{message}", + filter="text_generation_server", + level=logger_level, + serialize=json_output, + backtrace=True, + diagnose=False, + ) + + if extension is not None: + logger.warning("'extension' argument is not supported and will be ignored.") + if trust_remote_code is not None: + logger.warning("'trust_remote_code' argument is not supported and will be ignored.") + if auto_convert is not None: + logger.warning("'auto_convert' argument is not supported and will be ignored.") + if merge_lora is not None: + logger.warning("'merge_lora' argument is not supported and will be ignored.") + + # Import here after the logger is added to log potential import exceptions + from .model import fetch_model + + fetch_model(model_id, revision) diff --git a/backends/neuron/server/text_generation_server/generator.py b/backends/neuron/server/text_generation_server/generator.py new file mode 100644 index 000000000..3ddee690c --- /dev/null +++ b/backends/neuron/server/text_generation_server/generator.py @@ -0,0 +1,636 @@ +import copy +import logging +import time +from abc import ABC +from enum import Enum +from typing import List, Optional, Tuple + +import torch +from loguru import logger +from transformers import AutoConfig, AutoTokenizer, PreTrainedTokenizerBase +from transformers.generation import GenerationConfig + +from optimum.neuron import NeuronModelForCausalLM +from optimum.neuron.generation import TokenSelector + +from .model import get_export_kwargs_from_env +from .pb.generate_pb2 import ( + Batch, + CachedBatch, + FinishReason, + GeneratedText, + Generation, + InfoResponse, + Request, + Tokens, +) + + +# Disable optimum-neuron warnings as it seems to block the server after a while +optimum_logger = logging.getLogger("optimum.neuron") +optimum_logger.setLevel("CRITICAL") + + +class Generator(ABC): + """An abstract class to represent the workhorse behind TextGenerationService. + + Ideally, it should not rely on protobuf constructs, but in a first step it does. + Implementations would typically need a model and a tokenizer to implement the Generator methods. + """ + + @property + def info(self) -> InfoResponse: + """This should simply return the expected InfoResponse""" + raise NotImplementedError + + def warmup(self, batch: Batch) -> int: + """Verify if the hardware can support the target load. + + Args: + batch (`Batch`): + A batch corresponding to the maximum number of concurrent requests. 
+ + Return: + The maximum number of tokens the model supports. + """ + raise NotImplementedError + + def prefill(self, batch: Batch) -> Tuple[List[Generation], CachedBatch]: + """Prefill is called whenever new requests need to be added. + + When this method returns successfully, a decode method will follow + with both the current and newly prefilled batch(es). + + Args: + batch (`Batch`): + A batch containing the new requests. + + Return: + A list of `Generation` for each request and a `CachedBatch` containing all pending requests. + """ + raise NotImplementedError + + def decode(self, batches: List[Batch]) -> Tuple[List[Generation], CachedBatch]: + """Decode after a prefill or another decode.""" + raise NotImplementedError + + def filter(self, batch_id: int, request_ids: List[int]) -> CachedBatch: + """Remove requests that are not listed from the specified batch""" + raise NotImplementedError + + def clear(self): + """Remove all requests from the generator""" + raise NotImplementedError + + @classmethod + def from_pretrained(cls, model_id: str, revision: Optional[str]): + """Factory method "a la transformers" """ + raise NotImplementedError + + +class Slot: + """Represents a slot in a static batch""" + + class State(Enum): + EMPTY = 0 + PAUSE = 1 + READY = 2 + + def __init__(self, id: int, tokenizer: PreTrainedTokenizerBase): + self._id = id + self._tokenizer = tokenizer + self.clear() + + def clear(self): + """Clear the slot and mark it as available.""" + self._state = Slot.State.EMPTY + self._batch_id = None + self._request_id = None + self._inputs = "" + self._truncate = 0 + self._generation_config = None + self._tokens = [] + self._mask = torch.tensor([]) + self._selector = None + self._generated_tokens = 0 + self._next_text_token_start = 0 + self._next_text_token_end = 0 + self._generated_text = "" + self._next_text = "" + + @property + def id(self) -> int: + return self._id + + @property + def state(self) -> "Slot.State": + return self._state + + @property + def batch_id(self) -> int: + return self._batch_id + + @property + def request_id(self) -> int: + return self._request_id + + @property + def cached_text(self) -> str: + return self._inputs + self._generated_text + + @property + def generation_config(self) -> GenerationConfig: + return self._generation_config + + @property + def generated_tokens(self) -> int: + return self._generated_tokens + + def assign(self, batch_id: int, request: Request, generation_config: GenerationConfig): + """Assign a request to a slot. + + Args: + request (`Request`): + The request to be assigned. Contains the inputs and tokens selection parameters. + generation_config (`transformers.GenerationConfig`): + The base generation config (might be modified by the request generation parameters). 
+ """ + self._state = Slot.State.READY + self._batch_id = batch_id + self._request_id = request.id + self._inputs = request.inputs + if request.truncate: + self._truncate = request.truncate + self._generation_config = copy.deepcopy(generation_config) + # Update generation config with request parameters + self._generation_config.do_sample = request.parameters.do_sample + if self._generation_config.do_sample: + if request.parameters.temperature != 0: + self._generation_config.temperature = request.parameters.temperature + if request.parameters.top_k != 0: + self._generation_config.top_k = request.parameters.top_k + if request.parameters.top_p != 0: + self._generation_config.top_p = request.parameters.top_p + if request.parameters.typical_p != 0: + self._generation_config.typical_p = request.parameters.typical_p + if request.parameters.repetition_penalty != 0: + self._generation_config.repetition_penalty = request.parameters.repetition_penalty + self.seed = request.parameters.seed + self._generation_config.max_new_tokens = request.stopping_parameters.max_new_tokens + self._max_new_tokens = self._generation_config.max_new_tokens + stop_strings = request.stopping_parameters.stop_sequences + if stop_strings: + self._generation_config.stop_strings = stop_strings + + def reset(self, input_ids: torch.LongTensor, attention_mask: torch.LongTensor, selector: TokenSelector): + """Reset the slot for the next generation. + + Args: + input_ids: (`torch.LongTensor`): + The new input_ids to use to generate the next token. + attention_mask: (`torch.LongTensor`): + The new attention_mask to use to generate the next token. + selector: (`optimum.neuron.generation.TokenSelector`): + An object implementing the updated token selection logic. + """ + self._tokens = input_ids.clone() + self._next_text_token_start = 0 + self._next_text_token_end = torch.numel(self._tokens) + self._next_text = "" + self._mask = attention_mask.clone() + self._selector = selector + + def pause(self, reset_on_pause: bool): + """Mark the current slot as paused for generation. + + Note that the KV cache for this slot will still be filled. + """ + if reset_on_pause: + # Drop the last token as it will be added back when resuming the slot + self._generated_tokens -= 1 + # Since generated tokens are now part of the prefill, we need to reevaluate + # max_new_tokens for the next generation + self._generation_config.max_new_tokens = self._max_new_tokens - self._generated_tokens + self._state = Slot.State.PAUSE + + def resume(self): + """Mark the slot as ready for generation.""" + self._state = Slot.State.READY + + def _decode_next_tokens( + self, + ) -> str: + """Hack to hopefully support generate_stream for the maximum number of tokenizers""" + # We need to include the tokens that produced the last text to defeat cleanup algorithms in the decode + # which decide to add a space or not depending on the surrounding ids. + new_text = self._tokenizer.decode(self._tokens[self._next_text_token_start :], skip_special_tokens=False) + if new_text.endswith("�"): + # utf-8 char at the end means it's a potential unfinished byte sequence + # from byte fallback tokenization. 
+ return "" + + # Compare the generated text with the one using only the tokens producing the last one + last_text = self._tokenizer.decode( + self._tokens[self._next_text_token_start : self._next_text_token_end], + skip_special_tokens=False, + ) + if len(new_text) == len(last_text): + # Nothing new was actually generated + return "" + # Return the decoded text and store its token offsets + self._next_text_token_start = self._next_text_token_end + self._next_text_token_end = torch.numel(self._tokens) + return new_text[len(last_text) :] + + def append(self, next_token: int) -> str: + """Append a new generated token to this slot + + The new token is added to the list of generated tokens, which impacts + directly the generated_text and stopped property. + + The new token is however not added immediately to the slot inputs: it will + be added later on when it has effectively been used to produce the next token. + + Args: + next_token (`int`): + The newly generated token. + + Return: + The corresponding decoded text (if any). + """ + self._tokens = torch.cat([self._tokens, torch.LongTensor([next_token])]) + self._mask = torch.cat([self._mask, torch.LongTensor([1])]) + self._generated_tokens += 1 + next_text = self._decode_next_tokens() + # Now that a new token has been generated, we can append the previous one to the generated text + self._generated_text += self._next_text + self._next_text = next_text + return next_text + + def select(self, input_ids: torch.LongTensor, logits: torch.Tensor) -> torch.LongTensor: + """Select the next token from the candidate logits. + + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + The sequence used as a prompt for the generation (not used in all generation modes). + logits (`torch.Tensor` of shape `(batch_size, sequence_length)`): + The logits corresponding to the generated tokens. + + Return: + `torch.LongTensor`: A scalar torch.LongTensor` containing the selected token. + """ + return self._selector.select(input_ids, logits)[0] + + @property + def stopped(self) -> bool: + # Transformers stopping criteria expects a batch of input ids + input_ids = torch.unsqueeze(self._tokens, dim=0) + return self._selector.stopping_criteria(input_ids, None) + + @property + def generated_text(self) -> str: + return self._generated_text + self._next_text + + @property + def next_token(self) -> int: + return None if len(self._tokens) == 0 else self._tokens[-1] + + @property + def attention_mask(self) -> torch.LongTensor: + return self._mask + + @property + def max_token(self) -> int: + return self._generation_config.max_length + + @property + def max_new_tokens(self) -> int: + # The current value of max_new_tokens: might be different of the target max_new_tokens + # if the slot has been paused and resumed. 
+ return self._generation_config.max_new_tokens + + @property + def truncate(self) -> int: + return self._truncate + + +class NeuronGenerator(Generator): + """A Generator for Neuron models.""" + + def __init__( + self, + model: NeuronModelForCausalLM, + tokenizer: PreTrainedTokenizerBase, + ): + self.model = model + self.rebuild_cache_on_prefill = not self.model.continuous_batching + # Specify padding and truncation options for decoder-only architecture + tokenizer.pad_token_id = tokenizer.eos_token_id + tokenizer.padding_side = "left" + tokenizer.truncation_side = "left" + self.tokenizer = tokenizer + self.special_tokens = self.tokenizer.all_special_ids + self.slots = [Slot(i, tokenizer) for i in range(self.model.batch_size)] + self.batch_id = 0 + + @property + def info(self) -> InfoResponse: + """Returns the expected InfoResponse.""" + dtype = getattr(self.model.config, "torch_dtype", "float32") + return InfoResponse( + requires_padding=True, + dtype=str(dtype), + device_type="xla", + ) + + def warmup(self, batch: Batch) -> int: + """Verify if the hardware can support the target load. + + Args: + batch (`Batch`): + A batch corresponding to the maximum number of concurrent requests. + + Return: + The maximum number of tokens the model supports. + """ + # Just check that the warmup request parameters match the model capacity + batch_size = self.model.batch_size + if len(batch.requests) > batch_size: + raise ValueError( + f"Inconsistent batch_size configuration: Please make sure the batch_size in the compiled model (currently {batch_size}) matches the batch_size passed to TGI. The compiled model batch_size is usually in the neuron section of the model config.json file. You may also have passed it into optimum-cli during the compilation process. The batch size for TGI is usually set in the environment as MAX_BATCH_SIZE." + ) + self.prefill(batch) + self.clear() + return self.model.batch_size * self.model.max_length + + def prefill(self, batch: Batch) -> Tuple[List[Generation], CachedBatch]: + """Prefill new requests. + + Args: + batch (`Batch`): + A batch containing the new requests. + + Return: + A list of `Generation` for each request and a `CachedBatch` containing all pending requests. + """ + slots = {state: [] for state in Slot.State} + for slot in self.slots: + slots[slot.state].append(slot) + active_slots = slots[Slot.State.READY] + empty_slots = slots[Slot.State.EMPTY] + if len(empty_slots) < len(batch.requests): + raise ValueError( + f"Cannot prefill {len(batch.requests)} new request(s) with only {len(empty_slots)} empty slots." + f" Please align max_batch_size with the static batch size: {self.model.batch_size}." + ) + # Assign each request to an empty slot + logger.debug(f"Prefilling {len(batch.requests)} new request(s) with {len(empty_slots)} empty slot(s)") + new_slots = [] + for request in batch.requests: + slot = empty_slots.pop() + slot.assign(self.batch_id, request, self.model.generation_config) + new_slots.append(slot) + logger.debug( + f"Request {slot.request_id} assigned to slot {slot.id} with and max_new_tokens {slot.max_new_tokens}" + ) + if self.rebuild_cache_on_prefill: + # We will clear pending slots and prefill all slots + prefill_slots = self.slots + seq_ids = None + else: + # We only need to pass inputs for the new requests + prefill_slots = new_slots + seq_ids = torch.tensor([slot.id for slot in prefill_slots]) + # Reconstruct the full inputs (without padding) as seen by the model. 
+ # This comprises: + # - the inputs for new requests, + # - only when rebuilding the cache, the inputs and the generated text that has already + # been cached (i.e. excluding the last generated token) for unfinished requests. + inputs = [] + max_length = 0 + for slot in prefill_slots: + inputs.append(slot.cached_text) + # Apply truncation, making sure we fit into static dimensions + if slot.truncate == 0: + max_length = self.model.max_length + elif slot.truncate > max_length and slot.truncate < self.model.max_length: + max_length = slot.truncate + # Tokenize with padding and truncation + padded_inputs = self.tokenizer( + inputs, return_tensors="pt", padding=True, truncation=True, max_length=max_length + ) + input_ids = padded_inputs.input_ids + attention_mask = padded_inputs.attention_mask + # Pause previously active slots during generation + next_tokens = [] + for slot in active_slots: + slot.pause(reset_on_pause=self.rebuild_cache_on_prefill) + if self.rebuild_cache_on_prefill: + # The slot will be reset, so we need to store its next token + next_tokens.append(slot.next_token) + # Each slot must be reset with the padded inputs and masks + for i, slot in enumerate(prefill_slots): + if slot.state != slot.state.EMPTY: + if slot.truncate > 0 and slot.truncate < input_ids.shape[-1]: + # Apply per-request truncation + input_ids[i, : -slot.truncate] = self.tokenizer.pad_token_id + attention_mask[i, : -slot.truncate] = 0 + slot_input_ids = input_ids[i : i + 1, :] + # Padded input ids are also required to set logits processors and stopping criterias + selector = TokenSelector.create( + slot_input_ids, + slot.generation_config, + self.model, + self.model.max_length, + tokenizer=self.tokenizer, + seed=slot.seed, + ) + slot_input_ids = slot_input_ids.squeeze(dim=0).type(torch.int64) + slot_attention_mask = attention_mask[i] + slot.reset(slot_input_ids, slot_attention_mask, selector) + # Note: when rebuilding cache on prefill, the new tokens on paused slots will be ignored, + # as they have already been generated and sent back in the last decode. + model_inputs = self.model.prepare_inputs_for_prefill(input_ids, attention_mask, seq_ids) + logits = self.model(**model_inputs)[0] + generation, next_batch = self._generate_token(prefill_slots, self.batch_id, logits, input_ids) + self.batch_id += 1 + # Reactivate previously active slots for the next decode + for i, slot in enumerate(active_slots): + slot.resume() + if self.rebuild_cache_on_prefill: + # Append back the next token + slot.append(next_tokens[i]) + logger.debug("Model ready for decoding") + if next_batch is not None: + logger.debug(f"Next batch is {next_batch.id} with requests: {next_batch.request_ids}") + return generation, next_batch + + def decode(self, batches: List[CachedBatch]) -> Tuple[List[Generation], CachedBatch]: + """Decode the specified prefilled requests. + + Args: + batches (`List[CachedBatch]`): + A list of previous batches containing the prefilled requests. + + Return: + A list of `Generation` for each request and a `CachedBatch` containing all pending requests. + """ + # batches contains a list composed of: + # - the batch id returned by the last decode, + # - the batch id(s) returned by the last prefill(s) + # Batches are always concatenated during prefill, so we can + # just carry on with decoding. We adopt the id of the first + # batch in the list as our next batch id. 
+ next_batch_id = batches[0].id + request_ids = [] + for batch in batches: + request_ids += batch.request_ids + cleared_request_ids = [] + for slot in self.slots: + if slot.state == slot.State.READY and slot.request_id not in request_ids: + cleared_request_ids.append(slot.request_id) + slot.clear() + if len(cleared_request_ids) > 0: + logger.info(f"Clearing slot for requests {cleared_request_ids} as they are not requested.") + active_slots = [slot for slot in self.slots if slot.state == slot.State.READY] + if len(active_slots) < len(request_ids): + raise ValueError("Unable to decode tokens for non-prefilled batches (probably due to a previous failure)") + if self.model.continuous_batching: + decode_slots = active_slots + seq_ids = torch.tensor([slot.id for slot in decode_slots]) + else: + decode_slots = self.slots + seq_ids = None + # Reconstruct input_ids and attention_mask from decode slots + n_slots = len(decode_slots) + input_ids = torch.full([n_slots, 1], fill_value=self.tokenizer.eos_token_id, dtype=torch.int64) + max_length = 0 + for slot in decode_slots: + max_length = max(max_length, slot.attention_mask.size(-1)) + attention_mask = torch.zeros([n_slots, max_length], dtype=torch.int64) + for i, slot in enumerate(decode_slots): + if slot.state != Slot.State.EMPTY: + # input_ids are simply the tokens generated by the last decode or prefill requests (other tokens are cached) + input_ids[i, 0] = slot.next_token + attention_mask[i, : slot.attention_mask.size(-1)] = slot.attention_mask + model_inputs = self.model.prepare_inputs_for_decode(input_ids, attention_mask, seq_ids) + logits = self.model(**model_inputs)[0] + return self._generate_token(decode_slots, next_batch_id, logits, input_ids) + + def _generate_token( + self, slots: List[Slot], next_batch_id: int, logits: torch.Tensor, input_ids: torch.LongTensor + ) -> Tuple[List[Generation], CachedBatch]: + generations = [] + active_slots = False + for i, slot in enumerate(slots): + if slot.state != Slot.State.READY: + continue + request_id = slot.request_id + next_token_logits = logits[i : i + 1, -1, :] + slot_input_ids = input_ids[i : i + 1, :] + next_token = slot.select(slot_input_ids, next_token_logits) + next_token_text = slot.append(next_token) + generated_text = None + finish_reason = None + if next_token == self.tokenizer.eos_token_id: + finish_reason = FinishReason.FINISH_REASON_EOS_TOKEN + elif slot.stopped: + if slot.generated_tokens == slot.max_new_tokens: + finish_reason = FinishReason.FINISH_REASON_LENGTH + else: + finish_reason = FinishReason.FINISH_REASON_STOP_SEQUENCE + if finish_reason is not None: + # We must include the generated text for each finished sequence in the response + generated_text = GeneratedText( + text=slot.generated_text, generated_tokens=slot.generated_tokens, finish_reason=finish_reason + ) + logger.debug(f"Decode complete for request {request_id} with {slot.generated_tokens} tokens") + # mark the slot as available + slot.clear() + else: + active_slots = True + generations.append( + Generation( + request_id=request_id, + prefill_tokens=None, + tokens=Tokens( + ids=[next_token], + logprobs=[0], + texts=[next_token_text], + is_special=[next_token in self.special_tokens], + ), + generated_text=generated_text, + ) + ) + batch = None + if active_slots: + # Whatever initial batch these requests came from, we always return all pending requests in a single batch + request_ids = [slot.request_id for slot in self.slots if slot.state == Slot.State.READY] + batch = self._cached_batch(next_batch_id, request_ids) + 
else: + logger.debug("No more pending requests") + return generations, batch + + def _cached_batch(self, batch_id: int, request_ids: List): + size = len(request_ids) + max_tokens = size * self.model.max_length + return CachedBatch(id=batch_id, request_ids=request_ids, size=size, max_tokens=max_tokens) + + def filter(self, batch_id: int, keep_request_ids: List[int]) -> CachedBatch: + """Remove requests that are not listed from the specified batch + + Args: + batch_id (`int`): + The id of a cached batch. + keep_ids(`List[int]`): + The list of requests that must be kept. + + Return: + A `CachedBatch` containing the pending requests. + """ + keep_slot_ids = [slot.id for slot in self.slots if slot.request_id in keep_request_ids] + self._clear(keep_slot_ids) + return self._cached_batch(batch_id, keep_request_ids) + + def clear(self, batch_id: Optional[int] = None): + """Remove a subset or all requests from the generator""" + keep_ids = [] + if batch_id is not None: + keep_ids = [slot.id for slot in self.slots if slot.batch_id != batch_id] + return self._clear(keep_ids) + + def _clear(self, keep_slot_ids: List): + for slot in self.slots: + if slot.state != Slot.State.EMPTY and slot.id not in keep_slot_ids: + logger.debug(f"Removing slot {slot.id} with request {slot.request_id}") + slot.clear() + + @classmethod + def from_pretrained(cls, model_id: str, revision: str = None): + """Instantiate a NeuronGenerator. + + Args: + model_id (`str`): + A hub model id or the path to a local model. This path must also contain a Tokenizer. + revision (`Optional[str]`, defaults to `None`): + The revision of the model on the HuggingFace hub. + + Returns: + A NeuronGenerator. + """ + config = AutoConfig.from_pretrained(model_id) + neuron_config = getattr(config, "neuron", None) + start = time.time() + if neuron_config is None: + export_kwargs = get_export_kwargs_from_env() + logger.info(f"Exporting model to neuron with config: {export_kwargs}.") + model = NeuronModelForCausalLM.from_pretrained( + model_id, revision=revision, low_cpu_mem_usage=True, export=True, **export_kwargs + ) + else: + logger.info("Loading model on neuron devices (this can take a few minutes).") + model = NeuronModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, revision=revision) + end = time.time() + logger.info(f"Model successfully loaded in {end - start:.2f} s.") + tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision) + return cls(model, tokenizer) diff --git a/backends/neuron/server/text_generation_server/interceptor.py b/backends/neuron/server/text_generation_server/interceptor.py new file mode 100644 index 000000000..ed29cdf2c --- /dev/null +++ b/backends/neuron/server/text_generation_server/interceptor.py @@ -0,0 +1,27 @@ +from typing import Any, Callable + +import grpc +from google.rpc import code_pb2, status_pb2 +from grpc_interceptor.server import AsyncServerInterceptor +from grpc_status import rpc_status +from loguru import logger + + +class ExceptionInterceptor(AsyncServerInterceptor): + async def intercept( + self, + method: Callable, + request_or_iterator: Any, + context: grpc.ServicerContext, + method_name: str, + ) -> Any: + try: + response = method(request_or_iterator, context) + return await response + except Exception as err: + method_name = method_name.split("/")[-1] + logger.exception(f"Method {method_name} encountered an error.") + + await context.abort_with_status( + rpc_status.to_status(status_pb2.Status(code=code_pb2.INTERNAL, message=str(err))) + ) diff --git 
a/backends/neuron/server/text_generation_server/model.py b/backends/neuron/server/text_generation_server/model.py new file mode 100644 index 000000000..e8cb34ee1 --- /dev/null +++ b/backends/neuron/server/text_generation_server/model.py @@ -0,0 +1,118 @@ +import os +import shutil +import time +from typing import Optional + +from huggingface_hub import snapshot_download +from huggingface_hub.constants import HF_HUB_CACHE +from loguru import logger +from transformers import AutoConfig + +from optimum.neuron import NeuronModelForCausalLM +from optimum.neuron.utils import get_hub_cached_entries + + +def get_export_kwargs_from_env(): + batch_size = os.environ.get("MAX_BATCH_SIZE", None) + if batch_size is not None: + batch_size = int(batch_size) + sequence_length = os.environ.get("MAX_TOTAL_TOKENS", None) + if sequence_length is not None: + sequence_length = int(sequence_length) + num_cores = os.environ.get("HF_NUM_CORES", None) + if num_cores is not None: + num_cores = int(num_cores) + auto_cast_type = os.environ.get("HF_AUTO_CAST_TYPE", None) + return { + "task": "text-generation", + "batch_size": batch_size, + "sequence_length": sequence_length, + "num_cores": num_cores, + "auto_cast_type": auto_cast_type, + } + + +def is_cached(model_id, neuron_config): + # Look for cached entries for the specified model + in_cache = False + entries = get_hub_cached_entries(model_id, "inference") + # Look for compatible entries + for entry in entries: + compatible = True + for key, value in neuron_config.items(): + # Only weights can be different + if key in ["checkpoint_id", "checkpoint_revision"]: + continue + if entry[key] != value: + compatible = False + if compatible: + in_cache = True + break + return in_cache + + +def log_cache_size(): + path = HF_HUB_CACHE + if os.path.exists(path): + usage = shutil.disk_usage(path) + gb = 2**30 + logger.info(f"Cache disk [{path}]: total = {usage.total / gb:.2f} G, free = {usage.free / gb:.2f} G") + else: + raise ValueError(f"The cache directory ({path}) does not exist.") + + +def fetch_model( + model_id: str, + revision: Optional[str] = None, +) -> str: + """Fetch a neuron model. + + Args: + model_id (`str`): + The *model_id* of a model on the HuggingFace hub or the path to a local model. + revision (`Optional[str]`, defaults to `None`): + The revision of the model on the HuggingFace hub. + + Returns: + A string corresponding to the model_id or path. + """ + if not os.path.isdir("/sys/class/neuron_device/"): + raise SystemError("No neuron cores detected on the host.") + if os.path.isdir(model_id) and revision is not None: + logger.warning("Revision {} ignored for local model at {}".format(revision, model_id)) + revision = None + # Download the model from the Hub (HUGGING_FACE_HUB_TOKEN must be set for a private or gated model) + # Note that the model may already be present in the cache. 
+ config = AutoConfig.from_pretrained(model_id, revision=revision) + neuron_config = getattr(config, "neuron", None) + if neuron_config is not None: + if os.path.isdir(model_id): + return model_id + # Prefetch the neuron model from the Hub + logger.info(f"Fetching revision [{revision}] for neuron model {model_id} under {HF_HUB_CACHE}") + log_cache_size() + return snapshot_download(model_id, revision=revision, ignore_patterns="*.bin") + # Model needs to be exported: look for compatible cached entries on the hub + export_kwargs = get_export_kwargs_from_env() + export_config = NeuronModelForCausalLM.get_export_config(model_id, config, revision=revision, **export_kwargs) + neuron_config = export_config.neuron + if not is_cached(model_id, neuron_config): + hub_cache_url = "https://huggingface.co/aws-neuron/optimum-neuron-cache" + neuron_export_url = "https://huggingface.co/docs/optimum-neuron/main/en/guides/export_model#exporting-neuron-models-using-neuronx-tgi" + error_msg = ( + f"No cached version found for {model_id} with {neuron_config}." + f"You can start a discussion to request it on {hub_cache_url}" + f"Alternatively, you can export your own neuron model as explained in {neuron_export_url}" + ) + raise ValueError(error_msg) + logger.warning(f"{model_id} is not a neuron model: it will be exported using cached artifacts.") + if os.path.isdir(model_id): + return model_id + # Prefetch weights, tokenizer and generation config so that they are in cache + log_cache_size() + start = time.time() + snapshot_download(model_id, revision=revision, ignore_patterns="*.bin") + end = time.time() + logger.info(f"Model weights fetched in {end - start:.2f} s.") + log_cache_size() + return model_id diff --git a/backends/neuron/server/text_generation_server/server.py b/backends/neuron/server/text_generation_server/server.py new file mode 100644 index 000000000..8eb2592d6 --- /dev/null +++ b/backends/neuron/server/text_generation_server/server.py @@ -0,0 +1,89 @@ +import asyncio +from pathlib import Path +from typing import List + +from grpc import aio +from grpc_reflection.v1alpha import reflection +from loguru import logger + +from .generator import Generator, NeuronGenerator +from .interceptor import ExceptionInterceptor +from .pb import generate_pb2, generate_pb2_grpc + + +class TextGenerationService(generate_pb2_grpc.TextGenerationServiceServicer): + def __init__(self, generator: Generator, server_urls: List[str]): + self.generator = generator + self.server_urls = server_urls + + async def Info(self, request, context): + return self.generator.info + + async def Health(self, request, context): + return generate_pb2.HealthResponse() + + async def ServiceDiscovery(self, request, context): + return generate_pb2.ServiceDiscoveryResponse(urls=self.server_urls) + + async def ClearCache(self, request, context): + if request.HasField("id"): + self.generator.clear(request.id) + else: + self.generator.clear() + return generate_pb2.ClearCacheResponse() + + async def FilterBatch(self, request, context): + filtered_batch = self.generator.filter(request.batch_id, request.request_ids) + return generate_pb2.FilterBatchResponse(batch=filtered_batch) + + async def Warmup(self, request, context): + max_tokens = self.generator.warmup(request.batch) + return generate_pb2.WarmupResponse(max_supported_total_tokens=max_tokens) + + async def Prefill(self, request, context): + generations, batch = self.generator.prefill(request.batch) + return generate_pb2.PrefillResponse(generations=generations, batch=batch) + + async def 
Decode(self, request, context): + generations, batch = self.generator.decode(request.batches) + return generate_pb2.DecodeResponse(generations=generations, batch=batch) + + +def serve( + model_id: str, + revision: str, + uds_path: Path, +): + async def serve_inner(model_id: str, revision: str): + unix_socket_template = "unix://{}-{}" + local_url = unix_socket_template.format(uds_path, 0) + server_urls = [local_url] + + try: + generator = NeuronGenerator.from_pretrained(model_id, revision) + except Exception: + logger.exception("Error when initializing model") + raise + + server = aio.server(interceptors=[ExceptionInterceptor()]) + generate_pb2_grpc.add_TextGenerationServiceServicer_to_server( + TextGenerationService(generator, server_urls), server + ) + SERVICE_NAMES = ( + generate_pb2.DESCRIPTOR.services_by_name["TextGenerationService"].full_name, + reflection.SERVICE_NAME, + ) + reflection.enable_server_reflection(SERVICE_NAMES, server) + server.add_insecure_port(local_url) + + await server.start() + + logger.info("Server started at {}".format(local_url)) + + try: + await server.wait_for_termination() + except KeyboardInterrupt: + logger.info("Signal received. Shutting down") + await server.stop(0) + + asyncio.run(serve_inner(model_id, revision)) diff --git a/backends/neuron/tests/conftest.py b/backends/neuron/tests/conftest.py new file mode 100644 index 000000000..1dd20c8c6 --- /dev/null +++ b/backends/neuron/tests/conftest.py @@ -0,0 +1 @@ +pytest_plugins = ["fixtures.model"] diff --git a/backends/neuron/tests/fixtures/model.py b/backends/neuron/tests/fixtures/model.py new file mode 100644 index 000000000..6fa63ce86 --- /dev/null +++ b/backends/neuron/tests/fixtures/model.py @@ -0,0 +1,129 @@ +import copy +import logging +import subprocess +import sys +from tempfile import TemporaryDirectory + +import huggingface_hub +import pytest +from transformers import AutoTokenizer + +from optimum.neuron import NeuronModelForCausalLM +from optimum.neuron.utils import synchronize_hub_cache +from optimum.neuron.version import __sdk_version__ as sdk_version +from optimum.neuron.version import __version__ as version + + +logging.basicConfig( + level=logging.INFO, + format="[%(asctime)s] %(levelname)s [%(filename)s.%(funcName)s:%(lineno)d] %(message)s", + stream=sys.stdout, +) +logger = logging.getLogger(__file__) + +OPTIMUM_CACHE_REPO_ID = "optimum-internal-testing/neuron-testing-cache" + +# All model configurations below will be added to the neuron_model_config fixture +MODEL_CONFIGURATIONS = { + "gpt2": { + "model_id": "gpt2", + "export_kwargs": {"batch_size": 4, "sequence_length": 1024, "num_cores": 2, "auto_cast_type": "fp16"}, + }, + "llama": { + "model_id": "NousResearch/Hermes-2-Theta-Llama-3-8B", + "export_kwargs": {"batch_size": 4, "sequence_length": 2048, "num_cores": 2, "auto_cast_type": "fp16"}, + }, + "mistral": { + "model_id": "optimum/mistral-1.1b-testing", + "export_kwargs": {"batch_size": 4, "sequence_length": 4096, "num_cores": 2, "auto_cast_type": "bf16"}, + }, + "qwen2": { + "model_id": "Qwen/Qwen2.5-0.5B", + "export_kwargs": {"batch_size": 4, "sequence_length": 4096, "num_cores": 2, "auto_cast_type": "fp16"}, + }, + "granite": { + "model_id": "ibm-granite/granite-3.1-2b-instruct", + "export_kwargs": {"batch_size": 4, "sequence_length": 4096, "num_cores": 2, "auto_cast_type": "bf16"}, + }, +} + + +def get_hub_neuron_model_id(config_name: str): + return f"optimum-internal-testing/neuron-testing-{version}-{sdk_version}-{config_name}" + + +def export_model(model_id, export_kwargs, 
neuron_model_path): + export_command = ["optimum-cli", "export", "neuron", "-m", model_id, "--task", "text-generation"] + for kwarg, value in export_kwargs.items(): + export_command.append(f"--{kwarg}") + export_command.append(str(value)) + export_command.append(neuron_model_path) + logger.info(f"Exporting {model_id} with {export_kwargs}") + try: + subprocess.run(export_command, check=True) + except subprocess.CalledProcessError as e: + raise ValueError(f"Failed to export model: {e}") + + +@pytest.fixture(scope="session", params=MODEL_CONFIGURATIONS.keys()) +def neuron_model_config(request): + """Expose a pre-trained neuron model + + The fixture first makes sure the following model artifacts are present on the hub: + - exported neuron model under optimum-internal-testing/neuron-testing--, + - cached artifacts under optimum-internal-testing/neuron-testing-cache. + If not, it will export the model and push it to the hub. + + It then fetches the model locally and return a dictionary containing: + - a configuration name, + - the original model id, + - the export parameters, + - the neuron model id, + - the neuron model local path. + + For each exposed model, the local directory is maintained for the duration of the + test session and cleaned up afterwards. + The hub model artifacts are never cleaned up and persist accross sessions. + They must be cleaned up manually when the optimum-neuron version changes. + + """ + config_name = request.param + model_config = copy.deepcopy(MODEL_CONFIGURATIONS[request.param]) + model_id = model_config["model_id"] + export_kwargs = model_config["export_kwargs"] + neuron_model_id = get_hub_neuron_model_id(config_name) + with TemporaryDirectory() as neuron_model_path: + hub = huggingface_hub.HfApi() + if hub.repo_exists(neuron_model_id): + logger.info(f"Fetching {neuron_model_id} from the HuggingFace hub") + hub.snapshot_download(neuron_model_id, local_dir=neuron_model_path) + else: + export_model(model_id, export_kwargs, neuron_model_path) + tokenizer = AutoTokenizer.from_pretrained(model_id) + tokenizer.save_pretrained(neuron_model_path) + del tokenizer + # Create the test model on the hub + hub.create_repo(neuron_model_id, private=True) + hub.upload_folder( + folder_path=neuron_model_path, + repo_id=neuron_model_id, + ignore_patterns=[NeuronModelForCausalLM.CHECKPOINT_DIR + "/*"], + ) + # Make sure it is cached + synchronize_hub_cache(cache_repo_id=OPTIMUM_CACHE_REPO_ID) + # Add dynamic parameters to the model configuration + model_config["neuron_model_path"] = neuron_model_path + model_config["neuron_model_id"] = neuron_model_id + # Also add model configuration name to allow tests to adapt their expectations + model_config["name"] = config_name + # Yield instead of returning to keep a reference to the temporary directory. + # It will go out of scope and be released only once all tests needing the fixture + # have been completed. 
+ logger.info(f"{config_name} ready for testing ...") + yield model_config + logger.info(f"Done with {config_name}") + + +@pytest.fixture(scope="module") +def neuron_model_path(neuron_model_config): + yield neuron_model_config["neuron_model_path"] diff --git a/backends/neuron/tests/prune_test_models.py b/backends/neuron/tests/prune_test_models.py new file mode 100644 index 000000000..592b1070e --- /dev/null +++ b/backends/neuron/tests/prune_test_models.py @@ -0,0 +1,25 @@ +import os + +from argparse import ArgumentParser +from huggingface_hub import HfApi + + +def main(): + parser = ArgumentParser() + parser.add_argument("--yes", action="store_true", default=False) + args = parser.parse_args() + api = HfApi() + models = api.list_models(search="optimum-internal-testing/neuron-tgi-testing") + for model in models: + if args.yes: + delete = True + else: + answer = input(f"Do you want to delete {model.id} [y/N] ?") + delete = (answer == "y") + if delete: + api.delete_repo(model.id) + print(f"Deleted {model.id}.") + + +if __name__ == "__main__": + main() diff --git a/backends/neuron/tests/pytest.ini b/backends/neuron/tests/pytest.ini new file mode 100644 index 000000000..2f4c80e30 --- /dev/null +++ b/backends/neuron/tests/pytest.ini @@ -0,0 +1,2 @@ +[pytest] +asyncio_mode = auto diff --git a/backends/neuron/tests/requirements.txt b/backends/neuron/tests/requirements.txt new file mode 100644 index 000000000..ef3c8543e --- /dev/null +++ b/backends/neuron/tests/requirements.txt @@ -0,0 +1,19 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+text-generation >= 0.6.0 +pytest >= 7.4.0 +pytest-asyncio >= 0.21.1 +requests < 2.32.0 +docker >= 6.1.3 +Levenshtein diff --git a/backends/neuron/tests/server/helpers.py b/backends/neuron/tests/server/helpers.py new file mode 100644 index 000000000..81547cb6a --- /dev/null +++ b/backends/neuron/tests/server/helpers.py @@ -0,0 +1,149 @@ +from text_generation_server.generator import NeuronGenerator +from text_generation_server.pb.generate_pb2 import ( + Batch, + NextTokenChooserParameters, + Request, + StoppingCriteriaParameters, +) + + +def create_request( + id: int, + inputs: str, + truncate: int = 0, + max_new_tokens: int = 20, + do_sample: bool = False, + top_k: int = 50, + top_p: float = 0.9, + temperature: float = 1.0, + seed: int = 42, + repetition_penalty: float = 1.0, +): + parameters = NextTokenChooserParameters( + temperature=temperature, + top_k=top_k, + top_p=top_p, + do_sample=do_sample, + seed=seed, + repetition_penalty=repetition_penalty, + ) + stopping_parameters = StoppingCriteriaParameters(max_new_tokens=max_new_tokens) + return Request( + id=id, inputs=inputs, truncate=truncate, parameters=parameters, stopping_parameters=stopping_parameters + ) + + +def check_prefill(input_text, expected_token_id, expected_token_text, do_sample, batch_size, model_path): + """Verify that a prefill for a single request generates the expected output.""" + generator = NeuronGenerator.from_pretrained(model_path) + assert generator.model.batch_size >= batch_size + requests = [] + max_new_tokens = 20 + for i in range(batch_size): + requests.append(create_request(id=0, inputs=input_text, do_sample=do_sample, max_new_tokens=max_new_tokens)) + # Let's be pessimistic when estimating max_tokens + batch_size * (len(input_text) + max_new_tokens) + max_length = generator.model.max_length + batch = Batch(id=0, requests=requests, size=batch_size, max_tokens=batch_size * max_length) + generations, next_batch = generator.prefill(batch) + assert next_batch.size == batch_size + # Whatever was passed as max_tokens, the server will correct it + # because of static batching + assert next_batch.max_tokens == batch_size * max_length + assert len(generations) == batch_size + for g in generations: + tokens = g.tokens + assert tokens.ids == [expected_token_id] + assert tokens.texts == [expected_token_text] + + +def check_decode_single(input_text, max_new_tokens, generated_text, do_sample, model_path): + """Verify that a decoding for a single request generates the expected output.""" + generator = NeuronGenerator.from_pretrained(model_path) + request = create_request(id=0, inputs=input_text, max_new_tokens=max_new_tokens, do_sample=do_sample) + max_length = generator.model.max_length + batch = Batch(id=0, requests=[request], size=1, max_tokens=max_length) + generations, next_batch = generator.prefill(batch) + # We already generated one token: call decode max_new_tokens - 1 times + for _ in range(max_new_tokens - 1): + assert next_batch.size == 1 + assert next_batch.max_tokens == max_length + assert len(generations) == 1 + assert len(generations[0].tokens.ids) == 1 + generations, next_batch = generator.decode([next_batch]) + assert next_batch is None + assert len(generations) == 1 + output = generations[0].generated_text + assert output.generated_tokens == max_new_tokens + assert output.finish_reason == 0 + assert output.text == generated_text + + +def check_decode_multiple(model_path): + """Verify that two requests added to the batch at different generation steps + generate the same outputs (continuous batching). 
+ """ + generator = NeuronGenerator.from_pretrained(model_path) + assert generator.model.batch_size > 1 + input_text = "Once upon a time" + max_new_tokens = 20 + # Prefill a single request, remembering the generated token + tokens = {0: [], 1: []} + request = create_request(id=0, inputs=input_text, max_new_tokens=max_new_tokens) + max_length = generator.model.max_length + batch = Batch(id=0, requests=[request], size=1, max_tokens=max_length) + generations, next_batch = generator.prefill(batch) + assert next_batch.size == 1 + assert len(generations) == 1 + g = generations[0] + tokens[g.request_id].append(g.tokens.ids[0]) + assert len(tokens[0]) == 1 + # Decode a few tokens + gen_tokens = 4 + for _ in range(gen_tokens - 1): + generations, next_batch = generator.decode([next_batch]) + assert len(generations) == 1 + g = generations[0] + tokens[g.request_id].append(g.tokens.ids[0]) + assert len(tokens[0]) == gen_tokens + assert next_batch.size == 1 + # Add a second request + request = create_request(id=1, inputs=input_text, max_new_tokens=max_new_tokens) + batch = Batch(id=1, requests=[request], size=1, max_tokens=max_length) + generations, next_batch_1 = generator.prefill(batch) + assert next_batch_1.size == 1 + # We should have generated only a single token + assert len(generations) == 1 + g = generations[0] + tokens[g.request_id].append(g.tokens.ids[0]) + assert len(tokens[0]) == gen_tokens + assert len(tokens[1]) == 1 + # Decode more tokens until we reach the maximum for the first request + batches = [next_batch, next_batch_1] + for _ in range(max_new_tokens - gen_tokens): + generations, next_batch = generator.decode(batches) + for g in generations: + tokens[g.request_id].append(g.tokens.ids[0]) + batches = [next_batch] + # Verify we now only have one pending request + assert next_batch.size == 1 + assert len(tokens[0]) == max_new_tokens + assert len(tokens[1]) == max_new_tokens - gen_tokens + 1 + # Verify we have the output for the first request + for g in generations: + if g.request_id == 0: + output = g.generated_text + assert output.text != "" + assert output.generated_tokens == max_new_tokens + generated_text = output.text + # Continue decoding until the end of the second request + for _ in range(gen_tokens - 1): + generations, next_batch = generator.decode([next_batch]) + assert len(generations) == 1 + g = generations[0] + tokens[g.request_id].append(g.tokens.ids[0]) + assert next_batch is None + output = generations[0].generated_text + assert output.generated_tokens == max_new_tokens + assert tokens[0] == tokens[1] + assert output.text == generated_text diff --git a/backends/neuron/tests/server/test_continuous_batching.py b/backends/neuron/tests/server/test_continuous_batching.py new file mode 100644 index 000000000..48bb70cc8 --- /dev/null +++ b/backends/neuron/tests/server/test_continuous_batching.py @@ -0,0 +1,74 @@ +from helpers import create_request +from text_generation_server.generator import NeuronGenerator +from text_generation_server.pb.generate_pb2 import Batch + + +def test_continuous_batching_two_requests(neuron_model_config): + """Verify that two requests added to the batch at different generation steps + generate the same outputs (continuous batching). 
+ """ + neuron_model_path = neuron_model_config["neuron_model_path"] + generator = NeuronGenerator.from_pretrained(neuron_model_path) + assert generator.model.batch_size > 1 + input_text = "Once upon a time" + max_new_tokens = 20 + # Prefill a single request, remembering the generated token + tokens = {0: [], 1: []} + request = create_request(id=0, inputs=input_text, max_new_tokens=max_new_tokens) + max_length = generator.model.max_length + batch = Batch(id=0, requests=[request], size=1, max_tokens=max_length) + generations, next_batch = generator.prefill(batch) + assert next_batch.size == 1 + assert len(generations) == 1 + g = generations[0] + tokens[g.request_id].append(g.tokens.ids[0]) + assert len(tokens[0]) == 1 + # Decode a few tokens + gen_tokens = 4 + for _ in range(gen_tokens - 1): + generations, next_batch = generator.decode([next_batch]) + assert len(generations) == 1 + g = generations[0] + tokens[g.request_id].append(g.tokens.ids[0]) + assert len(tokens[0]) == gen_tokens + assert next_batch.size == 1 + # Add a second request + request = create_request(id=1, inputs=input_text, max_new_tokens=max_new_tokens) + batch = Batch(id=1, requests=[request], size=1, max_tokens=max_length) + generations, next_batch_1 = generator.prefill(batch) + assert next_batch_1.size == 1 + # We should have generated only a single token + assert len(generations) == 1 + g = generations[0] + tokens[g.request_id].append(g.tokens.ids[0]) + assert len(tokens[0]) == gen_tokens + assert len(tokens[1]) == 1 + # Decode more tokens until we reach the maximum for the first request + batches = [next_batch, next_batch_1] + for _ in range(max_new_tokens - gen_tokens): + generations, next_batch = generator.decode(batches) + for g in generations: + tokens[g.request_id].append(g.tokens.ids[0]) + batches = [next_batch] + # Verify we now only have one pending request + assert next_batch.size == 1 + assert len(tokens[0]) == max_new_tokens + assert len(tokens[1]) == max_new_tokens - gen_tokens + 1 + # Verify we have the output for the first request + for g in generations: + if g.request_id == 0: + output = g.generated_text + assert output.text != "" + assert output.generated_tokens == max_new_tokens + generated_text = output.text + # Continue decoding until the end of the second request + for _ in range(gen_tokens - 1): + generations, next_batch = generator.decode([next_batch]) + assert len(generations) == 1 + g = generations[0] + tokens[g.request_id].append(g.tokens.ids[0]) + assert next_batch is None + output = generations[0].generated_text + assert output.generated_tokens == max_new_tokens + assert tokens[0] == tokens[1] + assert output.text == generated_text diff --git a/backends/neuron/tests/server/test_decode.py b/backends/neuron/tests/server/test_decode.py new file mode 100644 index 000000000..2ab4c2da0 --- /dev/null +++ b/backends/neuron/tests/server/test_decode.py @@ -0,0 +1,55 @@ +from helpers import create_request +from text_generation_server.generator import NeuronGenerator +from text_generation_server.pb.generate_pb2 import Batch + + +def test_decode(neuron_model_config): + """Verify that a decoding for a single request generates the expected output.""" + config_name = neuron_model_config["name"] + neuron_model_path = neuron_model_config["neuron_model_path"] + generator = NeuronGenerator.from_pretrained(neuron_model_path) + for do_sample in [True, False]: + mode = "sample" if do_sample else "greedy" + print(f"{config_name}[{mode}]") + _test_decode(config_name, generator, do_sample) + generator.clear() + + +def 
_test_decode(config_name, generator, do_sample): + input_text = "It was a bright cold day in April, and the clocks were striking thirteen." + max_new_tokens = 20 + request = create_request(id=0, inputs=input_text, max_new_tokens=max_new_tokens, do_sample=do_sample) + max_length = generator.model.max_length + batch = Batch(id=0, requests=[request], size=1, max_tokens=max_length) + generations, next_batch = generator.prefill(batch) + # We already generated one token: call decode max_new_tokens - 1 times + for _ in range(max_new_tokens - 1): + assert next_batch.size == 1 + assert next_batch.max_tokens == max_length + assert len(generations) == 1 + assert len(generations[0].tokens.ids) == 1 + generations, next_batch = generator.decode([next_batch]) + assert next_batch is None + assert len(generations) == 1 + output = generations[0].generated_text + assert output.generated_tokens == max_new_tokens + assert output.finish_reason == 0 + if do_sample: + expected_text = { + "gpt2": " The sun was set", + "llama": "George Orwell, 1984", + "mistral": "The sky was", + "qwen2": " A young woman with", + "granite": "1984, George Orwell", + }[config_name] + assert expected_text in output.text + else: + print(output.text) + expected_text = { + "gpt2": '\n\n"I\'m going to go to bed," I said.\n\n"I\'m going', + "llama": " George Orwell’s classic dystopian novel, 1984, begins with this ominous sentence. The story", + "mistral": "\nThe clocks were striking thirteen.\nThe clocks were striking thirteen.", + "qwen2": " I was sitting in my room, staring at the ceiling, when the door opened and in came a", + "granite": "\n\nThis opening line from George Orwell's dystopian novel \"198", + }[config_name] + assert output.text == expected_text diff --git a/backends/neuron/tests/server/test_generator_slot.py b/backends/neuron/tests/server/test_generator_slot.py new file mode 100644 index 000000000..459ee3e5b --- /dev/null +++ b/backends/neuron/tests/server/test_generator_slot.py @@ -0,0 +1,61 @@ +import pytest +import torch +from text_generation_server.generator import Slot +from text_generation_server.pb.generate_pb2 import Request +from transformers import AutoTokenizer, GenerationConfig + + +TOKENIZERS = ["NousResearch/Llama-2-7b-hf", "gpt2"] + + +@pytest.fixture(params=TOKENIZERS) +def tokenizer(request): + t = AutoTokenizer.from_pretrained(request.param) + t.padding_side = "left" + t.pad_token_id = t.eos_token_id + return t + + +@pytest.mark.parametrize( + "input_text, generated_text", + [ + [ + "It was a bright cold day in April, and the clocks were striking thirteen.", + " Winston Smith, his chin nuzzled into his breast in an effort to escape the vile wind," + " slipped quickly through the glass doors of Victory Mansions, though not quickly enough" + " to prevent a swirl of gritty dust from entering along with him.", + ], + ["This sentence is written in chinese:", "我很感谢你的热情"], + ["Some text might contain a lot of emojis like 😃", "😍💪 👉 👀"], + ], + ids=["spaces", "chinese-utf8", "emojis"], +) +def test_decode_streaming(tokenizer, input_text, generated_text): + slot = Slot(0, tokenizer) + request = Request(id=0, inputs=input_text) + slot.assign(0, request, GenerationConfig()) + assert slot.cached_text == input_text + + inputs = tokenizer(input_text, padding="max_length", max_length=len(input_text) + 1, return_tensors="pt") + input_ids = inputs["input_ids"][0] + attention_mask = inputs["attention_mask"][0] + generated_tokens = tokenizer(generated_text, add_special_tokens=False)["input_ids"] + + # We need to regenerate 
the full text as the tokenizer might change it (extra spaces might be added) + all_input_ids = torch.cat([input_ids, torch.tensor(generated_tokens)]) + full_text = tokenizer.decode(all_input_ids, skip_special_tokens=True) + regenerated_text = full_text[len(input_text) :] + + # Initialize the slot with the inputs + slot.reset(input_ids, attention_mask, selector=None) + + assert slot.generated_tokens == 0 + + # Simulate an iterative generation (i.e. don't call select and use known tokens instead) + decoded_text = "" + for i in range(len(generated_tokens)): + text = slot.append(generated_tokens[i]) + assert slot.generated_tokens == i + 1 + decoded_text += text + + assert decoded_text == regenerated_text diff --git a/backends/neuron/tests/server/test_info.py b/backends/neuron/tests/server/test_info.py new file mode 100644 index 000000000..5913acec4 --- /dev/null +++ b/backends/neuron/tests/server/test_info.py @@ -0,0 +1,10 @@ +from text_generation_server.generator import NeuronGenerator + + +def test_info(neuron_model_path): + generator = NeuronGenerator.from_pretrained(neuron_model_path) + info = generator.info + assert info.requires_padding is True + assert info.device_type == "xla" + assert info.window_size == 0 + assert info.speculate == 0 diff --git a/backends/neuron/tests/server/test_prefill.py b/backends/neuron/tests/server/test_prefill.py new file mode 100644 index 000000000..2120e5c59 --- /dev/null +++ b/backends/neuron/tests/server/test_prefill.py @@ -0,0 +1,89 @@ +from helpers import create_request +from text_generation_server.generator import NeuronGenerator +from text_generation_server.pb.generate_pb2 import Batch + + +def test_prefill(neuron_model_config): + """Verify that a prefill for a single request generates the expected output.""" + config_name = neuron_model_config["name"] + neuron_model_path = neuron_model_config["neuron_model_path"] + generator = NeuronGenerator.from_pretrained(neuron_model_path) + max_batch_size = 4 + assert generator.model.batch_size >= max_batch_size + for num_requests in [1, max_batch_size]: + for do_sample in [True, False]: + mode = "sample" if do_sample else "greedy" + print(f"[{mode}]: {num_requests} requests") + _test_prefill(config_name, generator, num_requests, do_sample) + generator.clear() + + +def _test_prefill(config_name, generator, batch_size, do_sample): + requests = [] + max_new_tokens = 20 + input_text = "It was a bright cold day in April, and the clocks were striking thirteen." 
+ for i in range(batch_size): + requests.append(create_request(id=i, inputs=input_text, do_sample=do_sample, max_new_tokens=max_new_tokens)) + # Let's be pessimistic when estimating max_tokens + max_length = generator.model.max_length + batch = Batch(id=0, requests=requests, size=batch_size, max_tokens=batch_size * max_length) + generations, next_batch = generator.prefill(batch) + assert next_batch.size == batch_size + # Whatever was passed as max_tokens, the server will correct it + # because of static batching + assert next_batch.max_tokens == batch_size * max_length + assert len(generations) == batch_size + if do_sample: + expectations = { + "gpt2": [383, " The"], + "llama": [10058, " George"], + "mistral": [450, " The"], + "qwen2": [362, " A"], + "granite": [308, " ("], + }[config_name] + else: + expectations = { + "gpt2": [198, "\n"], + "llama": [10058, " George"], + "mistral": [13, "\n"], + "qwen2": [358, " I"], + "granite": [203, "\n"], + }[config_name] + for g in generations: + tokens = g.tokens + assert tokens.ids[0] == expectations[0] + assert tokens.texts[0] == expectations[1] + + +def test_prefill_truncate(neuron_model_config): + config_name = neuron_model_config["name"] + neuron_model_path = neuron_model_config["neuron_model_path"] + generator = NeuronGenerator.from_pretrained(neuron_model_path) + batch_size = generator.model.batch_size + # We apply truncation to all requests but the first one + truncate = [ + None, + ] + [i * 3 for i in range(1, batch_size)] + input_text = ( + "Two gin-scented tears trickled down the sides of his nose." + " But it was all right, everything was all right, the struggle was finished." + " He had won the victory over himself. He loved Big Brother." + ) + requests = [] + for i in range(batch_size): + requests.append(create_request(id=i, inputs=input_text, truncate=truncate[i])) + max_length = generator.model.max_length + batch = Batch(id=0, requests=requests, size=batch_size, max_tokens=batch_size * max_length) + generations, _ = generator.prefill(batch) + # Even if the input text is identical for all requests, the first generated token might + # be different because of the truncation + expectations = { + "gpt2": [" He", " He", "\n", " He"], + "llama": [" —", " The", " He", " He"], + "mistral": [" He", "\n", " He", " He"], + "qwen2": [" He", " The", " He", " He"], + "granite": ["\n", "\n", " I", " He"], + }[config_name] + for i, g in enumerate(generations): + tokens = g.tokens + assert tokens.texts[0] == expectations[i] diff --git a/backends/neuron/tgi-entrypoint.sh b/backends/neuron/tgi-entrypoint.sh new file mode 100755 index 000000000..b959a7958 --- /dev/null +++ b/backends/neuron/tgi-entrypoint.sh @@ -0,0 +1,16 @@ +#!/bin/bash +set -e -o pipefail -u + +export ENV_FILEPATH=$(mktemp) + +trap "rm -f ${ENV_FILEPATH}" EXIT + +touch $ENV_FILEPATH + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) + +${SCRIPT_DIR}/tgi_env.py $@ + +source $ENV_FILEPATH + +exec text-generation-launcher $@ diff --git a/backends/neuron/tgi_env.py b/backends/neuron/tgi_env.py new file mode 100755 index 000000000..ff647c988 --- /dev/null +++ b/backends/neuron/tgi_env.py @@ -0,0 +1,226 @@ +#!/usr/bin/env python + +import argparse +import logging +import os +import sys +from typing import Any, Dict, List, Optional + +from huggingface_hub import constants +from transformers import AutoConfig + +from optimum.neuron.modeling_decoder import get_available_cores +from optimum.neuron.utils import get_hub_cached_entries +from 
optimum.neuron.utils.version_utils import get_neuronxcc_version + + +logger = logging.getLogger(__name__) + +tgi_router_env_vars = ["MAX_BATCH_SIZE", "MAX_TOTAL_TOKENS", "MAX_INPUT_TOKENS", "MAX_BATCH_PREFILL_TOKENS"] +tgi_server_env_vars = ["HF_NUM_CORES", "HF_AUTO_CAST_TYPE"] + +env_config_peering = [ + ("MAX_BATCH_SIZE", "batch_size"), + ("MAX_TOTAL_TOKENS", "sequence_length"), + ("HF_AUTO_CAST_TYPE", "auto_cast_type"), + ("HF_NUM_CORES", "num_cores"), +] + +# By the end of this script all env var should be specified properly +env_vars = tgi_server_env_vars + tgi_router_env_vars + +available_cores = get_available_cores() +neuronxcc_version = get_neuronxcc_version() + + +def parse_cmdline_and_set_env(argv: List[str] = None) -> argparse.Namespace: + parser = argparse.ArgumentParser() + if not argv: + argv = sys.argv + # All these are params passed to tgi and intercepted here + parser.add_argument( + "--max-input-tokens", type=int, default=os.getenv("MAX_INPUT_TOKENS", os.getenv("MAX_INPUT_LENGTH", 0)) + ) + parser.add_argument("--max-total-tokens", type=int, default=os.getenv("MAX_TOTAL_TOKENS", 0)) + parser.add_argument("--max-batch-size", type=int, default=os.getenv("MAX_BATCH_SIZE", 0)) + parser.add_argument("--max-batch-prefill-tokens", type=int, default=os.getenv("MAX_BATCH_PREFILL_TOKENS", 0)) + parser.add_argument("--model-id", type=str, default=os.getenv("MODEL_ID")) + parser.add_argument("--revision", type=str, default=os.getenv("REVISION")) + + args = parser.parse_known_args(argv)[0] + + if not args.model_id: + raise Exception("No model id provided ! Either specify it using --model-id cmdline or MODEL_ID env var") + + # Override env with cmdline params + os.environ["MODEL_ID"] = args.model_id + + # Set all tgi router and tgi server values to consistent values as early as possible + # from the order of the parser defaults, the tgi router value can override the tgi server ones + if args.max_total_tokens > 0: + os.environ["MAX_TOTAL_TOKENS"] = str(args.max_total_tokens) + + if args.max_input_tokens > 0: + os.environ["MAX_INPUT_TOKENS"] = str(args.max_input_tokens) + + if args.max_batch_size > 0: + os.environ["MAX_BATCH_SIZE"] = str(args.max_batch_size) + + if args.max_batch_prefill_tokens > 0: + os.environ["MAX_BATCH_PREFILL_TOKENS"] = str(args.max_batch_prefill_tokens) + + if args.revision: + os.environ["REVISION"] = str(args.revision) + + return args + + +def neuron_config_to_env(neuron_config): + with open(os.environ["ENV_FILEPATH"], "w") as f: + for env_var, config_key in env_config_peering: + f.write("export {}={}\n".format(env_var, neuron_config[config_key])) + max_input_tokens = os.getenv("MAX_INPUT_TOKENS") + if not max_input_tokens: + max_input_tokens = int(neuron_config["sequence_length"]) // 2 + if max_input_tokens == 0: + raise Exception("Model sequence length should be greater than 1") + f.write("export MAX_INPUT_TOKENS={}\n".format(max_input_tokens)) + max_batch_prefill_tokens = os.getenv("MAX_BATCH_PREFILL_TOKENS") + if not max_batch_prefill_tokens: + max_batch_prefill_tokens = int(neuron_config["batch_size"]) * int(max_input_tokens) + f.write("export MAX_BATCH_PREFILL_TOKENS={}\n".format(max_batch_prefill_tokens)) + + +def sort_neuron_configs(dictionary): + return -dictionary["num_cores"], -dictionary["batch_size"] + + +def lookup_compatible_cached_model(model_id: str, revision: Optional[str]) -> Optional[Dict[str, Any]]: + # Reuse the same mechanic as the one in use to configure the tgi server part + # The only difference here is that we stay as flexible as 
possible on the compatibility part + entries = get_hub_cached_entries(model_id, "inference") + + logger.debug("Found %d cached entries for model %s, revision %s", len(entries), model_id, revision) + + all_compatible = [] + for entry in entries: + if check_env_and_neuron_config_compatibility(entry, check_compiler_version=True): + all_compatible.append(entry) + + if not all_compatible: + logger.debug( + "No compatible cached entry found for model %s, env %s, available cores %s, neuronxcc version %s", + model_id, + get_env_dict(), + available_cores, + neuronxcc_version, + ) + return None + + logger.info("%d compatible neuron cached models found", len(all_compatible)) + + all_compatible = sorted(all_compatible, key=sort_neuron_configs) + + entry = all_compatible[0] + + return entry + + +def check_env_and_neuron_config_compatibility(neuron_config: Dict[str, Any], check_compiler_version: bool) -> bool: + logger.debug( + "Checking the provided neuron config %s is compatible with the local setup and provided environment", + neuron_config, + ) + + # Local setup compat checks + if neuron_config["num_cores"] > available_cores: + logger.debug("Not enough neuron cores available to run the provided neuron config") + return False + + if check_compiler_version and neuron_config["compiler_version"] != neuronxcc_version: + logger.debug( + "Compiler version conflict, the local one (%s) differs from the one used to compile the model (%s)", + neuronxcc_version, + neuron_config["compiler_version"], + ) + return False + + for env_var, config_key in env_config_peering: + neuron_config_value = str(neuron_config[config_key]) + env_value = os.getenv(env_var, str(neuron_config_value)) + if env_value != neuron_config_value: + logger.debug( + "The provided env var '%s' and the neuron config '%s' param differ (%s != %s)", + env_var, + config_key, + env_value, + neuron_config_value, + ) + return False + + max_input_tokens = int(os.getenv("MAX_INPUT_TOKENS", os.getenv("MAX_INPUT_LENGTH", 0))) + if max_input_tokens > 0: + sequence_length = neuron_config["sequence_length"] + if max_input_tokens >= sequence_length: + logger.debug( + "Specified max input tokens is not compatible with config sequence length ( %s >= %s)", + max_input_tokens, + sequence_length, + ) + return False + + return True + + +def get_env_dict() -> Dict[str, str]: + d = {} + for k in env_vars: + d[k] = os.getenv(k) + return d + + +def main(): + """ + This script determines proper default TGI env variables for the neuron precompiled models to + work properly + :return: + """ + args = parse_cmdline_and_set_env() + + for env_var in env_vars: + if not os.getenv(env_var): + break + else: + logger.info("All env vars %s already set, skipping, user know what they are doing", env_vars) + sys.exit(0) + + cache_dir = constants.HF_HUB_CACHE + + logger.info("Cache dir %s, model %s", cache_dir, args.model_id) + + config = AutoConfig.from_pretrained(args.model_id, revision=args.revision) + neuron_config = getattr(config, "neuron", None) + if neuron_config is not None: + compatible = check_env_and_neuron_config_compatibility(neuron_config, check_compiler_version=False) + if not compatible: + env_dict = get_env_dict() + msg = ( + "Invalid neuron config and env. 
Config {}, env {}, available cores {}, neuronxcc version {}"
+ ).format(neuron_config, env_dict, available_cores, neuronxcc_version)
+ logger.error(msg)
+ raise Exception(msg)
+ else:
+ neuron_config = lookup_compatible_cached_model(args.model_id, args.revision)
+
+ if not neuron_config:
+ msg = ("No compatible neuron config found. Provided env {}, available cores {}, neuronxcc version {}").format(
+ get_env_dict(), available_cores, neuronxcc_version
+ )
+ logger.error(msg)
+ raise Exception(msg)
+
+ neuron_config_to_env(neuron_config)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml
index e073353fc..39f0ef4bd 100644
--- a/docs/source/_toctree.yml
+++ b/docs/source/_toctree.yml
@@ -50,6 +50,8 @@
 title: Train Medusa
 title: Tutorials
 - sections:
+ - local: backends/neuron
+ title: Neuron
 - local: backends/trtllm
 title: TensorRT-LLM
 - local: backends/llamacpp
diff --git a/docs/source/backends/neuron.md b/docs/source/backends/neuron.md
new file mode 100644
index 000000000..8e80701f5
--- /dev/null
+++ b/docs/source/backends/neuron.md
@@ -0,0 +1,179 @@
+# Neuron backend for AWS Trainium and Inferentia
+
+The Neuron backend allows the deployment of TGI on the AWS Trainium and Inferentia family of chips.
+
+The following hardware targets are supported:
+- Trainium 1,
+- Inferentia 2.
+
+## Features
+
+The basic TGI features are supported:
+
+- continuous batching,
+- token streaming,
+- greedy search and multinomial sampling using [transformers](https://huggingface.co/docs/transformers/generation_strategies#customize-text-generation).
+
+
+## Deploy the service from the Hugging Face hub
+
+The simplest way to deploy the NeuronX TGI service for a specific model is to follow the
+deployment instructions in the model card:
+
+- click on the "Deploy" button on the right,
+- select your deployment service ("Inference Endpoints" and "SageMaker" are supported),
+- select "AWS Trainium & Inferentia",
+- follow the instructions.
+
+
+## Deploy the service on a dedicated host
+
+The service is launched simply by running the text-generation-inference container with two sets of parameters:
+
+```
+docker run <system parameters> ghcr.io/huggingface/text-generation-inference:3.1.0-neuron <service parameters>
+```
+
+- system parameters are used to map ports, volumes and devices between the host and the service,
+- service parameters are forwarded to the `text-generation-launcher`.
+
+When deploying a service, you will need a pre-compiled Neuron model. The Neuron TGI backend supports two main modes of operation:
+
+- you can either deploy the service on a model that has already been exported to Neuron,
+- or alternatively you can take advantage of the Neuron Model Cache to export your own model.
+
+### Common system parameters
+
+Whenever you launch a TGI service, we highly recommend that you mount a shared volume as `/data` in the container: this is where
+the models will be cached to speed up further instantiations of the service.
+
+Note also that enough neuron devices should be made visible to the container, knowing that each neuron device has two cores (so when deploying on two cores you need to expose at least one device).
+The recommended way to expose a device in a production environment is to use the `--device` option explicitly (e.g. `--device /dev/neuron0`), repeated as many times as there are devices to be exposed.
+
+Note: alternatively, for a quick local test it is also possible to launch the service in `privileged` mode to get access to all neuron devices.
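+
+Note: the same system parameters can also be driven from Python through the [`docker` SDK](https://pypi.org/project/docker/), which this patch's own integration test fixtures already rely on. The snippet below is only a sketch under a few assumptions: the `docker` Python package is installed, `/dev/neuron0` is the only device to expose, and the model path is a hypothetical example of a model exported under `./data`.
+
+```python
+import os
+
+import docker
+
+client = docker.from_env()
+
+# Map the HTTP port, mount the shared cache volume as /data, expose the first
+# Neuron device, forward the HF token, then pass the service parameters.
+container = client.containers.run(
+    "ghcr.io/huggingface/text-generation-inference:3.1.0-neuron",
+    command=["--model-id", "/data/my-neuron-model"],  # hypothetical exported model path
+    ports={"80/tcp": 8080},
+    volumes={os.path.abspath("data"): {"bind": "/data", "mode": "rw"}},
+    devices=["/dev/neuron0:/dev/neuron0:rwm"],
+    environment={"HF_TOKEN": os.environ.get("HF_TOKEN", "")},
+    detach=True,
+)
+print(container.logs(tail=20).decode())
+```
+
+The resulting container behaves exactly like the `docker run` invocations shown in the rest of this section.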
+
+
+Finally, you might want to export the `HF_TOKEN` if you want to access gated repositories.
+
+Here is an example of a service instantiation exposing only the first device:
+
+```
+docker run -p 8080:80 \
+ -v $(pwd)/data:/data \
+ --device=/dev/neuron0 \
+ -e HF_TOKEN=${HF_TOKEN} \
+ ghcr.io/huggingface/text-generation-inference:<VERSION>-neuron \
+ <service parameters>
+```
+
+### Using a standard model from the 🤗 [HuggingFace Hub](https://huggingface.co/aws-neuron) (recommended)
+
+We maintain a Neuron Model Cache of the most popular architectures and deployment parameters under [aws-neuron/optimum-neuron-cache](https://huggingface.co/aws-neuron/optimum-neuron-cache).
+
+If you just want to try the service quickly with a model that has not been exported to Neuron yet, this is still possible, under the following conditions:
+- you must specify the export parameters when launching the service (or use default parameters),
+- the model configuration must be cached.
+
+The snippet below shows how you can deploy a service from a hub standard model:
+
+```
+export HF_TOKEN=<your token>
+docker run -p 8080:80 \
+ -v $(pwd)/data:/data \
+ --device=/dev/neuron0 \
+ --device=/dev/neuron1 \
+ --device=/dev/neuron2 \
+ --device=/dev/neuron3 \
+ -e HF_TOKEN=${HF_TOKEN} \
+ -e HF_AUTO_CAST_TYPE="fp16" \
+ -e HF_NUM_CORES=8 \
+ ghcr.io/huggingface/text-generation-inference:<VERSION>-neuron \
+ --model-id meta-llama/Meta-Llama-3-8B \
+ --max-batch-size 1 \
+ --max-input-length 3164 \
+ --max-total-tokens 4096
+```
+
+### Using a model exported to a local path
+
+Alternatively, you can first [export the model to neuron format](https://huggingface.co/docs/optimum-neuron/main/en/guides/export_model#exporting-neuron-models-using-text-generation-inference) locally.
+
+You can then deploy the service inside the shared volume:
+
+```
+docker run -p 8080:80 \
+ -v $(pwd)/data:/data \
+ --device=/dev/neuron0 \
+ --device=/dev/neuron1 \
+ ghcr.io/huggingface/text-generation-inference:<VERSION>-neuron \
+ --model-id /data/<neuron model path>
+```
+
+Note: You don't need to specify any service parameters, as they will all be deduced from the model export configuration. You must, however, expose enough devices to match the number of cores specified during the export phase.
+
+
+### Using a neuron model from the 🤗 [HuggingFace Hub](https://huggingface.co/)
+
+The easiest way to share a neuron model inside your organization is to push it to the Hugging Face hub, so that it can be deployed directly without requiring an export.
+
+The snippet below shows how you can deploy a service from a hub neuron model:
+
+```
+docker run -p 8080:80 \
+ -v $(pwd)/data:/data \
+ --device=/dev/neuron0 \
+ --device=/dev/neuron1 \
+ -e HF_TOKEN=${HF_TOKEN} \
+ ghcr.io/huggingface/text-generation-inference:<VERSION>-neuron \
+ --model-id <organization>/<neuron model>
+```
+
+### Choosing service parameters
+
+Use the following command to list the available service parameters:
+
+```
+docker run ghcr.io/huggingface/text-generation-inference:<VERSION>-neuron --help
+```
+
+The configuration of an inference endpoint is always a compromise between throughput and latency: serving more requests in parallel will allow a higher throughput, but it will increase the latency.
+
+The neuron models have static input dimensions `[batch_size, max_length]`.
+
+This adds several restrictions to the following parameters:
+
+- `--max-batch-size` must be set to `batch_size`,
+- `--max-input-length` must be lower than `max_length`,
+- `--max-total-tokens` must be set to `max_length` (it is per-request).
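+
+For illustration, these values can be read from the `neuron` section of the compiled model's `config.json`, which records the static `batch_size` and `sequence_length` chosen at export time. The sketch below is not an official helper: it simply mirrors the defaults applied by the `tgi_env.py` entrypoint script (notably `MAX_INPUT_TOKENS` defaulting to half the sequence length), and the model id is a placeholder.
+
+```python
+from transformers import AutoConfig
+
+# Placeholder model id: replace with your own compiled neuron model.
+config = AutoConfig.from_pretrained("aws-neuron/my-neuron-model")
+neuron_config = getattr(config, "neuron", None)
+if neuron_config is None:
+    raise ValueError("This model has not been exported to Neuron")
+
+batch_size = neuron_config["batch_size"]
+sequence_length = neuron_config["sequence_length"]
+# Default used by the entrypoint when MAX_INPUT_TOKENS is not set explicitly.
+max_input_tokens = sequence_length // 2
+
+print(f"--max-batch-size {batch_size}")
+print(f"--max-total-tokens {sequence_length}")
+print(f"--max-input-length {max_input_tokens}")
+print(f"--max-batch-prefill-tokens {batch_size * max_input_tokens}")
+```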
+ +Although not strictly necessary, but important for efficient prefilling: + +- `--max-batch-prefill-tokens` should be set to `batch_size` * `max-input-length`. + +### Choosing the correct batch size + +As seen in the previous paragraph, neuron model static batch size has a direct influence on the endpoint latency and throughput. + +Please refer to [text-generation-inference](https://github.com/huggingface/text-generation-inference) for optimization hints. + +Note that the main constraint is to be able to fit the model for the specified `batch_size` within the total device memory available +on your instance (16GB per neuron core, with 2 cores per device). + +## Query the service + +You can query the model using either the `/generate` or `/generate_stream` routes: + +``` +curl 127.0.0.1:8080/generate \ + -X POST \ + -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":20}}' \ + -H 'Content-Type: application/json' +``` + +``` +curl 127.0.0.1:8080/generate_stream \ + -X POST \ + -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":20}}' \ + -H 'Content-Type: application/json' +``` + +Note: replace 127.0.0.1:8080 with your actual IP address and port. diff --git a/integration-tests/conftest.py b/integration-tests/conftest.py index 8440df34b..5e5dfc2a0 100644 --- a/integration-tests/conftest.py +++ b/integration-tests/conftest.py @@ -1,3 +1,4 @@ +pytest_plugins = ["fixtures.neuron.service", "fixtures.neuron.model"] # ruff: noqa: E402 from _pytest.fixtures import SubRequest import requests @@ -57,20 +58,37 @@ def pytest_addoption(parser): parser.addoption( "--release", action="store_true", default=False, help="run release tests" ) + parser.addoption( + "--neuron", action="store_true", default=False, help="run neuron tests" + ) def pytest_configure(config): config.addinivalue_line("markers", "release: mark test as a release-only test") + config.addinivalue_line("markers", "neuron: mark test as a neuron test") def pytest_collection_modifyitems(config, items): - if config.getoption("--release"): - # --release given in cli: do not skip release tests - return - skip_release = pytest.mark.skip(reason="need --release option to run") + selectors = [] + if not config.getoption("--release"): + # --release not given in cli: skip release tests + def skip_release(item): + if "release" in item.keywords: + item.add_marker(pytest.mark.skip(reason="need --release option to run")) + selectors.append(skip_release) + if config.getoption("--neuron"): + def skip_not_neuron(item): + if "neuron" not in item.keywords: + item.add_marker(pytest.mark.skip(reason="incompatible with --neuron option")) + selectors.append(skip_not_neuron) + else: + def skip_neuron(item): + if "neuron" in item.keywords: + item.add_marker(pytest.mark.skip(reason="requires --neuron to run")) + selectors.append(skip_neuron) for item in items: - if "release" in item.keywords: - item.add_marker(skip_release) + for selector in selectors: + selector(item) @pytest.fixture(autouse=True) diff --git a/integration-tests/fixtures/neuron/model.py b/integration-tests/fixtures/neuron/model.py new file mode 100644 index 000000000..10f1c84eb --- /dev/null +++ b/integration-tests/fixtures/neuron/model.py @@ -0,0 +1,223 @@ +import copy +import logging +import sys +from tempfile import TemporaryDirectory + +import huggingface_hub +import pytest +import docker +import hashlib +import os +import tempfile + +from docker.errors import NotFound + + +TEST_ORGANIZATION = "optimum-internal-testing" +TEST_CACHE_REPO_ID = 
f"{TEST_ORGANIZATION}/neuron-testing-cache" +HF_TOKEN = huggingface_hub.get_token() + + +logging.basicConfig( + level=logging.INFO, + format="[%(asctime)s] %(levelname)s [%(filename)s.%(funcName)s:%(lineno)d] %(message)s", + stream=sys.stdout, +) +logger = logging.getLogger(__file__) + + +# All model configurations below will be added to the neuron_model_config fixture +MODEL_CONFIGURATIONS = { + "gpt2": { + "model_id": "gpt2", + "export_kwargs": {"batch_size": 4, "sequence_length": 1024, "num_cores": 2, "auto_cast_type": "fp16"}, + }, + "llama": { + "model_id": "unsloth/Llama-3.2-1B-Instruct", + "export_kwargs": {"batch_size": 4, "sequence_length": 2048, "num_cores": 2, "auto_cast_type": "fp16"}, + }, + "mistral": { + "model_id": "optimum/mistral-1.1b-testing", + "export_kwargs": {"batch_size": 4, "sequence_length": 4096, "num_cores": 2, "auto_cast_type": "bf16"}, + }, + "qwen2": { + "model_id": "Qwen/Qwen2.5-0.5B", + "export_kwargs": {"batch_size": 4, "sequence_length": 4096, "num_cores": 2, "auto_cast_type": "fp16"}, + }, + "granite": { + "model_id": "ibm-granite/granite-3.1-2b-instruct", + "export_kwargs": {"batch_size": 4, "sequence_length": 4096, "num_cores": 2, "auto_cast_type": "bf16"}, + }, +} + + +def get_neuron_backend_hash(): + import subprocess + res = subprocess.run(["git", "rev-parse", "--show-toplevel"], + capture_output=True, + text=True) + root_dir = res.stdout.split('\n')[0] + def get_sha(path): + res = subprocess.run(["git", "ls-tree", "HEAD", f"{root_dir}/{path}"], + capture_output=True, + text=True) + # Output of the command is in the form '040000 tree|blob \t\n' + sha = res.stdout.split('\t')[0].split(' ')[-1] + return sha.encode() + # We hash both the neuron backends directory and Dockerfile and create a smaller hash out of that + m = hashlib.sha256() + m.update(get_sha('backends/neuron')) + m.update(get_sha('Dockerfile.neuron')) + return m.hexdigest()[:10] + + +def get_neuron_model_name(config_name: str): + return f"neuron-tgi-testing-{config_name}-{get_neuron_backend_hash()}" + + +def get_tgi_docker_image(): + docker_image = os.getenv("DOCKER_IMAGE", None) + if docker_image is None: + client = docker.from_env() + images = client.images.list(filters={"reference": "text-generation-inference"}) + if not images: + raise ValueError("No text-generation-inference image found on this host to run tests.") + docker_image = images[0].tags[0] + return docker_image + + +def export_model(config_name, model_config, neuron_model_name): + """Export a neuron model. + + The model is exported by a custom image built on the fly from the base TGI image. + This makes sure the exported model and image are aligned and avoids introducing + neuron specific imports in the test suite. 
+ + Args: + config_name (`str`): + Used to identify test configurations + model_config (`str`): + The model configuration for export (includes the original model id) + neuron_model_name (`str`): + The name of the exported model on the hub + """ + + client = docker.from_env() + + env = {"LOG_LEVEL": "info", "CUSTOM_CACHE_REPO": TEST_CACHE_REPO_ID} + if HF_TOKEN is not None: + env["HUGGING_FACE_HUB_TOKEN"] = HF_TOKEN + env["HF_TOKEN"] = HF_TOKEN + + # Create a sub-image to export the model to workaround docker dind issues preventing + # to share a volume from the container running tests + model_id = model_config["model_id"] + export_kwargs = model_config["export_kwargs"] + base_image = get_tgi_docker_image() + export_image = f"neuron-tgi-tests-{config_name}-export-img" + logger.info(f"Building temporary image {export_image} from {base_image}") + with tempfile.TemporaryDirectory() as context_dir: + # Create entrypoint + model_path = "/data/neuron_model" + export_command = f"optimum-cli export neuron -m {model_id} --task text-generation" + for kwarg, value in export_kwargs.items(): + export_command += f" --{kwarg} {str(value)}" + export_command += f" {model_path}" + entrypoint_content = f"""#!/bin/sh + {export_command} + huggingface-cli repo create --organization {TEST_ORGANIZATION} {neuron_model_name} + huggingface-cli upload {TEST_ORGANIZATION}/{neuron_model_name} {model_path} --exclude *.bin *.safetensors + optimum-cli neuron cache synchronize --repo_id {TEST_CACHE_REPO_ID} + """ + with open(os.path.join(context_dir, "entrypoint.sh"), "wb") as f: + f.write(entrypoint_content.encode("utf-8")) + f.flush() + # Create Dockerfile + docker_content = f""" + FROM {base_image} + COPY entrypoint.sh /export-entrypoint.sh + RUN chmod +x /export-entrypoint.sh + ENTRYPOINT ["/export-entrypoint.sh"] + """ + with open(os.path.join(context_dir, "Dockerfile"), "wb") as f: + f.write(docker_content.encode("utf-8")) + f.flush() + image, logs = client.images.build(path=context_dir, dockerfile=f.name, tag=export_image) + logger.info("Successfully built image %s", image.id) + logger.debug("Build logs %s", logs) + + try: + container = client.containers.run( + export_image, + environment=env, + auto_remove=True, + detach=False, + devices=["/dev/neuron0"], + shm_size="1G", + ) + logger.info(f"Successfully exported model for config {config_name}") + container.logs() + except Exception as e: + logger.exception(f"An exception occurred while running container: {e}.") + pass + finally: + # Cleanup the export image + logger.info("Cleaning image %s", image.id) + try: + image.remove(force=True) + except NotFound: + pass + except Exception as e: + logger.error("Error while removing image %s, skipping", image.id) + logger.exception(e) + + +@pytest.fixture(scope="session", params=MODEL_CONFIGURATIONS.keys()) +def neuron_model_config(request): + """Expose a pre-trained neuron model + + The fixture first makes sure the following model artifacts are present on the hub: + - exported neuron model under optimum-internal-testing/neuron-testing--, + - cached artifacts under optimum-internal-testing/neuron-testing-cache. + If not, it will export the model and push it to the hub. + + It then fetches the model locally and return a dictionary containing: + - a configuration name, + - the original model id, + - the export parameters, + - the neuron model id, + - the neuron model local path. + + For each exposed model, the local directory is maintained for the duration of the + test session and cleaned up afterwards. 
+ The hub model artifacts are never cleaned up and persist accross sessions. + They must be cleaned up manually when the optimum-neuron version changes. + + """ + config_name = request.param + model_config = copy.deepcopy(MODEL_CONFIGURATIONS[request.param]) + neuron_model_name = get_neuron_model_name(config_name) + neuron_model_id = f"{TEST_ORGANIZATION}/{neuron_model_name}" + with TemporaryDirectory() as neuron_model_path: + hub = huggingface_hub.HfApi() + if not hub.repo_exists(neuron_model_id): + # Export the model first + export_model(config_name, model_config, neuron_model_name) + logger.info(f"Fetching {neuron_model_id} from the HuggingFace hub") + hub.snapshot_download(neuron_model_id, local_dir=neuron_model_path) + # Add dynamic parameters to the model configuration + model_config["neuron_model_path"] = neuron_model_path + model_config["neuron_model_id"] = neuron_model_id + # Also add model configuration name to allow tests to adapt their expectations + model_config["name"] = config_name + # Yield instead of returning to keep a reference to the temporary directory. + # It will go out of scope and be released only once all tests needing the fixture + # have been completed. + logger.info(f"{config_name} ready for testing ...") + yield model_config + logger.info(f"Done with {config_name}") + + +@pytest.fixture(scope="module") +def neuron_model_path(neuron_model_config): + yield neuron_model_config["neuron_model_path"] diff --git a/integration-tests/fixtures/neuron/service.py b/integration-tests/fixtures/neuron/service.py new file mode 100644 index 000000000..927a35af4 --- /dev/null +++ b/integration-tests/fixtures/neuron/service.py @@ -0,0 +1,252 @@ +import asyncio +import contextlib +import logging +import os +import random +import shutil +import sys +import tempfile +import time +from typing import List + +import docker +import huggingface_hub +import pytest +from aiohttp import ClientConnectorError, ClientOSError, ServerDisconnectedError +from docker.errors import NotFound +from huggingface_hub import AsyncInferenceClient, TextGenerationOutput + + +OPTIMUM_CACHE_REPO_ID = "optimum-internal-testing/neuron-testing-cache" +HF_TOKEN = huggingface_hub.get_token() + + +def get_tgi_docker_image(): + docker_image = os.getenv("DOCKER_IMAGE", None) + if docker_image is None: + client = docker.from_env() + images = client.images.list(filters={"reference": "text-generation-inference"}) + if not images: + raise ValueError("No text-generation-inference image found on this host to run tests.") + docker_image = images[0].tags[0] + return docker_image + + +logging.basicConfig( + level=logging.INFO, + format="[%(asctime)s] %(levelname)s [%(filename)s.%(funcName)s:%(lineno)d] %(message)s", + stream=sys.stdout, +) +logger = logging.getLogger(__file__) + + +class TestClient(AsyncInferenceClient): + def __init__(self, service_name: str, base_url: str): + super().__init__(model=base_url) + self.service_name = service_name + + +class LauncherHandle: + def __init__(self, service_name: str, port: int): + self.client = TestClient(service_name, f"http://localhost:{port}") + + def _inner_health(self): + raise NotImplementedError + + async def health(self, timeout: int = 60): + assert timeout > 0 + for i in range(timeout): + if not self._inner_health(): + raise RuntimeError(f"Service crashed after {i} seconds.") + + try: + await self.client.text_generation("test", max_new_tokens=1) + logger.info(f"Service started after {i} seconds") + return + except (ClientConnectorError, ClientOSError, 
ServerDisconnectedError): + time.sleep(1) + except Exception: + raise RuntimeError("Basic generation failed with: {e}") + raise RuntimeError(f"Service failed to start after {i} seconds.") + + +class ContainerLauncherHandle(LauncherHandle): + def __init__(self, service_name, docker_client, container_name, port: int): + super(ContainerLauncherHandle, self).__init__(service_name, port) + self.docker_client = docker_client + self.container_name = container_name + self._log_since = time.time() + + def _inner_health(self) -> bool: + container = self.docker_client.containers.get(self.container_name) + container_output = container.logs(since=self._log_since).decode("utf-8") + self._log_since = time.time() + if container_output != "": + print(container_output, end="") + return container.status in ["running", "created"] + + +@pytest.fixture(scope="module") +def event_loop(): + loop = asyncio.get_event_loop() + yield loop + loop.close() + + +@pytest.fixture(scope="module") +def neuron_launcher(event_loop): + """Utility fixture to expose a TGI service. + + The fixture uses a single event loop for each module, but it can create multiple + docker services with different parameters using the parametrized inner context. + + Args: + service_name (`str`): + Used to identify test configurations and adjust test expectations, + model_name_or_path (`str`): + The model to use (can be a hub model or a path) + trust_remote_code (`bool`): + Must be set to True for gated models. + + Returns: + A `ContainerLauncherHandle` containing both a TGI server and client. + """ + + @contextlib.contextmanager + def docker_launcher( + service_name: str, + model_name_or_path: str, + trust_remote_code: bool = False, + ): + port = random.randint(8000, 10_000) + + client = docker.from_env() + + container_name = f"tgi-tests-{service_name}-{port}" + + try: + container = client.containers.get(container_name) + container.stop() + container.wait() + except NotFound: + pass + + env = {"LOG_LEVEL": "info,text_generation_router=debug", "CUSTOM_CACHE_REPO": OPTIMUM_CACHE_REPO_ID} + + if HF_TOKEN is not None: + env["HUGGING_FACE_HUB_TOKEN"] = HF_TOKEN + env["HF_TOKEN"] = HF_TOKEN + + for var in ["MAX_BATCH_SIZE", "MAX_TOTAL_TOKENS", "HF_AUTO_CAST_TYPE", "HF_NUM_CORES"]: + if var in os.environ: + env[var] = os.environ[var] + + base_image = get_tgi_docker_image() + if os.path.isdir(model_name_or_path): + # Create a sub-image containing the model to workaround docker dind issues preventing + # to share a volume from the container running tests + + test_image = f"{container_name}-img" + logger.info( + "Building image on the flight derivated from %s, tagged with %s", + base_image, + test_image, + ) + with tempfile.TemporaryDirectory() as context_dir: + # Copy model directory to build context + model_path = os.path.join(context_dir, "model") + shutil.copytree(model_name_or_path, model_path) + # Create Dockerfile + container_model_id = f"/data/{model_name_or_path}" + docker_content = f""" + FROM {base_image} + COPY model {container_model_id} + """ + with open(os.path.join(context_dir, "Dockerfile"), "wb") as f: + f.write(docker_content.encode("utf-8")) + f.flush() + image, logs = client.images.build(path=context_dir, dockerfile=f.name, tag=test_image) + logger.info("Successfully built image %s", image.id) + logger.debug("Build logs %s", logs) + else: + test_image = base_image + image = None + container_model_id = model_name_or_path + + args = ["--model-id", container_model_id, "--env"] + + if trust_remote_code: + args.append("--trust-remote-code") + 
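+        # The service container is started detached below; the launcher handle's
+        # health() check later polls it with a one-token generation request to
+        # detect readiness, streaming container logs in the meantime.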
+ container = client.containers.run( + test_image, + command=args, + name=container_name, + environment=env, + auto_remove=False, + detach=True, + devices=["/dev/neuron0"], + ports={"80/tcp": port}, + shm_size="1G", + ) + + logger.info(f"Starting {container_name} container") + yield ContainerLauncherHandle(service_name, client, container.name, port) + + try: + container.stop(timeout=60) + container.wait(timeout=60) + except Exception as e: + logger.exception(f"Ignoring exception while stopping container: {e}.") + pass + finally: + logger.info("Removing container %s", container_name) + try: + container.remove(force=True) + except Exception as e: + logger.error("Error while removing container %s, skipping", container_name) + logger.exception(e) + + # Cleanup the build image + if image: + logger.info("Cleaning image %s", image.id) + try: + image.remove(force=True) + except NotFound: + pass + except Exception as e: + logger.error("Error while removing image %s, skipping", image.id) + logger.exception(e) + + return docker_launcher + + +@pytest.fixture(scope="module") +def neuron_generate_load(): + """A utility fixture to launch multiple asynchronous TGI requests in parallel + + Args: + client (`AsyncClient`): + An async client + prompt (`str`): + The prompt to use (identical for all requests) + max_new_tokens (`int`): + The number of tokens to generate for each request. + n (`int`): + The number of requests + + Returns: + A list of `huggingface_hub.TextGenerationOutput`. + """ + + async def generate_load_inner( + client: AsyncInferenceClient, prompt: str, max_new_tokens: int, n: int + ) -> List[TextGenerationOutput]: + futures = [ + client.text_generation(prompt, max_new_tokens=max_new_tokens, details=True, decoder_input_details=True) + for _ in range(n) + ] + + return await asyncio.gather(*futures) + + return generate_load_inner diff --git a/integration-tests/neuron/integration/test_generate.py b/integration-tests/neuron/integration/test_generate.py new file mode 100644 index 000000000..96f09838b --- /dev/null +++ b/integration-tests/neuron/integration/test_generate.py @@ -0,0 +1,93 @@ +import pytest + + +@pytest.fixture +async def tgi_service(neuron_launcher, neuron_model_config): + model_name_or_path = neuron_model_config["neuron_model_path"] + service_name = neuron_model_config["name"] + with neuron_launcher(service_name, model_name_or_path) as tgi_service: + await tgi_service.health(600) + yield tgi_service + + +@pytest.mark.asyncio +async def test_model_single_request(tgi_service): + service_name = tgi_service.client.service_name + prompt = "What is Deep Learning?" 
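+    # The checks below exercise, in order: greedy decoding, greedy decoding with the
+    # prompt echoed back, sampling, and sampling with a stop sequence.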
+ # Greedy bounded without input + response = await tgi_service.client.text_generation( + prompt, max_new_tokens=17, details=True, decoder_input_details=True + ) + assert response.details.generated_tokens == 17 + greedy_expectations = { + "gpt2": "\n\nDeep learning is a new field of research that has been around for a while", + "llama": " and How Does it Work?\nDeep learning is a subset of machine learning that uses artificial", + "mistral": "\nWhat is Deep Learning?\nDeep Learning is a type of machine learning that", + "qwen2": " - Part 1\n\nDeep Learning is a subset of Machine Learning that is based on", + "granite": "\n\nDeep Learning is a subset of Machine Learning, which is a branch of Art", + } + assert response.generated_text == greedy_expectations[service_name] + + # Greedy bounded with input + response = await tgi_service.client.text_generation( + "What is Deep Learning?", max_new_tokens=17, return_full_text=True, details=True, decoder_input_details=True + ) + assert response.details.generated_tokens == 17 + assert response.generated_text == prompt + greedy_expectations[service_name] + + # Sampling + response = await tgi_service.client.text_generation( + "What is Deep Learning?", + do_sample=True, + top_k=50, + top_p=0.9, + repetition_penalty=1.2, + max_new_tokens=128, + seed=42, + ) + sample_expectations = { + "gpt2": "Deep Learning", + "llama": "Deep Learning", + "mistral": "Deep learning", + "qwen2": "Deep Learning", + "granite": "Deep learning", + } + assert sample_expectations[service_name] in response + + # Sampling with stop sequence + stop_sequence = sample_expectations[service_name][-5:] + response = await tgi_service.client.text_generation( + "What is Deep Learning?", + do_sample=True, + top_k=50, + top_p=0.9, + repetition_penalty=1.2, + max_new_tokens=128, + seed=42, + stop_sequences=[stop_sequence], + ) + assert response.endswith(stop_sequence) + + +@pytest.mark.asyncio +async def test_model_multiple_requests(tgi_service, neuron_generate_load): + num_requests = 4 + responses = await neuron_generate_load( + tgi_service.client, + "What is Deep Learning?", + max_new_tokens=17, + n=num_requests, + ) + + assert len(responses) == 4 + expectations = { + "gpt2": "Deep learning is a new field of research that has been around for a while", + "llama": "Deep learning is a subset of machine learning that uses artificial", + "mistral": "Deep Learning is a type of machine learning that", + "qwen2": "Deep Learning is a subset of Machine Learning that is based on", + "granite": "Deep Learning is a subset of Machine Learning, which is a branch of Art", + } + expected = expectations[tgi_service.client.service_name] + for r in responses: + assert r.details.generated_tokens == 17 + assert expected in r.generated_text diff --git a/integration-tests/neuron/integration/test_implicit_env.py b/integration-tests/neuron/integration/test_implicit_env.py new file mode 100644 index 000000000..cf0c80620 --- /dev/null +++ b/integration-tests/neuron/integration/test_implicit_env.py @@ -0,0 +1,65 @@ +import os + +import pytest +from huggingface_hub.errors import ValidationError + + +@pytest.fixture(scope="module", params=["hub-neuron", "hub", "local-neuron"]) +async def tgi_service(request, neuron_launcher, neuron_model_config): + """Expose a TGI service corresponding to a model configuration + + For each model configuration, the service will be started using the following + deployment options: + - from the hub original model (export parameters chosen after hub lookup), + - from the hub pre-exported 
neuron model, + - from a local path to the neuron model. + """ + # the tgi_env.py script will take care of setting these + for var in [ + "MAX_BATCH_SIZE", + "MAX_INPUT_TOKENS", + "MAX_TOTAL_TOKENS", + "HF_NUM_CORES", + "HF_AUTO_CAST_TYPE", + ]: + if var in os.environ: + del os.environ[var] + if request.param == "hub": + model_name_or_path = neuron_model_config["model_id"] + elif request.param == "hub-neuron": + model_name_or_path = neuron_model_config["neuron_model_id"] + else: + model_name_or_path = neuron_model_config["neuron_model_path"] + service_name = neuron_model_config["name"] + with neuron_launcher(service_name, model_name_or_path) as tgi_service: + await tgi_service.health(600) + yield tgi_service + + +@pytest.mark.asyncio +async def test_model_single_request(tgi_service): + # Just verify that the generation works, and nothing is raised, with several set of params + + # No params + await tgi_service.client.text_generation( + "What is Deep Learning?", + ) + + response = await tgi_service.client.text_generation( + "How to cook beans ?", + max_new_tokens=17, + details=True, + decoder_input_details=True, + ) + assert response.details.generated_tokens == 17 + + # Sampling + await tgi_service.client.text_generation( + "What is Deep Learning?", + do_sample=True, + top_k=50, + top_p=0.9, + repetition_penalty=1.2, + max_new_tokens=128, + seed=42, + ) From cea9dbc97173c74737fed3381a0d93721782762c Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Mon, 24 Feb 2025 14:58:23 +0100 Subject: [PATCH 106/183] You need to seek apparently. (#3049) --- integration-tests/conftest.py | 1 + 1 file changed, 1 insertion(+) diff --git a/integration-tests/conftest.py b/integration-tests/conftest.py index 5e5dfc2a0..ed27e0589 100644 --- a/integration-tests/conftest.py +++ b/integration-tests/conftest.py @@ -101,6 +101,7 @@ def container_log(request: SubRequest): print(error_log.read(), file=sys.stderr) else: error_log.truncate(0) + error_log.seek(0) class ResponseComparator(JSONSnapshotExtension): From d7a24c03cfb466cdb563bfa129e3d738b7b2d73f Mon Sep 17 00:00:00 2001 From: "Wang, Yi" Date: Tue, 25 Feb 2025 19:07:55 +0800 Subject: [PATCH 107/183] some minor fix (#3048) Signed-off-by: Wang, Yi A --- server/text_generation_server/layers/gptq/__init__.py | 1 + server/text_generation_server/layers/moe/unquantized.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/server/text_generation_server/layers/gptq/__init__.py b/server/text_generation_server/layers/gptq/__init__.py index 7e8380352..253876827 100644 --- a/server/text_generation_server/layers/gptq/__init__.py +++ b/server/text_generation_server/layers/gptq/__init__.py @@ -171,6 +171,7 @@ class GPTQWeightsLoader(WeightsLoader): g_idx=g_idx, bits=self.bits, groupsize=self.groupsize, + use_awq_kernel=self.quantize == "awq", use_exllama=use_exllama, ) diff --git a/server/text_generation_server/layers/moe/unquantized.py b/server/text_generation_server/layers/moe/unquantized.py index 772142861..007f99d05 100644 --- a/server/text_generation_server/layers/moe/unquantized.py +++ b/server/text_generation_server/layers/moe/unquantized.py @@ -85,6 +85,8 @@ class UnquantizedSparseMoELayer(nn.Module): use_grouped_topk=self.n_expert_group is not None, num_expert_group=self.n_expert_group, topk_group=self.topk_group, + scoring_func=self.scoring_func, + e_score_correction_bias=self.e_score_correction_bias, ) return fused_moe( x, From b0069e04858f7a535525f688d1ccaf2a3ae0e59b Mon Sep 17 00:00:00 2001 From: drbh Date: Tue, 25 Feb 2025 16:11:34 -0500 Subject: [PATCH 
108/183] fix: run linters and fix formatting (#3057) --- .../server/text_generation_server/cli.py | 8 +- .../text_generation_server/generator.py | 113 ++++++++++++++---- .../text_generation_server/interceptor.py | 4 +- .../server/text_generation_server/model.py | 20 +++- backends/neuron/tests/fixtures/model.py | 49 ++++++-- backends/neuron/tests/prune_test_models.py | 4 +- backends/neuron/tests/server/helpers.py | 36 +++++- backends/neuron/tests/server/test_decode.py | 8 +- .../tests/server/test_generator_slot.py | 7 +- backends/neuron/tests/server/test_prefill.py | 21 +++- backends/neuron/tgi_env.py | 80 ++++++++++--- integration-tests/conftest.py | 9 +- integration-tests/fixtures/neuron/model.py | 72 ++++++++--- integration-tests/fixtures/neuron/service.py | 31 ++++- .../neuron/integration/test_generate.py | 6 +- .../neuron/integration/test_implicit_env.py | 1 - 16 files changed, 366 insertions(+), 103 deletions(-) diff --git a/backends/neuron/server/text_generation_server/cli.py b/backends/neuron/server/text_generation_server/cli.py index 409143a91..4a9c47345 100644 --- a/backends/neuron/server/text_generation_server/cli.py +++ b/backends/neuron/server/text_generation_server/cli.py @@ -61,7 +61,9 @@ def serve( ) if trust_remote_code is not None: - logger.warning("'trust_remote_code' argument is not supported and will be ignored.") + logger.warning( + "'trust_remote_code' argument is not supported and will be ignored." + ) # Import here after the logger is added to log potential import exceptions from .server import serve @@ -99,7 +101,9 @@ def download_weights( if extension is not None: logger.warning("'extension' argument is not supported and will be ignored.") if trust_remote_code is not None: - logger.warning("'trust_remote_code' argument is not supported and will be ignored.") + logger.warning( + "'trust_remote_code' argument is not supported and will be ignored." + ) if auto_convert is not None: logger.warning("'auto_convert' argument is not supported and will be ignored.") if merge_lora is not None: diff --git a/backends/neuron/server/text_generation_server/generator.py b/backends/neuron/server/text_generation_server/generator.py index 3ddee690c..b3887e14c 100644 --- a/backends/neuron/server/text_generation_server/generator.py +++ b/backends/neuron/server/text_generation_server/generator.py @@ -146,7 +146,9 @@ class Slot: def generated_tokens(self) -> int: return self._generated_tokens - def assign(self, batch_id: int, request: Request, generation_config: GenerationConfig): + def assign( + self, batch_id: int, request: Request, generation_config: GenerationConfig + ): """Assign a request to a slot. 
Args: @@ -174,15 +176,24 @@ class Slot: if request.parameters.typical_p != 0: self._generation_config.typical_p = request.parameters.typical_p if request.parameters.repetition_penalty != 0: - self._generation_config.repetition_penalty = request.parameters.repetition_penalty + self._generation_config.repetition_penalty = ( + request.parameters.repetition_penalty + ) self.seed = request.parameters.seed - self._generation_config.max_new_tokens = request.stopping_parameters.max_new_tokens + self._generation_config.max_new_tokens = ( + request.stopping_parameters.max_new_tokens + ) self._max_new_tokens = self._generation_config.max_new_tokens stop_strings = request.stopping_parameters.stop_sequences if stop_strings: self._generation_config.stop_strings = stop_strings - def reset(self, input_ids: torch.LongTensor, attention_mask: torch.LongTensor, selector: TokenSelector): + def reset( + self, + input_ids: torch.LongTensor, + attention_mask: torch.LongTensor, + selector: TokenSelector, + ): """Reset the slot for the next generation. Args: @@ -210,7 +221,9 @@ class Slot: self._generated_tokens -= 1 # Since generated tokens are now part of the prefill, we need to reevaluate # max_new_tokens for the next generation - self._generation_config.max_new_tokens = self._max_new_tokens - self._generated_tokens + self._generation_config.max_new_tokens = ( + self._max_new_tokens - self._generated_tokens + ) self._state = Slot.State.PAUSE def resume(self): @@ -223,7 +236,9 @@ class Slot: """Hack to hopefully support generate_stream for the maximum number of tokenizers""" # We need to include the tokens that produced the last text to defeat cleanup algorithms in the decode # which decide to add a space or not depending on the surrounding ids. - new_text = self._tokenizer.decode(self._tokens[self._next_text_token_start :], skip_special_tokens=False) + new_text = self._tokenizer.decode( + self._tokens[self._next_text_token_start :], skip_special_tokens=False + ) if new_text.endswith("�"): # utf-8 char at the end means it's a potential unfinished byte sequence # from byte fallback tokenization. @@ -267,7 +282,9 @@ class Slot: self._next_text = next_text return next_text - def select(self, input_ids: torch.LongTensor, logits: torch.Tensor) -> torch.LongTensor: + def select( + self, input_ids: torch.LongTensor, logits: torch.Tensor + ) -> torch.LongTensor: """Select the next token from the candidate logits. Args: @@ -384,7 +401,9 @@ class NeuronGenerator(Generator): f" Please align max_batch_size with the static batch size: {self.model.batch_size}." 
) # Assign each request to an empty slot - logger.debug(f"Prefilling {len(batch.requests)} new request(s) with {len(empty_slots)} empty slot(s)") + logger.debug( + f"Prefilling {len(batch.requests)} new request(s) with {len(empty_slots)} empty slot(s)" + ) new_slots = [] for request in batch.requests: slot = empty_slots.pop() @@ -417,7 +436,11 @@ class NeuronGenerator(Generator): max_length = slot.truncate # Tokenize with padding and truncation padded_inputs = self.tokenizer( - inputs, return_tensors="pt", padding=True, truncation=True, max_length=max_length + inputs, + return_tensors="pt", + padding=True, + truncation=True, + max_length=max_length, ) input_ids = padded_inputs.input_ids attention_mask = padded_inputs.attention_mask @@ -450,9 +473,13 @@ class NeuronGenerator(Generator): slot.reset(slot_input_ids, slot_attention_mask, selector) # Note: when rebuilding cache on prefill, the new tokens on paused slots will be ignored, # as they have already been generated and sent back in the last decode. - model_inputs = self.model.prepare_inputs_for_prefill(input_ids, attention_mask, seq_ids) + model_inputs = self.model.prepare_inputs_for_prefill( + input_ids, attention_mask, seq_ids + ) logits = self.model(**model_inputs)[0] - generation, next_batch = self._generate_token(prefill_slots, self.batch_id, logits, input_ids) + generation, next_batch = self._generate_token( + prefill_slots, self.batch_id, logits, input_ids + ) self.batch_id += 1 # Reactivate previously active slots for the next decode for i, slot in enumerate(active_slots): @@ -462,10 +489,14 @@ class NeuronGenerator(Generator): slot.append(next_tokens[i]) logger.debug("Model ready for decoding") if next_batch is not None: - logger.debug(f"Next batch is {next_batch.id} with requests: {next_batch.request_ids}") + logger.debug( + f"Next batch is {next_batch.id} with requests: {next_batch.request_ids}" + ) return generation, next_batch - def decode(self, batches: List[CachedBatch]) -> Tuple[List[Generation], CachedBatch]: + def decode( + self, batches: List[CachedBatch] + ) -> Tuple[List[Generation], CachedBatch]: """Decode the specified prefilled requests. Args: @@ -491,10 +522,14 @@ class NeuronGenerator(Generator): cleared_request_ids.append(slot.request_id) slot.clear() if len(cleared_request_ids) > 0: - logger.info(f"Clearing slot for requests {cleared_request_ids} as they are not requested.") + logger.info( + f"Clearing slot for requests {cleared_request_ids} as they are not requested." 
+ ) active_slots = [slot for slot in self.slots if slot.state == slot.State.READY] if len(active_slots) < len(request_ids): - raise ValueError("Unable to decode tokens for non-prefilled batches (probably due to a previous failure)") + raise ValueError( + "Unable to decode tokens for non-prefilled batches (probably due to a previous failure)" + ) if self.model.continuous_batching: decode_slots = active_slots seq_ids = torch.tensor([slot.id for slot in decode_slots]) @@ -503,7 +538,9 @@ class NeuronGenerator(Generator): seq_ids = None # Reconstruct input_ids and attention_mask from decode slots n_slots = len(decode_slots) - input_ids = torch.full([n_slots, 1], fill_value=self.tokenizer.eos_token_id, dtype=torch.int64) + input_ids = torch.full( + [n_slots, 1], fill_value=self.tokenizer.eos_token_id, dtype=torch.int64 + ) max_length = 0 for slot in decode_slots: max_length = max(max_length, slot.attention_mask.size(-1)) @@ -513,12 +550,18 @@ class NeuronGenerator(Generator): # input_ids are simply the tokens generated by the last decode or prefill requests (other tokens are cached) input_ids[i, 0] = slot.next_token attention_mask[i, : slot.attention_mask.size(-1)] = slot.attention_mask - model_inputs = self.model.prepare_inputs_for_decode(input_ids, attention_mask, seq_ids) + model_inputs = self.model.prepare_inputs_for_decode( + input_ids, attention_mask, seq_ids + ) logits = self.model(**model_inputs)[0] return self._generate_token(decode_slots, next_batch_id, logits, input_ids) def _generate_token( - self, slots: List[Slot], next_batch_id: int, logits: torch.Tensor, input_ids: torch.LongTensor + self, + slots: List[Slot], + next_batch_id: int, + logits: torch.Tensor, + input_ids: torch.LongTensor, ) -> Tuple[List[Generation], CachedBatch]: generations = [] active_slots = False @@ -542,9 +585,13 @@ class NeuronGenerator(Generator): if finish_reason is not None: # We must include the generated text for each finished sequence in the response generated_text = GeneratedText( - text=slot.generated_text, generated_tokens=slot.generated_tokens, finish_reason=finish_reason + text=slot.generated_text, + generated_tokens=slot.generated_tokens, + finish_reason=finish_reason, + ) + logger.debug( + f"Decode complete for request {request_id} with {slot.generated_tokens} tokens" ) - logger.debug(f"Decode complete for request {request_id} with {slot.generated_tokens} tokens") # mark the slot as available slot.clear() else: @@ -565,7 +612,9 @@ class NeuronGenerator(Generator): batch = None if active_slots: # Whatever initial batch these requests came from, we always return all pending requests in a single batch - request_ids = [slot.request_id for slot in self.slots if slot.state == Slot.State.READY] + request_ids = [ + slot.request_id for slot in self.slots if slot.state == Slot.State.READY + ] batch = self._cached_batch(next_batch_id, request_ids) else: logger.debug("No more pending requests") @@ -574,7 +623,9 @@ class NeuronGenerator(Generator): def _cached_batch(self, batch_id: int, request_ids: List): size = len(request_ids) max_tokens = size * self.model.max_length - return CachedBatch(id=batch_id, request_ids=request_ids, size=size, max_tokens=max_tokens) + return CachedBatch( + id=batch_id, request_ids=request_ids, size=size, max_tokens=max_tokens + ) def filter(self, batch_id: int, keep_request_ids: List[int]) -> CachedBatch: """Remove requests that are not listed from the specified batch @@ -588,7 +639,9 @@ class NeuronGenerator(Generator): Return: A `CachedBatch` containing the pending requests. 
""" - keep_slot_ids = [slot.id for slot in self.slots if slot.request_id in keep_request_ids] + keep_slot_ids = [ + slot.id for slot in self.slots if slot.request_id in keep_request_ids + ] self._clear(keep_slot_ids) return self._cached_batch(batch_id, keep_request_ids) @@ -625,11 +678,19 @@ class NeuronGenerator(Generator): export_kwargs = get_export_kwargs_from_env() logger.info(f"Exporting model to neuron with config: {export_kwargs}.") model = NeuronModelForCausalLM.from_pretrained( - model_id, revision=revision, low_cpu_mem_usage=True, export=True, **export_kwargs + model_id, + revision=revision, + low_cpu_mem_usage=True, + export=True, + **export_kwargs, ) else: - logger.info("Loading model on neuron devices (this can take a few minutes).") - model = NeuronModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, revision=revision) + logger.info( + "Loading model on neuron devices (this can take a few minutes)." + ) + model = NeuronModelForCausalLM.from_pretrained( + model_id, low_cpu_mem_usage=True, revision=revision + ) end = time.time() logger.info(f"Model successfully loaded in {end - start:.2f} s.") tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision) diff --git a/backends/neuron/server/text_generation_server/interceptor.py b/backends/neuron/server/text_generation_server/interceptor.py index ed29cdf2c..301cafd87 100644 --- a/backends/neuron/server/text_generation_server/interceptor.py +++ b/backends/neuron/server/text_generation_server/interceptor.py @@ -23,5 +23,7 @@ class ExceptionInterceptor(AsyncServerInterceptor): logger.exception(f"Method {method_name} encountered an error.") await context.abort_with_status( - rpc_status.to_status(status_pb2.Status(code=code_pb2.INTERNAL, message=str(err))) + rpc_status.to_status( + status_pb2.Status(code=code_pb2.INTERNAL, message=str(err)) + ) ) diff --git a/backends/neuron/server/text_generation_server/model.py b/backends/neuron/server/text_generation_server/model.py index e8cb34ee1..2151a9218 100644 --- a/backends/neuron/server/text_generation_server/model.py +++ b/backends/neuron/server/text_generation_server/model.py @@ -56,7 +56,9 @@ def log_cache_size(): if os.path.exists(path): usage = shutil.disk_usage(path) gb = 2**30 - logger.info(f"Cache disk [{path}]: total = {usage.total / gb:.2f} G, free = {usage.free / gb:.2f} G") + logger.info( + f"Cache disk [{path}]: total = {usage.total / gb:.2f} G, free = {usage.free / gb:.2f} G" + ) else: raise ValueError(f"The cache directory ({path}) does not exist.") @@ -79,7 +81,9 @@ def fetch_model( if not os.path.isdir("/sys/class/neuron_device/"): raise SystemError("No neuron cores detected on the host.") if os.path.isdir(model_id) and revision is not None: - logger.warning("Revision {} ignored for local model at {}".format(revision, model_id)) + logger.warning( + "Revision {} ignored for local model at {}".format(revision, model_id) + ) revision = None # Download the model from the Hub (HUGGING_FACE_HUB_TOKEN must be set for a private or gated model) # Note that the model may already be present in the cache. 
@@ -89,12 +93,16 @@ def fetch_model( if os.path.isdir(model_id): return model_id # Prefetch the neuron model from the Hub - logger.info(f"Fetching revision [{revision}] for neuron model {model_id} under {HF_HUB_CACHE}") + logger.info( + f"Fetching revision [{revision}] for neuron model {model_id} under {HF_HUB_CACHE}" + ) log_cache_size() return snapshot_download(model_id, revision=revision, ignore_patterns="*.bin") # Model needs to be exported: look for compatible cached entries on the hub export_kwargs = get_export_kwargs_from_env() - export_config = NeuronModelForCausalLM.get_export_config(model_id, config, revision=revision, **export_kwargs) + export_config = NeuronModelForCausalLM.get_export_config( + model_id, config, revision=revision, **export_kwargs + ) neuron_config = export_config.neuron if not is_cached(model_id, neuron_config): hub_cache_url = "https://huggingface.co/aws-neuron/optimum-neuron-cache" @@ -105,7 +113,9 @@ def fetch_model( f"Alternatively, you can export your own neuron model as explained in {neuron_export_url}" ) raise ValueError(error_msg) - logger.warning(f"{model_id} is not a neuron model: it will be exported using cached artifacts.") + logger.warning( + f"{model_id} is not a neuron model: it will be exported using cached artifacts." + ) if os.path.isdir(model_id): return model_id # Prefetch weights, tokenizer and generation config so that they are in cache diff --git a/backends/neuron/tests/fixtures/model.py b/backends/neuron/tests/fixtures/model.py index 6fa63ce86..4b6a1375d 100644 --- a/backends/neuron/tests/fixtures/model.py +++ b/backends/neuron/tests/fixtures/model.py @@ -27,33 +27,68 @@ OPTIMUM_CACHE_REPO_ID = "optimum-internal-testing/neuron-testing-cache" MODEL_CONFIGURATIONS = { "gpt2": { "model_id": "gpt2", - "export_kwargs": {"batch_size": 4, "sequence_length": 1024, "num_cores": 2, "auto_cast_type": "fp16"}, + "export_kwargs": { + "batch_size": 4, + "sequence_length": 1024, + "num_cores": 2, + "auto_cast_type": "fp16", + }, }, "llama": { "model_id": "NousResearch/Hermes-2-Theta-Llama-3-8B", - "export_kwargs": {"batch_size": 4, "sequence_length": 2048, "num_cores": 2, "auto_cast_type": "fp16"}, + "export_kwargs": { + "batch_size": 4, + "sequence_length": 2048, + "num_cores": 2, + "auto_cast_type": "fp16", + }, }, "mistral": { "model_id": "optimum/mistral-1.1b-testing", - "export_kwargs": {"batch_size": 4, "sequence_length": 4096, "num_cores": 2, "auto_cast_type": "bf16"}, + "export_kwargs": { + "batch_size": 4, + "sequence_length": 4096, + "num_cores": 2, + "auto_cast_type": "bf16", + }, }, "qwen2": { "model_id": "Qwen/Qwen2.5-0.5B", - "export_kwargs": {"batch_size": 4, "sequence_length": 4096, "num_cores": 2, "auto_cast_type": "fp16"}, + "export_kwargs": { + "batch_size": 4, + "sequence_length": 4096, + "num_cores": 2, + "auto_cast_type": "fp16", + }, }, "granite": { "model_id": "ibm-granite/granite-3.1-2b-instruct", - "export_kwargs": {"batch_size": 4, "sequence_length": 4096, "num_cores": 2, "auto_cast_type": "bf16"}, + "export_kwargs": { + "batch_size": 4, + "sequence_length": 4096, + "num_cores": 2, + "auto_cast_type": "bf16", + }, }, } def get_hub_neuron_model_id(config_name: str): - return f"optimum-internal-testing/neuron-testing-{version}-{sdk_version}-{config_name}" + return ( + f"optimum-internal-testing/neuron-testing-{version}-{sdk_version}-{config_name}" + ) def export_model(model_id, export_kwargs, neuron_model_path): - export_command = ["optimum-cli", "export", "neuron", "-m", model_id, "--task", "text-generation"] + export_command 
= [ + "optimum-cli", + "export", + "neuron", + "-m", + model_id, + "--task", + "text-generation", + ] for kwarg, value in export_kwargs.items(): export_command.append(f"--{kwarg}") export_command.append(str(value)) diff --git a/backends/neuron/tests/prune_test_models.py b/backends/neuron/tests/prune_test_models.py index 592b1070e..448962fb6 100644 --- a/backends/neuron/tests/prune_test_models.py +++ b/backends/neuron/tests/prune_test_models.py @@ -1,5 +1,3 @@ -import os - from argparse import ArgumentParser from huggingface_hub import HfApi @@ -15,7 +13,7 @@ def main(): delete = True else: answer = input(f"Do you want to delete {model.id} [y/N] ?") - delete = (answer == "y") + delete = answer == "y" if delete: api.delete_repo(model.id) print(f"Deleted {model.id}.") diff --git a/backends/neuron/tests/server/helpers.py b/backends/neuron/tests/server/helpers.py index 81547cb6a..f0f81d06d 100644 --- a/backends/neuron/tests/server/helpers.py +++ b/backends/neuron/tests/server/helpers.py @@ -29,22 +29,42 @@ def create_request( ) stopping_parameters = StoppingCriteriaParameters(max_new_tokens=max_new_tokens) return Request( - id=id, inputs=inputs, truncate=truncate, parameters=parameters, stopping_parameters=stopping_parameters + id=id, + inputs=inputs, + truncate=truncate, + parameters=parameters, + stopping_parameters=stopping_parameters, ) -def check_prefill(input_text, expected_token_id, expected_token_text, do_sample, batch_size, model_path): +def check_prefill( + input_text, + expected_token_id, + expected_token_text, + do_sample, + batch_size, + model_path, +): """Verify that a prefill for a single request generates the expected output.""" generator = NeuronGenerator.from_pretrained(model_path) assert generator.model.batch_size >= batch_size requests = [] max_new_tokens = 20 for i in range(batch_size): - requests.append(create_request(id=0, inputs=input_text, do_sample=do_sample, max_new_tokens=max_new_tokens)) + requests.append( + create_request( + id=0, + inputs=input_text, + do_sample=do_sample, + max_new_tokens=max_new_tokens, + ) + ) # Let's be pessimistic when estimating max_tokens batch_size * (len(input_text) + max_new_tokens) max_length = generator.model.max_length - batch = Batch(id=0, requests=requests, size=batch_size, max_tokens=batch_size * max_length) + batch = Batch( + id=0, requests=requests, size=batch_size, max_tokens=batch_size * max_length + ) generations, next_batch = generator.prefill(batch) assert next_batch.size == batch_size # Whatever was passed as max_tokens, the server will correct it @@ -57,10 +77,14 @@ def check_prefill(input_text, expected_token_id, expected_token_text, do_sample, assert tokens.texts == [expected_token_text] -def check_decode_single(input_text, max_new_tokens, generated_text, do_sample, model_path): +def check_decode_single( + input_text, max_new_tokens, generated_text, do_sample, model_path +): """Verify that a decoding for a single request generates the expected output.""" generator = NeuronGenerator.from_pretrained(model_path) - request = create_request(id=0, inputs=input_text, max_new_tokens=max_new_tokens, do_sample=do_sample) + request = create_request( + id=0, inputs=input_text, max_new_tokens=max_new_tokens, do_sample=do_sample + ) max_length = generator.model.max_length batch = Batch(id=0, requests=[request], size=1, max_tokens=max_length) generations, next_batch = generator.prefill(batch) diff --git a/backends/neuron/tests/server/test_decode.py b/backends/neuron/tests/server/test_decode.py index 2ab4c2da0..9db5e3abb 100644 --- 
a/backends/neuron/tests/server/test_decode.py +++ b/backends/neuron/tests/server/test_decode.py @@ -16,9 +16,13 @@ def test_decode(neuron_model_config): def _test_decode(config_name, generator, do_sample): - input_text = "It was a bright cold day in April, and the clocks were striking thirteen." + input_text = ( + "It was a bright cold day in April, and the clocks were striking thirteen." + ) max_new_tokens = 20 - request = create_request(id=0, inputs=input_text, max_new_tokens=max_new_tokens, do_sample=do_sample) + request = create_request( + id=0, inputs=input_text, max_new_tokens=max_new_tokens, do_sample=do_sample + ) max_length = generator.model.max_length batch = Batch(id=0, requests=[request], size=1, max_tokens=max_length) generations, next_batch = generator.prefill(batch) diff --git a/backends/neuron/tests/server/test_generator_slot.py b/backends/neuron/tests/server/test_generator_slot.py index 459ee3e5b..0c03e9d1e 100644 --- a/backends/neuron/tests/server/test_generator_slot.py +++ b/backends/neuron/tests/server/test_generator_slot.py @@ -36,7 +36,12 @@ def test_decode_streaming(tokenizer, input_text, generated_text): slot.assign(0, request, GenerationConfig()) assert slot.cached_text == input_text - inputs = tokenizer(input_text, padding="max_length", max_length=len(input_text) + 1, return_tensors="pt") + inputs = tokenizer( + input_text, + padding="max_length", + max_length=len(input_text) + 1, + return_tensors="pt", + ) input_ids = inputs["input_ids"][0] attention_mask = inputs["attention_mask"][0] generated_tokens = tokenizer(generated_text, add_special_tokens=False)["input_ids"] diff --git a/backends/neuron/tests/server/test_prefill.py b/backends/neuron/tests/server/test_prefill.py index 2120e5c59..c0155b1a1 100644 --- a/backends/neuron/tests/server/test_prefill.py +++ b/backends/neuron/tests/server/test_prefill.py @@ -21,12 +21,23 @@ def test_prefill(neuron_model_config): def _test_prefill(config_name, generator, batch_size, do_sample): requests = [] max_new_tokens = 20 - input_text = "It was a bright cold day in April, and the clocks were striking thirteen." + input_text = ( + "It was a bright cold day in April, and the clocks were striking thirteen." 
+ ) for i in range(batch_size): - requests.append(create_request(id=i, inputs=input_text, do_sample=do_sample, max_new_tokens=max_new_tokens)) + requests.append( + create_request( + id=i, + inputs=input_text, + do_sample=do_sample, + max_new_tokens=max_new_tokens, + ) + ) # Let's be pessimistic when estimating max_tokens max_length = generator.model.max_length - batch = Batch(id=0, requests=requests, size=batch_size, max_tokens=batch_size * max_length) + batch = Batch( + id=0, requests=requests, size=batch_size, max_tokens=batch_size * max_length + ) generations, next_batch = generator.prefill(batch) assert next_batch.size == batch_size # Whatever was passed as max_tokens, the server will correct it @@ -73,7 +84,9 @@ def test_prefill_truncate(neuron_model_config): for i in range(batch_size): requests.append(create_request(id=i, inputs=input_text, truncate=truncate[i])) max_length = generator.model.max_length - batch = Batch(id=0, requests=requests, size=batch_size, max_tokens=batch_size * max_length) + batch = Batch( + id=0, requests=requests, size=batch_size, max_tokens=batch_size * max_length + ) generations, _ = generator.prefill(batch) # Even if the input text is identical for all requests, the first generated token might # be different because of the truncation diff --git a/backends/neuron/tgi_env.py b/backends/neuron/tgi_env.py index ff647c988..a7042130b 100755 --- a/backends/neuron/tgi_env.py +++ b/backends/neuron/tgi_env.py @@ -16,7 +16,12 @@ from optimum.neuron.utils.version_utils import get_neuronxcc_version logger = logging.getLogger(__name__) -tgi_router_env_vars = ["MAX_BATCH_SIZE", "MAX_TOTAL_TOKENS", "MAX_INPUT_TOKENS", "MAX_BATCH_PREFILL_TOKENS"] +tgi_router_env_vars = [ + "MAX_BATCH_SIZE", + "MAX_TOTAL_TOKENS", + "MAX_INPUT_TOKENS", + "MAX_BATCH_PREFILL_TOKENS", +] tgi_server_env_vars = ["HF_NUM_CORES", "HF_AUTO_CAST_TYPE"] env_config_peering = [ @@ -39,18 +44,30 @@ def parse_cmdline_and_set_env(argv: List[str] = None) -> argparse.Namespace: argv = sys.argv # All these are params passed to tgi and intercepted here parser.add_argument( - "--max-input-tokens", type=int, default=os.getenv("MAX_INPUT_TOKENS", os.getenv("MAX_INPUT_LENGTH", 0)) + "--max-input-tokens", + type=int, + default=os.getenv("MAX_INPUT_TOKENS", os.getenv("MAX_INPUT_LENGTH", 0)), + ) + parser.add_argument( + "--max-total-tokens", type=int, default=os.getenv("MAX_TOTAL_TOKENS", 0) + ) + parser.add_argument( + "--max-batch-size", type=int, default=os.getenv("MAX_BATCH_SIZE", 0) + ) + parser.add_argument( + "--max-batch-prefill-tokens", + type=int, + default=os.getenv("MAX_BATCH_PREFILL_TOKENS", 0), ) - parser.add_argument("--max-total-tokens", type=int, default=os.getenv("MAX_TOTAL_TOKENS", 0)) - parser.add_argument("--max-batch-size", type=int, default=os.getenv("MAX_BATCH_SIZE", 0)) - parser.add_argument("--max-batch-prefill-tokens", type=int, default=os.getenv("MAX_BATCH_PREFILL_TOKENS", 0)) parser.add_argument("--model-id", type=str, default=os.getenv("MODEL_ID")) parser.add_argument("--revision", type=str, default=os.getenv("REVISION")) args = parser.parse_known_args(argv)[0] if not args.model_id: - raise Exception("No model id provided ! Either specify it using --model-id cmdline or MODEL_ID env var") + raise Exception( + "No model id provided ! 
Either specify it using --model-id cmdline or MODEL_ID env var" + ) # Override env with cmdline params os.environ["MODEL_ID"] = args.model_id @@ -87,7 +104,9 @@ def neuron_config_to_env(neuron_config): f.write("export MAX_INPUT_TOKENS={}\n".format(max_input_tokens)) max_batch_prefill_tokens = os.getenv("MAX_BATCH_PREFILL_TOKENS") if not max_batch_prefill_tokens: - max_batch_prefill_tokens = int(neuron_config["batch_size"]) * int(max_input_tokens) + max_batch_prefill_tokens = int(neuron_config["batch_size"]) * int( + max_input_tokens + ) f.write("export MAX_BATCH_PREFILL_TOKENS={}\n".format(max_batch_prefill_tokens)) @@ -95,16 +114,25 @@ def sort_neuron_configs(dictionary): return -dictionary["num_cores"], -dictionary["batch_size"] -def lookup_compatible_cached_model(model_id: str, revision: Optional[str]) -> Optional[Dict[str, Any]]: +def lookup_compatible_cached_model( + model_id: str, revision: Optional[str] +) -> Optional[Dict[str, Any]]: # Reuse the same mechanic as the one in use to configure the tgi server part # The only difference here is that we stay as flexible as possible on the compatibility part entries = get_hub_cached_entries(model_id, "inference") - logger.debug("Found %d cached entries for model %s, revision %s", len(entries), model_id, revision) + logger.debug( + "Found %d cached entries for model %s, revision %s", + len(entries), + model_id, + revision, + ) all_compatible = [] for entry in entries: - if check_env_and_neuron_config_compatibility(entry, check_compiler_version=True): + if check_env_and_neuron_config_compatibility( + entry, check_compiler_version=True + ): all_compatible.append(entry) if not all_compatible: @@ -126,7 +154,9 @@ def lookup_compatible_cached_model(model_id: str, revision: Optional[str]) -> Op return entry -def check_env_and_neuron_config_compatibility(neuron_config: Dict[str, Any], check_compiler_version: bool) -> bool: +def check_env_and_neuron_config_compatibility( + neuron_config: Dict[str, Any], check_compiler_version: bool +) -> bool: logger.debug( "Checking the provided neuron config %s is compatible with the local setup and provided environment", neuron_config, @@ -134,10 +164,15 @@ def check_env_and_neuron_config_compatibility(neuron_config: Dict[str, Any], che # Local setup compat checks if neuron_config["num_cores"] > available_cores: - logger.debug("Not enough neuron cores available to run the provided neuron config") + logger.debug( + "Not enough neuron cores available to run the provided neuron config" + ) return False - if check_compiler_version and neuron_config["compiler_version"] != neuronxcc_version: + if ( + check_compiler_version + and neuron_config["compiler_version"] != neuronxcc_version + ): logger.debug( "Compiler version conflict, the local one (%s) differs from the one used to compile the model (%s)", neuronxcc_version, @@ -158,7 +193,9 @@ def check_env_and_neuron_config_compatibility(neuron_config: Dict[str, Any], che ) return False - max_input_tokens = int(os.getenv("MAX_INPUT_TOKENS", os.getenv("MAX_INPUT_LENGTH", 0))) + max_input_tokens = int( + os.getenv("MAX_INPUT_TOKENS", os.getenv("MAX_INPUT_LENGTH", 0)) + ) if max_input_tokens > 0: sequence_length = neuron_config["sequence_length"] if max_input_tokens >= sequence_length: @@ -191,7 +228,10 @@ def main(): if not os.getenv(env_var): break else: - logger.info("All env vars %s already set, skipping, user know what they are doing", env_vars) + logger.info( + "All env vars %s already set, skipping, user know what they are doing", + env_vars, + ) sys.exit(0) cache_dir 
= constants.HF_HUB_CACHE @@ -201,7 +241,9 @@ def main(): config = AutoConfig.from_pretrained(args.model_id, revision=args.revision) neuron_config = getattr(config, "neuron", None) if neuron_config is not None: - compatible = check_env_and_neuron_config_compatibility(neuron_config, check_compiler_version=False) + compatible = check_env_and_neuron_config_compatibility( + neuron_config, check_compiler_version=False + ) if not compatible: env_dict = get_env_dict() msg = ( @@ -213,9 +255,9 @@ def main(): neuron_config = lookup_compatible_cached_model(args.model_id, args.revision) if not neuron_config: - msg = ("No compatible neuron config found. Provided env {}, available cores {}, neuronxcc version {}").format( - get_env_dict(), available_cores, neuronxcc_version - ) + msg = ( + "No compatible neuron config found. Provided env {}, available cores {}, neuronxcc version {}" + ).format(get_env_dict(), available_cores, neuronxcc_version) logger.error(msg) raise Exception(msg) diff --git a/integration-tests/conftest.py b/integration-tests/conftest.py index ed27e0589..e04510521 100644 --- a/integration-tests/conftest.py +++ b/integration-tests/conftest.py @@ -75,16 +75,23 @@ def pytest_collection_modifyitems(config, items): def skip_release(item): if "release" in item.keywords: item.add_marker(pytest.mark.skip(reason="need --release option to run")) + selectors.append(skip_release) if config.getoption("--neuron"): + def skip_not_neuron(item): if "neuron" not in item.keywords: - item.add_marker(pytest.mark.skip(reason="incompatible with --neuron option")) + item.add_marker( + pytest.mark.skip(reason="incompatible with --neuron option") + ) + selectors.append(skip_not_neuron) else: + def skip_neuron(item): if "neuron" in item.keywords: item.add_marker(pytest.mark.skip(reason="requires --neuron to run")) + selectors.append(skip_neuron) for item in items: for selector in selectors: diff --git a/integration-tests/fixtures/neuron/model.py b/integration-tests/fixtures/neuron/model.py index 10f1c84eb..3345e2ea2 100644 --- a/integration-tests/fixtures/neuron/model.py +++ b/integration-tests/fixtures/neuron/model.py @@ -30,44 +30,74 @@ logger = logging.getLogger(__file__) MODEL_CONFIGURATIONS = { "gpt2": { "model_id": "gpt2", - "export_kwargs": {"batch_size": 4, "sequence_length": 1024, "num_cores": 2, "auto_cast_type": "fp16"}, + "export_kwargs": { + "batch_size": 4, + "sequence_length": 1024, + "num_cores": 2, + "auto_cast_type": "fp16", + }, }, "llama": { "model_id": "unsloth/Llama-3.2-1B-Instruct", - "export_kwargs": {"batch_size": 4, "sequence_length": 2048, "num_cores": 2, "auto_cast_type": "fp16"}, + "export_kwargs": { + "batch_size": 4, + "sequence_length": 2048, + "num_cores": 2, + "auto_cast_type": "fp16", + }, }, "mistral": { "model_id": "optimum/mistral-1.1b-testing", - "export_kwargs": {"batch_size": 4, "sequence_length": 4096, "num_cores": 2, "auto_cast_type": "bf16"}, + "export_kwargs": { + "batch_size": 4, + "sequence_length": 4096, + "num_cores": 2, + "auto_cast_type": "bf16", + }, }, "qwen2": { "model_id": "Qwen/Qwen2.5-0.5B", - "export_kwargs": {"batch_size": 4, "sequence_length": 4096, "num_cores": 2, "auto_cast_type": "fp16"}, + "export_kwargs": { + "batch_size": 4, + "sequence_length": 4096, + "num_cores": 2, + "auto_cast_type": "fp16", + }, }, "granite": { "model_id": "ibm-granite/granite-3.1-2b-instruct", - "export_kwargs": {"batch_size": 4, "sequence_length": 4096, "num_cores": 2, "auto_cast_type": "bf16"}, + "export_kwargs": { + "batch_size": 4, + "sequence_length": 4096, + 
"num_cores": 2, + "auto_cast_type": "bf16", + }, }, } def get_neuron_backend_hash(): import subprocess - res = subprocess.run(["git", "rev-parse", "--show-toplevel"], - capture_output=True, - text=True) - root_dir = res.stdout.split('\n')[0] + + res = subprocess.run( + ["git", "rev-parse", "--show-toplevel"], capture_output=True, text=True + ) + root_dir = res.stdout.split("\n")[0] + def get_sha(path): - res = subprocess.run(["git", "ls-tree", "HEAD", f"{root_dir}/{path}"], - capture_output=True, - text=True) + res = subprocess.run( + ["git", "ls-tree", "HEAD", f"{root_dir}/{path}"], + capture_output=True, + text=True, + ) # Output of the command is in the form '040000 tree|blob \t\n' - sha = res.stdout.split('\t')[0].split(' ')[-1] + sha = res.stdout.split("\t")[0].split(" ")[-1] return sha.encode() + # We hash both the neuron backends directory and Dockerfile and create a smaller hash out of that m = hashlib.sha256() - m.update(get_sha('backends/neuron')) - m.update(get_sha('Dockerfile.neuron')) + m.update(get_sha("backends/neuron")) + m.update(get_sha("Dockerfile.neuron")) return m.hexdigest()[:10] @@ -81,7 +111,9 @@ def get_tgi_docker_image(): client = docker.from_env() images = client.images.list(filters={"reference": "text-generation-inference"}) if not images: - raise ValueError("No text-generation-inference image found on this host to run tests.") + raise ValueError( + "No text-generation-inference image found on this host to run tests." + ) docker_image = images[0].tags[0] return docker_image @@ -119,7 +151,9 @@ def export_model(config_name, model_config, neuron_model_name): with tempfile.TemporaryDirectory() as context_dir: # Create entrypoint model_path = "/data/neuron_model" - export_command = f"optimum-cli export neuron -m {model_id} --task text-generation" + export_command = ( + f"optimum-cli export neuron -m {model_id} --task text-generation" + ) for kwarg, value in export_kwargs.items(): export_command += f" --{kwarg} {str(value)}" export_command += f" {model_path}" @@ -142,7 +176,9 @@ def export_model(config_name, model_config, neuron_model_name): with open(os.path.join(context_dir, "Dockerfile"), "wb") as f: f.write(docker_content.encode("utf-8")) f.flush() - image, logs = client.images.build(path=context_dir, dockerfile=f.name, tag=export_image) + image, logs = client.images.build( + path=context_dir, dockerfile=f.name, tag=export_image + ) logger.info("Successfully built image %s", image.id) logger.debug("Build logs %s", logs) diff --git a/integration-tests/fixtures/neuron/service.py b/integration-tests/fixtures/neuron/service.py index 927a35af4..39241d6f5 100644 --- a/integration-tests/fixtures/neuron/service.py +++ b/integration-tests/fixtures/neuron/service.py @@ -27,7 +27,9 @@ def get_tgi_docker_image(): client = docker.from_env() images = client.images.list(filters={"reference": "text-generation-inference"}) if not images: - raise ValueError("No text-generation-inference image found on this host to run tests.") + raise ValueError( + "No text-generation-inference image found on this host to run tests." 
+ ) docker_image = images[0].tags[0] return docker_image @@ -131,13 +133,21 @@ def neuron_launcher(event_loop): except NotFound: pass - env = {"LOG_LEVEL": "info,text_generation_router=debug", "CUSTOM_CACHE_REPO": OPTIMUM_CACHE_REPO_ID} + env = { + "LOG_LEVEL": "info,text_generation_router=debug", + "CUSTOM_CACHE_REPO": OPTIMUM_CACHE_REPO_ID, + } if HF_TOKEN is not None: env["HUGGING_FACE_HUB_TOKEN"] = HF_TOKEN env["HF_TOKEN"] = HF_TOKEN - for var in ["MAX_BATCH_SIZE", "MAX_TOTAL_TOKENS", "HF_AUTO_CAST_TYPE", "HF_NUM_CORES"]: + for var in [ + "MAX_BATCH_SIZE", + "MAX_TOTAL_TOKENS", + "HF_AUTO_CAST_TYPE", + "HF_NUM_CORES", + ]: if var in os.environ: env[var] = os.environ[var] @@ -165,7 +175,9 @@ def neuron_launcher(event_loop): with open(os.path.join(context_dir, "Dockerfile"), "wb") as f: f.write(docker_content.encode("utf-8")) f.flush() - image, logs = client.images.build(path=context_dir, dockerfile=f.name, tag=test_image) + image, logs = client.images.build( + path=context_dir, dockerfile=f.name, tag=test_image + ) logger.info("Successfully built image %s", image.id) logger.debug("Build logs %s", logs) else: @@ -204,7 +216,9 @@ def neuron_launcher(event_loop): try: container.remove(force=True) except Exception as e: - logger.error("Error while removing container %s, skipping", container_name) + logger.error( + "Error while removing container %s, skipping", container_name + ) logger.exception(e) # Cleanup the build image @@ -243,7 +257,12 @@ def neuron_generate_load(): client: AsyncInferenceClient, prompt: str, max_new_tokens: int, n: int ) -> List[TextGenerationOutput]: futures = [ - client.text_generation(prompt, max_new_tokens=max_new_tokens, details=True, decoder_input_details=True) + client.text_generation( + prompt, + max_new_tokens=max_new_tokens, + details=True, + decoder_input_details=True, + ) for _ in range(n) ] diff --git a/integration-tests/neuron/integration/test_generate.py b/integration-tests/neuron/integration/test_generate.py index 96f09838b..6a1b49904 100644 --- a/integration-tests/neuron/integration/test_generate.py +++ b/integration-tests/neuron/integration/test_generate.py @@ -30,7 +30,11 @@ async def test_model_single_request(tgi_service): # Greedy bounded with input response = await tgi_service.client.text_generation( - "What is Deep Learning?", max_new_tokens=17, return_full_text=True, details=True, decoder_input_details=True + "What is Deep Learning?", + max_new_tokens=17, + return_full_text=True, + details=True, + decoder_input_details=True, ) assert response.details.generated_tokens == 17 assert response.generated_text == prompt + greedy_expectations[service_name] diff --git a/integration-tests/neuron/integration/test_implicit_env.py b/integration-tests/neuron/integration/test_implicit_env.py index cf0c80620..7eee7b337 100644 --- a/integration-tests/neuron/integration/test_implicit_env.py +++ b/integration-tests/neuron/integration/test_implicit_env.py @@ -1,7 +1,6 @@ import os import pytest -from huggingface_hub.errors import ValidationError @pytest.fixture(scope="module", params=["hub-neuron", "hub", "local-neuron"]) From 5eec3a8bb6554ebb942bdaae309cfd3080c4e997 Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Wed, 26 Feb 2025 12:15:01 +0100 Subject: [PATCH 109/183] Avoid running neuron integration tests twice (#3054) * test(neuron): refactor to prepare batch export * test(neuron): add helper to batch export models Also rename fixture file fro clarity. 
* ci(neuron): do not run tests twice * ci(neuron): rename precompilation job * test(neuron): remove redundant subdirectory * test(neuron): remove erroneous line * doc(neuron): update links to installation page * feat(neuron): cleanup Dockerfile CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse is not required anymore. * test(neuron): try to reduce download errors --- .github/workflows/build.yaml | 13 +++--- Dockerfile.neuron | 2 - docs/source/_toctree.yml | 2 +- docs/source/architecture.md | 2 +- docs/source/installation_inferentia.md | 2 +- docs/source/multi_backend_support.md | 1 + integration-tests/conftest.py | 2 +- .../neuron/{model.py => export_models.py} | 43 +++++++++++++------ .../neuron/{integration => }/test_generate.py | 0 .../{integration => }/test_implicit_env.py | 0 10 files changed, 39 insertions(+), 28 deletions(-) rename integration-tests/fixtures/neuron/{model.py => export_models.py} (89%) rename integration-tests/neuron/{integration => }/test_generate.py (100%) rename integration-tests/neuron/{integration => }/test_implicit_env.py (100%) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 824a5a284..b7cc79556 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -230,7 +230,7 @@ jobs: echo "runs_on=${{ env.RUNS_ON }}" >> "$GITHUB_OUTPUT" echo "label_extension=${{ env.LABEL_EXTENSION }}" >> "$GITHUB_OUTPUT" echo "extra_pytest=${{ env.EXTRA_PYTEST }}" >> "$GITHUB_OUTPUT" - precompile_static_models: + precompile_neuron_models: concurrency: group: ${{ github.workflow }}-${{ github.job }}-${{ needs.build-and-push.outputs.label_extension }}-${{ github.head_ref || github.run_id }} cancel-in-progress: true @@ -252,21 +252,18 @@ jobs: - name: Install run: | make install-integration-tests - - name: Run tests + - name: Export neuron models run: | - export DOCKER_VOLUME=${{ needs.build-and-push.outputs.docker_volume }} export DOCKER_IMAGE=${{ needs.build-and-push.outputs.docker_image }} - export DOCKER_DEVICES=${{ needs.build-and-push.outputs.docker_devices }} - export EXTRA_PYTEST="${{ needs.build-and-push.outputs.extra_pytest }}" - export HF_TOKEN=${{ secrets.HF_TOKEN_NEURON }} echo $DOCKER_IMAGE docker pull $DOCKER_IMAGE - pytest -s -vv integration-tests ${PYTEST_FLAGS} ${EXTRA_PYTEST} + export HF_TOKEN=${{ secrets.HF_TOKEN_NEURON }} + python integration-tests/fixtures/neuron/export_models.py integration_tests: concurrency: group: ${{ github.workflow }}-${{ github.job }}-${{ needs.build-and-push.outputs.label_extension }}-${{ github.head_ref || github.run_id }} cancel-in-progress: true - needs: [precompile_static_models, build-and-push] + needs: [precompile_neuron_models, build-and-push] if: ${{ always() && !contains(needs.*.result, 'failure') && !contains(needs.*.result, 'cancelled') && needs.build-and-push.outputs.runs_on != 'ubuntu-latest' }} runs-on: group: ${{ needs.build-and-push.outputs.runs_on }} diff --git a/Dockerfile.neuron b/Dockerfile.neuron index 17d256915..c7c4af68b 100644 --- a/Dockerfile.neuron +++ b/Dockerfile.neuron @@ -24,8 +24,6 @@ RUN cargo install cargo-chef --locked WORKDIR /usr/src -ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse - FROM chef AS planner COPY backends/neuron/Cargo.toml Cargo.toml COPY Cargo.lock Cargo.lock diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index 39f0ef4bd..37b57d6f4 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -12,7 +12,7 @@ - local: installation_gaudi title: Using TGI with Intel Gaudi - local: installation_inferentia - title: Using TGI 
with AWS Inferentia + title: Using TGI with AWS Trainium and Inferentia - local: installation_tpu title: Using TGI with Google TPUs - local: installation_intel diff --git a/docs/source/architecture.md b/docs/source/architecture.md index d3a6fa926..b475bb6dc 100644 --- a/docs/source/architecture.md +++ b/docs/source/architecture.md @@ -107,7 +107,7 @@ Several variants of the model server exist that are actively supported by Huggin - A [version optimized for AMD with ROCm](https://huggingface.co/docs/text-generation-inference/installation_amd) is hosted in the main TGI repository. Some model features differ. - A [version optimized for Intel GPUs](https://huggingface.co/docs/text-generation-inference/installation_intel) is hosted in the main TGI repository. Some model features differ. - The [version for Intel Gaudi](https://huggingface.co/docs/text-generation-inference/installation_gaudi) is maintained on a forked repository, often resynchronized with the main [TGI repository](https://github.com/huggingface/tgi-gaudi). -- A [version for Neuron (AWS Inferentia2)](https://huggingface.co/docs/text-generation-inference/installation_inferentia) is maintained as part of [Optimum Neuron](https://github.com/huggingface/optimum-neuron/tree/main/text-generation-inference). +- A [version for Neuron (AWS Inferentia2)](https://huggingface.co/docs/text-generation-inference/installation_inferentia) is maintained in the main TGI repository. Some model features differ. - A version for Google TPUs is maintained as part of [Optimum TPU](https://github.com/huggingface/optimum-tpu/tree/main/text-generation-inference). Not all variants provide the same features, as hardware and middleware capabilities do not provide the same optimizations. diff --git a/docs/source/installation_inferentia.md b/docs/source/installation_inferentia.md index 0394e6ded..bfd0f6577 100644 --- a/docs/source/installation_inferentia.md +++ b/docs/source/installation_inferentia.md @@ -1,3 +1,3 @@ # Using TGI with Inferentia -Check out this [guide](https://github.com/huggingface/optimum-neuron/tree/main/text-generation-inference) on how to serve models with TGI on Inferentia2. +You can use TGI on AWS Trainium and Inferentia platforms using the [TGI neuron backend](https://huggingface.co/docs/text-generation-inference/backends/neuron). diff --git a/docs/source/multi_backend_support.md b/docs/source/multi_backend_support.md index 03d6d30be..997503a4f 100644 --- a/docs/source/multi_backend_support.md +++ b/docs/source/multi_backend_support.md @@ -13,3 +13,4 @@ TGI remains consistent across backends, allowing you to switch between them seam However, it requires a model-specific compilation step for each GPU architecture. * **[TGI Llamacpp backend](./backends/llamacpp)**: This backend facilitates the deployment of large language models (LLMs) by integrating [llama.cpp][llama.cpp], an advanced inference engine optimized for both CPU and GPU computation. +* **[TGI Neuron backend](./backends/neuron)**: This backend leverages the [AWS Neuron SDK](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/) to allow the deployment of large language models (LLMs) on [AWS Trainium and Inferentia chips](https://aws.amazon.com/ai/machine-learning/trainium/). 
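For orientation, here is a minimal sketch of what launching the neuron backend described in these docs could look like. The image tag, device mapping and parameter values are illustrative assumptions, not something this patch prescribes; the environment variable names and the example model mirror the ones used by the integration-test fixtures in this series.

```bash
# Illustrative launch of TGI on an Inferentia2 host (image tag and values are assumptions).
model=Qwen/Qwen2.5-0.5B
volume=$HOME/.cache/huggingface

docker run -p 8080:80 -v $volume:/data \
    --device=/dev/neuron0 \
    -e HF_TOKEN=$HF_TOKEN \
    -e HF_NUM_CORES=2 \
    -e HF_AUTO_CAST_TYPE=fp16 \
    -e MAX_BATCH_SIZE=4 \
    -e MAX_TOTAL_TOKENS=4096 \
    text-generation-inference:latest-neuron --model-id $model
```

Once the router reports readiness, the standard `/generate` endpoint can be queried as with any other TGI backend.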
diff --git a/integration-tests/conftest.py b/integration-tests/conftest.py index e04510521..0ffcd162b 100644 --- a/integration-tests/conftest.py +++ b/integration-tests/conftest.py @@ -1,4 +1,4 @@ -pytest_plugins = ["fixtures.neuron.service", "fixtures.neuron.model"] +pytest_plugins = ["fixtures.neuron.service", "fixtures.neuron.export_models"] # ruff: noqa: E402 from _pytest.fixtures import SubRequest import requests diff --git a/integration-tests/fixtures/neuron/model.py b/integration-tests/fixtures/neuron/export_models.py similarity index 89% rename from integration-tests/fixtures/neuron/model.py rename to integration-tests/fixtures/neuron/export_models.py index 3345e2ea2..836402eca 100644 --- a/integration-tests/fixtures/neuron/model.py +++ b/integration-tests/fixtures/neuron/export_models.py @@ -118,10 +118,11 @@ def get_tgi_docker_image(): return docker_image -def export_model(config_name, model_config, neuron_model_name): - """Export a neuron model. +def maybe_export_model(config_name, model_config): + """Export a neuron model for the specified test configuration. - The model is exported by a custom image built on the fly from the base TGI image. + If the neuron model has not already been compiled and pushed to the hub, it is + exported by a custom image built on the fly from the base TGI image. This makes sure the exported model and image are aligned and avoids introducing neuron specific imports in the test suite. @@ -130,9 +131,15 @@ def export_model(config_name, model_config, neuron_model_name): Used to identify test configurations model_config (`str`): The model configuration for export (includes the original model id) - neuron_model_name (`str`): - The name of the exported model on the hub """ + neuron_model_name = get_neuron_model_name(config_name) + neuron_model_id = f"{TEST_ORGANIZATION}/{neuron_model_name}" + hub = huggingface_hub.HfApi() + if hub.repo_exists(neuron_model_id): + logger.info( + f"Skipping model export for config {config_name} as {neuron_model_id} already exists" + ) + return neuron_model_id client = docker.from_env() @@ -183,7 +190,7 @@ def export_model(config_name, model_config, neuron_model_name): logger.debug("Build logs %s", logs) try: - container = client.containers.run( + client.containers.run( export_image, environment=env, auto_remove=True, @@ -192,7 +199,6 @@ def export_model(config_name, model_config, neuron_model_name): shm_size="1G", ) logger.info(f"Successfully exported model for config {config_name}") - container.logs() except Exception as e: logger.exception(f"An exception occurred while running container: {e}.") pass @@ -206,6 +212,12 @@ def export_model(config_name, model_config, neuron_model_name): except Exception as e: logger.error("Error while removing image %s, skipping", image.id) logger.exception(e) + return neuron_model_id + + +def maybe_export_models(): + for config_name, model_config in MODEL_CONFIGURATIONS.items(): + maybe_export_model(config_name, model_config) @pytest.fixture(scope="session", params=MODEL_CONFIGURATIONS.keys()) @@ -232,15 +244,14 @@ def neuron_model_config(request): """ config_name = request.param model_config = copy.deepcopy(MODEL_CONFIGURATIONS[request.param]) - neuron_model_name = get_neuron_model_name(config_name) - neuron_model_id = f"{TEST_ORGANIZATION}/{neuron_model_name}" + # Export the model first (only if needed) + neuron_model_id = maybe_export_model(config_name, model_config) with TemporaryDirectory() as neuron_model_path: - hub = huggingface_hub.HfApi() - if not hub.repo_exists(neuron_model_id): - 
# Export the model first - export_model(config_name, model_config, neuron_model_name) logger.info(f"Fetching {neuron_model_id} from the HuggingFace hub") - hub.snapshot_download(neuron_model_id, local_dir=neuron_model_path) + hub = huggingface_hub.HfApi() + hub.snapshot_download( + neuron_model_id, etag_timeout=30, local_dir=neuron_model_path + ) # Add dynamic parameters to the model configuration model_config["neuron_model_path"] = neuron_model_path model_config["neuron_model_id"] = neuron_model_id @@ -257,3 +268,7 @@ def neuron_model_config(request): @pytest.fixture(scope="module") def neuron_model_path(neuron_model_config): yield neuron_model_config["neuron_model_path"] + + +if __name__ == "__main__": + maybe_export_models() diff --git a/integration-tests/neuron/integration/test_generate.py b/integration-tests/neuron/test_generate.py similarity index 100% rename from integration-tests/neuron/integration/test_generate.py rename to integration-tests/neuron/test_generate.py diff --git a/integration-tests/neuron/integration/test_implicit_env.py b/integration-tests/neuron/test_implicit_env.py similarity index 100% rename from integration-tests/neuron/integration/test_implicit_env.py rename to integration-tests/neuron/test_implicit_env.py From 683ff53fa3d29d6e0660b4f874d5d2e74a75dfa7 Mon Sep 17 00:00:00 2001 From: Baptiste Colle <32412211+baptistecolle@users.noreply.github.com> Date: Fri, 28 Feb 2025 12:14:58 +0100 Subject: [PATCH 110/183] Add Gaudi Backend (#3055) * wip(gaudi): import server and dockerfile from tgi-gaudi fork * feat(gaudi): new gaudi backend working * fix: fix style * fix prehooks issues * fix(gaudi): refactor server and implement requested changes --- .gitignore | 5 + Dockerfile_gaudi | 127 + backends/gaudi/Makefile | 49 + backends/gaudi/README.md | 98 + backends/gaudi/server/.gitignore | 164 + backends/gaudi/server/Makefile | 38 + backends/gaudi/server/Makefile-awq | 15 + backends/gaudi/server/Makefile-eetq | 13 + backends/gaudi/server/Makefile-fbgemm | 15 + backends/gaudi/server/Makefile-flash-att | 12 + backends/gaudi/server/Makefile-flash-att-v2 | 21 + backends/gaudi/server/Makefile-selective-scan | 28 + backends/gaudi/server/Makefile-vllm | 23 + backends/gaudi/server/README.md | 15 + backends/gaudi/server/dill-0.3.7-patch.sh | 91 + backends/gaudi/server/dill-0.3.8-patch.sh | 91 + backends/gaudi/server/poetry.lock | 4019 +++++++++++++++++ backends/gaudi/server/pyproject.toml | 42 + backends/gaudi/server/requirements.txt | 89 + backends/gaudi/server/tests/conftest.py | 23 + .../gaudi/server/tests/models/test_bloom.py | 363 ++ .../server/tests/models/test_causal_lm.py | 354 ++ .../gaudi/server/tests/models/test_model.py | 83 + .../server/tests/models/test_santacoder.py | 108 + .../server/tests/models/test_seq2seq_lm.py | 365 ++ .../gaudi/server/tests/utils/test_adapter.py | 247 + .../gaudi/server/tests/utils/test_convert.py | 21 + backends/gaudi/server/tests/utils/test_hub.py | 103 + .../gaudi/server/tests/utils/test_layers.py | 82 + .../gaudi/server/tests/utils/test_tokens.py | 88 + .../server/tests/utils/test_watermark.py | 54 + .../gaudi/server/tests/utils/test_weights.py | 1177 +++++ .../server/text_generation_server/__init__.py | 0 .../adapters/__init__.py | 13 + .../text_generation_server/adapters/config.py | 30 + .../text_generation_server/adapters/lora.py | 471 ++ .../adapters/weights.py | 146 + .../server/text_generation_server/cache.py | 34 + .../server/text_generation_server/cli.py | 429 ++ .../habana_quantization_env.py | 53 + 
.../text_generation_server/interceptor.py | 45 + .../text_generation_server/layers/__init__.py | 34 + .../layers/attention/__init__.py | 43 + .../layers/attention/common.py | 72 + .../layers/attention/cuda.py | 357 ++ .../layers/attention/flash_attn_triton.py | 813 ++++ .../layers/attention/flashinfer.py | 251 + .../layers/attention/ipex.py | 82 + .../layers/attention/rocm.py | 308 ++ .../layers/awq/conversion_utils.py | 97 + .../layers/awq/quantize/qmodule.py | 49 + .../text_generation_server/layers/bnb.py | 124 + .../text_generation_server/layers/conv.py | 41 + .../text_generation_server/layers/eetq.py | 43 + .../text_generation_server/layers/exl2.py | 78 + .../text_generation_server/layers/fp8.py | 285 ++ .../layers/gptq/__init__.py | 440 ++ .../layers/gptq/custom_autotune.py | 261 ++ .../layers/gptq/exllama.py | 134 + .../layers/gptq/exllamav2.py | 267 ++ .../layers/gptq/quant_linear.py | 359 ++ .../layers/gptq/quantize.py | 1017 +++++ .../layers/gptq/utils.py | 56 + .../layers/layernorm.py | 184 + .../text_generation_server/layers/linear.py | 126 + .../text_generation_server/layers/lora.py | 279 ++ .../layers/marlin/__init__.py | 15 + .../layers/marlin/fp8.py | 140 + .../layers/marlin/gptq.py | 464 ++ .../layers/marlin/marlin.py | 346 ++ .../layers/marlin/util.py | 141 + .../text_generation_server/layers/medusa.py | 191 + .../text_generation_server/layers/mlp.py | 282 ++ .../layers/moe/__init__.py | 257 ++ .../layers/moe/fused_moe_rocm.py | 52 + .../layers/moe/gptq_marlin.py | 215 + .../layers/moe/unquantized.py | 138 + .../text_generation_server/layers/rotary.py | 548 +++ .../layers/speculative.py | 52 + .../layers/tensor_parallel.py | 249 + .../text_generation_server/models/__init__.py | 326 ++ .../text_generation_server/models/bloom.py | 52 + .../models/causal_lm.py | 1421 ++++++ .../models/custom_modeling/__init__.py | 0 .../models/custom_modeling/bloom_modeling.py | 923 ++++ .../models/custom_modeling/clip.py | 817 ++++ .../custom_modeling/flash_cohere_modeling.py | 543 +++ .../custom_modeling/flash_dbrx_modeling.py | 756 ++++ .../flash_deepseek_v2_modeling.py | 659 +++ .../custom_modeling/flash_gemma2_modeling.py | 560 +++ .../custom_modeling/flash_gemma_modeling.py | 471 ++ .../custom_modeling/flash_gpt2_modeling.py | 461 ++ .../custom_modeling/flash_gptj_modeling.py | 406 ++ .../custom_modeling/flash_llama_modeling.py | 668 +++ .../custom_modeling/flash_mistral_modeling.py | 535 +++ .../custom_modeling/flash_mixtral_modeling.py | 542 +++ .../custom_modeling/flash_neox_modeling.py | 425 ++ .../flash_pali_gemma_modeling.py | 119 + .../custom_modeling/flash_phi_modeling.py | 418 ++ .../custom_modeling/flash_phi_moe_modeling.py | 254 ++ .../custom_modeling/flash_qwen2_modeling.py | 394 ++ .../custom_modeling/flash_rw_modeling.py | 699 +++ .../flash_santacoder_modeling.py | 508 +++ .../flash_starcoder2_modeling.py | 554 +++ .../models/custom_modeling/idefics2.py | 839 ++++ .../models/custom_modeling/idefics_config.py | 326 ++ .../idefics_image_processing.py | 297 ++ .../custom_modeling/idefics_modeling.py | 1556 +++++++ .../custom_modeling/idefics_perceiver.py | 276 ++ .../custom_modeling/idefics_processing.py | 443 ++ .../models/custom_modeling/idefics_vision.py | 529 +++ .../models/custom_modeling/llava_next.py | 342 ++ .../models/custom_modeling/mamba_modeling.py | 232 + .../models/custom_modeling/mllama.py | 995 ++++ .../models/custom_modeling/mpt_modeling.py | 1215 +++++ .../models/custom_modeling/neox_modeling.py | 796 ++++ .../models/custom_modeling/opt_modeling.py | 857 
++++ .../models/custom_modeling/phi_modeling.py | 336 ++ .../models/custom_modeling/siglip.py | 410 ++ .../models/custom_modeling/t5_modeling.py | 1227 +++++ .../models/custom_modeling/vlm.py | 48 + .../models/flash_causal_lm.py | 2003 ++++++++ .../models/galactica.py | 156 + .../text_generation_server/models/globals.py | 74 + .../models/idefics_causal_lm.py | 899 ++++ .../text_generation_server/models/mamba.py | 814 ++++ .../models/mllama_causal_lm.py | 357 ++ .../text_generation_server/models/model.py | 141 + .../models/pali_gemma.py | 71 + .../models/seq2seq_lm.py | 932 ++++ .../models/starcoder.py | 47 + .../text_generation_server/models/types.py | 102 + .../models/vlm_causal_lm.py | 1330 ++++++ .../text_generation_server/pb/.gitignore | 3 + .../server/text_generation_server/server.py | 290 ++ .../text_generation_server/tgi_service.py | 49 + .../server/text_generation_server/tracing.py | 63 + .../text_generation_server/utils/__init__.py | 50 + .../text_generation_server/utils/adapter.py | 320 ++ .../text_generation_server/utils/chunks.py | 27 + .../text_generation_server/utils/convert.py | 114 + .../text_generation_server/utils/debug.py | 35 + .../text_generation_server/utils/dist.py | 99 + .../text_generation_server/utils/hub.py | 234 + .../utils/import_utils.py | 75 + .../text_generation_server/utils/log.py | 15 + .../utils/logits_process.py | 611 +++ .../utils/merges/strategies.py | 220 + .../utils/merges/utils.py | 108 + .../text_generation_server/utils/peft.py | 68 + .../utils/quantization.py | 202 + .../text_generation_server/utils/segments.py | 66 + .../text_generation_server/utils/sgmv.py | 252 ++ .../text_generation_server/utils/speculate.py | 11 + .../text_generation_server/utils/tokens.py | 762 ++++ .../text_generation_server/utils/version.py | 12 + .../text_generation_server/utils/watermark.py | 98 + .../text_generation_server/utils/weights.py | 437 ++ backends/gaudi/tgi-entrypoint.sh | 11 + launcher/src/env_runtime.rs | 17 +- launcher/src/main.rs | 9 + 161 files changed, 52552 insertions(+), 1 deletion(-) create mode 100644 Dockerfile_gaudi create mode 100644 backends/gaudi/Makefile create mode 100644 backends/gaudi/README.md create mode 100644 backends/gaudi/server/.gitignore create mode 100644 backends/gaudi/server/Makefile create mode 100644 backends/gaudi/server/Makefile-awq create mode 100644 backends/gaudi/server/Makefile-eetq create mode 100644 backends/gaudi/server/Makefile-fbgemm create mode 100644 backends/gaudi/server/Makefile-flash-att create mode 100644 backends/gaudi/server/Makefile-flash-att-v2 create mode 100644 backends/gaudi/server/Makefile-selective-scan create mode 100644 backends/gaudi/server/Makefile-vllm create mode 100644 backends/gaudi/server/README.md create mode 100644 backends/gaudi/server/dill-0.3.7-patch.sh create mode 100644 backends/gaudi/server/dill-0.3.8-patch.sh create mode 100644 backends/gaudi/server/poetry.lock create mode 100644 backends/gaudi/server/pyproject.toml create mode 100644 backends/gaudi/server/requirements.txt create mode 100644 backends/gaudi/server/tests/conftest.py create mode 100644 backends/gaudi/server/tests/models/test_bloom.py create mode 100644 backends/gaudi/server/tests/models/test_causal_lm.py create mode 100644 backends/gaudi/server/tests/models/test_model.py create mode 100644 backends/gaudi/server/tests/models/test_santacoder.py create mode 100644 backends/gaudi/server/tests/models/test_seq2seq_lm.py create mode 100644 backends/gaudi/server/tests/utils/test_adapter.py create mode 100644 
backends/gaudi/server/tests/utils/test_convert.py create mode 100644 backends/gaudi/server/tests/utils/test_hub.py create mode 100644 backends/gaudi/server/tests/utils/test_layers.py create mode 100644 backends/gaudi/server/tests/utils/test_tokens.py create mode 100644 backends/gaudi/server/tests/utils/test_watermark.py create mode 100644 backends/gaudi/server/tests/utils/test_weights.py create mode 100644 backends/gaudi/server/text_generation_server/__init__.py create mode 100644 backends/gaudi/server/text_generation_server/adapters/__init__.py create mode 100644 backends/gaudi/server/text_generation_server/adapters/config.py create mode 100644 backends/gaudi/server/text_generation_server/adapters/lora.py create mode 100644 backends/gaudi/server/text_generation_server/adapters/weights.py create mode 100644 backends/gaudi/server/text_generation_server/cache.py create mode 100644 backends/gaudi/server/text_generation_server/cli.py create mode 100644 backends/gaudi/server/text_generation_server/habana_quantization_env.py create mode 100644 backends/gaudi/server/text_generation_server/interceptor.py create mode 100644 backends/gaudi/server/text_generation_server/layers/__init__.py create mode 100644 backends/gaudi/server/text_generation_server/layers/attention/__init__.py create mode 100644 backends/gaudi/server/text_generation_server/layers/attention/common.py create mode 100644 backends/gaudi/server/text_generation_server/layers/attention/cuda.py create mode 100644 backends/gaudi/server/text_generation_server/layers/attention/flash_attn_triton.py create mode 100644 backends/gaudi/server/text_generation_server/layers/attention/flashinfer.py create mode 100644 backends/gaudi/server/text_generation_server/layers/attention/ipex.py create mode 100644 backends/gaudi/server/text_generation_server/layers/attention/rocm.py create mode 100644 backends/gaudi/server/text_generation_server/layers/awq/conversion_utils.py create mode 100644 backends/gaudi/server/text_generation_server/layers/awq/quantize/qmodule.py create mode 100644 backends/gaudi/server/text_generation_server/layers/bnb.py create mode 100644 backends/gaudi/server/text_generation_server/layers/conv.py create mode 100644 backends/gaudi/server/text_generation_server/layers/eetq.py create mode 100644 backends/gaudi/server/text_generation_server/layers/exl2.py create mode 100644 backends/gaudi/server/text_generation_server/layers/fp8.py create mode 100644 backends/gaudi/server/text_generation_server/layers/gptq/__init__.py create mode 100644 backends/gaudi/server/text_generation_server/layers/gptq/custom_autotune.py create mode 100644 backends/gaudi/server/text_generation_server/layers/gptq/exllama.py create mode 100644 backends/gaudi/server/text_generation_server/layers/gptq/exllamav2.py create mode 100644 backends/gaudi/server/text_generation_server/layers/gptq/quant_linear.py create mode 100644 backends/gaudi/server/text_generation_server/layers/gptq/quantize.py create mode 100644 backends/gaudi/server/text_generation_server/layers/gptq/utils.py create mode 100644 backends/gaudi/server/text_generation_server/layers/layernorm.py create mode 100644 backends/gaudi/server/text_generation_server/layers/linear.py create mode 100644 backends/gaudi/server/text_generation_server/layers/lora.py create mode 100644 backends/gaudi/server/text_generation_server/layers/marlin/__init__.py create mode 100644 backends/gaudi/server/text_generation_server/layers/marlin/fp8.py create mode 100644 
backends/gaudi/server/text_generation_server/layers/marlin/gptq.py create mode 100644 backends/gaudi/server/text_generation_server/layers/marlin/marlin.py create mode 100644 backends/gaudi/server/text_generation_server/layers/marlin/util.py create mode 100644 backends/gaudi/server/text_generation_server/layers/medusa.py create mode 100644 backends/gaudi/server/text_generation_server/layers/mlp.py create mode 100644 backends/gaudi/server/text_generation_server/layers/moe/__init__.py create mode 100644 backends/gaudi/server/text_generation_server/layers/moe/fused_moe_rocm.py create mode 100644 backends/gaudi/server/text_generation_server/layers/moe/gptq_marlin.py create mode 100644 backends/gaudi/server/text_generation_server/layers/moe/unquantized.py create mode 100644 backends/gaudi/server/text_generation_server/layers/rotary.py create mode 100644 backends/gaudi/server/text_generation_server/layers/speculative.py create mode 100644 backends/gaudi/server/text_generation_server/layers/tensor_parallel.py create mode 100644 backends/gaudi/server/text_generation_server/models/__init__.py create mode 100644 backends/gaudi/server/text_generation_server/models/bloom.py create mode 100644 backends/gaudi/server/text_generation_server/models/causal_lm.py create mode 100644 backends/gaudi/server/text_generation_server/models/custom_modeling/__init__.py create mode 100644 backends/gaudi/server/text_generation_server/models/custom_modeling/bloom_modeling.py create mode 100644 backends/gaudi/server/text_generation_server/models/custom_modeling/clip.py create mode 100644 backends/gaudi/server/text_generation_server/models/custom_modeling/flash_cohere_modeling.py create mode 100644 backends/gaudi/server/text_generation_server/models/custom_modeling/flash_dbrx_modeling.py create mode 100644 backends/gaudi/server/text_generation_server/models/custom_modeling/flash_deepseek_v2_modeling.py create mode 100644 backends/gaudi/server/text_generation_server/models/custom_modeling/flash_gemma2_modeling.py create mode 100644 backends/gaudi/server/text_generation_server/models/custom_modeling/flash_gemma_modeling.py create mode 100644 backends/gaudi/server/text_generation_server/models/custom_modeling/flash_gpt2_modeling.py create mode 100644 backends/gaudi/server/text_generation_server/models/custom_modeling/flash_gptj_modeling.py create mode 100644 backends/gaudi/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py create mode 100644 backends/gaudi/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py create mode 100644 backends/gaudi/server/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py create mode 100644 backends/gaudi/server/text_generation_server/models/custom_modeling/flash_neox_modeling.py create mode 100644 backends/gaudi/server/text_generation_server/models/custom_modeling/flash_pali_gemma_modeling.py create mode 100644 backends/gaudi/server/text_generation_server/models/custom_modeling/flash_phi_modeling.py create mode 100644 backends/gaudi/server/text_generation_server/models/custom_modeling/flash_phi_moe_modeling.py create mode 100644 backends/gaudi/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py create mode 100644 backends/gaudi/server/text_generation_server/models/custom_modeling/flash_rw_modeling.py create mode 100644 backends/gaudi/server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py create mode 100644 
backends/gaudi/server/text_generation_server/models/custom_modeling/flash_starcoder2_modeling.py create mode 100644 backends/gaudi/server/text_generation_server/models/custom_modeling/idefics2.py create mode 100644 backends/gaudi/server/text_generation_server/models/custom_modeling/idefics_config.py create mode 100644 backends/gaudi/server/text_generation_server/models/custom_modeling/idefics_image_processing.py create mode 100644 backends/gaudi/server/text_generation_server/models/custom_modeling/idefics_modeling.py create mode 100644 backends/gaudi/server/text_generation_server/models/custom_modeling/idefics_perceiver.py create mode 100644 backends/gaudi/server/text_generation_server/models/custom_modeling/idefics_processing.py create mode 100644 backends/gaudi/server/text_generation_server/models/custom_modeling/idefics_vision.py create mode 100644 backends/gaudi/server/text_generation_server/models/custom_modeling/llava_next.py create mode 100644 backends/gaudi/server/text_generation_server/models/custom_modeling/mamba_modeling.py create mode 100644 backends/gaudi/server/text_generation_server/models/custom_modeling/mllama.py create mode 100644 backends/gaudi/server/text_generation_server/models/custom_modeling/mpt_modeling.py create mode 100644 backends/gaudi/server/text_generation_server/models/custom_modeling/neox_modeling.py create mode 100644 backends/gaudi/server/text_generation_server/models/custom_modeling/opt_modeling.py create mode 100644 backends/gaudi/server/text_generation_server/models/custom_modeling/phi_modeling.py create mode 100644 backends/gaudi/server/text_generation_server/models/custom_modeling/siglip.py create mode 100644 backends/gaudi/server/text_generation_server/models/custom_modeling/t5_modeling.py create mode 100644 backends/gaudi/server/text_generation_server/models/custom_modeling/vlm.py create mode 100644 backends/gaudi/server/text_generation_server/models/flash_causal_lm.py create mode 100644 backends/gaudi/server/text_generation_server/models/galactica.py create mode 100644 backends/gaudi/server/text_generation_server/models/globals.py create mode 100644 backends/gaudi/server/text_generation_server/models/idefics_causal_lm.py create mode 100644 backends/gaudi/server/text_generation_server/models/mamba.py create mode 100644 backends/gaudi/server/text_generation_server/models/mllama_causal_lm.py create mode 100644 backends/gaudi/server/text_generation_server/models/model.py create mode 100644 backends/gaudi/server/text_generation_server/models/pali_gemma.py create mode 100644 backends/gaudi/server/text_generation_server/models/seq2seq_lm.py create mode 100644 backends/gaudi/server/text_generation_server/models/starcoder.py create mode 100644 backends/gaudi/server/text_generation_server/models/types.py create mode 100644 backends/gaudi/server/text_generation_server/models/vlm_causal_lm.py create mode 100644 backends/gaudi/server/text_generation_server/pb/.gitignore create mode 100644 backends/gaudi/server/text_generation_server/server.py create mode 100644 backends/gaudi/server/text_generation_server/tgi_service.py create mode 100644 backends/gaudi/server/text_generation_server/tracing.py create mode 100644 backends/gaudi/server/text_generation_server/utils/__init__.py create mode 100644 backends/gaudi/server/text_generation_server/utils/adapter.py create mode 100644 backends/gaudi/server/text_generation_server/utils/chunks.py create mode 100644 backends/gaudi/server/text_generation_server/utils/convert.py create mode 100644 
backends/gaudi/server/text_generation_server/utils/debug.py create mode 100644 backends/gaudi/server/text_generation_server/utils/dist.py create mode 100644 backends/gaudi/server/text_generation_server/utils/hub.py create mode 100644 backends/gaudi/server/text_generation_server/utils/import_utils.py create mode 100644 backends/gaudi/server/text_generation_server/utils/log.py create mode 100644 backends/gaudi/server/text_generation_server/utils/logits_process.py create mode 100644 backends/gaudi/server/text_generation_server/utils/merges/strategies.py create mode 100644 backends/gaudi/server/text_generation_server/utils/merges/utils.py create mode 100644 backends/gaudi/server/text_generation_server/utils/peft.py create mode 100644 backends/gaudi/server/text_generation_server/utils/quantization.py create mode 100644 backends/gaudi/server/text_generation_server/utils/segments.py create mode 100644 backends/gaudi/server/text_generation_server/utils/sgmv.py create mode 100644 backends/gaudi/server/text_generation_server/utils/speculate.py create mode 100644 backends/gaudi/server/text_generation_server/utils/tokens.py create mode 100644 backends/gaudi/server/text_generation_server/utils/version.py create mode 100644 backends/gaudi/server/text_generation_server/utils/watermark.py create mode 100644 backends/gaudi/server/text_generation_server/utils/weights.py create mode 100644 backends/gaudi/tgi-entrypoint.sh diff --git a/.gitignore b/.gitignore index 9434d75ca..7d6c7564c 100644 --- a/.gitignore +++ b/.gitignore @@ -23,3 +23,8 @@ server/fbgemmm .direnv/ .venv/ + +# Gaudi auto-generated files +hl-smi_log*.txt +.graph_dumps +out diff --git a/Dockerfile_gaudi b/Dockerfile_gaudi new file mode 100644 index 000000000..91b172ed7 --- /dev/null +++ b/Dockerfile_gaudi @@ -0,0 +1,127 @@ +# Those arguments are required to build the image +ARG HABANA_VERSION +ARG PYTORCH_VERSION + +# Rust builder +FROM lukemathwalker/cargo-chef:latest-rust-1.80 AS chef +WORKDIR /usr/src + +ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse + +FROM chef AS planner +COPY Cargo.lock Cargo.lock +COPY Cargo.toml Cargo.toml +COPY rust-toolchain.toml rust-toolchain.toml +COPY proto proto +COPY benchmark benchmark +COPY router router +COPY backends backends +COPY launcher launcher +RUN cargo chef prepare --recipe-path recipe.json + +FROM chef AS builder + +ENV PYO3_PYTHON="/root/.local/bin/python" \ + PYTHON_SYS_EXECUTABLE="/root/.local/bin/python" \ + PYO3_PYTHON_VERSION="3.10" + +RUN curl -LsSf https://astral.sh/uv/install.sh | sh \ + && . 
$HOME/.local/bin/env \ + && uv python install 3.10 --default --preview \ + && test -f /root/.local/bin/python || (echo "Python 3.10 not found at /root/.local/bin/python" && exit 1) + +RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \ + curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \ + unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \ + unzip -o $PROTOC_ZIP -d /usr/local 'include/*' && \ + rm -f $PROTOC_ZIP + +COPY --from=planner /usr/src/recipe.json recipe.json +RUN cargo chef cook --profile release-opt --recipe-path recipe.json + +ARG GIT_SHA +ARG DOCKER_LABEL + +COPY Cargo.toml Cargo.toml +COPY rust-toolchain.toml rust-toolchain.toml +COPY proto proto +COPY benchmark benchmark +COPY router router +COPY backends backends +COPY launcher launcher +RUN cargo build --profile release-opt + +# Text Generation Inference base image +ARG HABANA_VERSION +ARG PYTORCH_VERSION + +FROM vault.habana.ai/gaudi-docker/${HABANA_VERSION}/ubuntu22.04/habanalabs/pytorch-installer-${PYTORCH_VERSION}:latest AS base + +ENV ATTENTION=default +ENV PREFIX_CACHING=0 +ENV PREFILL_CHUNKING=0 + +# Text Generation Inference base env +ENV HF_HOME=/data \ + HF_HUB_ENABLE_HF_TRANSFER=1 \ + PORT=80 + +# Assert that Python 3.10 is installed as the launcher is compiled with Python 3.10 +RUN python3.10 --version || (echo "Python 3.10 is not installed" && exit 1) + +# libssl.so.1.1 is not installed on Ubuntu 22.04 by default, install it +RUN wget http://nz2.archive.ubuntu.com/ubuntu/pool/main/o/openssl/libssl1.1_1.1.1f-1ubuntu2_amd64.deb && \ + dpkg -i ./libssl1.1_1.1.1f-1ubuntu2_amd64.deb + +WORKDIR /usr/src + +RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ + libssl-dev \ + ca-certificates \ + make \ + curl \ + git \ + && rm -rf /var/lib/apt/lists/* + +# Install server +COPY proto proto +COPY backends/gaudi/server server +COPY backends/gaudi/server/Makefile server/Makefile +ARG HABANA_VERSION +RUN cd server && \ + make gen-server && \ + pip install --no-deps -r requirements.txt && \ + bash ./dill-0.3.8-patch.sh && \ + pip install outlines~=0.0.34 && \ + pip install "git+https://github.com/HabanaAI/DeepSpeed.git@${HABANA_VERSION}" && \ + BUILD_CUDA_EXT=0 pip install git+https://github.com/AutoGPTQ/AutoGPTQ.git@097dd04e --no-build-isolation && \ + pip install . --no-cache-dir + +# Install benchmarker +COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark +# Install router +COPY --from=builder /usr/src/target/release-opt/text-generation-router /usr/local/bin/text-generation-router +# Install launcher +COPY --from=builder /usr/src/target/release-opt/text-generation-launcher /usr/local/bin/text-generation-launcher + + +# AWS Sagemaker compatible image +FROM base AS sagemaker + +COPY sagemaker-entrypoint.sh entrypoint.sh +RUN chmod +x entrypoint.sh + +ENTRYPOINT ["./entrypoint.sh"] + +# Final image +FROM base + +ENV HF_HUB_ENABLE_HF_TRANSFER 1 +ENV HABANA_VISIBLE_DEVICES all +ENV OMPI_MCA_btl_vader_single_copy_mechanism NONE + +COPY backends/gaudi/tgi-entrypoint.sh /tgi-entrypoint.sh +RUN chmod +x /tgi-entrypoint.sh + +ENTRYPOINT ["/tgi-entrypoint.sh"] +CMD ["--json-output"] diff --git a/backends/gaudi/Makefile b/backends/gaudi/Makefile new file mode 100644 index 000000000..ce3be25d9 --- /dev/null +++ b/backends/gaudi/Makefile @@ -0,0 +1,49 @@ +mkfile_path := $(abspath $(lastword $(MAKEFILE_LIST))) +mkfile_dir := $(dir $(mkfile_path)) +root_dir := "${mkfile_dir}/../.." 
+ +HABANA_VERSION := 1.19.0 +PYTORCH_VERSION := 2.5.1 + +.PHONY: image run-local-dev-container install-dependencies install-server install-router install-launcher local-dev-install + +image: + docker build -t tgi-gaudi -f ${root_dir}/Dockerfile_gaudi ${root_dir} --build-arg HABANA_VERSION=$(HABANA_VERSION) --build-arg PYTORCH_VERSION=$(PYTORCH_VERSION) + +run-local-dev-container: + docker run -it \ + --runtime=habana \ + --ipc=host \ + --cap-add=sys_nice \ + --net=host \ + -e HABANA_VISIBLE_DEVICES=all \ + -e OMPI_MCA_btl_vader_single_copy_mechanism=none \ + -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true \ + -e HF_TOKEN=`cat /home/ubuntu/.cache/huggingface/token` \ + -e LOG_LEVEL=debug \ + -e PORT=8080 \ + -v /home/ubuntu/.cache/huggingface:/data \ + -v $(PWD):/text-generation-inference \ + -w /text-generation-inference \ + vault.habana.ai/gaudi-docker/$(HABANA_VERSION)/ubuntu22.04/habanalabs/pytorch-installer-$(PYTORCH_VERSION):latest + +install-dependencies: + pip install git+https://github.com/HabanaAI/DeepSpeed.git@$(HABANA_VERSION) + pip install outlines~=0.0.34 + curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y + +install-server: + make -C ${root_dir}/backends/gaudi/server install PROTO_PATH=../../../proto/v3 + +install-router: + make -C ${root_dir} install-router + +install-launcher: + make -C ${root_dir} install-launcher + +# use source to load the rust in path +local-dev-install: install-dependencies + bash -c 'source "$$HOME/.cargo/env" && \ + make install-server && \ + make install-router && \ + make install-launcher' diff --git a/backends/gaudi/README.md b/backends/gaudi/README.md new file mode 100644 index 000000000..765ad4475 --- /dev/null +++ b/backends/gaudi/README.md @@ -0,0 +1,98 @@ +# Text-generation-inference - Gaudi backend + +## Description + +This is the TGI backend for Intel Gaudi. This backend is composed of the tgi server optimized for Gaudi hardware. + +## Build your own image + +The simplest way to build TGI with the Gaudi backend is to use the provided `Makefile`: + +Option 1: From the project root directory: +```bash +make -C backends/gaudi image +``` + +Option 2: From the Gaudi backend directory: +```bash +cd backends/gaudi +make image +``` + +You can now run the server with the following command: + +Option 1: Sharded: +```bash +model=meta-llama/Llama-3.1-8B-Instruct +hf_token=$(cat ${HOME}/.cache/huggingface/token) +volume=${HOME}/.cache/huggingface + +docker run --runtime=habana --ipc=host --cap-add=sys_nice \ + -p 8080:80 -v $volume:/data \ + -e LOG_LEVEL=debug -e HF_TOKEN=$hf_token \ + tgi-gaudi --model-id $model \ + --sharded true --num-shard 8 \ + --max-input-tokens 512 --max-total-tokens 1024 --max-batch-size 8 --max-batch-prefill-tokens 2048 +``` + +Option 2: Non-sharded: +```bash +model=meta-llama/Llama-3.1-8B-Instruct +hf_token=$(cat ${HOME}/.cache/huggingface/token) +volume=${HOME}/.cache/huggingface + +docker run --runtime=habana --ipc=host --cap-add=sys_nice \ + -p 8080:80 -v $volume:/data \ + -e LOG_LEVEL=debug -e HF_TOKEN=$hf_token \ + tgi-gaudi --model-id $model \ + --max-input-tokens 512 --max-total-tokens 1024 --max-batch-size 4 --max-batch-prefill-tokens 2048 +``` + +## Contributing + +### Local Development + +This is useful if you want to run the server locally for better debugging. +```bash +make -C backends/gaudi run-local-dev-container +``` + +Then run the following command inside the container to install tgi for gaudi: +```bash +make -C backends/gaudi local-dev-install +``` + +Add rust to path: +```bash +. 
"$HOME/.cargo/env" +``` + +Option 1: Run the server (sharded model): +```bash +LOG_LEVEL=debug text-generation-launcher \ + --model-id meta-llama/Llama-3.1-8B-Instruct \ + --sharded true \ + --num-shard 8 \ + --max-input-tokens 512 \ + --max-total-tokens 1024 \ + --max-batch-size 8 \ + --max-batch-prefill-tokens 2048 +``` + +Option 2: Run the server (non-sharded model): +```bash +LOG_LEVEL=debug text-generation-launcher \ + --model-id meta-llama/Llama-3.1-8B-Instruct \ + --max-input-tokens 512 \ + --max-total-tokens 1024 \ + --max-batch-size 4 \ + --max-batch-prefill-tokens 2048 +``` + +You can then test the server with the following curl command from another terminal (can be outside the container): +```bash +curl 127.0.0.1:8080/generate \ + -X POST \ + -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":20}}' \ + -H 'Content-Type: application/json' +``` diff --git a/backends/gaudi/server/.gitignore b/backends/gaudi/server/.gitignore new file mode 100644 index 000000000..576746eec --- /dev/null +++ b/backends/gaudi/server/.gitignore @@ -0,0 +1,164 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +text_generation_server/__pycache__/ +text_generation_server/pb/__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +transformers +safetensors +flash-attention/ +flash-attention-v2/ +vllm/ +llm-awq/ +eetq/ +mamba/ diff --git a/backends/gaudi/server/Makefile b/backends/gaudi/server/Makefile new file mode 100644 index 000000000..b5b843387 --- /dev/null +++ b/backends/gaudi/server/Makefile @@ -0,0 +1,38 @@ +include Makefile-flash-att +include Makefile-flash-att-v2 +include Makefile-vllm +include Makefile-awq +include Makefile-eetq +include Makefile-selective-scan + +PROTO_PATH ?= ../proto/v3 + +unit-tests: + pytest -s -vv -m "not private" tests + +gen-server: + # Compile protos + pip install grpcio-tools==1.62.2 mypy-protobuf==3.6.0 'types-protobuf' --no-cache-dir + mkdir text_generation_server/pb || true + python -m grpc_tools.protoc -I$(PROTO_PATH) --python_out=text_generation_server/pb \ + --grpc_python_out=text_generation_server/pb --mypy_out=text_generation_server/pb $(PROTO_PATH)/generate.proto + find text_generation_server/pb/ -type f -name "*.py" -print0 -exec sed -i -e 's/^\(import.*pb2\)/from . \1/g' {} \; + touch text_generation_server/pb/__init__.py + +install: gen-server + pip install pip --upgrade + pip install --no-deps -r requirements.txt + pip install -e "." + +run-dev: + SAFETENSORS_FAST_GPU=1 python -m torch.distributed.run --nproc_per_node=2 text_generation_server/cli.py serve bigscience/bloom-560m --sharded + +install-poetry: + curl -sSL https://install.python-poetry.org | python3 - + +update-lock: + rm poetry.lock + poetry lock --no-update + +export-requirements: + poetry export -o requirements.txt --without-hashes diff --git a/backends/gaudi/server/Makefile-awq b/backends/gaudi/server/Makefile-awq new file mode 100644 index 000000000..4e074a133 --- /dev/null +++ b/backends/gaudi/server/Makefile-awq @@ -0,0 +1,15 @@ +# Fork that adds only the correct stream to this kernel in order +# to make cuda graphs work. 
+awq_commit := bd1dc2d5254345cc76ab71894651fb821275bdd4 + +awq: + rm -rf llm-awq + git clone https://github.com/huggingface/llm-awq + +build-awq: awq + cd llm-awq/ && git fetch && git checkout $(awq_commit) + cd llm-awq/awq/kernels && python setup.py build + +install-awq: build-awq + pip uninstall awq_inference_engine -y || true + cd llm-awq/awq/kernels && python setup.py install diff --git a/backends/gaudi/server/Makefile-eetq b/backends/gaudi/server/Makefile-eetq new file mode 100644 index 000000000..726e47b57 --- /dev/null +++ b/backends/gaudi/server/Makefile-eetq @@ -0,0 +1,13 @@ +eetq_commit := 1657b1504faa359e2ce0ac02999439d7ac8c74c0 + +eetq: + # Clone eetq + pip install packaging + git clone https://github.com/NetEase-FuXi/EETQ.git eetq + +build-eetq: eetq + cd eetq && git fetch && git checkout $(eetq_commit) && git submodule update --init --recursive + cd eetq && python setup.py build + +install-eetq: build-eetq + cd eetq && python setup.py install diff --git a/backends/gaudi/server/Makefile-fbgemm b/backends/gaudi/server/Makefile-fbgemm new file mode 100644 index 000000000..3b8061a1f --- /dev/null +++ b/backends/gaudi/server/Makefile-fbgemm @@ -0,0 +1,15 @@ +fbgemm_commit := v0.8.0 + +build-fbgemm: + @if [ ! -d "fbgemm" ]; then \ + git clone https://github.com/pytorch/FBGEMM.git fbgemm; \ + fi + cd fbgemm && git fetch && git checkout $(fbgemm_commit) && \ + git submodule update --init --recursive && \ + cd fbgemm_gpu && \ + pip install -r requirements.txt && \ + CUDA_ARCH_LIST="8.0;9.0a" NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90a,code=sm_90a" TORCH_CUDA_ARCH_LIST="8.0;9.0a" python setup.py --package_variant genai build + +install-fbgemm: build-fbgemm + cd fbgemm/fbgemm_gpu && \ + CUDA_ARCH_LIST="8.0;9.0a" NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90a,code=sm_90a" TORCH_CUDA_ARCH_LIST="8.0;9.0a" python setup.py --package_variant genai install diff --git a/backends/gaudi/server/Makefile-flash-att b/backends/gaudi/server/Makefile-flash-att new file mode 100644 index 000000000..29e75bc48 --- /dev/null +++ b/backends/gaudi/server/Makefile-flash-att @@ -0,0 +1,12 @@ +flash_att_commit := 3a9bfd076f98746c73362328958dbc68d145fbec + +build-flash-attention: + if [ ! -d 'flash-attention' ]; then \ + pip install -U packaging ninja --no-cache-dir && \ + git clone https://github.com/HazyResearch/flash-attention.git; \ + fi + cd flash-attention && git fetch && git checkout $(flash_att_commit) && \ + MAX_JOBS=8 python setup.py build && cd csrc/layer_norm && python setup.py build && cd ../rotary && python setup.py build + +install-flash-attention: build-flash-attention + cd flash-attention && git checkout $(flash_att_commit) && MAX_JOBS=8 python setup.py install && cd csrc/layer_norm && python setup.py install && cd ../rotary && python setup.py install diff --git a/backends/gaudi/server/Makefile-flash-att-v2 b/backends/gaudi/server/Makefile-flash-att-v2 new file mode 100644 index 000000000..a9cdf7822 --- /dev/null +++ b/backends/gaudi/server/Makefile-flash-att-v2 @@ -0,0 +1,21 @@ +flash_att_v2_commit_cuda := v2.6.1 +flash_att_v2_commit_rocm := 2092111b9f975b3347c652ff7fabd431130256c4 + +build-flash-attention-v2-cuda: + pip install -U packaging wheel + pip install flash-attn==$(flash_att_v2_commit_cuda) + +install-flash-attention-v2-cuda: build-flash-attention-v2-cuda + echo "Flash v2 installed" + +build-flash-attention-v2-rocm: + if [ ! 
-d 'flash-attention-v2' ]; then \ + pip install -U packaging ninja --no-cache-dir && \ + git clone https://github.com/mht-sharma/flash-attention.git flash-attention-v2 && \ + cd flash-attention-v2 && git fetch && git checkout $(flash_att_v2_commit_rocm) && \ + git submodule update --init --recursive && GPU_ARCHS="gfx90a;gfx942" PYTORCH_ROCM_ARCH="gfx90a;gfx942" python setup.py build; \ + fi + +install-flash-attention-v2-rocm: build-flash-attention-v2-rocm + cd flash-attention-v2 && \ + GPU_ARCHS="gfx90a;gfx942" PYTORCH_ROCM_ARCH="gfx90a;gfx942" python setup.py install diff --git a/backends/gaudi/server/Makefile-selective-scan b/backends/gaudi/server/Makefile-selective-scan new file mode 100644 index 000000000..b93b517d6 --- /dev/null +++ b/backends/gaudi/server/Makefile-selective-scan @@ -0,0 +1,28 @@ +selective_scan_commit := 2a3704fd47ba817b415627b06fd796b971fdc137 + +causal-conv1d: + rm -rf causal-conv1d + git clone https://github.com/Dao-AILab/causal-conv1d.git + +build-causal-conv1d: causal-conv1d + cd causal-conv1d/ && git checkout v1.1.1 # known latest working version tag + cd causal-conv1d/ && CAUSAL_CONV1D_FORCE_BUILD=TRUE python setup.py build + +install-causal-conv1d: build-causal-conv1d + pip uninstall causal-conv1d -y || true + cd causal-conv1d/ && pip install . + +# selective-scan dependends on causal-conv1d +selective-scan: + rm -rf mamba + git clone https://github.com/state-spaces/mamba.git mamba + +build-selective-scan: selective-scan + cd mamba/ && git fetch && git checkout $(selective_scan_commit) + cd mamba && python setup.py build + +install-selective-scan: install-causal-conv1d build-selective-scan + pip uninstall selective-scan-cuda -y || true + cd mamba && pip install . + +build-all: build-causal-conv1d build-selective-scan diff --git a/backends/gaudi/server/Makefile-vllm b/backends/gaudi/server/Makefile-vllm new file mode 100644 index 000000000..18dcc4a0c --- /dev/null +++ b/backends/gaudi/server/Makefile-vllm @@ -0,0 +1,23 @@ +commit_cuda := d243e9dc7e2c9c2e36a4150ec8e64809cb55c01b +commit_rocm := 4e0929e6e4fa0a3d09d358715c288020ea9dc247 +build-vllm-cuda: + if [ ! -d 'vllm' ]; then \ + pip install -U ninja packaging --no-cache-dir && \ + git clone https://github.com/Narsil/vllm.git vllm; \ + fi + cd vllm && git fetch origin && git checkout $(commit_cuda) && python setup.py build + +install-vllm-cuda: build-vllm-cuda + cd vllm && git fetch origin && git checkout $(commit_cuda) && pip install -e . + +build-vllm-rocm: + if [ ! -d 'vllm' ]; then \ + pip install -U ninja packaging --no-cache-dir && \ + git clone https://github.com/mht-sharma/vllm.git vllm; \ + fi + cd vllm && git fetch && git checkout $(commit_rocm) && \ + PYTORCH_ROCM_ARCH="gfx90a;gfx942" python setup.py build + +install-vllm-rocm: build-vllm-rocm + cd vllm && git fetch && git checkout $(commit_rocm) && \ + PYTORCH_ROCM_ARCH="gfx90a;gfx942" pip install -e . 
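The per-kernel Makefiles above are all pulled into the top-level `Makefile` of `backends/gaudi/server/` via `include` (with the exception of `Makefile-fbgemm`, which is shipped but not included), so their targets can be driven directly from that directory. A minimal usage sketch follows; it is illustrative only and assumes an environment in which the pinned commits actually build:

```bash
# Illustrative sketch: driving the included kernel Makefiles from the server directory.
cd backends/gaudi/server

# gRPC stubs + the server itself (`install` depends on `gen-server`)
make install

# Optional kernels, each provided by one of the included Makefile-* files
make install-flash-attention            # Makefile-flash-att
make install-flash-attention-v2-cuda    # Makefile-flash-att-v2 (CUDA variant)
make install-vllm-cuda                  # Makefile-vllm (CUDA variant)
make install-awq                        # Makefile-awq
make install-eetq                       # Makefile-eetq
make install-selective-scan             # Makefile-selective-scan (builds causal-conv1d first)
```

Each kernel is pinned to a specific commit or tag through the `*_commit` variables, so rebuilding from a clean checkout stays reproducible as long as the upstream repositories keep those revisions available.
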
diff --git a/backends/gaudi/server/README.md b/backends/gaudi/server/README.md new file mode 100644 index 000000000..b8208f9ea --- /dev/null +++ b/backends/gaudi/server/README.md @@ -0,0 +1,15 @@ +# Text Generation Inference Python gRPC Server + +A Python gRPC server for Text Generation Inference + +## Install + +```shell +make install +``` + +## Run + +```shell +make run-dev +``` diff --git a/backends/gaudi/server/dill-0.3.7-patch.sh b/backends/gaudi/server/dill-0.3.7-patch.sh new file mode 100644 index 000000000..5efd6c54b --- /dev/null +++ b/backends/gaudi/server/dill-0.3.7-patch.sh @@ -0,0 +1,91 @@ +#!/bin/bash +git clone -b dill-0.3.7 https://github.com/uqfoundation/dill.git +pushd dill +cat < dill-0.3.7.patch +diff --git a/dill/_dill.py b/dill/_dill.py +index d0cf543..f6eb662 100644 +--- a/dill/_dill.py ++++ b/dill/_dill.py +@@ -69,7 +69,15 @@ TypeType = type # 'new-style' classes #XXX: unregistered + XRangeType = range + from types import MappingProxyType as DictProxyType, new_class + from pickle import DEFAULT_PROTOCOL, HIGHEST_PROTOCOL, PickleError, PicklingError, UnpicklingError +-import __main__ as _main_module ++class _LazyMainModule(object): ++ _module = None ++ @property ++ def module(self): ++ if self._module is None: ++ import __main__ as _m_module ++ self._module = _m_module ++ return self._module ++_main_module = _LazyMainModule() + import marshal + import gc + # import zlib +@@ -353,7 +361,7 @@ class Pickler(StockPickler): + _fmode = kwds.pop('fmode', None) + _recurse = kwds.pop('recurse', None) + StockPickler.__init__(self, file, *args, **kwds) +- self._main = _main_module ++ self._main = _main_module.module + self._diff_cache = {} + self._byref = settings['byref'] if _byref is None else _byref + self._strictio = False #_strictio +@@ -435,12 +443,12 @@ class Unpickler(StockUnpickler): + settings = Pickler.settings + _ignore = kwds.pop('ignore', None) + StockUnpickler.__init__(self, *args, **kwds) +- self._main = _main_module ++ self._main = _main_module.module + self._ignore = settings['ignore'] if _ignore is None else _ignore + + def load(self): #NOTE: if settings change, need to update attributes + obj = StockUnpickler.load(self) +- if type(obj).__module__ == getattr(_main_module, '__name__', '__main__'): ++ if type(obj).__module__ == getattr(self._main, '__name__', '__main__'): + if not self._ignore: + # point obj class to main + try: obj.__class__ = getattr(self._main, type(obj).__name__) +@@ -1194,11 +1202,11 @@ def save_module_dict(pickler, obj): + logger.trace(pickler, "D1: %s", _repr_dict(obj)) # obj + pickler.write(bytes('c__builtin__\n__main__\n', 'UTF-8')) + logger.trace(pickler, "# D1") +- elif (not is_dill(pickler, child=False)) and (obj == _main_module.__dict__): ++ elif (not is_dill(pickler, child=False)) and (obj == _main_module.module.__dict__): + logger.trace(pickler, "D3: %s", _repr_dict(obj)) # obj + pickler.write(bytes('c__main__\n__dict__\n', 'UTF-8')) #XXX: works in general? 
+ logger.trace(pickler, "# D3") +- elif '__name__' in obj and obj != _main_module.__dict__ \\ ++ elif '__name__' in obj and obj != _main_module.module.__dict__ \\ + and type(obj['__name__']) is str \\ + and obj is getattr(_import_module(obj['__name__'],True), '__dict__', None): + logger.trace(pickler, "D4: %s", _repr_dict(obj)) # obj +diff --git a/dill/session.py b/dill/session.py +index 74234ab..1be8d89 100644 +--- a/dill/session.py ++++ b/dill/session.py +@@ -233,7 +233,7 @@ def dump_module( + protocol = settings['protocol'] + main = module + if main is None: +- main = _main_module ++ main = _main_module.module + elif isinstance(main, str): + main = _import_module(main) + if not isinstance(main, ModuleType): +@@ -501,7 +501,7 @@ def load_module( + pass + assert loaded is main + _restore_modules(unpickler, main) +- if main is _main_module or main is module: ++ if main is _main_module.module or main is module: + return None + else: + return main + +EOF +git apply dill-0.3.7.patch +python -m pip install . +popd +rm -fr dill diff --git a/backends/gaudi/server/dill-0.3.8-patch.sh b/backends/gaudi/server/dill-0.3.8-patch.sh new file mode 100644 index 000000000..414790e7b --- /dev/null +++ b/backends/gaudi/server/dill-0.3.8-patch.sh @@ -0,0 +1,91 @@ +#!/bin/bash +git clone -b 0.3.8 https://github.com/uqfoundation/dill.git +pushd dill +cat < dill-0.3.8.patch +diff --git a/dill/_dill.py b/dill/_dill.py +index d42432f..1d251e6 100644 +--- a/dill/_dill.py ++++ b/dill/_dill.py +@@ -69,7 +69,15 @@ TypeType = type # 'new-style' classes #XXX: unregistered + XRangeType = range + from types import MappingProxyType as DictProxyType, new_class + from pickle import DEFAULT_PROTOCOL, HIGHEST_PROTOCOL, PickleError, PicklingError, UnpicklingError +-import __main__ as _main_module ++class _LazyMainModule(object): ++ _module = None ++ @property ++ def module(self): ++ if self._module is None: ++ import __main__ as _m_module ++ self._module = _m_module ++ return self._module ++_main_module = _LazyMainModule() + import marshal + import gc + # import zlib +@@ -355,7 +363,7 @@ class Pickler(StockPickler): + _fmode = kwds.pop('fmode', None) + _recurse = kwds.pop('recurse', None) + StockPickler.__init__(self, file, *args, **kwds) +- self._main = _main_module ++ self._main = _main_module.module + self._diff_cache = {} + self._byref = settings['byref'] if _byref is None else _byref + self._strictio = False #_strictio +@@ -437,12 +445,12 @@ class Unpickler(StockUnpickler): + settings = Pickler.settings + _ignore = kwds.pop('ignore', None) + StockUnpickler.__init__(self, *args, **kwds) +- self._main = _main_module ++ self._main = _main_module.module + self._ignore = settings['ignore'] if _ignore is None else _ignore + + def load(self): #NOTE: if settings change, need to update attributes + obj = StockUnpickler.load(self) +- if type(obj).__module__ == getattr(_main_module, '__name__', '__main__'): ++ if type(obj).__module__ == getattr(self._main, '__name__', '__main__'): + if not self._ignore: + # point obj class to main + try: obj.__class__ = getattr(self._main, type(obj).__name__) +@@ -1199,11 +1207,11 @@ def save_module_dict(pickler, obj): + logger.trace(pickler, "D1: %s", _repr_dict(obj)) # obj + pickler.write(bytes('c__builtin__\n__main__\n', 'UTF-8')) + logger.trace(pickler, "# D1") +- elif (not is_dill(pickler, child=False)) and (obj == _main_module.__dict__): ++ elif (not is_dill(pickler, child=False)) and (obj == _main_module.module.__dict__): + logger.trace(pickler, "D3: %s", _repr_dict(obj)) # obj + 
pickler.write(bytes('c__main__\n__dict__\n', 'UTF-8')) #XXX: works in general? + logger.trace(pickler, "# D3") +- elif '__name__' in obj and obj != _main_module.__dict__ \\ ++ elif '__name__' in obj and obj != _main_module.module.__dict__ \\ + and type(obj['__name__']) is str \\ + and obj is getattr(_import_module(obj['__name__'],True), '__dict__', None): + logger.trace(pickler, "D4: %s", _repr_dict(obj)) # obj +diff --git a/dill/session.py b/dill/session.py +index e91068a..a921b43 100644 +--- a/dill/session.py ++++ b/dill/session.py +@@ -233,7 +233,7 @@ def dump_module( + protocol = settings['protocol'] + main = module + if main is None: +- main = _main_module ++ main = _main_module.module + elif isinstance(main, str): + main = _import_module(main) + if not isinstance(main, ModuleType): +@@ -501,7 +501,7 @@ def load_module( + pass + assert loaded is main + _restore_modules(unpickler, main) +- if main is _main_module or main is module: ++ if main is _main_module.module or main is module: + return None + else: + return main + +EOF +git apply dill-0.3.8.patch +python -m pip install . +popd +rm -fr dill diff --git a/backends/gaudi/server/poetry.lock b/backends/gaudi/server/poetry.lock new file mode 100644 index 000000000..1987c3b31 --- /dev/null +++ b/backends/gaudi/server/poetry.lock @@ -0,0 +1,4019 @@ +# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand. + +[[package]] +name = "accelerate" +version = "0.29.3" +description = "Accelerate" +optional = true +python-versions = ">=3.8.0" +files = [ + {file = "accelerate-0.29.3-py3-none-any.whl", hash = "sha256:99d633d4b6126817c5e554487406748be95c8d1d1e659dd2fd60657e35f532dd"}, + {file = "accelerate-0.29.3.tar.gz", hash = "sha256:1a5a845b06b24b41736b219b2b20fd021ca5dff4070a252445fd6de736e347ac"}, +] + +[package.dependencies] +huggingface-hub = "*" +numpy = ">=1.17" +packaging = ">=20.0" +psutil = "*" +pyyaml = "*" +safetensors = ">=0.3.1" +torch = ">=1.10.0" + +[package.extras] +dev = ["bitsandbytes", "black (>=23.1,<24.0)", "datasets", "deepspeed", "evaluate", "hf-doc-builder (>=0.3.0)", "parameterized", "pytest (>=7.2.0,<=8.0.0)", "pytest-subtests", "pytest-xdist", "rich", "ruff (>=0.2.1,<0.3.0)", "scikit-learn", "scipy", "timm", "torchpippy (>=0.2.0)", "tqdm", "transformers"] +quality = ["black (>=23.1,<24.0)", "hf-doc-builder (>=0.3.0)", "ruff (>=0.2.1,<0.3.0)"] +rich = ["rich"] +sagemaker = ["sagemaker"] +test-dev = ["bitsandbytes", "datasets", "deepspeed", "evaluate", "scikit-learn", "scipy", "timm", "torchpippy (>=0.2.0)", "tqdm", "transformers"] +test-prod = ["parameterized", "pytest (>=7.2.0,<=8.0.0)", "pytest-subtests", "pytest-xdist"] +test-trackers = ["comet-ml", "dvclive", "tensorboard", "wandb"] +testing = ["bitsandbytes", "datasets", "deepspeed", "evaluate", "parameterized", "pytest (>=7.2.0,<=8.0.0)", "pytest-subtests", "pytest-xdist", "scikit-learn", "scipy", "timm", "torchpippy (>=0.2.0)", "tqdm", "transformers"] + +[[package]] +name = "aiohappyeyeballs" +version = "2.4.3" +description = "Happy Eyeballs for asyncio" +optional = true +python-versions = ">=3.8" +files = [ + {file = "aiohappyeyeballs-2.4.3-py3-none-any.whl", hash = "sha256:8a7a83727b2756f394ab2895ea0765a0a8c475e3c71e98d43d76f22b4b435572"}, + {file = "aiohappyeyeballs-2.4.3.tar.gz", hash = "sha256:75cf88a15106a5002a8eb1dab212525c00d1f4c0fa96e551c9fbe6f09a621586"}, +] + +[[package]] +name = "aiohttp" +version = "3.10.10" +description = "Async http client/server framework (asyncio)" +optional = true +python-versions = ">=3.8" 
+files = [ + {file = "aiohttp-3.10.10-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:be7443669ae9c016b71f402e43208e13ddf00912f47f623ee5994e12fc7d4b3f"}, + {file = "aiohttp-3.10.10-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:7b06b7843929e41a94ea09eb1ce3927865387e3e23ebe108e0d0d09b08d25be9"}, + {file = "aiohttp-3.10.10-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:333cf6cf8e65f6a1e06e9eb3e643a0c515bb850d470902274239fea02033e9a8"}, + {file = "aiohttp-3.10.10-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:274cfa632350225ce3fdeb318c23b4a10ec25c0e2c880eff951a3842cf358ac1"}, + {file = "aiohttp-3.10.10-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d9e5e4a85bdb56d224f412d9c98ae4cbd032cc4f3161818f692cd81766eee65a"}, + {file = "aiohttp-3.10.10-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2b606353da03edcc71130b52388d25f9a30a126e04caef1fd637e31683033abd"}, + {file = "aiohttp-3.10.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ab5a5a0c7a7991d90446a198689c0535be89bbd6b410a1f9a66688f0880ec026"}, + {file = "aiohttp-3.10.10-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:578a4b875af3e0daaf1ac6fa983d93e0bbfec3ead753b6d6f33d467100cdc67b"}, + {file = "aiohttp-3.10.10-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:8105fd8a890df77b76dd3054cddf01a879fc13e8af576805d667e0fa0224c35d"}, + {file = "aiohttp-3.10.10-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:3bcd391d083f636c06a68715e69467963d1f9600f85ef556ea82e9ef25f043f7"}, + {file = "aiohttp-3.10.10-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:fbc6264158392bad9df19537e872d476f7c57adf718944cc1e4495cbabf38e2a"}, + {file = "aiohttp-3.10.10-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:e48d5021a84d341bcaf95c8460b152cfbad770d28e5fe14a768988c461b821bc"}, + {file = "aiohttp-3.10.10-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:2609e9ab08474702cc67b7702dbb8a80e392c54613ebe80db7e8dbdb79837c68"}, + {file = "aiohttp-3.10.10-cp310-cp310-win32.whl", hash = "sha256:84afcdea18eda514c25bc68b9af2a2b1adea7c08899175a51fe7c4fb6d551257"}, + {file = "aiohttp-3.10.10-cp310-cp310-win_amd64.whl", hash = "sha256:9c72109213eb9d3874f7ac8c0c5fa90e072d678e117d9061c06e30c85b4cf0e6"}, + {file = "aiohttp-3.10.10-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:c30a0eafc89d28e7f959281b58198a9fa5e99405f716c0289b7892ca345fe45f"}, + {file = "aiohttp-3.10.10-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:258c5dd01afc10015866114e210fb7365f0d02d9d059c3c3415382ab633fcbcb"}, + {file = "aiohttp-3.10.10-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:15ecd889a709b0080f02721255b3f80bb261c2293d3c748151274dfea93ac871"}, + {file = "aiohttp-3.10.10-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f3935f82f6f4a3820270842e90456ebad3af15810cf65932bd24da4463bc0a4c"}, + {file = "aiohttp-3.10.10-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:413251f6fcf552a33c981c4709a6bba37b12710982fec8e558ae944bfb2abd38"}, + {file = "aiohttp-3.10.10-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d1720b4f14c78a3089562b8875b53e36b51c97c51adc53325a69b79b4b48ebcb"}, + {file = "aiohttp-3.10.10-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:679abe5d3858b33c2cf74faec299fda60ea9de62916e8b67e625d65bf069a3b7"}, + {file = 
"aiohttp-3.10.10-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:79019094f87c9fb44f8d769e41dbb664d6e8fcfd62f665ccce36762deaa0e911"}, + {file = "aiohttp-3.10.10-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:fe2fb38c2ed905a2582948e2de560675e9dfbee94c6d5ccdb1301c6d0a5bf092"}, + {file = "aiohttp-3.10.10-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:a3f00003de6eba42d6e94fabb4125600d6e484846dbf90ea8e48a800430cc142"}, + {file = "aiohttp-3.10.10-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:1bbb122c557a16fafc10354b9d99ebf2f2808a660d78202f10ba9d50786384b9"}, + {file = "aiohttp-3.10.10-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:30ca7c3b94708a9d7ae76ff281b2f47d8eaf2579cd05971b5dc681db8caac6e1"}, + {file = "aiohttp-3.10.10-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:df9270660711670e68803107d55c2b5949c2e0f2e4896da176e1ecfc068b974a"}, + {file = "aiohttp-3.10.10-cp311-cp311-win32.whl", hash = "sha256:aafc8ee9b742ce75044ae9a4d3e60e3d918d15a4c2e08a6c3c3e38fa59b92d94"}, + {file = "aiohttp-3.10.10-cp311-cp311-win_amd64.whl", hash = "sha256:362f641f9071e5f3ee6f8e7d37d5ed0d95aae656adf4ef578313ee585b585959"}, + {file = "aiohttp-3.10.10-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:9294bbb581f92770e6ed5c19559e1e99255e4ca604a22c5c6397b2f9dd3ee42c"}, + {file = "aiohttp-3.10.10-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:a8fa23fe62c436ccf23ff930149c047f060c7126eae3ccea005f0483f27b2e28"}, + {file = "aiohttp-3.10.10-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:5c6a5b8c7926ba5d8545c7dd22961a107526562da31a7a32fa2456baf040939f"}, + {file = "aiohttp-3.10.10-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:007ec22fbc573e5eb2fb7dec4198ef8f6bf2fe4ce20020798b2eb5d0abda6138"}, + {file = "aiohttp-3.10.10-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9627cc1a10c8c409b5822a92d57a77f383b554463d1884008e051c32ab1b3742"}, + {file = "aiohttp-3.10.10-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:50edbcad60d8f0e3eccc68da67f37268b5144ecc34d59f27a02f9611c1d4eec7"}, + {file = "aiohttp-3.10.10-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a45d85cf20b5e0d0aa5a8dca27cce8eddef3292bc29d72dcad1641f4ed50aa16"}, + {file = "aiohttp-3.10.10-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0b00807e2605f16e1e198f33a53ce3c4523114059b0c09c337209ae55e3823a8"}, + {file = "aiohttp-3.10.10-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:f2d4324a98062be0525d16f768a03e0bbb3b9fe301ceee99611dc9a7953124e6"}, + {file = "aiohttp-3.10.10-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:438cd072f75bb6612f2aca29f8bd7cdf6e35e8f160bc312e49fbecab77c99e3a"}, + {file = "aiohttp-3.10.10-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:baa42524a82f75303f714108fea528ccacf0386af429b69fff141ffef1c534f9"}, + {file = "aiohttp-3.10.10-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:a7d8d14fe962153fc681f6366bdec33d4356f98a3e3567782aac1b6e0e40109a"}, + {file = "aiohttp-3.10.10-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:c1277cd707c465cd09572a774559a3cc7c7a28802eb3a2a9472588f062097205"}, + {file = "aiohttp-3.10.10-cp312-cp312-win32.whl", hash = "sha256:59bb3c54aa420521dc4ce3cc2c3fe2ad82adf7b09403fa1f48ae45c0cbde6628"}, + {file = "aiohttp-3.10.10-cp312-cp312-win_amd64.whl", hash = 
"sha256:0e1b370d8007c4ae31ee6db7f9a2fe801a42b146cec80a86766e7ad5c4a259cf"}, + {file = "aiohttp-3.10.10-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:ad7593bb24b2ab09e65e8a1d385606f0f47c65b5a2ae6c551db67d6653e78c28"}, + {file = "aiohttp-3.10.10-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:1eb89d3d29adaf533588f209768a9c02e44e4baf832b08118749c5fad191781d"}, + {file = "aiohttp-3.10.10-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:3fe407bf93533a6fa82dece0e74dbcaaf5d684e5a51862887f9eaebe6372cd79"}, + {file = "aiohttp-3.10.10-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:50aed5155f819873d23520919e16703fc8925e509abbb1a1491b0087d1cd969e"}, + {file = "aiohttp-3.10.10-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4f05e9727ce409358baa615dbeb9b969db94324a79b5a5cea45d39bdb01d82e6"}, + {file = "aiohttp-3.10.10-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3dffb610a30d643983aeb185ce134f97f290f8935f0abccdd32c77bed9388b42"}, + {file = "aiohttp-3.10.10-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aa6658732517ddabe22c9036479eabce6036655ba87a0224c612e1ae6af2087e"}, + {file = "aiohttp-3.10.10-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:741a46d58677d8c733175d7e5aa618d277cd9d880301a380fd296975a9cdd7bc"}, + {file = "aiohttp-3.10.10-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:e00e3505cd80440f6c98c6d69269dcc2a119f86ad0a9fd70bccc59504bebd68a"}, + {file = "aiohttp-3.10.10-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:ffe595f10566f8276b76dc3a11ae4bb7eba1aac8ddd75811736a15b0d5311414"}, + {file = "aiohttp-3.10.10-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:bdfcf6443637c148c4e1a20c48c566aa694fa5e288d34b20fcdc58507882fed3"}, + {file = "aiohttp-3.10.10-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:d183cf9c797a5291e8301790ed6d053480ed94070637bfaad914dd38b0981f67"}, + {file = "aiohttp-3.10.10-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:77abf6665ae54000b98b3c742bc6ea1d1fb31c394bcabf8b5d2c1ac3ebfe7f3b"}, + {file = "aiohttp-3.10.10-cp313-cp313-win32.whl", hash = "sha256:4470c73c12cd9109db8277287d11f9dd98f77fc54155fc71a7738a83ffcc8ea8"}, + {file = "aiohttp-3.10.10-cp313-cp313-win_amd64.whl", hash = "sha256:486f7aabfa292719a2753c016cc3a8f8172965cabb3ea2e7f7436c7f5a22a151"}, + {file = "aiohttp-3.10.10-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:1b66ccafef7336a1e1f0e389901f60c1d920102315a56df85e49552308fc0486"}, + {file = "aiohttp-3.10.10-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:acd48d5b80ee80f9432a165c0ac8cbf9253eaddb6113269a5e18699b33958dbb"}, + {file = "aiohttp-3.10.10-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:3455522392fb15ff549d92fbf4b73b559d5e43dc522588f7eb3e54c3f38beee7"}, + {file = "aiohttp-3.10.10-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:45c3b868724137f713a38376fef8120c166d1eadd50da1855c112fe97954aed8"}, + {file = "aiohttp-3.10.10-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:da1dee8948d2137bb51fbb8a53cce6b1bcc86003c6b42565f008438b806cccd8"}, + {file = "aiohttp-3.10.10-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c5ce2ce7c997e1971b7184ee37deb6ea9922ef5163c6ee5aa3c274b05f9e12fa"}, + {file = "aiohttp-3.10.10-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:28529e08fde6f12eba8677f5a8608500ed33c086f974de68cc65ab218713a59d"}, + {file = 
"aiohttp-3.10.10-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f7db54c7914cc99d901d93a34704833568d86c20925b2762f9fa779f9cd2e70f"}, + {file = "aiohttp-3.10.10-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:03a42ac7895406220124c88911ebee31ba8b2d24c98507f4a8bf826b2937c7f2"}, + {file = "aiohttp-3.10.10-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:7e338c0523d024fad378b376a79faff37fafb3c001872a618cde1d322400a572"}, + {file = "aiohttp-3.10.10-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:038f514fe39e235e9fef6717fbf944057bfa24f9b3db9ee551a7ecf584b5b480"}, + {file = "aiohttp-3.10.10-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:64f6c17757251e2b8d885d728b6433d9d970573586a78b78ba8929b0f41d045a"}, + {file = "aiohttp-3.10.10-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:93429602396f3383a797a2a70e5f1de5df8e35535d7806c9f91df06f297e109b"}, + {file = "aiohttp-3.10.10-cp38-cp38-win32.whl", hash = "sha256:c823bc3971c44ab93e611ab1a46b1eafeae474c0c844aff4b7474287b75fe49c"}, + {file = "aiohttp-3.10.10-cp38-cp38-win_amd64.whl", hash = "sha256:54ca74df1be3c7ca1cf7f4c971c79c2daf48d9aa65dea1a662ae18926f5bc8ce"}, + {file = "aiohttp-3.10.10-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:01948b1d570f83ee7bbf5a60ea2375a89dfb09fd419170e7f5af029510033d24"}, + {file = "aiohttp-3.10.10-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9fc1500fd2a952c5c8e3b29aaf7e3cc6e27e9cfc0a8819b3bce48cc1b849e4cc"}, + {file = "aiohttp-3.10.10-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:f614ab0c76397661b90b6851a030004dac502e48260ea10f2441abd2207fbcc7"}, + {file = "aiohttp-3.10.10-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:00819de9e45d42584bed046314c40ea7e9aea95411b38971082cad449392b08c"}, + {file = "aiohttp-3.10.10-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:05646ebe6b94cc93407b3bf34b9eb26c20722384d068eb7339de802154d61bc5"}, + {file = "aiohttp-3.10.10-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:998f3bd3cfc95e9424a6acd7840cbdd39e45bc09ef87533c006f94ac47296090"}, + {file = "aiohttp-3.10.10-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d9010c31cd6fa59438da4e58a7f19e4753f7f264300cd152e7f90d4602449762"}, + {file = "aiohttp-3.10.10-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7ea7ffc6d6d6f8a11e6f40091a1040995cdff02cfc9ba4c2f30a516cb2633554"}, + {file = "aiohttp-3.10.10-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:ef9c33cc5cbca35808f6c74be11eb7f5f6b14d2311be84a15b594bd3e58b5527"}, + {file = "aiohttp-3.10.10-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:ce0cdc074d540265bfeb31336e678b4e37316849d13b308607efa527e981f5c2"}, + {file = "aiohttp-3.10.10-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:597a079284b7ee65ee102bc3a6ea226a37d2b96d0418cc9047490f231dc09fe8"}, + {file = "aiohttp-3.10.10-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:7789050d9e5d0c309c706953e5e8876e38662d57d45f936902e176d19f1c58ab"}, + {file = "aiohttp-3.10.10-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:e7f8b04d83483577fd9200461b057c9f14ced334dcb053090cea1da9c8321a91"}, + {file = "aiohttp-3.10.10-cp39-cp39-win32.whl", hash = "sha256:c02a30b904282777d872266b87b20ed8cc0d1501855e27f831320f471d54d983"}, + {file = "aiohttp-3.10.10-cp39-cp39-win_amd64.whl", hash = "sha256:edfe3341033a6b53a5c522c802deb2079eee5cbfbb0af032a55064bd65c73a23"}, + {file = "aiohttp-3.10.10.tar.gz", hash = 
"sha256:0631dd7c9f0822cc61c88586ca76d5b5ada26538097d0f1df510b082bad3411a"}, +] + +[package.dependencies] +aiohappyeyeballs = ">=2.3.0" +aiosignal = ">=1.1.2" +async-timeout = {version = ">=4.0,<5.0", markers = "python_version < \"3.11\""} +attrs = ">=17.3.0" +frozenlist = ">=1.1.1" +multidict = ">=4.5,<7.0" +yarl = ">=1.12.0,<2.0" + +[package.extras] +speedups = ["Brotli", "aiodns (>=3.2.0)", "brotlicffi"] + +[[package]] +name = "aiosignal" +version = "1.3.1" +description = "aiosignal: a list of registered asynchronous callbacks" +optional = true +python-versions = ">=3.7" +files = [ + {file = "aiosignal-1.3.1-py3-none-any.whl", hash = "sha256:f8376fb07dd1e86a584e4fcdec80b36b7f81aac666ebc724e2c090300dd83b17"}, + {file = "aiosignal-1.3.1.tar.gz", hash = "sha256:54cd96e15e1649b75d6c87526a6ff0b6c1b0dd3459f43d9ca11d48c339b68cfc"}, +] + +[package.dependencies] +frozenlist = ">=1.1.0" + +[[package]] +name = "annotated-types" +version = "0.7.0" +description = "Reusable constraint types to use with typing.Annotated" +optional = true +python-versions = ">=3.8" +files = [ + {file = "annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53"}, + {file = "annotated_types-0.7.0.tar.gz", hash = "sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89"}, +] + +[[package]] +name = "async-timeout" +version = "4.0.3" +description = "Timeout context manager for asyncio programs" +optional = true +python-versions = ">=3.7" +files = [ + {file = "async-timeout-4.0.3.tar.gz", hash = "sha256:4640d96be84d82d02ed59ea2b7105a0f7b33abe8703703cd0ab0bf87c427522f"}, + {file = "async_timeout-4.0.3-py3-none-any.whl", hash = "sha256:7405140ff1230c310e51dc27b3145b9092d659ce68ff733fb0cefe3ee42be028"}, +] + +[[package]] +name = "attrs" +version = "24.2.0" +description = "Classes Without Boilerplate" +optional = true +python-versions = ">=3.7" +files = [ + {file = "attrs-24.2.0-py3-none-any.whl", hash = "sha256:81921eb96de3191c8258c199618104dd27ac608d9366f5e35d011eae1867ede2"}, + {file = "attrs-24.2.0.tar.gz", hash = "sha256:5cfb1b9148b5b086569baec03f20d7b6bf3bcacc9a42bebf87ffaaca362f6346"}, +] + +[package.extras] +benchmark = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-codspeed", "pytest-mypy-plugins", "pytest-xdist[psutil]"] +cov = ["cloudpickle", "coverage[toml] (>=5.3)", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] +dev = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pre-commit", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] +docs = ["cogapp", "furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib-towncrier", "towncrier (<24.7)"] +tests = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] +tests-mypy = ["mypy (>=1.11.1)", "pytest-mypy-plugins"] + +[[package]] +name = "bitsandbytes" +version = "0.43.3" +description = "k-bit optimizers and matrix multiplication routines." 
+optional = true +python-versions = "*" +files = [ + {file = "bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl", hash = "sha256:cc99507c352be0715098b2c7577b690dd158972dc4ea10c7495bac104c7c79f0"}, + {file = "bitsandbytes-0.43.3-py3-none-win_amd64.whl", hash = "sha256:257f6552f2144748a84e6c44e1f7a98f3da888f675ed74e18fd7f7eb13c6cafa"}, +] + +[package.dependencies] +numpy = "*" +torch = "*" + +[package.extras] +benchmark = ["matplotlib", "pandas"] +test = ["scipy"] + +[[package]] +name = "certifi" +version = "2024.8.30" +description = "Python package for providing Mozilla's CA Bundle." +optional = false +python-versions = ">=3.6" +files = [ + {file = "certifi-2024.8.30-py3-none-any.whl", hash = "sha256:922820b53db7a7257ffbda3f597266d435245903d80737e34f8a45ff3e3230d8"}, + {file = "certifi-2024.8.30.tar.gz", hash = "sha256:bec941d2aa8195e248a60b31ff9f0558284cf01a52591ceda73ea9afffd69fd9"}, +] + +[[package]] +name = "charset-normalizer" +version = "3.4.0" +description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." +optional = false +python-versions = ">=3.7.0" +files = [ + {file = "charset_normalizer-3.4.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:4f9fc98dad6c2eaa32fc3af1417d95b5e3d08aff968df0cd320066def971f9a6"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0de7b687289d3c1b3e8660d0741874abe7888100efe14bd0f9fd7141bcbda92b"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:5ed2e36c3e9b4f21dd9422f6893dec0abf2cca553af509b10cd630f878d3eb99"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:40d3ff7fc90b98c637bda91c89d51264a3dcf210cade3a2c6f838c7268d7a4ca"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1110e22af8ca26b90bd6364fe4c763329b0ebf1ee213ba32b68c73de5752323d"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:86f4e8cca779080f66ff4f191a685ced73d2f72d50216f7112185dc02b90b9b7"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7f683ddc7eedd742e2889d2bfb96d69573fde1d92fcb811979cdb7165bb9c7d3"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:27623ba66c183eca01bf9ff833875b459cad267aeeb044477fedac35e19ba907"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:f606a1881d2663630ea5b8ce2efe2111740df4b687bd78b34a8131baa007f79b"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:0b309d1747110feb25d7ed6b01afdec269c647d382c857ef4663bbe6ad95a912"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:136815f06a3ae311fae551c3df1f998a1ebd01ddd424aa5603a4336997629e95"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:14215b71a762336254351b00ec720a8e85cada43b987da5a042e4ce3e82bd68e"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:79983512b108e4a164b9c8d34de3992f76d48cadc9554c9e60b43f308988aabe"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-win32.whl", hash = "sha256:c94057af19bc953643a33581844649a7fdab902624d2eb739738a30e2b3e60fc"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-win_amd64.whl", hash = 
"sha256:55f56e2ebd4e3bc50442fbc0888c9d8c94e4e06a933804e2af3e89e2f9c1c749"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:0d99dd8ff461990f12d6e42c7347fd9ab2532fb70e9621ba520f9e8637161d7c"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c57516e58fd17d03ebe67e181a4e4e2ccab1168f8c2976c6a334d4f819fe5944"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:6dba5d19c4dfab08e58d5b36304b3f92f3bd5d42c1a3fa37b5ba5cdf6dfcbcee"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bf4475b82be41b07cc5e5ff94810e6a01f276e37c2d55571e3fe175e467a1a1c"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ce031db0408e487fd2775d745ce30a7cd2923667cf3b69d48d219f1d8f5ddeb6"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8ff4e7cdfdb1ab5698e675ca622e72d58a6fa2a8aa58195de0c0061288e6e3ea"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3710a9751938947e6327ea9f3ea6332a09bf0ba0c09cae9cb1f250bd1f1549bc"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:82357d85de703176b5587dbe6ade8ff67f9f69a41c0733cf2425378b49954de5"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:47334db71978b23ebcf3c0f9f5ee98b8d65992b65c9c4f2d34c2eaf5bcaf0594"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:8ce7fd6767a1cc5a92a639b391891bf1c268b03ec7e021c7d6d902285259685c"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:f1a2f519ae173b5b6a2c9d5fa3116ce16e48b3462c8b96dfdded11055e3d6365"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:63bc5c4ae26e4bc6be6469943b8253c0fd4e4186c43ad46e713ea61a0ba49129"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:bcb4f8ea87d03bc51ad04add8ceaf9b0f085ac045ab4d74e73bbc2dc033f0236"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-win32.whl", hash = "sha256:9ae4ef0b3f6b41bad6366fb0ea4fc1d7ed051528e113a60fa2a65a9abb5b1d99"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-win_amd64.whl", hash = "sha256:cee4373f4d3ad28f1ab6290684d8e2ebdb9e7a1b74fdc39e4c211995f77bec27"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:0713f3adb9d03d49d365b70b84775d0a0d18e4ab08d12bc46baa6132ba78aaf6"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:de7376c29d95d6719048c194a9cf1a1b0393fbe8488a22008610b0361d834ecf"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:4a51b48f42d9358460b78725283f04bddaf44a9358197b889657deba38f329db"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b295729485b06c1a0683af02a9e42d2caa9db04a373dc38a6a58cdd1e8abddf1"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ee803480535c44e7f5ad00788526da7d85525cfefaf8acf8ab9a310000be4b03"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:3d59d125ffbd6d552765510e3f31ed75ebac2c7470c7274195b9161a32350284"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8cda06946eac330cbe6598f77bb54e690b4ca93f593dee1568ad22b04f347c15"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:07afec21bbbbf8a5cc3651aa96b980afe2526e7f048fdfb7f1014d84acc8b6d8"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6b40e8d38afe634559e398cc32b1472f376a4099c75fe6299ae607e404c033b2"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:b8dcd239c743aa2f9c22ce674a145e0a25cb1566c495928440a181ca1ccf6719"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:84450ba661fb96e9fd67629b93d2941c871ca86fc38d835d19d4225ff946a631"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:44aeb140295a2f0659e113b31cfe92c9061622cadbc9e2a2f7b8ef6b1e29ef4b"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:1db4e7fefefd0f548d73e2e2e041f9df5c59e178b4c72fbac4cc6f535cfb1565"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-win32.whl", hash = "sha256:5726cf76c982532c1863fb64d8c6dd0e4c90b6ece9feb06c9f202417a31f7dd7"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-win_amd64.whl", hash = "sha256:b197e7094f232959f8f20541ead1d9862ac5ebea1d58e9849c1bf979255dfac9"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:dd4eda173a9fcccb5f2e2bd2a9f423d180194b1bf17cf59e3269899235b2a114"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e9e3c4c9e1ed40ea53acf11e2a386383c3304212c965773704e4603d589343ed"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:92a7e36b000bf022ef3dbb9c46bfe2d52c047d5e3f3343f43204263c5addc250"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:54b6a92d009cbe2fb11054ba694bc9e284dad30a26757b1e372a1fdddaf21920"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1ffd9493de4c922f2a38c2bf62b831dcec90ac673ed1ca182fe11b4d8e9f2a64"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:35c404d74c2926d0287fbd63ed5d27eb911eb9e4a3bb2c6d294f3cfd4a9e0c23"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4796efc4faf6b53a18e3d46343535caed491776a22af773f366534056c4e1fbc"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e7fdd52961feb4c96507aa649550ec2a0d527c086d284749b2f582f2d40a2e0d"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:92db3c28b5b2a273346bebb24857fda45601aef6ae1c011c0a997106581e8a88"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:ab973df98fc99ab39080bfb0eb3a925181454d7c3ac8a1e695fddfae696d9e90"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:4b67fdab07fdd3c10bb21edab3cbfe8cf5696f453afce75d815d9d7223fbe88b"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = 
"sha256:aa41e526a5d4a9dfcfbab0716c7e8a1b215abd3f3df5a45cf18a12721d31cb5d"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ffc519621dce0c767e96b9c53f09c5d215578e10b02c285809f76509a3931482"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-win32.whl", hash = "sha256:f19c1585933c82098c2a520f8ec1227f20e339e33aca8fa6f956f6691b784e67"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-win_amd64.whl", hash = "sha256:707b82d19e65c9bd28b81dde95249b07bf9f5b90ebe1ef17d9b57473f8a64b7b"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:dbe03226baf438ac4fda9e2d0715022fd579cb641c4cf639fa40d53b2fe6f3e2"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dd9a8bd8900e65504a305bf8ae6fa9fbc66de94178c420791d0293702fce2df7"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b8831399554b92b72af5932cdbbd4ddc55c55f631bb13ff8fe4e6536a06c5c51"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a14969b8691f7998e74663b77b4c36c0337cb1df552da83d5c9004a93afdb574"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dcaf7c1524c0542ee2fc82cc8ec337f7a9f7edee2532421ab200d2b920fc97cf"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:425c5f215d0eecee9a56cdb703203dda90423247421bf0d67125add85d0c4455"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:d5b054862739d276e09928de37c79ddeec42a6e1bfc55863be96a36ba22926f6"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-musllinux_1_2_i686.whl", hash = "sha256:f3e73a4255342d4eb26ef6df01e3962e73aa29baa3124a8e824c5d3364a65748"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-musllinux_1_2_ppc64le.whl", hash = "sha256:2f6c34da58ea9c1a9515621f4d9ac379871a8f21168ba1b5e09d74250de5ad62"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-musllinux_1_2_s390x.whl", hash = "sha256:f09cb5a7bbe1ecae6e87901a2eb23e0256bb524a79ccc53eb0b7629fbe7677c4"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:0099d79bdfcf5c1f0c2c72f91516702ebf8b0b8ddd8905f97a8aecf49712c621"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-win32.whl", hash = "sha256:9c98230f5042f4945f957d006edccc2af1e03ed5e37ce7c373f00a5a4daa6149"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-win_amd64.whl", hash = "sha256:62f60aebecfc7f4b82e3f639a7d1433a20ec32824db2199a11ad4f5e146ef5ee"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:af73657b7a68211996527dbfeffbb0864e043d270580c5aef06dc4b659a4b578"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:cab5d0b79d987c67f3b9e9c53f54a61360422a5a0bc075f43cab5621d530c3b6"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:9289fd5dddcf57bab41d044f1756550f9e7cf0c8e373b8cdf0ce8773dc4bd417"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6b493a043635eb376e50eedf7818f2f322eabbaa974e948bd8bdd29eb7ef2a51"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9fa2566ca27d67c86569e8c85297aaf413ffab85a8960500f12ea34ff98e4c41"}, + {file = 
"charset_normalizer-3.4.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a8e538f46104c815be19c975572d74afb53f29650ea2025bbfaef359d2de2f7f"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6fd30dc99682dc2c603c2b315bded2799019cea829f8bf57dc6b61efde6611c8"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2006769bd1640bdf4d5641c69a3d63b71b81445473cac5ded39740a226fa88ab"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:dc15e99b2d8a656f8e666854404f1ba54765871104e50c8e9813af8a7db07f12"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:ab2e5bef076f5a235c3774b4f4028a680432cded7cad37bba0fd90d64b187d19"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:4ec9dd88a5b71abfc74e9df5ebe7921c35cbb3b641181a531ca65cdb5e8e4dea"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:43193c5cda5d612f247172016c4bb71251c784d7a4d9314677186a838ad34858"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:aa693779a8b50cd97570e5a0f343538a8dbd3e496fa5dcb87e29406ad0299654"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-win32.whl", hash = "sha256:7706f5850360ac01d80c89bcef1640683cc12ed87f42579dab6c5d3ed6888613"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-win_amd64.whl", hash = "sha256:c3e446d253bd88f6377260d07c895816ebf33ffffd56c1c792b13bff9c3e1ade"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:980b4f289d1d90ca5efcf07958d3eb38ed9c0b7676bf2831a54d4f66f9c27dfa"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:f28f891ccd15c514a0981f3b9db9aa23d62fe1a99997512b0491d2ed323d229a"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a8aacce6e2e1edcb6ac625fb0f8c3a9570ccc7bfba1f63419b3769ccf6a00ed0"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bd7af3717683bea4c87acd8c0d3d5b44d56120b26fd3f8a692bdd2d5260c620a"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5ff2ed8194587faf56555927b3aa10e6fb69d931e33953943bc4f837dfee2242"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e91f541a85298cf35433bf66f3fab2a4a2cff05c127eeca4af174f6d497f0d4b"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:309a7de0a0ff3040acaebb35ec45d18db4b28232f21998851cfa709eeff49d62"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:285e96d9d53422efc0d7a17c60e59f37fbf3dfa942073f666db4ac71e8d726d0"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:5d447056e2ca60382d460a604b6302d8db69476fd2015c81e7c35417cfabe4cd"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:20587d20f557fe189b7947d8e7ec5afa110ccf72a3128d61a2a387c3313f46be"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:130272c698667a982a5d0e626851ceff662565379baf0ff2cc58067b81d4f11d"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = 
"sha256:ab22fbd9765e6954bc0bcff24c25ff71dcbfdb185fcdaca49e81bac68fe724d3"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:7782afc9b6b42200f7362858f9e73b1f8316afb276d316336c0ec3bd73312742"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-win32.whl", hash = "sha256:2de62e8801ddfff069cd5c504ce3bc9672b23266597d4e4f50eda28846c322f2"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-win_amd64.whl", hash = "sha256:95c3c157765b031331dd4db3c775e58deaee050a3042fcad72cbc4189d7c8dca"}, + {file = "charset_normalizer-3.4.0-py3-none-any.whl", hash = "sha256:fe9f97feb71aa9896b81973a7bbada8c49501dc73e58a10fcef6663af95e5079"}, + {file = "charset_normalizer-3.4.0.tar.gz", hash = "sha256:223217c3d4f82c3ac5e29032b3f1c2eb0fb591b72161f86d93f5719079dae93e"}, +] + +[[package]] +name = "click" +version = "8.1.7" +description = "Composable command line interface toolkit" +optional = false +python-versions = ">=3.7" +files = [ + {file = "click-8.1.7-py3-none-any.whl", hash = "sha256:ae74fb96c20a0277a1d615f1e4d73c8414f5a98db8b799a7931d1582f3390c28"}, + {file = "click-8.1.7.tar.gz", hash = "sha256:ca9853ad459e787e2192211578cc907e7594e294c7ccc834310722b41b9ca6de"}, +] + +[package.dependencies] +colorama = {version = "*", markers = "platform_system == \"Windows\""} + +[[package]] +name = "cloudpickle" +version = "3.1.0" +description = "Pickler class to extend the standard pickle.Pickler functionality" +optional = true +python-versions = ">=3.8" +files = [ + {file = "cloudpickle-3.1.0-py3-none-any.whl", hash = "sha256:fe11acda67f61aaaec473e3afe030feb131d78a43461b718185363384f1ba12e"}, + {file = "cloudpickle-3.1.0.tar.gz", hash = "sha256:81a929b6e3c7335c863c771d673d105f02efdb89dfaba0c90495d1c64796601b"}, +] + +[[package]] +name = "colorama" +version = "0.4.6" +description = "Cross-platform colored terminal text." 
+optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" +files = [ + {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, + {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, +] + +[[package]] +name = "datasets" +version = "2.14.4" +description = "HuggingFace community-driven open-source library of datasets" +optional = true +python-versions = ">=3.8.0" +files = [ + {file = "datasets-2.14.4-py3-none-any.whl", hash = "sha256:29336bd316a7d827ccd4da2236596279b20ca2ac78f64c04c9483da7cbc2459b"}, + {file = "datasets-2.14.4.tar.gz", hash = "sha256:ef29c2b5841de488cd343cfc26ab979bff77efa4d2285af51f1ad7db5c46a83b"}, +] + +[package.dependencies] +aiohttp = "*" +dill = ">=0.3.0,<0.3.8" +fsspec = {version = ">=2021.11.1", extras = ["http"]} +huggingface-hub = ">=0.14.0,<1.0.0" +multiprocess = "*" +numpy = ">=1.17" +packaging = "*" +pandas = "*" +pyarrow = ">=8.0.0" +pyyaml = ">=5.1" +requests = ">=2.19.0" +tqdm = ">=4.62.1" +xxhash = "*" + +[package.extras] +apache-beam = ["apache-beam (>=2.26.0,<2.44.0)"] +audio = ["librosa", "soundfile (>=0.12.1)"] +benchmarks = ["tensorflow (==2.12.0)", "torch (==2.0.1)", "transformers (==4.30.1)"] +dev = ["Pillow (>=6.2.1)", "absl-py", "apache-beam (>=2.26.0,<2.44.0)", "black (>=23.1,<24.0)", "elasticsearch (<8.0.0)", "faiss-cpu (>=1.6.4)", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "pyyaml (>=5.3.1)", "rarfile (>=4.0)", "ruff (>=0.0.241)", "s3fs", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "sqlalchemy (<2.0.0)", "tensorflow (>=2.2.0,!=2.6.0,!=2.6.1)", "tensorflow (>=2.3,!=2.6.0,!=2.6.1)", "tensorflow-macos", "tiktoken", "torch", "transformers", "zstandard"] +docs = ["s3fs", "tensorflow (>=2.2.0,!=2.6.0,!=2.6.1)", "tensorflow-macos", "torch", "transformers"] +jax = ["jax (>=0.2.8,!=0.3.2,<=0.3.25)", "jaxlib (>=0.1.65,<=0.3.25)"] +metrics-tests = ["Werkzeug (>=1.0.1)", "accelerate", "bert-score (>=0.3.6)", "jiwer", "langdetect", "mauve-text", "nltk", "requests-file (>=1.5.1)", "rouge-score", "sacrebleu", "sacremoses", "scikit-learn", "scipy", "sentencepiece", "seqeval", "six (>=1.15.0,<1.16.0)", "spacy (>=3.0.0)", "texttable (>=1.6.3)", "tldextract", "tldextract (>=3.1.0)", "toml (>=0.10.1)", "typer (<0.5.0)"] +quality = ["black (>=23.1,<24.0)", "pyyaml (>=5.3.1)", "ruff (>=0.0.241)"] +s3 = ["s3fs"] +tensorflow = ["tensorflow (>=2.2.0,!=2.6.0,!=2.6.1)", "tensorflow-macos"] +tensorflow-gpu = ["tensorflow-gpu (>=2.2.0,!=2.6.0,!=2.6.1)"] +tests = ["Pillow (>=6.2.1)", "absl-py", "apache-beam (>=2.26.0,<2.44.0)", "elasticsearch (<8.0.0)", "faiss-cpu (>=1.6.4)", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "sqlalchemy (<2.0.0)", "tensorflow (>=2.3,!=2.6.0,!=2.6.1)", "tensorflow-macos", "tiktoken", "torch", "transformers", "zstandard"] +torch = ["torch"] +vision = ["Pillow (>=6.2.1)"] + +[[package]] +name = "deprecated" +version = "1.2.14" +description = "Python @deprecated decorator to deprecate old python classes, functions or methods." 
+optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +files = [ + {file = "Deprecated-1.2.14-py2.py3-none-any.whl", hash = "sha256:6fac8b097794a90302bdbb17b9b815e732d3c4720583ff1b198499d78470466c"}, + {file = "Deprecated-1.2.14.tar.gz", hash = "sha256:e5323eb936458dccc2582dc6f9c322c852a775a27065ff2b0c4970b9d53d01b3"}, +] + +[package.dependencies] +wrapt = ">=1.10,<2" + +[package.extras] +dev = ["PyTest", "PyTest-Cov", "bump2version (<1)", "sphinx (<2)", "tox"] + +[[package]] +name = "dill" +version = "0.3.7" +description = "serialize all of Python" +optional = false +python-versions = ">=3.7" +files = [ + {file = "dill-0.3.7-py3-none-any.whl", hash = "sha256:76b122c08ef4ce2eedcd4d1abd8e641114bfc6c2867f49f3c41facf65bf19f5e"}, + {file = "dill-0.3.7.tar.gz", hash = "sha256:cc1c8b182eb3013e24bd475ff2e9295af86c1a38eb1aff128dac8962a9ce3c03"}, +] + +[package.extras] +graph = ["objgraph (>=1.7.2)"] + +[[package]] +name = "diskcache" +version = "5.6.3" +description = "Disk Cache -- Disk and file backed persistent cache." +optional = true +python-versions = ">=3" +files = [ + {file = "diskcache-5.6.3-py3-none-any.whl", hash = "sha256:5e31b2d5fbad117cc363ebaf6b689474db18a1f6438bc82358b024abd4c2ca19"}, + {file = "diskcache-5.6.3.tar.gz", hash = "sha256:2c3a3fa2743d8535d832ec61c2054a1641f41775aa7c556758a109941e33e4fc"}, +] + +[[package]] +name = "einops" +version = "0.6.1" +description = "A new flavour of deep learning operations" +optional = false +python-versions = ">=3.7" +files = [ + {file = "einops-0.6.1-py3-none-any.whl", hash = "sha256:99149e46cc808956b174932fe563d920db4d6e5dadb8c6ecdaa7483b7ef7cfc3"}, + {file = "einops-0.6.1.tar.gz", hash = "sha256:f95f8d00f4ded90dbc4b19b6f98b177332614b0357dde66997f3ae5d474dc8c8"}, +] + +[[package]] +name = "exceptiongroup" +version = "1.2.2" +description = "Backport of PEP 654 (exception groups)" +optional = false +python-versions = ">=3.7" +files = [ + {file = "exceptiongroup-1.2.2-py3-none-any.whl", hash = "sha256:3111b9d131c238bec2f8f516e123e14ba243563fb135d3fe885990585aa7795b"}, + {file = "exceptiongroup-1.2.2.tar.gz", hash = "sha256:47c2edf7c6738fafb49fd34290706d1a1a2f4d1c6df275526b62cbb4aa5393cc"}, +] + +[package.extras] +test = ["pytest (>=6)"] + +[[package]] +name = "filelock" +version = "3.16.1" +description = "A platform independent file lock." 
+optional = false +python-versions = ">=3.8" +files = [ + {file = "filelock-3.16.1-py3-none-any.whl", hash = "sha256:2082e5703d51fbf98ea75855d9d5527e33d8ff23099bec374a134febee6946b0"}, + {file = "filelock-3.16.1.tar.gz", hash = "sha256:c249fbfcd5db47e5e2d6d62198e565475ee65e4831e2561c8e313fa7eb961435"}, +] + +[package.extras] +docs = ["furo (>=2024.8.6)", "sphinx (>=8.0.2)", "sphinx-autodoc-typehints (>=2.4.1)"] +testing = ["covdefaults (>=2.3)", "coverage (>=7.6.1)", "diff-cover (>=9.2)", "pytest (>=8.3.3)", "pytest-asyncio (>=0.24)", "pytest-cov (>=5)", "pytest-mock (>=3.14)", "pytest-timeout (>=2.3.1)", "virtualenv (>=20.26.4)"] +typing = ["typing-extensions (>=4.12.2)"] + +[[package]] +name = "frozenlist" +version = "1.4.1" +description = "A list-like structure which implements collections.abc.MutableSequence" +optional = true +python-versions = ">=3.8" +files = [ + {file = "frozenlist-1.4.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:f9aa1878d1083b276b0196f2dfbe00c9b7e752475ed3b682025ff20c1c1f51ac"}, + {file = "frozenlist-1.4.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:29acab3f66f0f24674b7dc4736477bcd4bc3ad4b896f5f45379a67bce8b96868"}, + {file = "frozenlist-1.4.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:74fb4bee6880b529a0c6560885fce4dc95936920f9f20f53d99a213f7bf66776"}, + {file = "frozenlist-1.4.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:590344787a90ae57d62511dd7c736ed56b428f04cd8c161fcc5e7232c130c69a"}, + {file = "frozenlist-1.4.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:068b63f23b17df8569b7fdca5517edef76171cf3897eb68beb01341131fbd2ad"}, + {file = "frozenlist-1.4.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5c849d495bf5154cd8da18a9eb15db127d4dba2968d88831aff6f0331ea9bd4c"}, + {file = "frozenlist-1.4.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9750cc7fe1ae3b1611bb8cfc3f9ec11d532244235d75901fb6b8e42ce9229dfe"}, + {file = "frozenlist-1.4.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9b2de4cf0cdd5bd2dee4c4f63a653c61d2408055ab77b151c1957f221cabf2a"}, + {file = "frozenlist-1.4.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:0633c8d5337cb5c77acbccc6357ac49a1770b8c487e5b3505c57b949b4b82e98"}, + {file = "frozenlist-1.4.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:27657df69e8801be6c3638054e202a135c7f299267f1a55ed3a598934f6c0d75"}, + {file = "frozenlist-1.4.1-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:f9a3ea26252bd92f570600098783d1371354d89d5f6b7dfd87359d669f2109b5"}, + {file = "frozenlist-1.4.1-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:4f57dab5fe3407b6c0c1cc907ac98e8a189f9e418f3b6e54d65a718aaafe3950"}, + {file = "frozenlist-1.4.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:e02a0e11cf6597299b9f3bbd3f93d79217cb90cfd1411aec33848b13f5c656cc"}, + {file = "frozenlist-1.4.1-cp310-cp310-win32.whl", hash = "sha256:a828c57f00f729620a442881cc60e57cfcec6842ba38e1b19fd3e47ac0ff8dc1"}, + {file = "frozenlist-1.4.1-cp310-cp310-win_amd64.whl", hash = "sha256:f56e2333dda1fe0f909e7cc59f021eba0d2307bc6f012a1ccf2beca6ba362439"}, + {file = "frozenlist-1.4.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:a0cb6f11204443f27a1628b0e460f37fb30f624be6051d490fa7d7e26d4af3d0"}, + {file = "frozenlist-1.4.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = 
"sha256:b46c8ae3a8f1f41a0d2ef350c0b6e65822d80772fe46b653ab6b6274f61d4a49"}, + {file = "frozenlist-1.4.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:fde5bd59ab5357e3853313127f4d3565fc7dad314a74d7b5d43c22c6a5ed2ced"}, + {file = "frozenlist-1.4.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:722e1124aec435320ae01ee3ac7bec11a5d47f25d0ed6328f2273d287bc3abb0"}, + {file = "frozenlist-1.4.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2471c201b70d58a0f0c1f91261542a03d9a5e088ed3dc6c160d614c01649c106"}, + {file = "frozenlist-1.4.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c757a9dd70d72b076d6f68efdbb9bc943665ae954dad2801b874c8c69e185068"}, + {file = "frozenlist-1.4.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f146e0911cb2f1da549fc58fc7bcd2b836a44b79ef871980d605ec392ff6b0d2"}, + {file = "frozenlist-1.4.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4f9c515e7914626b2a2e1e311794b4c35720a0be87af52b79ff8e1429fc25f19"}, + {file = "frozenlist-1.4.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:c302220494f5c1ebeb0912ea782bcd5e2f8308037b3c7553fad0e48ebad6ad82"}, + {file = "frozenlist-1.4.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:442acde1e068288a4ba7acfe05f5f343e19fac87bfc96d89eb886b0363e977ec"}, + {file = "frozenlist-1.4.1-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:1b280e6507ea8a4fa0c0a7150b4e526a8d113989e28eaaef946cc77ffd7efc0a"}, + {file = "frozenlist-1.4.1-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:fe1a06da377e3a1062ae5fe0926e12b84eceb8a50b350ddca72dc85015873f74"}, + {file = "frozenlist-1.4.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:db9e724bebd621d9beca794f2a4ff1d26eed5965b004a97f1f1685a173b869c2"}, + {file = "frozenlist-1.4.1-cp311-cp311-win32.whl", hash = "sha256:e774d53b1a477a67838a904131c4b0eef6b3d8a651f8b138b04f748fccfefe17"}, + {file = "frozenlist-1.4.1-cp311-cp311-win_amd64.whl", hash = "sha256:fb3c2db03683b5767dedb5769b8a40ebb47d6f7f45b1b3e3b4b51ec8ad9d9825"}, + {file = "frozenlist-1.4.1-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:1979bc0aeb89b33b588c51c54ab0161791149f2461ea7c7c946d95d5f93b56ae"}, + {file = "frozenlist-1.4.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:cc7b01b3754ea68a62bd77ce6020afaffb44a590c2289089289363472d13aedb"}, + {file = "frozenlist-1.4.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:c9c92be9fd329ac801cc420e08452b70e7aeab94ea4233a4804f0915c14eba9b"}, + {file = "frozenlist-1.4.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5c3894db91f5a489fc8fa6a9991820f368f0b3cbdb9cd8849547ccfab3392d86"}, + {file = "frozenlist-1.4.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ba60bb19387e13597fb059f32cd4d59445d7b18b69a745b8f8e5db0346f33480"}, + {file = "frozenlist-1.4.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8aefbba5f69d42246543407ed2461db31006b0f76c4e32dfd6f42215a2c41d09"}, + {file = "frozenlist-1.4.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:780d3a35680ced9ce682fbcf4cb9c2bad3136eeff760ab33707b71db84664e3a"}, + {file = "frozenlist-1.4.1-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:9acbb16f06fe7f52f441bb6f413ebae6c37baa6ef9edd49cdd567216da8600cd"}, + {file = "frozenlist-1.4.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:23b701e65c7b36e4bf15546a89279bd4d8675faabc287d06bbcfac7d3c33e1e6"}, + {file = "frozenlist-1.4.1-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:3e0153a805a98f5ada7e09826255ba99fb4f7524bb81bf6b47fb702666484ae1"}, + {file = "frozenlist-1.4.1-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:dd9b1baec094d91bf36ec729445f7769d0d0cf6b64d04d86e45baf89e2b9059b"}, + {file = "frozenlist-1.4.1-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:1a4471094e146b6790f61b98616ab8e44f72661879cc63fa1049d13ef711e71e"}, + {file = "frozenlist-1.4.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:5667ed53d68d91920defdf4035d1cdaa3c3121dc0b113255124bcfada1cfa1b8"}, + {file = "frozenlist-1.4.1-cp312-cp312-win32.whl", hash = "sha256:beee944ae828747fd7cb216a70f120767fc9f4f00bacae8543c14a6831673f89"}, + {file = "frozenlist-1.4.1-cp312-cp312-win_amd64.whl", hash = "sha256:64536573d0a2cb6e625cf309984e2d873979709f2cf22839bf2d61790b448ad5"}, + {file = "frozenlist-1.4.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:20b51fa3f588ff2fe658663db52a41a4f7aa6c04f6201449c6c7c476bd255c0d"}, + {file = "frozenlist-1.4.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:410478a0c562d1a5bcc2f7ea448359fcb050ed48b3c6f6f4f18c313a9bdb1826"}, + {file = "frozenlist-1.4.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:c6321c9efe29975232da3bd0af0ad216800a47e93d763ce64f291917a381b8eb"}, + {file = "frozenlist-1.4.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:48f6a4533887e189dae092f1cf981f2e3885175f7a0f33c91fb5b7b682b6bab6"}, + {file = "frozenlist-1.4.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6eb73fa5426ea69ee0e012fb59cdc76a15b1283d6e32e4f8dc4482ec67d1194d"}, + {file = "frozenlist-1.4.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fbeb989b5cc29e8daf7f976b421c220f1b8c731cbf22b9130d8815418ea45887"}, + {file = "frozenlist-1.4.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:32453c1de775c889eb4e22f1197fe3bdfe457d16476ea407472b9442e6295f7a"}, + {file = "frozenlist-1.4.1-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:693945278a31f2086d9bf3df0fe8254bbeaef1fe71e1351c3bd730aa7d31c41b"}, + {file = "frozenlist-1.4.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:1d0ce09d36d53bbbe566fe296965b23b961764c0bcf3ce2fa45f463745c04701"}, + {file = "frozenlist-1.4.1-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:3a670dc61eb0d0eb7080890c13de3066790f9049b47b0de04007090807c776b0"}, + {file = "frozenlist-1.4.1-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:dca69045298ce5c11fd539682cff879cc1e664c245d1c64da929813e54241d11"}, + {file = "frozenlist-1.4.1-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:a06339f38e9ed3a64e4c4e43aec7f59084033647f908e4259d279a52d3757d09"}, + {file = "frozenlist-1.4.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:b7f2f9f912dca3934c1baec2e4585a674ef16fe00218d833856408c48d5beee7"}, + {file = "frozenlist-1.4.1-cp38-cp38-win32.whl", hash = "sha256:e7004be74cbb7d9f34553a5ce5fb08be14fb33bc86f332fb71cbe5216362a497"}, + {file = "frozenlist-1.4.1-cp38-cp38-win_amd64.whl", hash = "sha256:5a7d70357e7cee13f470c7883a063aae5fe209a493c57d86eb7f5a6f910fae09"}, + {file = "frozenlist-1.4.1-cp39-cp39-macosx_10_9_universal2.whl", hash = 
"sha256:bfa4a17e17ce9abf47a74ae02f32d014c5e9404b6d9ac7f729e01562bbee601e"}, + {file = "frozenlist-1.4.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b7e3ed87d4138356775346e6845cccbe66cd9e207f3cd11d2f0b9fd13681359d"}, + {file = "frozenlist-1.4.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c99169d4ff810155ca50b4da3b075cbde79752443117d89429595c2e8e37fed8"}, + {file = "frozenlist-1.4.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:edb678da49d9f72c9f6c609fbe41a5dfb9a9282f9e6a2253d5a91e0fc382d7c0"}, + {file = "frozenlist-1.4.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6db4667b187a6742b33afbbaf05a7bc551ffcf1ced0000a571aedbb4aa42fc7b"}, + {file = "frozenlist-1.4.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:55fdc093b5a3cb41d420884cdaf37a1e74c3c37a31f46e66286d9145d2063bd0"}, + {file = "frozenlist-1.4.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:82e8211d69a4f4bc360ea22cd6555f8e61a1bd211d1d5d39d3d228b48c83a897"}, + {file = "frozenlist-1.4.1-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:89aa2c2eeb20957be2d950b85974b30a01a762f3308cd02bb15e1ad632e22dc7"}, + {file = "frozenlist-1.4.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:9d3e0c25a2350080e9319724dede4f31f43a6c9779be48021a7f4ebde8b2d742"}, + {file = "frozenlist-1.4.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:7268252af60904bf52c26173cbadc3a071cece75f873705419c8681f24d3edea"}, + {file = "frozenlist-1.4.1-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:0c250a29735d4f15321007fb02865f0e6b6a41a6b88f1f523ca1596ab5f50bd5"}, + {file = "frozenlist-1.4.1-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:96ec70beabbd3b10e8bfe52616a13561e58fe84c0101dd031dc78f250d5128b9"}, + {file = "frozenlist-1.4.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:23b2d7679b73fe0e5a4560b672a39f98dfc6f60df63823b0a9970525325b95f6"}, + {file = "frozenlist-1.4.1-cp39-cp39-win32.whl", hash = "sha256:a7496bfe1da7fb1a4e1cc23bb67c58fab69311cc7d32b5a99c2007b4b2a0e932"}, + {file = "frozenlist-1.4.1-cp39-cp39-win_amd64.whl", hash = "sha256:e6a20a581f9ce92d389a8c7d7c3dd47c81fd5d6e655c8dddf341e14aa48659d0"}, + {file = "frozenlist-1.4.1-py3-none-any.whl", hash = "sha256:04ced3e6a46b4cfffe20f9ae482818e34eba9b5fb0ce4056e4cc9b6e212d09b7"}, + {file = "frozenlist-1.4.1.tar.gz", hash = "sha256:c037a86e8513059a2613aaba4d817bb90b9d9b6b69aace3ce9c877e8c8ed402b"}, +] + +[[package]] +name = "fsspec" +version = "2024.10.0" +description = "File-system specification" +optional = false +python-versions = ">=3.8" +files = [ + {file = "fsspec-2024.10.0-py3-none-any.whl", hash = "sha256:03b9a6785766a4de40368b88906366755e2819e758b83705c88cd7cb5fe81871"}, + {file = "fsspec-2024.10.0.tar.gz", hash = "sha256:eda2d8a4116d4f2429db8550f2457da57279247dd930bb12f821b58391359493"}, +] + +[package.dependencies] +aiohttp = {version = "<4.0.0a0 || >4.0.0a0,<4.0.0a1 || >4.0.0a1", optional = true, markers = "extra == \"http\""} + +[package.extras] +abfs = ["adlfs"] +adl = ["adlfs"] +arrow = ["pyarrow (>=1)"] +dask = ["dask", "distributed"] +dev = ["pre-commit", "ruff"] +doc = ["numpydoc", "sphinx", "sphinx-design", "sphinx-rtd-theme", "yarl"] +dropbox = ["dropbox", "dropboxdrivefs", "requests"] +full = ["adlfs", "aiohttp (!=4.0.0a0,!=4.0.0a1)", "dask", "distributed", "dropbox", "dropboxdrivefs", "fusepy", "gcsfs", "libarchive-c", "ocifs", "panel", "paramiko", "pyarrow 
(>=1)", "pygit2", "requests", "s3fs", "smbprotocol", "tqdm"] +fuse = ["fusepy"] +gcs = ["gcsfs"] +git = ["pygit2"] +github = ["requests"] +gs = ["gcsfs"] +gui = ["panel"] +hdfs = ["pyarrow (>=1)"] +http = ["aiohttp (!=4.0.0a0,!=4.0.0a1)"] +libarchive = ["libarchive-c"] +oci = ["ocifs"] +s3 = ["s3fs"] +sftp = ["paramiko"] +smb = ["smbprotocol"] +ssh = ["paramiko"] +test = ["aiohttp (!=4.0.0a0,!=4.0.0a1)", "numpy", "pytest", "pytest-asyncio (!=0.22.0)", "pytest-benchmark", "pytest-cov", "pytest-mock", "pytest-recording", "pytest-rerunfailures", "requests"] +test-downstream = ["aiobotocore (>=2.5.4,<3.0.0)", "dask-expr", "dask[dataframe,test]", "moto[server] (>4,<5)", "pytest-timeout", "xarray"] +test-full = ["adlfs", "aiohttp (!=4.0.0a0,!=4.0.0a1)", "cloudpickle", "dask", "distributed", "dropbox", "dropboxdrivefs", "fastparquet", "fusepy", "gcsfs", "jinja2", "kerchunk", "libarchive-c", "lz4", "notebook", "numpy", "ocifs", "pandas", "panel", "paramiko", "pyarrow", "pyarrow (>=1)", "pyftpdlib", "pygit2", "pytest", "pytest-asyncio (!=0.22.0)", "pytest-benchmark", "pytest-cov", "pytest-mock", "pytest-recording", "pytest-rerunfailures", "python-snappy", "requests", "smbprotocol", "tqdm", "urllib3", "zarr", "zstandard"] +tqdm = ["tqdm"] + +[[package]] +name = "googleapis-common-protos" +version = "1.65.0" +description = "Common protobufs used in Google APIs" +optional = false +python-versions = ">=3.7" +files = [ + {file = "googleapis_common_protos-1.65.0-py2.py3-none-any.whl", hash = "sha256:2972e6c496f435b92590fd54045060867f3fe9be2c82ab148fc8885035479a63"}, + {file = "googleapis_common_protos-1.65.0.tar.gz", hash = "sha256:334a29d07cddc3aa01dee4988f9afd9b2916ee2ff49d6b757155dc0d197852c0"}, +] + +[package.dependencies] +protobuf = ">=3.20.2,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4.21.4 || >4.21.4,<4.21.5 || >4.21.5,<6.0.0.dev0" + +[package.extras] +grpc = ["grpcio (>=1.44.0,<2.0.0.dev0)"] + +[[package]] +name = "grpc-interceptor" +version = "0.15.4" +description = "Simplifies gRPC interceptors" +optional = false +python-versions = ">=3.7,<4.0" +files = [ + {file = "grpc-interceptor-0.15.4.tar.gz", hash = "sha256:1f45c0bcb58b6f332f37c637632247c9b02bc6af0fdceb7ba7ce8d2ebbfb0926"}, + {file = "grpc_interceptor-0.15.4-py3-none-any.whl", hash = "sha256:0035f33228693ed3767ee49d937bac424318db173fef4d2d0170b3215f254d9d"}, +] + +[package.dependencies] +grpcio = ">=1.49.1,<2.0.0" + +[package.extras] +testing = ["protobuf (>=4.21.9)"] + +[[package]] +name = "grpcio" +version = "1.67.0" +description = "HTTP/2-based RPC framework" +optional = false +python-versions = ">=3.8" +files = [ + {file = "grpcio-1.67.0-cp310-cp310-linux_armv7l.whl", hash = "sha256:bd79929b3bb96b54df1296cd3bf4d2b770bd1df6c2bdf549b49bab286b925cdc"}, + {file = "grpcio-1.67.0-cp310-cp310-macosx_12_0_universal2.whl", hash = "sha256:16724ffc956ea42967f5758c2f043faef43cb7e48a51948ab593570570d1e68b"}, + {file = "grpcio-1.67.0-cp310-cp310-manylinux_2_17_aarch64.whl", hash = "sha256:2b7183c80b602b0ad816315d66f2fb7887614ead950416d60913a9a71c12560d"}, + {file = "grpcio-1.67.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:efe32b45dd6d118f5ea2e5deaed417d8a14976325c93812dd831908522b402c9"}, + {file = "grpcio-1.67.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fe89295219b9c9e47780a0f1c75ca44211e706d1c598242249fe717af3385ec8"}, + {file = "grpcio-1.67.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:aa8d025fae1595a207b4e47c2e087cb88d47008494db258ac561c00877d4c8f8"}, + {file 
= "grpcio-1.67.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:f95e15db43e75a534420e04822df91f645664bf4ad21dfaad7d51773c80e6bb4"}, + {file = "grpcio-1.67.0-cp310-cp310-win32.whl", hash = "sha256:a6b9a5c18863fd4b6624a42e2712103fb0f57799a3b29651c0e5b8119a519d65"}, + {file = "grpcio-1.67.0-cp310-cp310-win_amd64.whl", hash = "sha256:b6eb68493a05d38b426604e1dc93bfc0137c4157f7ab4fac5771fd9a104bbaa6"}, + {file = "grpcio-1.67.0-cp311-cp311-linux_armv7l.whl", hash = "sha256:e91d154689639932305b6ea6f45c6e46bb51ecc8ea77c10ef25aa77f75443ad4"}, + {file = "grpcio-1.67.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:cb204a742997277da678611a809a8409657b1398aaeebf73b3d9563b7d154c13"}, + {file = "grpcio-1.67.0-cp311-cp311-manylinux_2_17_aarch64.whl", hash = "sha256:ae6de510f670137e755eb2a74b04d1041e7210af2444103c8c95f193340d17ee"}, + {file = "grpcio-1.67.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:74b900566bdf68241118f2918d312d3bf554b2ce0b12b90178091ea7d0a17b3d"}, + {file = "grpcio-1.67.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a4e95e43447a02aa603abcc6b5e727d093d161a869c83b073f50b9390ecf0fa8"}, + {file = "grpcio-1.67.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:0bb94e66cd8f0baf29bd3184b6aa09aeb1a660f9ec3d85da615c5003154bc2bf"}, + {file = "grpcio-1.67.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:82e5bd4b67b17c8c597273663794a6a46a45e44165b960517fe6d8a2f7f16d23"}, + {file = "grpcio-1.67.0-cp311-cp311-win32.whl", hash = "sha256:7fc1d2b9fd549264ae585026b266ac2db53735510a207381be509c315b4af4e8"}, + {file = "grpcio-1.67.0-cp311-cp311-win_amd64.whl", hash = "sha256:ac11ecb34a86b831239cc38245403a8de25037b448464f95c3315819e7519772"}, + {file = "grpcio-1.67.0-cp312-cp312-linux_armv7l.whl", hash = "sha256:227316b5631260e0bef8a3ce04fa7db4cc81756fea1258b007950b6efc90c05d"}, + {file = "grpcio-1.67.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:d90cfdafcf4b45a7a076e3e2a58e7bc3d59c698c4f6470b0bb13a4d869cf2273"}, + {file = "grpcio-1.67.0-cp312-cp312-manylinux_2_17_aarch64.whl", hash = "sha256:77196216d5dd6f99af1c51e235af2dd339159f657280e65ce7e12c1a8feffd1d"}, + {file = "grpcio-1.67.0-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:15c05a26a0f7047f720da41dc49406b395c1470eef44ff7e2c506a47ac2c0591"}, + {file = "grpcio-1.67.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3840994689cc8cbb73d60485c594424ad8adb56c71a30d8948d6453083624b52"}, + {file = "grpcio-1.67.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:5a1e03c3102b6451028d5dc9f8591131d6ab3c8a0e023d94c28cb930ed4b5f81"}, + {file = "grpcio-1.67.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:682968427a63d898759474e3b3178d42546e878fdce034fd7474ef75143b64e3"}, + {file = "grpcio-1.67.0-cp312-cp312-win32.whl", hash = "sha256:d01793653248f49cf47e5695e0a79805b1d9d4eacef85b310118ba1dfcd1b955"}, + {file = "grpcio-1.67.0-cp312-cp312-win_amd64.whl", hash = "sha256:985b2686f786f3e20326c4367eebdaed3e7aa65848260ff0c6644f817042cb15"}, + {file = "grpcio-1.67.0-cp313-cp313-linux_armv7l.whl", hash = "sha256:8c9a35b8bc50db35ab8e3e02a4f2a35cfba46c8705c3911c34ce343bd777813a"}, + {file = "grpcio-1.67.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:42199e704095b62688998c2d84c89e59a26a7d5d32eed86d43dc90e7a3bd04aa"}, + {file = "grpcio-1.67.0-cp313-cp313-manylinux_2_17_aarch64.whl", hash = "sha256:c4c425f440fb81f8d0237c07b9322fc0fb6ee2b29fbef5f62a322ff8fcce240d"}, + {file = 
"grpcio-1.67.0-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:323741b6699cd2b04a71cb38f502db98f90532e8a40cb675393d248126a268af"}, + {file = "grpcio-1.67.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:662c8e105c5e5cee0317d500eb186ed7a93229586e431c1bf0c9236c2407352c"}, + {file = "grpcio-1.67.0-cp313-cp313-musllinux_1_1_i686.whl", hash = "sha256:f6bd2ab135c64a4d1e9e44679a616c9bc944547357c830fafea5c3caa3de5153"}, + {file = "grpcio-1.67.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:2f55c1e0e2ae9bdd23b3c63459ee4c06d223b68aeb1961d83c48fb63dc29bc03"}, + {file = "grpcio-1.67.0-cp313-cp313-win32.whl", hash = "sha256:fd6bc27861e460fe28e94226e3673d46e294ca4673d46b224428d197c5935e69"}, + {file = "grpcio-1.67.0-cp313-cp313-win_amd64.whl", hash = "sha256:cf51d28063338608cd8d3cd64677e922134837902b70ce00dad7f116e3998210"}, + {file = "grpcio-1.67.0-cp38-cp38-linux_armv7l.whl", hash = "sha256:7f200aca719c1c5dc72ab68be3479b9dafccdf03df530d137632c534bb6f1ee3"}, + {file = "grpcio-1.67.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:0892dd200ece4822d72dd0952f7112c542a487fc48fe77568deaaa399c1e717d"}, + {file = "grpcio-1.67.0-cp38-cp38-manylinux_2_17_aarch64.whl", hash = "sha256:f4d613fbf868b2e2444f490d18af472ccb47660ea3df52f068c9c8801e1f3e85"}, + {file = "grpcio-1.67.0-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0c69bf11894cad9da00047f46584d5758d6ebc9b5950c0dc96fec7e0bce5cde9"}, + {file = "grpcio-1.67.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b9bca3ca0c5e74dea44bf57d27e15a3a3996ce7e5780d61b7c72386356d231db"}, + {file = "grpcio-1.67.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:014dfc020e28a0d9be7e93a91f85ff9f4a87158b7df9952fe23cc42d29d31e1e"}, + {file = "grpcio-1.67.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:d4ea4509d42c6797539e9ec7496c15473177ce9abc89bc5c71e7abe50fc25737"}, + {file = "grpcio-1.67.0-cp38-cp38-win32.whl", hash = "sha256:9d75641a2fca9ae1ae86454fd25d4c298ea8cc195dbc962852234d54a07060ad"}, + {file = "grpcio-1.67.0-cp38-cp38-win_amd64.whl", hash = "sha256:cff8e54d6a463883cda2fab94d2062aad2f5edd7f06ae3ed030f2a74756db365"}, + {file = "grpcio-1.67.0-cp39-cp39-linux_armv7l.whl", hash = "sha256:62492bd534979e6d7127b8a6b29093161a742dee3875873e01964049d5250a74"}, + {file = "grpcio-1.67.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:eef1dce9d1a46119fd09f9a992cf6ab9d9178b696382439446ca5f399d7b96fe"}, + {file = "grpcio-1.67.0-cp39-cp39-manylinux_2_17_aarch64.whl", hash = "sha256:f623c57a5321461c84498a99dddf9d13dac0e40ee056d884d6ec4ebcab647a78"}, + {file = "grpcio-1.67.0-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:54d16383044e681f8beb50f905249e4e7261dd169d4aaf6e52eab67b01cbbbe2"}, + {file = "grpcio-1.67.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b2a44e572fb762c668e4812156b81835f7aba8a721b027e2d4bb29fb50ff4d33"}, + {file = "grpcio-1.67.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:391df8b0faac84d42f5b8dfc65f5152c48ed914e13c522fd05f2aca211f8bfad"}, + {file = "grpcio-1.67.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:cfd9306511fdfc623a1ba1dc3bc07fbd24e6cfbe3c28b4d1e05177baa2f99617"}, + {file = "grpcio-1.67.0-cp39-cp39-win32.whl", hash = "sha256:30d47dbacfd20cbd0c8be9bfa52fdb833b395d4ec32fe5cff7220afc05d08571"}, + {file = "grpcio-1.67.0-cp39-cp39-win_amd64.whl", hash = "sha256:f55f077685f61f0fbd06ea355142b71e47e4a26d2d678b3ba27248abfe67163a"}, + {file = "grpcio-1.67.0.tar.gz", hash = 
"sha256:e090b2553e0da1c875449c8e75073dd4415dd71c9bde6a406240fdf4c0ee467c"}, +] + +[package.extras] +protobuf = ["grpcio-tools (>=1.67.0)"] + +[[package]] +name = "grpcio-reflection" +version = "1.62.3" +description = "Standard Protobuf Reflection Service for gRPC" +optional = false +python-versions = ">=3.6" +files = [ + {file = "grpcio-reflection-1.62.3.tar.gz", hash = "sha256:cb84682933c400bddf94dd94f928d1c6570f500b6dd255973d4bfb495b82585f"}, + {file = "grpcio_reflection-1.62.3-py3-none-any.whl", hash = "sha256:a48ef37df81a3bada78261fc92ef382f061112f989d1312398b945cc69838b9c"}, +] + +[package.dependencies] +grpcio = ">=1.62.3" +protobuf = ">=4.21.6" + +[[package]] +name = "grpcio-status" +version = "1.62.3" +description = "Status proto mapping for gRPC" +optional = false +python-versions = ">=3.6" +files = [ + {file = "grpcio-status-1.62.3.tar.gz", hash = "sha256:289bdd7b2459794a12cf95dc0cb727bd4a1742c37bd823f760236c937e53a485"}, + {file = "grpcio_status-1.62.3-py3-none-any.whl", hash = "sha256:f9049b762ba8de6b1086789d8315846e094edac2c50beaf462338b301a8fd4b8"}, +] + +[package.dependencies] +googleapis-common-protos = ">=1.5.5" +grpcio = ">=1.62.3" +protobuf = ">=4.21.6" + +[[package]] +name = "grpcio-tools" +version = "1.62.3" +description = "Protobuf code generator for gRPC" +optional = false +python-versions = ">=3.7" +files = [ + {file = "grpcio-tools-1.62.3.tar.gz", hash = "sha256:7c7136015c3d62c3eef493efabaf9e3380e3e66d24ee8e94c01cb71377f57833"}, + {file = "grpcio_tools-1.62.3-cp310-cp310-macosx_12_0_universal2.whl", hash = "sha256:2f968b049c2849540751ec2100ab05e8086c24bead769ca734fdab58698408c1"}, + {file = "grpcio_tools-1.62.3-cp310-cp310-manylinux_2_17_aarch64.whl", hash = "sha256:0a8c0c4724ae9c2181b7dbc9b186df46e4f62cb18dc184e46d06c0ebeccf569e"}, + {file = "grpcio_tools-1.62.3-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5782883a27d3fae8c425b29a9d3dcf5f47d992848a1b76970da3b5a28d424b26"}, + {file = "grpcio_tools-1.62.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f3d812daffd0c2d2794756bd45a353f89e55dc8f91eb2fc840c51b9f6be62667"}, + {file = "grpcio_tools-1.62.3-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:b47d0dda1bdb0a0ba7a9a6de88e5a1ed61f07fad613964879954961e36d49193"}, + {file = "grpcio_tools-1.62.3-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:ca246dffeca0498be9b4e1ee169b62e64694b0f92e6d0be2573e65522f39eea9"}, + {file = "grpcio_tools-1.62.3-cp310-cp310-win32.whl", hash = "sha256:6a56d344b0bab30bf342a67e33d386b0b3c4e65868ffe93c341c51e1a8853ca5"}, + {file = "grpcio_tools-1.62.3-cp310-cp310-win_amd64.whl", hash = "sha256:710fecf6a171dcbfa263a0a3e7070e0df65ba73158d4c539cec50978f11dad5d"}, + {file = "grpcio_tools-1.62.3-cp311-cp311-macosx_10_10_universal2.whl", hash = "sha256:703f46e0012af83a36082b5f30341113474ed0d91e36640da713355cd0ea5d23"}, + {file = "grpcio_tools-1.62.3-cp311-cp311-manylinux_2_17_aarch64.whl", hash = "sha256:7cc83023acd8bc72cf74c2edbe85b52098501d5b74d8377bfa06f3e929803492"}, + {file = "grpcio_tools-1.62.3-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7ff7d58a45b75df67d25f8f144936a3e44aabd91afec833ee06826bd02b7fbe7"}, + {file = "grpcio_tools-1.62.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7f2483ea232bd72d98a6dc6d7aefd97e5bc80b15cd909b9e356d6f3e326b6e43"}, + {file = "grpcio_tools-1.62.3-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:962c84b4da0f3b14b3cdb10bc3837ebc5f136b67d919aea8d7bb3fd3df39528a"}, + {file = 
"grpcio_tools-1.62.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:8ad0473af5544f89fc5a1ece8676dd03bdf160fb3230f967e05d0f4bf89620e3"}, + {file = "grpcio_tools-1.62.3-cp311-cp311-win32.whl", hash = "sha256:db3bc9fa39afc5e4e2767da4459df82b095ef0cab2f257707be06c44a1c2c3e5"}, + {file = "grpcio_tools-1.62.3-cp311-cp311-win_amd64.whl", hash = "sha256:e0898d412a434e768a0c7e365acabe13ff1558b767e400936e26b5b6ed1ee51f"}, + {file = "grpcio_tools-1.62.3-cp312-cp312-macosx_10_10_universal2.whl", hash = "sha256:d102b9b21c4e1e40af9a2ab3c6d41afba6bd29c0aa50ca013bf85c99cdc44ac5"}, + {file = "grpcio_tools-1.62.3-cp312-cp312-manylinux_2_17_aarch64.whl", hash = "sha256:0a52cc9444df978438b8d2332c0ca99000521895229934a59f94f37ed896b133"}, + {file = "grpcio_tools-1.62.3-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:141d028bf5762d4a97f981c501da873589df3f7e02f4c1260e1921e565b376fa"}, + {file = "grpcio_tools-1.62.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:47a5c093ab256dec5714a7a345f8cc89315cb57c298b276fa244f37a0ba507f0"}, + {file = "grpcio_tools-1.62.3-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:f6831fdec2b853c9daa3358535c55eed3694325889aa714070528cf8f92d7d6d"}, + {file = "grpcio_tools-1.62.3-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:e02d7c1a02e3814c94ba0cfe43d93e872c758bd8fd5c2797f894d0c49b4a1dfc"}, + {file = "grpcio_tools-1.62.3-cp312-cp312-win32.whl", hash = "sha256:b881fd9505a84457e9f7e99362eeedd86497b659030cf57c6f0070df6d9c2b9b"}, + {file = "grpcio_tools-1.62.3-cp312-cp312-win_amd64.whl", hash = "sha256:11c625eebefd1fd40a228fc8bae385e448c7e32a6ae134e43cf13bbc23f902b7"}, + {file = "grpcio_tools-1.62.3-cp37-cp37m-macosx_10_10_universal2.whl", hash = "sha256:ec6fbded0c61afe6f84e3c2a43e6d656791d95747d6d28b73eff1af64108c434"}, + {file = "grpcio_tools-1.62.3-cp37-cp37m-manylinux_2_17_aarch64.whl", hash = "sha256:bfda6ee8990997a9df95c5606f3096dae65f09af7ca03a1e9ca28f088caca5cf"}, + {file = "grpcio_tools-1.62.3-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b77f9f9cee87cd798f0fe26b7024344d1b03a7cd2d2cba7035f8433b13986325"}, + {file = "grpcio_tools-1.62.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2e02d3b96f2d0e4bab9ceaa30f37d4f75571e40c6272e95364bff3125a64d184"}, + {file = "grpcio_tools-1.62.3-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:1da38070738da53556a4b35ab67c1b9884a5dd48fa2f243db35dc14079ea3d0c"}, + {file = "grpcio_tools-1.62.3-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:ace43b26d88a58dcff16c20d23ff72b04d0a415f64d2820f4ff06b1166f50557"}, + {file = "grpcio_tools-1.62.3-cp37-cp37m-win_amd64.whl", hash = "sha256:350a80485e302daaa95d335a931f97b693e170e02d43767ab06552c708808950"}, + {file = "grpcio_tools-1.62.3-cp38-cp38-macosx_10_10_universal2.whl", hash = "sha256:c3a1ac9d394f8e229eb28eec2e04b9a6f5433fa19c9d32f1cb6066e3c5114a1d"}, + {file = "grpcio_tools-1.62.3-cp38-cp38-manylinux_2_17_aarch64.whl", hash = "sha256:11f363570dea661dde99e04a51bd108a5807b5df32a6f8bdf4860e34e94a4dbf"}, + {file = "grpcio_tools-1.62.3-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dc9ad9950119d8ae27634e68b7663cc8d340ae535a0f80d85a55e56a6973ab1f"}, + {file = "grpcio_tools-1.62.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8c5d22b252dcef11dd1e0fbbe5bbfb9b4ae048e8880d33338215e8ccbdb03edc"}, + {file = "grpcio_tools-1.62.3-cp38-cp38-musllinux_1_1_i686.whl", hash = 
"sha256:27cd9ef5c5d68d5ed104b6dcb96fe9c66b82050e546c9e255716903c3d8f0373"}, + {file = "grpcio_tools-1.62.3-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:f4b1615adf67bd8bb71f3464146a6f9949972d06d21a4f5e87e73f6464d97f57"}, + {file = "grpcio_tools-1.62.3-cp38-cp38-win32.whl", hash = "sha256:e18e15287c31baf574fcdf8251fb7f997d64e96c6ecf467906e576da0a079af6"}, + {file = "grpcio_tools-1.62.3-cp38-cp38-win_amd64.whl", hash = "sha256:6c3064610826f50bd69410c63101954676edc703e03f9e8f978a135f1aaf97c1"}, + {file = "grpcio_tools-1.62.3-cp39-cp39-macosx_10_10_universal2.whl", hash = "sha256:8e62cc7164b0b7c5128e637e394eb2ef3db0e61fc798e80c301de3b2379203ed"}, + {file = "grpcio_tools-1.62.3-cp39-cp39-manylinux_2_17_aarch64.whl", hash = "sha256:c8ad5cce554e2fcaf8842dee5d9462583b601a3a78f8b76a153c38c963f58c10"}, + {file = "grpcio_tools-1.62.3-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ec279dcf3518201fc592c65002754f58a6b542798cd7f3ecd4af086422f33f29"}, + {file = "grpcio_tools-1.62.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1c989246c2aebc13253f08be32538a4039a64e12d9c18f6d662d7aee641dc8b5"}, + {file = "grpcio_tools-1.62.3-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:ca4f5eeadbb57cf03317d6a2857823239a63a59cc935f5bd6cf6e8b7af7a7ecc"}, + {file = "grpcio_tools-1.62.3-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:0cb3a3436ac119cbd37a7d3331d9bdf85dad21a6ac233a3411dff716dcbf401e"}, + {file = "grpcio_tools-1.62.3-cp39-cp39-win32.whl", hash = "sha256:3eae6ea76d62fcac091e1f15c2dcedf1dc3f114f8df1a972a8a0745e89f4cf61"}, + {file = "grpcio_tools-1.62.3-cp39-cp39-win_amd64.whl", hash = "sha256:eec73a005443061f4759b71a056f745e3b000dc0dc125c9f20560232dfbcbd14"}, +] + +[package.dependencies] +grpcio = ">=1.62.3" +protobuf = ">=4.21.6,<5.0dev" +setuptools = "*" + +[[package]] +name = "hf-transfer" +version = "0.1.8" +description = "Speed up file transfers with the Hugging Face Hub." 
+optional = false +python-versions = ">=3.7" +files = [ + {file = "hf_transfer-0.1.8-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:70858f9e94286738ed300484a45beb5cfee6a7ddac4c5886f9c6fce7823ac5ab"}, + {file = "hf_transfer-0.1.8-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:38adc73f0a8526319d90f7cc5dc2d5e4bb66f487a513d94b98aa6725be732e4a"}, + {file = "hf_transfer-0.1.8-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:44d2f0c08198d8d899fe9d66e86aee2dd844bd7ce33888f261373fcec81d2a54"}, + {file = "hf_transfer-0.1.8-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:1de2a4ef36f9e60b3d3bec00193c0aafd75771709f2ca51b9b162373f5af3d32"}, + {file = "hf_transfer-0.1.8-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e319269e3606a5ff2979296841766649ac73598a4a8eee2a968f86c8071fea5a"}, + {file = "hf_transfer-0.1.8-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0f6026cf3be6a53ea42f92172f60c1c0675baaa9073f865e671b661dde5fd157"}, + {file = "hf_transfer-0.1.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f865c33ada5bd3650c2b46e59979f2d7755c3f517f8d0facc78576a0c7d26406"}, + {file = "hf_transfer-0.1.8-cp310-none-win32.whl", hash = "sha256:2054730e8d8ed21917c64be7199e06424b2bd08df1c43a72766afaed7992f2d3"}, + {file = "hf_transfer-0.1.8-cp310-none-win_amd64.whl", hash = "sha256:2b4f1a9446ba31170b5b1eca4e916504d18378a6b5fe959896bdac8a736a5ecb"}, + {file = "hf_transfer-0.1.8-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:e27c15fcc5869ad7e52bbc0bdec6106b288d1c463f8d2da92f28615a3b181361"}, + {file = "hf_transfer-0.1.8-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:871a0032d011ebc6409a73a8406b98b84ff2cd3ed7d9e1af8cdf4d660b9fab9b"}, + {file = "hf_transfer-0.1.8-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:686fa756e1e0214bb6327d33c66732c52274d94a8460beb50604ad988b391cf6"}, + {file = "hf_transfer-0.1.8-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:36a03b1b2911b0cf15b1b9d971a34b32dadcc4f2fd979aaff5979d6ce4017c34"}, + {file = "hf_transfer-0.1.8-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:079db90c81f41f4cf3227dfaaa855a9b8e9aef45bc7c2be29ce7232cd83ff881"}, + {file = "hf_transfer-0.1.8-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ac08a4524127fdd14c234d4bcbe49d1c498acf5335c781714823179bcc8dc039"}, + {file = "hf_transfer-0.1.8-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:837432e73cb17274a6782b6216e8ce058aa325a475dc44a5a6a753d48b86d18a"}, + {file = "hf_transfer-0.1.8-cp311-none-win32.whl", hash = "sha256:b180f9823dde35aba9bc0f1d0c04ac8a873baebd3732a7ffe4f11940abc7df0d"}, + {file = "hf_transfer-0.1.8-cp311-none-win_amd64.whl", hash = "sha256:37907d2135cebcf8b6d419bb575148d89c224f16b69357f027bd29d0e85c6529"}, + {file = "hf_transfer-0.1.8-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:baf948f4f493949309cbe60529620b9b0aef854a22b6e526753364acc57c09b6"}, + {file = "hf_transfer-0.1.8-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:0bce5c8bdefa478c5d5eaa646cc4ce1df5cfe764d98572ad0c6b8773e98d49f6"}, + {file = "hf_transfer-0.1.8-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:54d6f8a1a86128d651a3799e1267c343d60f81f2c565d7c5416eb8e674e4cf0e"}, + {file = "hf_transfer-0.1.8-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = 
"sha256:f79fd1b0c2ed93efb4c5f684118d7a762ecdd218e170df8208c4e13d3dcd4959"}, + {file = "hf_transfer-0.1.8-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:414df35692670683bf5623498ef9d88a8df5d77e9516515da6e2b34d1054c11f"}, + {file = "hf_transfer-0.1.8-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3c9798d5f951f66b96d40a7a53910260cb5874fda56cf5944dddb7c571f37ec3"}, + {file = "hf_transfer-0.1.8-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:060c661691f85a61392e57579c80eb64b5ee277434e81fb582f605c1c8ff05d5"}, + {file = "hf_transfer-0.1.8-cp312-none-win32.whl", hash = "sha256:f7840e32379820c3e1571a480238e05ea043e970c99d2e999578004a2eb17788"}, + {file = "hf_transfer-0.1.8-cp312-none-win_amd64.whl", hash = "sha256:9a3204ec423cc5e659872e8179f8704ad9ce2abb1e6a991f8838aedf1dc07830"}, + {file = "hf_transfer-0.1.8-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:09949e86ad63ee139e463fd0dfaf401515ae70445854199f61d545514c65f744"}, + {file = "hf_transfer-0.1.8-cp37-cp37m-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:bf1a74552845b93ea972e6e7131ef54e56056aa54137e93a40faf3fbcb2442ff"}, + {file = "hf_transfer-0.1.8-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:959bcb3afb4ee6f2a07031a947dba98ec0b64c001bc914fbd8fc32e13a287162"}, + {file = "hf_transfer-0.1.8-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e01eecdb8162bd61dab9090fbd9f8034dd8b5755ef727a21ca8a057f80cb91ee"}, + {file = "hf_transfer-0.1.8-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:50650a38e9d31f5ad8f010e4598bf304ecd99c17162e7d93f67e031571b864ee"}, + {file = "hf_transfer-0.1.8-cp37-none-win32.whl", hash = "sha256:e29b9d1d378138f2f4eae0e93ca94af3b5d45f4532eef69f1ab97fe06f9c9d9e"}, + {file = "hf_transfer-0.1.8-cp37-none-win_amd64.whl", hash = "sha256:cfd6cef43ae883103117a371f8ebae4e7f9637bc6fb480f1be5568e2fe22a8a7"}, + {file = "hf_transfer-0.1.8-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:92a68f7a0043cca8a0de4decc760dca177530944cbab502afac503bd1b2fa01a"}, + {file = "hf_transfer-0.1.8-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e3138e408179f80a5480598e32f8e1abb564915cbde4d3bc8da52811c75dc3ea"}, + {file = "hf_transfer-0.1.8-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4544d148930ad34442d43b8fa911c8479c04a95b858b1d1f91e0b7da77082fad"}, + {file = "hf_transfer-0.1.8-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a851794b9f029965664f8c3002c957fccf21685e9397ceb4f9f19c986dee8ad3"}, + {file = "hf_transfer-0.1.8-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:791aaf87c5319ac83edb6ab2994b3db19924c49d6ff667dd3d8a610b455ff70a"}, + {file = "hf_transfer-0.1.8-cp38-none-win32.whl", hash = "sha256:8f71e5d35d3a3160dcca12fdcc8119033aeacaa6a32838a7ad9f9cb1008bbe58"}, + {file = "hf_transfer-0.1.8-cp38-none-win_amd64.whl", hash = "sha256:543287b4ceb1e25501580b99690f7f0df9d3631d29306f37cbd97e918c732944"}, + {file = "hf_transfer-0.1.8-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:7ce02a18bd0bb2343e707ac85b68c946bc37623ee24150c69158f6b2b2c7a98f"}, + {file = "hf_transfer-0.1.8-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:64d7f8dbd64ba183ed1df75d47c84e075ff666ceaa335bff1de16b09eaac5b80"}, + {file = "hf_transfer-0.1.8-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:1e7858694e11419ae27e542fb8fc0d0e54d46ff7768fe73bc359d70b8f5aa578"}, + {file = "hf_transfer-0.1.8-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:bed116cd9d1edfa32c0136d7cb8e5f1afd2b32df43c49085d428f108fc8e1c8f"}, + {file = "hf_transfer-0.1.8-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6e385d0da9c6b3472ab29285d2d46c9f9903205b8d108f88a82f3f85aafae0ab"}, + {file = "hf_transfer-0.1.8-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:98f75fa4b86ef15433cd907807ac77d1fb39d7e7b790bfd39c7ae9c385bf0200"}, + {file = "hf_transfer-0.1.8-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d1a63ad947d2901425ac0a3ed70c3696dfde27fadb0482ed763bdd5cc946b278"}, + {file = "hf_transfer-0.1.8-cp39-none-win32.whl", hash = "sha256:3e74096915813ae842ea6a5bdf10c0fef960aa51a35a560955b3e61cdfe3db57"}, + {file = "hf_transfer-0.1.8-cp39-none-win_amd64.whl", hash = "sha256:05ea16307bf4a5eb097cbc6e5057e4eb5e080a138af23ef639fd38857723c288"}, + {file = "hf_transfer-0.1.8-pp310-pypy310_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:928ff036c3e98e10dcfbdb4fcdfc4592d37a5cc8e365a7ba8dfd4337e849d675"}, + {file = "hf_transfer-0.1.8-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d49ba3ce67035f460ae1924fe2feafec155cb535eec7f31ed5109c19064cd294"}, + {file = "hf_transfer-0.1.8-pp39-pypy39_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b01f5872c62cfee3ec9ca5c738818296f69f8adf84b4d8d15f2a5601d9dda339"}, + {file = "hf_transfer-0.1.8-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:659d4212d50847a5165666bf43d67727679b4f694ef9c413613cc27093136527"}, + {file = "hf_transfer-0.1.8.tar.gz", hash = "sha256:26d229468152e7a3ec12664cac86b8c2800695fd85f9c9a96677a775cc04f0b3"}, +] + +[[package]] +name = "huggingface-hub" +version = "0.26.1" +description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" +optional = false +python-versions = ">=3.8.0" +files = [ + {file = "huggingface_hub-0.26.1-py3-none-any.whl", hash = "sha256:5927a8fc64ae68859cd954b7cc29d1c8390a5e15caba6d3d349c973be8fdacf3"}, + {file = "huggingface_hub-0.26.1.tar.gz", hash = "sha256:414c0d9b769eecc86c70f9d939d0f48bb28e8461dd1130021542eff0212db890"}, +] + +[package.dependencies] +filelock = "*" +fsspec = ">=2023.5.0" +packaging = ">=20.9" +pyyaml = ">=5.1" +requests = "*" +tqdm = ">=4.42.1" +typing-extensions = ">=3.7.4.3" + +[package.extras] +all = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "fastapi", "gradio (>=4.0.0)", "jedi", "libcst (==1.4.0)", "mypy (==1.5.1)", "numpy", "pytest (>=8.1.1,<8.2.2)", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-mock", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "ruff (>=0.5.0)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)", "urllib3 (<2.0)"] +cli = ["InquirerPy (==0.3.4)"] +dev = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "fastapi", "gradio (>=4.0.0)", "jedi", "libcst (==1.4.0)", "mypy (==1.5.1)", "numpy", "pytest (>=8.1.1,<8.2.2)", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-mock", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "ruff (>=0.5.0)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)", "urllib3 (<2.0)"] +fastai = ["fastai (>=2.4)", 
"fastcore (>=1.3.27)", "toml"] +hf-transfer = ["hf-transfer (>=0.1.4)"] +inference = ["aiohttp"] +quality = ["libcst (==1.4.0)", "mypy (==1.5.1)", "ruff (>=0.5.0)"] +tensorflow = ["graphviz", "pydot", "tensorflow"] +tensorflow-testing = ["keras (<3.0)", "tensorflow"] +testing = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "fastapi", "gradio (>=4.0.0)", "jedi", "numpy", "pytest (>=8.1.1,<8.2.2)", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-mock", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "soundfile", "urllib3 (<2.0)"] +torch = ["safetensors[torch]", "torch"] +typing = ["types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)"] + +[[package]] +name = "idna" +version = "3.10" +description = "Internationalized Domain Names in Applications (IDNA)" +optional = false +python-versions = ">=3.6" +files = [ + {file = "idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3"}, + {file = "idna-3.10.tar.gz", hash = "sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9"}, +] + +[package.extras] +all = ["flake8 (>=7.1.1)", "mypy (>=1.11.2)", "pytest (>=8.3.2)", "ruff (>=0.6.2)"] + +[[package]] +name = "importlib-metadata" +version = "8.5.0" +description = "Read metadata from Python packages" +optional = false +python-versions = ">=3.8" +files = [ + {file = "importlib_metadata-8.5.0-py3-none-any.whl", hash = "sha256:45e54197d28b7a7f1559e60b95e7c567032b602131fbd588f1497f47880aa68b"}, + {file = "importlib_metadata-8.5.0.tar.gz", hash = "sha256:71522656f0abace1d072b9e5481a48f07c138e00f079c38c8f883823f9c26bd7"}, +] + +[package.dependencies] +zipp = ">=3.20" + +[package.extras] +check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1)"] +cover = ["pytest-cov"] +doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] +enabler = ["pytest-enabler (>=2.2)"] +perf = ["ipython"] +test = ["flufl.flake8", "importlib-resources (>=1.3)", "jaraco.test (>=5.4)", "packaging", "pyfakefs", "pytest (>=6,!=8.1.*)", "pytest-perf (>=0.9.2)"] +type = ["pytest-mypy"] + +[[package]] +name = "iniconfig" +version = "2.0.0" +description = "brain-dead simple config-ini parsing" +optional = false +python-versions = ">=3.7" +files = [ + {file = "iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374"}, + {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, +] + +[[package]] +name = "interegular" +version = "0.3.3" +description = "a regex intersection checker" +optional = true +python-versions = ">=3.7" +files = [ + {file = "interegular-0.3.3-py37-none-any.whl", hash = "sha256:b0c07007d48c89d6d19f7204972d369b2a77222722e126b6aa63aa721dc3b19c"}, + {file = "interegular-0.3.3.tar.gz", hash = "sha256:d9b697b21b34884711399ba0f0376914b81899ce670032486d0d048344a76600"}, +] + +[[package]] +name = "jinja2" +version = "3.1.4" +description = "A very fast and expressive template engine." 
+optional = true +python-versions = ">=3.7" +files = [ + {file = "jinja2-3.1.4-py3-none-any.whl", hash = "sha256:bc5dd2abb727a5319567b7a813e6a2e7318c39f4f487cfe6c89c6f9c7d25197d"}, + {file = "jinja2-3.1.4.tar.gz", hash = "sha256:4a3aee7acbbe7303aede8e9648d13b8bf88a429282aa6122a993f0ac800cb369"}, +] + +[package.dependencies] +MarkupSafe = ">=2.0" + +[package.extras] +i18n = ["Babel (>=2.7)"] + +[[package]] +name = "joblib" +version = "1.4.2" +description = "Lightweight pipelining with Python functions" +optional = true +python-versions = ">=3.8" +files = [ + {file = "joblib-1.4.2-py3-none-any.whl", hash = "sha256:06d478d5674cbc267e7496a410ee875abd68e4340feff4490bcb7afb88060ae6"}, + {file = "joblib-1.4.2.tar.gz", hash = "sha256:2382c5816b2636fbd20a09e0f4e9dad4736765fdfb7dca582943b9c1366b3f0e"}, +] + +[[package]] +name = "jsonschema" +version = "4.23.0" +description = "An implementation of JSON Schema validation for Python" +optional = true +python-versions = ">=3.8" +files = [ + {file = "jsonschema-4.23.0-py3-none-any.whl", hash = "sha256:fbadb6f8b144a8f8cf9f0b89ba94501d143e50411a1278633f56a7acf7fd5566"}, + {file = "jsonschema-4.23.0.tar.gz", hash = "sha256:d71497fef26351a33265337fa77ffeb82423f3ea21283cd9467bb03999266bc4"}, +] + +[package.dependencies] +attrs = ">=22.2.0" +jsonschema-specifications = ">=2023.03.6" +referencing = ">=0.28.4" +rpds-py = ">=0.7.1" + +[package.extras] +format = ["fqdn", "idna", "isoduration", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3987", "uri-template", "webcolors (>=1.11)"] +format-nongpl = ["fqdn", "idna", "isoduration", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3986-validator (>0.1.0)", "uri-template", "webcolors (>=24.6.0)"] + +[[package]] +name = "jsonschema-specifications" +version = "2024.10.1" +description = "The JSON Schema meta-schemas and vocabularies, exposed as a Registry" +optional = true +python-versions = ">=3.9" +files = [ + {file = "jsonschema_specifications-2024.10.1-py3-none-any.whl", hash = "sha256:a09a0680616357d9a0ecf05c12ad234479f549239d0f5b55f3deea67475da9bf"}, + {file = "jsonschema_specifications-2024.10.1.tar.gz", hash = "sha256:0f38b83639958ce1152d02a7f062902c41c8fd20d558b0c34344292d417ae272"}, +] + +[package.dependencies] +referencing = ">=0.31.0" + +[[package]] +name = "lark" +version = "1.2.2" +description = "a modern parsing library" +optional = true +python-versions = ">=3.8" +files = [ + {file = "lark-1.2.2-py3-none-any.whl", hash = "sha256:c2276486b02f0f1b90be155f2c8ba4a8e194d42775786db622faccd652d8e80c"}, + {file = "lark-1.2.2.tar.gz", hash = "sha256:ca807d0162cd16cef15a8feecb862d7319e7a09bdb13aef927968e45040fed80"}, +] + +[package.extras] +atomic-cache = ["atomicwrites"] +interegular = ["interegular (>=0.3.1,<0.4.0)"] +nearley = ["js2py"] +regex = ["regex"] + +[[package]] +name = "llvmlite" +version = "0.43.0" +description = "lightweight wrapper around basic LLVM functionality" +optional = true +python-versions = ">=3.9" +files = [ + {file = "llvmlite-0.43.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:a289af9a1687c6cf463478f0fa8e8aa3b6fb813317b0d70bf1ed0759eab6f761"}, + {file = "llvmlite-0.43.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:6d4fd101f571a31acb1559ae1af30f30b1dc4b3186669f92ad780e17c81e91bc"}, + {file = "llvmlite-0.43.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7d434ec7e2ce3cc8f452d1cd9a28591745de022f931d67be688a737320dfcead"}, + {file = "llvmlite-0.43.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:6912a87782acdff6eb8bf01675ed01d60ca1f2551f8176a300a886f09e836a6a"}, + {file = "llvmlite-0.43.0-cp310-cp310-win_amd64.whl", hash = "sha256:14f0e4bf2fd2d9a75a3534111e8ebeb08eda2f33e9bdd6dfa13282afacdde0ed"}, + {file = "llvmlite-0.43.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:3e8d0618cb9bfe40ac38a9633f2493d4d4e9fcc2f438d39a4e854f39cc0f5f98"}, + {file = "llvmlite-0.43.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e0a9a1a39d4bf3517f2af9d23d479b4175ead205c592ceeb8b89af48a327ea57"}, + {file = "llvmlite-0.43.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c1da416ab53e4f7f3bc8d4eeba36d801cc1894b9fbfbf2022b29b6bad34a7df2"}, + {file = "llvmlite-0.43.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:977525a1e5f4059316b183fb4fd34fa858c9eade31f165427a3977c95e3ee749"}, + {file = "llvmlite-0.43.0-cp311-cp311-win_amd64.whl", hash = "sha256:d5bd550001d26450bd90777736c69d68c487d17bf371438f975229b2b8241a91"}, + {file = "llvmlite-0.43.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:f99b600aa7f65235a5a05d0b9a9f31150c390f31261f2a0ba678e26823ec38f7"}, + {file = "llvmlite-0.43.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:35d80d61d0cda2d767f72de99450766250560399edc309da16937b93d3b676e7"}, + {file = "llvmlite-0.43.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:eccce86bba940bae0d8d48ed925f21dbb813519169246e2ab292b5092aba121f"}, + {file = "llvmlite-0.43.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:df6509e1507ca0760787a199d19439cc887bfd82226f5af746d6977bd9f66844"}, + {file = "llvmlite-0.43.0-cp312-cp312-win_amd64.whl", hash = "sha256:7a2872ee80dcf6b5dbdc838763d26554c2a18aa833d31a2635bff16aafefb9c9"}, + {file = "llvmlite-0.43.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9cd2a7376f7b3367019b664c21f0c61766219faa3b03731113ead75107f3b66c"}, + {file = "llvmlite-0.43.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:18e9953c748b105668487b7c81a3e97b046d8abf95c4ddc0cd3c94f4e4651ae8"}, + {file = "llvmlite-0.43.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:74937acd22dc11b33946b67dca7680e6d103d6e90eeaaaf932603bec6fe7b03a"}, + {file = "llvmlite-0.43.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc9efc739cc6ed760f795806f67889923f7274276f0eb45092a1473e40d9b867"}, + {file = "llvmlite-0.43.0-cp39-cp39-win_amd64.whl", hash = "sha256:47e147cdda9037f94b399bf03bfd8a6b6b1f2f90be94a454e3386f006455a9b4"}, + {file = "llvmlite-0.43.0.tar.gz", hash = "sha256:ae2b5b5c3ef67354824fb75517c8db5fbe93bc02cd9671f3c62271626bc041d5"}, +] + +[[package]] +name = "loguru" +version = "0.6.0" +description = "Python logging made (stupidly) simple" +optional = false +python-versions = ">=3.5" +files = [ + {file = "loguru-0.6.0-py3-none-any.whl", hash = "sha256:4e2414d534a2ab57573365b3e6d0234dfb1d84b68b7f3b948e6fb743860a77c3"}, + {file = "loguru-0.6.0.tar.gz", hash = "sha256:066bd06758d0a513e9836fd9c6b5a75bfb3fd36841f4b996bc60b547a309d41c"}, +] + +[package.dependencies] +colorama = {version = ">=0.3.4", markers = "sys_platform == \"win32\""} +win32-setctime = {version = ">=1.0.0", markers = "sys_platform == \"win32\""} + +[package.extras] +dev = ["Sphinx (>=4.1.1)", "black (>=19.10b0)", "colorama (>=0.3.4)", "docutils (==0.16)", "flake8 (>=3.7.7)", "isort (>=5.1.1)", "pytest (>=4.6.2)", "pytest-cov (>=2.7.1)", "sphinx-autobuild (>=0.7.1)", "sphinx-rtd-theme (>=0.4.3)", "tox (>=3.9.0)"] + +[[package]] +name = "markdown-it-py" 
+version = "3.0.0" +description = "Python port of markdown-it. Markdown parsing, done right!" +optional = false +python-versions = ">=3.8" +files = [ + {file = "markdown-it-py-3.0.0.tar.gz", hash = "sha256:e3f60a94fa066dc52ec76661e37c851cb232d92f9886b15cb560aaada2df8feb"}, + {file = "markdown_it_py-3.0.0-py3-none-any.whl", hash = "sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1"}, +] + +[package.dependencies] +mdurl = ">=0.1,<1.0" + +[package.extras] +benchmarking = ["psutil", "pytest", "pytest-benchmark"] +code-style = ["pre-commit (>=3.0,<4.0)"] +compare = ["commonmark (>=0.9,<1.0)", "markdown (>=3.4,<4.0)", "mistletoe (>=1.0,<2.0)", "mistune (>=2.0,<3.0)", "panflute (>=2.3,<3.0)"] +linkify = ["linkify-it-py (>=1,<3)"] +plugins = ["mdit-py-plugins"] +profiling = ["gprof2dot"] +rtd = ["jupyter_sphinx", "mdit-py-plugins", "myst-parser", "pyyaml", "sphinx", "sphinx-copybutton", "sphinx-design", "sphinx_book_theme"] +testing = ["coverage", "pytest", "pytest-cov", "pytest-regressions"] + +[[package]] +name = "markupsafe" +version = "3.0.2" +description = "Safely add untrusted strings to HTML/XML markup." +optional = false +python-versions = ">=3.9" +files = [ + {file = "MarkupSafe-3.0.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:7e94c425039cde14257288fd61dcfb01963e658efbc0ff54f5306b06054700f8"}, + {file = "MarkupSafe-3.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9e2d922824181480953426608b81967de705c3cef4d1af983af849d7bd619158"}, + {file = "MarkupSafe-3.0.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:38a9ef736c01fccdd6600705b09dc574584b89bea478200c5fbf112a6b0d5579"}, + {file = "MarkupSafe-3.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bbcb445fa71794da8f178f0f6d66789a28d7319071af7a496d4d507ed566270d"}, + {file = "MarkupSafe-3.0.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:57cb5a3cf367aeb1d316576250f65edec5bb3be939e9247ae594b4bcbc317dfb"}, + {file = "MarkupSafe-3.0.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:3809ede931876f5b2ec92eef964286840ed3540dadf803dd570c3b7e13141a3b"}, + {file = "MarkupSafe-3.0.2-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:e07c3764494e3776c602c1e78e298937c3315ccc9043ead7e685b7f2b8d47b3c"}, + {file = "MarkupSafe-3.0.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:b424c77b206d63d500bcb69fa55ed8d0e6a3774056bdc4839fc9298a7edca171"}, + {file = "MarkupSafe-3.0.2-cp310-cp310-win32.whl", hash = "sha256:fcabf5ff6eea076f859677f5f0b6b5c1a51e70a376b0579e0eadef8db48c6b50"}, + {file = "MarkupSafe-3.0.2-cp310-cp310-win_amd64.whl", hash = "sha256:6af100e168aa82a50e186c82875a5893c5597a0c1ccdb0d8b40240b1f28b969a"}, + {file = "MarkupSafe-3.0.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:9025b4018f3a1314059769c7bf15441064b2207cb3f065e6ea1e7359cb46db9d"}, + {file = "MarkupSafe-3.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:93335ca3812df2f366e80509ae119189886b0f3c2b81325d39efdb84a1e2ae93"}, + {file = "MarkupSafe-3.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2cb8438c3cbb25e220c2ab33bb226559e7afb3baec11c4f218ffa7308603c832"}, + {file = "MarkupSafe-3.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a123e330ef0853c6e822384873bef7507557d8e4a082961e1defa947aa59ba84"}, + {file = "MarkupSafe-3.0.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:1e084f686b92e5b83186b07e8a17fc09e38fff551f3602b249881fec658d3eca"}, + {file = "MarkupSafe-3.0.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:d8213e09c917a951de9d09ecee036d5c7d36cb6cb7dbaece4c71a60d79fb9798"}, + {file = "MarkupSafe-3.0.2-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:5b02fb34468b6aaa40dfc198d813a641e3a63b98c2b05a16b9f80b7ec314185e"}, + {file = "MarkupSafe-3.0.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:0bff5e0ae4ef2e1ae4fdf2dfd5b76c75e5c2fa4132d05fc1b0dabcd20c7e28c4"}, + {file = "MarkupSafe-3.0.2-cp311-cp311-win32.whl", hash = "sha256:6c89876f41da747c8d3677a2b540fb32ef5715f97b66eeb0c6b66f5e3ef6f59d"}, + {file = "MarkupSafe-3.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:70a87b411535ccad5ef2f1df5136506a10775d267e197e4cf531ced10537bd6b"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:9778bd8ab0a994ebf6f84c2b949e65736d5575320a17ae8984a77fab08db94cf"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:846ade7b71e3536c4e56b386c2a47adf5741d2d8b94ec9dc3e92e5e1ee1e2225"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1c99d261bd2d5f6b59325c92c73df481e05e57f19837bdca8413b9eac4bd8028"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e17c96c14e19278594aa4841ec148115f9c7615a47382ecb6b82bd8fea3ab0c8"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:88416bd1e65dcea10bc7569faacb2c20ce071dd1f87539ca2ab364bf6231393c"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:2181e67807fc2fa785d0592dc2d6206c019b9502410671cc905d132a92866557"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:52305740fe773d09cffb16f8ed0427942901f00adedac82ec8b67752f58a1b22"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:ad10d3ded218f1039f11a75f8091880239651b52e9bb592ca27de44eed242a48"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-win32.whl", hash = "sha256:0f4ca02bea9a23221c0182836703cbf8930c5e9454bacce27e767509fa286a30"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:8e06879fc22a25ca47312fbe7c8264eb0b662f6db27cb2d3bbbc74b1df4b9b87"}, + {file = "MarkupSafe-3.0.2-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:ba9527cdd4c926ed0760bc301f6728ef34d841f405abf9d4f959c478421e4efd"}, + {file = "MarkupSafe-3.0.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f8b3d067f2e40fe93e1ccdd6b2e1d16c43140e76f02fb1319a05cf2b79d99430"}, + {file = "MarkupSafe-3.0.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:569511d3b58c8791ab4c2e1285575265991e6d8f8700c7be0e88f86cb0672094"}, + {file = "MarkupSafe-3.0.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:15ab75ef81add55874e7ab7055e9c397312385bd9ced94920f2802310c930396"}, + {file = "MarkupSafe-3.0.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f3818cb119498c0678015754eba762e0d61e5b52d34c8b13d770f0719f7b1d79"}, + {file = "MarkupSafe-3.0.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:cdb82a876c47801bb54a690c5ae105a46b392ac6099881cdfb9f6e95e4014c6a"}, + {file = "MarkupSafe-3.0.2-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:cabc348d87e913db6ab4aa100f01b08f481097838bdddf7c7a84b7575b7309ca"}, + {file = 
"MarkupSafe-3.0.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:444dcda765c8a838eaae23112db52f1efaf750daddb2d9ca300bcae1039adc5c"}, + {file = "MarkupSafe-3.0.2-cp313-cp313-win32.whl", hash = "sha256:bcf3e58998965654fdaff38e58584d8937aa3096ab5354d493c77d1fdd66d7a1"}, + {file = "MarkupSafe-3.0.2-cp313-cp313-win_amd64.whl", hash = "sha256:e6a2a455bd412959b57a172ce6328d2dd1f01cb2135efda2e4576e8a23fa3b0f"}, + {file = "MarkupSafe-3.0.2-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:b5a6b3ada725cea8a5e634536b1b01c30bcdcd7f9c6fff4151548d5bf6b3a36c"}, + {file = "MarkupSafe-3.0.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:a904af0a6162c73e3edcb969eeeb53a63ceeb5d8cf642fade7d39e7963a22ddb"}, + {file = "MarkupSafe-3.0.2-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4aa4e5faecf353ed117801a068ebab7b7e09ffb6e1d5e412dc852e0da018126c"}, + {file = "MarkupSafe-3.0.2-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c0ef13eaeee5b615fb07c9a7dadb38eac06a0608b41570d8ade51c56539e509d"}, + {file = "MarkupSafe-3.0.2-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d16a81a06776313e817c951135cf7340a3e91e8c1ff2fac444cfd75fffa04afe"}, + {file = "MarkupSafe-3.0.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:6381026f158fdb7c72a168278597a5e3a5222e83ea18f543112b2662a9b699c5"}, + {file = "MarkupSafe-3.0.2-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:3d79d162e7be8f996986c064d1c7c817f6df3a77fe3d6859f6f9e7be4b8c213a"}, + {file = "MarkupSafe-3.0.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:131a3c7689c85f5ad20f9f6fb1b866f402c445b220c19fe4308c0b147ccd2ad9"}, + {file = "MarkupSafe-3.0.2-cp313-cp313t-win32.whl", hash = "sha256:ba8062ed2cf21c07a9e295d5b8a2a5ce678b913b45fdf68c32d95d6c1291e0b6"}, + {file = "MarkupSafe-3.0.2-cp313-cp313t-win_amd64.whl", hash = "sha256:e444a31f8db13eb18ada366ab3cf45fd4b31e4db1236a4448f68778c1d1a5a2f"}, + {file = "MarkupSafe-3.0.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:eaa0a10b7f72326f1372a713e73c3f739b524b3af41feb43e4921cb529f5929a"}, + {file = "MarkupSafe-3.0.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:48032821bbdf20f5799ff537c7ac3d1fba0ba032cfc06194faffa8cda8b560ff"}, + {file = "MarkupSafe-3.0.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1a9d3f5f0901fdec14d8d2f66ef7d035f2157240a433441719ac9a3fba440b13"}, + {file = "MarkupSafe-3.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:88b49a3b9ff31e19998750c38e030fc7bb937398b1f78cfa599aaef92d693144"}, + {file = "MarkupSafe-3.0.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cfad01eed2c2e0c01fd0ecd2ef42c492f7f93902e39a42fc9ee1692961443a29"}, + {file = "MarkupSafe-3.0.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:1225beacc926f536dc82e45f8a4d68502949dc67eea90eab715dea3a21c1b5f0"}, + {file = "MarkupSafe-3.0.2-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:3169b1eefae027567d1ce6ee7cae382c57fe26e82775f460f0b2778beaad66c0"}, + {file = "MarkupSafe-3.0.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:eb7972a85c54febfb25b5c4b4f3af4dcc731994c7da0d8a0b4a6eb0640e1d178"}, + {file = "MarkupSafe-3.0.2-cp39-cp39-win32.whl", hash = "sha256:8c4e8c3ce11e1f92f6536ff07154f9d49677ebaaafc32db9db4620bc11ed480f"}, + {file = "MarkupSafe-3.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:6e296a513ca3d94054c2c881cc913116e90fd030ad1c656b3869762b754f5f8a"}, + {file 
= "markupsafe-3.0.2.tar.gz", hash = "sha256:ee55d3edf80167e48ea11a923c7386f4669df67d7994554387f84e7d8b0a2bf0"}, +] + +[[package]] +name = "marlin-kernels" +version = "0.2.0" +description = "Marlin quantization kernels" +optional = true +python-versions = ">=3.7" +files = [ + {file = "marlin_kernels-0.2.0+cu123torch2.4-cp310-cp310-linux_x86_64.whl", hash = "sha256:9a5afcf19b0f5917e43353cc19873fb3c4d4d0b924e2a95a37884f9ce208d0bd"}, +] + +[package.dependencies] +torch = "*" + +[package.source] +type = "url" +url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.2.0/marlin_kernels-0.2.0+cu123torch2.4-cp310-cp310-linux_x86_64.whl" + +[[package]] +name = "marlin-kernels" +version = "0.2.0" +description = "Marlin quantization kernels" +optional = true +python-versions = ">=3.7" +files = [ + {file = "marlin_kernels-0.2.0+cu123torch2.4-cp311-cp311-linux_x86_64.whl", hash = "sha256:1e64fcc7ebadfaffa60091ee9201ae3daaf5c1be3be60c8c054143a3dcb72d5d"}, +] + +[package.dependencies] +torch = "*" + +[package.source] +type = "url" +url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.2.0/marlin_kernels-0.2.0+cu123torch2.4-cp311-cp311-linux_x86_64.whl" + +[[package]] +name = "marlin-kernels" +version = "0.2.0" +description = "Marlin quantization kernels" +optional = true +python-versions = ">=3.7" +files = [ + {file = "marlin_kernels-0.2.0+cu123torch2.4-cp312-cp312-linux_x86_64.whl", hash = "sha256:e75f3ce9b1c13a4ed43a380d88e1d34d297259452db037ec1973ec33dc2eb78e"}, +] + +[package.dependencies] +torch = "*" + +[package.source] +type = "url" +url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.2.0/marlin_kernels-0.2.0+cu123torch2.4-cp312-cp312-linux_x86_64.whl" + +[[package]] +name = "marlin-kernels" +version = "0.2.0" +description = "Marlin quantization kernels" +optional = true +python-versions = ">=3.7" +files = [ + {file = "marlin_kernels-0.2.0+cu123torch2.4-cp39-cp39-linux_x86_64.whl", hash = "sha256:2f99a27f70b391887ee6adffeeee7c3f4df7fac37393f9fb16d4cace2b3f6457"}, +] + +[package.dependencies] +torch = "*" + +[package.source] +type = "url" +url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.2.0/marlin_kernels-0.2.0+cu123torch2.4-cp39-cp39-linux_x86_64.whl" + +[[package]] +name = "mdurl" +version = "0.1.2" +description = "Markdown URL utilities" +optional = false +python-versions = ">=3.7" +files = [ + {file = "mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8"}, + {file = "mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba"}, +] + +[[package]] +name = "moe-kernels" +version = "0.4.0" +description = "MoE kernels" +optional = true +python-versions = ">=3.7" +files = [ + {file = "moe_kernels-0.4.0+cu123torch2.4-cp310-cp310-linux_x86_64.whl", hash = "sha256:3fc0475bb3b9c09bbf08f6f6e9767d10eaba55b558f67a605fe70ae0cbb5e6a4"}, +] + +[package.dependencies] +nvidia-ml-py = "*" +torch = "*" +triton = "*" + +[package.source] +type = "url" +url = "https://github.com/danieldk/moe-kernels/releases/download/v0.4.0/moe_kernels-0.4.0+cu123torch2.4-cp310-cp310-linux_x86_64.whl" + +[[package]] +name = "moe-kernels" +version = "0.4.0" +description = "MoE kernels" +optional = true +python-versions = ">=3.7" +files = [ + {file = "moe_kernels-0.4.0+cu123torch2.4-cp311-cp311-linux_x86_64.whl", hash = "sha256:8ca72a064ceb84a23a3437cc6e6363907ad41588877f6acb1febc010fc7beb22"}, +] + +[package.dependencies] +nvidia-ml-py = "*" +torch = "*" +triton = 
"*" + +[package.source] +type = "url" +url = "https://github.com/danieldk/moe-kernels/releases/download/v0.4.0/moe_kernels-0.4.0+cu123torch2.4-cp311-cp311-linux_x86_64.whl" + +[[package]] +name = "moe-kernels" +version = "0.4.0" +description = "MoE kernels" +optional = true +python-versions = ">=3.7" +files = [ + {file = "moe_kernels-0.4.0+cu123torch2.4-cp312-cp312-linux_x86_64.whl", hash = "sha256:d302d6b16bb4905b2312dc68da6a6f51e87d0cd3c4bf1f23d995501162399a8e"}, +] + +[package.dependencies] +nvidia-ml-py = "*" +torch = "*" +triton = "*" + +[package.source] +type = "url" +url = "https://github.com/danieldk/moe-kernels/releases/download/v0.4.0/moe_kernels-0.4.0+cu123torch2.4-cp312-cp312-linux_x86_64.whl" + +[[package]] +name = "moe-kernels" +version = "0.4.0" +description = "MoE kernels" +optional = true +python-versions = ">=3.7" +files = [ + {file = "moe_kernels-0.4.0+cu123torch2.4-cp39-cp39-linux_x86_64.whl", hash = "sha256:6aee3e723efa5113c338b40e6cb20fa62da6c442c65c1a6cc97751d34158a93a"}, +] + +[package.dependencies] +nvidia-ml-py = "*" +torch = "*" +triton = "*" + +[package.source] +type = "url" +url = "https://github.com/danieldk/moe-kernels/releases/download/v0.4.0/moe_kernels-0.4.0+cu123torch2.4-cp39-cp39-linux_x86_64.whl" + +[[package]] +name = "mpmath" +version = "1.3.0" +description = "Python library for arbitrary-precision floating-point arithmetic" +optional = true +python-versions = "*" +files = [ + {file = "mpmath-1.3.0-py3-none-any.whl", hash = "sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c"}, + {file = "mpmath-1.3.0.tar.gz", hash = "sha256:7a28eb2a9774d00c7bc92411c19a89209d5da7c4c9a9e227be8330a23a25b91f"}, +] + +[package.extras] +develop = ["codecov", "pycodestyle", "pytest (>=4.6)", "pytest-cov", "wheel"] +docs = ["sphinx"] +gmpy = ["gmpy2 (>=2.1.0a4)"] +tests = ["pytest (>=4.6)"] + +[[package]] +name = "multidict" +version = "6.1.0" +description = "multidict implementation" +optional = false +python-versions = ">=3.8" +files = [ + {file = "multidict-6.1.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:3380252550e372e8511d49481bd836264c009adb826b23fefcc5dd3c69692f60"}, + {file = "multidict-6.1.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:99f826cbf970077383d7de805c0681799491cb939c25450b9b5b3ced03ca99f1"}, + {file = "multidict-6.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:a114d03b938376557927ab23f1e950827c3b893ccb94b62fd95d430fd0e5cf53"}, + {file = "multidict-6.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b1c416351ee6271b2f49b56ad7f308072f6f44b37118d69c2cad94f3fa8a40d5"}, + {file = "multidict-6.1.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6b5d83030255983181005e6cfbac1617ce9746b219bc2aad52201ad121226581"}, + {file = "multidict-6.1.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3e97b5e938051226dc025ec80980c285b053ffb1e25a3db2a3aa3bc046bf7f56"}, + {file = "multidict-6.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d618649d4e70ac6efcbba75be98b26ef5078faad23592f9b51ca492953012429"}, + {file = "multidict-6.1.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:10524ebd769727ac77ef2278390fb0068d83f3acb7773792a5080f2b0abf7748"}, + {file = "multidict-6.1.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:ff3827aef427c89a25cc96ded1759271a93603aba9fb977a6d264648ebf989db"}, + {file = 
"multidict-6.1.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:06809f4f0f7ab7ea2cabf9caca7d79c22c0758b58a71f9d32943ae13c7ace056"}, + {file = "multidict-6.1.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:f179dee3b863ab1c59580ff60f9d99f632f34ccb38bf67a33ec6b3ecadd0fd76"}, + {file = "multidict-6.1.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:aaed8b0562be4a0876ee3b6946f6869b7bcdb571a5d1496683505944e268b160"}, + {file = "multidict-6.1.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:3c8b88a2ccf5493b6c8da9076fb151ba106960a2df90c2633f342f120751a9e7"}, + {file = "multidict-6.1.0-cp310-cp310-win32.whl", hash = "sha256:4a9cb68166a34117d6646c0023c7b759bf197bee5ad4272f420a0141d7eb03a0"}, + {file = "multidict-6.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:20b9b5fbe0b88d0bdef2012ef7dee867f874b72528cf1d08f1d59b0e3850129d"}, + {file = "multidict-6.1.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:3efe2c2cb5763f2f1b275ad2bf7a287d3f7ebbef35648a9726e3b69284a4f3d6"}, + {file = "multidict-6.1.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c7053d3b0353a8b9de430a4f4b4268ac9a4fb3481af37dfe49825bf45ca24156"}, + {file = "multidict-6.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:27e5fc84ccef8dfaabb09d82b7d179c7cf1a3fbc8a966f8274fcb4ab2eb4cadb"}, + {file = "multidict-6.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0e2b90b43e696f25c62656389d32236e049568b39320e2735d51f08fd362761b"}, + {file = "multidict-6.1.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d83a047959d38a7ff552ff94be767b7fd79b831ad1cd9920662db05fec24fe72"}, + {file = "multidict-6.1.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d1a9dd711d0877a1ece3d2e4fea11a8e75741ca21954c919406b44e7cf971304"}, + {file = "multidict-6.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ec2abea24d98246b94913b76a125e855eb5c434f7c46546046372fe60f666351"}, + {file = "multidict-6.1.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4867cafcbc6585e4b678876c489b9273b13e9fff9f6d6d66add5e15d11d926cb"}, + {file = "multidict-6.1.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:5b48204e8d955c47c55b72779802b219a39acc3ee3d0116d5080c388970b76e3"}, + {file = "multidict-6.1.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:d8fff389528cad1618fb4b26b95550327495462cd745d879a8c7c2115248e399"}, + {file = "multidict-6.1.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:a7a9541cd308eed5e30318430a9c74d2132e9a8cb46b901326272d780bf2d423"}, + {file = "multidict-6.1.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:da1758c76f50c39a2efd5e9859ce7d776317eb1dd34317c8152ac9251fc574a3"}, + {file = "multidict-6.1.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:c943a53e9186688b45b323602298ab727d8865d8c9ee0b17f8d62d14b56f0753"}, + {file = "multidict-6.1.0-cp311-cp311-win32.whl", hash = "sha256:90f8717cb649eea3504091e640a1b8568faad18bd4b9fcd692853a04475a4b80"}, + {file = "multidict-6.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:82176036e65644a6cc5bd619f65f6f19781e8ec2e5330f51aa9ada7504cc1926"}, + {file = "multidict-6.1.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:b04772ed465fa3cc947db808fa306d79b43e896beb677a56fb2347ca1a49c1fa"}, + {file = "multidict-6.1.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:6180c0ae073bddeb5a97a38c03f30c233e0a4d39cd86166251617d1bbd0af436"}, + {file = 
"multidict-6.1.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:071120490b47aa997cca00666923a83f02c7fbb44f71cf7f136df753f7fa8761"}, + {file = "multidict-6.1.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:50b3a2710631848991d0bf7de077502e8994c804bb805aeb2925a981de58ec2e"}, + {file = "multidict-6.1.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b58c621844d55e71c1b7f7c498ce5aa6985d743a1a59034c57a905b3f153c1ef"}, + {file = "multidict-6.1.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:55b6d90641869892caa9ca42ff913f7ff1c5ece06474fbd32fb2cf6834726c95"}, + {file = "multidict-6.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4b820514bfc0b98a30e3d85462084779900347e4d49267f747ff54060cc33925"}, + {file = "multidict-6.1.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:10a9b09aba0c5b48c53761b7c720aaaf7cf236d5fe394cd399c7ba662d5f9966"}, + {file = "multidict-6.1.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:1e16bf3e5fc9f44632affb159d30a437bfe286ce9e02754759be5536b169b305"}, + {file = "multidict-6.1.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:76f364861c3bfc98cbbcbd402d83454ed9e01a5224bb3a28bf70002a230f73e2"}, + {file = "multidict-6.1.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:820c661588bd01a0aa62a1283f20d2be4281b086f80dad9e955e690c75fb54a2"}, + {file = "multidict-6.1.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:0e5f362e895bc5b9e67fe6e4ded2492d8124bdf817827f33c5b46c2fe3ffaca6"}, + {file = "multidict-6.1.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:3ec660d19bbc671e3a6443325f07263be452c453ac9e512f5eb935e7d4ac28b3"}, + {file = "multidict-6.1.0-cp312-cp312-win32.whl", hash = "sha256:58130ecf8f7b8112cdb841486404f1282b9c86ccb30d3519faf301b2e5659133"}, + {file = "multidict-6.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:188215fc0aafb8e03341995e7c4797860181562380f81ed0a87ff455b70bf1f1"}, + {file = "multidict-6.1.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:d569388c381b24671589335a3be6e1d45546c2988c2ebe30fdcada8457a31008"}, + {file = "multidict-6.1.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:052e10d2d37810b99cc170b785945421141bf7bb7d2f8799d431e7db229c385f"}, + {file = "multidict-6.1.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f90c822a402cb865e396a504f9fc8173ef34212a342d92e362ca498cad308e28"}, + {file = "multidict-6.1.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b225d95519a5bf73860323e633a664b0d85ad3d5bede6d30d95b35d4dfe8805b"}, + {file = "multidict-6.1.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:23bfd518810af7de1116313ebd9092cb9aa629beb12f6ed631ad53356ed6b86c"}, + {file = "multidict-6.1.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5c09fcfdccdd0b57867577b719c69e347a436b86cd83747f179dbf0cc0d4c1f3"}, + {file = "multidict-6.1.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bf6bea52ec97e95560af5ae576bdac3aa3aae0b6758c6efa115236d9e07dae44"}, + {file = "multidict-6.1.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:57feec87371dbb3520da6192213c7d6fc892d5589a93db548331954de8248fd2"}, + {file = "multidict-6.1.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:0c3f390dc53279cbc8ba976e5f8035eab997829066756d811616b652b00a23a3"}, + {file = 
"multidict-6.1.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:59bfeae4b25ec05b34f1956eaa1cb38032282cd4dfabc5056d0a1ec4d696d3aa"}, + {file = "multidict-6.1.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:b2f59caeaf7632cc633b5cf6fc449372b83bbdf0da4ae04d5be36118e46cc0aa"}, + {file = "multidict-6.1.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:37bb93b2178e02b7b618893990941900fd25b6b9ac0fa49931a40aecdf083fe4"}, + {file = "multidict-6.1.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4e9f48f58c2c523d5a06faea47866cd35b32655c46b443f163d08c6d0ddb17d6"}, + {file = "multidict-6.1.0-cp313-cp313-win32.whl", hash = "sha256:3a37ffb35399029b45c6cc33640a92bef403c9fd388acce75cdc88f58bd19a81"}, + {file = "multidict-6.1.0-cp313-cp313-win_amd64.whl", hash = "sha256:e9aa71e15d9d9beaad2c6b9319edcdc0a49a43ef5c0a4c8265ca9ee7d6c67774"}, + {file = "multidict-6.1.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:db7457bac39421addd0c8449933ac32d8042aae84a14911a757ae6ca3eef1392"}, + {file = "multidict-6.1.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:d094ddec350a2fb899fec68d8353c78233debde9b7d8b4beeafa70825f1c281a"}, + {file = "multidict-6.1.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:5845c1fd4866bb5dd3125d89b90e57ed3138241540897de748cdf19de8a2fca2"}, + {file = "multidict-6.1.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9079dfc6a70abe341f521f78405b8949f96db48da98aeb43f9907f342f627cdc"}, + {file = "multidict-6.1.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3914f5aaa0f36d5d60e8ece6a308ee1c9784cd75ec8151062614657a114c4478"}, + {file = "multidict-6.1.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c08be4f460903e5a9d0f76818db3250f12e9c344e79314d1d570fc69d7f4eae4"}, + {file = "multidict-6.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d093be959277cb7dee84b801eb1af388b6ad3ca6a6b6bf1ed7585895789d027d"}, + {file = "multidict-6.1.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3702ea6872c5a2a4eeefa6ffd36b042e9773f05b1f37ae3ef7264b1163c2dcf6"}, + {file = "multidict-6.1.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:2090f6a85cafc5b2db085124d752757c9d251548cedabe9bd31afe6363e0aff2"}, + {file = "multidict-6.1.0-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:f67f217af4b1ff66c68a87318012de788dd95fcfeb24cc889011f4e1c7454dfd"}, + {file = "multidict-6.1.0-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:189f652a87e876098bbc67b4da1049afb5f5dfbaa310dd67c594b01c10388db6"}, + {file = "multidict-6.1.0-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:6bb5992037f7a9eff7991ebe4273ea7f51f1c1c511e6a2ce511d0e7bdb754492"}, + {file = "multidict-6.1.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:ac10f4c2b9e770c4e393876e35a7046879d195cd123b4f116d299d442b335bcd"}, + {file = "multidict-6.1.0-cp38-cp38-win32.whl", hash = "sha256:e27bbb6d14416713a8bd7aaa1313c0fc8d44ee48d74497a0ff4c3a1b6ccb5167"}, + {file = "multidict-6.1.0-cp38-cp38-win_amd64.whl", hash = "sha256:22f3105d4fb15c8f57ff3959a58fcab6ce36814486500cd7485651230ad4d4ef"}, + {file = "multidict-6.1.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:4e18b656c5e844539d506a0a06432274d7bd52a7487e6828c63a63d69185626c"}, + {file = "multidict-6.1.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:a185f876e69897a6f3325c3f19f26a297fa058c5e456bfcff8015e9a27e83ae1"}, + {file = "multidict-6.1.0-cp39-cp39-macosx_11_0_arm64.whl", hash = 
"sha256:ab7c4ceb38d91570a650dba194e1ca87c2b543488fe9309b4212694174fd539c"}, + {file = "multidict-6.1.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e617fb6b0b6953fffd762669610c1c4ffd05632c138d61ac7e14ad187870669c"}, + {file = "multidict-6.1.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:16e5f4bf4e603eb1fdd5d8180f1a25f30056f22e55ce51fb3d6ad4ab29f7d96f"}, + {file = "multidict-6.1.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f4c035da3f544b1882bac24115f3e2e8760f10a0107614fc9839fd232200b875"}, + {file = "multidict-6.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:957cf8e4b6e123a9eea554fa7ebc85674674b713551de587eb318a2df3e00255"}, + {file = "multidict-6.1.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:483a6aea59cb89904e1ceabd2b47368b5600fb7de78a6e4a2c2987b2d256cf30"}, + {file = "multidict-6.1.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:87701f25a2352e5bf7454caa64757642734da9f6b11384c1f9d1a8e699758057"}, + {file = "multidict-6.1.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:682b987361e5fd7a139ed565e30d81fd81e9629acc7d925a205366877d8c8657"}, + {file = "multidict-6.1.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:ce2186a7df133a9c895dea3331ddc5ddad42cdd0d1ea2f0a51e5d161e4762f28"}, + {file = "multidict-6.1.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:9f636b730f7e8cb19feb87094949ba54ee5357440b9658b2a32a5ce4bce53972"}, + {file = "multidict-6.1.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:73eae06aa53af2ea5270cc066dcaf02cc60d2994bbb2c4ef5764949257d10f43"}, + {file = "multidict-6.1.0-cp39-cp39-win32.whl", hash = "sha256:1ca0083e80e791cffc6efce7660ad24af66c8d4079d2a750b29001b53ff59ada"}, + {file = "multidict-6.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:aa466da5b15ccea564bdab9c89175c762bc12825f4659c11227f515cee76fa4a"}, + {file = "multidict-6.1.0-py3-none-any.whl", hash = "sha256:48e171e52d1c4d33888e529b999e5900356b9ae588c2f09a52dcefb158b27506"}, + {file = "multidict-6.1.0.tar.gz", hash = "sha256:22ae2ebf9b0c69d206c003e2f6a914ea33f0a932d4aa16f236afc049d9958f4a"}, +] + +[package.dependencies] +typing-extensions = {version = ">=4.1.0", markers = "python_version < \"3.11\""} + +[[package]] +name = "multiprocess" +version = "0.70.15" +description = "better multiprocessing and multithreading in Python" +optional = false +python-versions = ">=3.7" +files = [ + {file = "multiprocess-0.70.15-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:aa36c7ed16f508091438687fe9baa393a7a8e206731d321e443745e743a0d4e5"}, + {file = "multiprocess-0.70.15-pp37-pypy37_pp73-macosx_10_9_x86_64.whl", hash = "sha256:20e024018c46d0d1602024c613007ac948f9754659e3853b0aa705e83f6931d8"}, + {file = "multiprocess-0.70.15-pp37-pypy37_pp73-manylinux_2_24_i686.whl", hash = "sha256:e576062981c91f0fe8a463c3d52506e598dfc51320a8dd8d78b987dfca91c5db"}, + {file = "multiprocess-0.70.15-pp37-pypy37_pp73-manylinux_2_24_x86_64.whl", hash = "sha256:e73f497e6696a0f5433ada2b3d599ae733b87a6e8b008e387c62ac9127add177"}, + {file = "multiprocess-0.70.15-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:73db2e7b32dcc7f9b0f075c2ffa45c90b6729d3f1805f27e88534c8d321a1be5"}, + {file = "multiprocess-0.70.15-pp38-pypy38_pp73-manylinux_2_24_i686.whl", hash = "sha256:4271647bd8a49c28ecd6eb56a7fdbd3c212c45529ad5303b40b3c65fc6928e5f"}, + {file = "multiprocess-0.70.15-pp38-pypy38_pp73-manylinux_2_24_x86_64.whl", hash = 
"sha256:cf981fb998d6ec3208cb14f0cf2e9e80216e834f5d51fd09ebc937c32b960902"}, + {file = "multiprocess-0.70.15-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:18f9f2c7063346d1617bd1684fdcae8d33380ae96b99427260f562e1a1228b67"}, + {file = "multiprocess-0.70.15-pp39-pypy39_pp73-manylinux_2_24_i686.whl", hash = "sha256:0eac53214d664c49a34695e5824872db4006b1a465edd7459a251809c3773370"}, + {file = "multiprocess-0.70.15-pp39-pypy39_pp73-manylinux_2_24_x86_64.whl", hash = "sha256:1a51dd34096db47fb21fa2b839e615b051d51b97af9a67afbcdaa67186b44883"}, + {file = "multiprocess-0.70.15-py310-none-any.whl", hash = "sha256:7dd58e33235e83cf09d625e55cffd7b0f0eede7ee9223cdd666a87624f60c21a"}, + {file = "multiprocess-0.70.15-py311-none-any.whl", hash = "sha256:134f89053d82c9ed3b73edd3a2531eb791e602d4f4156fc92a79259590bd9670"}, + {file = "multiprocess-0.70.15-py37-none-any.whl", hash = "sha256:f7d4a1629bccb433114c3b4885f69eccc200994323c80f6feee73b0edc9199c5"}, + {file = "multiprocess-0.70.15-py38-none-any.whl", hash = "sha256:bee9afba476c91f9ebee7beeee0601face9eff67d822e893f9a893725fbd6316"}, + {file = "multiprocess-0.70.15-py39-none-any.whl", hash = "sha256:3e0953f5d52b4c76f1c973eaf8214554d146f2be5decb48e928e55c7a2d19338"}, + {file = "multiprocess-0.70.15.tar.gz", hash = "sha256:f20eed3036c0ef477b07a4177cf7c1ba520d9a2677870a4f47fe026f0cd6787e"}, +] + +[package.dependencies] +dill = ">=0.3.7" + +[[package]] +name = "nest-asyncio" +version = "1.6.0" +description = "Patch asyncio to allow nested event loops" +optional = true +python-versions = ">=3.5" +files = [ + {file = "nest_asyncio-1.6.0-py3-none-any.whl", hash = "sha256:87af6efd6b5e897c81050477ef65c62e2b2f35d51703cae01aff2905b1852e1c"}, + {file = "nest_asyncio-1.6.0.tar.gz", hash = "sha256:6f172d5449aca15afd6c646851f4e31e02c598d553a667e38cafa997cfec55fe"}, +] + +[[package]] +name = "networkx" +version = "3.2.1" +description = "Python package for creating and manipulating graphs and networks" +optional = true +python-versions = ">=3.9" +files = [ + {file = "networkx-3.2.1-py3-none-any.whl", hash = "sha256:f18c69adc97877c42332c170849c96cefa91881c99a7cb3e95b7c659ebdc1ec2"}, + {file = "networkx-3.2.1.tar.gz", hash = "sha256:9f1bb5cf3409bf324e0a722c20bdb4c20ee39bf1c30ce8ae499c8502b0b5e0c6"}, +] + +[package.extras] +default = ["matplotlib (>=3.5)", "numpy (>=1.22)", "pandas (>=1.4)", "scipy (>=1.9,!=1.11.0,!=1.11.1)"] +developer = ["changelist (==0.4)", "mypy (>=1.1)", "pre-commit (>=3.2)", "rtoml"] +doc = ["nb2plots (>=0.7)", "nbconvert (<7.9)", "numpydoc (>=1.6)", "pillow (>=9.4)", "pydata-sphinx-theme (>=0.14)", "sphinx (>=7)", "sphinx-gallery (>=0.14)", "texext (>=0.6.7)"] +extra = ["lxml (>=4.6)", "pydot (>=1.4.2)", "pygraphviz (>=1.11)", "sympy (>=1.10)"] +test = ["pytest (>=7.2)", "pytest-cov (>=4.0)"] + +[[package]] +name = "numba" +version = "0.60.0" +description = "compiling Python code using LLVM" +optional = true +python-versions = ">=3.9" +files = [ + {file = "numba-0.60.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:5d761de835cd38fb400d2c26bb103a2726f548dc30368853121d66201672e651"}, + {file = "numba-0.60.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:159e618ef213fba758837f9837fb402bbe65326e60ba0633dbe6c7f274d42c1b"}, + {file = "numba-0.60.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:1527dc578b95c7c4ff248792ec33d097ba6bef9eda466c948b68dfc995c25781"}, + {file = "numba-0.60.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = 
"sha256:fe0b28abb8d70f8160798f4de9d486143200f34458d34c4a214114e445d7124e"}, + {file = "numba-0.60.0-cp310-cp310-win_amd64.whl", hash = "sha256:19407ced081d7e2e4b8d8c36aa57b7452e0283871c296e12d798852bc7d7f198"}, + {file = "numba-0.60.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a17b70fc9e380ee29c42717e8cc0bfaa5556c416d94f9aa96ba13acb41bdece8"}, + {file = "numba-0.60.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:3fb02b344a2a80efa6f677aa5c40cd5dd452e1b35f8d1c2af0dfd9ada9978e4b"}, + {file = "numba-0.60.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:5f4fde652ea604ea3c86508a3fb31556a6157b2c76c8b51b1d45eb40c8598703"}, + {file = "numba-0.60.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:4142d7ac0210cc86432b818338a2bc368dc773a2f5cf1e32ff7c5b378bd63ee8"}, + {file = "numba-0.60.0-cp311-cp311-win_amd64.whl", hash = "sha256:cac02c041e9b5bc8cf8f2034ff6f0dbafccd1ae9590dc146b3a02a45e53af4e2"}, + {file = "numba-0.60.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:d7da4098db31182fc5ffe4bc42c6f24cd7d1cb8a14b59fd755bfee32e34b8404"}, + {file = "numba-0.60.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:38d6ea4c1f56417076ecf8fc327c831ae793282e0ff51080c5094cb726507b1c"}, + {file = "numba-0.60.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:62908d29fb6a3229c242e981ca27e32a6e606cc253fc9e8faeb0e48760de241e"}, + {file = "numba-0.60.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:0ebaa91538e996f708f1ab30ef4d3ddc344b64b5227b67a57aa74f401bb68b9d"}, + {file = "numba-0.60.0-cp312-cp312-win_amd64.whl", hash = "sha256:f75262e8fe7fa96db1dca93d53a194a38c46da28b112b8a4aca168f0df860347"}, + {file = "numba-0.60.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:01ef4cd7d83abe087d644eaa3d95831b777aa21d441a23703d649e06b8e06b74"}, + {file = "numba-0.60.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:819a3dfd4630d95fd574036f99e47212a1af41cbcb019bf8afac63ff56834449"}, + {file = "numba-0.60.0-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:0b983bd6ad82fe868493012487f34eae8bf7dd94654951404114f23c3466d34b"}, + {file = "numba-0.60.0-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:c151748cd269ddeab66334bd754817ffc0cabd9433acb0f551697e5151917d25"}, + {file = "numba-0.60.0-cp39-cp39-win_amd64.whl", hash = "sha256:3031547a015710140e8c87226b4cfe927cac199835e5bf7d4fe5cb64e814e3ab"}, + {file = "numba-0.60.0.tar.gz", hash = "sha256:5df6158e5584eece5fc83294b949fd30b9f1125df7708862205217e068aabf16"}, +] + +[package.dependencies] +llvmlite = "==0.43.*" +numpy = ">=1.22,<2.1" + +[[package]] +name = "numpy" +version = "1.26.4" +description = "Fundamental package for array computing in Python" +optional = false +python-versions = ">=3.9" +files = [ + {file = "numpy-1.26.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:9ff0f4f29c51e2803569d7a51c2304de5554655a60c5d776e35b4a41413830d0"}, + {file = "numpy-1.26.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2e4ee3380d6de9c9ec04745830fd9e2eccb3e6cf790d39d7b98ffd19b0dd754a"}, + {file = "numpy-1.26.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d209d8969599b27ad20994c8e41936ee0964e6da07478d6c35016bc386b66ad4"}, + {file = "numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ffa75af20b44f8dba823498024771d5ac50620e6915abac414251bd971b4529f"}, + {file = "numpy-1.26.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = 
"sha256:62b8e4b1e28009ef2846b4c7852046736bab361f7aeadeb6a5b89ebec3c7055a"}, + {file = "numpy-1.26.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a4abb4f9001ad2858e7ac189089c42178fcce737e4169dc61321660f1a96c7d2"}, + {file = "numpy-1.26.4-cp310-cp310-win32.whl", hash = "sha256:bfe25acf8b437eb2a8b2d49d443800a5f18508cd811fea3181723922a8a82b07"}, + {file = "numpy-1.26.4-cp310-cp310-win_amd64.whl", hash = "sha256:b97fe8060236edf3662adfc2c633f56a08ae30560c56310562cb4f95500022d5"}, + {file = "numpy-1.26.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4c66707fabe114439db9068ee468c26bbdf909cac0fb58686a42a24de1760c71"}, + {file = "numpy-1.26.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:edd8b5fe47dab091176d21bb6de568acdd906d1887a4584a15a9a96a1dca06ef"}, + {file = "numpy-1.26.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7ab55401287bfec946ced39700c053796e7cc0e3acbef09993a9ad2adba6ca6e"}, + {file = "numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:666dbfb6ec68962c033a450943ded891bed2d54e6755e35e5835d63f4f6931d5"}, + {file = "numpy-1.26.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:96ff0b2ad353d8f990b63294c8986f1ec3cb19d749234014f4e7eb0112ceba5a"}, + {file = "numpy-1.26.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:60dedbb91afcbfdc9bc0b1f3f402804070deed7392c23eb7a7f07fa857868e8a"}, + {file = "numpy-1.26.4-cp311-cp311-win32.whl", hash = "sha256:1af303d6b2210eb850fcf03064d364652b7120803a0b872f5211f5234b399f20"}, + {file = "numpy-1.26.4-cp311-cp311-win_amd64.whl", hash = "sha256:cd25bcecc4974d09257ffcd1f098ee778f7834c3ad767fe5db785be9a4aa9cb2"}, + {file = "numpy-1.26.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b3ce300f3644fb06443ee2222c2201dd3a89ea6040541412b8fa189341847218"}, + {file = "numpy-1.26.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:03a8c78d01d9781b28a6989f6fa1bb2c4f2d51201cf99d3dd875df6fbd96b23b"}, + {file = "numpy-1.26.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9fad7dcb1aac3c7f0584a5a8133e3a43eeb2fe127f47e3632d43d677c66c102b"}, + {file = "numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:675d61ffbfa78604709862923189bad94014bef562cc35cf61d3a07bba02a7ed"}, + {file = "numpy-1.26.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:ab47dbe5cc8210f55aa58e4805fe224dac469cde56b9f731a4c098b91917159a"}, + {file = "numpy-1.26.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:1dda2e7b4ec9dd512f84935c5f126c8bd8b9f2fc001e9f54af255e8c5f16b0e0"}, + {file = "numpy-1.26.4-cp312-cp312-win32.whl", hash = "sha256:50193e430acfc1346175fcbdaa28ffec49947a06918b7b92130744e81e640110"}, + {file = "numpy-1.26.4-cp312-cp312-win_amd64.whl", hash = "sha256:08beddf13648eb95f8d867350f6a018a4be2e5ad54c8d8caed89ebca558b2818"}, + {file = "numpy-1.26.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7349ab0fa0c429c82442a27a9673fc802ffdb7c7775fad780226cb234965e53c"}, + {file = "numpy-1.26.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:52b8b60467cd7dd1e9ed082188b4e6bb35aa5cdd01777621a1658910745b90be"}, + {file = "numpy-1.26.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d5241e0a80d808d70546c697135da2c613f30e28251ff8307eb72ba696945764"}, + {file = "numpy-1.26.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f870204a840a60da0b12273ef34f7051e98c3b5961b61b0c2c1be6dfd64fbcd3"}, + {file = "numpy-1.26.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = 
"sha256:679b0076f67ecc0138fd2ede3a8fd196dddc2ad3254069bcb9faf9a79b1cebcd"}, + {file = "numpy-1.26.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:47711010ad8555514b434df65f7d7b076bb8261df1ca9bb78f53d3b2db02e95c"}, + {file = "numpy-1.26.4-cp39-cp39-win32.whl", hash = "sha256:a354325ee03388678242a4d7ebcd08b5c727033fcff3b2f536aea978e15ee9e6"}, + {file = "numpy-1.26.4-cp39-cp39-win_amd64.whl", hash = "sha256:3373d5d70a5fe74a2c1bb6d2cfd9609ecf686d47a2d7b1d37a8f3b6bf6003aea"}, + {file = "numpy-1.26.4-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:afedb719a9dcfc7eaf2287b839d8198e06dcd4cb5d276a3df279231138e83d30"}, + {file = "numpy-1.26.4-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95a7476c59002f2f6c590b9b7b998306fba6a5aa646b1e22ddfeaf8f78c3a29c"}, + {file = "numpy-1.26.4-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:7e50d0a0cc3189f9cb0aeb3a6a6af18c16f59f004b866cd2be1c14b36134a4a0"}, + {file = "numpy-1.26.4.tar.gz", hash = "sha256:2a02aba9ed12e4ac4eb3ea9421c420301a0c6460d9830d74a9df87efa4912010"}, +] + +[[package]] +name = "nvidia-cublas-cu12" +version = "12.1.3.1" +description = "CUBLAS native runtime libraries" +optional = true +python-versions = ">=3" +files = [ + {file = "nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl", hash = "sha256:ee53ccca76a6fc08fb9701aa95b6ceb242cdaab118c3bb152af4e579af792728"}, + {file = "nvidia_cublas_cu12-12.1.3.1-py3-none-win_amd64.whl", hash = "sha256:2b964d60e8cf11b5e1073d179d85fa340c120e99b3067558f3cf98dd69d02906"}, +] + +[[package]] +name = "nvidia-cuda-cupti-cu12" +version = "12.1.105" +description = "CUDA profiling tools runtime libs." +optional = true +python-versions = ">=3" +files = [ + {file = "nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl", hash = "sha256:e54fde3983165c624cb79254ae9818a456eb6e87a7fd4d56a2352c24ee542d7e"}, + {file = "nvidia_cuda_cupti_cu12-12.1.105-py3-none-win_amd64.whl", hash = "sha256:bea8236d13a0ac7190bd2919c3e8e6ce1e402104276e6f9694479e48bb0eb2a4"}, +] + +[[package]] +name = "nvidia-cuda-nvrtc-cu12" +version = "12.1.105" +description = "NVRTC native runtime libraries" +optional = true +python-versions = ">=3" +files = [ + {file = "nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl", hash = "sha256:339b385f50c309763ca65456ec75e17bbefcbbf2893f462cb8b90584cd27a1c2"}, + {file = "nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-win_amd64.whl", hash = "sha256:0a98a522d9ff138b96c010a65e145dc1b4850e9ecb75a0172371793752fd46ed"}, +] + +[[package]] +name = "nvidia-cuda-runtime-cu12" +version = "12.1.105" +description = "CUDA Runtime native Libraries" +optional = true +python-versions = ">=3" +files = [ + {file = "nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl", hash = "sha256:6e258468ddf5796e25f1dc591a31029fa317d97a0a94ed93468fc86301d61e40"}, + {file = "nvidia_cuda_runtime_cu12-12.1.105-py3-none-win_amd64.whl", hash = "sha256:dfb46ef84d73fababab44cf03e3b83f80700d27ca300e537f85f636fac474344"}, +] + +[[package]] +name = "nvidia-cudnn-cu12" +version = "9.1.0.70" +description = "cuDNN runtime libraries" +optional = true +python-versions = ">=3" +files = [ + {file = "nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl", hash = "sha256:165764f44ef8c61fcdfdfdbe769d687e06374059fbb388b6c89ecb0e28793a6f"}, + {file = "nvidia_cudnn_cu12-9.1.0.70-py3-none-win_amd64.whl", hash = "sha256:6278562929433d68365a07a4a1546c237ba2849852c0d4b2262a486e805b977a"}, +] + +[package.dependencies] +nvidia-cublas-cu12 = "*" + +[[package]] +name = 
"nvidia-cufft-cu12" +version = "11.0.2.54" +description = "CUFFT native runtime libraries" +optional = true +python-versions = ">=3" +files = [ + {file = "nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl", hash = "sha256:794e3948a1aa71fd817c3775866943936774d1c14e7628c74f6f7417224cdf56"}, + {file = "nvidia_cufft_cu12-11.0.2.54-py3-none-win_amd64.whl", hash = "sha256:d9ac353f78ff89951da4af698f80870b1534ed69993f10a4cf1d96f21357e253"}, +] + +[[package]] +name = "nvidia-curand-cu12" +version = "10.3.2.106" +description = "CURAND native runtime libraries" +optional = true +python-versions = ">=3" +files = [ + {file = "nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl", hash = "sha256:9d264c5036dde4e64f1de8c50ae753237c12e0b1348738169cd0f8a536c0e1e0"}, + {file = "nvidia_curand_cu12-10.3.2.106-py3-none-win_amd64.whl", hash = "sha256:75b6b0c574c0037839121317e17fd01f8a69fd2ef8e25853d826fec30bdba74a"}, +] + +[[package]] +name = "nvidia-cusolver-cu12" +version = "11.4.5.107" +description = "CUDA solver native runtime libraries" +optional = true +python-versions = ">=3" +files = [ + {file = "nvidia_cusolver_cu12-11.4.5.107-py3-none-manylinux1_x86_64.whl", hash = "sha256:8a7ec542f0412294b15072fa7dab71d31334014a69f953004ea7a118206fe0dd"}, + {file = "nvidia_cusolver_cu12-11.4.5.107-py3-none-win_amd64.whl", hash = "sha256:74e0c3a24c78612192a74fcd90dd117f1cf21dea4822e66d89e8ea80e3cd2da5"}, +] + +[package.dependencies] +nvidia-cublas-cu12 = "*" +nvidia-cusparse-cu12 = "*" +nvidia-nvjitlink-cu12 = "*" + +[[package]] +name = "nvidia-cusparse-cu12" +version = "12.1.0.106" +description = "CUSPARSE native runtime libraries" +optional = true +python-versions = ">=3" +files = [ + {file = "nvidia_cusparse_cu12-12.1.0.106-py3-none-manylinux1_x86_64.whl", hash = "sha256:f3b50f42cf363f86ab21f720998517a659a48131e8d538dc02f8768237bd884c"}, + {file = "nvidia_cusparse_cu12-12.1.0.106-py3-none-win_amd64.whl", hash = "sha256:b798237e81b9719373e8fae8d4f091b70a0cf09d9d85c95a557e11df2d8e9a5a"}, +] + +[package.dependencies] +nvidia-nvjitlink-cu12 = "*" + +[[package]] +name = "nvidia-ml-py" +version = "12.560.30" +description = "Python Bindings for the NVIDIA Management Library" +optional = true +python-versions = "*" +files = [ + {file = "nvidia-ml-py-12.560.30.tar.gz", hash = "sha256:f0254dc7400647680a072ee02509bfd46102b60bdfeca321576d4d4817e7fe97"}, + {file = "nvidia_ml_py-12.560.30-py3-none-any.whl", hash = "sha256:fea371c94d63e38a611c17bbb85fe400e9c8ddb9e8684a9cd0e47786a4bc3c73"}, +] + +[[package]] +name = "nvidia-nccl-cu12" +version = "2.20.5" +description = "NVIDIA Collective Communication Library (NCCL) Runtime" +optional = true +python-versions = ">=3" +files = [ + {file = "nvidia_nccl_cu12-2.20.5-py3-none-manylinux2014_aarch64.whl", hash = "sha256:1fc150d5c3250b170b29410ba682384b14581db722b2531b0d8d33c595f33d01"}, + {file = "nvidia_nccl_cu12-2.20.5-py3-none-manylinux2014_x86_64.whl", hash = "sha256:057f6bf9685f75215d0c53bf3ac4a10b3e6578351de307abad9e18a99182af56"}, +] + +[[package]] +name = "nvidia-nvjitlink-cu12" +version = "12.6.77" +description = "Nvidia JIT LTO Library" +optional = true +python-versions = ">=3" +files = [ + {file = "nvidia_nvjitlink_cu12-12.6.77-py3-none-manylinux2014_aarch64.whl", hash = "sha256:3bf10d85bb1801e9c894c6e197e44dd137d2a0a9e43f8450e9ad13f2df0dd52d"}, + {file = "nvidia_nvjitlink_cu12-12.6.77-py3-none-manylinux2014_x86_64.whl", hash = "sha256:9ae346d16203ae4ea513be416495167a0101d33d2d14935aa9c1829a3fb45142"}, + {file = 
"nvidia_nvjitlink_cu12-12.6.77-py3-none-win_amd64.whl", hash = "sha256:410718cd44962bed862a31dd0318620f6f9a8b28a6291967bcfcb446a6516771"}, +] + +[[package]] +name = "nvidia-nvtx-cu12" +version = "12.1.105" +description = "NVIDIA Tools Extension" +optional = true +python-versions = ">=3" +files = [ + {file = "nvidia_nvtx_cu12-12.1.105-py3-none-manylinux1_x86_64.whl", hash = "sha256:dc21cf308ca5691e7c04d962e213f8a4aa9bbfa23d95412f452254c2caeb09e5"}, + {file = "nvidia_nvtx_cu12-12.1.105-py3-none-win_amd64.whl", hash = "sha256:65f4d98982b31b60026e0e6de73fbdfc09d08a96f4656dd3665ca616a11e1e82"}, +] + +[[package]] +name = "opentelemetry-api" +version = "1.25.0" +description = "OpenTelemetry Python API" +optional = false +python-versions = ">=3.8" +files = [ + {file = "opentelemetry_api-1.25.0-py3-none-any.whl", hash = "sha256:757fa1aa020a0f8fa139f8959e53dec2051cc26b832e76fa839a6d76ecefd737"}, + {file = "opentelemetry_api-1.25.0.tar.gz", hash = "sha256:77c4985f62f2614e42ce77ee4c9da5fa5f0bc1e1821085e9a47533a9323ae869"}, +] + +[package.dependencies] +deprecated = ">=1.2.6" +importlib-metadata = ">=6.0,<=7.1" + +[[package]] +name = "opentelemetry-exporter-otlp" +version = "1.25.0" +description = "OpenTelemetry Collector Exporters" +optional = false +python-versions = ">=3.8" +files = [ + {file = "opentelemetry_exporter_otlp-1.25.0-py3-none-any.whl", hash = "sha256:d67a831757014a3bc3174e4cd629ae1493b7ba8d189e8a007003cacb9f1a6b60"}, + {file = "opentelemetry_exporter_otlp-1.25.0.tar.gz", hash = "sha256:ce03199c1680a845f82e12c0a6a8f61036048c07ec7a0bd943142aca8fa6ced0"}, +] + +[package.dependencies] +opentelemetry-exporter-otlp-proto-grpc = "1.25.0" +opentelemetry-exporter-otlp-proto-http = "1.25.0" + +[[package]] +name = "opentelemetry-exporter-otlp-proto-common" +version = "1.25.0" +description = "OpenTelemetry Protobuf encoding" +optional = false +python-versions = ">=3.8" +files = [ + {file = "opentelemetry_exporter_otlp_proto_common-1.25.0-py3-none-any.whl", hash = "sha256:15637b7d580c2675f70246563363775b4e6de947871e01d0f4e3881d1848d693"}, + {file = "opentelemetry_exporter_otlp_proto_common-1.25.0.tar.gz", hash = "sha256:c93f4e30da4eee02bacd1e004eb82ce4da143a2f8e15b987a9f603e0a85407d3"}, +] + +[package.dependencies] +opentelemetry-proto = "1.25.0" + +[[package]] +name = "opentelemetry-exporter-otlp-proto-grpc" +version = "1.25.0" +description = "OpenTelemetry Collector Protobuf over gRPC Exporter" +optional = false +python-versions = ">=3.8" +files = [ + {file = "opentelemetry_exporter_otlp_proto_grpc-1.25.0-py3-none-any.whl", hash = "sha256:3131028f0c0a155a64c430ca600fd658e8e37043cb13209f0109db5c1a3e4eb4"}, + {file = "opentelemetry_exporter_otlp_proto_grpc-1.25.0.tar.gz", hash = "sha256:c0b1661415acec5af87625587efa1ccab68b873745ca0ee96b69bb1042087eac"}, +] + +[package.dependencies] +deprecated = ">=1.2.6" +googleapis-common-protos = ">=1.52,<2.0" +grpcio = ">=1.0.0,<2.0.0" +opentelemetry-api = ">=1.15,<2.0" +opentelemetry-exporter-otlp-proto-common = "1.25.0" +opentelemetry-proto = "1.25.0" +opentelemetry-sdk = ">=1.25.0,<1.26.0" + +[[package]] +name = "opentelemetry-exporter-otlp-proto-http" +version = "1.25.0" +description = "OpenTelemetry Collector Protobuf over HTTP Exporter" +optional = false +python-versions = ">=3.8" +files = [ + {file = "opentelemetry_exporter_otlp_proto_http-1.25.0-py3-none-any.whl", hash = "sha256:2eca686ee11b27acd28198b3ea5e5863a53d1266b91cda47c839d95d5e0541a6"}, + {file = "opentelemetry_exporter_otlp_proto_http-1.25.0.tar.gz", hash = 
"sha256:9f8723859e37c75183ea7afa73a3542f01d0fd274a5b97487ea24cb683d7d684"}, +] + +[package.dependencies] +deprecated = ">=1.2.6" +googleapis-common-protos = ">=1.52,<2.0" +opentelemetry-api = ">=1.15,<2.0" +opentelemetry-exporter-otlp-proto-common = "1.25.0" +opentelemetry-proto = "1.25.0" +opentelemetry-sdk = ">=1.25.0,<1.26.0" +requests = ">=2.7,<3.0" + +[[package]] +name = "opentelemetry-instrumentation" +version = "0.46b0" +description = "Instrumentation Tools & Auto Instrumentation for OpenTelemetry Python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "opentelemetry_instrumentation-0.46b0-py3-none-any.whl", hash = "sha256:89cd721b9c18c014ca848ccd11181e6b3fd3f6c7669e35d59c48dc527408c18b"}, + {file = "opentelemetry_instrumentation-0.46b0.tar.gz", hash = "sha256:974e0888fb2a1e01c38fbacc9483d024bb1132aad92d6d24e2e5543887a7adda"}, +] + +[package.dependencies] +opentelemetry-api = ">=1.4,<2.0" +setuptools = ">=16.0" +wrapt = ">=1.0.0,<2.0.0" + +[[package]] +name = "opentelemetry-instrumentation-grpc" +version = "0.46b0" +description = "OpenTelemetry gRPC instrumentation" +optional = false +python-versions = ">=3.8" +files = [ + {file = "opentelemetry_instrumentation_grpc-0.46b0-py3-none-any.whl", hash = "sha256:cccfb28db07c28849709f2dcf330237fae0fca9f86971bfce27b28bb9a8b0577"}, + {file = "opentelemetry_instrumentation_grpc-0.46b0.tar.gz", hash = "sha256:9c5738592cf82672805099826b676d352324b54e03f9ac72a1368ba0605d6ff9"}, +] + +[package.dependencies] +opentelemetry-api = ">=1.12,<2.0" +opentelemetry-instrumentation = "0.46b0" +opentelemetry-semantic-conventions = "0.46b0" +wrapt = ">=1.0.0,<2.0.0" + +[package.extras] +instruments = ["grpcio (>=1.27,<2.0)"] + +[[package]] +name = "opentelemetry-proto" +version = "1.25.0" +description = "OpenTelemetry Python Proto" +optional = false +python-versions = ">=3.8" +files = [ + {file = "opentelemetry_proto-1.25.0-py3-none-any.whl", hash = "sha256:f07e3341c78d835d9b86665903b199893befa5e98866f63d22b00d0b7ca4972f"}, + {file = "opentelemetry_proto-1.25.0.tar.gz", hash = "sha256:35b6ef9dc4a9f7853ecc5006738ad40443701e52c26099e197895cbda8b815a3"}, +] + +[package.dependencies] +protobuf = ">=3.19,<5.0" + +[[package]] +name = "opentelemetry-sdk" +version = "1.25.0" +description = "OpenTelemetry Python SDK" +optional = false +python-versions = ">=3.8" +files = [ + {file = "opentelemetry_sdk-1.25.0-py3-none-any.whl", hash = "sha256:d97ff7ec4b351692e9d5a15af570c693b8715ad78b8aafbec5c7100fe966b4c9"}, + {file = "opentelemetry_sdk-1.25.0.tar.gz", hash = "sha256:ce7fc319c57707ef5bf8b74fb9f8ebdb8bfafbe11898410e0d2a761d08a98ec7"}, +] + +[package.dependencies] +opentelemetry-api = "1.25.0" +opentelemetry-semantic-conventions = "0.46b0" +typing-extensions = ">=3.7.4" + +[[package]] +name = "opentelemetry-semantic-conventions" +version = "0.46b0" +description = "OpenTelemetry Semantic Conventions" +optional = false +python-versions = ">=3.7" +files = [ + {file = "opentelemetry_semantic_conventions-0.36b0-py3-none-any.whl", hash = "sha256:adc05635e87b9d3e007c9f530eed487fc3ef2177d02f82f674f28ebf9aff8243"}, + {file = "opentelemetry_semantic_conventions-0.36b0.tar.gz", hash = "sha256:829dc221795467d98b773c04096e29be038d77526dc8d6ac76f546fb6279bf01"}, +] + +[[package]] +name = "optimum" +version = "1.23.2" +description = "Optimum Library is an extension of the Hugging Face Transformers library, providing a framework to integrate third-party libraries from Hardware Partners and interface with their specific functionality." 
+optional = false +python-versions = ">=3.7.0" +files = [ + {file = "optimum-1.23.2-py3-none-any.whl", hash = "sha256:1b6f450b2c0203377b76cf4dd1e05e45b116d8069612fe2ad73df8b5743d5a57"}, + {file = "optimum-1.23.2.tar.gz", hash = "sha256:245415add6e621c6c5b7c29eee968bc325fc1a982d9c42eb83d97b7ec844ef39"}, +] + +[package.dependencies] +coloredlogs = "*" +datasets = "*" +huggingface-hub = ">=0.8.0" +numpy = "*" +packaging = "*" +sympy = "*" +torch = ">=1.11" +transformers = {version = ">=4.29", extras = ["sentencepiece"]} + +[package.extras] +amd = ["optimum-amd"] +benchmark = ["evaluate (>=0.2.0)", "optuna", "scikit-learn", "seqeval", "torchvision", "tqdm"] +dev = ["Pillow", "accelerate", "black (>=23.1,<24.0)", "diffusers (>=0.17.0)", "einops", "invisible-watermark", "parameterized", "pytest (<=8.0.0)", "pytest-xdist", "requests", "rjieba", "ruff (==0.1.5)", "sacremoses", "scikit-learn", "timm", "torchaudio", "torchvision"] +diffusers = ["diffusers"] +doc-build = ["accelerate"] +exporters = ["onnx", "onnxruntime", "timm", "transformers (<4.46.0)"] +exporters-gpu = ["onnx", "onnxruntime-gpu", "timm", "transformers (<4.46.0)"] +exporters-tf = ["datasets (<=2.16)", "h5py", "numpy (<1.24.0)", "onnx", "onnxruntime", "tensorflow (>=2.4,<=2.12.1)", "tf2onnx", "timm", "transformers[sentencepiece] (>=4.26,<4.38)"] +furiosa = ["optimum-furiosa"] +graphcore = ["optimum-graphcore"] +habana = ["optimum-habana", "transformers (>=4.45.0,<4.46.0)"] +intel = ["optimum-intel (>=1.18.0)"] +ipex = ["optimum-intel[ipex] (>=1.18.0)"] +neural-compressor = ["optimum-intel[neural-compressor] (>=1.18.0)"] +neuron = ["optimum-neuron[neuron] (>=0.0.20)", "transformers (>=4.36.2,<4.42.0)"] +neuronx = ["optimum-neuron[neuronx] (>=0.0.20)", "transformers (>=4.36.2,<4.42.0)"] +nncf = ["optimum-intel[nncf] (>=1.18.0)"] +onnxruntime = ["datasets (>=1.2.1)", "evaluate", "onnx", "onnxruntime (>=1.11.0)", "protobuf (>=3.20.1)", "transformers (<4.46.0)"] +onnxruntime-gpu = ["accelerate", "datasets (>=1.2.1)", "evaluate", "onnx", "onnxruntime-gpu (>=1.11.0)", "protobuf (>=3.20.1)", "transformers (<4.46.0)"] +openvino = ["optimum-intel[openvino] (>=1.18.0)"] +quality = ["black (>=23.1,<24.0)", "ruff (==0.1.5)"] +quanto = ["optimum-quanto (>=0.2.4)"] +tests = ["Pillow", "accelerate", "diffusers (>=0.17.0)", "einops", "invisible-watermark", "parameterized", "pytest (<=8.0.0)", "pytest-xdist", "requests", "rjieba", "sacremoses", "scikit-learn", "timm", "torchaudio", "torchvision"] + +[[package]] +name = "optimum-habana" +version = "1.14.1" +description = "Optimum Habana is the interface between the Hugging Face Transformers and Diffusers libraries and Habana's Gaudi processor (HPU). It provides a set of tools enabling easy model loading, training and inference on single- and multi-HPU settings for different downstream tasks." 
+optional = false +python-versions = "*" +files = [ + {file = "optimum_habana-1.14.1-py3-none-any.whl", hash = "sha256:557dd06a5fa2597b1487384a354c3a1618390c35e6b088f5d7a80c0611fe2bfd"}, + {file = "optimum_habana-1.14.1.tar.gz", hash = "sha256:627626860b82452b75f73087df3e14004278ed30dacf767c100f1324675c8120"}, +] + +[package.dependencies] +accelerate = ">=0.33.0,<0.34.0" +diffusers = "0.29.2" +huggingface-hub = ">=0.24.7" +optimum = "*" +sentence-transformers = {version = "3.0.1", extras = ["train"]} +torch = "*" +transformers = ">=4.45.2,<4.46.0" + +[package.extras] +quality = ["hf-doc-builder", "ruff"] +tests = ["GitPython", "datasets", "optuna", "parameterized", "peft", "psutil", "pytest (<8.0.0)", "safetensors", "scipy", "sentencepiece", "timm", "torchsde"] + +[[package]] +name = "outlines" +version = "0.0.34" +description = "Probabilistic Generative Model Programming" +optional = true +python-versions = ">=3.8" +files = [ + {file = "outlines-0.0.34-py3-none-any.whl", hash = "sha256:911588a7e64a4f193b97fb4c501d98ccfd4e95a98f6a3ada67a280bf0c373c50"}, + {file = "outlines-0.0.34.tar.gz", hash = "sha256:594e7204c770b47a62eb5c2ba7d25ea0ab2e16882b5f04556712a0228d3d3309"}, +] + +[package.dependencies] +cloudpickle = "*" +diskcache = "*" +interegular = "*" +jinja2 = "*" +joblib = "*" +jsonschema = "*" +lark = "*" +nest-asyncio = "*" +numba = "*" +numpy = "*" +pydantic = ">=2.0" +referencing = "*" +requests = "*" +scipy = "*" +torch = ">=2.1.0" +transformers = "*" + +[package.extras] +serve = ["fastapi", "pydantic (>=2.0)", "ray (==2.9.0)", "uvicorn", "vllm (>=0.3.0)"] +test = ["accelerate", "beartype (<0.16.0)", "coverage[toml] (>=5.1)", "datasets", "diff-cover", "huggingface-hub", "llama-cpp-python (>=0.2.42)", "pre-commit", "pytest", "pytest-benchmark", "pytest-cov", "pytest-mock", "responses", "transformers"] + +[[package]] +name = "packaging" +version = "24.1" +description = "Core utilities for Python packages" +optional = false +python-versions = ">=3.8" +files = [ + {file = "packaging-24.1-py3-none-any.whl", hash = "sha256:5b8f2217dbdbd2f7f384c41c628544e6d52f2d0f53c6d0c3ea61aa5d1d7ff124"}, + {file = "packaging-24.1.tar.gz", hash = "sha256:026ed72c8ed3fcce5bf8950572258698927fd1dbda10a5e981cdf0ac37f4f002"}, +] + +[[package]] +name = "pandas" +version = "2.2.3" +description = "Powerful data structures for data analysis, time series, and statistics" +optional = true +python-versions = ">=3.9" +files = [ + {file = "pandas-2.2.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1948ddde24197a0f7add2bdc4ca83bf2b1ef84a1bc8ccffd95eda17fd836ecb5"}, + {file = "pandas-2.2.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:381175499d3802cde0eabbaf6324cce0c4f5d52ca6f8c377c29ad442f50f6348"}, + {file = "pandas-2.2.3-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d9c45366def9a3dd85a6454c0e7908f2b3b8e9c138f5dc38fed7ce720d8453ed"}, + {file = "pandas-2.2.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:86976a1c5b25ae3f8ccae3a5306e443569ee3c3faf444dfd0f41cda24667ad57"}, + {file = "pandas-2.2.3-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:b8661b0238a69d7aafe156b7fa86c44b881387509653fdf857bebc5e4008ad42"}, + {file = "pandas-2.2.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:37e0aced3e8f539eccf2e099f65cdb9c8aa85109b0be6e93e2baff94264bdc6f"}, + {file = "pandas-2.2.3-cp310-cp310-win_amd64.whl", hash = "sha256:56534ce0746a58afaf7942ba4863e0ef81c9c50d3f0ae93e9497d6a41a057645"}, + {file = 
"pandas-2.2.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:66108071e1b935240e74525006034333f98bcdb87ea116de573a6a0dccb6c039"}, + {file = "pandas-2.2.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7c2875855b0ff77b2a64a0365e24455d9990730d6431b9e0ee18ad8acee13dbd"}, + {file = "pandas-2.2.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cd8d0c3be0515c12fed0bdbae072551c8b54b7192c7b1fda0ba56059a0179698"}, + {file = "pandas-2.2.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c124333816c3a9b03fbeef3a9f230ba9a737e9e5bb4060aa2107a86cc0a497fc"}, + {file = "pandas-2.2.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:63cc132e40a2e084cf01adf0775b15ac515ba905d7dcca47e9a251819c575ef3"}, + {file = "pandas-2.2.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:29401dbfa9ad77319367d36940cd8a0b3a11aba16063e39632d98b0e931ddf32"}, + {file = "pandas-2.2.3-cp311-cp311-win_amd64.whl", hash = "sha256:3fc6873a41186404dad67245896a6e440baacc92f5b716ccd1bc9ed2995ab2c5"}, + {file = "pandas-2.2.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b1d432e8d08679a40e2a6d8b2f9770a5c21793a6f9f47fdd52c5ce1948a5a8a9"}, + {file = "pandas-2.2.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:a5a1595fe639f5988ba6a8e5bc9649af3baf26df3998a0abe56c02609392e0a4"}, + {file = "pandas-2.2.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:5de54125a92bb4d1c051c0659e6fcb75256bf799a732a87184e5ea503965bce3"}, + {file = "pandas-2.2.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fffb8ae78d8af97f849404f21411c95062db1496aeb3e56f146f0355c9989319"}, + {file = "pandas-2.2.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6dfcb5ee8d4d50c06a51c2fffa6cff6272098ad6540aed1a76d15fb9318194d8"}, + {file = "pandas-2.2.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:062309c1b9ea12a50e8ce661145c6aab431b1e99530d3cd60640e255778bd43a"}, + {file = "pandas-2.2.3-cp312-cp312-win_amd64.whl", hash = "sha256:59ef3764d0fe818125a5097d2ae867ca3fa64df032331b7e0917cf5d7bf66b13"}, + {file = "pandas-2.2.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:f00d1345d84d8c86a63e476bb4955e46458b304b9575dcf71102b5c705320015"}, + {file = "pandas-2.2.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:3508d914817e153ad359d7e069d752cdd736a247c322d932eb89e6bc84217f28"}, + {file = "pandas-2.2.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:22a9d949bfc9a502d320aa04e5d02feab689d61da4e7764b62c30b991c42c5f0"}, + {file = "pandas-2.2.3-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f3a255b2c19987fbbe62a9dfd6cff7ff2aa9ccab3fc75218fd4b7530f01efa24"}, + {file = "pandas-2.2.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:800250ecdadb6d9c78eae4990da62743b857b470883fa27f652db8bdde7f6659"}, + {file = "pandas-2.2.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:6374c452ff3ec675a8f46fd9ab25c4ad0ba590b71cf0656f8b6daa5202bca3fb"}, + {file = "pandas-2.2.3-cp313-cp313-win_amd64.whl", hash = "sha256:61c5ad4043f791b61dd4752191d9f07f0ae412515d59ba8f005832a532f8736d"}, + {file = "pandas-2.2.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:3b71f27954685ee685317063bf13c7709a7ba74fc996b84fc6821c59b0f06468"}, + {file = "pandas-2.2.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:38cf8125c40dae9d5acc10fa66af8ea6fdf760b2714ee482ca691fc66e6fcb18"}, + {file = "pandas-2.2.3-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = 
"sha256:ba96630bc17c875161df3818780af30e43be9b166ce51c9a18c1feae342906c2"}, + {file = "pandas-2.2.3-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1db71525a1538b30142094edb9adc10be3f3e176748cd7acc2240c2f2e5aa3a4"}, + {file = "pandas-2.2.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:15c0e1e02e93116177d29ff83e8b1619c93ddc9c49083f237d4312337a61165d"}, + {file = "pandas-2.2.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:ad5b65698ab28ed8d7f18790a0dc58005c7629f227be9ecc1072aa74c0c1d43a"}, + {file = "pandas-2.2.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:bc6b93f9b966093cb0fd62ff1a7e4c09e6d546ad7c1de191767baffc57628f39"}, + {file = "pandas-2.2.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:5dbca4c1acd72e8eeef4753eeca07de9b1db4f398669d5994086f788a5d7cc30"}, + {file = "pandas-2.2.3-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:8cd6d7cc958a3910f934ea8dbdf17b2364827bb4dafc38ce6eef6bb3d65ff09c"}, + {file = "pandas-2.2.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:99df71520d25fade9db7c1076ac94eb994f4d2673ef2aa2e86ee039b6746d20c"}, + {file = "pandas-2.2.3-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:31d0ced62d4ea3e231a9f228366919a5ea0b07440d9d4dac345376fd8e1477ea"}, + {file = "pandas-2.2.3-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:7eee9e7cea6adf3e3d24e304ac6b8300646e2a5d1cd3a3c2abed9101b0846761"}, + {file = "pandas-2.2.3-cp39-cp39-win_amd64.whl", hash = "sha256:4850ba03528b6dd51d6c5d273c46f183f39a9baf3f0143e566b89450965b105e"}, + {file = "pandas-2.2.3.tar.gz", hash = "sha256:4f18ba62b61d7e192368b84517265a99b4d7ee8912f8708660fb4a366cc82667"}, +] + +[package.dependencies] +numpy = [ + {version = ">=1.22.4", markers = "python_version < \"3.11\""}, + {version = ">=1.23.2", markers = "python_version == \"3.11\""}, + {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, +] +python-dateutil = ">=2.8.2" +pytz = ">=2020.1" +tzdata = ">=2022.7" + +[package.extras] +all = ["PyQt5 (>=5.15.9)", "SQLAlchemy (>=2.0.0)", "adbc-driver-postgresql (>=0.8.0)", "adbc-driver-sqlite (>=0.8.0)", "beautifulsoup4 (>=4.11.2)", "bottleneck (>=1.3.6)", "dataframe-api-compat (>=0.1.7)", "fastparquet (>=2022.12.0)", "fsspec (>=2022.11.0)", "gcsfs (>=2022.11.0)", "html5lib (>=1.1)", "hypothesis (>=6.46.1)", "jinja2 (>=3.1.2)", "lxml (>=4.9.2)", "matplotlib (>=3.6.3)", "numba (>=0.56.4)", "numexpr (>=2.8.4)", "odfpy (>=1.4.1)", "openpyxl (>=3.1.0)", "pandas-gbq (>=0.19.0)", "psycopg2 (>=2.9.6)", "pyarrow (>=10.0.1)", "pymysql (>=1.0.2)", "pyreadstat (>=1.2.0)", "pytest (>=7.3.2)", "pytest-xdist (>=2.2.0)", "python-calamine (>=0.1.7)", "pyxlsb (>=1.0.10)", "qtpy (>=2.3.0)", "s3fs (>=2022.11.0)", "scipy (>=1.10.0)", "tables (>=3.8.0)", "tabulate (>=0.9.0)", "xarray (>=2022.12.0)", "xlrd (>=2.0.1)", "xlsxwriter (>=3.0.5)", "zstandard (>=0.19.0)"] +aws = ["s3fs (>=2022.11.0)"] +clipboard = ["PyQt5 (>=5.15.9)", "qtpy (>=2.3.0)"] +compression = ["zstandard (>=0.19.0)"] +computation = ["scipy (>=1.10.0)", "xarray (>=2022.12.0)"] +consortium-standard = ["dataframe-api-compat (>=0.1.7)"] +excel = ["odfpy (>=1.4.1)", "openpyxl (>=3.1.0)", "python-calamine (>=0.1.7)", "pyxlsb (>=1.0.10)", "xlrd (>=2.0.1)", "xlsxwriter (>=3.0.5)"] +feather = ["pyarrow (>=10.0.1)"] +fss = ["fsspec (>=2022.11.0)"] +gcp = ["gcsfs (>=2022.11.0)", "pandas-gbq (>=0.19.0)"] +hdf5 = ["tables (>=3.8.0)"] +html = ["beautifulsoup4 (>=4.11.2)", "html5lib (>=1.1)", "lxml (>=4.9.2)"] +mysql = ["SQLAlchemy (>=2.0.0)", "pymysql 
(>=1.0.2)"] +output-formatting = ["jinja2 (>=3.1.2)", "tabulate (>=0.9.0)"] +parquet = ["pyarrow (>=10.0.1)"] +performance = ["bottleneck (>=1.3.6)", "numba (>=0.56.4)", "numexpr (>=2.8.4)"] +plot = ["matplotlib (>=3.6.3)"] +postgresql = ["SQLAlchemy (>=2.0.0)", "adbc-driver-postgresql (>=0.8.0)", "psycopg2 (>=2.9.6)"] +pyarrow = ["pyarrow (>=10.0.1)"] +spss = ["pyreadstat (>=1.2.0)"] +sql-other = ["SQLAlchemy (>=2.0.0)", "adbc-driver-postgresql (>=0.8.0)", "adbc-driver-sqlite (>=0.8.0)"] +test = ["hypothesis (>=6.46.1)", "pytest (>=7.3.2)", "pytest-xdist (>=2.2.0)"] +xml = ["lxml (>=4.9.2)"] + +[[package]] +name = "peft" +version = "0.10.0" +description = "Parameter-Efficient Fine-Tuning (PEFT)" +optional = true +python-versions = ">=3.8.0" +files = [ + {file = "peft-0.10.0-py3-none-any.whl", hash = "sha256:d5249c97e818d3e31f92553c73c2953acd0ec12649b8b749afff7152cbc86cbb"}, + {file = "peft-0.10.0.tar.gz", hash = "sha256:36a7628c15f88d37abb26cfc74c22468f9037ee02e9c9b65de943cfe7c672049"}, +] + +[package.dependencies] +accelerate = ">=0.21.0" +huggingface-hub = ">=0.17.0" +numpy = ">=1.17" +packaging = ">=20.0" +psutil = "*" +pyyaml = "*" +safetensors = "*" +torch = ">=1.13.0" +tqdm = "*" +transformers = "*" + +[package.extras] +dev = ["black", "hf-doc-builder", "ruff (>=0.2.1,<0.3.0)"] +docs-specific = ["black", "hf-doc-builder"] +quality = ["black", "hf-doc-builder", "ruff (>=0.2.1,<0.3.0)"] +test = ["black", "datasets", "diffusers (<0.21.0)", "hf-doc-builder", "parameterized", "pytest", "pytest-cov", "pytest-xdist", "ruff (>=0.2.1,<0.3.0)", "scipy"] + +[[package]] +name = "pillow" +version = "11.0.0" +description = "Python Imaging Library (Fork)" +optional = false +python-versions = ">=3.9" +files = [ + {file = "pillow-11.0.0-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:6619654954dc4936fcff82db8eb6401d3159ec6be81e33c6000dfd76ae189947"}, + {file = "pillow-11.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:b3c5ac4bed7519088103d9450a1107f76308ecf91d6dabc8a33a2fcfb18d0fba"}, + {file = "pillow-11.0.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a65149d8ada1055029fcb665452b2814fe7d7082fcb0c5bed6db851cb69b2086"}, + {file = "pillow-11.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:88a58d8ac0cc0e7f3a014509f0455248a76629ca9b604eca7dc5927cc593c5e9"}, + {file = "pillow-11.0.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:c26845094b1af3c91852745ae78e3ea47abf3dbcd1cf962f16b9a5fbe3ee8488"}, + {file = "pillow-11.0.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:1a61b54f87ab5786b8479f81c4b11f4d61702830354520837f8cc791ebba0f5f"}, + {file = "pillow-11.0.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:674629ff60030d144b7bca2b8330225a9b11c482ed408813924619c6f302fdbb"}, + {file = "pillow-11.0.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:598b4e238f13276e0008299bd2482003f48158e2b11826862b1eb2ad7c768b97"}, + {file = "pillow-11.0.0-cp310-cp310-win32.whl", hash = "sha256:9a0f748eaa434a41fccf8e1ee7a3eed68af1b690e75328fd7a60af123c193b50"}, + {file = "pillow-11.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:a5629742881bcbc1f42e840af185fd4d83a5edeb96475a575f4da50d6ede337c"}, + {file = "pillow-11.0.0-cp310-cp310-win_arm64.whl", hash = "sha256:ee217c198f2e41f184f3869f3e485557296d505b5195c513b2bfe0062dc537f1"}, + {file = "pillow-11.0.0-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:1c1d72714f429a521d8d2d018badc42414c3077eb187a59579f28e4270b4b0fc"}, + {file = 
"pillow-11.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:499c3a1b0d6fc8213519e193796eb1a86a1be4b1877d678b30f83fd979811d1a"}, + {file = "pillow-11.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c8b2351c85d855293a299038e1f89db92a2f35e8d2f783489c6f0b2b5f3fe8a3"}, + {file = "pillow-11.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6f4dba50cfa56f910241eb7f883c20f1e7b1d8f7d91c750cd0b318bad443f4d5"}, + {file = "pillow-11.0.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:5ddbfd761ee00c12ee1be86c9c0683ecf5bb14c9772ddbd782085779a63dd55b"}, + {file = "pillow-11.0.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:45c566eb10b8967d71bf1ab8e4a525e5a93519e29ea071459ce517f6b903d7fa"}, + {file = "pillow-11.0.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:b4fd7bd29610a83a8c9b564d457cf5bd92b4e11e79a4ee4716a63c959699b306"}, + {file = "pillow-11.0.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:cb929ca942d0ec4fac404cbf520ee6cac37bf35be479b970c4ffadf2b6a1cad9"}, + {file = "pillow-11.0.0-cp311-cp311-win32.whl", hash = "sha256:006bcdd307cc47ba43e924099a038cbf9591062e6c50e570819743f5607404f5"}, + {file = "pillow-11.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:52a2d8323a465f84faaba5236567d212c3668f2ab53e1c74c15583cf507a0291"}, + {file = "pillow-11.0.0-cp311-cp311-win_arm64.whl", hash = "sha256:16095692a253047fe3ec028e951fa4221a1f3ed3d80c397e83541a3037ff67c9"}, + {file = "pillow-11.0.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:d2c0a187a92a1cb5ef2c8ed5412dd8d4334272617f532d4ad4de31e0495bd923"}, + {file = "pillow-11.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:084a07ef0821cfe4858fe86652fffac8e187b6ae677e9906e192aafcc1b69903"}, + {file = "pillow-11.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8069c5179902dcdce0be9bfc8235347fdbac249d23bd90514b7a47a72d9fecf4"}, + {file = "pillow-11.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f02541ef64077f22bf4924f225c0fd1248c168f86e4b7abdedd87d6ebaceab0f"}, + {file = "pillow-11.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:fcb4621042ac4b7865c179bb972ed0da0218a076dc1820ffc48b1d74c1e37fe9"}, + {file = "pillow-11.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:00177a63030d612148e659b55ba99527803288cea7c75fb05766ab7981a8c1b7"}, + {file = "pillow-11.0.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8853a3bf12afddfdf15f57c4b02d7ded92c7a75a5d7331d19f4f9572a89c17e6"}, + {file = "pillow-11.0.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:3107c66e43bda25359d5ef446f59c497de2b5ed4c7fdba0894f8d6cf3822dafc"}, + {file = "pillow-11.0.0-cp312-cp312-win32.whl", hash = "sha256:86510e3f5eca0ab87429dd77fafc04693195eec7fd6a137c389c3eeb4cfb77c6"}, + {file = "pillow-11.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:8ec4a89295cd6cd4d1058a5e6aec6bf51e0eaaf9714774e1bfac7cfc9051db47"}, + {file = "pillow-11.0.0-cp312-cp312-win_arm64.whl", hash = "sha256:27a7860107500d813fcd203b4ea19b04babe79448268403172782754870dac25"}, + {file = "pillow-11.0.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:bcd1fb5bb7b07f64c15618c89efcc2cfa3e95f0e3bcdbaf4642509de1942a699"}, + {file = "pillow-11.0.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:0e038b0745997c7dcaae350d35859c9715c71e92ffb7e0f4a8e8a16732150f38"}, + {file = "pillow-11.0.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:0ae08bd8ffc41aebf578c2af2f9d8749d91f448b3bfd41d7d9ff573d74f2a6b2"}, + {file = "pillow-11.0.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d69bfd8ec3219ae71bcde1f942b728903cad25fafe3100ba2258b973bd2bc1b2"}, + {file = "pillow-11.0.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:61b887f9ddba63ddf62fd02a3ba7add935d053b6dd7d58998c630e6dbade8527"}, + {file = "pillow-11.0.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:c6a660307ca9d4867caa8d9ca2c2658ab685de83792d1876274991adec7b93fa"}, + {file = "pillow-11.0.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:73e3a0200cdda995c7e43dd47436c1548f87a30bb27fb871f352a22ab8dcf45f"}, + {file = "pillow-11.0.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:fba162b8872d30fea8c52b258a542c5dfd7b235fb5cb352240c8d63b414013eb"}, + {file = "pillow-11.0.0-cp313-cp313-win32.whl", hash = "sha256:f1b82c27e89fffc6da125d5eb0ca6e68017faf5efc078128cfaa42cf5cb38798"}, + {file = "pillow-11.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:8ba470552b48e5835f1d23ecb936bb7f71d206f9dfeee64245f30c3270b994de"}, + {file = "pillow-11.0.0-cp313-cp313-win_arm64.whl", hash = "sha256:846e193e103b41e984ac921b335df59195356ce3f71dcfd155aa79c603873b84"}, + {file = "pillow-11.0.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:4ad70c4214f67d7466bea6a08061eba35c01b1b89eaa098040a35272a8efb22b"}, + {file = "pillow-11.0.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:6ec0d5af64f2e3d64a165f490d96368bb5dea8b8f9ad04487f9ab60dc4bb6003"}, + {file = "pillow-11.0.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c809a70e43c7977c4a42aefd62f0131823ebf7dd73556fa5d5950f5b354087e2"}, + {file = "pillow-11.0.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:4b60c9520f7207aaf2e1d94de026682fc227806c6e1f55bba7606d1c94dd623a"}, + {file = "pillow-11.0.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:1e2688958a840c822279fda0086fec1fdab2f95bf2b717b66871c4ad9859d7e8"}, + {file = "pillow-11.0.0-cp313-cp313t-win32.whl", hash = "sha256:607bbe123c74e272e381a8d1957083a9463401f7bd01287f50521ecb05a313f8"}, + {file = "pillow-11.0.0-cp313-cp313t-win_amd64.whl", hash = "sha256:5c39ed17edea3bc69c743a8dd3e9853b7509625c2462532e62baa0732163a904"}, + {file = "pillow-11.0.0-cp313-cp313t-win_arm64.whl", hash = "sha256:75acbbeb05b86bc53cbe7b7e6fe00fbcf82ad7c684b3ad82e3d711da9ba287d3"}, + {file = "pillow-11.0.0-cp39-cp39-macosx_10_10_x86_64.whl", hash = "sha256:2e46773dc9f35a1dd28bd6981332fd7f27bec001a918a72a79b4133cf5291dba"}, + {file = "pillow-11.0.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:2679d2258b7f1192b378e2893a8a0a0ca472234d4c2c0e6bdd3380e8dfa21b6a"}, + {file = "pillow-11.0.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:eda2616eb2313cbb3eebbe51f19362eb434b18e3bb599466a1ffa76a033fb916"}, + {file = "pillow-11.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:20ec184af98a121fb2da42642dea8a29ec80fc3efbaefb86d8fdd2606619045d"}, + {file = "pillow-11.0.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:8594f42df584e5b4bb9281799698403f7af489fba84c34d53d1c4bfb71b7c4e7"}, + {file = "pillow-11.0.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:c12b5ae868897c7338519c03049a806af85b9b8c237b7d675b8c5e089e4a618e"}, + {file = "pillow-11.0.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:70fbbdacd1d271b77b7721fe3cdd2d537bbbd75d29e6300c672ec6bb38d9672f"}, + {file = "pillow-11.0.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = 
"sha256:5178952973e588b3f1360868847334e9e3bf49d19e169bbbdfaf8398002419ae"}, + {file = "pillow-11.0.0-cp39-cp39-win32.whl", hash = "sha256:8c676b587da5673d3c75bd67dd2a8cdfeb282ca38a30f37950511766b26858c4"}, + {file = "pillow-11.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:94f3e1780abb45062287b4614a5bc0874519c86a777d4a7ad34978e86428b8dd"}, + {file = "pillow-11.0.0-cp39-cp39-win_arm64.whl", hash = "sha256:290f2cc809f9da7d6d622550bbf4c1e57518212da51b6a30fe8e0a270a5b78bd"}, + {file = "pillow-11.0.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:1187739620f2b365de756ce086fdb3604573337cc28a0d3ac4a01ab6b2d2a6d2"}, + {file = "pillow-11.0.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:fbbcb7b57dc9c794843e3d1258c0fbf0f48656d46ffe9e09b63bbd6e8cd5d0a2"}, + {file = "pillow-11.0.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5d203af30149ae339ad1b4f710d9844ed8796e97fda23ffbc4cc472968a47d0b"}, + {file = "pillow-11.0.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:21a0d3b115009ebb8ac3d2ebec5c2982cc693da935f4ab7bb5c8ebe2f47d36f2"}, + {file = "pillow-11.0.0-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:73853108f56df97baf2bb8b522f3578221e56f646ba345a372c78326710d3830"}, + {file = "pillow-11.0.0-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:e58876c91f97b0952eb766123bfef372792ab3f4e3e1f1a2267834c2ab131734"}, + {file = "pillow-11.0.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:224aaa38177597bb179f3ec87eeefcce8e4f85e608025e9cfac60de237ba6316"}, + {file = "pillow-11.0.0-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:5bd2d3bdb846d757055910f0a59792d33b555800813c3b39ada1829c372ccb06"}, + {file = "pillow-11.0.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:375b8dd15a1f5d2feafff536d47e22f69625c1aa92f12b339ec0b2ca40263273"}, + {file = "pillow-11.0.0-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:daffdf51ee5db69a82dd127eabecce20729e21f7a3680cf7cbb23f0829189790"}, + {file = "pillow-11.0.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:7326a1787e3c7b0429659e0a944725e1b03eeaa10edd945a86dead1913383944"}, + {file = "pillow-11.0.0.tar.gz", hash = "sha256:72bacbaf24ac003fea9bff9837d1eedb6088758d41e100c1552930151f677739"}, +] + +[package.extras] +docs = ["furo", "olefile", "sphinx (>=8.1)", "sphinx-copybutton", "sphinx-inline-tabs", "sphinxext-opengraph"] +fpx = ["olefile"] +mic = ["olefile"] +tests = ["check-manifest", "coverage", "defusedxml", "markdown2", "olefile", "packaging", "pyroma", "pytest", "pytest-cov", "pytest-timeout"] +typing = ["typing-extensions"] +xmp = ["defusedxml"] + +[[package]] +name = "pluggy" +version = "1.5.0" +description = "plugin and hook calling mechanisms for python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669"}, + {file = "pluggy-1.5.0.tar.gz", hash = "sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1"}, +] + +[package.extras] +dev = ["pre-commit", "tox"] +testing = ["pytest", "pytest-benchmark"] + +[[package]] +name = "prometheus-client" +version = "0.20.0" +description = "Python client for the Prometheus monitoring system." 
+optional = false +python-versions = ">=3.8" +files = [ + {file = "prometheus_client-0.20.0-py3-none-any.whl", hash = "sha256:cde524a85bce83ca359cc837f28b8c0db5cac7aa653a588fd7e84ba061c329e7"}, + {file = "prometheus_client-0.20.0.tar.gz", hash = "sha256:287629d00b147a32dcb2be0b9df905da599b2d82f80377083ec8463309a4bb89"}, +] + +[package.extras] +twisted = ["twisted"] + +[[package]] +name = "propcache" +version = "0.2.0" +description = "Accelerated property cache" +optional = false +python-versions = ">=3.8" +files = [ + {file = "propcache-0.2.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:c5869b8fd70b81835a6f187c5fdbe67917a04d7e52b6e7cc4e5fe39d55c39d58"}, + {file = "propcache-0.2.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:952e0d9d07609d9c5be361f33b0d6d650cd2bae393aabb11d9b719364521984b"}, + {file = "propcache-0.2.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:33ac8f098df0585c0b53009f039dfd913b38c1d2edafed0cedcc0c32a05aa110"}, + {file = "propcache-0.2.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:97e48e8875e6c13909c800fa344cd54cc4b2b0db1d5f911f840458a500fde2c2"}, + {file = "propcache-0.2.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:388f3217649d6d59292b722d940d4d2e1e6a7003259eb835724092a1cca0203a"}, + {file = "propcache-0.2.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f571aea50ba5623c308aa146eb650eebf7dbe0fd8c5d946e28343cb3b5aad577"}, + {file = "propcache-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3dfafb44f7bb35c0c06eda6b2ab4bfd58f02729e7c4045e179f9a861b07c9850"}, + {file = "propcache-0.2.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a3ebe9a75be7ab0b7da2464a77bb27febcb4fab46a34f9288f39d74833db7f61"}, + {file = "propcache-0.2.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:d2f0d0f976985f85dfb5f3d685697ef769faa6b71993b46b295cdbbd6be8cc37"}, + {file = "propcache-0.2.0-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:a3dc1a4b165283bd865e8f8cb5f0c64c05001e0718ed06250d8cac9bec115b48"}, + {file = "propcache-0.2.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:9e0f07b42d2a50c7dd2d8675d50f7343d998c64008f1da5fef888396b7f84630"}, + {file = "propcache-0.2.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:e63e3e1e0271f374ed489ff5ee73d4b6e7c60710e1f76af5f0e1a6117cd26394"}, + {file = "propcache-0.2.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:56bb5c98f058a41bb58eead194b4db8c05b088c93d94d5161728515bd52b052b"}, + {file = "propcache-0.2.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:7665f04d0c7f26ff8bb534e1c65068409bf4687aa2534faf7104d7182debb336"}, + {file = "propcache-0.2.0-cp310-cp310-win32.whl", hash = "sha256:7cf18abf9764746b9c8704774d8b06714bcb0a63641518a3a89c7f85cc02c2ad"}, + {file = "propcache-0.2.0-cp310-cp310-win_amd64.whl", hash = "sha256:cfac69017ef97db2438efb854edf24f5a29fd09a536ff3a992b75990720cdc99"}, + {file = "propcache-0.2.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:63f13bf09cc3336eb04a837490b8f332e0db41da66995c9fd1ba04552e516354"}, + {file = "propcache-0.2.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:608cce1da6f2672a56b24a015b42db4ac612ee709f3d29f27a00c943d9e851de"}, + {file = "propcache-0.2.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:466c219deee4536fbc83c08d09115249db301550625c7fef1c5563a584c9bc87"}, + {file = "propcache-0.2.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", 
hash = "sha256:fc2db02409338bf36590aa985a461b2c96fce91f8e7e0f14c50c5fcc4f229016"}, + {file = "propcache-0.2.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a6ed8db0a556343d566a5c124ee483ae113acc9a557a807d439bcecc44e7dfbb"}, + {file = "propcache-0.2.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:91997d9cb4a325b60d4e3f20967f8eb08dfcb32b22554d5ef78e6fd1dda743a2"}, + {file = "propcache-0.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4c7dde9e533c0a49d802b4f3f218fa9ad0a1ce21f2c2eb80d5216565202acab4"}, + {file = "propcache-0.2.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ffcad6c564fe6b9b8916c1aefbb37a362deebf9394bd2974e9d84232e3e08504"}, + {file = "propcache-0.2.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:97a58a28bcf63284e8b4d7b460cbee1edaab24634e82059c7b8c09e65284f178"}, + {file = "propcache-0.2.0-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:945db8ee295d3af9dbdbb698cce9bbc5c59b5c3fe328bbc4387f59a8a35f998d"}, + {file = "propcache-0.2.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:39e104da444a34830751715f45ef9fc537475ba21b7f1f5b0f4d71a3b60d7fe2"}, + {file = "propcache-0.2.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:c5ecca8f9bab618340c8e848d340baf68bcd8ad90a8ecd7a4524a81c1764b3db"}, + {file = "propcache-0.2.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:c436130cc779806bdf5d5fae0d848713105472b8566b75ff70048c47d3961c5b"}, + {file = "propcache-0.2.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:191db28dc6dcd29d1a3e063c3be0b40688ed76434622c53a284e5427565bbd9b"}, + {file = "propcache-0.2.0-cp311-cp311-win32.whl", hash = "sha256:5f2564ec89058ee7c7989a7b719115bdfe2a2fb8e7a4543b8d1c0cc4cf6478c1"}, + {file = "propcache-0.2.0-cp311-cp311-win_amd64.whl", hash = "sha256:6e2e54267980349b723cff366d1e29b138b9a60fa376664a157a342689553f71"}, + {file = "propcache-0.2.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:2ee7606193fb267be4b2e3b32714f2d58cad27217638db98a60f9efb5efeccc2"}, + {file = "propcache-0.2.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:91ee8fc02ca52e24bcb77b234f22afc03288e1dafbb1f88fe24db308910c4ac7"}, + {file = "propcache-0.2.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2e900bad2a8456d00a113cad8c13343f3b1f327534e3589acc2219729237a2e8"}, + {file = "propcache-0.2.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f52a68c21363c45297aca15561812d542f8fc683c85201df0bebe209e349f793"}, + {file = "propcache-0.2.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1e41d67757ff4fbc8ef2af99b338bfb955010444b92929e9e55a6d4dcc3c4f09"}, + {file = "propcache-0.2.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a64e32f8bd94c105cc27f42d3b658902b5bcc947ece3c8fe7bc1b05982f60e89"}, + {file = "propcache-0.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:55346705687dbd7ef0d77883ab4f6fabc48232f587925bdaf95219bae072491e"}, + {file = "propcache-0.2.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:00181262b17e517df2cd85656fcd6b4e70946fe62cd625b9d74ac9977b64d8d9"}, + {file = "propcache-0.2.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6994984550eaf25dd7fc7bd1b700ff45c894149341725bb4edc67f0ffa94efa4"}, + {file = "propcache-0.2.0-cp312-cp312-musllinux_1_2_armv7l.whl", hash = 
"sha256:56295eb1e5f3aecd516d91b00cfd8bf3a13991de5a479df9e27dd569ea23959c"}, + {file = "propcache-0.2.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:439e76255daa0f8151d3cb325f6dd4a3e93043e6403e6491813bcaaaa8733887"}, + {file = "propcache-0.2.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:f6475a1b2ecb310c98c28d271a30df74f9dd436ee46d09236a6b750a7599ce57"}, + {file = "propcache-0.2.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:3444cdba6628accf384e349014084b1cacd866fbb88433cd9d279d90a54e0b23"}, + {file = "propcache-0.2.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:4a9d9b4d0a9b38d1c391bb4ad24aa65f306c6f01b512e10a8a34a2dc5675d348"}, + {file = "propcache-0.2.0-cp312-cp312-win32.whl", hash = "sha256:69d3a98eebae99a420d4b28756c8ce6ea5a29291baf2dc9ff9414b42676f61d5"}, + {file = "propcache-0.2.0-cp312-cp312-win_amd64.whl", hash = "sha256:ad9c9b99b05f163109466638bd30ada1722abb01bbb85c739c50b6dc11f92dc3"}, + {file = "propcache-0.2.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:ecddc221a077a8132cf7c747d5352a15ed763b674c0448d811f408bf803d9ad7"}, + {file = "propcache-0.2.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:0e53cb83fdd61cbd67202735e6a6687a7b491c8742dfc39c9e01e80354956763"}, + {file = "propcache-0.2.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:92fe151145a990c22cbccf9ae15cae8ae9eddabfc949a219c9f667877e40853d"}, + {file = "propcache-0.2.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d6a21ef516d36909931a2967621eecb256018aeb11fc48656e3257e73e2e247a"}, + {file = "propcache-0.2.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3f88a4095e913f98988f5b338c1d4d5d07dbb0b6bad19892fd447484e483ba6b"}, + {file = "propcache-0.2.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5a5b3bb545ead161be780ee85a2b54fdf7092815995661947812dde94a40f6fb"}, + {file = "propcache-0.2.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:67aeb72e0f482709991aa91345a831d0b707d16b0257e8ef88a2ad246a7280bf"}, + {file = "propcache-0.2.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3c997f8c44ec9b9b0bcbf2d422cc00a1d9b9c681f56efa6ca149a941e5560da2"}, + {file = "propcache-0.2.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:2a66df3d4992bc1d725b9aa803e8c5a66c010c65c741ad901e260ece77f58d2f"}, + {file = "propcache-0.2.0-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:3ebbcf2a07621f29638799828b8d8668c421bfb94c6cb04269130d8de4fb7136"}, + {file = "propcache-0.2.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:1235c01ddaa80da8235741e80815ce381c5267f96cc49b1477fdcf8c047ef325"}, + {file = "propcache-0.2.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:3947483a381259c06921612550867b37d22e1df6d6d7e8361264b6d037595f44"}, + {file = "propcache-0.2.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:d5bed7f9805cc29c780f3aee05de3262ee7ce1f47083cfe9f77471e9d6777e83"}, + {file = "propcache-0.2.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:e4a91d44379f45f5e540971d41e4626dacd7f01004826a18cb048e7da7e96544"}, + {file = "propcache-0.2.0-cp313-cp313-win32.whl", hash = "sha256:f902804113e032e2cdf8c71015651c97af6418363bea8d78dc0911d56c335032"}, + {file = "propcache-0.2.0-cp313-cp313-win_amd64.whl", hash = "sha256:8f188cfcc64fb1266f4684206c9de0e80f54622c3f22a910cbd200478aeae61e"}, + {file = "propcache-0.2.0-cp38-cp38-macosx_10_9_universal2.whl", hash = 
"sha256:53d1bd3f979ed529f0805dd35ddaca330f80a9a6d90bc0121d2ff398f8ed8861"}, + {file = "propcache-0.2.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:83928404adf8fb3d26793665633ea79b7361efa0287dfbd372a7e74311d51ee6"}, + {file = "propcache-0.2.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:77a86c261679ea5f3896ec060be9dc8e365788248cc1e049632a1be682442063"}, + {file = "propcache-0.2.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:218db2a3c297a3768c11a34812e63b3ac1c3234c3a086def9c0fee50d35add1f"}, + {file = "propcache-0.2.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7735e82e3498c27bcb2d17cb65d62c14f1100b71723b68362872bca7d0913d90"}, + {file = "propcache-0.2.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:20a617c776f520c3875cf4511e0d1db847a076d720714ae35ffe0df3e440be68"}, + {file = "propcache-0.2.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:67b69535c870670c9f9b14a75d28baa32221d06f6b6fa6f77a0a13c5a7b0a5b9"}, + {file = "propcache-0.2.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4569158070180c3855e9c0791c56be3ceeb192defa2cdf6a3f39e54319e56b89"}, + {file = "propcache-0.2.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:db47514ffdbd91ccdc7e6f8407aac4ee94cc871b15b577c1c324236b013ddd04"}, + {file = "propcache-0.2.0-cp38-cp38-musllinux_1_2_armv7l.whl", hash = "sha256:2a60ad3e2553a74168d275a0ef35e8c0a965448ffbc3b300ab3a5bb9956c2162"}, + {file = "propcache-0.2.0-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:662dd62358bdeaca0aee5761de8727cfd6861432e3bb828dc2a693aa0471a563"}, + {file = "propcache-0.2.0-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:25a1f88b471b3bc911d18b935ecb7115dff3a192b6fef46f0bfaf71ff4f12418"}, + {file = "propcache-0.2.0-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:f60f0ac7005b9f5a6091009b09a419ace1610e163fa5deaba5ce3484341840e7"}, + {file = "propcache-0.2.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:74acd6e291f885678631b7ebc85d2d4aec458dd849b8c841b57ef04047833bed"}, + {file = "propcache-0.2.0-cp38-cp38-win32.whl", hash = "sha256:d9b6ddac6408194e934002a69bcaadbc88c10b5f38fb9307779d1c629181815d"}, + {file = "propcache-0.2.0-cp38-cp38-win_amd64.whl", hash = "sha256:676135dcf3262c9c5081cc8f19ad55c8a64e3f7282a21266d05544450bffc3a5"}, + {file = "propcache-0.2.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:25c8d773a62ce0451b020c7b29a35cfbc05de8b291163a7a0f3b7904f27253e6"}, + {file = "propcache-0.2.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:375a12d7556d462dc64d70475a9ee5982465fbb3d2b364f16b86ba9135793638"}, + {file = "propcache-0.2.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:1ec43d76b9677637a89d6ab86e1fef70d739217fefa208c65352ecf0282be957"}, + {file = "propcache-0.2.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f45eec587dafd4b2d41ac189c2156461ebd0c1082d2fe7013571598abb8505d1"}, + {file = "propcache-0.2.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bc092ba439d91df90aea38168e11f75c655880c12782facf5cf9c00f3d42b562"}, + {file = "propcache-0.2.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fa1076244f54bb76e65e22cb6910365779d5c3d71d1f18b275f1dfc7b0d71b4d"}, + {file = "propcache-0.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:682a7c79a2fbf40f5dbb1eb6bfe2cd865376deeac65acf9beb607505dced9e12"}, + {file = 
"propcache-0.2.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8e40876731f99b6f3c897b66b803c9e1c07a989b366c6b5b475fafd1f7ba3fb8"}, + {file = "propcache-0.2.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:363ea8cd3c5cb6679f1c2f5f1f9669587361c062e4899fce56758efa928728f8"}, + {file = "propcache-0.2.0-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:140fbf08ab3588b3468932974a9331aff43c0ab8a2ec2c608b6d7d1756dbb6cb"}, + {file = "propcache-0.2.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:e70fac33e8b4ac63dfc4c956fd7d85a0b1139adcfc0d964ce288b7c527537fea"}, + {file = "propcache-0.2.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:b33d7a286c0dc1a15f5fc864cc48ae92a846df287ceac2dd499926c3801054a6"}, + {file = "propcache-0.2.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:f6d5749fdd33d90e34c2efb174c7e236829147a2713334d708746e94c4bde40d"}, + {file = "propcache-0.2.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:22aa8f2272d81d9317ff5756bb108021a056805ce63dd3630e27d042c8092798"}, + {file = "propcache-0.2.0-cp39-cp39-win32.whl", hash = "sha256:73e4b40ea0eda421b115248d7e79b59214411109a5bc47d0d48e4c73e3b8fcf9"}, + {file = "propcache-0.2.0-cp39-cp39-win_amd64.whl", hash = "sha256:9517d5e9e0731957468c29dbfd0f976736a0e55afaea843726e887f36fe017df"}, + {file = "propcache-0.2.0-py3-none-any.whl", hash = "sha256:2ccc28197af5313706511fab3a8b66dcd6da067a1331372c82ea1cb74285e036"}, + {file = "propcache-0.2.0.tar.gz", hash = "sha256:df81779732feb9d01e5d513fad0122efb3d53bbc75f61b2a4f29a020bc985e70"}, +] + +[[package]] +name = "protobuf" +version = "4.25.5" +description = "" +optional = false +python-versions = ">=3.8" +files = [ + {file = "protobuf-4.25.5-cp310-abi3-win32.whl", hash = "sha256:5e61fd921603f58d2f5acb2806a929b4675f8874ff5f330b7d6f7e2e784bbcd8"}, + {file = "protobuf-4.25.5-cp310-abi3-win_amd64.whl", hash = "sha256:4be0571adcbe712b282a330c6e89eae24281344429ae95c6d85e79e84780f5ea"}, + {file = "protobuf-4.25.5-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:b2fde3d805354df675ea4c7c6338c1aecd254dfc9925e88c6d31a2bcb97eb173"}, + {file = "protobuf-4.25.5-cp37-abi3-manylinux2014_aarch64.whl", hash = "sha256:919ad92d9b0310070f8356c24b855c98df2b8bd207ebc1c0c6fcc9ab1e007f3d"}, + {file = "protobuf-4.25.5-cp37-abi3-manylinux2014_x86_64.whl", hash = "sha256:fe14e16c22be926d3abfcb500e60cab068baf10b542b8c858fa27e098123e331"}, + {file = "protobuf-4.25.5-cp38-cp38-win32.whl", hash = "sha256:98d8d8aa50de6a2747efd9cceba361c9034050ecce3e09136f90de37ddba66e1"}, + {file = "protobuf-4.25.5-cp38-cp38-win_amd64.whl", hash = "sha256:b0234dd5a03049e4ddd94b93400b67803c823cfc405689688f59b34e0742381a"}, + {file = "protobuf-4.25.5-cp39-cp39-win32.whl", hash = "sha256:abe32aad8561aa7cc94fc7ba4fdef646e576983edb94a73381b03c53728a626f"}, + {file = "protobuf-4.25.5-cp39-cp39-win_amd64.whl", hash = "sha256:7a183f592dc80aa7c8da7ad9e55091c4ffc9497b3054452d629bb85fa27c2a45"}, + {file = "protobuf-4.25.5-py3-none-any.whl", hash = "sha256:0aebecb809cae990f8129ada5ca273d9d670b76d9bfc9b1809f0a9c02b7dbf41"}, + {file = "protobuf-4.25.5.tar.gz", hash = "sha256:7f8249476b4a9473645db7f8ab42b02fe1488cbe5fb72fddd445e0665afd8584"}, +] + +[[package]] +name = "psutil" +version = "6.1.0" +description = "Cross-platform lib for process and system monitoring in Python." 
+optional = true +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7" +files = [ + {file = "psutil-6.1.0-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:ff34df86226c0227c52f38b919213157588a678d049688eded74c76c8ba4a5d0"}, + {file = "psutil-6.1.0-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:c0e0c00aa18ca2d3b2b991643b799a15fc8f0563d2ebb6040f64ce8dc027b942"}, + {file = "psutil-6.1.0-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:000d1d1ebd634b4efb383f4034437384e44a6d455260aaee2eca1e9c1b55f047"}, + {file = "psutil-6.1.0-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:5cd2bcdc75b452ba2e10f0e8ecc0b57b827dd5d7aaffbc6821b2a9a242823a76"}, + {file = "psutil-6.1.0-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:045f00a43c737f960d273a83973b2511430d61f283a44c96bf13a6e829ba8fdc"}, + {file = "psutil-6.1.0-cp27-none-win32.whl", hash = "sha256:9118f27452b70bb1d9ab3198c1f626c2499384935aaf55388211ad982611407e"}, + {file = "psutil-6.1.0-cp27-none-win_amd64.whl", hash = "sha256:a8506f6119cff7015678e2bce904a4da21025cc70ad283a53b099e7620061d85"}, + {file = "psutil-6.1.0-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:6e2dcd475ce8b80522e51d923d10c7871e45f20918e027ab682f94f1c6351688"}, + {file = "psutil-6.1.0-cp36-abi3-macosx_11_0_arm64.whl", hash = "sha256:0895b8414afafc526712c498bd9de2b063deaac4021a3b3c34566283464aff8e"}, + {file = "psutil-6.1.0-cp36-abi3-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9dcbfce5d89f1d1f2546a2090f4fcf87c7f669d1d90aacb7d7582addece9fb38"}, + {file = "psutil-6.1.0-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:498c6979f9c6637ebc3a73b3f87f9eb1ec24e1ce53a7c5173b8508981614a90b"}, + {file = "psutil-6.1.0-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d905186d647b16755a800e7263d43df08b790d709d575105d419f8b6ef65423a"}, + {file = "psutil-6.1.0-cp36-cp36m-win32.whl", hash = "sha256:6d3fbbc8d23fcdcb500d2c9f94e07b1342df8ed71b948a2649b5cb060a7c94ca"}, + {file = "psutil-6.1.0-cp36-cp36m-win_amd64.whl", hash = "sha256:1209036fbd0421afde505a4879dee3b2fd7b1e14fee81c0069807adcbbcca747"}, + {file = "psutil-6.1.0-cp37-abi3-win32.whl", hash = "sha256:1ad45a1f5d0b608253b11508f80940985d1d0c8f6111b5cb637533a0e6ddc13e"}, + {file = "psutil-6.1.0-cp37-abi3-win_amd64.whl", hash = "sha256:a8fb3752b491d246034fa4d279ff076501588ce8cbcdbb62c32fd7a377d996be"}, + {file = "psutil-6.1.0.tar.gz", hash = "sha256:353815f59a7f64cdaca1c0307ee13558a0512f6db064e92fe833784f08539c7a"}, +] + +[package.extras] +dev = ["black", "check-manifest", "coverage", "packaging", "pylint", "pyperf", "pypinfo", "pytest-cov", "requests", "rstcheck", "ruff", "sphinx", "sphinx_rtd_theme", "toml-sort", "twine", "virtualenv", "wheel"] +test = ["pytest", "pytest-xdist", "setuptools"] + +[[package]] +name = "py-cpuinfo" +version = "9.0.0" +description = "Get CPU info with pure Python" +optional = false +python-versions = "*" +files = [ + {file = "py-cpuinfo-9.0.0.tar.gz", hash = "sha256:3cdbbf3fac90dc6f118bfd64384f309edeadd902d7c8fb17f02ffa1fc3f49690"}, + {file = "py_cpuinfo-9.0.0-py3-none-any.whl", hash = "sha256:859625bc251f64e21f077d099d4162689c762b5d6a4c3c97553d56241c9674d5"}, +] + +[[package]] +name = "pyarrow" +version = "17.0.0" +description = "Python library for Apache Arrow" +optional = true +python-versions = ">=3.8" +files = [ + {file = "pyarrow-17.0.0-cp310-cp310-macosx_10_15_x86_64.whl", hash = 
"sha256:a5c8b238d47e48812ee577ee20c9a2779e6a5904f1708ae240f53ecbee7c9f07"}, + {file = "pyarrow-17.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:db023dc4c6cae1015de9e198d41250688383c3f9af8f565370ab2b4cb5f62655"}, + {file = "pyarrow-17.0.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:da1e060b3876faa11cee287839f9cc7cdc00649f475714b8680a05fd9071d545"}, + {file = "pyarrow-17.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:75c06d4624c0ad6674364bb46ef38c3132768139ddec1c56582dbac54f2663e2"}, + {file = "pyarrow-17.0.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:fa3c246cc58cb5a4a5cb407a18f193354ea47dd0648194e6265bd24177982fe8"}, + {file = "pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:f7ae2de664e0b158d1607699a16a488de3d008ba99b3a7aa5de1cbc13574d047"}, + {file = "pyarrow-17.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:5984f416552eea15fd9cee03da53542bf4cddaef5afecefb9aa8d1010c335087"}, + {file = "pyarrow-17.0.0-cp311-cp311-macosx_10_15_x86_64.whl", hash = "sha256:1c8856e2ef09eb87ecf937104aacfa0708f22dfeb039c363ec99735190ffb977"}, + {file = "pyarrow-17.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2e19f569567efcbbd42084e87f948778eb371d308e137a0f97afe19bb860ccb3"}, + {file = "pyarrow-17.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6b244dc8e08a23b3e352899a006a26ae7b4d0da7bb636872fa8f5884e70acf15"}, + {file = "pyarrow-17.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b72e87fe3e1db343995562f7fff8aee354b55ee83d13afba65400c178ab2597"}, + {file = "pyarrow-17.0.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:dc5c31c37409dfbc5d014047817cb4ccd8c1ea25d19576acf1a001fe07f5b420"}, + {file = "pyarrow-17.0.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:e3343cb1e88bc2ea605986d4b94948716edc7a8d14afd4e2c097232f729758b4"}, + {file = "pyarrow-17.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:a27532c38f3de9eb3e90ecab63dfda948a8ca859a66e3a47f5f42d1e403c4d03"}, + {file = "pyarrow-17.0.0-cp312-cp312-macosx_10_15_x86_64.whl", hash = "sha256:9b8a823cea605221e61f34859dcc03207e52e409ccf6354634143e23af7c8d22"}, + {file = "pyarrow-17.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f1e70de6cb5790a50b01d2b686d54aaf73da01266850b05e3af2a1bc89e16053"}, + {file = "pyarrow-17.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0071ce35788c6f9077ff9ecba4858108eebe2ea5a3f7cf2cf55ebc1dbc6ee24a"}, + {file = "pyarrow-17.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:757074882f844411fcca735e39aae74248a1531367a7c80799b4266390ae51cc"}, + {file = "pyarrow-17.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:9ba11c4f16976e89146781a83833df7f82077cdab7dc6232c897789343f7891a"}, + {file = "pyarrow-17.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:b0c6ac301093b42d34410b187bba560b17c0330f64907bfa4f7f7f2444b0cf9b"}, + {file = "pyarrow-17.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:392bc9feabc647338e6c89267635e111d71edad5fcffba204425a7c8d13610d7"}, + {file = "pyarrow-17.0.0-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:af5ff82a04b2171415f1410cff7ebb79861afc5dae50be73ce06d6e870615204"}, + {file = "pyarrow-17.0.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:edca18eaca89cd6382dfbcff3dd2d87633433043650c07375d095cd3517561d8"}, + {file = "pyarrow-17.0.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:7c7916bff914ac5d4a8fe25b7a25e432ff921e72f6f2b7547d1e325c1ad9d155"}, + {file = "pyarrow-17.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f553ca691b9e94b202ff741bdd40f6ccb70cdd5fbf65c187af132f1317de6145"}, + {file = "pyarrow-17.0.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:0cdb0e627c86c373205a2f94a510ac4376fdc523f8bb36beab2e7f204416163c"}, + {file = "pyarrow-17.0.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:d7d192305d9d8bc9082d10f361fc70a73590a4c65cf31c3e6926cd72b76bc35c"}, + {file = "pyarrow-17.0.0-cp38-cp38-win_amd64.whl", hash = "sha256:02dae06ce212d8b3244dd3e7d12d9c4d3046945a5933d28026598e9dbbda1fca"}, + {file = "pyarrow-17.0.0-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:13d7a460b412f31e4c0efa1148e1d29bdf18ad1411eb6757d38f8fbdcc8645fb"}, + {file = "pyarrow-17.0.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9b564a51fbccfab5a04a80453e5ac6c9954a9c5ef2890d1bcf63741909c3f8df"}, + {file = "pyarrow-17.0.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:32503827abbc5aadedfa235f5ece8c4f8f8b0a3cf01066bc8d29de7539532687"}, + {file = "pyarrow-17.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a155acc7f154b9ffcc85497509bcd0d43efb80d6f733b0dc3bb14e281f131c8b"}, + {file = "pyarrow-17.0.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:dec8d129254d0188a49f8a1fc99e0560dc1b85f60af729f47de4046015f9b0a5"}, + {file = "pyarrow-17.0.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:a48ddf5c3c6a6c505904545c25a4ae13646ae1f8ba703c4df4a1bfe4f4006bda"}, + {file = "pyarrow-17.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:42bf93249a083aca230ba7e2786c5f673507fa97bbd9725a1e2754715151a204"}, + {file = "pyarrow-17.0.0.tar.gz", hash = "sha256:4beca9521ed2c0921c1023e68d097d0299b62c362639ea315572a58f3f50fd28"}, +] + +[package.dependencies] +numpy = ">=1.16.6" + +[package.extras] +test = ["cffi", "hypothesis", "pandas", "pytest", "pytz"] + +[[package]] +name = "pydantic" +version = "2.9.2" +description = "Data validation using Python type hints" +optional = true +python-versions = ">=3.8" +files = [ + {file = "pydantic-2.9.2-py3-none-any.whl", hash = "sha256:f048cec7b26778210e28a0459867920654d48e5e62db0958433636cde4254f12"}, + {file = "pydantic-2.9.2.tar.gz", hash = "sha256:d155cef71265d1e9807ed1c32b4c8deec042a44a50a4188b25ac67ecd81a9c0f"}, +] + +[package.dependencies] +annotated-types = ">=0.6.0" +pydantic-core = "2.23.4" +typing-extensions = {version = ">=4.6.1", markers = "python_version < \"3.13\""} + +[package.extras] +email = ["email-validator (>=2.0.0)"] +timezone = ["tzdata"] + +[[package]] +name = "pydantic-core" +version = "2.23.4" +description = "Core functionality for Pydantic validation and serialization" +optional = true +python-versions = ">=3.8" +files = [ + {file = "pydantic_core-2.23.4-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:b10bd51f823d891193d4717448fab065733958bdb6a6b351967bd349d48d5c9b"}, + {file = "pydantic_core-2.23.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:4fc714bdbfb534f94034efaa6eadd74e5b93c8fa6315565a222f7b6f42ca1166"}, + {file = "pydantic_core-2.23.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:63e46b3169866bd62849936de036f901a9356e36376079b05efa83caeaa02ceb"}, + {file = "pydantic_core-2.23.4-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ed1a53de42fbe34853ba90513cea21673481cd81ed1be739f7f2efb931b24916"}, + {file = 
"pydantic_core-2.23.4-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:cfdd16ab5e59fc31b5e906d1a3f666571abc367598e3e02c83403acabc092e07"}, + {file = "pydantic_core-2.23.4-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:255a8ef062cbf6674450e668482456abac99a5583bbafb73f9ad469540a3a232"}, + {file = "pydantic_core-2.23.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4a7cd62e831afe623fbb7aabbb4fe583212115b3ef38a9f6b71869ba644624a2"}, + {file = "pydantic_core-2.23.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f09e2ff1f17c2b51f2bc76d1cc33da96298f0a036a137f5440ab3ec5360b624f"}, + {file = "pydantic_core-2.23.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:e38e63e6f3d1cec5a27e0afe90a085af8b6806ee208b33030e65b6516353f1a3"}, + {file = "pydantic_core-2.23.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:0dbd8dbed2085ed23b5c04afa29d8fd2771674223135dc9bc937f3c09284d071"}, + {file = "pydantic_core-2.23.4-cp310-none-win32.whl", hash = "sha256:6531b7ca5f951d663c339002e91aaebda765ec7d61b7d1e3991051906ddde119"}, + {file = "pydantic_core-2.23.4-cp310-none-win_amd64.whl", hash = "sha256:7c9129eb40958b3d4500fa2467e6a83356b3b61bfff1b414c7361d9220f9ae8f"}, + {file = "pydantic_core-2.23.4-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:77733e3892bb0a7fa797826361ce8a9184d25c8dffaec60b7ffe928153680ba8"}, + {file = "pydantic_core-2.23.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1b84d168f6c48fabd1f2027a3d1bdfe62f92cade1fb273a5d68e621da0e44e6d"}, + {file = "pydantic_core-2.23.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:df49e7a0861a8c36d089c1ed57d308623d60416dab2647a4a17fe050ba85de0e"}, + {file = "pydantic_core-2.23.4-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ff02b6d461a6de369f07ec15e465a88895f3223eb75073ffea56b84d9331f607"}, + {file = "pydantic_core-2.23.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:996a38a83508c54c78a5f41456b0103c30508fed9abcad0a59b876d7398f25fd"}, + {file = "pydantic_core-2.23.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d97683ddee4723ae8c95d1eddac7c192e8c552da0c73a925a89fa8649bf13eea"}, + {file = "pydantic_core-2.23.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:216f9b2d7713eb98cb83c80b9c794de1f6b7e3145eef40400c62e86cee5f4e1e"}, + {file = "pydantic_core-2.23.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6f783e0ec4803c787bcea93e13e9932edab72068f68ecffdf86a99fd5918878b"}, + {file = "pydantic_core-2.23.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:d0776dea117cf5272382634bd2a5c1b6eb16767c223c6a5317cd3e2a757c61a0"}, + {file = "pydantic_core-2.23.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:d5f7a395a8cf1621939692dba2a6b6a830efa6b3cee787d82c7de1ad2930de64"}, + {file = "pydantic_core-2.23.4-cp311-none-win32.whl", hash = "sha256:74b9127ffea03643e998e0c5ad9bd3811d3dac8c676e47db17b0ee7c3c3bf35f"}, + {file = "pydantic_core-2.23.4-cp311-none-win_amd64.whl", hash = "sha256:98d134c954828488b153d88ba1f34e14259284f256180ce659e8d83e9c05eaa3"}, + {file = "pydantic_core-2.23.4-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:f3e0da4ebaef65158d4dfd7d3678aad692f7666877df0002b8a522cdf088f231"}, + {file = "pydantic_core-2.23.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f69a8e0b033b747bb3e36a44e7732f0c99f7edd5cea723d45bc0d6e95377ffee"}, + {file = 
"pydantic_core-2.23.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:723314c1d51722ab28bfcd5240d858512ffd3116449c557a1336cbe3919beb87"}, + {file = "pydantic_core-2.23.4-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:bb2802e667b7051a1bebbfe93684841cc9351004e2badbd6411bf357ab8d5ac8"}, + {file = "pydantic_core-2.23.4-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d18ca8148bebe1b0a382a27a8ee60350091a6ddaf475fa05ef50dc35b5df6327"}, + {file = "pydantic_core-2.23.4-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:33e3d65a85a2a4a0dc3b092b938a4062b1a05f3a9abde65ea93b233bca0e03f2"}, + {file = "pydantic_core-2.23.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:128585782e5bfa515c590ccee4b727fb76925dd04a98864182b22e89a4e6ed36"}, + {file = "pydantic_core-2.23.4-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:68665f4c17edcceecc112dfed5dbe6f92261fb9d6054b47d01bf6371a6196126"}, + {file = "pydantic_core-2.23.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:20152074317d9bed6b7a95ade3b7d6054845d70584216160860425f4fbd5ee9e"}, + {file = "pydantic_core-2.23.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:9261d3ce84fa1d38ed649c3638feefeae23d32ba9182963e465d58d62203bd24"}, + {file = "pydantic_core-2.23.4-cp312-none-win32.whl", hash = "sha256:4ba762ed58e8d68657fc1281e9bb72e1c3e79cc5d464be146e260c541ec12d84"}, + {file = "pydantic_core-2.23.4-cp312-none-win_amd64.whl", hash = "sha256:97df63000f4fea395b2824da80e169731088656d1818a11b95f3b173747b6cd9"}, + {file = "pydantic_core-2.23.4-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:7530e201d10d7d14abce4fb54cfe5b94a0aefc87da539d0346a484ead376c3cc"}, + {file = "pydantic_core-2.23.4-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:df933278128ea1cd77772673c73954e53a1c95a4fdf41eef97c2b779271bd0bd"}, + {file = "pydantic_core-2.23.4-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0cb3da3fd1b6a5d0279a01877713dbda118a2a4fc6f0d821a57da2e464793f05"}, + {file = "pydantic_core-2.23.4-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:42c6dcb030aefb668a2b7009c85b27f90e51e6a3b4d5c9bc4c57631292015b0d"}, + {file = "pydantic_core-2.23.4-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:696dd8d674d6ce621ab9d45b205df149399e4bb9aa34102c970b721554828510"}, + {file = "pydantic_core-2.23.4-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2971bb5ffe72cc0f555c13e19b23c85b654dd2a8f7ab493c262071377bfce9f6"}, + {file = "pydantic_core-2.23.4-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8394d940e5d400d04cad4f75c0598665cbb81aecefaca82ca85bd28264af7f9b"}, + {file = "pydantic_core-2.23.4-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:0dff76e0602ca7d4cdaacc1ac4c005e0ce0dcfe095d5b5259163a80d3a10d327"}, + {file = "pydantic_core-2.23.4-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:7d32706badfe136888bdea71c0def994644e09fff0bfe47441deaed8e96fdbc6"}, + {file = "pydantic_core-2.23.4-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:ed541d70698978a20eb63d8c5d72f2cc6d7079d9d90f6b50bad07826f1320f5f"}, + {file = "pydantic_core-2.23.4-cp313-none-win32.whl", hash = "sha256:3d5639516376dce1940ea36edf408c554475369f5da2abd45d44621cb616f769"}, + {file = "pydantic_core-2.23.4-cp313-none-win_amd64.whl", hash = 
"sha256:5a1504ad17ba4210df3a045132a7baeeba5a200e930f57512ee02909fc5c4cb5"}, + {file = "pydantic_core-2.23.4-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:d4488a93b071c04dc20f5cecc3631fc78b9789dd72483ba15d423b5b3689b555"}, + {file = "pydantic_core-2.23.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:81965a16b675b35e1d09dd14df53f190f9129c0202356ed44ab2728b1c905658"}, + {file = "pydantic_core-2.23.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4ffa2ebd4c8530079140dd2d7f794a9d9a73cbb8e9d59ffe24c63436efa8f271"}, + {file = "pydantic_core-2.23.4-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:61817945f2fe7d166e75fbfb28004034b48e44878177fc54d81688e7b85a3665"}, + {file = "pydantic_core-2.23.4-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:29d2c342c4bc01b88402d60189f3df065fb0dda3654744d5a165a5288a657368"}, + {file = "pydantic_core-2.23.4-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5e11661ce0fd30a6790e8bcdf263b9ec5988e95e63cf901972107efc49218b13"}, + {file = "pydantic_core-2.23.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9d18368b137c6295db49ce7218b1a9ba15c5bc254c96d7c9f9e924a9bc7825ad"}, + {file = "pydantic_core-2.23.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:ec4e55f79b1c4ffb2eecd8a0cfba9955a2588497d96851f4c8f99aa4a1d39b12"}, + {file = "pydantic_core-2.23.4-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:374a5e5049eda9e0a44c696c7ade3ff355f06b1fe0bb945ea3cac2bc336478a2"}, + {file = "pydantic_core-2.23.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:5c364564d17da23db1106787675fc7af45f2f7b58b4173bfdd105564e132e6fb"}, + {file = "pydantic_core-2.23.4-cp38-none-win32.whl", hash = "sha256:d7a80d21d613eec45e3d41eb22f8f94ddc758a6c4720842dc74c0581f54993d6"}, + {file = "pydantic_core-2.23.4-cp38-none-win_amd64.whl", hash = "sha256:5f5ff8d839f4566a474a969508fe1c5e59c31c80d9e140566f9a37bba7b8d556"}, + {file = "pydantic_core-2.23.4-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:a4fa4fc04dff799089689f4fd502ce7d59de529fc2f40a2c8836886c03e0175a"}, + {file = "pydantic_core-2.23.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:0a7df63886be5e270da67e0966cf4afbae86069501d35c8c1b3b6c168f42cb36"}, + {file = "pydantic_core-2.23.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dcedcd19a557e182628afa1d553c3895a9f825b936415d0dbd3cd0bbcfd29b4b"}, + {file = "pydantic_core-2.23.4-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5f54b118ce5de9ac21c363d9b3caa6c800341e8c47a508787e5868c6b79c9323"}, + {file = "pydantic_core-2.23.4-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:86d2f57d3e1379a9525c5ab067b27dbb8a0642fb5d454e17a9ac434f9ce523e3"}, + {file = "pydantic_core-2.23.4-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:de6d1d1b9e5101508cb37ab0d972357cac5235f5c6533d1071964c47139257df"}, + {file = "pydantic_core-2.23.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1278e0d324f6908e872730c9102b0112477a7f7cf88b308e4fc36ce1bdb6d58c"}, + {file = "pydantic_core-2.23.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:9a6b5099eeec78827553827f4c6b8615978bb4b6a88e5d9b93eddf8bb6790f55"}, + {file = "pydantic_core-2.23.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:e55541f756f9b3ee346b840103f32779c695a19826a4c442b7954550a0972040"}, + {file = 
"pydantic_core-2.23.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:a5c7ba8ffb6d6f8f2ab08743be203654bb1aaa8c9dcb09f82ddd34eadb695605"}, + {file = "pydantic_core-2.23.4-cp39-none-win32.whl", hash = "sha256:37b0fe330e4a58d3c58b24d91d1eb102aeec675a3db4c292ec3928ecd892a9a6"}, + {file = "pydantic_core-2.23.4-cp39-none-win_amd64.whl", hash = "sha256:1498bec4c05c9c787bde9125cfdcc63a41004ff167f495063191b863399b1a29"}, + {file = "pydantic_core-2.23.4-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:f455ee30a9d61d3e1a15abd5068827773d6e4dc513e795f380cdd59932c782d5"}, + {file = "pydantic_core-2.23.4-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:1e90d2e3bd2c3863d48525d297cd143fe541be8bbf6f579504b9712cb6b643ec"}, + {file = "pydantic_core-2.23.4-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2e203fdf807ac7e12ab59ca2bfcabb38c7cf0b33c41efeb00f8e5da1d86af480"}, + {file = "pydantic_core-2.23.4-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e08277a400de01bc72436a0ccd02bdf596631411f592ad985dcee21445bd0068"}, + {file = "pydantic_core-2.23.4-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f220b0eea5965dec25480b6333c788fb72ce5f9129e8759ef876a1d805d00801"}, + {file = "pydantic_core-2.23.4-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:d06b0c8da4f16d1d1e352134427cb194a0a6e19ad5db9161bf32b2113409e728"}, + {file = "pydantic_core-2.23.4-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:ba1a0996f6c2773bd83e63f18914c1de3c9dd26d55f4ac302a7efe93fb8e7433"}, + {file = "pydantic_core-2.23.4-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:9a5bce9d23aac8f0cf0836ecfc033896aa8443b501c58d0602dbfd5bd5b37753"}, + {file = "pydantic_core-2.23.4-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:78ddaaa81421a29574a682b3179d4cf9e6d405a09b99d93ddcf7e5239c742e21"}, + {file = "pydantic_core-2.23.4-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:883a91b5dd7d26492ff2f04f40fbb652de40fcc0afe07e8129e8ae779c2110eb"}, + {file = "pydantic_core-2.23.4-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:88ad334a15b32a791ea935af224b9de1bf99bcd62fabf745d5f3442199d86d59"}, + {file = "pydantic_core-2.23.4-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:233710f069d251feb12a56da21e14cca67994eab08362207785cf8c598e74577"}, + {file = "pydantic_core-2.23.4-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:19442362866a753485ba5e4be408964644dd6a09123d9416c54cd49171f50744"}, + {file = "pydantic_core-2.23.4-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:624e278a7d29b6445e4e813af92af37820fafb6dcc55c012c834f9e26f9aaaef"}, + {file = "pydantic_core-2.23.4-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:f5ef8f42bec47f21d07668a043f077d507e5bf4e668d5c6dfe6aaba89de1a5b8"}, + {file = "pydantic_core-2.23.4-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:aea443fffa9fbe3af1a9ba721a87f926fe548d32cab71d188a6ede77d0ff244e"}, + {file = "pydantic_core-2.23.4.tar.gz", hash = "sha256:2584f7cf844ac4d970fba483a717dbe10c1c1c96a969bf65d61ffe94df1b2863"}, +] + +[package.dependencies] +typing-extensions = ">=4.6.0,<4.7.0 || >4.7.0" + +[[package]] +name = "pyreadline3" +version = "3.5.4" +description = "A python implementation of GNU readline." 
+optional = false +python-versions = ">=3.8" +files = [ + {file = "pyreadline3-3.5.4-py3-none-any.whl", hash = "sha256:eaf8e6cc3c49bcccf145fc6067ba8643d1df34d604a1ec0eccbf7a18e6d3fae6"}, + {file = "pyreadline3-3.5.4.tar.gz", hash = "sha256:8d57d53039a1c75adba8e50dd3d992b28143480816187ea5efbd5c78e6c885b7"}, +] + +[package.extras] +dev = ["build", "flake8", "mypy", "pytest", "twine"] + +[[package]] +name = "pytest" +version = "7.4.4" +description = "pytest: simple powerful testing with Python" +optional = false +python-versions = ">=3.7" +files = [ + {file = "pytest-7.4.4-py3-none-any.whl", hash = "sha256:b090cdf5ed60bf4c45261be03239c2c1c22df034fbffe691abe93cd80cea01d8"}, + {file = "pytest-7.4.4.tar.gz", hash = "sha256:2cf0005922c6ace4a3e2ec8b4080eb0d9753fdc93107415332f50ce9e7994280"}, +] + +[package.dependencies] +colorama = {version = "*", markers = "sys_platform == \"win32\""} +exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""} +iniconfig = "*" +packaging = "*" +pluggy = ">=0.12,<2.0" +tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} + +[package.extras] +testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] + +[[package]] +name = "python-dateutil" +version = "2.9.0.post0" +description = "Extensions to the standard Python datetime module" +optional = true +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" +files = [ + {file = "python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3"}, + {file = "python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427"}, +] + +[package.dependencies] +six = ">=1.5" + +[[package]] +name = "pytz" +version = "2024.2" +description = "World timezone definitions, modern and historical" +optional = true +python-versions = "*" +files = [ + {file = "pytz-2024.2-py2.py3-none-any.whl", hash = "sha256:31c7c1817eb7fae7ca4b8c7ee50c72f93aa2dd863de768e1ef4245d426aa0725"}, + {file = "pytz-2024.2.tar.gz", hash = "sha256:2aa355083c50a0f93fa581709deac0c9ad65cca8a9e9beac660adcbd493c798a"}, +] + +[[package]] +name = "pyyaml" +version = "6.0.2" +description = "YAML parser and emitter for Python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "PyYAML-6.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0a9a2848a5b7feac301353437eb7d5957887edbf81d56e903999a75a3d743086"}, + {file = "PyYAML-6.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:29717114e51c84ddfba879543fb232a6ed60086602313ca38cce623c1d62cfbf"}, + {file = "PyYAML-6.0.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8824b5a04a04a047e72eea5cec3bc266db09e35de6bdfe34c9436ac5ee27d237"}, + {file = "PyYAML-6.0.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7c36280e6fb8385e520936c3cb3b8042851904eba0e58d277dca80a5cfed590b"}, + {file = "PyYAML-6.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ec031d5d2feb36d1d1a24380e4db6d43695f3748343d99434e6f5f9156aaa2ed"}, + {file = "PyYAML-6.0.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:936d68689298c36b53b29f23c6dbb74de12b4ac12ca6cfe0e047bedceea56180"}, + {file = "PyYAML-6.0.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:23502f431948090f597378482b4812b0caae32c22213aecf3b55325e049a6c68"}, + {file = "PyYAML-6.0.2-cp310-cp310-win32.whl", hash = 
"sha256:2e99c6826ffa974fe6e27cdb5ed0021786b03fc98e5ee3c5bfe1fd5015f42b99"}, + {file = "PyYAML-6.0.2-cp310-cp310-win_amd64.whl", hash = "sha256:a4d3091415f010369ae4ed1fc6b79def9416358877534caf6a0fdd2146c87a3e"}, + {file = "PyYAML-6.0.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:cc1c1159b3d456576af7a3e4d1ba7e6924cb39de8f67111c735f6fc832082774"}, + {file = "PyYAML-6.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1e2120ef853f59c7419231f3bf4e7021f1b936f6ebd222406c3b60212205d2ee"}, + {file = "PyYAML-6.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5d225db5a45f21e78dd9358e58a98702a0302f2659a3c6cd320564b75b86f47c"}, + {file = "PyYAML-6.0.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5ac9328ec4831237bec75defaf839f7d4564be1e6b25ac710bd1a96321cc8317"}, + {file = "PyYAML-6.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ad2a3decf9aaba3d29c8f537ac4b243e36bef957511b4766cb0057d32b0be85"}, + {file = "PyYAML-6.0.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:ff3824dc5261f50c9b0dfb3be22b4567a6f938ccce4587b38952d85fd9e9afe4"}, + {file = "PyYAML-6.0.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:797b4f722ffa07cc8d62053e4cff1486fa6dc094105d13fea7b1de7d8bf71c9e"}, + {file = "PyYAML-6.0.2-cp311-cp311-win32.whl", hash = "sha256:11d8f3dd2b9c1207dcaf2ee0bbbfd5991f571186ec9cc78427ba5bd32afae4b5"}, + {file = "PyYAML-6.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:e10ce637b18caea04431ce14fabcf5c64a1c61ec9c56b071a4b7ca131ca52d44"}, + {file = "PyYAML-6.0.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:c70c95198c015b85feafc136515252a261a84561b7b1d51e3384e0655ddf25ab"}, + {file = "PyYAML-6.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ce826d6ef20b1bc864f0a68340c8b3287705cae2f8b4b1d932177dcc76721725"}, + {file = "PyYAML-6.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1f71ea527786de97d1a0cc0eacd1defc0985dcf6b3f17bb77dcfc8c34bec4dc5"}, + {file = "PyYAML-6.0.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9b22676e8097e9e22e36d6b7bda33190d0d400f345f23d4065d48f4ca7ae0425"}, + {file = "PyYAML-6.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:80bab7bfc629882493af4aa31a4cfa43a4c57c83813253626916b8c7ada83476"}, + {file = "PyYAML-6.0.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:0833f8694549e586547b576dcfaba4a6b55b9e96098b36cdc7ebefe667dfed48"}, + {file = "PyYAML-6.0.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8b9c7197f7cb2738065c481a0461e50ad02f18c78cd75775628afb4d7137fb3b"}, + {file = "PyYAML-6.0.2-cp312-cp312-win32.whl", hash = "sha256:ef6107725bd54b262d6dedcc2af448a266975032bc85ef0172c5f059da6325b4"}, + {file = "PyYAML-6.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:7e7401d0de89a9a855c839bc697c079a4af81cf878373abd7dc625847d25cbd8"}, + {file = "PyYAML-6.0.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:efdca5630322a10774e8e98e1af481aad470dd62c3170801852d752aa7a783ba"}, + {file = "PyYAML-6.0.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:50187695423ffe49e2deacb8cd10510bc361faac997de9efef88badc3bb9e2d1"}, + {file = "PyYAML-6.0.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0ffe8360bab4910ef1b9e87fb812d8bc0a308b0d0eef8c8f44e0254ab3b07133"}, + {file = "PyYAML-6.0.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:17e311b6c678207928d649faa7cb0d7b4c26a0ba73d41e99c4fff6b6c3276484"}, + {file = 
"PyYAML-6.0.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:70b189594dbe54f75ab3a1acec5f1e3faa7e8cf2f1e08d9b561cb41b845f69d5"}, + {file = "PyYAML-6.0.2-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:41e4e3953a79407c794916fa277a82531dd93aad34e29c2a514c2c0c5fe971cc"}, + {file = "PyYAML-6.0.2-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:68ccc6023a3400877818152ad9a1033e3db8625d899c72eacb5a668902e4d652"}, + {file = "PyYAML-6.0.2-cp313-cp313-win32.whl", hash = "sha256:bc2fa7c6b47d6bc618dd7fb02ef6fdedb1090ec036abab80d4681424b84c1183"}, + {file = "PyYAML-6.0.2-cp313-cp313-win_amd64.whl", hash = "sha256:8388ee1976c416731879ac16da0aff3f63b286ffdd57cdeb95f3f2e085687563"}, + {file = "PyYAML-6.0.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:24471b829b3bf607e04e88d79542a9d48bb037c2267d7927a874e6c205ca7e9a"}, + {file = "PyYAML-6.0.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d7fded462629cfa4b685c5416b949ebad6cec74af5e2d42905d41e257e0869f5"}, + {file = "PyYAML-6.0.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d84a1718ee396f54f3a086ea0a66d8e552b2ab2017ef8b420e92edbc841c352d"}, + {file = "PyYAML-6.0.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9056c1ecd25795207ad294bcf39f2db3d845767be0ea6e6a34d856f006006083"}, + {file = "PyYAML-6.0.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:82d09873e40955485746739bcb8b4586983670466c23382c19cffecbf1fd8706"}, + {file = "PyYAML-6.0.2-cp38-cp38-win32.whl", hash = "sha256:43fa96a3ca0d6b1812e01ced1044a003533c47f6ee8aca31724f78e93ccc089a"}, + {file = "PyYAML-6.0.2-cp38-cp38-win_amd64.whl", hash = "sha256:01179a4a8559ab5de078078f37e5c1a30d76bb88519906844fd7bdea1b7729ff"}, + {file = "PyYAML-6.0.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:688ba32a1cffef67fd2e9398a2efebaea461578b0923624778664cc1c914db5d"}, + {file = "PyYAML-6.0.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a8786accb172bd8afb8be14490a16625cbc387036876ab6ba70912730faf8e1f"}, + {file = "PyYAML-6.0.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d8e03406cac8513435335dbab54c0d385e4a49e4945d2909a581c83647ca0290"}, + {file = "PyYAML-6.0.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f753120cb8181e736c57ef7636e83f31b9c0d1722c516f7e86cf15b7aa57ff12"}, + {file = "PyYAML-6.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3b1fdb9dc17f5a7677423d508ab4f243a726dea51fa5e70992e59a7411c89d19"}, + {file = "PyYAML-6.0.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:0b69e4ce7a131fe56b7e4d770c67429700908fc0752af059838b1cfb41960e4e"}, + {file = "PyYAML-6.0.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:a9f8c2e67970f13b16084e04f134610fd1d374bf477b17ec1599185cf611d725"}, + {file = "PyYAML-6.0.2-cp39-cp39-win32.whl", hash = "sha256:6395c297d42274772abc367baaa79683958044e5d3835486c16da75d2a694631"}, + {file = "PyYAML-6.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:39693e1f8320ae4f43943590b49779ffb98acb81f788220ea932a6b6c51004d8"}, + {file = "pyyaml-6.0.2.tar.gz", hash = "sha256:d584d9ec91ad65861cc08d42e834324ef890a082e591037abe114850ff7bbc3e"}, +] + +[[package]] +name = "referencing" +version = "0.35.1" +description = "JSON Referencing + Python" +optional = true +python-versions = ">=3.8" +files = [ + {file = "referencing-0.35.1-py3-none-any.whl", hash = "sha256:eda6d3234d62814d1c64e305c1331c9a3a6132da475ab6382eaa997b21ee75de"}, + {file = "referencing-0.35.1.tar.gz", hash = 
"sha256:25b42124a6c8b632a425174f24087783efb348a6f1e0008e63cd4466fedf703c"}, +] + +[package.dependencies] +attrs = ">=22.2.0" +rpds-py = ">=0.7.0" + +[[package]] +name = "regex" +version = "2024.9.11" +description = "Alternative regular expression module, to replace re." +optional = false +python-versions = ">=3.8" +files = [ + {file = "regex-2024.9.11-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:1494fa8725c285a81d01dc8c06b55287a1ee5e0e382d8413adc0a9197aac6408"}, + {file = "regex-2024.9.11-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0e12c481ad92d129c78f13a2a3662317e46ee7ef96c94fd332e1c29131875b7d"}, + {file = "regex-2024.9.11-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:16e13a7929791ac1216afde26f712802e3df7bf0360b32e4914dca3ab8baeea5"}, + {file = "regex-2024.9.11-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:46989629904bad940bbec2106528140a218b4a36bb3042d8406980be1941429c"}, + {file = "regex-2024.9.11-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a906ed5e47a0ce5f04b2c981af1c9acf9e8696066900bf03b9d7879a6f679fc8"}, + {file = "regex-2024.9.11-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e9a091b0550b3b0207784a7d6d0f1a00d1d1c8a11699c1a4d93db3fbefc3ad35"}, + {file = "regex-2024.9.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5ddcd9a179c0a6fa8add279a4444015acddcd7f232a49071ae57fa6e278f1f71"}, + {file = "regex-2024.9.11-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6b41e1adc61fa347662b09398e31ad446afadff932a24807d3ceb955ed865cc8"}, + {file = "regex-2024.9.11-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:ced479f601cd2f8ca1fd7b23925a7e0ad512a56d6e9476f79b8f381d9d37090a"}, + {file = "regex-2024.9.11-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:635a1d96665f84b292e401c3d62775851aedc31d4f8784117b3c68c4fcd4118d"}, + {file = "regex-2024.9.11-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:c0256beda696edcf7d97ef16b2a33a8e5a875affd6fa6567b54f7c577b30a137"}, + {file = "regex-2024.9.11-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:3ce4f1185db3fbde8ed8aa223fc9620f276c58de8b0d4f8cc86fd1360829edb6"}, + {file = "regex-2024.9.11-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:09d77559e80dcc9d24570da3745ab859a9cf91953062e4ab126ba9d5993688ca"}, + {file = "regex-2024.9.11-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:7a22ccefd4db3f12b526eccb129390942fe874a3a9fdbdd24cf55773a1faab1a"}, + {file = "regex-2024.9.11-cp310-cp310-win32.whl", hash = "sha256:f745ec09bc1b0bd15cfc73df6fa4f726dcc26bb16c23a03f9e3367d357eeedd0"}, + {file = "regex-2024.9.11-cp310-cp310-win_amd64.whl", hash = "sha256:01c2acb51f8a7d6494c8c5eafe3d8e06d76563d8a8a4643b37e9b2dd8a2ff623"}, + {file = "regex-2024.9.11-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:2cce2449e5927a0bf084d346da6cd5eb016b2beca10d0013ab50e3c226ffc0df"}, + {file = "regex-2024.9.11-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:3b37fa423beefa44919e009745ccbf353d8c981516e807995b2bd11c2c77d268"}, + {file = "regex-2024.9.11-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:64ce2799bd75039b480cc0360907c4fb2f50022f030bf9e7a8705b636e408fad"}, + {file = "regex-2024.9.11-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a4cc92bb6db56ab0c1cbd17294e14f5e9224f0cc6521167ef388332604e92679"}, + {file = 
"regex-2024.9.11-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d05ac6fa06959c4172eccd99a222e1fbf17b5670c4d596cb1e5cde99600674c4"}, + {file = "regex-2024.9.11-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:040562757795eeea356394a7fb13076ad4f99d3c62ab0f8bdfb21f99a1f85664"}, + {file = "regex-2024.9.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6113c008a7780792efc80f9dfe10ba0cd043cbf8dc9a76ef757850f51b4edc50"}, + {file = "regex-2024.9.11-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8e5fb5f77c8745a60105403a774fe2c1759b71d3e7b4ca237a5e67ad066c7199"}, + {file = "regex-2024.9.11-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:54d9ff35d4515debf14bc27f1e3b38bfc453eff3220f5bce159642fa762fe5d4"}, + {file = "regex-2024.9.11-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:df5cbb1fbc74a8305b6065d4ade43b993be03dbe0f8b30032cced0d7740994bd"}, + {file = "regex-2024.9.11-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:7fb89ee5d106e4a7a51bce305ac4efb981536301895f7bdcf93ec92ae0d91c7f"}, + {file = "regex-2024.9.11-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:a738b937d512b30bf75995c0159c0ddf9eec0775c9d72ac0202076c72f24aa96"}, + {file = "regex-2024.9.11-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:e28f9faeb14b6f23ac55bfbbfd3643f5c7c18ede093977f1df249f73fd22c7b1"}, + {file = "regex-2024.9.11-cp311-cp311-win32.whl", hash = "sha256:18e707ce6c92d7282dfce370cd205098384b8ee21544e7cb29b8aab955b66fa9"}, + {file = "regex-2024.9.11-cp311-cp311-win_amd64.whl", hash = "sha256:313ea15e5ff2a8cbbad96ccef6be638393041b0a7863183c2d31e0c6116688cf"}, + {file = "regex-2024.9.11-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:b0d0a6c64fcc4ef9c69bd5b3b3626cc3776520a1637d8abaa62b9edc147a58f7"}, + {file = "regex-2024.9.11-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:49b0e06786ea663f933f3710a51e9385ce0cba0ea56b67107fd841a55d56a231"}, + {file = "regex-2024.9.11-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:5b513b6997a0b2f10e4fd3a1313568e373926e8c252bd76c960f96fd039cd28d"}, + {file = "regex-2024.9.11-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ee439691d8c23e76f9802c42a95cfeebf9d47cf4ffd06f18489122dbb0a7ad64"}, + {file = "regex-2024.9.11-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a8f877c89719d759e52783f7fe6e1c67121076b87b40542966c02de5503ace42"}, + {file = "regex-2024.9.11-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:23b30c62d0f16827f2ae9f2bb87619bc4fba2044911e2e6c2eb1af0161cdb766"}, + {file = "regex-2024.9.11-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:85ab7824093d8f10d44330fe1e6493f756f252d145323dd17ab6b48733ff6c0a"}, + {file = "regex-2024.9.11-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8dee5b4810a89447151999428fe096977346cf2f29f4d5e29609d2e19e0199c9"}, + {file = "regex-2024.9.11-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:98eeee2f2e63edae2181c886d7911ce502e1292794f4c5ee71e60e23e8d26b5d"}, + {file = "regex-2024.9.11-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:57fdd2e0b2694ce6fc2e5ccf189789c3e2962916fb38779d3e3521ff8fe7a822"}, + {file = "regex-2024.9.11-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:d552c78411f60b1fdaafd117a1fca2f02e562e309223b9d44b7de8be451ec5e0"}, + {file = 
"regex-2024.9.11-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:a0b2b80321c2ed3fcf0385ec9e51a12253c50f146fddb2abbb10f033fe3d049a"}, + {file = "regex-2024.9.11-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:18406efb2f5a0e57e3a5881cd9354c1512d3bb4f5c45d96d110a66114d84d23a"}, + {file = "regex-2024.9.11-cp312-cp312-win32.whl", hash = "sha256:e464b467f1588e2c42d26814231edecbcfe77f5ac414d92cbf4e7b55b2c2a776"}, + {file = "regex-2024.9.11-cp312-cp312-win_amd64.whl", hash = "sha256:9e8719792ca63c6b8340380352c24dcb8cd7ec49dae36e963742a275dfae6009"}, + {file = "regex-2024.9.11-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:c157bb447303070f256e084668b702073db99bbb61d44f85d811025fcf38f784"}, + {file = "regex-2024.9.11-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:4db21ece84dfeefc5d8a3863f101995de646c6cb0536952c321a2650aa202c36"}, + {file = "regex-2024.9.11-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:220e92a30b426daf23bb67a7962900ed4613589bab80382be09b48896d211e92"}, + {file = "regex-2024.9.11-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:eb1ae19e64c14c7ec1995f40bd932448713d3c73509e82d8cd7744dc00e29e86"}, + {file = "regex-2024.9.11-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f47cd43a5bfa48f86925fe26fbdd0a488ff15b62468abb5d2a1e092a4fb10e85"}, + {file = "regex-2024.9.11-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9d4a76b96f398697fe01117093613166e6aa8195d63f1b4ec3f21ab637632963"}, + {file = "regex-2024.9.11-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0ea51dcc0835eea2ea31d66456210a4e01a076d820e9039b04ae8d17ac11dee6"}, + {file = "regex-2024.9.11-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b7aaa315101c6567a9a45d2839322c51c8d6e81f67683d529512f5bcfb99c802"}, + {file = "regex-2024.9.11-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:c57d08ad67aba97af57a7263c2d9006d5c404d721c5f7542f077f109ec2a4a29"}, + {file = "regex-2024.9.11-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:f8404bf61298bb6f8224bb9176c1424548ee1181130818fcd2cbffddc768bed8"}, + {file = "regex-2024.9.11-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:dd4490a33eb909ef5078ab20f5f000087afa2a4daa27b4c072ccb3cb3050ad84"}, + {file = "regex-2024.9.11-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:eee9130eaad130649fd73e5cd92f60e55708952260ede70da64de420cdcad554"}, + {file = "regex-2024.9.11-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:6a2644a93da36c784e546de579ec1806bfd2763ef47babc1b03d765fe560c9f8"}, + {file = "regex-2024.9.11-cp313-cp313-win32.whl", hash = "sha256:e997fd30430c57138adc06bba4c7c2968fb13d101e57dd5bb9355bf8ce3fa7e8"}, + {file = "regex-2024.9.11-cp313-cp313-win_amd64.whl", hash = "sha256:042c55879cfeb21a8adacc84ea347721d3d83a159da6acdf1116859e2427c43f"}, + {file = "regex-2024.9.11-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:35f4a6f96aa6cb3f2f7247027b07b15a374f0d5b912c0001418d1d55024d5cb4"}, + {file = "regex-2024.9.11-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:55b96e7ce3a69a8449a66984c268062fbaa0d8ae437b285428e12797baefce7e"}, + {file = "regex-2024.9.11-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:cb130fccd1a37ed894824b8c046321540263013da72745d755f2d35114b81a60"}, + {file = "regex-2024.9.11-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:323c1f04be6b2968944d730e5c2091c8c89767903ecaa135203eec4565ed2b2b"}, + {file = 
"regex-2024.9.11-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:be1c8ed48c4c4065ecb19d882a0ce1afe0745dfad8ce48c49586b90a55f02366"}, + {file = "regex-2024.9.11-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b5b029322e6e7b94fff16cd120ab35a253236a5f99a79fb04fda7ae71ca20ae8"}, + {file = "regex-2024.9.11-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f6fff13ef6b5f29221d6904aa816c34701462956aa72a77f1f151a8ec4f56aeb"}, + {file = "regex-2024.9.11-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:587d4af3979376652010e400accc30404e6c16b7df574048ab1f581af82065e4"}, + {file = "regex-2024.9.11-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:079400a8269544b955ffa9e31f186f01d96829110a3bf79dc338e9910f794fca"}, + {file = "regex-2024.9.11-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:f9268774428ec173654985ce55fc6caf4c6d11ade0f6f914d48ef4719eb05ebb"}, + {file = "regex-2024.9.11-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:23f9985c8784e544d53fc2930fc1ac1a7319f5d5332d228437acc9f418f2f168"}, + {file = "regex-2024.9.11-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:ae2941333154baff9838e88aa71c1d84f4438189ecc6021a12c7573728b5838e"}, + {file = "regex-2024.9.11-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:e93f1c331ca8e86fe877a48ad64e77882c0c4da0097f2212873a69bbfea95d0c"}, + {file = "regex-2024.9.11-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:846bc79ee753acf93aef4184c040d709940c9d001029ceb7b7a52747b80ed2dd"}, + {file = "regex-2024.9.11-cp38-cp38-win32.whl", hash = "sha256:c94bb0a9f1db10a1d16c00880bdebd5f9faf267273b8f5bd1878126e0fbde771"}, + {file = "regex-2024.9.11-cp38-cp38-win_amd64.whl", hash = "sha256:2b08fce89fbd45664d3df6ad93e554b6c16933ffa9d55cb7e01182baaf971508"}, + {file = "regex-2024.9.11-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:07f45f287469039ffc2c53caf6803cd506eb5f5f637f1d4acb37a738f71dd066"}, + {file = "regex-2024.9.11-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:4838e24ee015101d9f901988001038f7f0d90dc0c3b115541a1365fb439add62"}, + {file = "regex-2024.9.11-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:6edd623bae6a737f10ce853ea076f56f507fd7726bee96a41ee3d68d347e4d16"}, + {file = "regex-2024.9.11-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c69ada171c2d0e97a4b5aa78fbb835e0ffbb6b13fc5da968c09811346564f0d3"}, + {file = "regex-2024.9.11-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:02087ea0a03b4af1ed6ebab2c54d7118127fee8d71b26398e8e4b05b78963199"}, + {file = "regex-2024.9.11-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:69dee6a020693d12a3cf892aba4808fe168d2a4cef368eb9bf74f5398bfd4ee8"}, + {file = "regex-2024.9.11-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:297f54910247508e6e5cae669f2bc308985c60540a4edd1c77203ef19bfa63ca"}, + {file = "regex-2024.9.11-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ecea58b43a67b1b79805f1a0255730edaf5191ecef84dbc4cc85eb30bc8b63b9"}, + {file = "regex-2024.9.11-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:eab4bb380f15e189d1313195b062a6aa908f5bd687a0ceccd47c8211e9cf0d4a"}, + {file = "regex-2024.9.11-cp39-cp39-musllinux_1_2_aarch64.whl", hash = 
"sha256:0cbff728659ce4bbf4c30b2a1be040faafaa9eca6ecde40aaff86f7889f4ab39"}, + {file = "regex-2024.9.11-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:54c4a097b8bc5bb0dfc83ae498061d53ad7b5762e00f4adaa23bee22b012e6ba"}, + {file = "regex-2024.9.11-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:73d6d2f64f4d894c96626a75578b0bf7d9e56dcda8c3d037a2118fdfe9b1c664"}, + {file = "regex-2024.9.11-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:e53b5fbab5d675aec9f0c501274c467c0f9a5d23696cfc94247e1fb56501ed89"}, + {file = "regex-2024.9.11-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:0ffbcf9221e04502fc35e54d1ce9567541979c3fdfb93d2c554f0ca583a19b35"}, + {file = "regex-2024.9.11-cp39-cp39-win32.whl", hash = "sha256:e4c22e1ac1f1ec1e09f72e6c44d8f2244173db7eb9629cc3a346a8d7ccc31142"}, + {file = "regex-2024.9.11-cp39-cp39-win_amd64.whl", hash = "sha256:faa3c142464efec496967359ca99696c896c591c56c53506bac1ad465f66e919"}, + {file = "regex-2024.9.11.tar.gz", hash = "sha256:6c188c307e8433bcb63dc1915022deb553b4203a70722fc542c363bf120a01fd"}, +] + +[[package]] +name = "requests" +version = "2.32.3" +description = "Python HTTP for Humans." +optional = false +python-versions = ">=3.8" +files = [ + {file = "requests-2.32.3-py3-none-any.whl", hash = "sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6"}, + {file = "requests-2.32.3.tar.gz", hash = "sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760"}, +] + +[package.dependencies] +certifi = ">=2017.4.17" +charset-normalizer = ">=2,<4" +idna = ">=2.5,<4" +urllib3 = ">=1.21.1,<3" + +[package.extras] +socks = ["PySocks (>=1.5.6,!=1.5.7)"] +use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] + +[[package]] +name = "rich" +version = "13.8.1" +description = "Render rich text, tables, progress bars, syntax highlighting, markdown and more to the terminal" +optional = false +python-versions = ">=3.7.0" +files = [ + {file = "rich-13.8.1-py3-none-any.whl", hash = "sha256:1760a3c0848469b97b558fc61c85233e3dafb69c7a071b4d60c38099d3cd4c06"}, + {file = "rich-13.8.1.tar.gz", hash = "sha256:8260cda28e3db6bf04d2d1ef4dbc03ba80a824c88b0e7668a0f23126a424844a"}, +] + +[package.dependencies] +markdown-it-py = ">=2.2.0" +pygments = ">=2.13.0,<3.0.0" + +[package.extras] +jupyter = ["ipywidgets (>=7.5.1,<9)"] + +[[package]] +name = "rpds-py" +version = "0.20.0" +description = "Python bindings to Rust's persistent data structures (rpds)" +optional = true +python-versions = ">=3.8" +files = [ + {file = "rpds_py-0.20.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:3ad0fda1635f8439cde85c700f964b23ed5fc2d28016b32b9ee5fe30da5c84e2"}, + {file = "rpds_py-0.20.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9bb4a0d90fdb03437c109a17eade42dfbf6190408f29b2744114d11586611d6f"}, + {file = "rpds_py-0.20.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c6377e647bbfd0a0b159fe557f2c6c602c159fc752fa316572f012fc0bf67150"}, + {file = "rpds_py-0.20.0-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:eb851b7df9dda52dc1415ebee12362047ce771fc36914586b2e9fcbd7d293b3e"}, + {file = "rpds_py-0.20.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1e0f80b739e5a8f54837be5d5c924483996b603d5502bfff79bf33da06164ee2"}, + {file = "rpds_py-0.20.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5a8c94dad2e45324fc74dce25e1645d4d14df9a4e54a30fa0ae8bad9a63928e3"}, + {file = "rpds_py-0.20.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:f8e604fe73ba048c06085beaf51147eaec7df856824bfe7b98657cf436623daf"}, + {file = "rpds_py-0.20.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:df3de6b7726b52966edf29663e57306b23ef775faf0ac01a3e9f4012a24a4140"}, + {file = "rpds_py-0.20.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:cf258ede5bc22a45c8e726b29835b9303c285ab46fc7c3a4cc770736b5304c9f"}, + {file = "rpds_py-0.20.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:55fea87029cded5df854ca7e192ec7bdb7ecd1d9a3f63d5c4eb09148acf4a7ce"}, + {file = "rpds_py-0.20.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:ae94bd0b2f02c28e199e9bc51485d0c5601f58780636185660f86bf80c89af94"}, + {file = "rpds_py-0.20.0-cp310-none-win32.whl", hash = "sha256:28527c685f237c05445efec62426d285e47a58fb05ba0090a4340b73ecda6dee"}, + {file = "rpds_py-0.20.0-cp310-none-win_amd64.whl", hash = "sha256:238a2d5b1cad28cdc6ed15faf93a998336eb041c4e440dd7f902528b8891b399"}, + {file = "rpds_py-0.20.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:ac2f4f7a98934c2ed6505aead07b979e6f999389f16b714448fb39bbaa86a489"}, + {file = "rpds_py-0.20.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:220002c1b846db9afd83371d08d239fdc865e8f8c5795bbaec20916a76db3318"}, + {file = "rpds_py-0.20.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8d7919548df3f25374a1f5d01fbcd38dacab338ef5f33e044744b5c36729c8db"}, + {file = "rpds_py-0.20.0-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:758406267907b3781beee0f0edfe4a179fbd97c0be2e9b1154d7f0a1279cf8e5"}, + {file = "rpds_py-0.20.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3d61339e9f84a3f0767b1995adfb171a0d00a1185192718a17af6e124728e0f5"}, + {file = "rpds_py-0.20.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1259c7b3705ac0a0bd38197565a5d603218591d3f6cee6e614e380b6ba61c6f6"}, + {file = "rpds_py-0.20.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5c1dc0f53856b9cc9a0ccca0a7cc61d3d20a7088201c0937f3f4048c1718a209"}, + {file = "rpds_py-0.20.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:7e60cb630f674a31f0368ed32b2a6b4331b8350d67de53c0359992444b116dd3"}, + {file = "rpds_py-0.20.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:dbe982f38565bb50cb7fb061ebf762c2f254ca3d8c20d4006878766e84266272"}, + {file = "rpds_py-0.20.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:514b3293b64187172bc77c8fb0cdae26981618021053b30d8371c3a902d4d5ad"}, + {file = "rpds_py-0.20.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:d0a26ffe9d4dd35e4dfdd1e71f46401cff0181c75ac174711ccff0459135fa58"}, + {file = "rpds_py-0.20.0-cp311-none-win32.whl", hash = "sha256:89c19a494bf3ad08c1da49445cc5d13d8fefc265f48ee7e7556839acdacf69d0"}, + {file = "rpds_py-0.20.0-cp311-none-win_amd64.whl", hash = "sha256:c638144ce971df84650d3ed0096e2ae7af8e62ecbbb7b201c8935c370df00a2c"}, + {file = "rpds_py-0.20.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:a84ab91cbe7aab97f7446652d0ed37d35b68a465aeef8fc41932a9d7eee2c1a6"}, + {file = "rpds_py-0.20.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:56e27147a5a4c2c21633ff8475d185734c0e4befd1c989b5b95a5d0db699b21b"}, + {file = "rpds_py-0.20.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2580b0c34583b85efec8c5c5ec9edf2dfe817330cc882ee972ae650e7b5ef739"}, + {file = "rpds_py-0.20.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = 
"sha256:b80d4a7900cf6b66bb9cee5c352b2d708e29e5a37fe9bf784fa97fc11504bf6c"}, + {file = "rpds_py-0.20.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:50eccbf054e62a7b2209b28dc7a22d6254860209d6753e6b78cfaeb0075d7bee"}, + {file = "rpds_py-0.20.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:49a8063ea4296b3a7e81a5dfb8f7b2d73f0b1c20c2af401fb0cdf22e14711a96"}, + {file = "rpds_py-0.20.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ea438162a9fcbee3ecf36c23e6c68237479f89f962f82dae83dc15feeceb37e4"}, + {file = "rpds_py-0.20.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:18d7585c463087bddcfa74c2ba267339f14f2515158ac4db30b1f9cbdb62c8ef"}, + {file = "rpds_py-0.20.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:d4c7d1a051eeb39f5c9547e82ea27cbcc28338482242e3e0b7768033cb083821"}, + {file = "rpds_py-0.20.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:e4df1e3b3bec320790f699890d41c59d250f6beda159ea3c44c3f5bac1976940"}, + {file = "rpds_py-0.20.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:2cf126d33a91ee6eedc7f3197b53e87a2acdac63602c0f03a02dd69e4b138174"}, + {file = "rpds_py-0.20.0-cp312-none-win32.whl", hash = "sha256:8bc7690f7caee50b04a79bf017a8d020c1f48c2a1077ffe172abec59870f1139"}, + {file = "rpds_py-0.20.0-cp312-none-win_amd64.whl", hash = "sha256:0e13e6952ef264c40587d510ad676a988df19adea20444c2b295e536457bc585"}, + {file = "rpds_py-0.20.0-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:aa9a0521aeca7d4941499a73ad7d4f8ffa3d1affc50b9ea11d992cd7eff18a29"}, + {file = "rpds_py-0.20.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:4a1f1d51eccb7e6c32ae89243cb352389228ea62f89cd80823ea7dd1b98e0b91"}, + {file = "rpds_py-0.20.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8a86a9b96070674fc88b6f9f71a97d2c1d3e5165574615d1f9168ecba4cecb24"}, + {file = "rpds_py-0.20.0-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:6c8ef2ebf76df43f5750b46851ed1cdf8f109d7787ca40035fe19fbdc1acc5a7"}, + {file = "rpds_py-0.20.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b74b25f024b421d5859d156750ea9a65651793d51b76a2e9238c05c9d5f203a9"}, + {file = "rpds_py-0.20.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:57eb94a8c16ab08fef6404301c38318e2c5a32216bf5de453e2714c964c125c8"}, + {file = "rpds_py-0.20.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e1940dae14e715e2e02dfd5b0f64a52e8374a517a1e531ad9412319dc3ac7879"}, + {file = "rpds_py-0.20.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d20277fd62e1b992a50c43f13fbe13277a31f8c9f70d59759c88f644d66c619f"}, + {file = "rpds_py-0.20.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:06db23d43f26478303e954c34c75182356ca9aa7797d22c5345b16871ab9c45c"}, + {file = "rpds_py-0.20.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:b2a5db5397d82fa847e4c624b0c98fe59d2d9b7cf0ce6de09e4d2e80f8f5b3f2"}, + {file = "rpds_py-0.20.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:5a35df9f5548fd79cb2f52d27182108c3e6641a4feb0f39067911bf2adaa3e57"}, + {file = "rpds_py-0.20.0-cp313-none-win32.whl", hash = "sha256:fd2d84f40633bc475ef2d5490b9c19543fbf18596dcb1b291e3a12ea5d722f7a"}, + {file = "rpds_py-0.20.0-cp313-none-win_amd64.whl", hash = "sha256:9bc2d153989e3216b0559251b0c260cfd168ec78b1fac33dd485750a228db5a2"}, + {file = "rpds_py-0.20.0-cp38-cp38-macosx_10_12_x86_64.whl", hash = 
"sha256:f2fbf7db2012d4876fb0d66b5b9ba6591197b0f165db8d99371d976546472a24"}, + {file = "rpds_py-0.20.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:1e5f3cd7397c8f86c8cc72d5a791071431c108edd79872cdd96e00abd8497d29"}, + {file = "rpds_py-0.20.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ce9845054c13696f7af7f2b353e6b4f676dab1b4b215d7fe5e05c6f8bb06f965"}, + {file = "rpds_py-0.20.0-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c3e130fd0ec56cb76eb49ef52faead8ff09d13f4527e9b0c400307ff72b408e1"}, + {file = "rpds_py-0.20.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4b16aa0107ecb512b568244ef461f27697164d9a68d8b35090e9b0c1c8b27752"}, + {file = "rpds_py-0.20.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:aa7f429242aae2947246587d2964fad750b79e8c233a2367f71b554e9447949c"}, + {file = "rpds_py-0.20.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:af0fc424a5842a11e28956e69395fbbeab2c97c42253169d87e90aac2886d751"}, + {file = "rpds_py-0.20.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b8c00a3b1e70c1d3891f0db1b05292747f0dbcfb49c43f9244d04c70fbc40eb8"}, + {file = "rpds_py-0.20.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:40ce74fc86ee4645d0a225498d091d8bc61f39b709ebef8204cb8b5a464d3c0e"}, + {file = "rpds_py-0.20.0-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:4fe84294c7019456e56d93e8ababdad5a329cd25975be749c3f5f558abb48253"}, + {file = "rpds_py-0.20.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:338ca4539aad4ce70a656e5187a3a31c5204f261aef9f6ab50e50bcdffaf050a"}, + {file = "rpds_py-0.20.0-cp38-none-win32.whl", hash = "sha256:54b43a2b07db18314669092bb2de584524d1ef414588780261e31e85846c26a5"}, + {file = "rpds_py-0.20.0-cp38-none-win_amd64.whl", hash = "sha256:a1862d2d7ce1674cffa6d186d53ca95c6e17ed2b06b3f4c476173565c862d232"}, + {file = "rpds_py-0.20.0-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:3fde368e9140312b6e8b6c09fb9f8c8c2f00999d1823403ae90cc00480221b22"}, + {file = "rpds_py-0.20.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9824fb430c9cf9af743cf7aaf6707bf14323fb51ee74425c380f4c846ea70789"}, + {file = "rpds_py-0.20.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:11ef6ce74616342888b69878d45e9f779b95d4bd48b382a229fe624a409b72c5"}, + {file = "rpds_py-0.20.0-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c52d3f2f82b763a24ef52f5d24358553e8403ce05f893b5347098014f2d9eff2"}, + {file = "rpds_py-0.20.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9d35cef91e59ebbeaa45214861874bc6f19eb35de96db73e467a8358d701a96c"}, + {file = "rpds_py-0.20.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d72278a30111e5b5525c1dd96120d9e958464316f55adb030433ea905866f4de"}, + {file = "rpds_py-0.20.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b4c29cbbba378759ac5786730d1c3cb4ec6f8ababf5c42a9ce303dc4b3d08cda"}, + {file = "rpds_py-0.20.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6632f2d04f15d1bd6fe0eedd3b86d9061b836ddca4c03d5cf5c7e9e6b7c14580"}, + {file = "rpds_py-0.20.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:d0b67d87bb45ed1cd020e8fbf2307d449b68abc45402fe1a4ac9e46c3c8b192b"}, + {file = "rpds_py-0.20.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:ec31a99ca63bf3cd7f1a5ac9fe95c5e2d060d3c768a09bc1d16e235840861420"}, + {file = 
"rpds_py-0.20.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:22e6c9976e38f4d8c4a63bd8a8edac5307dffd3ee7e6026d97f3cc3a2dc02a0b"}, + {file = "rpds_py-0.20.0-cp39-none-win32.whl", hash = "sha256:569b3ea770c2717b730b61998b6c54996adee3cef69fc28d444f3e7920313cf7"}, + {file = "rpds_py-0.20.0-cp39-none-win_amd64.whl", hash = "sha256:e6900ecdd50ce0facf703f7a00df12374b74bbc8ad9fe0f6559947fb20f82364"}, + {file = "rpds_py-0.20.0-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:617c7357272c67696fd052811e352ac54ed1d9b49ab370261a80d3b6ce385045"}, + {file = "rpds_py-0.20.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:9426133526f69fcaba6e42146b4e12d6bc6c839b8b555097020e2b78ce908dcc"}, + {file = "rpds_py-0.20.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:deb62214c42a261cb3eb04d474f7155279c1a8a8c30ac89b7dcb1721d92c3c02"}, + {file = "rpds_py-0.20.0-pp310-pypy310_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:fcaeb7b57f1a1e071ebd748984359fef83ecb026325b9d4ca847c95bc7311c92"}, + {file = "rpds_py-0.20.0-pp310-pypy310_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d454b8749b4bd70dd0a79f428731ee263fa6995f83ccb8bada706e8d1d3ff89d"}, + {file = "rpds_py-0.20.0-pp310-pypy310_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d807dc2051abe041b6649681dce568f8e10668e3c1c6543ebae58f2d7e617855"}, + {file = "rpds_py-0.20.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c3c20f0ddeb6e29126d45f89206b8291352b8c5b44384e78a6499d68b52ae511"}, + {file = "rpds_py-0.20.0-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b7f19250ceef892adf27f0399b9e5afad019288e9be756d6919cb58892129f51"}, + {file = "rpds_py-0.20.0-pp310-pypy310_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:4f1ed4749a08379555cebf4650453f14452eaa9c43d0a95c49db50c18b7da075"}, + {file = "rpds_py-0.20.0-pp310-pypy310_pp73-musllinux_1_2_i686.whl", hash = "sha256:dcedf0b42bcb4cfff4101d7771a10532415a6106062f005ab97d1d0ab5681c60"}, + {file = "rpds_py-0.20.0-pp310-pypy310_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:39ed0d010457a78f54090fafb5d108501b5aa5604cc22408fc1c0c77eac14344"}, + {file = "rpds_py-0.20.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:bb273176be34a746bdac0b0d7e4e2c467323d13640b736c4c477881a3220a989"}, + {file = "rpds_py-0.20.0-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:f918a1a130a6dfe1d7fe0f105064141342e7dd1611f2e6a21cd2f5c8cb1cfb3e"}, + {file = "rpds_py-0.20.0-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:f60012a73aa396be721558caa3a6fd49b3dd0033d1675c6d59c4502e870fcf0c"}, + {file = "rpds_py-0.20.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3d2b1ad682a3dfda2a4e8ad8572f3100f95fad98cb99faf37ff0ddfe9cbf9d03"}, + {file = "rpds_py-0.20.0-pp39-pypy39_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:614fdafe9f5f19c63ea02817fa4861c606a59a604a77c8cdef5aa01d28b97921"}, + {file = "rpds_py-0.20.0-pp39-pypy39_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:fa518bcd7600c584bf42e6617ee8132869e877db2f76bcdc281ec6a4113a53ab"}, + {file = "rpds_py-0.20.0-pp39-pypy39_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f0475242f447cc6cb8a9dd486d68b2ef7fbee84427124c232bff5f63b1fe11e5"}, + {file = "rpds_py-0.20.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f90a4cd061914a60bd51c68bcb4357086991bd0bb93d8aa66a6da7701370708f"}, + 
{file = "rpds_py-0.20.0-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:def7400461c3a3f26e49078302e1c1b38f6752342c77e3cf72ce91ca69fb1bc1"}, + {file = "rpds_py-0.20.0-pp39-pypy39_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:65794e4048ee837494aea3c21a28ad5fc080994dfba5b036cf84de37f7ad5074"}, + {file = "rpds_py-0.20.0-pp39-pypy39_pp73-musllinux_1_2_i686.whl", hash = "sha256:faefcc78f53a88f3076b7f8be0a8f8d35133a3ecf7f3770895c25f8813460f08"}, + {file = "rpds_py-0.20.0-pp39-pypy39_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:5b4f105deeffa28bbcdff6c49b34e74903139afa690e35d2d9e3c2c2fba18cec"}, + {file = "rpds_py-0.20.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:fdfc3a892927458d98f3d55428ae46b921d1f7543b89382fdb483f5640daaec8"}, + {file = "rpds_py-0.20.0.tar.gz", hash = "sha256:d72a210824facfdaf8768cf2d7ca25a042c30320b3020de2fa04640920d4e121"}, +] + +[[package]] +name = "safetensors" +version = "0.4.5" +description = "" +optional = false +python-versions = ">=3.7" +files = [ + {file = "safetensors-0.4.5-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:a63eaccd22243c67e4f2b1c3e258b257effc4acd78f3b9d397edc8cf8f1298a7"}, + {file = "safetensors-0.4.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:23fc9b4ec7b602915cbb4ec1a7c1ad96d2743c322f20ab709e2c35d1b66dad27"}, + {file = "safetensors-0.4.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6885016f34bef80ea1085b7e99b3c1f92cb1be78a49839203060f67b40aee761"}, + {file = "safetensors-0.4.5-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:133620f443450429322f238fda74d512c4008621227fccf2f8cf4a76206fea7c"}, + {file = "safetensors-0.4.5-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4fb3e0609ec12d2a77e882f07cced530b8262027f64b75d399f1504ffec0ba56"}, + {file = "safetensors-0.4.5-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d0f1dd769f064adc33831f5e97ad07babbd728427f98e3e1db6902e369122737"}, + {file = "safetensors-0.4.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c6d156bdb26732feada84f9388a9f135528c1ef5b05fae153da365ad4319c4c5"}, + {file = "safetensors-0.4.5-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:9e347d77e2c77eb7624400ccd09bed69d35c0332f417ce8c048d404a096c593b"}, + {file = "safetensors-0.4.5-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:9f556eea3aec1d3d955403159fe2123ddd68e880f83954ee9b4a3f2e15e716b6"}, + {file = "safetensors-0.4.5-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:9483f42be3b6bc8ff77dd67302de8ae411c4db39f7224dec66b0eb95822e4163"}, + {file = "safetensors-0.4.5-cp310-none-win32.whl", hash = "sha256:7389129c03fadd1ccc37fd1ebbc773f2b031483b04700923c3511d2a939252cc"}, + {file = "safetensors-0.4.5-cp310-none-win_amd64.whl", hash = "sha256:e98ef5524f8b6620c8cdef97220c0b6a5c1cef69852fcd2f174bb96c2bb316b1"}, + {file = "safetensors-0.4.5-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:21f848d7aebd5954f92538552d6d75f7c1b4500f51664078b5b49720d180e47c"}, + {file = "safetensors-0.4.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:bb07000b19d41e35eecef9a454f31a8b4718a185293f0d0b1c4b61d6e4487971"}, + {file = "safetensors-0.4.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:09dedf7c2fda934ee68143202acff6e9e8eb0ddeeb4cfc24182bef999efa9f42"}, + {file = "safetensors-0.4.5-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = 
"sha256:59b77e4b7a708988d84f26de3ebead61ef1659c73dcbc9946c18f3b1786d2688"}, + {file = "safetensors-0.4.5-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5d3bc83e14d67adc2e9387e511097f254bd1b43c3020440e708858c684cbac68"}, + {file = "safetensors-0.4.5-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:39371fc551c1072976073ab258c3119395294cf49cdc1f8476794627de3130df"}, + {file = "safetensors-0.4.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a6c19feda32b931cae0acd42748a670bdf56bee6476a046af20181ad3fee4090"}, + {file = "safetensors-0.4.5-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a659467495de201e2f282063808a41170448c78bada1e62707b07a27b05e6943"}, + {file = "safetensors-0.4.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:bad5e4b2476949bcd638a89f71b6916fa9a5cae5c1ae7eede337aca2100435c0"}, + {file = "safetensors-0.4.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:a3a315a6d0054bc6889a17f5668a73f94f7fe55121ff59e0a199e3519c08565f"}, + {file = "safetensors-0.4.5-cp311-none-win32.whl", hash = "sha256:a01e232e6d3d5cf8b1667bc3b657a77bdab73f0743c26c1d3c5dd7ce86bd3a92"}, + {file = "safetensors-0.4.5-cp311-none-win_amd64.whl", hash = "sha256:cbd39cae1ad3e3ef6f63a6f07296b080c951f24cec60188378e43d3713000c04"}, + {file = "safetensors-0.4.5-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:473300314e026bd1043cef391bb16a8689453363381561b8a3e443870937cc1e"}, + {file = "safetensors-0.4.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:801183a0f76dc647f51a2d9141ad341f9665602a7899a693207a82fb102cc53e"}, + {file = "safetensors-0.4.5-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1524b54246e422ad6fb6aea1ac71edeeb77666efa67230e1faf6999df9b2e27f"}, + {file = "safetensors-0.4.5-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:b3139098e3e8b2ad7afbca96d30ad29157b50c90861084e69fcb80dec7430461"}, + {file = "safetensors-0.4.5-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:65573dc35be9059770808e276b017256fa30058802c29e1038eb1c00028502ea"}, + {file = "safetensors-0.4.5-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fd33da8e9407559f8779c82a0448e2133737f922d71f884da27184549416bfed"}, + {file = "safetensors-0.4.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3685ce7ed036f916316b567152482b7e959dc754fcc4a8342333d222e05f407c"}, + {file = "safetensors-0.4.5-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:dde2bf390d25f67908278d6f5d59e46211ef98e44108727084d4637ee70ab4f1"}, + {file = "safetensors-0.4.5-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:7469d70d3de970b1698d47c11ebbf296a308702cbaae7fcb993944751cf985f4"}, + {file = "safetensors-0.4.5-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:3a6ba28118636a130ccbb968bc33d4684c48678695dba2590169d5ab03a45646"}, + {file = "safetensors-0.4.5-cp312-none-win32.whl", hash = "sha256:c859c7ed90b0047f58ee27751c8e56951452ed36a67afee1b0a87847d065eec6"}, + {file = "safetensors-0.4.5-cp312-none-win_amd64.whl", hash = "sha256:b5a8810ad6a6f933fff6c276eae92c1da217b39b4d8b1bc1c0b8af2d270dc532"}, + {file = "safetensors-0.4.5-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:25e5f8e2e92a74f05b4ca55686234c32aac19927903792b30ee6d7bd5653d54e"}, + {file = "safetensors-0.4.5-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:81efb124b58af39fcd684254c645e35692fea81c51627259cdf6d67ff4458916"}, + {file = 
"safetensors-0.4.5-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:585f1703a518b437f5103aa9cf70e9bd437cb78eea9c51024329e4fb8a3e3679"}, + {file = "safetensors-0.4.5-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:4b99fbf72e3faf0b2f5f16e5e3458b93b7d0a83984fe8d5364c60aa169f2da89"}, + {file = "safetensors-0.4.5-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b17b299ca9966ca983ecda1c0791a3f07f9ca6ab5ded8ef3d283fff45f6bcd5f"}, + {file = "safetensors-0.4.5-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:76ded72f69209c9780fdb23ea89e56d35c54ae6abcdec67ccb22af8e696e449a"}, + {file = "safetensors-0.4.5-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2783956926303dcfeb1de91a4d1204cd4089ab441e622e7caee0642281109db3"}, + {file = "safetensors-0.4.5-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d94581aab8c6b204def4d7320f07534d6ee34cd4855688004a4354e63b639a35"}, + {file = "safetensors-0.4.5-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:67e1e7cb8678bb1b37ac48ec0df04faf689e2f4e9e81e566b5c63d9f23748523"}, + {file = "safetensors-0.4.5-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:dbd280b07e6054ea68b0cb4b16ad9703e7d63cd6890f577cb98acc5354780142"}, + {file = "safetensors-0.4.5-cp37-cp37m-macosx_10_12_x86_64.whl", hash = "sha256:77d9b228da8374c7262046a36c1f656ba32a93df6cc51cd4453af932011e77f1"}, + {file = "safetensors-0.4.5-cp37-cp37m-macosx_11_0_arm64.whl", hash = "sha256:500cac01d50b301ab7bb192353317035011c5ceeef0fca652f9f43c000bb7f8d"}, + {file = "safetensors-0.4.5-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:75331c0c746f03158ded32465b7d0b0e24c5a22121743662a2393439c43a45cf"}, + {file = "safetensors-0.4.5-cp37-cp37m-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:670e95fe34e0d591d0529e5e59fd9d3d72bc77b1444fcaa14dccda4f36b5a38b"}, + {file = "safetensors-0.4.5-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:098923e2574ff237c517d6e840acada8e5b311cb1fa226019105ed82e9c3b62f"}, + {file = "safetensors-0.4.5-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:13ca0902d2648775089fa6a0c8fc9e6390c5f8ee576517d33f9261656f851e3f"}, + {file = "safetensors-0.4.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5f0032bedc869c56f8d26259fe39cd21c5199cd57f2228d817a0e23e8370af25"}, + {file = "safetensors-0.4.5-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f4b15f51b4f8f2a512341d9ce3475cacc19c5fdfc5db1f0e19449e75f95c7dc8"}, + {file = "safetensors-0.4.5-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:f6594d130d0ad933d885c6a7b75c5183cb0e8450f799b80a39eae2b8508955eb"}, + {file = "safetensors-0.4.5-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:60c828a27e852ded2c85fc0f87bf1ec20e464c5cd4d56ff0e0711855cc2e17f8"}, + {file = "safetensors-0.4.5-cp37-none-win32.whl", hash = "sha256:6d3de65718b86c3eeaa8b73a9c3d123f9307a96bbd7be9698e21e76a56443af5"}, + {file = "safetensors-0.4.5-cp37-none-win_amd64.whl", hash = "sha256:5a2d68a523a4cefd791156a4174189a4114cf0bf9c50ceb89f261600f3b2b81a"}, + {file = "safetensors-0.4.5-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:e7a97058f96340850da0601a3309f3d29d6191b0702b2da201e54c6e3e44ccf0"}, + {file = "safetensors-0.4.5-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:63bfd425e25f5c733f572e2246e08a1c38bd6f2e027d3f7c87e2e43f228d1345"}, + {file = 
"safetensors-0.4.5-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f3664ac565d0e809b0b929dae7ccd74e4d3273cd0c6d1220c6430035befb678e"}, + {file = "safetensors-0.4.5-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:313514b0b9b73ff4ddfb4edd71860696dbe3c1c9dc4d5cc13dbd74da283d2cbf"}, + {file = "safetensors-0.4.5-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:31fa33ee326f750a2f2134a6174773c281d9a266ccd000bd4686d8021f1f3dac"}, + {file = "safetensors-0.4.5-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:09566792588d77b68abe53754c9f1308fadd35c9f87be939e22c623eaacbed6b"}, + {file = "safetensors-0.4.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:309aaec9b66cbf07ad3a2e5cb8a03205663324fea024ba391594423d0f00d9fe"}, + {file = "safetensors-0.4.5-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:53946c5813b8f9e26103c5efff4a931cc45d874f45229edd68557ffb35ffb9f8"}, + {file = "safetensors-0.4.5-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:868f9df9e99ad1e7f38c52194063a982bc88fedc7d05096f4f8160403aaf4bd6"}, + {file = "safetensors-0.4.5-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:9cc9449bd0b0bc538bd5e268221f0c5590bc5c14c1934a6ae359d44410dc68c4"}, + {file = "safetensors-0.4.5-cp38-none-win32.whl", hash = "sha256:83c4f13a9e687335c3928f615cd63a37e3f8ef072a3f2a0599fa09f863fb06a2"}, + {file = "safetensors-0.4.5-cp38-none-win_amd64.whl", hash = "sha256:b98d40a2ffa560653f6274e15b27b3544e8e3713a44627ce268f419f35c49478"}, + {file = "safetensors-0.4.5-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:cf727bb1281d66699bef5683b04d98c894a2803442c490a8d45cd365abfbdeb2"}, + {file = "safetensors-0.4.5-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:96f1d038c827cdc552d97e71f522e1049fef0542be575421f7684756a748e457"}, + {file = "safetensors-0.4.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:139fbee92570ecea774e6344fee908907db79646d00b12c535f66bc78bd5ea2c"}, + {file = "safetensors-0.4.5-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c36302c1c69eebb383775a89645a32b9d266878fab619819ce660309d6176c9b"}, + {file = "safetensors-0.4.5-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d641f5b8149ea98deb5ffcf604d764aad1de38a8285f86771ce1abf8e74c4891"}, + {file = "safetensors-0.4.5-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b4db6a61d968de73722b858038c616a1bebd4a86abe2688e46ca0cc2d17558f2"}, + {file = "safetensors-0.4.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b75a616e02f21b6f1d5785b20cecbab5e2bd3f6358a90e8925b813d557666ec1"}, + {file = "safetensors-0.4.5-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:788ee7d04cc0e0e7f944c52ff05f52a4415b312f5efd2ee66389fb7685ee030c"}, + {file = "safetensors-0.4.5-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:87bc42bd04fd9ca31396d3ca0433db0be1411b6b53ac5a32b7845a85d01ffc2e"}, + {file = "safetensors-0.4.5-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:4037676c86365a721a8c9510323a51861d703b399b78a6b4486a54a65a975fca"}, + {file = "safetensors-0.4.5-cp39-none-win32.whl", hash = "sha256:1500418454529d0ed5c1564bda376c4ddff43f30fce9517d9bee7bcce5a8ef50"}, + {file = "safetensors-0.4.5-cp39-none-win_amd64.whl", hash = "sha256:9d1a94b9d793ed8fe35ab6d5cea28d540a46559bafc6aae98f30ee0867000cab"}, + {file = "safetensors-0.4.5-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = 
"sha256:fdadf66b5a22ceb645d5435a0be7a0292ce59648ca1d46b352f13cff3ea80410"}, + {file = "safetensors-0.4.5-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:d42ffd4c2259f31832cb17ff866c111684c87bd930892a1ba53fed28370c918c"}, + {file = "safetensors-0.4.5-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dd8a1f6d2063a92cd04145c7fd9e31a1c7d85fbec20113a14b487563fdbc0597"}, + {file = "safetensors-0.4.5-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:951d2fcf1817f4fb0ef0b48f6696688a4e852a95922a042b3f96aaa67eedc920"}, + {file = "safetensors-0.4.5-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6ac85d9a8c1af0e3132371d9f2d134695a06a96993c2e2f0bbe25debb9e3f67a"}, + {file = "safetensors-0.4.5-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:e3cec4a29eb7fe8da0b1c7988bc3828183080439dd559f720414450de076fcab"}, + {file = "safetensors-0.4.5-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:21742b391b859e67b26c0b2ac37f52c9c0944a879a25ad2f9f9f3cd61e7fda8f"}, + {file = "safetensors-0.4.5-pp37-pypy37_pp73-macosx_10_12_x86_64.whl", hash = "sha256:c7db3006a4915151ce1913652e907cdede299b974641a83fbc092102ac41b644"}, + {file = "safetensors-0.4.5-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f68bf99ea970960a237f416ea394e266e0361895753df06e3e06e6ea7907d98b"}, + {file = "safetensors-0.4.5-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8158938cf3324172df024da511839d373c40fbfaa83e9abf467174b2910d7b4c"}, + {file = "safetensors-0.4.5-pp37-pypy37_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:540ce6c4bf6b58cb0fd93fa5f143bc0ee341c93bb4f9287ccd92cf898cc1b0dd"}, + {file = "safetensors-0.4.5-pp37-pypy37_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:bfeaa1a699c6b9ed514bd15e6a91e74738b71125a9292159e3d6b7f0a53d2cde"}, + {file = "safetensors-0.4.5-pp37-pypy37_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:01c8f00da537af711979e1b42a69a8ec9e1d7112f208e0e9b8a35d2c381085ef"}, + {file = "safetensors-0.4.5-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:a0dd565f83b30f2ca79b5d35748d0d99dd4b3454f80e03dfb41f0038e3bdf180"}, + {file = "safetensors-0.4.5-pp38-pypy38_pp73-macosx_11_0_arm64.whl", hash = "sha256:023b6e5facda76989f4cba95a861b7e656b87e225f61811065d5c501f78cdb3f"}, + {file = "safetensors-0.4.5-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9633b663393d5796f0b60249549371e392b75a0b955c07e9c6f8708a87fc841f"}, + {file = "safetensors-0.4.5-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:78dd8adfb48716233c45f676d6e48534d34b4bceb50162c13d1f0bdf6f78590a"}, + {file = "safetensors-0.4.5-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:8e8deb16c4321d61ae72533b8451ec4a9af8656d1c61ff81aa49f966406e4b68"}, + {file = "safetensors-0.4.5-pp38-pypy38_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:52452fa5999dc50c4decaf0c53aa28371f7f1e0fe5c2dd9129059fbe1e1599c7"}, + {file = "safetensors-0.4.5-pp38-pypy38_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:d5f23198821e227cfc52d50fa989813513db381255c6d100927b012f0cfec63d"}, + {file = "safetensors-0.4.5-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:f4beb84b6073b1247a773141a6331117e35d07134b3bb0383003f39971d414bb"}, + {file = "safetensors-0.4.5-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:68814d599d25ed2fdd045ed54d370d1d03cf35e02dce56de44c651f828fb9b7b"}, + {file = 
"safetensors-0.4.5-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f0b6453c54c57c1781292c46593f8a37254b8b99004c68d6c3ce229688931a22"}, + {file = "safetensors-0.4.5-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:adaa9c6dead67e2dd90d634f89131e43162012479d86e25618e821a03d1eb1dc"}, + {file = "safetensors-0.4.5-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:73e7d408e9012cd17511b382b43547850969c7979efc2bc353f317abaf23c84c"}, + {file = "safetensors-0.4.5-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:775409ce0fcc58b10773fdb4221ed1eb007de10fe7adbdf8f5e8a56096b6f0bc"}, + {file = "safetensors-0.4.5-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:834001bed193e4440c4a3950a31059523ee5090605c907c66808664c932b549c"}, + {file = "safetensors-0.4.5.tar.gz", hash = "sha256:d73de19682deabb02524b3d5d1f8b3aaba94c72f1bbfc7911b9b9d5d391c0310"}, +] + +[package.extras] +all = ["safetensors[jax]", "safetensors[numpy]", "safetensors[paddlepaddle]", "safetensors[pinned-tf]", "safetensors[quality]", "safetensors[testing]", "safetensors[torch]"] +dev = ["safetensors[all]"] +jax = ["flax (>=0.6.3)", "jax (>=0.3.25)", "jaxlib (>=0.3.25)", "safetensors[numpy]"] +mlx = ["mlx (>=0.0.9)"] +numpy = ["numpy (>=1.21.6)"] +paddlepaddle = ["paddlepaddle (>=2.4.1)", "safetensors[numpy]"] +pinned-tf = ["safetensors[numpy]", "tensorflow (==2.11.0)"] +quality = ["black (==22.3)", "click (==8.0.4)", "flake8 (>=3.8.3)", "isort (>=5.5.4)"] +tensorflow = ["safetensors[numpy]", "tensorflow (>=2.11.0)"] +testing = ["h5py (>=3.7.0)", "huggingface-hub (>=0.12.1)", "hypothesis (>=6.70.2)", "pytest (>=7.2.0)", "pytest-benchmark (>=4.0.0)", "safetensors[numpy]", "setuptools-rust (>=1.5.2)"] +torch = ["safetensors[numpy]", "torch (>=1.10)"] + +[[package]] +name = "scikit-learn" +version = "1.5.2" +description = "A set of python modules for machine learning and data mining" +optional = false +python-versions = ">=3.9" +files = [ + {file = "scikit_learn-1.5.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:299406827fb9a4f862626d0fe6c122f5f87f8910b86fe5daa4c32dcd742139b6"}, + {file = "scikit_learn-1.5.2-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:2d4cad1119c77930b235579ad0dc25e65c917e756fe80cab96aa3b9428bd3fb0"}, + {file = "scikit_learn-1.5.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8c412ccc2ad9bf3755915e3908e677b367ebc8d010acbb3f182814524f2e5540"}, + {file = "scikit_learn-1.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a686885a4b3818d9e62904d91b57fa757fc2bed3e465c8b177be652f4dd37c8"}, + {file = "scikit_learn-1.5.2-cp310-cp310-win_amd64.whl", hash = "sha256:c15b1ca23d7c5f33cc2cb0a0d6aaacf893792271cddff0edbd6a40e8319bc113"}, + {file = "scikit_learn-1.5.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:03b6158efa3faaf1feea3faa884c840ebd61b6484167c711548fce208ea09445"}, + {file = "scikit_learn-1.5.2-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:1ff45e26928d3b4eb767a8f14a9a6efbf1cbff7c05d1fb0f95f211a89fd4f5de"}, + {file = "scikit_learn-1.5.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f763897fe92d0e903aa4847b0aec0e68cadfff77e8a0687cabd946c89d17e675"}, + {file = "scikit_learn-1.5.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f8b0ccd4a902836493e026c03256e8b206656f91fbcc4fde28c57a5b752561f1"}, + {file = "scikit_learn-1.5.2-cp311-cp311-win_amd64.whl", hash = 
"sha256:6c16d84a0d45e4894832b3c4d0bf73050939e21b99b01b6fd59cbb0cf39163b6"}, + {file = "scikit_learn-1.5.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:f932a02c3f4956dfb981391ab24bda1dbd90fe3d628e4b42caef3e041c67707a"}, + {file = "scikit_learn-1.5.2-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:3b923d119d65b7bd555c73be5423bf06c0105678ce7e1f558cb4b40b0a5502b1"}, + {file = "scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f60021ec1574e56632be2a36b946f8143bf4e5e6af4a06d85281adc22938e0dd"}, + {file = "scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:394397841449853c2290a32050382edaec3da89e35b3e03d6cc966aebc6a8ae6"}, + {file = "scikit_learn-1.5.2-cp312-cp312-win_amd64.whl", hash = "sha256:57cc1786cfd6bd118220a92ede80270132aa353647684efa385a74244a41e3b1"}, + {file = "scikit_learn-1.5.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e9a702e2de732bbb20d3bad29ebd77fc05a6b427dc49964300340e4c9328b3f5"}, + {file = "scikit_learn-1.5.2-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:b0768ad641981f5d3a198430a1d31c3e044ed2e8a6f22166b4d546a5116d7908"}, + {file = "scikit_learn-1.5.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:178ddd0a5cb0044464fc1bfc4cca5b1833bfc7bb022d70b05db8530da4bb3dd3"}, + {file = "scikit_learn-1.5.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f7284ade780084d94505632241bf78c44ab3b6f1e8ccab3d2af58e0e950f9c12"}, + {file = "scikit_learn-1.5.2-cp313-cp313-win_amd64.whl", hash = "sha256:b7b0f9a0b1040830d38c39b91b3a44e1b643f4b36e36567b80b7c6bd2202a27f"}, + {file = "scikit_learn-1.5.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:757c7d514ddb00ae249832fe87100d9c73c6ea91423802872d9e74970a0e40b9"}, + {file = "scikit_learn-1.5.2-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:52788f48b5d8bca5c0736c175fa6bdaab2ef00a8f536cda698db61bd89c551c1"}, + {file = "scikit_learn-1.5.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:643964678f4b5fbdc95cbf8aec638acc7aa70f5f79ee2cdad1eec3df4ba6ead8"}, + {file = "scikit_learn-1.5.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ca64b3089a6d9b9363cd3546f8978229dcbb737aceb2c12144ee3f70f95684b7"}, + {file = "scikit_learn-1.5.2-cp39-cp39-win_amd64.whl", hash = "sha256:3bed4909ba187aca80580fe2ef370d9180dcf18e621a27c4cf2ef10d279a7efe"}, + {file = "scikit_learn-1.5.2.tar.gz", hash = "sha256:b4237ed7b3fdd0a4882792e68ef2545d5baa50aca3bb45aa7df468138ad8f94d"}, +] + +[package.dependencies] +joblib = ">=1.2.0" +numpy = ">=1.19.5" +scipy = ">=1.6.0" +threadpoolctl = ">=3.1.0" + +[package.extras] +benchmark = ["matplotlib (>=3.3.4)", "memory_profiler (>=0.57.0)", "pandas (>=1.1.5)"] +build = ["cython (>=3.0.10)", "meson-python (>=0.16.0)", "numpy (>=1.19.5)", "scipy (>=1.6.0)"] +docs = ["Pillow (>=7.1.2)", "matplotlib (>=3.3.4)", "memory_profiler (>=0.57.0)", "numpydoc (>=1.2.0)", "pandas (>=1.1.5)", "plotly (>=5.14.0)", "polars (>=0.20.30)", "pooch (>=1.6.0)", "pydata-sphinx-theme (>=0.15.3)", "scikit-image (>=0.17.2)", "seaborn (>=0.9.0)", "sphinx (>=7.3.7)", "sphinx-copybutton (>=0.5.2)", "sphinx-design (>=0.5.0)", "sphinx-design (>=0.6.0)", "sphinx-gallery (>=0.16.0)", "sphinx-prompt (>=1.4.0)", "sphinx-remove-toctrees (>=1.0.0.post1)", "sphinxcontrib-sass (>=0.3.4)", "sphinxext-opengraph (>=0.9.1)"] +examples = ["matplotlib (>=3.3.4)", "pandas (>=1.1.5)", "plotly (>=5.14.0)", "pooch (>=1.6.0)", "scikit-image (>=0.17.2)", "seaborn 
(>=0.9.0)"] +install = ["joblib (>=1.2.0)", "numpy (>=1.19.5)", "scipy (>=1.6.0)", "threadpoolctl (>=3.1.0)"] +maintenance = ["conda-lock (==2.5.6)"] +tests = ["black (>=24.3.0)", "matplotlib (>=3.3.4)", "mypy (>=1.9)", "numpydoc (>=1.2.0)", "pandas (>=1.1.5)", "polars (>=0.20.30)", "pooch (>=1.6.0)", "pyamg (>=4.0.0)", "pyarrow (>=12.0.0)", "pytest (>=7.1.2)", "pytest-cov (>=2.9.0)", "ruff (>=0.2.1)", "scikit-image (>=0.17.2)"] + +[[package]] +name = "scipy" +version = "1.13.1" +description = "Fundamental algorithms for scientific computing in Python" +optional = false +python-versions = ">=3.9" +files = [ + {file = "scipy-1.13.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:20335853b85e9a49ff7572ab453794298bcf0354d8068c5f6775a0eabf350aca"}, + {file = "scipy-1.13.1-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:d605e9c23906d1994f55ace80e0125c587f96c020037ea6aa98d01b4bd2e222f"}, + {file = "scipy-1.13.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cfa31f1def5c819b19ecc3a8b52d28ffdcc7ed52bb20c9a7589669dd3c250989"}, + {file = "scipy-1.13.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f26264b282b9da0952a024ae34710c2aff7d27480ee91a2e82b7b7073c24722f"}, + {file = "scipy-1.13.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:eccfa1906eacc02de42d70ef4aecea45415f5be17e72b61bafcfd329bdc52e94"}, + {file = "scipy-1.13.1-cp310-cp310-win_amd64.whl", hash = "sha256:2831f0dc9c5ea9edd6e51e6e769b655f08ec6db6e2e10f86ef39bd32eb11da54"}, + {file = "scipy-1.13.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:27e52b09c0d3a1d5b63e1105f24177e544a222b43611aaf5bc44d4a0979e32f9"}, + {file = "scipy-1.13.1-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:54f430b00f0133e2224c3ba42b805bfd0086fe488835effa33fa291561932326"}, + {file = "scipy-1.13.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e89369d27f9e7b0884ae559a3a956e77c02114cc60a6058b4e5011572eea9299"}, + {file = "scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a78b4b3345f1b6f68a763c6e25c0c9a23a9fd0f39f5f3d200efe8feda560a5fa"}, + {file = "scipy-1.13.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:45484bee6d65633752c490404513b9ef02475b4284c4cfab0ef946def50b3f59"}, + {file = "scipy-1.13.1-cp311-cp311-win_amd64.whl", hash = "sha256:5713f62f781eebd8d597eb3f88b8bf9274e79eeabf63afb4a737abc6c84ad37b"}, + {file = "scipy-1.13.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:5d72782f39716b2b3509cd7c33cdc08c96f2f4d2b06d51e52fb45a19ca0c86a1"}, + {file = "scipy-1.13.1-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:017367484ce5498445aade74b1d5ab377acdc65e27095155e448c88497755a5d"}, + {file = "scipy-1.13.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:949ae67db5fa78a86e8fa644b9a6b07252f449dcf74247108c50e1d20d2b4627"}, + {file = "scipy-1.13.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:de3ade0e53bc1f21358aa74ff4830235d716211d7d077e340c7349bc3542e884"}, + {file = "scipy-1.13.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:2ac65fb503dad64218c228e2dc2d0a0193f7904747db43014645ae139c8fad16"}, + {file = "scipy-1.13.1-cp312-cp312-win_amd64.whl", hash = "sha256:cdd7dacfb95fea358916410ec61bbc20440f7860333aee6d882bb8046264e949"}, + {file = "scipy-1.13.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:436bbb42a94a8aeef855d755ce5a465479c721e9d684de76bf61a62e7c2b81d5"}, + {file = "scipy-1.13.1-cp39-cp39-macosx_12_0_arm64.whl", hash = 
"sha256:8335549ebbca860c52bf3d02f80784e91a004b71b059e3eea9678ba994796a24"}, + {file = "scipy-1.13.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d533654b7d221a6a97304ab63c41c96473ff04459e404b83275b60aa8f4b7004"}, + {file = "scipy-1.13.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:637e98dcf185ba7f8e663e122ebf908c4702420477ae52a04f9908707456ba4d"}, + {file = "scipy-1.13.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:a014c2b3697bde71724244f63de2476925596c24285c7a637364761f8710891c"}, + {file = "scipy-1.13.1-cp39-cp39-win_amd64.whl", hash = "sha256:392e4ec766654852c25ebad4f64e4e584cf19820b980bc04960bca0b0cd6eaa2"}, + {file = "scipy-1.13.1.tar.gz", hash = "sha256:095a87a0312b08dfd6a6155cbbd310a8c51800fc931b8c0b84003014b874ed3c"}, +] + +[package.dependencies] +numpy = ">=1.22.4,<2.3" + +[package.extras] +dev = ["cython-lint (>=0.12.2)", "doit (>=0.36.0)", "mypy", "pycodestyle", "pydevtool", "rich-click", "ruff", "types-psutil", "typing_extensions"] +doc = ["jupyterlite-pyodide-kernel", "jupyterlite-sphinx (>=0.12.0)", "jupytext", "matplotlib (>=3.5)", "myst-nb", "numpydoc", "pooch", "pydata-sphinx-theme (>=0.15.2)", "sphinx (>=5.0.0)", "sphinx-design (>=0.4.0)"] +test = ["array-api-strict", "asv", "gmpy2", "hypothesis (>=6.30)", "mpmath", "pooch", "pytest", "pytest-cov", "pytest-timeout", "pytest-xdist", "scikit-umfpack", "threadpoolctl"] + +[[package]] +name = "sentencepiece" +version = "0.2.0" +description = "SentencePiece python wrapper" +optional = false +python-versions = "*" +files = [ + {file = "sentencepiece-0.2.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:188779e1298a1c8b8253c7d3ad729cb0a9891e5cef5e5d07ce4592c54869e227"}, + {file = "sentencepiece-0.2.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:bed9cf85b296fa2b76fc2547b9cbb691a523864cebaee86304c43a7b4cb1b452"}, + {file = "sentencepiece-0.2.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d7b67e724bead13f18db6e1d10b6bbdc454af574d70efbb36f27d90387be1ca3"}, + {file = "sentencepiece-0.2.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2fde4b08cfe237be4484c6c7c2e2c75fb862cfeab6bd5449ce4caeafd97b767a"}, + {file = "sentencepiece-0.2.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4c378492056202d1c48a4979650981635fd97875a00eabb1f00c6a236b013b5e"}, + {file = "sentencepiece-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1380ce6540a368de2ef6d7e6ba14ba8f3258df650d39ba7d833b79ee68a52040"}, + {file = "sentencepiece-0.2.0-cp310-cp310-win32.whl", hash = "sha256:a1151d6a6dd4b43e552394aed0edfe9292820272f0194bd56c7c1660a0c06c3d"}, + {file = "sentencepiece-0.2.0-cp310-cp310-win_amd64.whl", hash = "sha256:d490142b0521ef22bc1085f061d922a2a6666175bb6b42e588ff95c0db6819b2"}, + {file = "sentencepiece-0.2.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:17982700c4f6dbb55fa3594f3d7e5dd1c8659a274af3738e33c987d2a27c9d5c"}, + {file = "sentencepiece-0.2.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:7c867012c0e8bcd5bdad0f791609101cb5c66acb303ab3270218d6debc68a65e"}, + {file = "sentencepiece-0.2.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7fd6071249c74f779c5b27183295b9202f8dedb68034e716784364443879eaa6"}, + {file = "sentencepiece-0.2.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:27f90c55a65013cbb8f4d7aab0599bf925cde4adc67ae43a0d323677b5a1c6cb"}, + {file = 
"sentencepiece-0.2.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b293734059ef656dcd65be62ff771507bea8fed0a711b6733976e1ed3add4553"}, + {file = "sentencepiece-0.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e58b47f933aca74c6a60a79dcb21d5b9e47416256c795c2d58d55cec27f9551d"}, + {file = "sentencepiece-0.2.0-cp311-cp311-win32.whl", hash = "sha256:c581258cf346b327c62c4f1cebd32691826306f6a41d8c4bec43b010dee08e75"}, + {file = "sentencepiece-0.2.0-cp311-cp311-win_amd64.whl", hash = "sha256:0993dbc665f4113017892f1b87c3904a44d0640eda510abcacdfb07f74286d36"}, + {file = "sentencepiece-0.2.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:ea5f536e32ea8ec96086ee00d7a4a131ce583a1b18d130711707c10e69601cb2"}, + {file = "sentencepiece-0.2.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:d0cb51f53b6aae3c36bafe41e86167c71af8370a039f542c43b0cce5ef24a68c"}, + {file = "sentencepiece-0.2.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:3212121805afc58d8b00ab4e7dd1f8f76c203ddb9dc94aa4079618a31cf5da0f"}, + {file = "sentencepiece-0.2.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2a3149e3066c2a75e0d68a43eb632d7ae728c7925b517f4c05c40f6f7280ce08"}, + {file = "sentencepiece-0.2.0-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:632f3594d3e7ac8b367bca204cb3fd05a01d5b21455acd097ea4c0e30e2f63d7"}, + {file = "sentencepiece-0.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f295105c6bdbb05bd5e1b0cafbd78ff95036f5d3641e7949455a3f4e5e7c3109"}, + {file = "sentencepiece-0.2.0-cp312-cp312-win32.whl", hash = "sha256:fb89f811e5efd18bab141afc3fea3de141c3f69f3fe9e898f710ae7fe3aab251"}, + {file = "sentencepiece-0.2.0-cp312-cp312-win_amd64.whl", hash = "sha256:7a673a72aab81fef5ebe755c6e0cc60087d1f3a4700835d40537183c1703a45f"}, + {file = "sentencepiece-0.2.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:4547683f330289ec4f093027bfeb87f9ef023b2eb6f879fdc4a8187c7e0ffb90"}, + {file = "sentencepiece-0.2.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7cd6175f7eaec7142d2bf6f6597ce7db4c9ac89acf93fcdb17410c3a8b781eeb"}, + {file = "sentencepiece-0.2.0-cp36-cp36m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:859ba1acde782609a0910a26a60e16c191a82bf39b5621107552c0cd79fad00f"}, + {file = "sentencepiece-0.2.0-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bcbbef6cc277f8f18f36959e305f10b1c620442d75addc79c21d7073ae581b50"}, + {file = "sentencepiece-0.2.0-cp36-cp36m-win32.whl", hash = "sha256:536b934e244829e3fe6c4f198652cd82da48adb9aa145c9f00889542726dee3d"}, + {file = "sentencepiece-0.2.0-cp36-cp36m-win_amd64.whl", hash = "sha256:0a91aaa3c769b52440df56fafda683b3aa48e3f2169cf7ee5b8c8454a7f3ae9b"}, + {file = "sentencepiece-0.2.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:787e480ca4c1d08c9985a7eb1eae4345c107729c99e9b5a9a00f2575fc7d4b4b"}, + {file = "sentencepiece-0.2.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f4d158189eb2ecffea3a51edf6d25e110b3678ec47f1a40f2d541eafbd8f6250"}, + {file = "sentencepiece-0.2.0-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d1e5ca43013e8935f25457a4fca47e315780172c3e821b4b13a890668911c792"}, + {file = "sentencepiece-0.2.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7140d9e5a74a0908493bb4a13f1f16a401297bd755ada4c707e842fbf6f0f5bf"}, + {file = "sentencepiece-0.2.0-cp37-cp37m-win32.whl", hash = 
"sha256:6cf333625234f247ab357b0bd9836638405ea9082e1543d5b8408f014979dcbf"}, + {file = "sentencepiece-0.2.0-cp37-cp37m-win_amd64.whl", hash = "sha256:ff88712338b01031910e8e61e7239aff3ce8869ee31a47df63cb38aadd591bea"}, + {file = "sentencepiece-0.2.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:20813a68d4c221b1849c62c30e1281ea81687894d894b8d4a0f4677d9311e0f5"}, + {file = "sentencepiece-0.2.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:926ef920ae2e8182db31d3f5d081ada57804e3e1d3a8c4ef8b117f9d9fb5a945"}, + {file = "sentencepiece-0.2.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:89f65f69636b7e9c015b79dff9c9985a9bc7d19ded6f79ef9f1ec920fdd73ecf"}, + {file = "sentencepiece-0.2.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0f67eae0dbe6f2d7d6ba50a354623d787c99965f068b81e145d53240198021b0"}, + {file = "sentencepiece-0.2.0-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:98501e075f35dd1a1d5a20f65be26839fcb1938752ec61539af008a5aa6f510b"}, + {file = "sentencepiece-0.2.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e3d1d2cc4882e8d6a1adf9d5927d7716f80617fc693385661caff21888972269"}, + {file = "sentencepiece-0.2.0-cp38-cp38-win32.whl", hash = "sha256:b99a308a2e5e569031ab164b74e6fab0b6f37dfb493c32f7816225f4d411a6dd"}, + {file = "sentencepiece-0.2.0-cp38-cp38-win_amd64.whl", hash = "sha256:cdb701eec783d3ec86b7cd4c763adad8eaf6b46db37ee1c36e5e6c44b3fe1b5f"}, + {file = "sentencepiece-0.2.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:1e0f9c4d0a6b0af59b613175f019916e28ade076e21242fd5be24340d8a2f64a"}, + {file = "sentencepiece-0.2.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:298f21cc1366eb60311aedba3169d30f885c363ddbf44214b0a587d2908141ad"}, + {file = "sentencepiece-0.2.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3f1ec95aa1e5dab11f37ac7eff190493fd87770f7a8b81ebc9dd768d1a3c8704"}, + {file = "sentencepiece-0.2.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7b06b70af54daa4b4904cbb90b4eb6d35c9f3252fdc86c9c32d5afd4d30118d8"}, + {file = "sentencepiece-0.2.0-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:22e37bac44dd6603388cb598c64ff7a76e41ca774646f21c23aadfbf5a2228ab"}, + {file = "sentencepiece-0.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0461324897735512a32d222e3d886e24ad6a499761952b6bda2a9ee6e4313ea5"}, + {file = "sentencepiece-0.2.0-cp39-cp39-win32.whl", hash = "sha256:38aed822fb76435fa1f12185f10465a94ab9e51d5e8a9159e9a540ce926f0ffd"}, + {file = "sentencepiece-0.2.0-cp39-cp39-win_amd64.whl", hash = "sha256:d8cf876516548b5a1d6ac4745d8b554f5c07891d55da557925e5c13ff0b4e6ad"}, + {file = "sentencepiece-0.2.0.tar.gz", hash = "sha256:a52c19171daaf2e697dc6cbe67684e0fa341b1248966f6aebb541de654d15843"}, +] + +[[package]] +name = "setuptools" +version = "75.2.0" +description = "Easily download, build, install, upgrade, and uninstall Python packages" +optional = false +python-versions = ">=3.8" +files = [ + {file = "setuptools-75.2.0-py3-none-any.whl", hash = "sha256:a7fcb66f68b4d9e8e66b42f9876150a3371558f98fa32222ffaa5bced76406f8"}, + {file = "setuptools-75.2.0.tar.gz", hash = "sha256:753bb6ebf1f465a1912e19ed1d41f403a79173a9acf66a42e7e6aec45c3c16ec"}, +] + +[package.extras] +check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1)", "ruff (>=0.5.2)"] +core = ["importlib-metadata (>=6)", "importlib-resources (>=5.10.2)", "jaraco.collections", "jaraco.functools", "jaraco.text (>=3.7)", "more-itertools", 
"more-itertools (>=8.8)", "packaging", "packaging (>=24)", "platformdirs (>=2.6.2)", "tomli (>=2.0.1)", "wheel (>=0.43.0)"] +cover = ["pytest-cov"] +doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "pyproject-hooks (!=1.1)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier", "towncrier (<24.7)"] +enabler = ["pytest-enabler (>=2.2)"] +test = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "jaraco.test", "packaging (>=23.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.*)", "pytest-home (>=0.5)", "pytest-perf", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel (>=0.44.0)"] +type = ["importlib-metadata (>=7.0.2)", "jaraco.develop (>=7.21)", "mypy (==1.11.*)", "pytest-mypy"] + +[[package]] +name = "six" +version = "1.16.0" +description = "Python 2 and 3 compatibility utilities" +optional = true +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" +files = [ + {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, + {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, +] + +[[package]] +name = "sympy" +version = "1.13.3" +description = "Computer algebra system (CAS) in Python" +optional = true +python-versions = ">=3.8" +files = [ + {file = "sympy-1.13.3-py3-none-any.whl", hash = "sha256:54612cf55a62755ee71824ce692986f23c88ffa77207b30c1368eda4a7060f73"}, + {file = "sympy-1.13.3.tar.gz", hash = "sha256:b27fd2c6530e0ab39e275fc9b683895367e51d5da91baa8d3d64db2565fec4d9"}, +] + +[package.dependencies] +mpmath = ">=1.1.0,<1.4" + +[package.extras] +dev = ["hypothesis (>=6.70.0)", "pytest (>=7.1.0)"] + +[[package]] +name = "texttable" +version = "1.7.0" +description = "module to create simple ASCII tables" +optional = true +python-versions = "*" +files = [ + {file = "texttable-1.7.0-py2.py3-none-any.whl", hash = "sha256:72227d592c82b3d7f672731ae73e4d1f88cd8e2ef5b075a7a7f01a23a3743917"}, + {file = "texttable-1.7.0.tar.gz", hash = "sha256:2d2068fb55115807d3ac77a4ca68fa48803e84ebb0ee2340f858107a36522638"}, +] + +[[package]] +name = "tokenizers" +version = "0.20.1" +description = "" +optional = false +python-versions = ">=3.7" +files = [ + {file = "tokenizers-0.20.1-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:439261da7c0a5c88bda97acb284d49fbdaf67e9d3b623c0bfd107512d22787a9"}, + {file = "tokenizers-0.20.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:03dae629d99068b1ea5416d50de0fea13008f04129cc79af77a2a6392792d93c"}, + {file = "tokenizers-0.20.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b61f561f329ffe4b28367798b89d60c4abf3f815d37413b6352bc6412a359867"}, + {file = "tokenizers-0.20.1-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ec870fce1ee5248a10be69f7a8408a234d6f2109f8ea827b4f7ecdbf08c9fd15"}, + {file = "tokenizers-0.20.1-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d388d1ea8b7447da784e32e3b86a75cce55887e3b22b31c19d0b186b1c677800"}, + {file = "tokenizers-0.20.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:299c85c1d21135bc01542237979bf25c32efa0d66595dd0069ae259b97fb2dbe"}, + 
{file = "tokenizers-0.20.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e96f6c14c9752bb82145636b614d5a78e9cde95edfbe0a85dad0dd5ddd6ec95c"}, + {file = "tokenizers-0.20.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fc9e95ad49c932b80abfbfeaf63b155761e695ad9f8a58c52a47d962d76e310f"}, + {file = "tokenizers-0.20.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:f22dee205329a636148c325921c73cf3e412e87d31f4d9c3153b302a0200057b"}, + {file = "tokenizers-0.20.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a2ffd9a8895575ac636d44500c66dffaef133823b6b25067604fa73bbc5ec09d"}, + {file = "tokenizers-0.20.1-cp310-none-win32.whl", hash = "sha256:2847843c53f445e0f19ea842a4e48b89dd0db4e62ba6e1e47a2749d6ec11f50d"}, + {file = "tokenizers-0.20.1-cp310-none-win_amd64.whl", hash = "sha256:f9aa93eacd865f2798b9e62f7ce4533cfff4f5fbd50c02926a78e81c74e432cd"}, + {file = "tokenizers-0.20.1-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:4a717dcb08f2dabbf27ae4b6b20cbbb2ad7ed78ce05a829fae100ff4b3c7ff15"}, + {file = "tokenizers-0.20.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:3f84dad1ff1863c648d80628b1b55353d16303431283e4efbb6ab1af56a75832"}, + {file = "tokenizers-0.20.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:929c8f3afa16a5130a81ab5079c589226273ec618949cce79b46d96e59a84f61"}, + {file = "tokenizers-0.20.1-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d10766473954397e2d370f215ebed1cc46dcf6fd3906a2a116aa1d6219bfedc3"}, + {file = "tokenizers-0.20.1-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9300fac73ddc7e4b0330acbdda4efaabf74929a4a61e119a32a181f534a11b47"}, + {file = "tokenizers-0.20.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0ecaf7b0e39caeb1aa6dd6e0975c405716c82c1312b55ac4f716ef563a906969"}, + {file = "tokenizers-0.20.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5170be9ec942f3d1d317817ced8d749b3e1202670865e4fd465e35d8c259de83"}, + {file = "tokenizers-0.20.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ef3f1ae08fa9aea5891cbd69df29913e11d3841798e0bfb1ff78b78e4e7ea0a4"}, + {file = "tokenizers-0.20.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:ee86d4095d3542d73579e953c2e5e07d9321af2ffea6ecc097d16d538a2dea16"}, + {file = "tokenizers-0.20.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:86dcd08da163912e17b27bbaba5efdc71b4fbffb841530fdb74c5707f3c49216"}, + {file = "tokenizers-0.20.1-cp311-none-win32.whl", hash = "sha256:9af2dc4ee97d037bc6b05fa4429ddc87532c706316c5e11ce2f0596dfcfa77af"}, + {file = "tokenizers-0.20.1-cp311-none-win_amd64.whl", hash = "sha256:899152a78b095559c287b4c6d0099469573bb2055347bb8154db106651296f39"}, + {file = "tokenizers-0.20.1-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:407ab666b38e02228fa785e81f7cf79ef929f104bcccf68a64525a54a93ceac9"}, + {file = "tokenizers-0.20.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2f13a2d16032ebc8bd812eb8099b035ac65887d8f0c207261472803b9633cf3e"}, + {file = "tokenizers-0.20.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e98eee4dca22849fbb56a80acaa899eec5b72055d79637dd6aa15d5e4b8628c9"}, + {file = "tokenizers-0.20.1-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:47c1bcdd61e61136087459cb9e0b069ff23b5568b008265e5cbc927eae3387ce"}, + {file = 
"tokenizers-0.20.1-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:128c1110e950534426e2274837fc06b118ab5f2fa61c3436e60e0aada0ccfd67"}, + {file = "tokenizers-0.20.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e2e2d47a819d2954f2c1cd0ad51bb58ffac6f53a872d5d82d65d79bf76b9896d"}, + {file = "tokenizers-0.20.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:bdd67a0e3503a9a7cf8bc5a4a49cdde5fa5bada09a51e4c7e1c73900297539bd"}, + {file = "tokenizers-0.20.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:689b93d2e26d04da337ac407acec8b5d081d8d135e3e5066a88edd5bdb5aff89"}, + {file = "tokenizers-0.20.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:0c6a796ddcd9a19ad13cf146997cd5895a421fe6aec8fd970d69f9117bddb45c"}, + {file = "tokenizers-0.20.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:3ea919687aa7001a8ff1ba36ac64f165c4e89035f57998fa6cedcfd877be619d"}, + {file = "tokenizers-0.20.1-cp312-none-win32.whl", hash = "sha256:6d3ac5c1f48358ffe20086bf065e843c0d0a9fce0d7f0f45d5f2f9fba3609ca5"}, + {file = "tokenizers-0.20.1-cp312-none-win_amd64.whl", hash = "sha256:b0874481aea54a178f2bccc45aa2d0c99cd3f79143a0948af6a9a21dcc49173b"}, + {file = "tokenizers-0.20.1-cp37-cp37m-macosx_10_12_x86_64.whl", hash = "sha256:96af92e833bd44760fb17f23f402e07a66339c1dcbe17d79a9b55bb0cc4f038e"}, + {file = "tokenizers-0.20.1-cp37-cp37m-macosx_11_0_arm64.whl", hash = "sha256:65f34e5b731a262dfa562820818533c38ce32a45864437f3d9c82f26c139ca7f"}, + {file = "tokenizers-0.20.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:17f98fccb5c12ab1ce1f471731a9cd86df5d4bd2cf2880c5a66b229802d96145"}, + {file = "tokenizers-0.20.1-cp37-cp37m-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:b8c0fc3542cf9370bf92c932eb71bdeb33d2d4aeeb4126d9fd567b60bd04cb30"}, + {file = "tokenizers-0.20.1-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4b39356df4575d37f9b187bb623aab5abb7b62c8cb702867a1768002f814800c"}, + {file = "tokenizers-0.20.1-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bfdad27b0e50544f6b838895a373db6114b85112ba5c0cefadffa78d6daae563"}, + {file = "tokenizers-0.20.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:094663dd0e85ee2e573126918747bdb40044a848fde388efb5b09d57bc74c680"}, + {file = "tokenizers-0.20.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:14e4cf033a2aa207d7ac790e91adca598b679999710a632c4a494aab0fc3a1b2"}, + {file = "tokenizers-0.20.1-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:9310951c92c9fb91660de0c19a923c432f110dbfad1a2d429fbc44fa956bf64f"}, + {file = "tokenizers-0.20.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:05e41e302c315bd2ed86c02e917bf03a6cf7d2f652c9cee1a0eb0d0f1ca0d32c"}, + {file = "tokenizers-0.20.1-cp37-none-win32.whl", hash = "sha256:212231ab7dfcdc879baf4892ca87c726259fa7c887e1688e3f3cead384d8c305"}, + {file = "tokenizers-0.20.1-cp37-none-win_amd64.whl", hash = "sha256:896195eb9dfdc85c8c052e29947169c1fcbe75a254c4b5792cdbd451587bce85"}, + {file = "tokenizers-0.20.1-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:741fb22788482d09d68e73ece1495cfc6d9b29a06c37b3df90564a9cfa688e6d"}, + {file = "tokenizers-0.20.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:10be14ebd8082086a342d969e17fc2d6edc856c59dbdbddd25f158fa40eaf043"}, + {file = "tokenizers-0.20.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:514cf279b22fa1ae0bc08e143458c74ad3b56cd078b319464959685a35c53d5e"}, + {file = "tokenizers-0.20.1-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:a647c5b7cb896d6430cf3e01b4e9a2d77f719c84cefcef825d404830c2071da2"}, + {file = "tokenizers-0.20.1-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7cdf379219e1e1dd432091058dab325a2e6235ebb23e0aec8d0508567c90cd01"}, + {file = "tokenizers-0.20.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1ba72260449e16c4c2f6f3252823b059fbf2d31b32617e582003f2b18b415c39"}, + {file = "tokenizers-0.20.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:910b96ed87316e4277b23c7bcaf667ce849c7cc379a453fa179e7e09290eeb25"}, + {file = "tokenizers-0.20.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e53975a6694428a0586534cc1354b2408d4e010a3103117f617cbb550299797c"}, + {file = "tokenizers-0.20.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:07c4b7be58da142b0730cc4e5fd66bb7bf6f57f4986ddda73833cd39efef8a01"}, + {file = "tokenizers-0.20.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:b605c540753e62199bf15cf69c333e934077ef2350262af2ccada46026f83d1c"}, + {file = "tokenizers-0.20.1-cp38-none-win32.whl", hash = "sha256:88b3bc76ab4db1ab95ead623d49c95205411e26302cf9f74203e762ac7e85685"}, + {file = "tokenizers-0.20.1-cp38-none-win_amd64.whl", hash = "sha256:d412a74cf5b3f68a90c615611a5aa4478bb303d1c65961d22db45001df68afcb"}, + {file = "tokenizers-0.20.1-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:a25dcb2f41a0a6aac31999e6c96a75e9152fa0127af8ece46c2f784f23b8197a"}, + {file = "tokenizers-0.20.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a12c3cebb8c92e9c35a23ab10d3852aee522f385c28d0b4fe48c0b7527d59762"}, + {file = "tokenizers-0.20.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:02e18da58cf115b7c40de973609c35bde95856012ba42a41ee919c77935af251"}, + {file = "tokenizers-0.20.1-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f326a1ac51ae909b9760e34671c26cd0dfe15662f447302a9d5bb2d872bab8ab"}, + {file = "tokenizers-0.20.1-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0b4872647ea6f25224e2833b044b0b19084e39400e8ead3cfe751238b0802140"}, + {file = "tokenizers-0.20.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ce6238a3311bb8e4c15b12600927d35c267b92a52c881ef5717a900ca14793f7"}, + {file = "tokenizers-0.20.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:57b7a8880b208866508b06ce365dc631e7a2472a3faa24daa430d046fb56c885"}, + {file = "tokenizers-0.20.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a908c69c2897a68f412aa05ba38bfa87a02980df70f5a72fa8490479308b1f2d"}, + {file = "tokenizers-0.20.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:da1001aa46f4490099c82e2facc4fbc06a6a32bf7de3918ba798010954b775e0"}, + {file = "tokenizers-0.20.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:42c097390e2f0ed0a5c5d569e6669dd4e9fff7b31c6a5ce6e9c66a61687197de"}, + {file = "tokenizers-0.20.1-cp39-none-win32.whl", hash = "sha256:3d4d218573a3d8b121a1f8c801029d70444ffb6d8f129d4cca1c7b672ee4a24c"}, + {file = "tokenizers-0.20.1-cp39-none-win_amd64.whl", hash = "sha256:37d1e6f616c84fceefa7c6484a01df05caf1e207669121c66213cb5b2911d653"}, + {file = "tokenizers-0.20.1-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:48689da7a395df41114f516208d6550e3e905e1239cc5ad386686d9358e9cef0"}, + {file = 
"tokenizers-0.20.1-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:712f90ea33f9bd2586b4a90d697c26d56d0a22fd3c91104c5858c4b5b6489a79"}, + {file = "tokenizers-0.20.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:359eceb6a620c965988fc559cebc0a98db26713758ec4df43fb76d41486a8ed5"}, + {file = "tokenizers-0.20.1-pp310-pypy310_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0d3caf244ce89d24c87545aafc3448be15870096e796c703a0d68547187192e1"}, + {file = "tokenizers-0.20.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:03b03cf8b9a32254b1bf8a305fb95c6daf1baae0c1f93b27f2b08c9759f41dee"}, + {file = "tokenizers-0.20.1-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:218e5a3561561ea0f0ef1559c6d95b825308dbec23fb55b70b92589e7ff2e1e8"}, + {file = "tokenizers-0.20.1-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:f40df5e0294a95131cc5f0e0eb91fe86d88837abfbee46b9b3610b09860195a7"}, + {file = "tokenizers-0.20.1-pp37-pypy37_pp73-macosx_10_12_x86_64.whl", hash = "sha256:08aaa0d72bb65058e8c4b0455f61b840b156c557e2aca57627056624c3a93976"}, + {file = "tokenizers-0.20.1-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:998700177b45f70afeb206ad22c08d9e5f3a80639dae1032bf41e8cbc4dada4b"}, + {file = "tokenizers-0.20.1-pp37-pypy37_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:62f7fbd3c2c38b179556d879edae442b45f68312019c3a6013e56c3947a4e648"}, + {file = "tokenizers-0.20.1-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:31e87fca4f6bbf5cc67481b562147fe932f73d5602734de7dd18a8f2eee9c6dd"}, + {file = "tokenizers-0.20.1-pp37-pypy37_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:956f21d359ae29dd51ca5726d2c9a44ffafa041c623f5aa33749da87cfa809b9"}, + {file = "tokenizers-0.20.1-pp37-pypy37_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:1fbbaf17a393c78d8aedb6a334097c91cb4119a9ced4764ab8cfdc8d254dc9f9"}, + {file = "tokenizers-0.20.1-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:ebe63e31f9c1a970c53866d814e35ec2ec26fda03097c486f82f3891cee60830"}, + {file = "tokenizers-0.20.1-pp38-pypy38_pp73-macosx_11_0_arm64.whl", hash = "sha256:81970b80b8ac126910295f8aab2d7ef962009ea39e0d86d304769493f69aaa1e"}, + {file = "tokenizers-0.20.1-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:130e35e76f9337ed6c31be386e75d4925ea807055acf18ca1a9b0eec03d8fe23"}, + {file = "tokenizers-0.20.1-pp38-pypy38_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cd28a8614f5c82a54ab2463554e84ad79526c5184cf4573bbac2efbbbcead457"}, + {file = "tokenizers-0.20.1-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9041ee665d0fa7f5c4ccf0f81f5e6b7087f797f85b143c094126fc2611fec9d0"}, + {file = "tokenizers-0.20.1-pp38-pypy38_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:62eb9daea2a2c06bcd8113a5824af8ef8ee7405d3a71123ba4d52c79bb3d9f1a"}, + {file = "tokenizers-0.20.1-pp38-pypy38_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:f861889707b54a9ab1204030b65fd6c22bdd4a95205deec7994dc22a8baa2ea4"}, + {file = "tokenizers-0.20.1-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:89d5c337d74ea6e5e7dc8af124cf177be843bbb9ca6e58c01f75ea103c12c8a9"}, + {file = "tokenizers-0.20.1-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:0b7f515c83397e73292accdbbbedc62264e070bae9682f06061e2ddce67cacaf"}, + {file = 
"tokenizers-0.20.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3e0305fc1ec6b1e5052d30d9c1d5c807081a7bd0cae46a33d03117082e91908c"}, + {file = "tokenizers-0.20.1-pp39-pypy39_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5dc611e6ac0fa00a41de19c3bf6391a05ea201d2d22b757d63f5491ec0e67faa"}, + {file = "tokenizers-0.20.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c5ffe0d7f7bfcfa3b2585776ecf11da2e01c317027c8573c78ebcb8985279e23"}, + {file = "tokenizers-0.20.1-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:e7edb8ec12c100d5458d15b1e47c0eb30ad606a05641f19af7563bc3d1608c14"}, + {file = "tokenizers-0.20.1-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:de291633fb9303555793cc544d4a86e858da529b7d0b752bcaf721ae1d74b2c9"}, + {file = "tokenizers-0.20.1.tar.gz", hash = "sha256:84edcc7cdeeee45ceedb65d518fffb77aec69311c9c8e30f77ad84da3025f002"}, +] + +[package.dependencies] +huggingface-hub = ">=0.16.4,<1.0" + +[package.extras] +dev = ["tokenizers[testing]"] +docs = ["setuptools-rust", "sphinx", "sphinx-rtd-theme"] +testing = ["black (==22.3)", "datasets", "numpy", "pytest", "requests", "ruff"] + +[[package]] +name = "tomli" +version = "2.0.2" +description = "A lil' TOML parser" +optional = false +python-versions = ">=3.8" +files = [ + {file = "tomli-2.0.2-py3-none-any.whl", hash = "sha256:2ebe24485c53d303f690b0ec092806a085f07af5a5aa1464f3931eec36caaa38"}, + {file = "tomli-2.0.2.tar.gz", hash = "sha256:d46d457a85337051c36524bc5349dd91b1877838e2979ac5ced3e710ed8a60ed"}, +] + +[[package]] +name = "tqdm" +version = "4.66.5" +description = "Fast, Extensible Progress Meter" +optional = false +python-versions = ">=3.7" +files = [ + {file = "tqdm-4.66.5-py3-none-any.whl", hash = "sha256:90279a3770753eafc9194a0364852159802111925aa30eb3f9d85b0e805ac7cd"}, + {file = "tqdm-4.66.5.tar.gz", hash = "sha256:e1020aef2e5096702d8a025ac7d16b1577279c9d63f8375b63083e9a5f0fcbad"}, +] + +[package.dependencies] +colorama = {version = "*", markers = "platform_system == \"Windows\""} + +[package.extras] +dev = ["pytest (>=6)", "pytest-cov", "pytest-timeout", "pytest-xdist"] +notebook = ["ipywidgets (>=6)"] +slack = ["slack-sdk"] +telegram = ["requests"] + +[[package]] +name = "transformers" +version = "4.45.2" +description = "State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow" +optional = false +python-versions = ">=3.8.0" +files = [ + {file = "transformers-4.45.2-py3-none-any.whl", hash = "sha256:c551b33660cfc815bae1f9f097ecfd1e65be623f13c6ee0dda372bd881460210"}, + {file = "transformers-4.45.2.tar.gz", hash = "sha256:72bc390f6b203892561f05f86bbfaa0e234aab8e927a83e62b9d92ea7e3ae101"}, +] + +[package.dependencies] +filelock = "*" +huggingface-hub = ">=0.23.2,<1.0" +numpy = ">=1.17" +packaging = ">=20.0" +pyyaml = ">=5.1" +regex = "!=2019.12.17" +requests = "*" +safetensors = ">=0.4.1" +sentencepiece = {version = ">=0.1.91,<0.1.92 || >0.1.92", optional = true, markers = "extra == \"sentencepiece\""} +tokenizers = ">=0.20,<0.21" +tqdm = ">=4.27" + +[package.extras] +accelerate = ["accelerate (>=0.26.0)"] +agents = ["Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "datasets (!=2.5.0)", "diffusers", "opencv-python", "sentencepiece (>=0.1.91,!=0.1.92)", "torch"] +all = ["Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "av (==9.2.0)", "codecarbon (==1.2.0)", "decord (==0.6.0)", "flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", 
"librosa", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "phonemizer", "protobuf", "pyctcdecode (>=0.4.0)", "ray[tune] (>=2.7.0)", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timm (<=0.9.16)", "tokenizers (>=0.20,<0.21)", "torch", "torchaudio", "torchvision"] +audio = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"] +benchmark = ["optimum-benchmark (>=0.3.0)"] +codecarbon = ["codecarbon (==1.2.0)"] +deepspeed = ["accelerate (>=0.26.0)", "deepspeed (>=0.9.3)"] +deepspeed-testing = ["GitPython (<3.1.19)", "accelerate (>=0.26.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "deepspeed (>=0.9.3)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "nltk (<=3.8.1)", "optuna", "parameterized", "protobuf", "psutil", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "timeout-decorator"] +dev = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "av (==9.2.0)", "beautifulsoup4", "codecarbon (==1.2.0)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "decord (==0.6.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "flax (>=0.4.1,<=0.7.0)", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "libcst", "librosa", "nltk (<=3.8.1)", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "timm (<=0.9.16)", "tokenizers (>=0.20,<0.21)", "torch", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"] +dev-tensorflow = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "isort (>=5.5.4)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "libcst", "librosa", "nltk (<=3.8.1)", "onnxconverter-common", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "tokenizers (>=0.20,<0.21)", "urllib3 (<2.0.0)"] +dev-torch = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "beautifulsoup4", "codecarbon (==1.2.0)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "fugashi (>=1.0)", "ipadic 
(>=1.0.0,<2.0)", "isort (>=5.5.4)", "kenlm", "libcst", "librosa", "nltk (<=3.8.1)", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "timeout-decorator", "timm (<=0.9.16)", "tokenizers (>=0.20,<0.21)", "torch", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"] +flax = ["flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "optax (>=0.0.8,<=0.1.4)", "scipy (<1.13.0)"] +flax-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"] +ftfy = ["ftfy"] +integrations = ["optuna", "ray[tune] (>=2.7.0)", "sigopt"] +ja = ["fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "rhoknp (>=1.1.0,<1.3.1)", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)"] +modelcreation = ["cookiecutter (==1.7.3)"] +natten = ["natten (>=0.14.6,<0.15.0)"] +onnx = ["onnxconverter-common", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "tf2onnx"] +onnxruntime = ["onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)"] +optuna = ["optuna"] +quality = ["GitPython (<3.1.19)", "datasets (!=2.5.0)", "isort (>=5.5.4)", "libcst", "rich", "ruff (==0.5.1)", "urllib3 (<2.0.0)"] +ray = ["ray[tune] (>=2.7.0)"] +retrieval = ["datasets (!=2.5.0)", "faiss-cpu"] +ruff = ["ruff (==0.5.1)"] +sagemaker = ["sagemaker (>=2.31.0)"] +sentencepiece = ["protobuf", "sentencepiece (>=0.1.91,!=0.1.92)"] +serving = ["fastapi", "pydantic", "starlette", "uvicorn"] +sigopt = ["sigopt"] +sklearn = ["scikit-learn"] +speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)", "torchaudio"] +testing = ["GitPython (<3.1.19)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "nltk (<=3.8.1)", "parameterized", "psutil", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "timeout-decorator"] +tf = ["keras-nlp (>=0.3.1,<0.14.0)", "onnxconverter-common", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx"] +tf-cpu = ["keras (>2.9,<2.16)", "keras-nlp (>=0.3.1,<0.14.0)", "onnxconverter-common", "tensorflow-cpu (>2.9,<2.16)", "tensorflow-probability (<0.24)", "tensorflow-text (<2.16)", "tf2onnx"] +tf-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"] +tiktoken = ["blobfile", "tiktoken"] +timm = ["timm (<=0.9.16)"] +tokenizers = ["tokenizers (>=0.20,<0.21)"] +torch = ["accelerate (>=0.26.0)", "torch"] +torch-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)", "torchaudio"] +torch-vision = ["Pillow (>=10.0.1,<=15.0)", "torchvision"] +torchhub = ["filelock", "huggingface-hub (>=0.23.2,<1.0)", "importlib-metadata", "numpy (>=1.17)", "packaging (>=20.0)", "protobuf", "regex (!=2019.12.17)", "requests", "sentencepiece (>=0.1.91,!=0.1.92)", "tokenizers (>=0.20,<0.21)", "torch", "tqdm (>=4.27)"] +video = ["av (==9.2.0)", "decord 
(==0.6.0)"] +vision = ["Pillow (>=10.0.1,<=15.0)"] + +[[package]] +name = "triton" +version = "3.0.0" +description = "A language and compiler for custom Deep Learning operations" +optional = true +python-versions = "*" +files = [ + {file = "triton-3.0.0-1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:e1efef76935b2febc365bfadf74bcb65a6f959a9872e5bddf44cc9e0adce1e1a"}, + {file = "triton-3.0.0-1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5ce8520437c602fb633f1324cc3871c47bee3b67acf9756c1a66309b60e3216c"}, + {file = "triton-3.0.0-1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:34e509deb77f1c067d8640725ef00c5cbfcb2052a1a3cb6a6d343841f92624eb"}, + {file = "triton-3.0.0-1-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:bcbf3b1c48af6a28011a5c40a5b3b9b5330530c3827716b5fbf6d7adcc1e53e9"}, + {file = "triton-3.0.0-1-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:6e5727202f7078c56f91ff13ad0c1abab14a0e7f2c87e91b12b6f64f3e8ae609"}, + {file = "triton-3.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:39b052da883351fdf6be3d93cedae6db3b8e3988d3b09ed221bccecfa9612230"}, + {file = "triton-3.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cd34f19a8582af96e6291d4afce25dac08cb2a5d218c599163761e8e0827208e"}, + {file = "triton-3.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0d5e10de8c011adeb7c878c6ce0dd6073b14367749e34467f1cff2bde1b78253"}, + {file = "triton-3.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e8903767951bf86ec960b4fe4e21bc970055afc65e9d57e916d79ae3c93665e3"}, + {file = "triton-3.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:41004fb1ae9a53fcb3e970745feb87f0e3c94c6ce1ba86e95fa3b8537894bef7"}, +] + +[package.dependencies] +filelock = "*" + +[package.extras] +build = ["cmake (>=3.20)", "lit"] +tests = ["autopep8", "flake8", "isort", "llnl-hatchet", "numpy", "pytest", "scipy (>=1.7.1)"] +tutorials = ["matplotlib", "pandas", "tabulate"] + +[[package]] +name = "typer" +version = "0.6.1" +description = "Typer, build great CLIs. Easy to code. Based on Python type hints." 
+optional = false +python-versions = ">=3.6" +files = [ + {file = "typer-0.6.1-py3-none-any.whl", hash = "sha256:54b19e5df18654070a82f8c2aa1da456a4ac16a2a83e6dcd9f170e291c56338e"}, + {file = "typer-0.6.1.tar.gz", hash = "sha256:2d5720a5e63f73eaf31edaa15f6ab87f35f0690f8ca233017d7d23d743a91d73"}, +] + +[package.dependencies] +click = ">=7.1.1,<9.0.0" + +[package.extras] +all = ["colorama (>=0.4.3,<0.5.0)", "rich (>=10.11.0,<13.0.0)", "shellingham (>=1.3.0,<2.0.0)"] +dev = ["autoflake (>=1.3.1,<2.0.0)", "flake8 (>=3.8.3,<4.0.0)", "pre-commit (>=2.17.0,<3.0.0)"] +doc = ["mdx-include (>=1.4.1,<2.0.0)", "mkdocs (>=1.1.2,<2.0.0)", "mkdocs-material (>=8.1.4,<9.0.0)"] +test = ["black (>=22.3.0,<23.0.0)", "coverage (>=5.2,<6.0)", "isort (>=5.0.6,<6.0.0)", "mypy (==0.910)", "pytest (>=4.4.0,<5.4.0)", "pytest-cov (>=2.10.0,<3.0.0)", "pytest-sugar (>=0.9.4,<0.10.0)", "pytest-xdist (>=1.32.0,<2.0.0)", "rich (>=10.11.0,<13.0.0)", "shellingham (>=1.3.0,<2.0.0)"] + +[[package]] +name = "typing-extensions" +version = "4.12.2" +description = "Backported and Experimental Type Hints for Python 3.8+" +optional = false +python-versions = ">=3.8" +files = [ + {file = "typing_extensions-4.12.2-py3-none-any.whl", hash = "sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d"}, + {file = "typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8"}, +] + +[[package]] +name = "tzdata" +version = "2024.2" +description = "Provider of IANA time zone data" +optional = true +python-versions = ">=2" +files = [ + {file = "tzdata-2024.2-py2.py3-none-any.whl", hash = "sha256:a48093786cdcde33cad18c2555e8532f34422074448fbc874186f0abd79565cd"}, + {file = "tzdata-2024.2.tar.gz", hash = "sha256:7d85cc416e9382e69095b7bdf4afd9e3880418a2413feec7069d533d6b4e31cc"}, +] + +[[package]] +name = "urllib3" +version = "2.2.3" +description = "HTTP library with thread-safe connection pooling, file post, and more." +optional = false +python-versions = ">=3.8" +files = [ + {file = "urllib3-2.2.3-py3-none-any.whl", hash = "sha256:ca899ca043dcb1bafa3e262d73aa25c465bfb49e0bd9dd5d59f1d0acba2f8fac"}, + {file = "urllib3-2.2.3.tar.gz", hash = "sha256:e7d814a81dad81e6caf2ec9fdedb284ecc9c73076b62654547cc64ccdcae26e9"}, +] + +[package.extras] +brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)"] +h2 = ["h2 (>=4,<5)"] +socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"] +zstd = ["zstandard (>=0.18.0)"] + +[[package]] +name = "win32-setctime" +version = "1.1.0" +description = "A small Python utility to set file creation time on Windows" +optional = false +python-versions = ">=3.5" +files = [ + {file = "win32_setctime-1.1.0-py3-none-any.whl", hash = "sha256:231db239e959c2fe7eb1d7dc129f11172354f98361c4fa2d6d2d7e278baa8aad"}, + {file = "win32_setctime-1.1.0.tar.gz", hash = "sha256:15cf5750465118d6929ae4de4eb46e8edae9a5634350c01ba582df868e932cb2"}, +] + +[package.extras] +dev = ["black (>=19.3b0)", "pytest (>=4.6.2)"] + +[[package]] +name = "wrapt" +version = "1.16.0" +description = "Module for decorators, wrappers and monkey patching." 
+optional = false +python-versions = ">=3.6" +files = [ + {file = "wrapt-1.16.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ffa565331890b90056c01db69c0fe634a776f8019c143a5ae265f9c6bc4bd6d4"}, + {file = "wrapt-1.16.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:e4fdb9275308292e880dcbeb12546df7f3e0f96c6b41197e0cf37d2826359020"}, + {file = "wrapt-1.16.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bb2dee3874a500de01c93d5c71415fcaef1d858370d405824783e7a8ef5db440"}, + {file = "wrapt-1.16.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2a88e6010048489cda82b1326889ec075a8c856c2e6a256072b28eaee3ccf487"}, + {file = "wrapt-1.16.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ac83a914ebaf589b69f7d0a1277602ff494e21f4c2f743313414378f8f50a4cf"}, + {file = "wrapt-1.16.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:73aa7d98215d39b8455f103de64391cb79dfcad601701a3aa0dddacf74911d72"}, + {file = "wrapt-1.16.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:807cc8543a477ab7422f1120a217054f958a66ef7314f76dd9e77d3f02cdccd0"}, + {file = "wrapt-1.16.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:bf5703fdeb350e36885f2875d853ce13172ae281c56e509f4e6eca049bdfb136"}, + {file = "wrapt-1.16.0-cp310-cp310-win32.whl", hash = "sha256:f6b2d0c6703c988d334f297aa5df18c45e97b0af3679bb75059e0e0bd8b1069d"}, + {file = "wrapt-1.16.0-cp310-cp310-win_amd64.whl", hash = "sha256:decbfa2f618fa8ed81c95ee18a387ff973143c656ef800c9f24fb7e9c16054e2"}, + {file = "wrapt-1.16.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:1a5db485fe2de4403f13fafdc231b0dbae5eca4359232d2efc79025527375b09"}, + {file = "wrapt-1.16.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:75ea7d0ee2a15733684badb16de6794894ed9c55aa5e9903260922f0482e687d"}, + {file = "wrapt-1.16.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a452f9ca3e3267cd4d0fcf2edd0d035b1934ac2bd7e0e57ac91ad6b95c0c6389"}, + {file = "wrapt-1.16.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:43aa59eadec7890d9958748db829df269f0368521ba6dc68cc172d5d03ed8060"}, + {file = "wrapt-1.16.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:72554a23c78a8e7aa02abbd699d129eead8b147a23c56e08d08dfc29cfdddca1"}, + {file = "wrapt-1.16.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:d2efee35b4b0a347e0d99d28e884dfd82797852d62fcd7ebdeee26f3ceb72cf3"}, + {file = "wrapt-1.16.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:6dcfcffe73710be01d90cae08c3e548d90932d37b39ef83969ae135d36ef3956"}, + {file = "wrapt-1.16.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:eb6e651000a19c96f452c85132811d25e9264d836951022d6e81df2fff38337d"}, + {file = "wrapt-1.16.0-cp311-cp311-win32.whl", hash = "sha256:66027d667efe95cc4fa945af59f92c5a02c6f5bb6012bff9e60542c74c75c362"}, + {file = "wrapt-1.16.0-cp311-cp311-win_amd64.whl", hash = "sha256:aefbc4cb0a54f91af643660a0a150ce2c090d3652cf4052a5397fb2de549cd89"}, + {file = "wrapt-1.16.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:5eb404d89131ec9b4f748fa5cfb5346802e5ee8836f57d516576e61f304f3b7b"}, + {file = "wrapt-1.16.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:9090c9e676d5236a6948330e83cb89969f433b1943a558968f659ead07cb3b36"}, + {file = 
"wrapt-1.16.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:94265b00870aa407bd0cbcfd536f17ecde43b94fb8d228560a1e9d3041462d73"}, + {file = "wrapt-1.16.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f2058f813d4f2b5e3a9eb2eb3faf8f1d99b81c3e51aeda4b168406443e8ba809"}, + {file = "wrapt-1.16.0-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:98b5e1f498a8ca1858a1cdbffb023bfd954da4e3fa2c0cb5853d40014557248b"}, + {file = "wrapt-1.16.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:14d7dc606219cdd7405133c713f2c218d4252f2a469003f8c46bb92d5d095d81"}, + {file = "wrapt-1.16.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:49aac49dc4782cb04f58986e81ea0b4768e4ff197b57324dcbd7699c5dfb40b9"}, + {file = "wrapt-1.16.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:418abb18146475c310d7a6dc71143d6f7adec5b004ac9ce08dc7a34e2babdc5c"}, + {file = "wrapt-1.16.0-cp312-cp312-win32.whl", hash = "sha256:685f568fa5e627e93f3b52fda002c7ed2fa1800b50ce51f6ed1d572d8ab3e7fc"}, + {file = "wrapt-1.16.0-cp312-cp312-win_amd64.whl", hash = "sha256:dcdba5c86e368442528f7060039eda390cc4091bfd1dca41e8046af7c910dda8"}, + {file = "wrapt-1.16.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:d462f28826f4657968ae51d2181a074dfe03c200d6131690b7d65d55b0f360f8"}, + {file = "wrapt-1.16.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a33a747400b94b6d6b8a165e4480264a64a78c8a4c734b62136062e9a248dd39"}, + {file = "wrapt-1.16.0-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b3646eefa23daeba62643a58aac816945cadc0afaf21800a1421eeba5f6cfb9c"}, + {file = "wrapt-1.16.0-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ebf019be5c09d400cf7b024aa52b1f3aeebeff51550d007e92c3c1c4afc2a40"}, + {file = "wrapt-1.16.0-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:0d2691979e93d06a95a26257adb7bfd0c93818e89b1406f5a28f36e0d8c1e1fc"}, + {file = "wrapt-1.16.0-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:1acd723ee2a8826f3d53910255643e33673e1d11db84ce5880675954183ec47e"}, + {file = "wrapt-1.16.0-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:bc57efac2da352a51cc4658878a68d2b1b67dbe9d33c36cb826ca449d80a8465"}, + {file = "wrapt-1.16.0-cp36-cp36m-win32.whl", hash = "sha256:da4813f751142436b075ed7aa012a8778aa43a99f7b36afe9b742d3ed8bdc95e"}, + {file = "wrapt-1.16.0-cp36-cp36m-win_amd64.whl", hash = "sha256:6f6eac2360f2d543cc875a0e5efd413b6cbd483cb3ad7ebf888884a6e0d2e966"}, + {file = "wrapt-1.16.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:a0ea261ce52b5952bf669684a251a66df239ec6d441ccb59ec7afa882265d593"}, + {file = "wrapt-1.16.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7bd2d7ff69a2cac767fbf7a2b206add2e9a210e57947dd7ce03e25d03d2de292"}, + {file = "wrapt-1.16.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9159485323798c8dc530a224bd3ffcf76659319ccc7bbd52e01e73bd0241a0c5"}, + {file = "wrapt-1.16.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a86373cf37cd7764f2201b76496aba58a52e76dedfaa698ef9e9688bfd9e41cf"}, + {file = "wrapt-1.16.0-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:73870c364c11f03ed072dda68ff7aea6d2a3a5c3fe250d917a429c7432e15228"}, + {file = 
"wrapt-1.16.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:b935ae30c6e7400022b50f8d359c03ed233d45b725cfdd299462f41ee5ffba6f"}, + {file = "wrapt-1.16.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:db98ad84a55eb09b3c32a96c576476777e87c520a34e2519d3e59c44710c002c"}, + {file = "wrapt-1.16.0-cp37-cp37m-win32.whl", hash = "sha256:9153ed35fc5e4fa3b2fe97bddaa7cbec0ed22412b85bcdaf54aeba92ea37428c"}, + {file = "wrapt-1.16.0-cp37-cp37m-win_amd64.whl", hash = "sha256:66dfbaa7cfa3eb707bbfcd46dab2bc6207b005cbc9caa2199bcbc81d95071a00"}, + {file = "wrapt-1.16.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1dd50a2696ff89f57bd8847647a1c363b687d3d796dc30d4dd4a9d1689a706f0"}, + {file = "wrapt-1.16.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:44a2754372e32ab315734c6c73b24351d06e77ffff6ae27d2ecf14cf3d229202"}, + {file = "wrapt-1.16.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8e9723528b9f787dc59168369e42ae1c3b0d3fadb2f1a71de14531d321ee05b0"}, + {file = "wrapt-1.16.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dbed418ba5c3dce92619656802cc5355cb679e58d0d89b50f116e4a9d5a9603e"}, + {file = "wrapt-1.16.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:941988b89b4fd6b41c3f0bfb20e92bd23746579736b7343283297c4c8cbae68f"}, + {file = "wrapt-1.16.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:6a42cd0cfa8ffc1915aef79cb4284f6383d8a3e9dcca70c445dcfdd639d51267"}, + {file = "wrapt-1.16.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:1ca9b6085e4f866bd584fb135a041bfc32cab916e69f714a7d1d397f8c4891ca"}, + {file = "wrapt-1.16.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:d5e49454f19ef621089e204f862388d29e6e8d8b162efce05208913dde5b9ad6"}, + {file = "wrapt-1.16.0-cp38-cp38-win32.whl", hash = "sha256:c31f72b1b6624c9d863fc095da460802f43a7c6868c5dda140f51da24fd47d7b"}, + {file = "wrapt-1.16.0-cp38-cp38-win_amd64.whl", hash = "sha256:490b0ee15c1a55be9c1bd8609b8cecd60e325f0575fc98f50058eae366e01f41"}, + {file = "wrapt-1.16.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9b201ae332c3637a42f02d1045e1d0cccfdc41f1f2f801dafbaa7e9b4797bfc2"}, + {file = "wrapt-1.16.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:2076fad65c6736184e77d7d4729b63a6d1ae0b70da4868adeec40989858eb3fb"}, + {file = "wrapt-1.16.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c5cd603b575ebceca7da5a3a251e69561bec509e0b46e4993e1cac402b7247b8"}, + {file = "wrapt-1.16.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b47cfad9e9bbbed2339081f4e346c93ecd7ab504299403320bf85f7f85c7d46c"}, + {file = "wrapt-1.16.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f8212564d49c50eb4565e502814f694e240c55551a5f1bc841d4fcaabb0a9b8a"}, + {file = "wrapt-1.16.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:5f15814a33e42b04e3de432e573aa557f9f0f56458745c2074952f564c50e664"}, + {file = "wrapt-1.16.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:db2e408d983b0e61e238cf579c09ef7020560441906ca990fe8412153e3b291f"}, + {file = "wrapt-1.16.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:edfad1d29c73f9b863ebe7082ae9321374ccb10879eeabc84ba3b69f2579d537"}, + {file = "wrapt-1.16.0-cp39-cp39-win32.whl", hash = "sha256:ed867c42c268f876097248e05b6117a65bcd1e63b779e916fe2e33cd6fd0d3c3"}, + {file = "wrapt-1.16.0-cp39-cp39-win_amd64.whl", 
hash = "sha256:eb1b046be06b0fce7249f1d025cd359b4b80fc1c3e24ad9eca33e0dcdb2e4a35"}, + {file = "wrapt-1.16.0-py3-none-any.whl", hash = "sha256:6906c4100a8fcbf2fa735f6059214bb13b97f75b1a61777fcf6432121ef12ef1"}, + {file = "wrapt-1.16.0.tar.gz", hash = "sha256:5f370f952971e7d17c7d1ead40e49f32345a7f7a5373571ef44d800d06b1899d"}, +] + +[[package]] +name = "xxhash" +version = "3.5.0" +description = "Python binding for xxHash" +optional = true +python-versions = ">=3.7" +files = [ + {file = "xxhash-3.5.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ece616532c499ee9afbb83078b1b952beffef121d989841f7f4b3dc5ac0fd212"}, + {file = "xxhash-3.5.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3171f693dbc2cef6477054a665dc255d996646b4023fe56cb4db80e26f4cc520"}, + {file = "xxhash-3.5.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7c5d3e570ef46adaf93fc81b44aca6002b5a4d8ca11bd0580c07eac537f36680"}, + {file = "xxhash-3.5.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7cb29a034301e2982df8b1fe6328a84f4b676106a13e9135a0d7e0c3e9f806da"}, + {file = "xxhash-3.5.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5d0d307d27099bb0cbeea7260eb39ed4fdb99c5542e21e94bb6fd29e49c57a23"}, + {file = "xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c0342aafd421795d740e514bc9858ebddfc705a75a8c5046ac56d85fe97bf196"}, + {file = "xxhash-3.5.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3dbbd9892c5ebffeca1ed620cf0ade13eb55a0d8c84e0751a6653adc6ac40d0c"}, + {file = "xxhash-3.5.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:4cc2d67fdb4d057730c75a64c5923abfa17775ae234a71b0200346bfb0a7f482"}, + {file = "xxhash-3.5.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:ec28adb204b759306a3d64358a5e5c07d7b1dd0ccbce04aa76cb9377b7b70296"}, + {file = "xxhash-3.5.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:1328f6d8cca2b86acb14104e381225a3d7b42c92c4b86ceae814e5c400dbb415"}, + {file = "xxhash-3.5.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:8d47ebd9f5d9607fd039c1fbf4994e3b071ea23eff42f4ecef246ab2b7334198"}, + {file = "xxhash-3.5.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:b96d559e0fcddd3343c510a0fe2b127fbff16bf346dd76280b82292567523442"}, + {file = "xxhash-3.5.0-cp310-cp310-win32.whl", hash = "sha256:61c722ed8d49ac9bc26c7071eeaa1f6ff24053d553146d5df031802deffd03da"}, + {file = "xxhash-3.5.0-cp310-cp310-win_amd64.whl", hash = "sha256:9bed5144c6923cc902cd14bb8963f2d5e034def4486ab0bbe1f58f03f042f9a9"}, + {file = "xxhash-3.5.0-cp310-cp310-win_arm64.whl", hash = "sha256:893074d651cf25c1cc14e3bea4fceefd67f2921b1bb8e40fcfeba56820de80c6"}, + {file = "xxhash-3.5.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:02c2e816896dc6f85922ced60097bcf6f008dedfc5073dcba32f9c8dd786f3c1"}, + {file = "xxhash-3.5.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:6027dcd885e21581e46d3c7f682cfb2b870942feeed58a21c29583512c3f09f8"}, + {file = "xxhash-3.5.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1308fa542bbdbf2fa85e9e66b1077eea3a88bef38ee8a06270b4298a7a62a166"}, + {file = "xxhash-3.5.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c28b2fdcee797e1c1961cd3bcd3d545cab22ad202c846235197935e1df2f8ef7"}, + {file = "xxhash-3.5.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:924361811732ddad75ff23e90efd9ccfda4f664132feecb90895bade6a1b4623"}, + {file = "xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:89997aa1c4b6a5b1e5b588979d1da048a3c6f15e55c11d117a56b75c84531f5a"}, + {file = "xxhash-3.5.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:685c4f4e8c59837de103344eb1c8a3851f670309eb5c361f746805c5471b8c88"}, + {file = "xxhash-3.5.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:dbd2ecfbfee70bc1a4acb7461fa6af7748ec2ab08ac0fa298f281c51518f982c"}, + {file = "xxhash-3.5.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:25b5a51dc3dfb20a10833c8eee25903fd2e14059e9afcd329c9da20609a307b2"}, + {file = "xxhash-3.5.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:a8fb786fb754ef6ff8c120cb96629fb518f8eb5a61a16aac3a979a9dbd40a084"}, + {file = "xxhash-3.5.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:a905ad00ad1e1c34fe4e9d7c1d949ab09c6fa90c919860c1534ff479f40fd12d"}, + {file = "xxhash-3.5.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:963be41bcd49f53af6d795f65c0da9b4cc518c0dd9c47145c98f61cb464f4839"}, + {file = "xxhash-3.5.0-cp311-cp311-win32.whl", hash = "sha256:109b436096d0a2dd039c355fa3414160ec4d843dfecc64a14077332a00aeb7da"}, + {file = "xxhash-3.5.0-cp311-cp311-win_amd64.whl", hash = "sha256:b702f806693201ad6c0a05ddbbe4c8f359626d0b3305f766077d51388a6bac58"}, + {file = "xxhash-3.5.0-cp311-cp311-win_arm64.whl", hash = "sha256:c4dcb4120d0cc3cc448624147dba64e9021b278c63e34a38789b688fd0da9bf3"}, + {file = "xxhash-3.5.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:14470ace8bd3b5d51318782cd94e6f94431974f16cb3b8dc15d52f3b69df8e00"}, + {file = "xxhash-3.5.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:59aa1203de1cb96dbeab595ded0ad0c0056bb2245ae11fac11c0ceea861382b9"}, + {file = "xxhash-3.5.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:08424f6648526076e28fae6ea2806c0a7d504b9ef05ae61d196d571e5c879c84"}, + {file = "xxhash-3.5.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:61a1ff00674879725b194695e17f23d3248998b843eb5e933007ca743310f793"}, + {file = "xxhash-3.5.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f2f2c61bee5844d41c3eb015ac652a0229e901074951ae48581d58bfb2ba01be"}, + {file = "xxhash-3.5.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9d32a592cac88d18cc09a89172e1c32d7f2a6e516c3dfde1b9adb90ab5df54a6"}, + {file = "xxhash-3.5.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:70dabf941dede727cca579e8c205e61121afc9b28516752fd65724be1355cc90"}, + {file = "xxhash-3.5.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:e5d0ddaca65ecca9c10dcf01730165fd858533d0be84c75c327487c37a906a27"}, + {file = "xxhash-3.5.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:3e5b5e16c5a480fe5f59f56c30abdeba09ffd75da8d13f6b9b6fd224d0b4d0a2"}, + {file = "xxhash-3.5.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:149b7914451eb154b3dfaa721315117ea1dac2cc55a01bfbd4df7c68c5dd683d"}, + {file = "xxhash-3.5.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:eade977f5c96c677035ff39c56ac74d851b1cca7d607ab3d8f23c6b859379cab"}, + {file = "xxhash-3.5.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:fa9f547bd98f5553d03160967866a71056a60960be00356a15ecc44efb40ba8e"}, + {file = "xxhash-3.5.0-cp312-cp312-win32.whl", hash = 
"sha256:f7b58d1fd3551b8c80a971199543379be1cee3d0d409e1f6d8b01c1a2eebf1f8"}, + {file = "xxhash-3.5.0-cp312-cp312-win_amd64.whl", hash = "sha256:fa0cafd3a2af231b4e113fba24a65d7922af91aeb23774a8b78228e6cd785e3e"}, + {file = "xxhash-3.5.0-cp312-cp312-win_arm64.whl", hash = "sha256:586886c7e89cb9828bcd8a5686b12e161368e0064d040e225e72607b43858ba2"}, + {file = "xxhash-3.5.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:37889a0d13b0b7d739cfc128b1c902f04e32de17b33d74b637ad42f1c55101f6"}, + {file = "xxhash-3.5.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:97a662338797c660178e682f3bc180277b9569a59abfb5925e8620fba00b9fc5"}, + {file = "xxhash-3.5.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7f85e0108d51092bdda90672476c7d909c04ada6923c14ff9d913c4f7dc8a3bc"}, + {file = "xxhash-3.5.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:cd2fd827b0ba763ac919440042302315c564fdb797294d86e8cdd4578e3bc7f3"}, + {file = "xxhash-3.5.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:82085c2abec437abebf457c1d12fccb30cc8b3774a0814872511f0f0562c768c"}, + {file = "xxhash-3.5.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:07fda5de378626e502b42b311b049848c2ef38784d0d67b6f30bb5008642f8eb"}, + {file = "xxhash-3.5.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c279f0d2b34ef15f922b77966640ade58b4ccdfef1c4d94b20f2a364617a493f"}, + {file = "xxhash-3.5.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:89e66ceed67b213dec5a773e2f7a9e8c58f64daeb38c7859d8815d2c89f39ad7"}, + {file = "xxhash-3.5.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:bcd51708a633410737111e998ceb3b45d3dbc98c0931f743d9bb0a209033a326"}, + {file = "xxhash-3.5.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:3ff2c0a34eae7df88c868be53a8dd56fbdf592109e21d4bfa092a27b0bf4a7bf"}, + {file = "xxhash-3.5.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:4e28503dccc7d32e0b9817aa0cbfc1f45f563b2c995b7a66c4c8a0d232e840c7"}, + {file = "xxhash-3.5.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:a6c50017518329ed65a9e4829154626f008916d36295b6a3ba336e2458824c8c"}, + {file = "xxhash-3.5.0-cp313-cp313-win32.whl", hash = "sha256:53a068fe70301ec30d868ece566ac90d873e3bb059cf83c32e76012c889b8637"}, + {file = "xxhash-3.5.0-cp313-cp313-win_amd64.whl", hash = "sha256:80babcc30e7a1a484eab952d76a4f4673ff601f54d5142c26826502740e70b43"}, + {file = "xxhash-3.5.0-cp313-cp313-win_arm64.whl", hash = "sha256:4811336f1ce11cac89dcbd18f3a25c527c16311709a89313c3acaf771def2d4b"}, + {file = "xxhash-3.5.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:6e5f70f6dca1d3b09bccb7daf4e087075ff776e3da9ac870f86ca316736bb4aa"}, + {file = "xxhash-3.5.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2e76e83efc7b443052dd1e585a76201e40b3411fe3da7af4fe434ec51b2f163b"}, + {file = "xxhash-3.5.0-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:33eac61d0796ca0591f94548dcfe37bb193671e0c9bcf065789b5792f2eda644"}, + {file = "xxhash-3.5.0-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0ec70a89be933ea49222fafc3999987d7899fc676f688dd12252509434636622"}, + {file = "xxhash-3.5.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd86b8e7f703ec6ff4f351cfdb9f428955859537125904aa8c963604f2e9d3e7"}, + {file = 
"xxhash-3.5.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0adfbd36003d9f86c8c97110039f7539b379f28656a04097e7434d3eaf9aa131"}, + {file = "xxhash-3.5.0-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:63107013578c8a730419adc05608756c3fa640bdc6abe806c3123a49fb829f43"}, + {file = "xxhash-3.5.0-cp37-cp37m-musllinux_1_2_i686.whl", hash = "sha256:683b94dbd1ca67557850b86423318a2e323511648f9f3f7b1840408a02b9a48c"}, + {file = "xxhash-3.5.0-cp37-cp37m-musllinux_1_2_ppc64le.whl", hash = "sha256:5d2a01dcce81789cf4b12d478b5464632204f4c834dc2d064902ee27d2d1f0ee"}, + {file = "xxhash-3.5.0-cp37-cp37m-musllinux_1_2_s390x.whl", hash = "sha256:a9d360a792cbcce2fe7b66b8d51274ec297c53cbc423401480e53b26161a290d"}, + {file = "xxhash-3.5.0-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:f0b48edbebea1b7421a9c687c304f7b44d0677c46498a046079d445454504737"}, + {file = "xxhash-3.5.0-cp37-cp37m-win32.whl", hash = "sha256:7ccb800c9418e438b44b060a32adeb8393764da7441eb52aa2aa195448935306"}, + {file = "xxhash-3.5.0-cp37-cp37m-win_amd64.whl", hash = "sha256:c3bc7bf8cb8806f8d1c9bf149c18708cb1c406520097d6b0a73977460ea03602"}, + {file = "xxhash-3.5.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:74752ecaa544657d88b1d1c94ae68031e364a4d47005a90288f3bab3da3c970f"}, + {file = "xxhash-3.5.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:dee1316133c9b463aa81aca676bc506d3f80d8f65aeb0bba2b78d0b30c51d7bd"}, + {file = "xxhash-3.5.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:602d339548d35a8579c6b013339fb34aee2df9b4e105f985443d2860e4d7ffaa"}, + {file = "xxhash-3.5.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:695735deeddfb35da1677dbc16a083445360e37ff46d8ac5c6fcd64917ff9ade"}, + {file = "xxhash-3.5.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1030a39ba01b0c519b1a82f80e8802630d16ab95dc3f2b2386a0b5c8ed5cbb10"}, + {file = "xxhash-3.5.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a5bc08f33c4966f4eb6590d6ff3ceae76151ad744576b5fc6c4ba8edd459fdec"}, + {file = "xxhash-3.5.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:160e0c19ee500482ddfb5d5570a0415f565d8ae2b3fd69c5dcfce8a58107b1c3"}, + {file = "xxhash-3.5.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:f1abffa122452481a61c3551ab3c89d72238e279e517705b8b03847b1d93d738"}, + {file = "xxhash-3.5.0-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:d5e9db7ef3ecbfc0b4733579cea45713a76852b002cf605420b12ef3ef1ec148"}, + {file = "xxhash-3.5.0-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:23241ff6423378a731d84864bf923a41649dc67b144debd1077f02e6249a0d54"}, + {file = "xxhash-3.5.0-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:82b833d5563fefd6fceafb1aed2f3f3ebe19f84760fdd289f8b926731c2e6e91"}, + {file = "xxhash-3.5.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:0a80ad0ffd78bef9509eee27b4a29e56f5414b87fb01a888353e3d5bda7038bd"}, + {file = "xxhash-3.5.0-cp38-cp38-win32.whl", hash = "sha256:50ac2184ffb1b999e11e27c7e3e70cc1139047e7ebc1aa95ed12f4269abe98d4"}, + {file = "xxhash-3.5.0-cp38-cp38-win_amd64.whl", hash = "sha256:392f52ebbb932db566973693de48f15ce787cabd15cf6334e855ed22ea0be5b3"}, + {file = "xxhash-3.5.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:bfc8cdd7f33d57f0468b0614ae634cc38ab9202c6957a60e31d285a71ebe0301"}, + {file = "xxhash-3.5.0-cp39-cp39-macosx_11_0_arm64.whl", hash = 
"sha256:e0c48b6300cd0b0106bf49169c3e0536408dfbeb1ccb53180068a18b03c662ab"}, + {file = "xxhash-3.5.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fe1a92cfbaa0a1253e339ccec42dbe6db262615e52df591b68726ab10338003f"}, + {file = "xxhash-3.5.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:33513d6cc3ed3b559134fb307aae9bdd94d7e7c02907b37896a6c45ff9ce51bd"}, + {file = "xxhash-3.5.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:eefc37f6138f522e771ac6db71a6d4838ec7933939676f3753eafd7d3f4c40bc"}, + {file = "xxhash-3.5.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a606c8070ada8aa2a88e181773fa1ef17ba65ce5dd168b9d08038e2a61b33754"}, + {file = "xxhash-3.5.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:42eca420c8fa072cc1dd62597635d140e78e384a79bb4944f825fbef8bfeeef6"}, + {file = "xxhash-3.5.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:604253b2143e13218ff1ef0b59ce67f18b8bd1c4205d2ffda22b09b426386898"}, + {file = "xxhash-3.5.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:6e93a5ad22f434d7876665444a97e713a8f60b5b1a3521e8df11b98309bff833"}, + {file = "xxhash-3.5.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:7a46e1d6d2817ba8024de44c4fd79913a90e5f7265434cef97026215b7d30df6"}, + {file = "xxhash-3.5.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:30eb2efe6503c379b7ab99c81ba4a779748e3830241f032ab46bd182bf5873af"}, + {file = "xxhash-3.5.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:c8aa771ff2c13dd9cda8166d685d7333d389fae30a4d2bb39d63ab5775de8606"}, + {file = "xxhash-3.5.0-cp39-cp39-win32.whl", hash = "sha256:5ed9ebc46f24cf91034544b26b131241b699edbfc99ec5e7f8f3d02d6eb7fba4"}, + {file = "xxhash-3.5.0-cp39-cp39-win_amd64.whl", hash = "sha256:220f3f896c6b8d0316f63f16c077d52c412619e475f9372333474ee15133a558"}, + {file = "xxhash-3.5.0-cp39-cp39-win_arm64.whl", hash = "sha256:a7b1d8315d9b5e9f89eb2933b73afae6ec9597a258d52190944437158b49d38e"}, + {file = "xxhash-3.5.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:2014c5b3ff15e64feecb6b713af12093f75b7926049e26a580e94dcad3c73d8c"}, + {file = "xxhash-3.5.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fab81ef75003eda96239a23eda4e4543cedc22e34c373edcaf744e721a163986"}, + {file = "xxhash-3.5.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4e2febf914ace002132aa09169cc572e0d8959d0f305f93d5828c4836f9bc5a6"}, + {file = "xxhash-3.5.0-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5d3a10609c51da2a1c0ea0293fc3968ca0a18bd73838455b5bca3069d7f8e32b"}, + {file = "xxhash-3.5.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:5a74f23335b9689b66eb6dbe2a931a88fcd7a4c2cc4b1cb0edba8ce381c7a1da"}, + {file = "xxhash-3.5.0-pp37-pypy37_pp73-macosx_10_9_x86_64.whl", hash = "sha256:2b4154c00eb22e4d543f472cfca430e7962a0f1d0f3778334f2e08a7ba59363c"}, + {file = "xxhash-3.5.0-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d30bbc1644f726b825b3278764240f449d75f1a8bdda892e641d4a688b1494ae"}, + {file = "xxhash-3.5.0-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6fa0b72f2423e2aa53077e54a61c28e181d23effeaafd73fcb9c494e60930c8e"}, + {file = "xxhash-3.5.0-pp37-pypy37_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:13de2b76c1835399b2e419a296d5b38dc4855385d9e96916299170085ef72f57"}, + {file = "xxhash-3.5.0-pp37-pypy37_pp73-win_amd64.whl", hash = "sha256:0691bfcc4f9c656bcb96cc5db94b4d75980b9d5589f2e59de790091028580837"}, + {file = "xxhash-3.5.0-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:297595fe6138d4da2c8ce9e72a04d73e58725bb60f3a19048bc96ab2ff31c692"}, + {file = "xxhash-3.5.0-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cc1276d369452040cbb943300dc8abeedab14245ea44056a2943183822513a18"}, + {file = "xxhash-3.5.0-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2061188a1ba352fc699c82bff722f4baacb4b4b8b2f0c745d2001e56d0dfb514"}, + {file = "xxhash-3.5.0-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:38c384c434021e4f62b8d9ba0bc9467e14d394893077e2c66d826243025e1f81"}, + {file = "xxhash-3.5.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:e6a4dd644d72ab316b580a1c120b375890e4c52ec392d4aef3c63361ec4d77d1"}, + {file = "xxhash-3.5.0-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:531af8845aaadcadf951b7e0c1345c6b9c68a990eeb74ff9acd8501a0ad6a1c9"}, + {file = "xxhash-3.5.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7ce379bcaa9fcc00f19affa7773084dd09f5b59947b3fb47a1ceb0179f91aaa1"}, + {file = "xxhash-3.5.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fd1b2281d01723f076df3c8188f43f2472248a6b63118b036e641243656b1b0f"}, + {file = "xxhash-3.5.0-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9c770750cc80e8694492244bca7251385188bc5597b6a39d98a9f30e8da984e0"}, + {file = "xxhash-3.5.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:b150b8467852e1bd844387459aa6fbe11d7f38b56e901f9f3b3e6aba0d660240"}, + {file = "xxhash-3.5.0.tar.gz", hash = "sha256:84f2caddf951c9cbf8dc2e22a89d4ccf5d86391ac6418fe81e3c67d0cf60b45f"}, +] + +[[package]] +name = "yarl" +version = "1.16.0" +description = "Yet another URL library" +optional = false +python-versions = ">=3.9" +files = [ + {file = "yarl-1.16.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:32468f41242d72b87ab793a86d92f885355bcf35b3355aa650bfa846a5c60058"}, + {file = "yarl-1.16.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:234f3a3032b505b90e65b5bc6652c2329ea7ea8855d8de61e1642b74b4ee65d2"}, + {file = "yarl-1.16.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8a0296040e5cddf074c7f5af4a60f3fc42c0237440df7bcf5183be5f6c802ed5"}, + {file = "yarl-1.16.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:de6c14dd7c7c0badba48157474ea1f03ebee991530ba742d381b28d4f314d6f3"}, + {file = "yarl-1.16.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b140e532fe0266003c936d017c1ac301e72ee4a3fd51784574c05f53718a55d8"}, + {file = "yarl-1.16.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:019f5d58093402aa8f6661e60fd82a28746ad6d156f6c5336a70a39bd7b162b9"}, + {file = "yarl-1.16.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8c42998fd1cbeb53cd985bff0e4bc25fbe55fd6eb3a545a724c1012d69d5ec84"}, + {file = "yarl-1.16.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7c7c30fb38c300fe8140df30a046a01769105e4cf4282567a29b5cdb635b66c4"}, + {file = "yarl-1.16.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = 
"sha256:e49e0fd86c295e743fd5be69b8b0712f70a686bc79a16e5268386c2defacaade"}, + {file = "yarl-1.16.0-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:b9ca7b9147eb1365c8bab03c003baa1300599575effad765e0b07dd3501ea9af"}, + {file = "yarl-1.16.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:27e11db3f1e6a51081a981509f75617b09810529de508a181319193d320bc5c7"}, + {file = "yarl-1.16.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:8994c42f4ca25df5380ddf59f315c518c81df6a68fed5bb0c159c6cb6b92f120"}, + {file = "yarl-1.16.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:542fa8e09a581bcdcbb30607c7224beff3fdfb598c798ccd28a8184ffc18b7eb"}, + {file = "yarl-1.16.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:2bd6a51010c7284d191b79d3b56e51a87d8e1c03b0902362945f15c3d50ed46b"}, + {file = "yarl-1.16.0-cp310-cp310-win32.whl", hash = "sha256:178ccb856e265174a79f59721031060f885aca428983e75c06f78aa24b91d929"}, + {file = "yarl-1.16.0-cp310-cp310-win_amd64.whl", hash = "sha256:fe8bba2545427418efc1929c5c42852bdb4143eb8d0a46b09de88d1fe99258e7"}, + {file = "yarl-1.16.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:d8643975a0080f361639787415a038bfc32d29208a4bf6b783ab3075a20b1ef3"}, + {file = "yarl-1.16.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:676d96bafc8c2d0039cea0cd3fd44cee7aa88b8185551a2bb93354668e8315c2"}, + {file = "yarl-1.16.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d9525f03269e64310416dbe6c68d3b23e5d34aaa8f47193a1c45ac568cecbc49"}, + {file = "yarl-1.16.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8b37d5ec034e668b22cf0ce1074d6c21fd2a08b90d11b1b73139b750a8b0dd97"}, + {file = "yarl-1.16.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4f32c4cb7386b41936894685f6e093c8dfaf0960124d91fe0ec29fe439e201d0"}, + {file = "yarl-1.16.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5b8e265a0545637492a7e12fd7038370d66c9375a61d88c5567d0e044ded9202"}, + {file = "yarl-1.16.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:789a3423f28a5fff46fbd04e339863c169ece97c827b44de16e1a7a42bc915d2"}, + {file = "yarl-1.16.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f1d1f45e3e8d37c804dca99ab3cf4ab3ed2e7a62cd82542924b14c0a4f46d243"}, + {file = "yarl-1.16.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:621280719c4c5dad4c1391160a9b88925bb8b0ff6a7d5af3224643024871675f"}, + {file = "yarl-1.16.0-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:ed097b26f18a1f5ff05f661dc36528c5f6735ba4ce8c9645e83b064665131349"}, + {file = "yarl-1.16.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:2f1fe2b2e3ee418862f5ebc0c0083c97f6f6625781382f828f6d4e9b614eba9b"}, + {file = "yarl-1.16.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:87dd10bc0618991c66cee0cc65fa74a45f4ecb13bceec3c62d78ad2e42b27a16"}, + {file = "yarl-1.16.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:4199db024b58a8abb2cfcedac7b1292c3ad421684571aeb622a02f242280e8d6"}, + {file = "yarl-1.16.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:99a9dcd4b71dd5f5f949737ab3f356cfc058c709b4f49833aeffedc2652dac56"}, + {file = "yarl-1.16.0-cp311-cp311-win32.whl", hash = "sha256:a9394c65ae0ed95679717d391c862dece9afacd8fa311683fc8b4362ce8a410c"}, + {file = "yarl-1.16.0-cp311-cp311-win_amd64.whl", hash = "sha256:5b9101f528ae0f8f65ac9d64dda2bb0627de8a50344b2f582779f32fda747c1d"}, + {file = 
"yarl-1.16.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:4ffb7c129707dd76ced0a4a4128ff452cecf0b0e929f2668ea05a371d9e5c104"}, + {file = "yarl-1.16.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:1a5e9d8ce1185723419c487758d81ac2bde693711947032cce600ca7c9cda7d6"}, + {file = "yarl-1.16.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:d743e3118b2640cef7768ea955378c3536482d95550222f908f392167fe62059"}, + {file = "yarl-1.16.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:26768342f256e6e3c37533bf9433f5f15f3e59e3c14b2409098291b3efaceacb"}, + {file = "yarl-1.16.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d1b0796168b953bca6600c5f97f5ed407479889a36ad7d17183366260f29a6b9"}, + {file = "yarl-1.16.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:858728086914f3a407aa7979cab743bbda1fe2bdf39ffcd991469a370dd7414d"}, + {file = "yarl-1.16.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5570e6d47bcb03215baf4c9ad7bf7c013e56285d9d35013541f9ac2b372593e7"}, + {file = "yarl-1.16.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:66ea8311422a7ba1fc79b4c42c2baa10566469fe5a78500d4e7754d6e6db8724"}, + {file = "yarl-1.16.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:649bddcedee692ee8a9b7b6e38582cb4062dc4253de9711568e5620d8707c2a3"}, + {file = "yarl-1.16.0-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:3a91654adb7643cb21b46f04244c5a315a440dcad63213033826549fa2435f71"}, + {file = "yarl-1.16.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:b439cae82034ade094526a8f692b9a2b5ee936452de5e4c5f0f6c48df23f8604"}, + {file = "yarl-1.16.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:571f781ae8ac463ce30bacebfaef2c6581543776d5970b2372fbe31d7bf31a07"}, + {file = "yarl-1.16.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:aa7943f04f36d6cafc0cf53ea89824ac2c37acbdb4b316a654176ab8ffd0f968"}, + {file = "yarl-1.16.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:1a5cf32539373ff39d97723e39a9283a7277cbf1224f7aef0c56c9598b6486c3"}, + {file = "yarl-1.16.0-cp312-cp312-win32.whl", hash = "sha256:a5b6c09b9b4253d6a208b0f4a2f9206e511ec68dce9198e0fbec4f160137aa67"}, + {file = "yarl-1.16.0-cp312-cp312-win_amd64.whl", hash = "sha256:1208ca14eed2fda324042adf8d6c0adf4a31522fa95e0929027cd487875f0240"}, + {file = "yarl-1.16.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:a5ace0177520bd4caa99295a9b6fb831d0e9a57d8e0501a22ffaa61b4c024283"}, + {file = "yarl-1.16.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:7118bdb5e3ed81acaa2095cba7ec02a0fe74b52a16ab9f9ac8e28e53ee299732"}, + {file = "yarl-1.16.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:38fec8a2a94c58bd47c9a50a45d321ab2285ad133adefbbadf3012c054b7e656"}, + {file = "yarl-1.16.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8791d66d81ee45866a7bb15a517b01a2bcf583a18ebf5d72a84e6064c417e64b"}, + {file = "yarl-1.16.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1cf936ba67bc6c734f3aa1c01391da74ab7fc046a9f8bbfa230b8393b90cf472"}, + {file = "yarl-1.16.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d1aab176dd55b59f77a63b27cffaca67d29987d91a5b615cbead41331e6b7428"}, + {file = "yarl-1.16.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:995d0759004c08abd5d1b81300a91d18c8577c6389300bed1c7c11675105a44d"}, + {file = 
"yarl-1.16.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1bc22e00edeb068f71967ab99081e9406cd56dbed864fc3a8259442999d71552"}, + {file = "yarl-1.16.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:35b4f7842154176523e0a63c9b871168c69b98065d05a4f637fce342a6a2693a"}, + {file = "yarl-1.16.0-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:7ace71c4b7a0c41f317ae24be62bb61e9d80838d38acb20e70697c625e71f120"}, + {file = "yarl-1.16.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:8f639e3f5795a6568aa4f7d2ac6057c757dcd187593679f035adbf12b892bb00"}, + {file = "yarl-1.16.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:e8be3aff14f0120ad049121322b107f8a759be76a6a62138322d4c8a337a9e2c"}, + {file = "yarl-1.16.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:122d8e7986043d0549e9eb23c7fd23be078be4b70c9eb42a20052b3d3149c6f2"}, + {file = "yarl-1.16.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:0fd9c227990f609c165f56b46107d0bc34553fe0387818c42c02f77974402c36"}, + {file = "yarl-1.16.0-cp313-cp313-win32.whl", hash = "sha256:595ca5e943baed31d56b33b34736461a371c6ea0038d3baec399949dd628560b"}, + {file = "yarl-1.16.0-cp313-cp313-win_amd64.whl", hash = "sha256:921b81b8d78f0e60242fb3db615ea3f368827a76af095d5a69f1c3366db3f596"}, + {file = "yarl-1.16.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:ab2b2ac232110a1fdb0d3ffcd087783edd3d4a6ced432a1bf75caf7b7be70916"}, + {file = "yarl-1.16.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7f8713717a09acbfee7c47bfc5777e685539fefdd34fa72faf504c8be2f3df4e"}, + {file = "yarl-1.16.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:cdcffe1dbcb4477d2b4202f63cd972d5baa155ff5a3d9e35801c46a415b7f71a"}, + {file = "yarl-1.16.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9a91217208306d82357c67daeef5162a41a28c8352dab7e16daa82e3718852a7"}, + {file = "yarl-1.16.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3ab3ed42c78275477ea8e917491365e9a9b69bb615cb46169020bd0aa5e2d6d3"}, + {file = "yarl-1.16.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:707ae579ccb3262dfaef093e202b4c3fb23c3810e8df544b1111bd2401fd7b09"}, + {file = "yarl-1.16.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ad7a852d1cd0b8d8b37fc9d7f8581152add917a98cfe2ea6e241878795f917ae"}, + {file = "yarl-1.16.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d3f1cc3d3d4dc574bebc9b387f6875e228ace5748a7c24f49d8f01ac1bc6c31b"}, + {file = "yarl-1.16.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:5ff96da263740779b0893d02b718293cc03400c3a208fc8d8cd79d9b0993e532"}, + {file = "yarl-1.16.0-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:3d375a19ba2bfe320b6d873f3fb165313b002cef8b7cc0a368ad8b8a57453837"}, + {file = "yarl-1.16.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:62c7da0ad93a07da048b500514ca47b759459ec41924143e2ddb5d7e20fd3db5"}, + {file = "yarl-1.16.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:147b0fcd0ee33b4b5f6edfea80452d80e419e51b9a3f7a96ce98eaee145c1581"}, + {file = "yarl-1.16.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:504e1fe1cc4f170195320eb033d2b0ccf5c6114ce5bf2f617535c01699479bca"}, + {file = "yarl-1.16.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:bdcf667a5dec12a48f669e485d70c54189f0639c2157b538a4cffd24a853624f"}, + {file = "yarl-1.16.0-cp39-cp39-win32.whl", hash = 
"sha256:e9951afe6557c75a71045148890052cb942689ee4c9ec29f5436240e1fcc73b7"}, + {file = "yarl-1.16.0-cp39-cp39-win_amd64.whl", hash = "sha256:7d7aaa8ff95d0840e289423e7dc35696c2b058d635f945bf05b5cd633146b027"}, + {file = "yarl-1.16.0-py3-none-any.whl", hash = "sha256:e6980a558d8461230c457218bd6c92dfc1d10205548215c2c21d79dc8d0a96f3"}, + {file = "yarl-1.16.0.tar.gz", hash = "sha256:b6f687ced5510a9a2474bbae96a4352e5ace5fa34dc44a217b0537fec1db00b4"}, +] + +[package.dependencies] +idna = ">=2.0" +multidict = ">=4.0" +propcache = ">=0.2.0" + +[[package]] +name = "zipp" +version = "3.20.2" +description = "Backport of pathlib-compatible object wrapper for zip files" +optional = false +python-versions = ">=3.8" +files = [ + {file = "zipp-3.20.2-py3-none-any.whl", hash = "sha256:a817ac80d6cf4b23bf7f2828b7cabf326f15a001bea8b1f9b49631780ba28350"}, + {file = "zipp-3.20.2.tar.gz", hash = "sha256:bc9eb26f4506fda01b81bcde0ca78103b6e62f991b381fec825435c836edbc29"}, +] + +[package.extras] +check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1)"] +cover = ["pytest-cov"] +doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] +enabler = ["pytest-enabler (>=2.2)"] +test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more-itertools", "pytest (>=6,!=8.1.*)", "pytest-ignore-flaky"] +type = ["pytest-mypy"] + +[metadata] +lock-version = "2.0" +python-versions = ">=3.9,<3.13" +content-hash = "d169e52a2daaf41488cf7191e3f4ff209d39125941a6d0e656001aae534b1e8d" diff --git a/backends/gaudi/server/pyproject.toml b/backends/gaudi/server/pyproject.toml new file mode 100644 index 000000000..c61ac030b --- /dev/null +++ b/backends/gaudi/server/pyproject.toml @@ -0,0 +1,42 @@ +[tool.poetry] +name = "text-generation-server" +version = "2.0.4" +description = "Text Generation Inference Python gRPC Server" +authors = ["Olivier Dehaene "] + +[tool.poetry.scripts] +text-generation-server = 'text_generation_server.cli:app' + +[tool.poetry.dependencies] +python = ">=3.9,<3.13" +protobuf = "^3.20.3" +grpcio = "^1.51.1" +grpcio-status = "*" +grpcio-reflection = "*" +grpc-interceptor = "^0.15.0" +typer = "^0.7.0" +loguru = "^0.6.0" +opentelemetry-api = "^1.15.0" +opentelemetry-exporter-otlp = "^1.15.0" +opentelemetry-instrumentation-grpc = "^0.36b0" +hf-transfer = "^0.1.2" +sentencepiece = "^0.1.97" +peft = "^0.10" +optimum-habana = "1.15.0" +transformers = "4.45.2" +numpy = "1.26.4" +accelerate = "0.33.0" +outlines= { version = "^0.0.36", optional = true } +prometheus-client = "^0.20.0" +py-cpuinfo = "^9.0.0" + +[tool.poetry.group.dev.dependencies] +grpcio-tools = "*" +pytest = "^7.3.0" + +[tool.pytest.ini_options] +markers = ["private: marks tests as requiring an admin hf token (deselect with '-m \"not private\"')"] + +[build-system] +requires = ["poetry-core>=1.0.0"] +build-backend = "poetry.core.masonry.api" diff --git a/backends/gaudi/server/requirements.txt b/backends/gaudi/server/requirements.txt new file mode 100644 index 000000000..074144909 --- /dev/null +++ b/backends/gaudi/server/requirements.txt @@ -0,0 +1,89 @@ +accelerate==0.33.0 ; python_version >= "3.9" and python_version < "3.13" +aiohappyeyeballs==2.4.3 ; python_version >= "3.9" and python_version < "3.13" +aiohttp==3.10.10 ; python_version >= "3.9" and python_version < "3.13" +aiosignal==1.3.1 ; python_version >= "3.9" and python_version < "3.13" +async-timeout==4.0.3 ; python_version >= "3.9" and python_version < "3.11" +attrs==24.2.0 ; python_version >= 
"3.9" and python_version < "3.13" +backoff==2.2.1 ; python_version >= "3.9" and python_version < "3.13" +certifi==2024.8.30 ; python_version >= "3.9" and python_version < "3.13" +charset-normalizer==3.4.0 ; python_version >= "3.9" and python_version < "3.13" +click==8.1.7 ; python_version >= "3.9" and python_version < "3.13" +colorama==0.4.6 ; python_version >= "3.9" and python_version < "3.13" and (sys_platform == "win32" or platform_system == "Windows") +coloredlogs==15.0.1 ; python_version >= "3.9" and python_version < "3.13" +datasets==3.0.1 ; python_version >= "3.9" and python_version < "3.13" +deprecated==1.2.14 ; python_version >= "3.9" and python_version < "3.13" +diffusers==0.31.0 ; python_version >= "3.9" and python_version < "3.13" +dill==0.3.7 ; python_version >= "3.9" and python_version < "3.13" +filelock==3.16.1 ; python_version >= "3.9" and python_version < "3.13" +frozenlist==1.4.1 ; python_version >= "3.9" and python_version < "3.13" +fsspec==2024.6.1 ; python_version >= "3.9" and python_version < "3.13" +fsspec[http]==2024.6.1 ; python_version >= "3.9" and python_version < "3.13" +googleapis-common-protos==1.65.0 ; python_version >= "3.9" and python_version < "3.13" +grpc-interceptor==0.15.4 ; python_version >= "3.9" and python_version < "3.13" +grpcio-reflection==1.48.2 ; python_version >= "3.9" and python_version < "3.13" +grpcio-status==1.48.2 ; python_version >= "3.9" and python_version < "3.13" +grpcio==1.67.0 ; python_version >= "3.9" and python_version < "3.13" +hf-transfer==0.1.8 ; python_version >= "3.9" and python_version < "3.13" +huggingface-hub==0.26.1 ; python_version >= "3.9" and python_version < "3.13" +humanfriendly==10.0 ; python_version >= "3.9" and python_version < "3.13" +idna==3.10 ; python_version >= "3.9" and python_version < "3.13" +importlib-metadata==8.5.0 ; python_version >= "3.9" and python_version < "3.13" +jinja2==3.1.4 ; python_version >= "3.9" and python_version < "3.13" +joblib==1.4.2 ; python_version >= "3.9" and python_version < "3.13" +loguru==0.6.0 ; python_version >= "3.9" and python_version < "3.13" +markupsafe==3.0.2 ; python_version >= "3.9" and python_version < "3.13" +mpmath==1.3.0 ; python_version >= "3.9" and python_version < "3.13" +multidict==6.1.0 ; python_version >= "3.9" and python_version < "3.13" +multiprocess==0.70.15 ; python_version >= "3.9" and python_version < "3.13" +networkx==3.2.1 ; python_version >= "3.9" and python_version < "3.13" +numpy==1.26.4 ; python_version >= "3.9" and python_version < "3.13" +opentelemetry-api==1.15.0 ; python_version >= "3.9" and python_version < "3.13" +opentelemetry-exporter-otlp-proto-grpc==1.15.0 ; python_version >= "3.9" and python_version < "3.13" +opentelemetry-exporter-otlp-proto-http==1.15.0 ; python_version >= "3.9" and python_version < "3.13" +opentelemetry-exporter-otlp==1.15.0 ; python_version >= "3.9" and python_version < "3.13" +opentelemetry-instrumentation-grpc==0.36b0 ; python_version >= "3.9" and python_version < "3.13" +opentelemetry-instrumentation==0.36b0 ; python_version >= "3.9" and python_version < "3.13" +opentelemetry-proto==1.15.0 ; python_version >= "3.9" and python_version < "3.13" +opentelemetry-sdk==1.15.0 ; python_version >= "3.9" and python_version < "3.13" +opentelemetry-semantic-conventions==0.36b0 ; python_version >= "3.9" and python_version < "3.13" +optimum-habana==1.15.0 ; python_version >= "3.9" and python_version < "3.13" +optimum==1.23.2 ; python_version >= "3.9" and python_version < "3.13" +packaging==24.1 ; python_version >= "3.9" and 
python_version < "3.13" +pandas==2.2.3 ; python_version >= "3.9" and python_version < "3.13" +peft==0.10.0 ; python_version >= "3.9" and python_version < "3.13" +pillow==11.0.0 ; python_version >= "3.9" and python_version < "3.13" +prometheus-client==0.20.0 ; python_version >= "3.9" and python_version < "3.13" +propcache==0.2.0 ; python_version >= "3.9" and python_version < "3.13" +protobuf==3.20.3 ; python_version >= "3.9" and python_version < "3.13" +psutil==6.1.0 ; python_version >= "3.9" and python_version < "3.13" +py-cpuinfo==9.0.0 ; python_version >= "3.9" and python_version < "3.13" +pyarrow==17.0.0 ; python_version >= "3.9" and python_version < "3.13" +pyreadline3==3.5.4 ; sys_platform == "win32" and python_version >= "3.9" and python_version < "3.13" +python-dateutil==2.9.0.post0 ; python_version >= "3.9" and python_version < "3.13" +pytz==2024.2 ; python_version >= "3.9" and python_version < "3.13" +pyyaml==6.0.2 ; python_version >= "3.9" and python_version < "3.13" +regex==2024.9.11 ; python_version >= "3.9" and python_version < "3.13" +requests==2.32.3 ; python_version >= "3.9" and python_version < "3.13" +safetensors==0.4.5 ; python_version >= "3.9" and python_version < "3.13" +scikit-learn==1.5.2 ; python_version >= "3.9" and python_version < "3.13" +scipy==1.13.1 ; python_version >= "3.9" and python_version < "3.13" +sentence-transformers[train]==3.2.1 ; python_version >= "3.9" and python_version < "3.13" +sentencepiece==0.1.99 ; python_version >= "3.9" and python_version < "3.13" +setuptools==75.2.0 ; python_version >= "3.9" and python_version < "3.13" +six==1.16.0 ; python_version >= "3.9" and python_version < "3.13" +sympy==1.12.1 ; python_version >= "3.9" and python_version < "3.13" +threadpoolctl==3.5.0 ; python_version >= "3.9" and python_version < "3.13" +tokenizers==0.20.1 ; python_version >= "3.9" and python_version < "3.13" +tqdm==4.66.5 ; python_version >= "3.9" and python_version < "3.13" +transformers==4.45.2 ; python_version >= "3.9" and python_version < "3.13" +transformers[sentencepiece]==4.45.2 ; python_version >= "3.9" and python_version < "3.13" +triton==3.0.0 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version < "3.13" and python_version >= "3.9" +typer==0.7.0 ; python_version >= "3.9" and python_version < "3.13" +typing-extensions==4.12.2 ; python_version >= "3.9" and python_version < "3.13" +tzdata==2024.2 ; python_version >= "3.9" and python_version < "3.13" +urllib3==2.2.3 ; python_version >= "3.9" and python_version < "3.13" +win32-setctime==1.1.0 ; python_version >= "3.9" and python_version < "3.13" and sys_platform == "win32" +wrapt==1.16.0 ; python_version >= "3.9" and python_version < "3.13" +xxhash==3.5.0 ; python_version >= "3.9" and python_version < "3.13" +yarl==1.16.0 ; python_version >= "3.9" and python_version < "3.13" +zipp==3.20.2 ; python_version >= "3.9" and python_version < "3.13" diff --git a/backends/gaudi/server/tests/conftest.py b/backends/gaudi/server/tests/conftest.py new file mode 100644 index 000000000..d99771f8a --- /dev/null +++ b/backends/gaudi/server/tests/conftest.py @@ -0,0 +1,23 @@ +import pytest +import os +from text_generation_server.pb import generate_pb2 + +os.environ["USE_PREFIX_CACHING"] = "1" +os.environ["ATTENTION"] = "flashinfer" + + +@pytest.fixture +def default_pb_parameters(): + return generate_pb2.NextTokenChooserParameters( + temperature=1.0, + repetition_penalty=1.0, + top_k=0, + top_p=1.0, + typical_p=1.0, + do_sample=False, + ) + + +@pytest.fixture +def 
+    return generate_pb2.StoppingCriteriaParameters(stop_sequences=[], max_new_tokens=10)
diff --git a/backends/gaudi/server/tests/models/test_bloom.py b/backends/gaudi/server/tests/models/test_bloom.py
new file mode 100644
index 000000000..dff5239b3
--- /dev/null
+++ b/backends/gaudi/server/tests/models/test_bloom.py
@@ -0,0 +1,363 @@
+import pytest
+import torch
+
+from copy import copy
+from transformers import AutoTokenizer
+
+from text_generation_server.pb import generate_pb2
+from text_generation_server.models.causal_lm import CausalLMBatch
+from text_generation_server.utils import weight_hub_files, download_weights
+from text_generation_server.models.bloom import BloomCausalLMBatch, BLOOMSharded
+from text_generation_server.models.custom_modeling.bloom_modeling import (
+    BloomForCausalLM,
+)
+
+
+@pytest.fixture(scope="session")
+def default_bloom():
+    model_id = "bigscience/bloom-560m"
+    revision = "main"
+    filenames = weight_hub_files(model_id, revision, ".safetensors")
+    download_weights(filenames, model_id, revision)
+    return BLOOMSharded(
+        model_id,
+        model_class=BloomForCausalLM,
+    )
+
+
+@pytest.fixture(scope="session")
+def bloom_560m_tokenizer():
+    return AutoTokenizer.from_pretrained("bigscience/bloom-560m", padding_side="left")
+
+
+@pytest.fixture
+def default_pb_request(default_pb_parameters, default_pb_stop_parameters):
+    return generate_pb2.Request(
+        id=0,
+        inputs="Test",
+        input_chunks=generate_pb2.Input(chunks=[generate_pb2.InputChunk(text="Test")]),
+        prefill_logprobs=True,
+        truncate=100,
+        parameters=default_pb_parameters,
+        stopping_parameters=default_pb_stop_parameters,
+    )
+
+
+@pytest.fixture
+def default_pb_batch(default_pb_request):
+    return generate_pb2.Batch(id=0, requests=[default_pb_request], size=1)
+
+
+@pytest.fixture
+def default_bloom_batch(default_pb_batch, bloom_560m_tokenizer):
+    return BloomCausalLMBatch.from_pb(
+        default_pb_batch, bloom_560m_tokenizer, torch.float32, torch.device("cpu")
+    )
+
+
+@pytest.fixture
+def default_multi_requests_bloom_batch(default_pb_request, bloom_560m_tokenizer):
+    req_0 = copy(default_pb_request)
+    req_0.id = 1
+    req_1 = default_pb_request
+    req_1.id = 2
+    req_1.stopping_parameters.max_new_tokens = 5
+
+    batch_pb = generate_pb2.Batch(id=0, requests=[req_0, req_1], size=2)
+    return BloomCausalLMBatch.from_pb(
+        batch_pb, bloom_560m_tokenizer, torch.float32, torch.device("cpu")
+    )
+
+
+def test_batch_from_pb(default_pb_batch, default_bloom_batch):
+    batch = default_bloom_batch
+
+    assert batch.batch_id == default_pb_batch.id
+    assert batch.requests == default_pb_batch.requests
+
+    assert len(batch.input_ids) == default_pb_batch.size
+    assert batch.input_ids[0][-1] == 10264
+    assert torch.all(batch.input_ids[0][:-1] == 3)
+
+    assert batch.attention_mask[0][0] == 1
+    assert torch.all(batch.attention_mask[0][1:] == 0)
+
+    assert batch.past_key_values is None
+
+    assert all(
+        [
+            torch.equal(input_ids, all_input_ids[:, 0])
+            for input_ids, all_input_ids in zip(batch.input_ids, batch.all_input_ids)
+        ]
+    )
+
+    assert batch.input_lengths == [1]
+
+    assert len(batch) == default_pb_batch.size
+    assert len(batch.next_token_choosers) == len(batch.stopping_criterias) == len(batch)
+
+    assert batch.max_input_length == batch.input_lengths[0]
+
+
+def test_batch_concatenate_no_prefill(default_bloom_batch):
+    with pytest.raises(ValueError):
+        BloomCausalLMBatch.concatenate([default_bloom_batch, default_bloom_batch])
+
+
+def test_causal_lm_batch_type(default_bloom):
+    assert default_bloom.batch_type == BloomCausalLMBatch
+
+
+def test_causal_lm_generate_token(default_bloom, default_bloom_batch):
+    sequence_length = len(default_bloom_batch.all_input_ids[0])
+    generations, next_batch, _ = default_bloom.generate_token(default_bloom_batch)
+
+    assert len(generations) == len(default_bloom_batch)
+    assert isinstance(next_batch, CausalLMBatch)
+    assert not next_batch.keys_head_dim_last
+
+    assert len(next_batch.all_input_ids) == len(next_batch)
+    assert len(next_batch.all_input_ids[0]) == sequence_length + 1
+    assert len(next_batch.attention_mask[0]) == 11
+    assert torch.all(next_batch.all_input_ids[0][-2:] == 10264)
+    assert torch.all(next_batch.all_input_ids[0][:-2] == 3)
+
+    assert torch.all(next_batch.attention_mask[0][:2] == 1)
+    assert torch.all(next_batch.attention_mask[0][2:] == 0)
+
+    assert next_batch.input_ids.shape == (len(next_batch), 1)
+    assert next_batch.input_ids[0, 0] == 10264
+
+    assert next_batch.input_lengths == [2]
+    assert next_batch.max_input_length == next_batch.input_lengths[0]
+
+    assert next_batch.past_key_values is not None
+    assert all(
+        [p[0].shape == (16, 64, sequence_length) for p in next_batch.past_key_values]
+    )
+    assert all(
+        [p[1].shape == (16, sequence_length, 64) for p in next_batch.past_key_values]
+    )
+    assert all([generation.generated_text is None for generation in generations])
+    assert all([len(generation.prefill_tokens) == 1 for generation in generations])
+    assert all(
+        [
+            token_id.item() == 10264
+            for generation in generations
+            for token_id in generation.tokens.token_ids
+        ]
+    )
+    assert all(
+        [
+            token_text == "Test"
+            for generation in generations
+            for token_text in generation.tokens.texts
+        ]
+    )
+    assert generations[0].request_id == 0
+
+
+def test_causal_lm_generate_token_completion(default_bloom, default_bloom_batch):
+    next_batch = default_bloom_batch
+    for _ in range(default_bloom_batch.stopping_criterias[0].max_new_tokens - 1):
+        generations, next_batch, _ = default_bloom.generate_token(next_batch)
+        assert len(generations) == len(default_bloom_batch)
+
+    generations, next_batch, _ = default_bloom.generate_token(next_batch)
+    assert next_batch is None
+
+    assert len(generations) == 1
+    assert (
+        generations[0].generated_text.text == "TestTestTestTestTestTestTestTestTestTest"
+    )
+    assert generations[0].request_id == default_bloom_batch.requests[0].id
+    assert (
+        generations[0].generated_text.generated_tokens
+        == default_bloom_batch.stopping_criterias[0].max_new_tokens
+    )
+
+
+def test_causal_lm_generate_token_completion_multi(
+    default_bloom, default_multi_requests_bloom_batch
+):
+    next_batch = default_multi_requests_bloom_batch
+
+    for i in range(
+        default_multi_requests_bloom_batch.stopping_criterias[1].max_new_tokens - 1
+    ):
+        generations, next_batch, _ = default_bloom.generate_token(next_batch)
+        assert len(generations) == len(default_multi_requests_bloom_batch)
+
+    generations, next_batch, _ = default_bloom.generate_token(next_batch)
+    assert next_batch is not None
+
+    assert len(generations) == 2
+    assert generations[1].generated_text.text == "TestTestTestTestTest"
+    assert (
+        generations[1].request_id == default_multi_requests_bloom_batch.requests[1].id
+    )
+    assert (
+        generations[1].generated_text.generated_tokens
+        == default_multi_requests_bloom_batch.stopping_criterias[1].max_new_tokens
+    )
+    # Copy stopping_criterias before filtering
+    stopping_criterias = default_multi_requests_bloom_batch.stopping_criterias.copy()
+
+    next_batch = next_batch.filter([next_batch.requests[0].id])
+
+ for _ in range( + stopping_criterias[0].max_new_tokens - stopping_criterias[1].max_new_tokens - 1 + ): + generations, next_batch, _ = default_bloom.generate_token(next_batch) + assert len(generations) == len(next_batch) + + generations, next_batch, _ = default_bloom.generate_token(next_batch) + assert next_batch is None + + assert len(generations) == 1 + assert ( + generations[0].generated_text.text == "TestTestTestTestTestTestTestTestTestTest" + ) + assert ( + generations[0].request_id == default_multi_requests_bloom_batch.requests[0].id + ) + assert ( + generations[0].generated_text.generated_tokens + == default_multi_requests_bloom_batch.stopping_criterias[0].max_new_tokens + ) + + +def test_batch_concatenate( + default_bloom, default_bloom_batch, default_multi_requests_bloom_batch +): + next_batch_0 = default_bloom_batch + _, next_batch_0, _ = default_bloom.generate_token(next_batch_0) + _, next_batch_0, _ = default_bloom.generate_token(next_batch_0) + + next_batch_1 = default_multi_requests_bloom_batch + _, next_batch_1, _ = default_bloom.generate_token(next_batch_1) + + # Clone past_key_values before concatenating to compare after, + # because they are removed from the concatenated batches + next_batch_0_past_key_values = [ + (k.clone(), v.clone()) for (k, v) in next_batch_0.past_key_values + ] + next_batch_1_past_key_values = [ + (k.clone(), v.clone()) for (k, v) in next_batch_1.past_key_values + ] + + next_batch = BloomCausalLMBatch.concatenate([next_batch_0, next_batch_1]) + + assert torch.equal(next_batch.all_input_ids[0], next_batch_0.all_input_ids[0]) + assert torch.equal(next_batch.all_input_ids[1], next_batch_1.all_input_ids[0]) + assert torch.equal(next_batch.all_input_ids[2], next_batch_1.all_input_ids[1]) + + assert torch.all( + next_batch.attention_mask[0, : -next_batch.padding_right_offset] == 1 + ) + assert torch.all( + next_batch.attention_mask[1:, 1 : -next_batch.padding_right_offset] == 1 + ) + assert torch.all(next_batch.attention_mask[1:, 3:] == 0) + + assert next_batch.batch_id == 0 + assert torch.all(next_batch.input_ids == 10264) + + assert next_batch.input_lengths == [3, 2, 2] + assert next_batch.max_input_length == 3 + + assert next_batch.requests[0] == next_batch_0.requests[0] + assert next_batch.requests[1:] == list(next_batch_1.requests) + + assert next_batch.next_token_choosers[0] == next_batch_0.next_token_choosers[0] + assert next_batch.next_token_choosers[1:] == next_batch_1.next_token_choosers + + assert next_batch.stopping_criterias[0] == next_batch_0.stopping_criterias[0] + assert next_batch.stopping_criterias[1:] == next_batch_1.stopping_criterias + + assert next_batch.past_key_values is not None + assert all([p[0].shape == (3, 16, 64, 2) for p in next_batch.past_key_values]) + assert all([p[1].shape == (3, 16, 2, 64) for p in next_batch.past_key_values]) + + for i, past in enumerate(next_batch.past_key_values): + assert torch.equal(next_batch_0_past_key_values[i][0][:, :, -2:], past[0][0]) + assert torch.equal( + next_batch_1_past_key_values[i][0][:, :, -1:], + past[0][1:, :, :, -1].reshape(-1, 64, 1), + ) + + assert torch.equal(next_batch_0_past_key_values[i][1][:, -2:, :], past[1][0]) + assert torch.equal( + next_batch_1_past_key_values[i][1][:, -1:, :], + past[1][1:, :, -1, :].reshape(-1, 1, 64), + ) + + for _ in range( + default_multi_requests_bloom_batch.stopping_criterias[1].max_new_tokens - 2 + ): + generations, next_batch, _ = default_bloom.generate_token(next_batch) + assert len(generations) == len(next_batch) + + generations, 
next_batch, _ = default_bloom.generate_token(next_batch) + assert next_batch is not None + + assert len(generations) == 3 + assert generations[2].generated_text.text == "TestTestTestTestTest" + assert ( + generations[2].request_id == default_multi_requests_bloom_batch.requests[1].id + ) + assert ( + generations[2].generated_text.generated_tokens + == default_multi_requests_bloom_batch.stopping_criterias[1].max_new_tokens + ) + + next_batch = next_batch.filter( + [next_batch.requests[0].id, next_batch.requests[1].id] + ) + + for _ in range( + default_bloom_batch.stopping_criterias[0].max_new_tokens + - default_multi_requests_bloom_batch.stopping_criterias[1].max_new_tokens + - 2 + ): + generations, next_batch, _ = default_bloom.generate_token(next_batch) + assert len(generations) == len(next_batch) + + generations, next_batch, _ = default_bloom.generate_token(next_batch) + assert next_batch is not None + + assert len(generations) == 2 + assert ( + generations[0].generated_text.text == "TestTestTestTestTestTestTestTestTestTest" + ) + assert generations[0].request_id == default_bloom_batch.requests[0].id + assert ( + generations[0].generated_text.generated_tokens + == default_bloom_batch.stopping_criterias[0].max_new_tokens + ) + + next_batch = next_batch.filter([next_batch.requests[1].id]) + + for _ in range( + default_multi_requests_bloom_batch.stopping_criterias[0].max_new_tokens + - default_bloom_batch.stopping_criterias[0].max_new_tokens + - default_multi_requests_bloom_batch.stopping_criterias[1].max_new_tokens + - 4 + ): + generations, next_batch, _ = default_bloom.generate_token(next_batch) + assert len(generations) == len(next_batch) + + generations, next_batch, _ = default_bloom.generate_token(next_batch) + assert next_batch is None + + assert len(generations) == 1 + assert ( + generations[0].generated_text.text == "TestTestTestTestTestTestTestTestTestTest" + ) + assert ( + generations[0].request_id == default_multi_requests_bloom_batch.requests[0].id + ) + assert ( + generations[0].generated_text.generated_tokens + == default_multi_requests_bloom_batch.stopping_criterias[0].max_new_tokens + ) diff --git a/backends/gaudi/server/tests/models/test_causal_lm.py b/backends/gaudi/server/tests/models/test_causal_lm.py new file mode 100644 index 000000000..11ca5efe3 --- /dev/null +++ b/backends/gaudi/server/tests/models/test_causal_lm.py @@ -0,0 +1,354 @@ +import pytest +import torch + +from copy import copy +from transformers import AutoTokenizer + +from text_generation_server.pb import generate_pb2 +from text_generation_server.models.causal_lm import CausalLM, CausalLMBatch + + +@pytest.fixture(scope="session") +def default_causal_lm(): + return CausalLM.fallback("gpt2") + + +@pytest.fixture(scope="session") +def gpt2_tokenizer(): + tokenizer = AutoTokenizer.from_pretrained("gpt2", padding_side="left") + tokenizer.pad_token_id = 50256 + return tokenizer + + +@pytest.fixture +def default_pb_request(default_pb_parameters, default_pb_stop_parameters): + return generate_pb2.Request( + id=0, + inputs="Test", + input_chunks=generate_pb2.Input(chunks=[generate_pb2.InputChunk(text="Test")]), + prefill_logprobs=True, + truncate=100, + parameters=default_pb_parameters, + stopping_parameters=default_pb_stop_parameters, + ) + + +@pytest.fixture +def default_pb_batch(default_pb_request): + return generate_pb2.Batch(id=0, requests=[default_pb_request], size=1) + + +@pytest.fixture +def default_causal_lm_batch(default_pb_batch, gpt2_tokenizer): + return CausalLMBatch.from_pb( + default_pb_batch, 
gpt2_tokenizer, torch.float32, torch.device("cpu") + ) + + +@pytest.fixture +def default_multi_requests_causal_lm_batch(default_pb_request, gpt2_tokenizer): + req_0 = copy(default_pb_request) + req_0.id = 1 + req_1 = default_pb_request + req_1.id = 2 + req_1.stopping_parameters.max_new_tokens = 5 + + batch_pb = generate_pb2.Batch(id=1, requests=[req_0, req_1], size=2) + return CausalLMBatch.from_pb( + batch_pb, gpt2_tokenizer, torch.float32, torch.device("cpu") + ) + + +def test_batch_from_pb(default_pb_batch, default_causal_lm_batch): + batch = default_causal_lm_batch + + assert batch.batch_id == default_pb_batch.id + assert batch.requests == default_pb_batch.requests + + assert len(batch.input_ids) == default_pb_batch.size + assert batch.input_ids[0][-1] == 14402 + assert torch.all(batch.input_ids[0][:-1] == 50256) + + assert batch.attention_mask[0, 0] == 1 + assert torch.all(batch.attention_mask[0, 1:] == 0) + + assert batch.past_key_values is None + + assert all( + [ + torch.equal(input_ids, all_input_ids[:, 0]) + for input_ids, all_input_ids in zip(batch.input_ids, batch.all_input_ids) + ] + ) + + assert batch.input_lengths == [1] + + assert len(batch) == default_pb_batch.size + assert len(batch.next_token_choosers) == len(batch.stopping_criterias) == len(batch) + + assert batch.max_input_length == batch.input_lengths[0] + + +def test_batch_concatenate_no_prefill(default_causal_lm_batch): + with pytest.raises(ValueError): + CausalLMBatch.concatenate([default_causal_lm_batch, default_causal_lm_batch]) + + +def test_causal_lm_batch_type(default_causal_lm): + assert default_causal_lm.batch_type == CausalLMBatch + + +def test_causal_lm_generate_token(default_causal_lm, default_causal_lm_batch): + sequence_length = len(default_causal_lm_batch.all_input_ids[0]) + generations, next_batch, _ = default_causal_lm.generate_token( + default_causal_lm_batch + ) + + assert len(generations) == len(next_batch) + assert isinstance(next_batch, CausalLMBatch) + + assert len(next_batch.all_input_ids) == len(next_batch) + assert len(next_batch.all_input_ids[0]) == sequence_length + 1 + assert len(next_batch.attention_mask[0]) == 11 + assert next_batch.all_input_ids[0][-1] == 13 + assert next_batch.all_input_ids[0][-2] == 14402 + assert torch.all(next_batch.all_input_ids[0][:-2] == 50256) + + assert torch.all(next_batch.attention_mask[0][0:2] == 1) + assert torch.all(next_batch.attention_mask[0][2:] == 0) + + assert next_batch.input_ids.shape == (len(next_batch), 1) + assert next_batch.input_ids[0, 0] == 13 + + assert next_batch.input_lengths == [2] + assert next_batch.max_input_length == next_batch.input_lengths[0] + + assert next_batch.past_key_values is not None + assert all( + [p[0].shape == (1, 12, sequence_length, 64) for p in next_batch.past_key_values] + ) + assert all( + [p[1].shape == (1, 12, sequence_length, 64) for p in next_batch.past_key_values] + ) + assert all([generation.generated_text is None for generation in generations]) + assert all([len(generation.prefill_tokens) == 1 for generation in generations]) + assert all( + [ + token_id.item() == 13 + for generation in generations + for token_id in generation.tokens.token_ids + ] + ) + assert all( + [ + token_text == "." 
+ for generation in generations + for token_text in generation.tokens.texts + ] + ) + assert generations[0].request_id == 0 + + +def test_causal_lm_generate_token_completion( + default_causal_lm, default_causal_lm_batch +): + next_batch = default_causal_lm_batch + for _ in range(default_causal_lm_batch.stopping_criterias[0].max_new_tokens - 1): + generations, next_batch, _ = default_causal_lm.generate_token(next_batch) + assert len(generations) == len(next_batch) + + generations, next_batch, _ = default_causal_lm.generate_token(next_batch) + assert next_batch is None + + assert len(generations) == 1 + assert generations[0].generated_text.text == ".java:784) at net.minecraft." + assert generations[0].request_id == default_causal_lm_batch.requests[0].id + assert ( + generations[0].generated_text.generated_tokens + == default_causal_lm_batch.stopping_criterias[0].max_new_tokens + ) + + +def test_causal_lm_generate_token_completion_multi( + default_causal_lm, default_multi_requests_causal_lm_batch +): + next_batch = default_multi_requests_causal_lm_batch + + for i in range( + default_multi_requests_causal_lm_batch.stopping_criterias[1].max_new_tokens - 1 + ): + generations, next_batch, _ = default_causal_lm.generate_token(next_batch) + assert len(generations) == len(next_batch) + + generations, next_batch, _ = default_causal_lm.generate_token(next_batch) + assert next_batch is not None + + assert len(generations) == 2 + assert generations[1].generated_text.text == ".java:784)" + assert ( + generations[1].request_id + == default_multi_requests_causal_lm_batch.requests[1].id + ) + assert ( + generations[1].generated_text.generated_tokens + == default_multi_requests_causal_lm_batch.stopping_criterias[1].max_new_tokens + ) + # Copy stopping_criterias before filtering + stopping_criterias = ( + default_multi_requests_causal_lm_batch.stopping_criterias.copy() + ) + + next_batch = next_batch.filter([next_batch.requests[0].id]) + + for _ in range( + stopping_criterias[0].max_new_tokens - stopping_criterias[1].max_new_tokens - 1 + ): + generations, next_batch, _ = default_causal_lm.generate_token(next_batch) + assert len(generations) == len(next_batch) + + generations, next_batch, _ = default_causal_lm.generate_token(next_batch) + assert next_batch is None + + assert len(generations) == 1 + assert generations[0].generated_text.text == ".java:784) at net.minecraft." 
+ assert ( + generations[0].request_id + == default_multi_requests_causal_lm_batch.requests[0].id + ) + assert ( + generations[0].generated_text.generated_tokens + == default_multi_requests_causal_lm_batch.stopping_criterias[0].max_new_tokens + ) + + +def test_batch_concatenate( + default_causal_lm, default_causal_lm_batch, default_multi_requests_causal_lm_batch +): + next_batch_0 = default_causal_lm_batch + _, next_batch_0, _ = default_causal_lm.generate_token(next_batch_0) + _, next_batch_0, _ = default_causal_lm.generate_token(next_batch_0) + + next_batch_1 = default_multi_requests_causal_lm_batch + _, next_batch_1, _ = default_causal_lm.generate_token(next_batch_1) + + # Clone past_key_values before concatenating to compare after, + # because they are removed from the concatenated batches + next_batch_0_past_key_values = [ + (k.clone(), v.clone()) for (k, v) in next_batch_0.past_key_values + ] + next_batch_1_past_key_values = [ + (k.clone(), v.clone()) for (k, v) in next_batch_1.past_key_values + ] + + next_batch = CausalLMBatch.concatenate([next_batch_0, next_batch_1]) + + assert torch.equal(next_batch.all_input_ids[0], next_batch_0.all_input_ids[0]) + assert torch.equal(next_batch.all_input_ids[1], next_batch_1.all_input_ids[0]) + assert torch.equal(next_batch.all_input_ids[2], next_batch_1.all_input_ids[1]) + + assert torch.all( + next_batch.attention_mask[0, : -next_batch.padding_right_offset] == 1 + ) + assert torch.all( + next_batch.attention_mask[1:, 1 : -next_batch.padding_right_offset] == 1 + ) + assert torch.all(next_batch.attention_mask[1:, 3:] == 0) + + assert next_batch.batch_id == 0 + assert next_batch.input_ids[0, 0] == 12355 + assert torch.all(next_batch.input_ids[1:] == 13) + + assert next_batch.input_lengths == [3, 2, 2] + assert next_batch.max_input_length == 3 + + assert next_batch.requests[0] == next_batch_0.requests[0] + assert next_batch.requests[1:] == list(next_batch_1.requests) + + assert next_batch.next_token_choosers[0] == next_batch_0.next_token_choosers[0] + assert next_batch.next_token_choosers[1:] == next_batch_1.next_token_choosers + + assert next_batch.stopping_criterias[0] == next_batch_0.stopping_criterias[0] + assert next_batch.stopping_criterias[1:] == next_batch_1.stopping_criterias + + assert next_batch.past_key_values is not None + assert all([p[0].shape == (3, 12, 2, 64) for p in next_batch.past_key_values]) + assert all([p[1].shape == (3, 12, 2, 64) for p in next_batch.past_key_values]) + + for i, past in enumerate(next_batch.past_key_values): + assert torch.equal(next_batch_0_past_key_values[i][0][0, :, -2:], past[0][0]) + assert torch.equal( + next_batch_1_past_key_values[i][0][:, :, -1:], past[0][1:, :, -1:, :] + ) + + assert torch.equal(next_batch_0_past_key_values[i][1][0, :, -2:], past[1][0]) + assert torch.equal( + next_batch_1_past_key_values[i][1][:, :, -1:], past[1][1:, :, -1:, :] + ) + + for _ in range( + default_multi_requests_causal_lm_batch.stopping_criterias[1].max_new_tokens - 2 + ): + generations, next_batch, _ = default_causal_lm.generate_token(next_batch) + assert len(generations) == len(next_batch) + + generations, next_batch, _ = default_causal_lm.generate_token(next_batch) + assert next_batch is not None + + assert len(generations) == 3 + assert generations[2].generated_text.text == ".java:784)" + assert ( + generations[2].request_id + == default_multi_requests_causal_lm_batch.requests[1].id + ) + assert ( + generations[2].generated_text.generated_tokens + == 
default_multi_requests_causal_lm_batch.stopping_criterias[1].max_new_tokens + ) + + next_batch = next_batch.filter( + [next_batch.requests[0].id, next_batch.requests[1].id] + ) + + for _ in range( + default_causal_lm_batch.stopping_criterias[0].max_new_tokens + - default_multi_requests_causal_lm_batch.stopping_criterias[1].max_new_tokens + - 2 + ): + generations, next_batch, _ = default_causal_lm.generate_token(next_batch) + assert len(generations) == len(next_batch) + + generations, next_batch, _ = default_causal_lm.generate_token(next_batch) + assert next_batch is not None + + assert len(generations) == 2 + assert generations[0].generated_text.text == ".java:784) at net.minecraft." + assert generations[0].request_id == default_causal_lm_batch.requests[0].id + assert ( + generations[0].generated_text.generated_tokens + == default_causal_lm_batch.stopping_criterias[0].max_new_tokens + ) + + next_batch = next_batch.filter([next_batch.requests[1].id]) + + for _ in range( + default_multi_requests_causal_lm_batch.stopping_criterias[0].max_new_tokens + - default_causal_lm_batch.stopping_criterias[0].max_new_tokens + - default_multi_requests_causal_lm_batch.stopping_criterias[1].max_new_tokens + - 4 + ): + generations, next_batch, _ = default_causal_lm.generate_token(next_batch) + assert len(generations) == len(next_batch) + + generations, next_batch, _ = default_causal_lm.generate_token(next_batch) + assert next_batch is None + + assert len(generations) == 1 + assert generations[0].generated_text.text == ".java:784) at net.minecraft." + assert ( + generations[0].request_id + == default_multi_requests_causal_lm_batch.requests[0].id + ) + assert ( + generations[0].generated_text.generated_tokens + == default_multi_requests_causal_lm_batch.stopping_criterias[0].max_new_tokens + ) diff --git a/backends/gaudi/server/tests/models/test_model.py b/backends/gaudi/server/tests/models/test_model.py new file mode 100644 index 000000000..8441e8c6e --- /dev/null +++ b/backends/gaudi/server/tests/models/test_model.py @@ -0,0 +1,83 @@ +import pytest +import torch + +from transformers import AutoTokenizer + +from text_generation_server.models import Model + + +def get_test_model(): + class TestModel(Model): + def batch_type(self): + raise NotImplementedError + + def generate_token(self, batch): + raise NotImplementedError + + tokenizer = AutoTokenizer.from_pretrained("huggingface/llama-7b") + + model = TestModel( + "test_model_id", + torch.nn.Linear(1, 1), + tokenizer, + False, + torch.float32, + torch.device("cpu"), + ) + return model + + +@pytest.mark.private +def test_decode_streaming_english_spaces(): + model = get_test_model() + truth = "Hello here, this is a simple test" + all_input_ids = [15043, 1244, 29892, 445, 338, 263, 2560, 1243] + assert ( + all_input_ids == model.tokenizer(truth, add_special_tokens=False)["input_ids"] + ) + + decoded_text = "" + offset = 0 + token_offset = 0 + for i in range(len(all_input_ids)): + text, offset, token_offset = model.decode_token( + all_input_ids[: i + 1], offset, token_offset + ) + decoded_text += text + + assert decoded_text == truth + + +@pytest.mark.private +def test_decode_streaming_chinese_utf8(): + model = get_test_model() + truth = "我很感谢你的热情" + all_input_ids = [ + 30672, + 232, + 193, + 139, + 233, + 135, + 162, + 235, + 179, + 165, + 30919, + 30210, + 234, + 134, + 176, + 30993, + ] + + decoded_text = "" + offset = 0 + token_offset = 0 + for i in range(len(all_input_ids)): + text, offset, token_offset = model.decode_token( + all_input_ids[: i + 1], offset, 
token_offset
+        )
+        decoded_text += text
+
+    assert decoded_text == truth
diff --git a/backends/gaudi/server/tests/models/test_santacoder.py b/backends/gaudi/server/tests/models/test_santacoder.py
new file mode 100644
index 000000000..d5c91bffc
--- /dev/null
+++ b/backends/gaudi/server/tests/models/test_santacoder.py
@@ -0,0 +1,108 @@
+import pytest
+
+from text_generation_server.pb import generate_pb2
+from text_generation_server.models.causal_lm import CausalLMBatch, CausalLM
+
+
+@pytest.fixture(scope="session")
+def default_santacoder():
+    return CausalLM.fallback(model_id="bigcode/santacoder")
+
+
+@pytest.fixture
+def default_pb_request(default_pb_parameters, default_pb_stop_parameters):
+    return generate_pb2.Request(
+        id=0,
+        inputs="def",
+        input_chunks=generate_pb2.Input(chunks=[generate_pb2.InputChunk(text="def")]),
+        prefill_logprobs=True,
+        truncate=100,
+        parameters=default_pb_parameters,
+        stopping_parameters=default_pb_stop_parameters,
+    )
+
+
+@pytest.fixture
+def default_pb_batch(default_pb_request):
+    return generate_pb2.Batch(id=0, requests=[default_pb_request], size=1)
+
+
+@pytest.fixture
+def default_fim_pb_request(default_pb_parameters, default_pb_stop_parameters):
+    return generate_pb2.Request(
+        id=0,
+        inputs="<fim-prefix>def<fim-suffix>world<fim-middle>",
+        input_chunks=generate_pb2.Input(
+            chunks=[
+                generate_pb2.InputChunk(
+                    text="<fim-prefix>def<fim-suffix>world<fim-middle>"
+                )
+            ]
+        ),
+        prefill_logprobs=True,
+        truncate=100,
+        parameters=default_pb_parameters,
+        stopping_parameters=default_pb_stop_parameters,
+    )
+
+
+@pytest.fixture
+def default_fim_pb_batch(default_fim_pb_request):
+    return generate_pb2.Batch(id=0, requests=[default_fim_pb_request], size=1)
+
+
+@pytest.mark.skip
+def test_santacoder_generate_token_completion(default_santacoder, default_pb_batch):
+    batch = CausalLMBatch.from_pb(
+        default_pb_batch,
+        default_santacoder.tokenizer,
+        default_santacoder.dtype,
+        default_santacoder.device,
+    )
+    next_batch = batch
+
+    for _ in range(batch.stopping_criterias[0].max_new_tokens - 1):
+        generations, next_batch, _ = default_santacoder.generate_token(next_batch)
+        assert len(generations) == len(next_batch)
+
+    generations, next_batch, _ = default_santacoder.generate_token(next_batch)
+    assert next_batch is None
+
+    assert len(generations) == 1
+    assert generations[0].generated_text.text == " test_get_all_users_with_"
+    assert generations[0].request_id == batch.requests[0].id
+    assert (
+        generations[0].generated_text.generated_tokens
+        == batch.stopping_criterias[0].max_new_tokens
+    )
+
+
+@pytest.mark.skip
+def test_fim_santacoder_generate_token_completion(
+    default_santacoder, default_fim_pb_batch
+):
+    batch = CausalLMBatch.from_pb(
+        default_fim_pb_batch,
+        default_santacoder.tokenizer,
+        default_santacoder.dtype,
+        default_santacoder.device,
+    )
+    next_batch = batch
+
+    for _ in range(batch.stopping_criterias[0].max_new_tokens - 1):
+        generations, next_batch, _ = default_santacoder.generate_token(next_batch)
+        assert len(generations) == len(next_batch)
+
+    generations, next_batch, _ = default_santacoder.generate_token(next_batch)
+    assert next_batch is None
+
+    assert len(generations) == 1
+    assert (
+        generations[0].generated_text.text
+        == """ineProperty(exports, "__esModule", { value"""
+    )
+    assert generations[0].request_id == batch.requests[0].id
+    assert (
+        generations[0].generated_text.generated_tokens
+        == batch.stopping_criterias[0].max_new_tokens
+    )
diff --git a/backends/gaudi/server/tests/models/test_seq2seq_lm.py b/backends/gaudi/server/tests/models/test_seq2seq_lm.py
new file mode 100644
index 
000000000..5ba7c64ce --- /dev/null +++ b/backends/gaudi/server/tests/models/test_seq2seq_lm.py @@ -0,0 +1,365 @@ +import pytest +import torch + +from copy import copy + +from transformers import AutoTokenizer + +from text_generation_server.pb import generate_pb2 +from text_generation_server.models.seq2seq_lm import Seq2SeqLM, Seq2SeqLMBatch + + +@pytest.fixture(scope="session") +def mt0_small_tokenizer(): + tokenizer = AutoTokenizer.from_pretrained( + "bigscience/mt0-small", padding_side="left" + ) + tokenizer.bos_token_id = 0 + return tokenizer + + +@pytest.fixture(scope="session") +def default_seq2seq_lm(): + return Seq2SeqLM.fallback("bigscience/mt0-small") + + +@pytest.fixture +def default_pb_request(default_pb_parameters, default_pb_stop_parameters): + return generate_pb2.Request( + id=0, + inputs="Test", + input_chunks=generate_pb2.Input(chunks=[generate_pb2.InputChunk(text="Test")]), + prefill_logprobs=True, + truncate=100, + parameters=default_pb_parameters, + stopping_parameters=default_pb_stop_parameters, + ) + + +@pytest.fixture +def default_pb_batch(default_pb_request): + return generate_pb2.Batch(id=0, requests=[default_pb_request], size=1) + + +@pytest.fixture +def default_seq2seq_lm_batch(default_pb_batch, mt0_small_tokenizer): + return Seq2SeqLMBatch.from_pb( + default_pb_batch, mt0_small_tokenizer, torch.float32, torch.device("cpu") + ) + + +@pytest.fixture +def default_multi_requests_seq2seq_lm_batch(default_pb_request, mt0_small_tokenizer): + req_0 = copy(default_pb_request) + req_0.id = 1 + req_1 = default_pb_request + req_1.id = 2 + req_1.stopping_parameters.max_new_tokens = 5 + + batch_pb = generate_pb2.Batch(id=0, requests=[req_0, req_1], size=2) + return Seq2SeqLMBatch.from_pb( + batch_pb, mt0_small_tokenizer, torch.float32, torch.device("cpu") + ) + + +def test_batch_from_pb(default_pb_batch, default_seq2seq_lm_batch): + batch = default_seq2seq_lm_batch + sequence_length = len(default_seq2seq_lm_batch.input_ids[0]) + + assert batch.batch_id == default_pb_batch.id + assert batch.requests == default_pb_batch.requests + + assert batch.input_ids.shape == (default_pb_batch.size, sequence_length) + assert batch.input_ids[0][-2] == 4268 + assert batch.input_ids[0][-1] == 1 + assert torch.all(batch.input_ids[0][:-2] == 0) + + assert torch.all(batch.attention_mask[0][-2:] == 1) + assert torch.all(batch.attention_mask[0][:-2] == 0) + + assert len(batch.decoder_input_ids) == default_pb_batch.size + assert batch.decoder_attention_mask is None + assert batch.encoder_last_hidden_state is None + + assert batch.past_key_values is None + + assert batch.input_lengths == [2] + assert batch.decoder_input_lengths == [1] + + assert len(batch) == default_pb_batch.size + assert len(batch.next_token_choosers) == len(batch.stopping_criterias) == len(batch) + + assert batch.max_input_length == batch.input_lengths[0] + assert batch.max_decoder_input_length == batch.decoder_input_lengths[0] + + +def test_batch_concatenate_no_prefill(default_seq2seq_lm_batch): + with pytest.raises(ValueError): + Seq2SeqLMBatch.concatenate([default_seq2seq_lm_batch, default_seq2seq_lm_batch]) + + +def test_seq2seq_lm_batch_type(default_seq2seq_lm): + assert default_seq2seq_lm.batch_type == Seq2SeqLMBatch + + +def test_seq2seq_lm_generate_token(default_seq2seq_lm, default_seq2seq_lm_batch): + sequence_length = len(default_seq2seq_lm_batch.input_ids[0]) + generations, next_batch, _ = default_seq2seq_lm.generate_token( + default_seq2seq_lm_batch + ) + + assert len(generations) == len(next_batch) + assert 
isinstance(next_batch, Seq2SeqLMBatch) + + assert next_batch.input_ids is None + assert torch.equal( + next_batch.attention_mask, default_seq2seq_lm_batch.attention_mask + ) + assert next_batch.input_lengths == default_seq2seq_lm_batch.input_lengths + assert next_batch.max_input_length == default_seq2seq_lm_batch.max_input_length + assert ( + next_batch.next_token_choosers == default_seq2seq_lm_batch.next_token_choosers + ) + assert next_batch.stopping_criterias == default_seq2seq_lm_batch.stopping_criterias + + assert len(next_batch.decoder_input_ids) == len(next_batch) + assert next_batch.all_decoder_input_ids[0][0] == 0 + assert next_batch.all_decoder_input_ids[0][1] == 259 + assert next_batch.decoder_attention_mask is None + assert next_batch.encoder_last_hidden_state.shape == (1, sequence_length, 512) + + assert next_batch.decoder_input_lengths == [2] + assert next_batch.max_decoder_input_length == 2 + + assert next_batch.past_key_values is not None + assert all( + [p[0].shape == (len(next_batch), 6, 1, 64) for p in next_batch.past_key_values] + ) + assert all( + [p[1].shape == (len(next_batch), 6, 1, 64) for p in next_batch.past_key_values] + ) + assert all( + [ + p[2].shape == (len(next_batch), 6, sequence_length, 64) + for p in next_batch.past_key_values + ] + ) + assert all( + [ + p[3].shape == (len(next_batch), 6, sequence_length, 64) + for p in next_batch.past_key_values + ] + ) + assert all([generation.generated_text is None for generation in generations]) + assert all([len(generation.prefill_tokens) == 1 for generation in generations]) + assert all( + [ + token_id.item() == 259 + for generation in generations + for token_id in generation.tokens.token_ids + ] + ) + assert all( + [ + token_text == " " + for generation in generations + for token_text in generation.tokens.texts + ] + ) + assert generations[0].request_id == 0 + + +def test_seq2seq_lm_generate_token_completion( + default_seq2seq_lm, default_seq2seq_lm_batch +): + next_batch = default_seq2seq_lm_batch + for _ in range(6): + generations, next_batch, _ = default_seq2seq_lm.generate_token(next_batch) + assert len(generations) == len(next_batch) + + generations, next_batch, _ = default_seq2seq_lm.generate_token(next_batch) + assert next_batch is None + + assert len(generations) == 1 + assert generations[0].generated_text.text == "a few weeks" + assert generations[0].request_id == default_seq2seq_lm_batch.requests[0].id + assert generations[0].generated_text.generated_tokens == 7 + + +def test_seq2seq_lm_generate_token_completion_multi( + default_seq2seq_lm, default_multi_requests_seq2seq_lm_batch +): + next_batch = default_multi_requests_seq2seq_lm_batch + + for i in range(4): + generations, next_batch, _ = default_seq2seq_lm.generate_token(next_batch) + assert len(generations) == len(next_batch) + + generations, next_batch, _ = default_seq2seq_lm.generate_token(next_batch) + assert next_batch is not None + + assert len(generations) == 2 + assert generations[1].generated_text.text == "a few " + assert ( + generations[1].request_id + == default_multi_requests_seq2seq_lm_batch.requests[1].id + ) + assert generations[1].generated_text.generated_tokens == 5 + + next_batch = next_batch.filter([next_batch.requests[0].id]) + + generations, next_batch, _ = default_seq2seq_lm.generate_token(next_batch) + assert len(generations) == len(next_batch) + + generations, next_batch, _ = default_seq2seq_lm.generate_token(next_batch) + assert next_batch is None + + assert len(generations) == 1 + assert generations[0].generated_text.text == 
"a few weeks" + assert ( + generations[0].request_id + == default_multi_requests_seq2seq_lm_batch.requests[0].id + ) + assert generations[0].generated_text.generated_tokens == 7 + + +def test_batch_concatenate( + default_seq2seq_lm, + default_seq2seq_lm_batch, + default_multi_requests_seq2seq_lm_batch, +): + next_batch_0 = default_seq2seq_lm_batch + _, next_batch_0, _ = default_seq2seq_lm.generate_token(next_batch_0) + _, next_batch_0, _ = default_seq2seq_lm.generate_token(next_batch_0) + + next_batch_1 = default_multi_requests_seq2seq_lm_batch + _, next_batch_1, _ = default_seq2seq_lm.generate_token(next_batch_1) + + # Copy hidden state because it is removed from the concatenated branches + next_batch_0_encoder_last_hidden_state = next_batch_0.encoder_last_hidden_state + next_batch_1_encoder_last_hidden_state = next_batch_1.encoder_last_hidden_state + + # Clone past_key_values before concatenating to compare after, + # because they are removed from the concatenated batches + next_batch_0_past_key_values = [ + [t.clone() for t in layer] for layer in next_batch_0.past_key_values + ] + next_batch_1_past_key_values = [ + [t.clone() for t in layer] for layer in next_batch_1.past_key_values + ] + + next_batch = Seq2SeqLMBatch.concatenate([next_batch_0, next_batch_1]) + + assert next_batch.batch_id == 0 + + assert torch.equal( + next_batch.decoder_input_ids[0], next_batch_0.decoder_input_ids[0] + ) + assert next_batch.all_decoder_input_ids[1][0] == 0 + assert next_batch.all_decoder_input_ids[2][0] == 0 + assert torch.equal( + next_batch.decoder_input_ids[1:, -2:], next_batch_1.decoder_input_ids + ) + + assert torch.all(next_batch.decoder_attention_mask[0, :3] == 1) + assert torch.all(next_batch.decoder_attention_mask[0, 3:] == 0) + assert torch.all(next_batch.decoder_attention_mask[1:, 0] == 0) + assert torch.all(next_batch.decoder_attention_mask[1:, 1:3] == 1) + + assert torch.equal( + next_batch.encoder_last_hidden_state[0], + next_batch_0_encoder_last_hidden_state[0, -2:], + ) + assert torch.equal( + next_batch.encoder_last_hidden_state[1:], + next_batch_1_encoder_last_hidden_state[:, -2:], + ) + + assert next_batch.input_lengths == [2, 2, 2] + assert next_batch.decoder_input_lengths == [3, 2, 2] + assert next_batch.max_input_length == 2 + assert next_batch.max_decoder_input_length == 3 + + assert next_batch.requests[0] == next_batch_0.requests[0] + assert next_batch.requests[1:] == list(next_batch_1.requests) + + assert next_batch.next_token_choosers[0] == next_batch_0.next_token_choosers[0] + assert next_batch.next_token_choosers[1:] == next_batch_1.next_token_choosers + + assert next_batch.stopping_criterias[0] == next_batch_0.stopping_criterias[0] + assert next_batch.stopping_criterias[1:] == next_batch_1.stopping_criterias + + assert next_batch.past_key_values is not None + assert all( + [p[0].shape == (len(next_batch), 6, 2, 64) for p in next_batch.past_key_values] + ) + assert all( + [p[1].shape == (len(next_batch), 6, 2, 64) for p in next_batch.past_key_values] + ) + assert all( + [p[2].shape == (len(next_batch), 6, 2, 64) for p in next_batch.past_key_values] + ) + assert all( + [p[3].shape == (len(next_batch), 6, 2, 64) for p in next_batch.past_key_values] + ) + + for i, past in enumerate(next_batch.past_key_values): + assert torch.equal(next_batch_0_past_key_values[i][0][0, :, -2:, :], past[0][0]) + assert torch.equal( + next_batch_1_past_key_values[i][0][:, :, -1:, :], past[0][1:, :, -1:, :] + ) + + assert torch.equal(next_batch_0_past_key_values[i][1][0, :, -2:, :], past[1][0]) + 
assert torch.equal( + next_batch_1_past_key_values[i][1][:, :, -1:, :], past[1][1:, :, -1:, :] + ) + + assert torch.equal(next_batch_0_past_key_values[i][2][0, :, -2:, :], past[2][0]) + assert torch.equal( + next_batch_1_past_key_values[i][2][:, :, -2:, :], past[2][1:] + ) + + assert torch.equal(next_batch_0_past_key_values[i][3][0, :, -2:, :], past[3][0]) + assert torch.equal( + next_batch_1_past_key_values[i][3][:, :, -2:, :], past[3][1:] + ) + + for _ in range(3): + generations, next_batch, _ = default_seq2seq_lm.generate_token(next_batch) + assert len(generations) == len(next_batch) + + generations, next_batch, _ = default_seq2seq_lm.generate_token(next_batch) + assert next_batch is not None + + assert len(generations) == 3 + assert generations[2].generated_text.text == "a few " + assert ( + generations[2].request_id + == default_multi_requests_seq2seq_lm_batch.requests[1].id + ) + assert generations[2].generated_text.generated_tokens == 5 + + next_batch = next_batch.filter( + [next_batch.requests[0].id, next_batch.requests[1].id] + ) + + generations, next_batch, _ = default_seq2seq_lm.generate_token(next_batch) + assert next_batch is not None + + assert len(generations) == 2 + assert generations[0].generated_text.text == "a few weeks" + assert generations[0].request_id == default_seq2seq_lm_batch.requests[0].id + assert generations[0].generated_text.generated_tokens == 7 + + next_batch = next_batch.filter([next_batch.requests[1].id]) + + generations, next_batch, _ = default_seq2seq_lm.generate_token(next_batch) + assert next_batch is None + + assert len(generations) == 1 + assert generations[0].generated_text.text == "a few weeks" + assert ( + generations[0].request_id + == default_multi_requests_seq2seq_lm_batch.requests[0].id + ) + assert generations[0].generated_text.generated_tokens == 7 diff --git a/backends/gaudi/server/tests/utils/test_adapter.py b/backends/gaudi/server/tests/utils/test_adapter.py new file mode 100644 index 000000000..a27c10551 --- /dev/null +++ b/backends/gaudi/server/tests/utils/test_adapter.py @@ -0,0 +1,247 @@ +import pytest +from unittest.mock import Mock +from text_generation_server.utils.adapter import ( + get_attn_weights, + get_mlp_weights, + parse_lora_adapters, + AdapterInfo, +) + + +def test_parse_lora_adapters_empty(): + assert parse_lora_adapters(None) == [] + assert parse_lora_adapters("") == [] + + +def test_parse_lora_adapters_single(): + result = parse_lora_adapters("adapter1") + assert result == [AdapterInfo(id="adapter1", path=None, revision=None)] + + +def test_parse_lora_adapters_with_path(): + result = parse_lora_adapters("adapter1=path/to/adapter1") + assert result == [ + AdapterInfo(id="adapter1", path="path/to/adapter1", revision=None) + ] + + +def test_parse_lora_adapters_with_path_and_revision(): + result = parse_lora_adapters("adapter1=path/to/adapter1@main") + assert result == [ + AdapterInfo(id="adapter1", path="path/to/adapter1", revision="main") + ] + + +def test_parse_lora_adapters_multiple(): + result = parse_lora_adapters( + "adapter1,adapter2=path/to/adapter2,adapter3=path/to/adapter3@dev" + ) + assert result == [ + AdapterInfo(id="adapter1", path=None, revision=None), + AdapterInfo(id="adapter2", path="path/to/adapter2", revision=None), + AdapterInfo(id="adapter3", path="path/to/adapter3", revision="dev"), + ] + + +def test_parse_lora_adapters_invalid_format(): + try: + parse_lora_adapters("adapter1,invalid=format=test,adapter3") + assert False, "Should have raised ValueError" + except ValueError as e: + assert str(e) == 
"Invalid LoRA adapter format: invalid=format=test" + + +def test_get_attn_weights(): + # create a mock layer + mock_layer = Mock() + mock_layer.self_attn.query_key_value = Mock() + mock_layer.self_attn.o_proj = Mock() + + # call the function + result = get_attn_weights(2, mock_layer) + + # assert the result + expected = { + (2, "q_proj"): ( + "model.layers.2.self_attn.q_proj", + mock_layer.self_attn.query_key_value, + ), + (2, "k_proj"): ( + "model.layers.2.self_attn.k_proj", + mock_layer.self_attn.query_key_value, + ), + (2, "qkv_proj"): ( + "model.layers.2.self_attn.qkv_proj", + mock_layer.self_attn.query_key_value, + ), + (2, "v_proj"): ( + "model.layers.2.self_attn.v_proj", + mock_layer.self_attn.query_key_value, + ), + (2, "o_proj"): ("model.layers.2.self_attn.o_proj", mock_layer.self_attn.o_proj), + } + assert result == expected + + +def test_get_mlp_weights_with_gate_up_proj(): + # create a mock layer with gate_up_proj + mock_layer = Mock() + mock_layer.mlp.gate_up_proj = Mock() + mock_layer.mlp.down_proj = Mock() + + # call the function + result = get_mlp_weights(3, mock_layer) + + # assert the result + expected = { + (3, "gate_proj"): ("model.layers.3.mlp.gate_proj", mock_layer.mlp.gate_up_proj), + (3, "up_proj"): ("model.layers.3.mlp.up_proj", mock_layer.mlp.gate_up_proj), + (3, "down_proj"): ("model.layers.3.mlp.down_proj", mock_layer.mlp.down_proj), + } + assert result == expected + + +def test_get_mlp_weights_without_gate_up_proj(): + # create a mock layer without gate_up_proj + mock_layer = Mock() + mock_layer.mlp = Mock(spec=[]) + + # call the function + result = get_mlp_weights(1, mock_layer) + + # assert the result + assert result == {} + + +@pytest.mark.parametrize("layer_index", [0, 1, 5]) +def test_get_attn_weights_different_layers(layer_index): + mock_layer = Mock() + mock_layer.self_attn.query_key_value = Mock() + mock_layer.self_attn.o_proj = Mock() + + result = get_attn_weights(layer_index, mock_layer) + + for k in ["q", "k", "v"]: + assert (layer_index, f"{k}_proj") in result + assert ( + result[(layer_index, f"{k}_proj")][0] + == f"model.layers.{layer_index}.self_attn.{k}_proj" + ) + + assert (layer_index, "o_proj") in result + assert ( + result[(layer_index, "o_proj")][0] + == f"model.layers.{layer_index}.self_attn.o_proj" + ) + + +@pytest.mark.parametrize("layer_index", [0, 1, 5]) +def test_get_mlp_weights_different_layers(layer_index): + mock_layer = Mock() + mock_layer.mlp.gate_up_proj = Mock() + mock_layer.mlp.down_proj = Mock() + + result = get_mlp_weights(layer_index, mock_layer) + + for k in ["gate", "up", "down"]: + assert (layer_index, f"{k}_proj") in result + assert ( + result[(layer_index, f"{k}_proj")][0] + == f"model.layers.{layer_index}.mlp.{k}_proj" + ) + + +def test_get_attn_weights_llama_compatibility(): + mock_layer = Mock() + mock_layer.self_attn.query_key_value = Mock() + mock_layer.self_attn.o_proj = Mock() + + result = get_attn_weights(2, mock_layer) + + expected = { + (2, "q_proj"): ( + "model.layers.2.self_attn.q_proj", + mock_layer.self_attn.query_key_value, + ), + (2, "k_proj"): ( + "model.layers.2.self_attn.k_proj", + mock_layer.self_attn.query_key_value, + ), + (2, "qkv_proj"): ( + "model.layers.2.self_attn.qkv_proj", + mock_layer.self_attn.query_key_value, + ), + (2, "v_proj"): ( + "model.layers.2.self_attn.v_proj", + mock_layer.self_attn.query_key_value, + ), + (2, "o_proj"): ("model.layers.2.self_attn.o_proj", mock_layer.self_attn.o_proj), + } + assert result == expected + + +def test_get_mlp_weights_llama_compatibility(): + 
mock_layer = Mock() + mock_layer.mlp.gate_up_proj = Mock() + mock_layer.mlp.down_proj = Mock() + + result = get_mlp_weights(3, mock_layer) + + expected = { + (3, "gate_proj"): ("model.layers.3.mlp.gate_proj", mock_layer.mlp.gate_up_proj), + (3, "up_proj"): ("model.layers.3.mlp.up_proj", mock_layer.mlp.gate_up_proj), + (3, "down_proj"): ("model.layers.3.mlp.down_proj", mock_layer.mlp.down_proj), + } + assert result == expected + + +def test_get_attn_weights_gemma_compatibility(): + mock_layer = Mock() + mock_layer.self_attn.query_key_value = Mock() + mock_layer.self_attn.o_proj = Mock() + + result = get_attn_weights(2, mock_layer) + + expected = { + (2, "q_proj"): ( + "model.layers.2.self_attn.q_proj", + mock_layer.self_attn.query_key_value, + ), + (2, "k_proj"): ( + "model.layers.2.self_attn.k_proj", + mock_layer.self_attn.query_key_value, + ), + (2, "qkv_proj"): ( + "model.layers.2.self_attn.qkv_proj", + mock_layer.self_attn.query_key_value, + ), + (2, "v_proj"): ( + "model.layers.2.self_attn.v_proj", + mock_layer.self_attn.query_key_value, + ), + (2, "o_proj"): ("model.layers.2.self_attn.o_proj", mock_layer.self_attn.o_proj), + } + assert result == expected + + +def test_get_mlp_weights_gemma_compatibility(): + mock_layer = Mock() + mock_layer.mlp.gate_proj = Mock() + mock_layer.mlp.up_proj = Mock() + mock_layer.mlp.down_proj = Mock() + + # ensure that the mock_layer.mlp.gate_up_proj attribute does not exist. + # This is necessary because the use of `Mock` automatically creates any + # attributes that are accessed, even if they don't exist in the actual + # implementation. If `gate_up_proj` were created, `get_mlp_weights` might + # follow the wrong execution path and return an incorrect result. + del mock_layer.mlp.gate_up_proj + + result = get_mlp_weights(3, mock_layer) + + expected = { + (3, "gate_proj"): ("model.layers.3.mlp.gate_proj", mock_layer.mlp.gate_proj), + (3, "up_proj"): ("model.layers.3.mlp.up_proj", mock_layer.mlp.up_proj), + (3, "down_proj"): ("model.layers.3.mlp.down_proj", mock_layer.mlp.down_proj), + } + assert result == expected diff --git a/backends/gaudi/server/tests/utils/test_convert.py b/backends/gaudi/server/tests/utils/test_convert.py new file mode 100644 index 000000000..ba6c57023 --- /dev/null +++ b/backends/gaudi/server/tests/utils/test_convert.py @@ -0,0 +1,21 @@ +from text_generation_server.utils.hub import ( + download_weights, + weight_hub_files, + weight_files, +) + +from text_generation_server.utils.convert import convert_files + + +def test_convert_files(): + model_id = "bigscience/bloom-560m" + pt_filenames = weight_hub_files(model_id, extension=".bin") + local_pt_files = download_weights(pt_filenames, model_id) + local_st_files = [ + p.parent / f"{p.stem.lstrip('pytorch_')}.safetensors" for p in local_pt_files + ] + convert_files(local_pt_files, local_st_files, discard_names=[]) + + found_st_files = weight_files(model_id) + + assert all([p in found_st_files for p in local_st_files]) diff --git a/backends/gaudi/server/tests/utils/test_hub.py b/backends/gaudi/server/tests/utils/test_hub.py new file mode 100644 index 000000000..291a41b05 --- /dev/null +++ b/backends/gaudi/server/tests/utils/test_hub.py @@ -0,0 +1,103 @@ +import os +import tempfile + +import pytest + +import huggingface_hub.constants + +import text_generation_server.utils.hub +from text_generation_server.utils.hub import ( + weight_hub_files, + download_weights, + weight_files, + EntryNotFoundError, + LocalEntryNotFoundError, + RevisionNotFoundError, +) + + +@pytest.fixture() +def 
offline(): + current_value = text_generation_server.utils.hub.HF_HUB_OFFLINE + text_generation_server.utils.hub.HF_HUB_OFFLINE = True + yield "offline" + text_generation_server.utils.hub.HF_HUB_OFFLINE = current_value + + +@pytest.fixture() +def fresh_cache(): + with tempfile.TemporaryDirectory() as d: + current_value = huggingface_hub.constants.HUGGINGFACE_HUB_CACHE + huggingface_hub.constants.HUGGINGFACE_HUB_CACHE = d + text_generation_server.utils.hub.HUGGINGFACE_HUB_CACHE = d + os.environ["HUGGINGFACE_HUB_CACHE"] = d + yield + huggingface_hub.constants.HUGGINGFACE_HUB_CACHE = current_value + os.environ["HUGGINGFACE_HUB_CACHE"] = current_value + text_generation_server.utils.hub.HUGGINGFACE_HUB_CACHE = current_value + + +@pytest.fixture() +def prefetched(): + model_id = "bert-base-uncased" + huggingface_hub.snapshot_download( + repo_id=model_id, + revision="main", + local_files_only=False, + repo_type="model", + allow_patterns=["*.safetensors"], + ) + yield model_id + + +def test_weight_hub_files_offline_error(offline, fresh_cache): + # If the model is not prefetched then it will raise an error + with pytest.raises(EntryNotFoundError): + weight_hub_files("gpt2") + + +def test_weight_hub_files_offline_ok(prefetched, offline): + # If the model is prefetched then we should be able to get the weight files from local cache + filenames = weight_hub_files(prefetched) + root = None + assert len(filenames) == 1 + for f in filenames: + curroot, filename = os.path.split(f) + if root is None: + root = curroot + else: + assert root == curroot + assert filename == "model.safetensors" + + +def test_weight_hub_files(): + filenames = weight_hub_files("bigscience/bloom-560m") + assert filenames == ["model.safetensors"] + + +def test_weight_hub_files_llm(): + filenames = weight_hub_files("bigscience/bloom") + assert filenames == [f"model_{i:05d}-of-00072.safetensors" for i in range(1, 73)] + + +def test_weight_hub_files_empty(): + with pytest.raises(EntryNotFoundError): + weight_hub_files("bigscience/bloom", extension=".errors") + + +def test_download_weights(): + model_id = "bigscience/bloom-560m" + filenames = weight_hub_files(model_id) + files = download_weights(filenames, model_id) + local_files = weight_files("bigscience/bloom-560m") + assert files == local_files + + +def test_weight_files_revision_error(): + with pytest.raises(RevisionNotFoundError): + weight_files("bigscience/bloom-560m", revision="error") + + +def test_weight_files_not_cached_error(fresh_cache): + with pytest.raises(LocalEntryNotFoundError): + weight_files("bert-base-uncased") diff --git a/backends/gaudi/server/tests/utils/test_layers.py b/backends/gaudi/server/tests/utils/test_layers.py new file mode 100644 index 000000000..118540eed --- /dev/null +++ b/backends/gaudi/server/tests/utils/test_layers.py @@ -0,0 +1,82 @@ +import torch +from text_generation_server.layers import ( + TensorParallelEmbedding, +) + + +class ProcessGroup: + def __init__(self, rank: int, world_size: int): + self._rank = rank + self.world_size = world_size + + def size(self) -> int: + return self.world_size + + def rank(self) -> int: + return self._rank + + +class Weights: + def __init__(self, rank: int, world_size: int, vocab_size: int, hidden_dim: int): + self.weight = ( + torch.arange(vocab_size * hidden_dim).float().view(vocab_size, hidden_dim) + ) + self.process_group = ProcessGroup(rank, world_size) + + def get_partial_sharded(self, name: str, dim: int): + assert dim == 0 + + rank = self.process_group.rank() + world_size = self.process_group.size() + 
size = self.weight.shape[dim] + + block_size = (size + world_size - 1) // world_size + start = rank * block_size + stop = (rank + 1) * block_size + return self.weight[start:stop] + + def get_shape(self, name: str): + return self.weight.shape + + +def test_weight_hub_files_offline_error(): + + vocab_size = 17 + weights = Weights( + rank=0, + world_size=1, + vocab_size=vocab_size, + hidden_dim=256, + ) + embeddings = TensorParallelEmbedding("", weights) + + input_ids = torch.arange(vocab_size) + output = embeddings.forward(input_ids) + assert embeddings.min_id == 0 + assert embeddings.max_id == 17 + torch.testing.assert_close(output, torch.arange(256 * 17).float().view(17, 256)) + + weights_0_2 = Weights(rank=0, world_size=2, vocab_size=vocab_size, hidden_dim=256) + weights_1_2 = Weights(rank=1, world_size=2, vocab_size=vocab_size, hidden_dim=256) + embeddings_0_2 = TensorParallelEmbedding("", weights_0_2, reduce=False) + assert embeddings_0_2.min_id == 0 + assert embeddings_0_2.max_id == 9 + torch.testing.assert_close( + embeddings_0_2.weight, + torch.cat([torch.arange(9 * 256), torch.zeros(256)], dim=0) + .view(10, 256) + .float(), + ) + embeddings_1_2 = TensorParallelEmbedding("", weights_1_2, reduce=False) + assert embeddings_1_2.min_id == 9 + assert embeddings_1_2.max_id == 17 + torch.testing.assert_close( + embeddings_1_2.weight, + torch.cat([torch.arange(8 * 256) + 9 * 256, torch.zeros(256)], dim=0) + .view(9, 256) + .float(), + ) + output_tp_0 = embeddings_0_2.forward(input_ids) + output_tp_1 = embeddings_1_2.forward(input_ids) + + torch.testing.assert_close(output, output_tp_0 + output_tp_1) diff --git a/backends/gaudi/server/tests/utils/test_tokens.py b/backends/gaudi/server/tests/utils/test_tokens.py new file mode 100644 index 000000000..5db327767 --- /dev/null +++ b/backends/gaudi/server/tests/utils/test_tokens.py @@ -0,0 +1,88 @@ +import torch +from text_generation_server.utils.tokens import ( + StopSequenceCriteria, + StoppingCriteria, + FinishReason, + batch_top_tokens, +) + + +def test_stop_sequence_criteria(): + criteria = StopSequenceCriteria("/test;") + + assert not criteria("/") + assert not criteria("/test") + assert criteria("/test;") + assert not criteria("/test; ") + + +def test_stop_sequence_criteria_escape(): + criteria = StopSequenceCriteria("<|stop|>") + + assert not criteria("<") + assert not criteria("<|stop") + assert criteria("<|stop|>") + assert not criteria("<|stop|> ") + + +def test_stopping_criteria(): + criteria = StoppingCriteria(0, [StopSequenceCriteria("/test;")], max_new_tokens=5) + assert criteria(65827, "/test") == (False, None) + assert criteria(30, ";") == (True, FinishReason.FINISH_REASON_STOP_SEQUENCE) + + +def test_stopping_criteria_eos(): + criteria = StoppingCriteria(0, [StopSequenceCriteria("/test;")], max_new_tokens=5) + assert criteria(1, "") == (False, None) + assert criteria(0, "") == (True, FinishReason.FINISH_REASON_EOS_TOKEN) + + +def test_stopping_criteria_max(): + criteria = StoppingCriteria(0, [StopSequenceCriteria("/test;")], max_new_tokens=5) + assert criteria(1, "") == (False, None) + assert criteria(1, "") == (False, None) + assert criteria(1, "") == (False, None) + assert criteria(1, "") == (False, None) + assert criteria(1, "") == (True, FinishReason.FINISH_REASON_LENGTH) + + +def test_batch_top_tokens(): + top_n_tokens = [0, 2, 3, 4, 5] + top_n_tokens_tensor = torch.tensor(top_n_tokens) + inp_logprobs = torch.tensor([[-1.0, -3.0, -4.0, -2.0, -3.0]] * 5) + accepted_ids = torch.ones_like(top_n_tokens_tensor) + + topn_tok_ids, 
topn_tok_logprobs = batch_top_tokens( + top_n_tokens, top_n_tokens_tensor, inp_logprobs, accepted_ids + ) + + assert topn_tok_ids[0] == [[]] + assert topn_tok_ids[1] == [[0, 3]] + assert topn_tok_ids[2] == [[0, 3, 1, 4]] + assert topn_tok_ids[3] == [[0, 3, 1, 4]] + assert topn_tok_ids[4] == [[0, 3, 1, 4, 2]] + + assert topn_tok_logprobs[0] == [[]] + assert topn_tok_logprobs[1] == [[-1, -2]] + assert topn_tok_logprobs[2] == [[-1, -2, -3, -3]] + assert topn_tok_logprobs[3] == [[-1, -2, -3, -3]] + assert topn_tok_logprobs[4] == [[-1, -2, -3, -3, -4]] + + # Now let's make second member of the batch be speculated + inp_logprobs = torch.tensor([[-1.0, -3.0, -4.0, -2.0, -3.0]] * 5 * 2) + accepted_ids[1] = 2 + topn_tok_ids, topn_tok_logprobs = batch_top_tokens( + top_n_tokens, top_n_tokens_tensor, inp_logprobs, accepted_ids + ) + + assert topn_tok_ids[0] == [[]] + assert topn_tok_ids[1] == [[0, 3], [0, 3]] + assert topn_tok_ids[2] == [[0, 3, 1, 4]] + assert topn_tok_ids[3] == [[0, 3, 1, 4]] + assert topn_tok_ids[4] == [[0, 3, 1, 4, 2]] + + assert topn_tok_logprobs[0] == [[]] + assert topn_tok_logprobs[1] == [[-1, -2], [-1, -2]] + assert topn_tok_logprobs[2] == [[-1, -2, -3, -3]] + assert topn_tok_logprobs[3] == [[-1, -2, -3, -3]] + assert topn_tok_logprobs[4] == [[-1, -2, -3, -3, -4]] diff --git a/backends/gaudi/server/tests/utils/test_watermark.py b/backends/gaudi/server/tests/utils/test_watermark.py new file mode 100644 index 000000000..5d0aaf050 --- /dev/null +++ b/backends/gaudi/server/tests/utils/test_watermark.py @@ -0,0 +1,54 @@ +# test_watermark_logits_processor.py + +import os +import numpy as np +import torch +from text_generation_server.utils.watermark import WatermarkLogitsProcessor + + +GAMMA = os.getenv("WATERMARK_GAMMA", 0.5) +DELTA = os.getenv("WATERMARK_DELTA", 2.0) + + +def test_seed_rng(): + input_ids = [101, 2036, 3731, 102, 2003, 103] + processor = WatermarkLogitsProcessor() + processor._seed_rng(input_ids) + assert isinstance(processor.rng, torch.Generator) + + +def test_get_greenlist_ids(): + input_ids = [101, 2036, 3731, 102, 2003, 103] + processor = WatermarkLogitsProcessor() + result = processor._get_greenlist_ids(input_ids, 10, torch.device("cpu")) + assert max(result) <= 10 + assert len(result) == int(10 * 0.5) + + +def test_calc_greenlist_mask(): + processor = WatermarkLogitsProcessor() + scores = torch.tensor([[0.5, 0.3, 0.2, 0.8], [0.1, 0.2, 0.7, 0.9]]) + greenlist_token_ids = torch.tensor([2, 3]) + result = processor._calc_greenlist_mask(scores, greenlist_token_ids) + assert result.tolist() == [[False, False, False, False], [False, False, True, True]] + assert result.shape == scores.shape + + +def test_bias_greenlist_logits(): + processor = WatermarkLogitsProcessor() + scores = torch.tensor([[0.5, 0.3, 0.2, 0.8], [0.1, 0.2, 0.7, 0.9]]) + green_tokens_mask = torch.tensor( + [[False, False, True, True], [False, False, False, True]] + ) + greenlist_bias = 2.0 + result = processor._bias_greenlist_logits(scores, green_tokens_mask, greenlist_bias) + assert np.allclose(result.tolist(), [[0.5, 0.3, 2.2, 2.8], [0.1, 0.2, 0.7, 2.9]]) + assert result.shape == scores.shape + + +def test_call(): + input_ids = [101, 2036, 3731, 102, 2003, 103] + processor = WatermarkLogitsProcessor() + scores = torch.tensor([[0.5, 0.3, 0.2, 0.8], [0.1, 0.2, 0.7, 0.9]]) + result = processor(input_ids, scores) + assert result.shape == scores.shape diff --git a/backends/gaudi/server/tests/utils/test_weights.py b/backends/gaudi/server/tests/utils/test_weights.py new file mode 100644 index 
000000000..556fcea1e --- /dev/null +++ b/backends/gaudi/server/tests/utils/test_weights.py @@ -0,0 +1,1177 @@ +import pytest +import torch +from text_generation_server.utils.weights import ( + DefaultWeightsLoader, + Weights, + WeightsLoader, +) +from text_generation_server.layers.gptq import GPTQWeight, GPTQWeightsLoader +from text_generation_server.layers.exl2 import Exl2Weight, Exl2WeightsLoader +from text_generation_server.layers.marlin.marlin import ( + MarlinWeight, + MarlinWeightsLoader, +) +from types import SimpleNamespace +from typing import List, Optional, Dict, Union +from pathlib import Path + + +@pytest.fixture +def gptq_weights_loader(): + return GPTQWeightsLoader( + bits=4, + groupsize=-1, + desc_act=False, + quant_method="gptq", + quantize="gptq", + sym=True, + ) + + +@pytest.fixture +def gptq_weights_loader_awq(): + return GPTQWeightsLoader( + bits=4, + groupsize=-1, + desc_act=False, + quant_method="awq", + quantize="awq", + sym=True, + ) + + +@pytest.fixture +def marlin_weights_loader(): + return MarlinWeightsLoader(bits=4, is_marlin_24=False) + + +dummy_file_system = { + "test_weights": { + "layer.0.weight": torch.tensor( + [ + [1, 2], + [3, 4], + ], + dtype=torch.float32, + ), + }, + "test_weights_2": { + "layer.1337.weight": torch.tensor( + [ + [1, 2, 3, 4], + [5, 6, 7, 8], + ], + dtype=torch.float32, + ), + }, + "test_get_weights_col_packed": { + "weight.weight": torch.tensor( + [ + [1, 2], + [3, 4], + [5, 6], + [7, 8], + ], + dtype=torch.float32, + ), + }, + "test_get_multi_weights_col": { + "weight.weight": torch.tensor( + [ + [1, 2], + [3, 4], + [5, 6], + [7, 8], + ], + dtype=torch.float32, + ), + }, + "test_get_weights_row": { + "weight.weight": torch.tensor( + [ + [1, 2], + [3, 4], + [5, 6], + [7, 8], + ], + dtype=torch.float32, + ), + }, + "test_get_weights_col_gptq": { + "weight.qweight": torch.tensor( + [ + [1, 2], + [3, 4], + [5, 6], + [7, 8], + ], + dtype=torch.float32, + ), + "weight.g_idx": torch.tensor([0, 1, 0, 1], dtype=torch.int32), + "weight.qzeros": torch.tensor( + [ + [0, 1], + [1, 0], + ], + dtype=torch.int32, + ), + "weight.scales": torch.tensor( + [ + [100.0, 100.0], + [100.0, 100.0], + ], + dtype=torch.float16, + ), + "gptq_bits": torch.tensor([8], dtype=torch.float32), + "gptq_groupsize": torch.tensor([2], dtype=torch.float32), + }, + "test_get_weights_col_marlin": { + "weight.B": torch.tensor([[1, 2], [3, 4]], dtype=torch.int32), + "weight.s": torch.tensor([[0.5000], [0.2500]], dtype=torch.float16), + }, + "test_get_weights_row_gptq": { + "weight.qweight": torch.tensor( + [ + [1, 2], + [3, 4], + [5, 6], + [7, 8], + ], + dtype=torch.int32, + ), + "weight.g_idx": torch.tensor([0, 1, 0, 1], dtype=torch.int32), + "weight.qzeros": torch.tensor( + [ + [0, 1], + [1, 0], + ], + dtype=torch.int32, + ), + "weight.scales": torch.tensor( + [ + [100.0, 100.0], + [100.0, 100.0], + ], + dtype=torch.float16, + ), + "gptq_bits": torch.tensor([8], dtype=torch.float32), + "gptq_groupsize": torch.tensor([2], dtype=torch.float32), + }, + "test_get_multi_weights_col_gptq": { + "weight.qweight": torch.tensor( + [ + [1, 2], + [3, 4], + [5, 6], + [7, 8], + ], + dtype=torch.int32, + ), + "weight.g_idx": torch.tensor([0, 1, 0, 1], dtype=torch.int32), + "weight.qzeros": torch.tensor( + [ + [0, 1], + [1, 0], + ], + dtype=torch.int32, + ), + "weight.scales": torch.tensor( + [ + [100.0, 100.0], + [100.0, 100.0], + ], + dtype=torch.float16, + ), + "gptq_bits": torch.tensor([8], dtype=torch.float32), + "gptq_groupsize": torch.tensor([2], dtype=torch.float32), + }, + 
"test_get_weights_col_packed_gptq": { + "weight.qweight": torch.tensor( + [ + [1, 2], + [3, 4], + [5, 6], + [7, 8], + ], + dtype=torch.int32, + ), + "weight.g_idx": torch.tensor([0, 1, 0, 1], dtype=torch.int32), + "weight.qzeros": torch.tensor( + [ + [0, 1], + [1, 0], + ], + dtype=torch.int32, + ), + "weight.scales": torch.tensor( + [ + [100.0, 100.0], + [100.0, 100.0], + ], + dtype=torch.float16, + ), + "gptq_bits": torch.tensor([8], dtype=torch.float32), + "gptq_groupsize": torch.tensor([2], dtype=torch.float32), + }, + "test_get_weights_col_packed_exl2": { + "weight.q_weight": torch.tensor( + [ + [1, 2], + [3, 4], + [5, 6], + [7, 8], + ], + dtype=torch.int32, + ), + "weight.q_scale": torch.tensor([8], dtype=torch.int32), + "weight.q_invperm": torch.tensor([1, 0, 3, 2], dtype=torch.int32), + "weight.q_scale_max": torch.tensor([100], dtype=torch.float16), + "weight.q_groups": torch.tensor([4], dtype=torch.int16), + }, + "test_get_weights_row_exl2": { + "weight.q_weight": torch.tensor( + [ + [1, 2], + [3, 4], + [5, 6], + [7, 8], + ], + dtype=torch.int32, + ), + "weight.q_scale": torch.tensor([8], dtype=torch.int32), + "weight.q_invperm": torch.tensor([1, 0, 3, 2], dtype=torch.int32), + "weight.q_scale_max": torch.tensor([100], dtype=torch.float16), + "weight.q_groups": torch.tensor([4], dtype=torch.int16), + }, + "test_get_multi_weights_col_exl2": { + "weight.q_weight": torch.tensor( + [ + [1, 2], + [3, 4], + [5, 6], + [7, 8], + ], + dtype=torch.int32, + ), + "weight.q_scale": torch.tensor([8], dtype=torch.int32), + "weight.q_invperm": torch.tensor([1, 0, 3, 2], dtype=torch.int32), + "weight.q_scale_max": torch.tensor([100], dtype=torch.float16), + "weight.q_groups": torch.tensor([4], dtype=torch.int16), + }, + "test_get_weights_col_exl2": { + "weight.q_weight": torch.tensor( + [ + [1, 2], + [3, 4], + [5, 6], + [7, 8], + ], + dtype=torch.int32, + ), + "weight.q_scale": torch.tensor([8], dtype=torch.int32), + "weight.q_invperm": torch.tensor([1, 0, 3, 2], dtype=torch.int32), + "weight.q_scale_max": torch.tensor([100], dtype=torch.float16), + "weight.q_groups": torch.tensor([4], dtype=torch.int16), + }, + "test_get_weights_row_marlin": { + "weight.B": torch.tensor([[1, 2], [3, 4]], dtype=torch.int32), + "weight.s": torch.tensor([[0.5], [0.25]], dtype=torch.float16), + }, + "test_get_multi_weights_col_marlin": { + "weight.B": torch.tensor([[1, 2], [3, 4]], dtype=torch.int32), + "weight.s": torch.tensor([[0.5], [0.25]], dtype=torch.float16), + }, + "test_get_weights_col_packed_marlin": { + "weight.B": torch.tensor([[1, 2], [3, 4]], dtype=torch.int32), + "weight.s": torch.tensor([[0.5], [0.25]], dtype=torch.float16), + }, +} + + +class MockSlice: + def __init__(self, tensor): + self.tensor = tensor + + def get_shape(self): + return self.tensor.shape + + def __getitem__(self, idx): + return self.tensor[idx] + + +def mock_get_slice(tensor_name, filename): + tensor = dummy_file_system[filename][tensor_name] + return MockSlice(tensor) + + +def mock_handle(filename, device, dtype): + return SimpleNamespace( + get_slice=lambda tensor_name: mock_get_slice(tensor_name, filename) + ) + + +class MockSafeOpen: + def __init__(self, filename, framework, dummy_fs): + self.filename = filename + self.framework = framework + self.dummy_fs = dummy_fs + + def keys(self): + return list(self.dummy_fs[self.filename].keys()) + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + pass + + +class MockWeights(Weights): + def __init__( + self, + filenames: List[Union[Path, str]], + 
device, + dtype, + process_group, + dummy_fs, + aliases: Optional[Dict[str, List[str]]] = None, + prefix: Optional[str] = None, + weights_loader: Optional[WeightsLoader] = None, + ): + routing = {} + self.dummy_fs = dummy_fs + for filename in filenames: + with MockSafeOpen(filename, framework="pytorch", dummy_fs=dummy_fs) as f: + for k in f.keys(): + if k in routing: + raise RuntimeError( + f"Key {k} was found in multiple files: {filename} and {routing[k]}" + ) + routing[k] = filename + if aliases is None: + aliases = {} + self.aliases = aliases + self.routing = routing + self.device = device + self.dtype = dtype + self.process_group = process_group + self.prefix = prefix + self.weights_loader = ( + # We don't need to get linear layers, so just wrap raw tensors. + DefaultWeightsLoader(lambda x: x) + if weights_loader is None + else weights_loader + ) + self._handles = {} + + def _get_handle(self, filename: Union[Path, str]): + if filename in self._handles: + return self._handles[filename] + else: + handle = mock_handle(filename, self.device, self.dtype) + self._handles[filename] = handle + return handle + + def get_shape(self, tensor_name: str): + filename, _ = self.get_filename(tensor_name) + handle = self._get_handle(filename) + return handle.get_slice(tensor_name).get_shape() + + def get_tensor(self, tensor_name: str): + filename, _ = self.get_filename(tensor_name) + handle = self._get_handle(filename) + return handle.get_slice(tensor_name).tensor + + +dummy_process_group = SimpleNamespace(rank=lambda: 0, size=lambda: 1) + + +def test_weights(): + weights = MockWeights( + [ + "test_weights", + "test_weights_2", + ], + device="cpu", + dtype=torch.float32, + process_group=dummy_process_group, + dummy_fs=dummy_file_system, + ) + assert weights.get_shape("layer.0.weight") == (2, 2) + assert weights.get_tensor("layer.1337.weight").shape == (2, 4) + + +def test_get_tensor(): + weights = MockWeights( + [ + "test_weights", + "test_weights_2", + ], + device="cpu", + dtype=torch.float32, + process_group=dummy_process_group, + dummy_fs=dummy_file_system, + ) + assert torch.allclose( + weights.get_tensor("layer.0.weight"), + torch.tensor( + [ + [1, 2], + [3, 4], + ], + dtype=torch.float32, + ), + ) + assert torch.allclose( + weights.get_tensor("layer.1337.weight"), + torch.tensor( + [ + [1, 2, 3, 4], + [5, 6, 7, 8], + ], + dtype=torch.float32, + ), + ) + + +def test_get_weights_col_packed(): + + weights = MockWeights( + [ + "test_get_weights_col_packed", + ], + device="cpu", + dtype=torch.float32, + process_group=dummy_process_group, + dummy_fs=dummy_file_system, + ) + + prefix = "weight" + block_sizes = 1 + + w = weights.get_weights_col_packed( + prefix=prefix, + block_sizes=block_sizes, + ) + + assert torch.allclose( + w, + torch.tensor( + [ + [1, 2], + [3, 4], + [5, 6], + [7, 8], + ], + dtype=torch.float32, + ), + ) + + +def test_get_weights_col_packed_block_size(): + + weights = MockWeights( + [ + "test_get_weights_col_packed", + ], + device="cpu", + dtype=torch.float32, + process_group=dummy_process_group, + dummy_fs=dummy_file_system, + ) + + prefix = "weight" + block_sizes = 2 + + w = weights.get_weights_col_packed( + prefix=prefix, + block_sizes=block_sizes, + ) + + assert torch.allclose( + w, + torch.tensor( + [ + [1, 2], + [3, 4], + [5, 6], + [7, 8], + ], + dtype=torch.float32, + ), + ) + + +def test_get_weights_col_packed_block_size_arr(): + + weights = MockWeights( + [ + "test_get_weights_col_packed", + ], + device="cpu", + dtype=torch.float32, + process_group=dummy_process_group, + 
dummy_fs=dummy_file_system, + ) + + prefix = "weight" + block_sizes = [1, 1] + + w = weights.get_weights_col_packed( + prefix=prefix, + block_sizes=block_sizes, + ) + + assert torch.allclose( + w, + torch.tensor( + [ + [1, 2], + [3, 4], + [5, 6], + [7, 8], + ], + dtype=torch.float32, + ), + ) + + +def test_get_multi_weights_col(): + weights = MockWeights( + [ + "test_get_multi_weights_col", + ], + device="cpu", + dtype=torch.float32, + process_group=dummy_process_group, + dummy_fs=dummy_file_system, + ) + + prefixes = ["weight", "weight"] + + w = weights.get_multi_weights_col( + prefixes=prefixes, + dim=0, + ) + + assert torch.allclose( + w, + torch.tensor( + [ + [1, 2], + [3, 4], + [5, 6], + [7, 8], + [1, 2], + [3, 4], + [5, 6], + [7, 8], + ], + dtype=torch.float32, + ), + ) + + +def test_get_weights_row(): + weights = MockWeights( + [ + "test_get_weights_row", + ], + device="cpu", + dtype=torch.float32, + process_group=dummy_process_group, + dummy_fs=dummy_file_system, + ) + + prefix = "weight" + + w = weights.get_weights_row( + prefix=prefix, + ) + + assert torch.allclose( + w, + torch.tensor( + [[1.0, 2.0], [3.0, 4.0], [5.0, 6.0], [7.0, 8.0]], + dtype=torch.float32, + ), + ) + + +# test_get_weights_col + + +def test_get_weights_col_awq(gptq_weights_loader_awq): + weights = MockWeights( + [ + "test_get_weights_col_gptq", + ], + device="cpu", + dtype=torch.float32, + process_group=dummy_process_group, + dummy_fs=dummy_file_system, + weights_loader=gptq_weights_loader_awq, + ) + + prefix = "weight" + + w = weights.get_weights_col( + prefix=prefix, + ) + + expected_weight = GPTQWeight( + qweight=torch.tensor([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0], [7.0, 8.0]]), + qzeros=torch.tensor([[0, 1], [1, 0]], dtype=torch.int32), + scales=torch.tensor( + [[100.0, 100.0], [100.0, 100.0]], + dtype=torch.float16, + ), + g_idx=None, + bits=8.0, + groupsize=2.0, + use_awq_kernel=True, + use_exllama=False, + ) + + assert torch.allclose(w.qweight, expected_weight.qweight), "qweight mismatch" + assert torch.allclose(w.qzeros, expected_weight.qzeros), "qzeros mismatch" + assert torch.allclose(w.scales, expected_weight.scales), "scales mismatch" + assert w.g_idx == expected_weight.g_idx, "g_idx mismatch" + assert w.bits == expected_weight.bits, "bits mismatch" + assert w.groupsize == expected_weight.groupsize, "groupsize mismatch" + assert w.use_awq_kernel == expected_weight.use_awq_kernel, "use_awq_kernel mismatch" + assert w.use_exllama == expected_weight.use_exllama, "use_exllama mismatch" + + +def test_get_weights_col_gtpq(gptq_weights_loader): + weights = MockWeights( + [ + "test_get_weights_col_gptq", + ], + device="cpu", + dtype=torch.float32, + process_group=dummy_process_group, + dummy_fs=dummy_file_system, + weights_loader=gptq_weights_loader, + ) + + prefix = "weight" + + w = weights.get_weights_col( + prefix=prefix, + ) + + expected_weight = GPTQWeight( + qweight=torch.tensor([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0], [7.0, 8.0]]), + qzeros=torch.tensor([[0, 1], [1, 0]], dtype=torch.int32), + scales=torch.tensor([[100.0, 100.0], [100.0, 100.0]], dtype=torch.float16), + g_idx=torch.tensor([0, 1, 0, 1], dtype=torch.int32), + bits=8.0, + groupsize=2.0, + use_awq_kernel=False, + use_exllama=False, + ) + + assert torch.allclose(w.qweight, expected_weight.qweight), "qweight mismatch" + assert torch.allclose(w.qzeros, expected_weight.qzeros), "qzeros mismatch" + assert torch.allclose(w.scales, expected_weight.scales), "scales mismatch" + assert torch.allclose(w.g_idx, expected_weight.g_idx), "g_idx mismatch" + 
assert w.bits == expected_weight.bits, "bits mismatch" + assert w.groupsize == expected_weight.groupsize, "groupsize mismatch" + assert w.use_awq_kernel == expected_weight.use_awq_kernel, "use_awq_kernel mismatch" + assert w.use_exllama == expected_weight.use_exllama, "use_exllama mismatch" + + +def test_get_weights_col_exl2(): + weights = MockWeights( + [ + "test_get_weights_col_exl2", + ], + device="cpu", + dtype=torch.float32, + process_group=dummy_process_group, + dummy_fs=dummy_file_system, + weights_loader=Exl2WeightsLoader(), + ) + + prefix = "weight" + + w = weights.get_weights_col( + prefix=prefix, + ) + + scaled_scale_max = 0.3906 * 256 + expected_weight = Exl2Weight( + q_weight=torch.tensor([[1, 2], [3, 4], [5, 6], [7, 8]], dtype=torch.int32), + q_scale=torch.tensor([8], dtype=torch.int32), + q_invperm=torch.tensor([1, 0, 3, 2], dtype=torch.int16), + q_scale_max=torch.tensor([scaled_scale_max], dtype=torch.float16), + q_groups=torch.tensor([4], dtype=torch.int16), + ) + + assert torch.allclose(w.q_weight, expected_weight.q_weight), "q_weight mismatch" + assert torch.allclose(w.q_scale, expected_weight.q_scale), "q_scale mismatch" + assert torch.allclose(w.q_invperm, expected_weight.q_invperm), "q_invperm mismatch" + assert torch.allclose( + w.q_scale_max, expected_weight.q_scale_max + ), "q_scale_max mismatch" + assert torch.allclose(w.q_groups, expected_weight.q_groups), "q_groups mismatch" + + +def test_get_weights_col_marlin(marlin_weights_loader): + weights = MockWeights( + [ + "test_get_weights_col_marlin", + ], + device="cpu", + dtype=torch.float16, + process_group=dummy_process_group, + dummy_fs=dummy_file_system, + weights_loader=marlin_weights_loader, + ) + + prefix = "weight" + + w = weights.get_weights_col( + prefix=prefix, + ) + + expected_weight = MarlinWeight( + B=torch.tensor([[1, 2], [3, 4]], dtype=torch.int32), + s=torch.tensor([[0.5000], [0.2500]], dtype=torch.float16), + ) + + assert torch.allclose(w.B, expected_weight.B), "B mismatch" + assert torch.allclose(w.s, expected_weight.s), "s mismatch" + + +# test_get_weights_col_packed + + +def test_get_weights_col_packed_awq(gptq_weights_loader_awq): + weights = MockWeights( + [ + "test_get_weights_col_packed_gptq", + ], + device="cpu", + dtype=torch.float32, + process_group=dummy_process_group, + dummy_fs=dummy_file_system, + weights_loader=gptq_weights_loader_awq, + ) + + prefix = "weight" + block_sizes = 1 + + w = weights.get_weights_col_packed( + prefix=prefix, + block_sizes=block_sizes, + ) + + expected_weight = GPTQWeight( + qweight=torch.tensor([[1, 2], [3, 4], [5, 6], [7, 8]], dtype=torch.int32), + qzeros=torch.tensor([[0, 1], [1, 0]], dtype=torch.int32), + scales=torch.tensor([[100.0, 100.0], [100.0, 100.0]], dtype=torch.float16), + g_idx=None, + bits=8.0, + groupsize=2.0, + use_awq_kernel=True, + use_exllama=False, + ) + + assert torch.allclose(w.qweight, expected_weight.qweight), "qweight mismatch" + assert torch.allclose(w.qzeros, expected_weight.qzeros), "qzeros mismatch" + assert torch.allclose(w.scales, expected_weight.scales), "scales mismatch" + assert w.g_idx == expected_weight.g_idx, "g_idx mismatch" + assert w.bits == expected_weight.bits, "bits mismatch" + assert w.groupsize == expected_weight.groupsize, "groupsize mismatch" + assert w.use_awq_kernel == expected_weight.use_awq_kernel, "use_awq_kernel mismatch" + assert w.use_exllama == expected_weight.use_exllama, "use_exllama mismatch" + + +@pytest.mark.skip(reason="Review expected functionality") +def test_get_weights_col_packed_exl2(): + 
weights = MockWeights( + [ + "test_get_weights_col_packed_exl2", + ], + device="cpu", + dtype=torch.float32, + process_group=dummy_process_group, + dummy_fs=dummy_file_system, + weights_loader=Exl2WeightsLoader(), + ) + + prefix = "weight" + block_sizes = 1 + + w = weights.get_weights_col_packed( + prefix=prefix, + block_sizes=block_sizes, + ) + + scaled_scale_max = 0.3906 * 256 + expected_weight = Exl2Weight( + q_weight=torch.tensor([[1, 2], [3, 4], [5, 6], [7, 8]], dtype=torch.int32), + q_scale=torch.tensor([8], dtype=torch.int32), + q_invperm=torch.tensor([1], dtype=torch.int16), + q_scale_max=torch.tensor([scaled_scale_max], dtype=torch.float16), + q_groups=torch.tensor([4], dtype=torch.int16), + ) + + assert torch.allclose(w.q_weight, expected_weight.q_weight), "q_weight mismatch" + assert torch.allclose(w.q_scale, expected_weight.q_scale), "q_scale mismatch" + assert torch.allclose(w.q_invperm, expected_weight.q_invperm), "q_invperm mismatch" + assert torch.allclose( + w.q_scale_max, expected_weight.q_scale_max + ), "q_scale_max mismatch" + assert torch.allclose(w.q_groups, expected_weight.q_groups), "q_groups mismatch" + + +def test_get_weights_col_packed_gptq(gptq_weights_loader): + weights = MockWeights( + [ + "test_get_weights_col_packed_gptq", + ], + device="cpu", + dtype=torch.float32, + process_group=dummy_process_group, + dummy_fs=dummy_file_system, + weights_loader=gptq_weights_loader, + ) + + prefixes = ["weight"] + + w = weights.get_multi_weights_col( + prefixes=prefixes, + dim=0, + ) + + expected_weight = GPTQWeight( + qweight=torch.tensor([[1, 2], [3, 4], [5, 6], [7, 8]], dtype=torch.int32), + qzeros=torch.tensor([[0, 1], [1, 0]], dtype=torch.int32), + scales=torch.tensor([[100.0, 100.0], [100.0, 100.0]], dtype=torch.float16), + g_idx=torch.tensor([0, 1, 0, 1], dtype=torch.int32), + bits=8.0, + groupsize=2.0, + use_awq_kernel=False, + use_exllama=False, + ) + + assert torch.allclose(w.qweight, expected_weight.qweight), "qweight mismatch" + assert torch.allclose(w.qzeros, expected_weight.qzeros), "qzeros mismatch" + assert torch.allclose(w.scales, expected_weight.scales), "scales mismatch" + assert torch.allclose(w.g_idx, expected_weight.g_idx), "g_idx mismatch" + assert w.bits == expected_weight.bits, "bits mismatch" + assert w.groupsize == expected_weight.groupsize, "groupsize mismatch" + assert w.use_awq_kernel == expected_weight.use_awq_kernel, "use_awq_kernel mismatch" + assert w.use_exllama == expected_weight.use_exllama, "use_exllama mismatch" + + +def test_get_weights_col_packed_marlin(marlin_weights_loader): + weights = MockWeights( + [ + "test_get_weights_col_packed_marlin", + ], + device="cpu", + dtype=torch.float16, + process_group=dummy_process_group, + dummy_fs=dummy_file_system, + weights_loader=marlin_weights_loader, + ) + + prefix = "weight" + + w = weights.get_multi_weights_col( + prefixes=[prefix], + dim=0, + ) + + expected_weight = MarlinWeight( + B=torch.tensor([[1, 2], [3, 4]], dtype=torch.int32), + s=torch.tensor([[0.5000], [0.2500]], dtype=torch.float16), + ) + + print(expected_weight) + + assert torch.allclose(w.B, expected_weight.B), "B mismatch" + assert torch.allclose(w.s, expected_weight.s), "s mismatch" + + +# test_get_multi_weights_col + + +def test_get_multi_weights_col_awq(gptq_weights_loader_awq): + weights = MockWeights( + [ + "test_get_multi_weights_col_gptq", + ], + device="cpu", + dtype=torch.float32, + process_group=dummy_process_group, + dummy_fs=dummy_file_system, + weights_loader=gptq_weights_loader_awq, + ) + + prefixes = 
["weight"] + + w = weights.get_multi_weights_col( + prefixes=prefixes, + dim=0, + ) + + expected_weight = GPTQWeight( + qweight=torch.tensor([[1, 2], [3, 4], [5, 6], [7, 8]], dtype=torch.int32), + qzeros=torch.tensor([[0, 1], [1, 0]], dtype=torch.int32), + scales=torch.tensor([[100.0, 100.0], [100.0, 100.0]], dtype=torch.float16), + g_idx=None, + bits=8.0, + groupsize=2.0, + use_awq_kernel=True, + use_exllama=False, + ) + + assert torch.allclose(w.qweight, expected_weight.qweight), "qweight mismatch" + assert torch.allclose(w.qzeros, expected_weight.qzeros), "qzeros mismatch" + assert torch.allclose(w.scales, expected_weight.scales), "scales mismatch" + assert w.g_idx == expected_weight.g_idx, "g_idx mismatch" + assert w.bits == expected_weight.bits, "bits mismatch" + assert w.groupsize == expected_weight.groupsize, "groupsize mismatch" + assert w.use_awq_kernel == expected_weight.use_awq_kernel, "use_awq_kernel mismatch" + assert w.use_exllama == expected_weight.use_exllama, "use_exllama mismatch" + + +def test_get_multi_weights_col_exl2(): + weights = MockWeights( + [ + "test_get_multi_weights_col_exl2", + ], + device="cpu", + dtype=torch.float32, + process_group=dummy_process_group, + dummy_fs=dummy_file_system, + weights_loader=Exl2WeightsLoader(), + ) + + prefix = "weight" + + try: + weights.get_multi_weights_col( + prefixes=[prefix], + dim=0, + ) + except ValueError as e: + assert e.args[0] == "get_multi_weights_col is not supported for exl2" + + +def test_get_multi_weights_col_gptq(gptq_weights_loader): + weights = MockWeights( + [ + "test_get_multi_weights_col_gptq", + ], + device="cpu", + dtype=torch.float32, + process_group=dummy_process_group, + dummy_fs=dummy_file_system, + weights_loader=gptq_weights_loader, + ) + + prefixes = ["weight"] + + w = weights.get_multi_weights_col( + prefixes=prefixes, + dim=0, + ) + + expected_weight = GPTQWeight( + qweight=torch.tensor([[1, 2], [3, 4], [5, 6], [7, 8]], dtype=torch.int32), + qzeros=torch.tensor([[0, 1], [1, 0]], dtype=torch.int32), + scales=torch.tensor([[100.0, 100.0], [100.0, 100.0]], dtype=torch.float16), + g_idx=torch.tensor([0, 1, 0, 1], dtype=torch.int32), + bits=8.0, + groupsize=2.0, + use_awq_kernel=False, + use_exllama=False, + ) + + assert torch.allclose(w.qweight, expected_weight.qweight), "qweight mismatch" + assert torch.allclose(w.qzeros, expected_weight.qzeros), "qzeros mismatch" + assert torch.allclose(w.scales, expected_weight.scales), "scales mismatch" + assert torch.allclose(w.g_idx, expected_weight.g_idx), "g_idx mismatch" + assert w.bits == expected_weight.bits, "bits mismatch" + assert w.groupsize == expected_weight.groupsize, "groupsize mismatch" + assert w.use_awq_kernel == expected_weight.use_awq_kernel, "use_awq_kernel mismatch" + assert w.use_exllama == expected_weight.use_exllama, "use_exllama mismatch" + + +def test_get_multi_weights_col_marlin(marlin_weights_loader): + weights = MockWeights( + [ + "test_get_multi_weights_col_marlin", + ], + device="cpu", + dtype=torch.float16, + process_group=dummy_process_group, + dummy_fs=dummy_file_system, + weights_loader=marlin_weights_loader, + ) + + prefix = "weight" + + w = weights.get_multi_weights_col( + prefixes=[prefix], + dim=0, + ) + + expected_weight = MarlinWeight( + B=torch.tensor([[1, 2], [3, 4]], dtype=torch.int32), + s=torch.tensor([[0.5000], [0.2500]], dtype=torch.float16), + ) + + assert torch.allclose(w.B, expected_weight.B), "B mismatch" + assert torch.allclose(w.s, expected_weight.s), "s mismatch" + + +# test_get_weights_row + + +def 
test_get_weights_row_awq(gptq_weights_loader_awq): + weights = MockWeights( + [ + "test_get_weights_row_gptq", + ], + device="cpu", + dtype=torch.float32, + process_group=dummy_process_group, + dummy_fs=dummy_file_system, + weights_loader=gptq_weights_loader_awq, + ) + + prefix = "weight" + + w = weights.get_weights_row( + prefix=prefix, + ) + + expected_weight = GPTQWeight( + qweight=torch.tensor([[1, 2], [3, 4], [5, 6], [7, 8]], dtype=torch.int32), + qzeros=torch.tensor([[0, 1], [1, 0]], dtype=torch.int32), + scales=torch.tensor([[100.0, 100.0], [100.0, 100.0]], dtype=torch.float16), + g_idx=None, + bits=8.0, + groupsize=2.0, + use_awq_kernel=True, + use_exllama=False, + ) + + assert torch.allclose(w.qweight, expected_weight.qweight), "qweight mismatch" + assert torch.allclose(w.qzeros, expected_weight.qzeros), "qzeros mismatch" + assert torch.allclose(w.scales, expected_weight.scales), "scales mismatch" + assert w.g_idx == expected_weight.g_idx, "g_idx mismatch" + assert w.bits == expected_weight.bits, "bits mismatch" + assert w.groupsize == expected_weight.groupsize, "groupsize mismatch" + assert w.use_awq_kernel == expected_weight.use_awq_kernel, "use_awq_kernel mismatch" + assert w.use_exllama == expected_weight.use_exllama, "use_exllama mismatch" + + +def test_get_weights_row_exl2(): + weights = MockWeights( + [ + "test_get_weights_row_exl2", + ], + device="cpu", + dtype=torch.float32, + process_group=dummy_process_group, + dummy_fs=dummy_file_system, + weights_loader=Exl2WeightsLoader(), + ) + + prefix = "weight" + + w = weights.get_weights_row( + prefix=prefix, + ) + print(w) + + scaled_scale_max = 0.3906 * 256 + expected_weight = Exl2Weight( + q_weight=torch.tensor([[1, 2], [3, 4], [5, 6], [7, 8]], dtype=torch.int32), + q_scale=torch.tensor([8], dtype=torch.int32), + q_invperm=torch.tensor([1, 0, 3, 2], dtype=torch.int16), + q_scale_max=torch.tensor([scaled_scale_max], dtype=torch.float16), + q_groups=torch.tensor([4], dtype=torch.int16), + ) + + assert torch.allclose(w.q_weight, expected_weight.q_weight), "q_weight mismatch" + assert torch.allclose(w.q_scale, expected_weight.q_scale), "q_scale mismatch" + assert torch.allclose(w.q_invperm, expected_weight.q_invperm), "q_invperm mismatch" + assert torch.allclose( + w.q_scale_max, expected_weight.q_scale_max + ), "q_scale_max mismatch" + assert torch.allclose(w.q_groups, expected_weight.q_groups), "q_groups mismatch" + + +def test_get_weights_row_gptq(gptq_weights_loader): + weights = MockWeights( + [ + "test_get_weights_row_gptq", + ], + device="cpu", + dtype=torch.float32, + process_group=dummy_process_group, + dummy_fs=dummy_file_system, + weights_loader=gptq_weights_loader, + ) + + prefix = "weight" + + w = weights.get_weights_row( + prefix=prefix, + ) + + expected_weight = GPTQWeight( + qweight=torch.tensor([[1, 2], [3, 4], [5, 6], [7, 8]], dtype=torch.int32), + qzeros=torch.tensor([[0, 1], [1, 0]], dtype=torch.int32), + scales=torch.tensor([[100.0, 100.0], [100.0, 100.0]], dtype=torch.float16), + g_idx=torch.tensor([0, 1, 0, 1], dtype=torch.int32), + bits=8.0, + groupsize=2.0, + use_awq_kernel=False, + use_exllama=False, + ) + + assert torch.allclose(w.qweight, expected_weight.qweight), "qweight mismatch" + assert torch.allclose(w.qzeros, expected_weight.qzeros), "qzeros mismatch" + assert torch.allclose(w.scales, expected_weight.scales), "scales mismatch" + assert torch.allclose(w.g_idx, expected_weight.g_idx), "g_idx mismatch" + assert w.bits == expected_weight.bits, "bits mismatch" + assert w.groupsize == 
expected_weight.groupsize, "groupsize mismatch" + assert w.use_awq_kernel == expected_weight.use_awq_kernel, "use_awq_kernel mismatch" + assert w.use_exllama == expected_weight.use_exllama, "use_exllama mismatch" + + +def test_get_weights_row_marlin(marlin_weights_loader): + weights = MockWeights( + [ + "test_get_weights_row_marlin", + ], + device="cpu", + dtype=torch.float16, + process_group=dummy_process_group, + dummy_fs=dummy_file_system, + weights_loader=marlin_weights_loader, + ) + + prefix = "weight" + + w = weights.get_weights_row( + prefix=prefix, + ) + + expected_weight = MarlinWeight( + B=torch.tensor([[1, 2], [3, 4]], dtype=torch.int32), + s=torch.tensor([[0.5000], [0.2500]], dtype=torch.float16), + ) + + assert torch.allclose(w.B, expected_weight.B), "B mismatch" + assert torch.allclose(w.s, expected_weight.s), "s mismatch" diff --git a/backends/gaudi/server/text_generation_server/__init__.py b/backends/gaudi/server/text_generation_server/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/backends/gaudi/server/text_generation_server/adapters/__init__.py b/backends/gaudi/server/text_generation_server/adapters/__init__.py new file mode 100644 index 000000000..8697cb9ee --- /dev/null +++ b/backends/gaudi/server/text_generation_server/adapters/__init__.py @@ -0,0 +1,13 @@ +# Origin: https://github.com/predibase/lorax +# Path: lorax/server/lorax_server/adapters/__init__.py +# License: Apache License Version 2.0, January 2004 + +from text_generation_server.adapters.weights import ( + AdapterBatchData, + AdapterBatchMetadata, +) + +__all__ = [ + "AdapterBatchData", + "AdapterBatchMetadata", +] diff --git a/backends/gaudi/server/text_generation_server/adapters/config.py b/backends/gaudi/server/text_generation_server/adapters/config.py new file mode 100644 index 000000000..b7e270900 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/adapters/config.py @@ -0,0 +1,30 @@ +# Origin: https://github.com/predibase/lorax +# Path: lorax/server/lorax_server/adapters/config.py +# License: Apache License Version 2.0, January 2004 + +from abc import ABC, abstractmethod +from dataclasses import dataclass +from typing import Dict, Set, Tuple + +import torch + +from text_generation_server.adapters.weights import AdapterWeights + + +@dataclass +class ModuleMap: + module_name: str + module_weights: Dict[str, Tuple[torch.Tensor, str]] + + +@dataclass +class AdapterConfig(ABC): + base_model_name_or_path: str + + @abstractmethod + def map_weights_for_model( + self, + adapter_weights: Dict[int, AdapterWeights], + weight_names: Tuple[str], + ) -> Tuple[ModuleMap, Set[str]]: + pass diff --git a/backends/gaudi/server/text_generation_server/adapters/lora.py b/backends/gaudi/server/text_generation_server/adapters/lora.py new file mode 100644 index 000000000..a00338e7c --- /dev/null +++ b/backends/gaudi/server/text_generation_server/adapters/lora.py @@ -0,0 +1,471 @@ +# Origin: https://github.com/predibase/lorax +# Path: lorax/server/lorax_server/adapters/lora.py +# License: Apache License Version 2.0, January 2004 + +from collections import defaultdict +from dataclasses import dataclass +from typing import Dict, List, Optional, Set, Tuple, Type, Union + +import torch +from peft import LoraConfig as _LoraConfig +from torch.distributed import ProcessGroup + +from text_generation_server.adapters.config import AdapterConfig, ModuleMap + +from text_generation_server.adapters.weights import ( + AdapterBatchMetadata, + AdapterWeights, + BatchAdapterWeights, +) +from 
text_generation_server.utils.sgmv import ( + BGMV_MAX_RANK, + MAX_RANK_CUSTOM, + get_tmp_tensors, + orient_for_rank, + pad_rank, + use_cutlass_shrink, +) + + +def get_start_stop_idxs_for_rank(offset, size, rank, world_size): + block_size = size // world_size + start = offset + rank * block_size + stop = offset + (rank + 1) * block_size + return start, stop + + +def shard_on_dim( + t: torch.Tensor, dim: int, process_group: torch.distributed.ProcessGroup +): + world_size = process_group.size() + rank = process_group.rank() + + size = t.shape[dim] + start, stop = get_start_stop_idxs_for_rank(0, size, rank, world_size) + + if dim == 0: + tensor = t[start:stop] + elif dim == 1: + tensor = t[:, start:stop] + else: + raise NotImplementedError("Let's make that generic when needed") + + return tensor + + +def shard_lora_weights( + weights_a: List[torch.Tensor], + weights_b: List[torch.Tensor], + split_dim: int, + process_group: ProcessGroup, +) -> Tuple[List[torch.Tensor], List[torch.Tensor]]: + # [hidden_size, r] + weights_a = [ + shard_on_dim(w, dim=split_dim, process_group=process_group) for w in weights_a + ] + + # [r, hidden_size] + weights_b = [shard_on_dim(w, dim=1, process_group=process_group) for w in weights_b] + + return weights_a, weights_b + + +@dataclass +class LoraConfig(AdapterConfig): + r: int + target_modules: Optional[Union[List[str], str]] + fan_in_fan_out: bool + lora_alpha: int + use_rslora: bool + + def map_weights_for_model( + self, + adapter_weights: Dict[int, AdapterWeights], + weight_names: Tuple[str], + ) -> Tuple[ModuleMap, Set[str]]: + adapter_weight_names = set() + module_map = {} + for weight_name in weight_names: + lora_a_name = f"base_model.model.{weight_name}.lora_A.weight" + lora_b_name = f"base_model.model.{weight_name}.lora_B.weight" + if lora_a_name not in adapter_weights or lora_b_name not in adapter_weights: + continue + + module_map[weight_name] = { + "lora_A": (adapter_weights[lora_a_name], lora_a_name), + "lora_B": (adapter_weights[lora_b_name], lora_b_name), + } + adapter_weight_names.add(lora_a_name) + adapter_weight_names.add(lora_b_name) + return module_map, adapter_weight_names + + @classmethod + def load(cls, adapter_id: str, api_token: str) -> "LoraConfig": + hf_config = _LoraConfig.from_pretrained(adapter_id, token=api_token) + return cls( + base_model_name_or_path=hf_config.base_model_name_or_path, + r=hf_config.r, + target_modules=hf_config.target_modules, + fan_in_fan_out=hf_config.fan_in_fan_out, + lora_alpha=hf_config.lora_alpha, + use_rslora=( + hf_config.use_rslora if hasattr(hf_config, "use_rslora") else False + ), + ) + + +class LoraWeights(AdapterWeights): + """LoRA weights for a single adapter merged across all layers.""" + + def __init__( + self, + weights_a: List[torch.Tensor], + weights_b: List[torch.Tensor], + adapter_config: LoraConfig, + ): + self.lora_a_r = weights_a[0].size(1) if len(weights_a) > 0 else 1 + self.lora_b_r = weights_b[0].size(0) if len(weights_a) > 0 else 1 + + self._use_cutlass_shrink = use_cutlass_shrink(self.lora_a_r) + self._is_transposed = False + + # [num_layers, hidden_size, r] + weights_a = [orient_for_rank(w, w.size(1)).contiguous() for w in weights_a] + self._weights_a = torch.stack(weights_a) + + # [num_layers, r, hidden_size] + self._weights_b = torch.stack(weights_b) + + self.adapter_config = adapter_config + + @property + def weights_a(self) -> torch.Tensor: + if self._is_transposed: + self._transpose_weights() + return self._weights_a + + @property + def weights_b(self) -> torch.Tensor: + if 
self._is_transposed: + self._transpose_weights() + return self._weights_b + + @property + def weights_a_t(self) -> torch.Tensor: + if not self._is_transposed: + self._transpose_weights() + return self._weights_a + + @property + def weights_b_t(self) -> torch.Tensor: + if not self._is_transposed: + self._transpose_weights() + return self._weights_b + + def _transpose_weights(self): + if self._use_cutlass_shrink: + # If we're not using the cutlass shrink, then both SGMV and BGMV use the same orientation + self._weights_a = self._weights_a.transpose(1, 2).contiguous() + self._weights_b = self._weights_b.transpose(1, 2).contiguous() + self._is_transposed = not self._is_transposed + + @classmethod + def get_batch_types(cls) -> List[Type[BatchAdapterWeights]]: + return [BatchLoraWeights] + + # prepare pre-loaded lora weights for use in the model. + # + # this method processes and organizes lora weights for a specific layer type across all layers: + # - uses `config` (LoraConfig) to apply lora-specific settings like scaling factor. + # - retrieves weights from `module_map` based on the `layer_type`. + # - processes `nlayers` number of layers. + # - converts weights to the specified `dtype`. + # - shards weights across `world_size` number of processes using the `process_group`. + # - maps weights to specific layers using `target_to_layer`. + # - tracks `unused_weight_names` to identify any unused weights. + # + # the method handles weight transposition, scaling, and padding to ensure compatibility + # with SGMV or BGMV operations. + @classmethod + def prepare_weights( + cls, + config: LoraConfig, + module_map: Dict[str, Dict], + layer_type: str, + unused_weight_names: Set[str], + nlayers: int, + dtype: torch.dtype, + world_size: int, + process_group: ProcessGroup, + target_to_layer: Dict[str, Tuple[str, torch.Tensor]], + ) -> Optional[AdapterWeights]: + lora_a_list = [None] * nlayers + lora_b_list = [None] * nlayers + + for layer_id in range(nlayers): + key = (layer_id, layer_type) + weight_name, layer = target_to_layer[key] + base_weight = layer.base_layer.linear.weight + base_device = base_weight.device + + if weight_name not in module_map: + # There is no LoRA weight for this layer type in the adapter + return None + + lora_a, lora_a_name = module_map[weight_name]["lora_A"] + lora_a = lora_a.to(base_device, dtype) + + lora_b, lora_b_name = module_map[weight_name]["lora_B"] + lora_b = lora_b.to(base_device, dtype) + + scale = get_scaling_factor( + config.lora_alpha, + config.r, + uses_rslora=config.use_rslora, + ) + + unused_weight_names.discard(lora_a_name) + unused_weight_names.discard(lora_b_name) + + # Merge scaling factor into lora_b due to associativity of matrix multiplication: + # (A * B) * C = A * (B * C) + lora_a_list[layer_id] = lora_a.transpose(0, 1) + lora_b_list[layer_id] = lora_b.transpose(0, 1) * scale + + # pad lora ranks to be compatible with sgmv + lora_a_list = [pad_rank(w, dim=1, world_size=world_size) for w in lora_a_list] + lora_b_list = [pad_rank(w, dim=0, world_size=world_size) for w in lora_b_list] + + if lora_a_list: + # update rank if it was padded + padded_rank = lora_a_list[0].size(1) + config.r = padded_rank + + return LoraWeights( + *shard_lora_weights( + weights_a=lora_a_list, + weights_b=lora_b_list, + split_dim=0 if layer_type in {"o_proj", "down_proj", "lm_head"} else 1, + process_group=process_group, + ), + config, + ) + + +@dataclass +class RankSegments: + rank: int + + lora_a_ptr: torch.Tensor + lora_b_ptr: torch.Tensor + + # prefill (sgmv) + tmp_shrink: 
torch.Tensor + tmp_expand: torch.Tensor + segment_starts: torch.Tensor + segment_ends: torch.Tensor + + # decode (bgmv) + indices: torch.Tensor + + +@dataclass +class BatchLoraWeights(BatchAdapterWeights): + lora_a: Dict[int, torch.Tensor] + lora_b: Dict[int, torch.Tensor] + adapter_index_configs: Dict[int, LoraConfig] + rank_data: Dict[int, RankSegments] + use_sgmv: bool + + def has_adapter(self, adapter_index: int) -> bool: + return adapter_index in self.adapter_index_configs + + def can_vectorize(self, pg: ProcessGroup) -> bool: + return all( + rank_data.rank // pg.size() <= MAX_RANK_CUSTOM + for rank_data in self.rank_data.values() + ) + + @classmethod + def load( + self, + adapter_weights: Dict[int, AdapterWeights], + meta: AdapterBatchMetadata, + prefill: bool, + prefill_head_indices: Optional[torch.Tensor], + ) -> Optional["BatchLoraWeights"]: + adapter_weights = {k: _convert_lora(v) for k, v in adapter_weights.items()} + adapter_weights = { + k: v for k, v in adapter_weights.items() if isinstance(v, LoraWeights) + } + if not adapter_weights: + return None + + first_weights = next(iter(adapter_weights.values())) + device = first_weights.weights_a.device + segment_indices = meta.segment_indices + + lora_a = { + idx: adapter_weights[idx].weights_a + for idx in segment_indices + if idx in adapter_weights + } + lora_b = { + idx: adapter_weights[idx].weights_b + for idx in segment_indices + if idx in adapter_weights + } + + max_rank = max( + ( + adapter_weights[idx].lora_a_r + for idx in segment_indices + if idx in adapter_weights + ), + default=0, + ) + + if prefill or max_rank > BGMV_MAX_RANK: + use_sgmv = True + lora_a_ptr = torch.tensor( + [ + ( + adapter_weights[idx].weights_a.data_ptr() + if idx in adapter_weights + else 0 + ) + for idx in segment_indices + ], + dtype=torch.int64, + device=device, + ) + lora_b_ptr = torch.tensor( + [ + ( + adapter_weights[idx].weights_b.data_ptr() + if idx in adapter_weights + else 0 + ) + for idx in segment_indices + ], + dtype=torch.int64, + device=device, + ) + else: + use_sgmv = False + lora_a_ptr = torch.tensor( + [ + ( + adapter_weights[idx].weights_a_t.data_ptr() + if idx in adapter_weights + else 0 + ) + for idx in segment_indices + ], + dtype=torch.int64, + device=device, + ) + lora_b_ptr = torch.tensor( + [ + ( + adapter_weights[idx].weights_b_t.data_ptr() + if idx in adapter_weights + else 0 + ) + for idx in segment_indices + ], + dtype=torch.int64, + device=device, + ) + + adapter_index_configs = { + idx: adapter_weights[idx].adapter_config + for idx in segment_indices + if idx in adapter_weights + } + + adapter_to_segment = {v: k for k, v in enumerate(segment_indices)} + + rank_indices = defaultdict(list) + for segment_idx, adapter_idx in enumerate(segment_indices): + if adapter_idx not in adapter_weights: + continue + rank_indices[adapter_weights[adapter_idx].lora_a_r].append(segment_idx) + + if prefill_head_indices is not None: + j, prefill_head_segment_starts, prefill_head_segment_ends = 1, [0], [0] + for head_index in prefill_head_indices: + # j cannot go out of bounds as that would mean there are tokens without corresponding adapters + if head_index < meta.adapter_segments[j]: + prefill_head_segment_ends[-1] += 1 + else: + prefill_head_segment_starts.append(prefill_head_segment_ends[-1]) + prefill_head_segment_ends.append(prefill_head_segment_ends[-1] + 1) + j += 1 + + rank_data = {} + for rank, indices in rank_indices.items(): + tmp_shrink = None + tmp_expand = None + segment_starts = None + segment_ends = None + batch_indices = 
None + + if use_sgmv: + lora_a_ptr_indices = lora_a_ptr[indices] + tmp_shrink, tmp_expand = get_tmp_tensors( + lora_a_ptr_indices.size(0), rank, device + ) + segment_starts = meta.adapter_segments[indices] + segment_ends = meta.adapter_segments[[i + 1 for i in indices]] + if prefill_head_indices is not None: + for i, segment_index in enumerate(indices): + segment_starts[i] = prefill_head_segment_starts[segment_index] + segment_ends[i] = prefill_head_segment_ends[segment_index] + else: + rank_indices = set(indices) + batch_indices = [ + adapter_to_segment[idx] for idx in meta.adapter_indices.tolist() + ] + batch_indices = [ + idx if idx in rank_indices else -1 for idx in batch_indices + ] + batch_indices = torch.tensor( + batch_indices, dtype=torch.int64, device=device + ) + + rank_data[rank] = RankSegments( + rank=rank, + tmp_shrink=tmp_shrink, + tmp_expand=tmp_expand, + lora_a_ptr=lora_a_ptr[indices], + lora_b_ptr=lora_b_ptr[indices], + segment_starts=segment_starts, + segment_ends=segment_ends, + indices=batch_indices, + ) + + return BatchLoraWeights( + lora_a=lora_a, + lora_b=lora_b, + adapter_index_configs=adapter_index_configs, + rank_data=rank_data, + use_sgmv=use_sgmv, + ) + + +def get_scaling_factor( + lora_alpha: int, + r: int, + uses_rslora: bool = False, +) -> float: + """Computes the scaling factor for the lora weights.""" + if uses_rslora: + return lora_alpha / (r**0.5) + return lora_alpha / r + + +def _convert_lora(v: AdapterWeights) -> AdapterWeights: + if hasattr(v, "lora_weights"): + return v.lora_weights + return v diff --git a/backends/gaudi/server/text_generation_server/adapters/weights.py b/backends/gaudi/server/text_generation_server/adapters/weights.py new file mode 100644 index 000000000..da75dbcdf --- /dev/null +++ b/backends/gaudi/server/text_generation_server/adapters/weights.py @@ -0,0 +1,146 @@ +# Origin: https://github.com/predibase/lorax +# Path: lorax/server/lorax_server/adapters/weights.py +# License: Apache License Version 2.0, January 2004 + +from abc import ABC, abstractclassmethod +from collections import defaultdict +from dataclasses import dataclass +from typing import Dict, List, Optional, Set, Type + +import torch + + +@dataclass +class AdapterBatchMetadata: + # [batch_size] + adapter_indices: torch.Tensor + + # [num_adapters] + adapter_set: Set[int] + + # [num_segments + 1] + adapter_segments: torch.Tensor + + # [num_segments] + # maps from segment index to adapter index, i.e.: + # segment_indices[s] == adapter_indices[i] + segment_indices: List[int] + + +class AdapterWeights(ABC): + @abstractclassmethod + def get_batch_types(cls) -> List[Type["BatchAdapterWeights"]]: + pass + + @property + def speculative_tokens(self) -> int: + return 0 + + +class BatchAdapterWeights(ABC): + @abstractclassmethod + def has_adapter(self, adapter_index: int) -> bool: + pass + + @abstractclassmethod + def load( + cls, + adapter_weights: Dict[int, AdapterWeights], + meta: "AdapterBatchMetadata", + prefill: bool, + prefill_head_indices: torch.Tensor, + ) -> Optional["BatchAdapterWeights"]: + pass + + +class LayerAdapterWeights: + """Adapter weights that apply to a particular layer.""" + + def __init__(self): + self.adapter_weights: Dict[int, AdapterWeights] = {} + + def add_adapter(self, adapter_idx: int, weights: AdapterWeights): + self.adapter_weights[adapter_idx] = weights + + def remove_adapter(self, adapter_idx: int): + if adapter_idx not in self.adapter_weights: + return + del self.adapter_weights[adapter_idx] + + def is_empty(self) -> bool: + return 
len(self.adapter_weights) == 0 + + def get_data( + self, + meta: AdapterBatchMetadata, + prefill: bool, + prefill_head_indices: Optional[torch.Tensor], + ) -> Dict[str, BatchAdapterWeights]: + # bucket adapters by batch class + adapter_batch_types: Dict[ + Type[BatchAdapterWeights], Dict[int, AdapterWeights] + ] = defaultdict(dict) + for adapter_index, adapter_weights in self.adapter_weights.items(): + for batch_type in adapter_weights.get_batch_types(): + adapter_batch_types[batch_type][adapter_index] = adapter_weights + + batch_data = {} + for batch_type, adapter_weights in adapter_batch_types.items(): + batched_weights = batch_type.load( + adapter_weights, meta, prefill, prefill_head_indices + ) + if batched_weights is not None: + batch_data = batched_weights + return batch_data + + +@dataclass +class AdapterBatchData: + meta: AdapterBatchMetadata + + # layer type -> adapter type -> batch weight data + data: Dict[str, Dict[str, BatchAdapterWeights]] + + prefill: bool + + @staticmethod + def from_meta( + meta: AdapterBatchMetadata, + weights: Dict[str, LayerAdapterWeights], + prefill: bool, + prefill_head_indices: Optional[torch.Tensor], + ) -> "AdapterBatchData": + data = {} + for k, v in weights.items(): + if v.is_empty(): + continue + data[k] = v.get_data( + meta, prefill, prefill_head_indices if k == "lm_head" else None + ) + return AdapterBatchData(meta=meta, data=data, prefill=prefill) + + def ranks(self) -> Set[int]: + # TODO(travis): refactor to be less coupled to lora implementation + ranks = set() + for lora_data in self.data.values(): + if lora_data is None: + continue + + for rank_data in lora_data.rank_data.values(): + ranks.add(rank_data.rank) + + return ranks + + def layer_names(self) -> Set[str]: + return set(self.data.keys()) + + def adapter_keys(self) -> Set[str]: + adapter_keys = set() + for layer_data in self.data.values(): + adapter_keys.update(layer_data.keys()) + return adapter_keys + + @property + def max_rank(self) -> int: + ranks = self.ranks() + return max(ranks) if len(ranks) > 0 else 0 diff --git a/backends/gaudi/server/text_generation_server/cache.py b/backends/gaudi/server/text_generation_server/cache.py new file mode 100644 index 000000000..4504733e5 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/cache.py @@ -0,0 +1,34 @@ +import torch + +from typing import Dict, Optional, TypeVar + +from text_generation_server.models.types import Batch + +B = TypeVar("B", bound=Batch) + + +class Cache: + def __init__(self): + self.cache: Dict[int, B] = {} + + def pop(self, batch_id: int) -> Optional[B]: + return self.cache.pop(batch_id, None) + + def set(self, entry: B): + if entry is not None: + self.cache[entry.batch_id] = entry + + def delete(self, batch_id: int): + batch = self.pop(batch_id) + if batch is not None: + del batch + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + def clear(self): + keys = list(self.cache.keys()) + for k in keys: + self.delete(k) + + def __len__(self): + return len(self.cache.keys()) diff --git a/backends/gaudi/server/text_generation_server/cli.py b/backends/gaudi/server/text_generation_server/cli.py new file mode 100644 index 000000000..700f763e9 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/cli.py @@ -0,0 +1,429 @@ +import os +import psutil +import signal +import sys +import typer + +from pathlib import Path +from loguru import logger +from typing import Optional +from enum import Enum +from huggingface_hub import hf_hub_download +from text_generation_server.utils.adapter import 
parse_lora_adapters + + +app = typer.Typer() + + +class Quantization(str, Enum): + bitsandbytes = "bitsandbytes" + bitsandbytes_nf4 = "bitsandbytes-nf4" + bitsandbytes_fp4 = "bitsandbytes-fp4" + gptq = "gptq" + awq = "awq" + eetq = "eetq" + exl2 = "exl2" + fp8 = "fp8" + marlin = "marlin" + + +class Dtype(str, Enum): + float16 = "float16" + bloat16 = "bfloat16" + + +@app.command() +def serve( + model_id: str, + revision: Optional[str] = None, + sharded: bool = False, + quantize: Optional[Quantization] = None, + speculate: Optional[int] = None, + dtype: Optional[Dtype] = None, + trust_remote_code: bool = False, + uds_path: Path = "/tmp/text-generation-server", + logger_level: str = "INFO", + json_output: bool = False, + otlp_endpoint: Optional[str] = None, + otlp_service_name: str = "text-generation-inference.server", + max_input_tokens: Optional[int] = None, +): + if sharded: + # assert ( + # os.getenv("RANK", None) is not None + # ), "RANK must be set when sharded is True" + assert ( + os.getenv("WORLD_SIZE", None) is not None + ), "WORLD_SIZE must be set when sharded is True" + assert ( + os.getenv("MASTER_ADDR", None) is not None + ), "MASTER_ADDR must be set when sharded is True" + assert ( + os.getenv("MASTER_PORT", None) is not None + ), "MASTER_PORT must be set when sharded is True" + + # Remove default handler + logger.remove() + logger.add( + sys.stdout, + format="{message}", + filter="text_generation_server", + level=logger_level, + serialize=json_output, + backtrace=True, + diagnose=False, + ) + + # Import here after the logger is added to log potential import exceptions + from text_generation_server import server + from text_generation_server.tracing import setup_tracing + + # Setup OpenTelemetry distributed tracing + if otlp_endpoint is not None: + setup_tracing(otlp_service_name=otlp_service_name, otlp_endpoint=otlp_endpoint) + + lora_adapters = parse_lora_adapters(os.getenv("LORA_ADAPTERS")) + + # TODO: enable lora with cuda graphs. for now disable cuda graphs if lora is enabled + # and warn the user + if lora_adapters: + logger.warning("LoRA adapters enabled (experimental feature).") + + if "CUDA_GRAPHS" in os.environ: + logger.warning( + "LoRA adapters incompatible with CUDA Graphs. Disabling CUDA Graphs." + ) + global CUDA_GRAPHS + CUDA_GRAPHS = None + + # Downgrade enum into str for easier management later on + quantize = None if quantize is None else quantize.value + dtype = "bfloat16" if dtype is None else dtype.value + logger.info(f"quantize={quantize}") + if dtype is not None and quantize not in { + None, + "bitsandbytes", + "bitsandbytes-nf4", + "bitsandbytes-fp4", + }: + raise RuntimeError( + "Only 1 can be set between `dtype` and `quantize`, as they both decide how goes the final model." 
+ ) + + logger.info("CLI SHARDED = {} DTYPE = {}".format(sharded, dtype)) + + if sharded: + tgi_file = Path(__file__).resolve().parent / "tgi_service.py" + num_shard = int(os.getenv("WORLD_SIZE", "1")) + logger.info("CLI SHARDED = {}".format(num_shard)) + import subprocess + + cmd = ( + f"deepspeed --num_nodes 1 --num_gpus {num_shard} --no_local_rank {tgi_file}" + ) + cmd += f" --model_id {model_id} --revision {revision} --sharded {sharded}" + cmd += f" --dtype {dtype} --trust_remote_code {trust_remote_code} --uds_path {uds_path}" + cmd += f" --quantize {quantize} --max_input_tokens {max_input_tokens}" + if speculate is not None: + cmd += f"--speculate {speculate}" + logger.info("CLI server start deepspeed ={} ".format(cmd)) + sys.stdout.flush() + sys.stderr.flush() + with subprocess.Popen(cmd, shell=True, executable="/bin/bash") as proc: + do_terminate = False + current_handler = signal.getsignal(signal.SIGTERM) + + def terminate_handler(sig, frame): + nonlocal do_terminate + do_terminate = True + if callable(current_handler): + current_handler(sig, frame) + + signal.signal(signal.SIGTERM, terminate_handler) + + finished = False + while not finished: + try: + if do_terminate: + parent = psutil.Process(proc.pid) + all_procs = parent.children(recursive=True) + [parent] + for p in all_procs: + try: + p.terminate() + except psutil.NoSuchProcess: + pass + _, alive = psutil.wait_procs(all_procs, timeout=30) + for p in alive: + p.kill() + + do_terminate = False + + proc.wait(timeout=3) + except subprocess.TimeoutExpired: + pass + else: + finished = True + + sys.stdout.flush() + sys.stderr.flush() + if proc.returncode != 0: + logger.error(f"{cmd} exited with status = {proc.returncode}") + return proc.returncode + else: + server.serve( + model_id, + lora_adapters, + revision, + sharded, + quantize, + speculate, + dtype, + trust_remote_code, + uds_path, + max_input_tokens, + ) + + +@app.command() +def download_weights( + model_id: str, + revision: Optional[str] = None, + extension: str = ".safetensors", + auto_convert: bool = True, + logger_level: str = "INFO", + json_output: bool = False, + trust_remote_code: bool = False, + merge_lora: bool = False, +): + # Remove default handler + logger.remove() + logger.add( + sys.stdout, + format="{message}", + filter="text_generation_server", + level=logger_level, + serialize=json_output, + backtrace=True, + diagnose=False, + ) + + # Import here after the logger is added to log potential import exceptions + from text_generation_server import utils + + # Test if files were already download + try: + utils.weight_files(model_id, revision, extension) + logger.info("Files are already present on the host. " "Skipping download.") + return + # Local files not found + except (utils.LocalEntryNotFoundError, FileNotFoundError, utils.EntryNotFoundError): + pass + + is_local_model = (Path(model_id).exists() and Path(model_id).is_dir()) or os.getenv( + "WEIGHTS_CACHE_OVERRIDE", None + ) is not None + + if not is_local_model: + # TODO: maybe reverse the default value of merge_lora? 
+ # currently by default we don't merge the weights with the base model + if merge_lora: + try: + hf_hub_download( + model_id, revision=revision, filename="adapter_config.json" + ) + utils.download_and_unload_peft( + model_id, revision, trust_remote_code=trust_remote_code + ) + is_local_model = True + utils.weight_files(model_id, revision, extension) + return + except (utils.LocalEntryNotFoundError, utils.EntryNotFoundError): + pass + else: + try: + utils.peft.download_peft( + model_id, revision, trust_remote_code=trust_remote_code + ) + except Exception: + pass + + try: + import json + + config = hf_hub_download( + model_id, revision=revision, filename="config.json" + ) + with open(config, "r") as f: + config = json.load(f) + + base_model_id = config.get("base_model_name_or_path", None) + if base_model_id and base_model_id != model_id: + try: + logger.info(f"Downloading parent model {base_model_id}") + download_weights( + model_id=base_model_id, + revision="main", + extension=extension, + auto_convert=auto_convert, + logger_level=logger_level, + json_output=json_output, + trust_remote_code=trust_remote_code, + ) + except Exception: + pass + except (utils.LocalEntryNotFoundError, utils.EntryNotFoundError): + pass + + # Try to download weights from the hub + try: + filenames = utils.weight_hub_files(model_id, revision, extension) + utils.download_weights(filenames, model_id, revision) + # Successfully downloaded weights + return + + # No weights found on the hub with this extension + except utils.EntryNotFoundError as e: + # Check if we want to automatically convert to safetensors or if we can use .bin weights instead + if not extension == ".safetensors" or not auto_convert: + raise e + + elif (Path(model_id) / "adapter_config.json").exists(): + # Try to load as a local PEFT model + try: + utils.download_and_unload_peft( + model_id, revision, trust_remote_code=trust_remote_code + ) + utils.weight_files(model_id, revision, extension) + return + except (utils.LocalEntryNotFoundError, utils.EntryNotFoundError): + pass + elif (Path(model_id) / "config.json").exists(): + # Try to load as a local Medusa model + try: + import json + + config = Path(model_id) / "config.json" + with open(config, "r") as f: + config = json.load(f) + + base_model_id = config.get("base_model_name_or_path", None) + if base_model_id: + try: + logger.info(f"Downloading parent model {base_model_id}") + download_weights( + model_id=base_model_id, + revision="main", + extension=extension, + auto_convert=auto_convert, + logger_level=logger_level, + json_output=json_output, + trust_remote_code=trust_remote_code, + ) + except Exception: + pass + except (utils.LocalEntryNotFoundError, utils.EntryNotFoundError): + pass + + # Try to see if there are local pytorch weights + try: + # Get weights for a local model, a hub cached model and inside the WEIGHTS_CACHE_OVERRIDE + try: + local_pt_files = utils.weight_files(model_id, revision, ".bin") + except Exception: + local_pt_files = utils.weight_files(model_id, revision, ".pt") + + # No local pytorch weights + except (utils.LocalEntryNotFoundError, utils.EntryNotFoundError): + if extension == ".safetensors": + logger.warning( + f"No safetensors weights found for model {model_id} at revision {revision}. " + f"Downloading PyTorch weights." 
+ ) + + # Try to see if there are pytorch weights on the hub + pt_filenames = utils.weight_hub_files(model_id, revision, ".bin") + # Download pytorch weights + local_pt_files = utils.download_weights(pt_filenames, model_id, revision) + + if auto_convert: + if not trust_remote_code: + logger.warning( + "🚨🚨BREAKING CHANGE in 2.0🚨🚨: Safetensors conversion is disabled without `--trust-remote-code` because " + "Pickle files are unsafe and can essentially contain remote code execution!" + "Please check for more information here: https://huggingface.co/docs/text-generation-inference/basic_tutorials/safety", + ) + + logger.warning( + f"No safetensors weights found for model {model_id} at revision {revision}. " + f"Converting PyTorch weights to safetensors." + ) + + # Safetensors final filenames + local_st_files = [ + p.parent / f"{p.stem.lstrip('pytorch_')}.safetensors" + for p in local_pt_files + ] + try: + import transformers + import json + + if is_local_model: + config_filename = os.path.join(model_id, "config.json") + else: + config_filename = hf_hub_download( + model_id, revision=revision, filename="config.json" + ) + with open(config_filename, "r") as f: + config = json.load(f) + architecture = config["architectures"][0] + + class_ = getattr(transformers, architecture) + + # Name for this varible depends on transformers version. + discard_names = getattr(class_, "_tied_weights_keys", []) + + except Exception: + discard_names = [] + # Convert pytorch weights to safetensors + utils.convert_files(local_pt_files, local_st_files, discard_names) + + +@app.command() +def quantize( + model_id: str, + output_dir: str, + revision: Optional[str] = None, + logger_level: str = "INFO", + json_output: bool = False, + trust_remote_code: bool = False, + upload_to_model_id: Optional[str] = None, + percdamp: float = 0.01, + act_order: bool = False, + groupsize: int = 128, +): + if revision is None: + revision = "main" + download_weights( + model_id=model_id, + revision=revision, + logger_level=logger_level, + json_output=json_output, + ) + from text_generation_server.layers.gptq.quantize import quantize + + quantize( + model_id=model_id, + bits=4, + groupsize=groupsize, + output_dir=output_dir, + revision=revision, + trust_remote_code=trust_remote_code, + upload_to_model_id=upload_to_model_id, + percdamp=percdamp, + act_order=act_order, + sym=True, + ) + + +if __name__ == "__main__": + app() diff --git a/backends/gaudi/server/text_generation_server/habana_quantization_env.py b/backends/gaudi/server/text_generation_server/habana_quantization_env.py new file mode 100644 index 000000000..b03b7e266 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/habana_quantization_env.py @@ -0,0 +1,53 @@ +# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company. 
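+# Reads the QUANT_CONFIG environment variable at import time; when it is set, the Habana
+# graph-compiler flags below are pre-configured and the helpers further down
+# (setup_quantization, prepare_model_for_quantization) convert supported models to FP8
+# with Intel Neural Compressor before serving.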
+ +import os +import habana_frameworks.torch as htorch + +quant_config = os.getenv("QUANT_CONFIG", "") +is_quantization_enabled = quant_config != "" + +if is_quantization_enabled: + os.environ.setdefault("ENABLE_EXPERIMENTAL_FLAGS", "true") + os.environ.setdefault("USE_DEFAULT_QUANT_PARAM", "true") + os.environ.setdefault("UPDATE_GRAPH_OUTPUT_MME", "false") + os.environ.setdefault("ENABLE_CALC_DYNAMIC_RANGE", "false") + os.environ.setdefault("UPDATE_MME_OUTPUT_PRECISION_FILTER", "v_proj,matmul_av") + os.environ.setdefault("EXPERIMENTAL_WEIGHT_SHARING", "FALSE") + + +def patch_scoped_linear_all_reduce(model): + from deepspeed.module_inject.layers import LinearAllreduce + from optimum.habana.transformers.models.modeling_all_models import ( + ScopedLinearAllReduce, + ) + + for name, module in model.named_children(): + if type(module) is LinearAllreduce: + SL = ScopedLinearAllReduce(mod=module) + setattr(model, name, SL) + patch_scoped_linear_all_reduce(module) + + +def setup_quantization(model): + if is_quantization_enabled: + htorch.core.quantization._mark_params_as_const(model) + htorch.core.quantization._check_params_as_const(model) + htorch.core.hpu_initialize(model) + return model + + +def prepare_model_for_quantization(model): + if is_quantization_enabled: + if model.config.model_type in [ + "llama", + "falcon", + "qwen2", + "starcoder2", + "gemma", + ]: + patch_scoped_linear_all_reduce(model) + from neural_compressor.torch.quantization import FP8Config, convert + + config = FP8Config.from_json_file(quant_config) + model = convert(model, config) + return model diff --git a/backends/gaudi/server/text_generation_server/interceptor.py b/backends/gaudi/server/text_generation_server/interceptor.py new file mode 100644 index 000000000..47f33cd0b --- /dev/null +++ b/backends/gaudi/server/text_generation_server/interceptor.py @@ -0,0 +1,45 @@ +# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company. 
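The Gaudi quantization module above only sets environment variables that the user has not already exported; a small standalone illustration of the os.environ.setdefault behaviour it relies on (the values here are examples only):

import os

os.environ["ENABLE_EXPERIMENTAL_FLAGS"] = "false"           # pretend the user exported this
os.environ.setdefault("ENABLE_EXPERIMENTAL_FLAGS", "true")  # no effect, key already exists
os.environ.setdefault("USE_DEFAULT_QUANT_PARAM", "true")    # applied, key was unset

print(os.environ["ENABLE_EXPERIMENTAL_FLAGS"])  # -> false (user value wins)
print(os.environ["USE_DEFAULT_QUANT_PARAM"])    # -> true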
+ +import torch +import grpc + +from google.rpc import status_pb2, code_pb2 +from grpc_status import rpc_status +from grpc_interceptor.server import AsyncServerInterceptor +from loguru import logger +from typing import Callable, Any +import traceback +import os + + +class ExceptionInterceptor(AsyncServerInterceptor): + async def intercept( + self, + method: Callable, + request_or_iterator: Any, + context: grpc.ServicerContext, + method_name: str, + ) -> Any: + try: + response = method(request_or_iterator, context) + return await response + except Exception as err: + trace = " " + traceback.format_exc() if os.environ.get("DUMP_STACK") else "" + method_name = method_name.split("/")[-1] + logger.exception(f"Method {method_name} encountered an error.") + + # Runtime Error cannot be recovered from + if isinstance(err, RuntimeError): + exit(1) + + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + from .utils.debug import dbg_trace + + dbg_trace("EXCEPTION", traceback.format_exc()) + await context.abort_with_status( + rpc_status.to_status( + status_pb2.Status(code=code_pb2.INTERNAL, message=str(err) + trace) + ) + ) diff --git a/backends/gaudi/server/text_generation_server/layers/__init__.py b/backends/gaudi/server/text_generation_server/layers/__init__.py new file mode 100644 index 000000000..0000ca915 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/layers/__init__.py @@ -0,0 +1,34 @@ +from text_generation_server.layers.tensor_parallel import ( + TensorParallelColumnLinear, + TensorParallelRowLinear, + TensorParallelEmbedding, +) +from text_generation_server.layers.linear import ( + get_linear, + FastLinear, +) +from text_generation_server.layers.speculative import SpeculativeHead + +# Just to add the `load` methods. +from text_generation_server.layers.layernorm import load_layer_norm +from text_generation_server.layers.conv import load_conv2d + +from text_generation_server.layers.lora import ( + LoraLinear, + TensorParallelMultiAdapterLinear, + TensorParallelAdapterRowLinear, +) + +__all__ = [ + "get_linear", + "FastLinear", + "TensorParallelColumnLinear", + "TensorParallelRowLinear", + "TensorParallelEmbedding", + "SpeculativeHead", + "LoraLinear", + "TensorParallelMultiAdapterLinear", + "TensorParallelAdapterRowLinear", + "load_layer_norm", + "load_conv2d", +] diff --git a/backends/gaudi/server/text_generation_server/layers/attention/__init__.py b/backends/gaudi/server/text_generation_server/layers/attention/__init__.py new file mode 100644 index 000000000..4d83a11fc --- /dev/null +++ b/backends/gaudi/server/text_generation_server/layers/attention/__init__.py @@ -0,0 +1,43 @@ +from text_generation_server.utils.import_utils import SYSTEM +import os + +from .common import Seqlen + +if os.getenv("USE_FLASH_ATTENTION", "true").lower() == "false": + raise ImportError("`USE_FLASH_ATTENTION` is false.") +if SYSTEM == "cuda": + from .cuda import ( + attention, + paged_attention, + reshape_and_cache, + SUPPORTS_WINDOWING, + PREFILL_IN_KV_CACHE, + ) +elif SYSTEM == "rocm": + from .rocm import ( + attention, + paged_attention, + reshape_and_cache, + PREFILL_IN_KV_CACHE, + SUPPORTS_WINDOWING, + ) +elif SYSTEM == "ipex": + from .ipex import ( + attention, + paged_attention, + reshape_and_cache, + PREFILL_IN_KV_CACHE, + SUPPORTS_WINDOWING, + ) +else: + raise ImportError(f"System {SYSTEM} doesn't support flash/paged attention") + + +__all__ = [ + "attention", + "paged_attention", + "reshape_and_cache", + "PREFILL_IN_KV_CACHE", + "SUPPORTS_WINDOWING", + "Seqlen", +] diff --git 
a/backends/gaudi/server/text_generation_server/layers/attention/common.py b/backends/gaudi/server/text_generation_server/layers/attention/common.py new file mode 100644 index 000000000..d6e512c01 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/layers/attention/common.py @@ -0,0 +1,72 @@ +from dataclasses import dataclass +from text_generation_server.utils.import_utils import SYSTEM +from text_generation_server.models.globals import ATTENTION +import torch +from typing import Optional + + +if ATTENTION in {"flashinfer", "flashdecoding"}: + + @dataclass + class Seqlen: + input_lengths: torch.Tensor + prefix_lengths: torch.Tensor + cu_seqlen_q: Optional[torch.Tensor] + cu_seqlen_k: Optional[torch.Tensor] + max_q: int + max_k: int + + def __init__( + self, + input_lengths, + prefix_lengths, + cu_seqlen_q=None, + max_q=None, + max_k=None, + ): + self.input_lengths = input_lengths + self.prefix_lengths = prefix_lengths + device = self.input_lengths.device + shape = self.input_lengths.shape + if cu_seqlen_q is None: + cu_seqlen_q = torch.arange( + shape[0] + 1, + device=device, + dtype=torch.int32, + ) + max_q = 1 + else: + assert max_q is not None + assert max_k is not None + cu_seqlen_k = torch.zeros(shape[-1] + 1, device=device, dtype=torch.int32) + + # cuda graphs don't like this and this is necessary to clamp within mistral + # Although FA2 might not want the clamping + # cu_seqlen_k[0] = 0 + total = self.input_lengths + self.prefix_lengths + torch.cumsum(total, -1, out=cu_seqlen_k[1:]) + + self.cu_seqlen_q = cu_seqlen_q + self.cu_seqlen_k = cu_seqlen_k + self.max_q = max_q + self.max_k = max_k + + def clamp(self, max): + # Flash decoding doesn't need to clamp + return self + +else: + + @dataclass + class Seqlen: + input_lengths: torch.Tensor + prefix_lengths: torch.Tensor + cu_seqlen_q: torch.Tensor + max_q: int + max_k: int + + def clamp(self, max): + if SYSTEM == "rocm": + return self + raise NotImplementedError("Not implemented seqlen for paged") + return Seqlen(torch.clamp(self.input_lengths, max=max)) diff --git a/backends/gaudi/server/text_generation_server/layers/attention/cuda.py b/backends/gaudi/server/text_generation_server/layers/attention/cuda.py new file mode 100644 index 000000000..51af928d5 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/layers/attention/cuda.py @@ -0,0 +1,357 @@ +import torch +from text_generation_server.utils.import_utils import SYSTEM +from text_generation_server.models.globals import ( + ATTENTION, + BLOCK_SIZE, +) +from text_generation_server.layers.attention import Seqlen +from typing import Optional + +major, minor = torch.cuda.get_device_capability() +is_sm75 = major == 7 and minor == 5 +_PARTITION_SIZE = 512 + +try: + from vllm._C import cache_ops +except Exception as e: + raise ImportError( + f"Could not import vllm paged attention. Make sure your installation is correct. 
Complete error: {e}" + ) + + +def reshape_and_cache( + key: torch.Tensor, + value: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + slots: torch.Tensor, +): + if ATTENTION in {"flashdecoding", "flashinfer"}: + shape = key_cache.shape + key_cache.view(-1, shape[-2], shape[-1])[slots] = key + value_cache.view(-1, shape[-2], shape[-1])[slots] = value + else: + cache_ops.reshape_and_cache( + key, value, key_cache, value_cache, slots, "auto", 1.0 + ) + + +def paged_attention( + query: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + kv_head_mapping: torch.Tensor, + softmax_scale: float, + block_tables: torch.Tensor, + seqlen: Seqlen, + max_s: int, + softcap: Optional[float] = None, +): + # Adapted from: https://github.com/vllm-project/vllm/blob/f8a1e39fae05ca610be8d5a78be9d40f5274e5fc/vllm/model_executor/layers/attention.py + # Copyright 2023 The vLLM team. All rights + # reserved. + # + # Licensed under the Apache License, Version 2.0 (the "License"); + # you may not use this file except in compliance with the License. + # You may obtain a copy of the License at + # + # http://www.apache.org/licenses/LICENSE-2.0 + # + # Unless required by applicable law or agreed to in writing, software + # distributed under the License is distributed on an "AS IS" BASIS, + # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + # See the License for the specific language governing permissions and + # limitations under the License. + # + + # value_cache => [num_blocks, num_heads, head_size, block_size] + # block_size = value_cache.shape[3] + block_size = BLOCK_SIZE + num_seqs, num_heads, head_size = query.shape + max_num_partitions = (max_s + _PARTITION_SIZE - 1) // _PARTITION_SIZE + + # NOTE(woosuk): We use a simple heuristic to decide whether to use + # PagedAttention V1 or V2. If the number of partitions is 1, we use + # V1 to avoid the overhead of reduction. Also, if the number of + # sequences or heads is large, we use V1 since there is enough work + # to parallelize. + if ATTENTION == "flashinfer": + from text_generation_server.layers.attention.flashinfer import decode_state + + return decode_state.get().forward( + query.contiguous(), + paged_kv_cache=(key_cache, value_cache), + logits_soft_cap=softcap, + sm_scale=softmax_scale, + ) + elif ATTENTION == "flashdecoding": + max_q = 1 + max_k = max_s + import flash_attn_2_cuda + + # TODO fixme when flash contains the fix. + # Number of splits is not correctly handled + # by the current path + # https://github.com/Dao-AILab/flash-attention/blob/320fb59487658f033f56711efd3d61b7c7a6f8f3/csrc/flash_attn/flash_api.cpp#L577 + # This fails becuase we're using causal, therefore window_right is set to 0 and the split logic is never applied. 
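+        # In this flash-decoding path each sequence contributes a single query
+        # token (max_q = 1, max_k = max_s) and keys/values are gathered from the
+        # paged KV cache through `block_tables`; a softcap of 0.0 is presumably
+        # the kernel's "disabled" value, hence the substitution below.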
+ if softcap is None: + softcap = 0.0 + out = flash_attn_2_cuda.varlen_fwd( + query, + key_cache, + value_cache, + None, + seqlen.cu_seqlen_q, + seqlen.cu_seqlen_k, + None, # pad_k + None, + block_tables, + None, + max_q, + max_k, + 0.0, # dropout + softmax_scale, + False, # zero_tensors + True, # causal + -1, # Window_left + -1, # Window right + softcap, + False, # return softmax + None, # generator + ) + return out[0] + else: + if softcap is not None: + raise RuntimeError("Paged attention doesn't support softcapping") + input_lengths = seqlen.input_lengths + from vllm._C import ops + + out = torch.empty_like(query) + + use_v1 = max_s <= 8192 and ( + max_num_partitions == 1 or num_seqs * num_heads > 512 + ) + if use_v1: + ops.paged_attention_v1( + out, + query, + key_cache, + value_cache, + kv_head_mapping, + softmax_scale, + block_tables, + input_lengths, + block_size, + max_s, + None, + "auto", + 1.0, + ) + else: + # Run PagedAttention V2. + assert _PARTITION_SIZE % block_size == 0 + tmp_output = torch.empty( + size=(num_seqs, num_heads, max_num_partitions, head_size), + dtype=out.dtype, + device=out.device, + ) + exp_sums = torch.empty( + size=(num_seqs, num_heads, max_num_partitions), + dtype=torch.float32, + device=out.device, + ) + max_logits = torch.empty_like(exp_sums) + + ops.paged_attention_v2( + out, + exp_sums, + max_logits, + tmp_output, + query, + key_cache, + value_cache, + kv_head_mapping, + softmax_scale, + block_tables, + input_lengths, + block_size, + max_s, + None, + "auto", + 1.0, + ) + return out + + +try: + is_ampere_or_newer = major >= 8 and minor >= 0 + if not is_ampere_or_newer: + raise ImportError("FlashAttention only supports Ampere GPUs or newer.") + + import flash_attn_2_cuda + + V2 = True +except ImportError: + try: + import flash_attn_cuda + + V2 = False + except ImportError as e: + if major >= 8: + architecture_suffix = f"-{SYSTEM}" + raise ImportError( + "Flash Attention V2 is not installed.\n" + "Use the official Docker image (ghcr.io/huggingface/text-generation-inference:latest) " + f"or install flash attention v2 with `cd server && make install install-flash-attention-v2{architecture_suffix}`" + ) + elif is_sm75: + raise ImportError( + "Flash Attention is not installed.\n" + "Use the official Docker image (ghcr.io/huggingface/text-generation-inference:latest) " + "or install flash attention with `cd server && make install install-flash-attention`" + ) from e + else: + raise ImportError( + f"GPU with CUDA capability {major} {minor} is not supported" + ) from e + + +SUPPORTS_WINDOWING = V2 + +if ATTENTION == "flashinfer": + + def attention( + q: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + seqlen: Seqlen, + block_tables: torch.Tensor, + softmax_scale, + window_size_left=-1, + causal=True, + softcap=0.0, + ): + from text_generation_server.layers.attention.flashinfer import ( + prefill_with_paged_kv_state, + ) + + return prefill_with_paged_kv_state.get().forward( + q.contiguous(), + causal=causal, + paged_kv_cache=(key_cache, value_cache), + logits_soft_cap=softcap, + sm_scale=softmax_scale, + window_left=window_size_left, + ) + +elif V2: + + def attention( + q, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + seqlen: Seqlen, + block_tables: torch.Tensor, + softmax_scale, + window_size_left=-1, + causal=True, + softcap=0.0, + ): + out = torch.empty_like(q) + if window_size_left <= 0 and window_size_left != -1: + raise ValueError("`window_size_left` must be > 0 or -1") + return flash_attn_2_cuda.varlen_fwd( + q, + 
key_cache, + value_cache, + out, + seqlen.cu_seqlen_q, + seqlen.cu_seqlen_k, + None, + None, + block_tables, + None, + seqlen.max_q, + seqlen.max_k, + 0.0, + softmax_scale, + False, + causal, + window_size_left, + 0, + softcap, + False, + None, + )[0] + +else: + + def attention( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + seqlen: Seqlen, + block_tables: torch.Tensor, + softmax_scale: float, + window_size_left: int = -1, + causal: bool = True, + softcap=None, + ): + if window_size_left != -1: + raise NotImplementedError( + "window_size_left is only available with flash attn v2" + ) + if softcap is not None: + raise NotImplementedError("softcap is only available with flash attn v2") + + # Flash attention v1 requires q, k and v to have the same number of heads + if k.shape[1] != q.shape[1]: + # MQA expand + if k.shape[1] == 1: + k = k.expand(-1, q.shape[1], -1) + # Grouped attention reshape + else: + original_shape = k.shape + k = ( + k.unsqueeze(2) + .expand(-1, -1, q.shape[1] // k.shape[1], -1) + .reshape(original_shape[0], -1, original_shape[2]) + ) + if v.shape[1] != q.shape[1]: + # MQA expand + if v.shape[1] == 1: + v = v.expand(-1, q.shape[1], -1) + # Grouped attention reshape + else: + original_shape = v.shape + v = ( + v.unsqueeze(2) + .expand(-1, -1, q.shape[1] // v.shape[1], -1) + .reshape(original_shape[0], -1, original_shape[2]) + ) + + out = torch.empty_like(q) + flash_attn_cuda.fwd( + q, + k, + v, + out, + seqlen.cu_seqlen_q, + seqlen.cu_seqlen_q, + seqlen.max_q, + seqlen.max_k, + 0.0, + softmax_scale, + False, + causal, + False, + 0, + None, + ) + return out + + +# Prefill in the cache with every kind of attention, unless we +# have a configuration that requires flash-attention v1, which +# does not support block tables. +PREFILL_IN_KV_CACHE = ATTENTION != "paged" or V2 diff --git a/backends/gaudi/server/text_generation_server/layers/attention/flash_attn_triton.py b/backends/gaudi/server/text_generation_server/layers/attention/flash_attn_triton.py new file mode 100644 index 000000000..3a6f9a730 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/layers/attention/flash_attn_triton.py @@ -0,0 +1,813 @@ +#!/usr/bin/env python +""" +Fused Attention +=============== + +This is a Triton implementation of the Flash Attention v2 algorithm from Tri Dao +(https://tridao.me/publications/flash2/flash2.pdf) +Credits: OpenAI kernel team, AMD ML Frameworks Triton team + +Features supported: + +1) Fwd with causal masking +2) Any sequence lengths without padding (currently fwd kernel only) +3) Support for different sequence lengths for q and k +4) Nested tensor API currently does not support dropout or bias. 
+ +Not currently supported: + +1) Non power of two head dims + +""" + +import torch +import triton +import triton.language as tl + +torch_dtype: tl.constexpr = torch.float16 + + +@triton.jit +def cdiv_fn(x, y): + return (x + y - 1) // y + + +@triton.jit +def max_fn(x, y): + return tl.math.max(x, y) + + +@triton.jit +def dropout_offsets(philox_seed, philox_offset, dropout_p, m, n, stride): + ms = tl.arange(0, m) + ns = tl.arange(0, n) + return philox_offset + ms[:, None] * stride + ns[None, :] + + +@triton.jit +def dropout_rng(philox_seed, philox_offset, dropout_p, m, n, stride): + rng_offsets = dropout_offsets( + philox_seed, philox_offset, dropout_p, m, n, stride + ).to(tl.uint32) + # TODO: use tl.randint for better performance + return tl.rand(philox_seed, rng_offsets) + + +@triton.jit +def dropout_mask(philox_seed, philox_offset, dropout_p, m, n, stride): + rng_output = dropout_rng(philox_seed, philox_offset, dropout_p, m, n, stride) + rng_keep = rng_output > dropout_p + return rng_keep + + +@triton.jit +def load_fn(block_ptr, first, second, pad): + if first and second: + tensor = tl.load(block_ptr, boundary_check=(0, 1), padding_option=pad) + elif first: + tensor = tl.load(block_ptr, boundary_check=(0,), padding_option=pad) + elif second: + tensor = tl.load(block_ptr, boundary_check=(1,), padding_option=pad) + else: + tensor = tl.load(block_ptr) + return tensor + + +@triton.jit +def _attn_fwd_inner( + acc, + l_i, + m_i, + q, + K_block_ptr, + V_block_ptr, + start_m, + actual_seqlen_k, + dropout_p, + philox_seed, + batch_philox_offset, + encoded_softmax_block_ptr, + block_min, + block_max, + offs_n_causal, + masked_blocks, + n_extra_tokens, + bias_ptr, + IS_CAUSAL: tl.constexpr, + BLOCK_M: tl.constexpr, + BLOCK_DMODEL: tl.constexpr, + BLOCK_N: tl.constexpr, + OFFS_M: tl.constexpr, + OFFS_N: tl.constexpr, + PRE_LOAD_V: tl.constexpr, + MASK_STEPS: tl.constexpr, + ENABLE_DROPOUT: tl.constexpr, + RETURN_ENCODED_SOFTMAX: tl.constexpr, + PADDED_HEAD: tl.constexpr, +): + # loop over k, v, and update accumulator + for start_n in range(block_min, block_max, BLOCK_N): + # For padded blocks, we will overrun the tensor size if + # we load all BLOCK_N. For others, the blocks are all within range. + k = load_fn( + K_block_ptr, + PADDED_HEAD, + MASK_STEPS and (n_extra_tokens != 0), + "zero", + ) + if PRE_LOAD_V: + v = load_fn( + V_block_ptr, + MASK_STEPS and (n_extra_tokens != 0), + PADDED_HEAD, + "zero", + ) + qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) + # We start from end of seqlen_k so only the first iteration would need + # to be checked for padding if it is not a multiple of block_n + # TODO: This can be optimized to only be true for the padded block. + if MASK_STEPS: # noqa: SIM102 + # If this is the last block / iteration, we want to + # mask if the sequence length is not a multiple of block size + # a solution is to always do BLOCK_M // BLOCK_N + 1 steps + # if not is_modulo_mn. last step might get wasted but that is okay. + # check if this masking works for that case. 
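+            # Out-of-range key columns are forced to -inf below, so after the
+            # exp2 they contribute nothing to the running row max m_ij or to
+            # the row sum l_ij.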
+ if (start_n + BLOCK_N == block_max) and (n_extra_tokens != 0): + boundary_m = tl.full([BLOCK_M], actual_seqlen_k, dtype=tl.int32) + size_n = start_n + OFFS_N[None, :] + mask = size_n < boundary_m[:, None] + qk = tl.where(mask, qk, float("-inf")) + if IS_CAUSAL: + causal_boundary = start_n + offs_n_causal + causal_mask = OFFS_M[:, None] >= causal_boundary[None, :] + qk = tl.where(causal_mask, qk, float("-inf")) + # -- compute qk ---- + qk += tl.dot(q, k) + if bias_ptr is not None: + bias = load_fn( + bias_ptr, False, MASK_STEPS and (n_extra_tokens != 0), "zero" + ) + # While bias is added after multiplying qk with sm_scale, our + # optimization to use 2^x instead of e^x results in an additional + # scale factor of log2(e) which we must also multiply the bias with. + qk += bias * 1.44269504089 + m_ij = tl.maximum(m_i, tl.max(qk, 1)) + qk = qk - m_ij[:, None] + p = tl.math.exp2(qk) + + # CAVEAT: Must update l_ij before applying dropout + l_ij = tl.sum(p, 1) + if ENABLE_DROPOUT: + philox_offset = ( + batch_philox_offset + + start_m * BLOCK_M * actual_seqlen_k + + start_n + - BLOCK_N + ) + keep = dropout_mask( + philox_seed, + philox_offset, + dropout_p, + BLOCK_M, + BLOCK_N, + actual_seqlen_k, + ) + if RETURN_ENCODED_SOFTMAX: + tl.store( + encoded_softmax_block_ptr, + tl.where(keep, p, -p).to(encoded_softmax_block_ptr.type.element_ty), + ) + p = tl.where(keep, p, 0.0) + elif RETURN_ENCODED_SOFTMAX: + tl.store( + encoded_softmax_block_ptr, + p.to(encoded_softmax_block_ptr.type.element_ty), + ) + # -- update output accumulator -- + alpha = tl.math.exp2(m_i - m_ij) + acc = acc * alpha[:, None] + if not PRE_LOAD_V: + v = load_fn( + V_block_ptr, + MASK_STEPS and (n_extra_tokens != 0), + PADDED_HEAD, + "zero", + ) + # -- update m_i and l_i + l_i = l_i * alpha + l_ij + # update m_i and l_i + m_i = m_ij + acc += tl.dot(p.to(V_block_ptr.type.element_ty), v) + V_block_ptr = tl.advance(V_block_ptr, (BLOCK_N, 0)) + K_block_ptr = tl.advance(K_block_ptr, (0, BLOCK_N)) + if bias_ptr is not None: + bias_ptr = tl.advance(bias_ptr, (0, BLOCK_N)) + if RETURN_ENCODED_SOFTMAX: + encoded_softmax_block_ptr = tl.advance( + encoded_softmax_block_ptr, (0, BLOCK_N) + ) + return acc, l_i, m_i + + +@triton.autotune( + configs=[ + triton.Config( + { + "BLOCK_M": 256, + "BLOCK_N": 64, + "waves_per_eu": 2, + "PRE_LOAD_V": False, + }, + num_stages=1, + num_warps=8, + ), + triton.Config( + { + "BLOCK_M": 128, + "BLOCK_N": 128, + "waves_per_eu": 2, + "PRE_LOAD_V": False, + }, + num_stages=1, + num_warps=4, + ), + triton.Config( + { + "BLOCK_M": 256, + "BLOCK_N": 128, + "waves_per_eu": 2, + "PRE_LOAD_V": False, + }, + num_stages=1, + num_warps=8, + ), + triton.Config( + { + "BLOCK_M": 128, + "BLOCK_N": 64, + "waves_per_eu": 3, + "PRE_LOAD_V": True, + }, + num_stages=1, + num_warps=4, + ), + triton.Config( + { + "BLOCK_M": 128, + "BLOCK_N": 64, + "waves_per_eu": 3, + "PRE_LOAD_V": False, + }, + num_stages=1, + num_warps=4, + ), + triton.Config( + { + "BLOCK_M": 64, + "BLOCK_N": 64, + "waves_per_eu": 4, + "PRE_LOAD_V": False, + }, + num_stages=1, + num_warps=8, + ), + triton.Config( + { + "BLOCK_M": 32, + "BLOCK_N": 32, + "waves_per_eu": 4, + "PRE_LOAD_V": False, + }, + num_stages=1, + num_warps=8, + ), + # TODO: This config fails with head_size not pow2 with data mismatches. 
+ # triton.Config({'BLOCK_M': 32, 'BLOCK_N': 16, 'waves_per_eu': 1, + # 'PRE_LOAD_V': False}, num_stages=1, num_warps=4), + triton.Config( + { + "BLOCK_M": 16, + "BLOCK_N": 16, + "waves_per_eu": 1, + "PRE_LOAD_V": False, + }, + num_stages=1, + num_warps=4, + ), + triton.Config( + { + "BLOCK_M": 128, + "BLOCK_N": 64, + "waves_per_eu": 1, + "PRE_LOAD_V": False, + }, + num_stages=1, + num_warps=4, + ), + ], + key=["IS_CAUSAL", "dropout_p", "BLOCK_DMODEL"], +) +@triton.jit +def attn_fwd( + Q, + K, + V, + bias, + sm_scale, + L, + Out, + stride_qz, + stride_qh, + stride_qm, + stride_qk, + stride_kz, + stride_kh, + stride_kn, + stride_kk, + stride_vz, + stride_vh, + stride_vk, + stride_vn, + stride_oz, + stride_oh, + stride_om, + stride_on, + stride_bz, + stride_bh, + stride_bm, + stride_bn, + cu_seqlens_q, + cu_seqlens_k, + dropout_p, + philox_seed, + philox_offset_base, + encoded_softmax, + HQ: tl.constexpr, + HK: tl.constexpr, + ACTUAL_BLOCK_DMODEL: tl.constexpr, + MAX_SEQLENS_Q: tl.constexpr, + MAX_SEQLENS_K: tl.constexpr, + VARLEN: tl.constexpr, + IS_CAUSAL: tl.constexpr, + BLOCK_M: tl.constexpr, + BLOCK_DMODEL: tl.constexpr, + BLOCK_N: tl.constexpr, + PRE_LOAD_V: tl.constexpr, + BIAS_TYPE: tl.constexpr, + ENABLE_DROPOUT: tl.constexpr, + RETURN_ENCODED_SOFTMAX: tl.constexpr, +): + start_m = tl.program_id(0) + off_h_q = tl.program_id(1) + off_z = tl.program_id(2) + offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M) + offs_n = tl.arange(0, BLOCK_N) + if VARLEN: + cu_seqlens_q_start = tl.load(cu_seqlens_q + off_z) + cu_seqlens_q_end = tl.load(cu_seqlens_q + off_z + 1) + seqlen_q = cu_seqlens_q_end - cu_seqlens_q_start + # We have a one-size-fits-all grid in id(0). Some seqlens might be too + # small for all start_m so for those we return early. + if start_m * BLOCK_M > seqlen_q: + return + cu_seqlens_k_start = tl.load(cu_seqlens_k + off_z) + cu_seqlens_k_end = tl.load(cu_seqlens_k + off_z + 1) + seqlen_k = cu_seqlens_k_end - cu_seqlens_k_start + else: + cu_seqlens_q_start = 0 + cu_seqlens_k_start = 0 + seqlen_q = MAX_SEQLENS_Q + seqlen_k = MAX_SEQLENS_K + + # Now we compute whether we need to exit early due to causal masking. + # This is because for seqlen_q > seqlen_k, M rows of the attn scores + # are completely masked, resulting in 0s written to the output, and + # inf written to LSE. We don't need to do any GEMMs in this case. + # This block of code determines what N is, and if this WG is operating + # on those M rows. + n_blocks = cdiv_fn(seqlen_k, BLOCK_N) + if IS_CAUSAL: + # If seqlen_q == seqlen_k, the attn scores are a square matrix. + # If seqlen_q != seqlen_k, attn scores are rectangular which means + # the causal mask boundary is bottom right aligned, and ends at either + # the top edge (seqlen_q < seqlen_k) or left edge. + # This captures the decrease in n_blocks if we have a rectangular attn + # matrix + n_blocks_seqlen = cdiv_fn( + (start_m + 1) * BLOCK_M + seqlen_k - seqlen_q, BLOCK_N + ) + # This is what adjusts the block_max for the current WG, only + # if IS_CAUSAL. Otherwise we want to always iterate through all n_blocks + n_blocks = min(n_blocks, n_blocks_seqlen) + # If we have no blocks after adjusting for seqlen deltas, this WG is + # part of the blocks that are all 0. We exit early. 
+ if n_blocks <= 0: + o_offset = ( + off_z * stride_oz + cu_seqlens_q_start * stride_om + off_h_q * stride_oh + ) + O_block_ptr = tl.make_block_ptr( + base=Out + o_offset, + shape=(seqlen_q, BLOCK_DMODEL), + strides=(stride_om, stride_on), + offsets=(start_m * BLOCK_M, 0), + block_shape=(BLOCK_M, BLOCK_DMODEL), + order=(1, 0), + ) + acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=Out.type.element_ty) + # We still need to write 0s to the result + # tl.store(O_block_ptr, + # acc.to(Out.type.element_ty), boundary_check=(0,1)) + # l_ptrs = L + off_z * hq * MAX_SEQLENS_Q + off_h_q * MAX_SEQLENS_Q + # + offs_m + # We store inf to LSE, not -inf because in the bwd pass, + # we subtract this + # from qk which makes it -inf, such that exp(qk - inf) = 0 + # for these masked blocks. + # l = tl.full([BLOCK_M], value=float("inf"), dtype=tl.float32) + # tl.store(l_ptrs, l) + # TODO: Should dropout and return encoded softmax be handled here? + return + + # If MQA / GQA, set the K and V head offsets appropriately. + GROUP_SIZE: tl.constexpr = HQ // HK + if GROUP_SIZE != 1: + off_h_k = off_h_q // GROUP_SIZE + else: + off_h_k = off_h_q + + n_extra_tokens = 0 + if seqlen_k < BLOCK_N: + n_extra_tokens = BLOCK_N - seqlen_k + elif seqlen_k % BLOCK_N: + n_extra_tokens = seqlen_k % BLOCK_N + PADDED_HEAD: tl.constexpr = ACTUAL_BLOCK_DMODEL != BLOCK_DMODEL + + # Compute pointers for all the tensors used in this kernel. + q_offset = off_z * stride_qz + off_h_q * stride_qh + cu_seqlens_q_start * stride_qm + Q_block_ptr = tl.make_block_ptr( + base=Q + q_offset, + shape=(seqlen_q, ACTUAL_BLOCK_DMODEL), + strides=(stride_qm, stride_qk), + offsets=(start_m * BLOCK_M, 0), + block_shape=(BLOCK_M, BLOCK_DMODEL), + order=(1, 0), + ) + k_offset = off_z * stride_kz + off_h_k * stride_kh + cu_seqlens_k_start * stride_kn + K_block_ptr = tl.make_block_ptr( + base=K + k_offset, + shape=(ACTUAL_BLOCK_DMODEL, seqlen_k), + strides=(stride_kk, stride_kn), + offsets=(0, 0), + block_shape=(BLOCK_DMODEL, BLOCK_N), + order=(0, 1), + ) + v_offset = off_z * stride_vz + off_h_k * stride_vh + cu_seqlens_k_start * stride_vk + V_block_ptr = tl.make_block_ptr( + base=V + v_offset, + shape=(seqlen_k, ACTUAL_BLOCK_DMODEL), + strides=(stride_vk, stride_vn), + offsets=(0, 0), + block_shape=(BLOCK_N, BLOCK_DMODEL), + order=(1, 0), + ) + if BIAS_TYPE != 0: + bias_ptr = tl.make_block_ptr( + base=bias + off_h_q * stride_bh, + shape=(seqlen_q, seqlen_k), + strides=(stride_bm, stride_bn), + offsets=(start_m * BLOCK_M, 0), + block_shape=(BLOCK_M, BLOCK_N), + order=(1, 0), + ) + else: + bias_ptr = None + if ENABLE_DROPOUT: + batch_philox_offset = ( + philox_offset_base + (off_z * HQ + off_h_q) * seqlen_q * seqlen_k + ) + else: + batch_philox_offset = 0 + # We can ask to return the dropout mask without actually doing any dropout. + # In this case, we return an invalid pointer so indicate the mask is not i + # valid. + # TODO: Fix encoded softmax. It currently uses just h_q in the base offset. 
+ if RETURN_ENCODED_SOFTMAX: + encoded_softmax_block_ptr = tl.make_block_ptr( + base=encoded_softmax + off_h_q * seqlen_q * seqlen_k, + shape=(seqlen_q, seqlen_k), + strides=(seqlen_k, 1), + offsets=(start_m * BLOCK_M, 0), + block_shape=(BLOCK_M, BLOCK_N), + order=(1, 0), + ) + else: + encoded_softmax_block_ptr = 0 + # initialize pointer to m and l + m_i = tl.full([BLOCK_M], float("-inf"), dtype=tl.float32) + l_i = tl.full([BLOCK_M], 1.0, dtype=tl.float32) + acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32) + # scale sm_scale by log_2(e) and use 2^x in the loop as we do not + # have native e^x support in HW. + qk_scale = sm_scale * 1.44269504089 + # Q is loaded once at the beginning and shared by all N blocks. + q = load_fn(Q_block_ptr, True, PADDED_HEAD, "zero") + q = (q * qk_scale).to(Q_block_ptr.type.element_ty) + + # Here we compute how many full and masked blocks we have. + padded_block_k = n_extra_tokens != 0 + is_modulo_mn = not padded_block_k and (seqlen_q % BLOCK_M == 0) + if IS_CAUSAL: + # There are always at least BLOCK_M // BLOCK_N masked blocks. + # Additionally there might be one more due to dissimilar seqlens. + masked_blocks = BLOCK_M // BLOCK_N + (not is_modulo_mn) + else: + # Padding on Q does not need to be masked in the FA loop. + masked_blocks = padded_block_k + # if IS_CAUSAL, not is_modulo_mn does not always result in an additional + # block. In this case we might exceed n_blocks so pick the min. + masked_blocks = min(masked_blocks, n_blocks) + n_full_blocks = n_blocks - masked_blocks + block_min = 0 + block_max = n_blocks * BLOCK_N + # Compute for full blocks. Here we set causal to false regardless of its + # value because there is no masking. Similarly we do not need padding. + if n_full_blocks > 0: + block_max = (n_blocks - masked_blocks) * BLOCK_N + acc, l_i, m_i = _attn_fwd_inner( + acc, + l_i, + m_i, + q, + K_block_ptr, + V_block_ptr, + start_m, + seqlen_k, + dropout_p, + philox_seed, + batch_philox_offset, + encoded_softmax_block_ptr, + # _, _, offs_n_causal, masked_blocks, n_extra_tokens, _ + block_min, + block_max, + 0, + 0, + 0, + bias_ptr, + # IS_CAUSAL, .... + False, + BLOCK_M, + BLOCK_DMODEL, + BLOCK_N, + offs_m, + offs_n, + # _, MASK_STEPS, ... + PRE_LOAD_V, + False, + ENABLE_DROPOUT, + RETURN_ENCODED_SOFTMAX, + PADDED_HEAD, + ) + block_min = block_max + block_max = n_blocks * BLOCK_N + + tl.debug_barrier() + # Remaining blocks, if any, are full / not masked. + if masked_blocks > 0: + offs_n_causal = offs_n + (seqlen_q - seqlen_k) if IS_CAUSAL else 0 + K_block_ptr = tl.advance(K_block_ptr, (0, n_full_blocks * BLOCK_N)) + V_block_ptr = tl.advance(V_block_ptr, (n_full_blocks * BLOCK_N, 0)) + if bias_ptr is not None: + bias_ptr = tl.advance(bias_ptr, (0, n_full_blocks * BLOCK_N)) + if RETURN_ENCODED_SOFTMAX: + encoded_softmax_block_ptr = tl.advance( + encoded_softmax_block_ptr, (0, n_full_blocks) + ) + acc, l_i, m_i = _attn_fwd_inner( + acc, + l_i, + m_i, + q, + K_block_ptr, + V_block_ptr, + start_m, + seqlen_k, + dropout_p, + philox_seed, + batch_philox_offset, + encoded_softmax_block_ptr, + block_min, + block_max, + offs_n_causal, + masked_blocks, + n_extra_tokens, + bias_ptr, + IS_CAUSAL, + BLOCK_M, + BLOCK_DMODEL, + BLOCK_N, + offs_m, + offs_n, + # _, MASK_STEPS, ... 
+ PRE_LOAD_V, + True, + ENABLE_DROPOUT, + RETURN_ENCODED_SOFTMAX, + PADDED_HEAD, + ) + # epilogue + acc = acc / l_i[:, None] + if ENABLE_DROPOUT: + acc = acc / (1 - dropout_p) + # If seqlen_q > seqlen_k but the delta is not a multiple of BLOCK_M, + # then we have one block with a row of all NaNs which come from computing + # softmax over a row of all -infs (-inf - inf = NaN). We check for that here + # and store 0s where there are NaNs as these rows should've been zeroed out. + end_m_idx = (start_m + 1) * BLOCK_M + start_m_idx = start_m * BLOCK_M + causal_start_idx = seqlen_q - seqlen_k + acc = acc.to(Out.type.element_ty) + if IS_CAUSAL: # noqa: SIM102 + if causal_start_idx > start_m_idx and causal_start_idx < end_m_idx: + out_mask_boundary = tl.full( + (BLOCK_DMODEL,), causal_start_idx, dtype=tl.int32 + ) + mask_m_offsets = start_m_idx + tl.arange(0, BLOCK_M) + out_ptrs_mask = mask_m_offsets[:, None] >= out_mask_boundary[None, :] + z = 0.0 + acc = tl.where(out_ptrs_mask, acc, z.to(acc.type.element_ty)) + # write back LSE + # l_ptrs = L + off_z * hq * MAX_SEQLENS_Q + off_h_q * MAX_SEQLENS_Q + offs_m + # If seqlen_q not multiple of BLOCK_M, we need to mask out the last + # few rows. This is only true for the last M block. For others, + # overflow_size will be -ve + # overflow_size = end_m_idx - seqlen_q + # if overflow_size > 0: + # boundary = tl.full((BLOCK_M,), BLOCK_M - overflow_size, dtype=tl.int32) + # # This is a > check because mask being 0 blocks the store. + # l_ptrs_mask = boundary > tl.arange(0, BLOCK_M) + # tl.store(l_ptrs, m_i + tl.math.log2(l_i), mask=l_ptrs_mask) + # else: + # tl.store(l_ptrs, m_i + tl.math.log2(l_i)) + + # write back O + o_offset = off_z * stride_oz + cu_seqlens_q_start * stride_om + off_h_q * stride_oh + O_block_ptr = tl.make_block_ptr( + base=Out + o_offset, + shape=(seqlen_q, ACTUAL_BLOCK_DMODEL), + strides=(stride_om, stride_on), + offsets=(start_m * BLOCK_M, 0), + block_shape=(BLOCK_M, BLOCK_DMODEL), + order=(1, 0), + ) + # Need boundary check on this to make sure the padding from the + # Q and KV tensors in both dims are not part of what we store back. + # TODO: Do the boundary check optionally. 
+ tl.store(O_block_ptr, acc, boundary_check=(0, 1)) + + +def check_args( + q, + k, + v, + o, + varlen=True, + max_seqlens=None, + cu_seqlens_q=None, + cu_seqlens_k=None, +): + assert q.dim() == k.dim() and q.dim() == v.dim() + if varlen: + assert q.dim() == 3 + total_q, nheads_q, head_size = q.shape + total_k, nheads_k, _ = k.shape + assert cu_seqlens_q is not None + assert cu_seqlens_k is not None + assert len(cu_seqlens_q) == len(cu_seqlens_k) + else: + assert q.dim() == 4 + batch, nheads_q, seqlen_q, head_size = q.shape + _, nheads_k, seqlen_k, _ = k.shape + assert max_seqlens > 0 + assert k.shape == v.shape + assert q.shape[-1] == k.shape[-1] and q.shape[-1] == v.shape[-1] + # TODO: Change assert if we support qkl f8 and v f16 + assert q.dtype == k.dtype and q.dtype == v.dtype + # TODO: Fix assert to check head size <=256 once supported + assert head_size <= 128 + assert o.shape == q.shape + assert (nheads_q % nheads_k) == 0 + + +class _attention(torch.autograd.Function): + + @staticmethod + def forward( + ctx, + q, + k, + v, + o, + cu_seqlens_q, + cu_seqlens_k, + max_seqlens_q, + max_seqlens_k, + causal=False, + sm_scale=1.0, + bias=None, + ): + if o is None: + o = torch.empty_like(q, dtype=v.dtype) + + check_args( + q, + k, + v, + o, + varlen=True, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + ) + if True: # varlen + total_q, nheads_q, head_size = q.shape + total_k, nheads_k, _ = k.shape + batch = len(cu_seqlens_q) - 1 + q_strides = (0, q.stride(1), q.stride(0), q.stride(2)) + k_strides = (0, k.stride(1), k.stride(0), k.stride(2)) + v_strides = (0, v.stride(1), v.stride(0), v.stride(2)) + o_strides = (0, o.stride(1), o.stride(0), o.stride(2)) + else: + batch, seqlen_q, nheads_q, head_size = q.shape + _, seqlen_k, nheads_k, _ = k.shape + q_strides = (q.stride(0), q.stride(2), q.stride(1), q.stride(3)) + k_strides = (k.stride(0), k.stride(2), k.stride(1), k.stride(3)) + v_strides = (v.stride(0), v.stride(2), v.stride(1), v.stride(3)) + o_strides = (o.stride(0), o.stride(2), o.stride(1), o.stride(3)) + + # Get closest power of 2 over or equal to 32. + padded_d_model = 1 << (head_size - 1).bit_length() + padded_d_model = max(padded_d_model, 16) + + def grid(META): + return triton.cdiv(max_seqlens_q, META["BLOCK_M"]), nheads_q, batch + + encoded_softmax = None + + # Seed the RNG so we get reproducible results for testing. 
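+        # The values below are arbitrary fixed constants; dropout is disabled in
+        # this forward pass (dropout_p=0.0, ENABLE_DROPOUT=False), so they only
+        # matter if dropout support is wired up later.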
+ philox_seed = 0x1BF52 + philox_offset = 0x1D4B42 + + if bias is not None: + bias_strides = ( + bias.stride(0), + bias.stride(1), + bias.stride(2), + bias.stride(3), + ) + else: + bias_strides = (0, 0, 0, 0) + + attn_fwd[grid]( + q, + k, + v, + bias, + sm_scale, + None, + o, + *q_strides, + *k_strides, + *v_strides, + *o_strides, + *bias_strides, + cu_seqlens_q, + cu_seqlens_k, + dropout_p=0.0, + philox_seed=philox_seed, + philox_offset_base=philox_offset, + encoded_softmax=encoded_softmax, + HQ=nheads_q, + HK=nheads_k, + ACTUAL_BLOCK_DMODEL=head_size, + MAX_SEQLENS_Q=max_seqlens_q, + MAX_SEQLENS_K=max_seqlens_k, + IS_CAUSAL=causal, + VARLEN=True, + BLOCK_DMODEL=padded_d_model, + BIAS_TYPE=0 if bias is None else 1, + ENABLE_DROPOUT=False, + RETURN_ENCODED_SOFTMAX=False, + ) + + ctx.grid = grid + ctx.sm_scale = sm_scale + ctx.BLOCK_DMODEL = head_size + ctx.causal = causal + ctx.dropout_p = 0.0 + ctx.philox_seed = philox_seed + ctx.philox_offset = philox_offset + ctx.encoded_softmax = encoded_softmax + ctx.return_encoded_softmax = False + return o, encoded_softmax + + +triton_attention = _attention.apply diff --git a/backends/gaudi/server/text_generation_server/layers/attention/flashinfer.py b/backends/gaudi/server/text_generation_server/layers/attention/flashinfer.py new file mode 100644 index 000000000..d603c6f5f --- /dev/null +++ b/backends/gaudi/server/text_generation_server/layers/attention/flashinfer.py @@ -0,0 +1,251 @@ +from typing import Optional +from contextvars import ContextVar +from contextlib import contextmanager + +import flashinfer +import torch + +prefill_state: ContextVar[flashinfer.BatchPrefillWithRaggedKVCacheWrapper] = ContextVar( + "prefill_state" +) + +prefill_with_paged_kv_state: ContextVar[ + flashinfer.BatchPrefillWithPagedKVCacheWrapper +] = ContextVar("prefill_with_paged_kv_state") + +decode_state: ContextVar[flashinfer.BatchDecodeWithPagedKVCacheWrapper] = ContextVar( + "decode_state" +) + +workspace: Optional[torch.Tensor] = None + + +def get_workspace(device): + """Get shared flashinfer workspace.""" + global workspace + if workspace is None: + workspace = torch.empty(128 * 1024 * 1024, dtype=torch.uint8, device=device) + return workspace + + +def create_prefill_with_paged_kv_state( + *, + device: torch.device, +): + """Create a prefill state that uses the KV cache.""" + workspace_buffer = get_workspace(device) + return flashinfer.BatchPrefillWithPagedKVCacheWrapper( + workspace_buffer, kv_layout="NHD", use_cuda_graph=False + ) + + +@contextmanager +def use_prefill_with_paged_kv_state( + *, + state: flashinfer.BatchPrefillWithPagedKVCacheWrapper, + block_tables: torch.Tensor, + cu_seqlens: torch.Tensor, + input_lengths: torch.Tensor, + num_heads: int, + num_kv_heads: int, + head_size: int, + page_size: int, + dtype: torch.dtype, + window_left: int, +): + """ + Context manager to set the active flashinfer prefill state to the given + `state` and parameters. This state will be used by all calls to the + `attention` function while the context manager is active. + """ + + indptr = torch.zeros( + input_lengths.shape[0] + 1, device=input_lengths.device, dtype=torch.int32 + ) + # Round up to page size and then calculate the cumulative sum to get + # the indices into the block table. + torch.add(input_lengths, page_size - 1, out=indptr[1:]) + indptr[1:].div_(page_size, rounding_mode="floor") + indptr[1:].cumsum_(-1) + + # Get the lengths of the last page in a block. 
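+    # last_page_len = ((input_lengths - 1) % page_size) + 1, i.e. a sequence
+    # whose length is an exact multiple of page_size reports a full last page
+    # (page_size) rather than 0.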
+ if page_size == 1: + last_page_len = torch.ones( + input_lengths.shape[0], dtype=torch.int32, device=input_lengths.device + ) + else: + last_page_len = torch.empty( + input_lengths.shape[0], dtype=torch.int32, device=input_lengths.device + ) + torch.sub(input_lengths, 1, out=last_page_len) + last_page_len.remainder_(page_size) + last_page_len += 1 + + token = prefill_with_paged_kv_state.set(state) + try: + state.begin_forward( + qo_indptr=cu_seqlens, + paged_kv_indptr=indptr, + paged_kv_indices=block_tables, + paged_kv_last_page_len=last_page_len, + num_qo_heads=num_heads, + num_kv_heads=num_kv_heads, + head_dim=head_size, + q_data_type=dtype, + page_size=page_size, + window_left=window_left, + ) + yield + finally: + state.end_forward() + if token is not None: + prefill_with_paged_kv_state.reset(token) + + +def create_prefill_state( + *, + device: torch.device, +): + """Create a prefill state.""" + workspace_buffer = get_workspace(device) + return flashinfer.BatchPrefillWithRaggedKVCacheWrapper( + workspace_buffer, kv_layout="NHD", use_cuda_graph=False + ) + + +@contextmanager +def use_prefill_state( + *, + state: flashinfer.BatchPrefillWithRaggedKVCacheWrapper, + cu_seqlens: torch.Tensor, + num_heads: int, + num_kv_heads: int, + head_size: int, + dtype: torch.dtype, + window_left: int, +): + """ + Context manager to set the active flashinfer prefill state to the given + `state` and parameters. This state will be used by all calls to the + `attention` function while the context manager is active. + """ + + token = prefill_state.set(state) + try: + state.begin_forward( + qo_indptr=cu_seqlens, + kv_indptr=cu_seqlens, + num_qo_heads=num_heads, + num_kv_heads=num_kv_heads, + head_dim=head_size, + q_data_type=dtype, + window_left=window_left, + ) + yield + finally: + state.end_forward() + if token is not None: + prefill_state.reset(token) + + +def create_decode_state( + *, + device: torch.device, + num_heads: int, + num_kv_heads: int, +): + """Create a decode state.""" + workspace_buffer = get_workspace(device) + num_groups = num_heads // num_kv_heads + return flashinfer.BatchDecodeWithPagedKVCacheWrapper( + workspace_buffer, + kv_layout="NHD", + use_cuda_graph=False, + # Taken from https://github.com/flashinfer-ai/flashinfer/blob/33ef95700981ba70f4cab63b8931e562bc795b21/python/flashinfer/decode.py#L57-L60 + use_tensor_cores=num_groups not in [1, 2, 4, 8], + ) + + +def create_decode_state_cuda_graphs( + *, + device: torch.device, + block_tables: torch.Tensor, + block_tables_ptr: torch.Tensor, + last_page_len: torch.Tensor, + num_heads: int, + num_kv_heads: int, +): + """ + Create a decode state for use with CUDA Graphs. `block_tables`, + `block_tables_ptr`, and `last_page_len` are used in CUDA Graphs and are + therefore stored as part of the state. 
+ """ + workspace_buffer = get_workspace(device) + num_groups = num_heads // num_kv_heads + return flashinfer.BatchDecodeWithPagedKVCacheWrapper( + workspace_buffer, + kv_layout="NHD", + use_cuda_graph=True, + paged_kv_indices_buffer=block_tables, + paged_kv_indptr_buffer=block_tables_ptr, + paged_kv_last_page_len_buffer=last_page_len, + # Taken from https://github.com/flashinfer-ai/flashinfer/blob/33ef95700981ba70f4cab63b8931e562bc795b21/python/flashinfer/decode.py#L57-L60 + use_tensor_cores=num_groups not in [1, 2, 4, 8], + ) + + +@contextmanager +def use_decode_state( + *, + state: flashinfer.BatchDecodeWithPagedKVCacheWrapper, + input_lengths: torch.Tensor, + block_tables: torch.Tensor, + num_heads: int, + num_kv_heads: int, + head_size: int, + page_size: int, + dtype: torch.dtype, + window_left: int, +): + """ + Context manager to set the active flashinfer decoding state to the given + `state` and parameters. This state will be used by all calls to the + `paged_attention` function while the context manager is active. + """ + indptr = torch.zeros( + input_lengths.shape[0] + 1, device=input_lengths.device, dtype=torch.int32 + ) + # Round up to page size and then calculate the cumulative sum to get + # the indices into the block table. + torch.add(input_lengths, page_size - 1, out=indptr[1:]) + indptr[1:].div_(page_size, rounding_mode="floor") + indptr[1:].cumsum_(-1) + + # Get the lengths of the last page in a block. + last_page_len = torch.empty( + input_lengths.shape[0], dtype=torch.int32, device=input_lengths.device + ) + torch.sub(input_lengths, 1, out=last_page_len) + last_page_len.remainder_(page_size) + last_page_len += 1 + + token = decode_state.set(state) + + try: + state.begin_forward( + indptr=indptr, + indices=block_tables, + last_page_len=last_page_len, + num_qo_heads=num_heads, + num_kv_heads=num_kv_heads, + head_dim=head_size, + page_size=page_size, + data_type=dtype, + q_data_type=dtype, + window_left=window_left, + ) + yield + finally: + state.end_forward() + if token is not None: + decode_state.reset(token) diff --git a/backends/gaudi/server/text_generation_server/layers/attention/ipex.py b/backends/gaudi/server/text_generation_server/layers/attention/ipex.py new file mode 100644 index 000000000..657c90af4 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/layers/attention/ipex.py @@ -0,0 +1,82 @@ +import intel_extension_for_pytorch as ipex +import torch +from text_generation_server.models.flash_causal_lm import BLOCK_SIZE +from text_generation_server.layers.attention import Seqlen +from typing import Optional + +SUPPORTS_WINDOWING = False +PREFILL_IN_KV_CACHE = False + + +def attention( + q: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + seqlen: Seqlen, + block_tables: torch.Tensor, + softmax_scale, + window_size_left=-1, + causal=True, + softcap: Optional[float] = None, +): + out = torch.empty_like(q) + + # We do not need to check window_size_left (not supported) here, so it is already checked ahead of time at model load. 
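+    # The same cu_seqlen_q / max_q are passed for both query and key here,
+    # presumably because this prefill path runs over the freshly computed
+    # keys/values (PREFILL_IN_KV_CACHE is False), which have the same lengths
+    # as the queries.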
+ ipex.llm.functional.varlen_attention( + q.contiguous() if q.device.type == "xpu" else q, + key_cache.contiguous() if key_cache.device.type == "xpu" else key_cache, + value_cache.contiguous() if value_cache.device.type == "xpu" else value_cache, + out, + seqlen.cu_seqlen_q, + seqlen.cu_seqlen_q, + seqlen.max_q, + seqlen.max_q, + 0.0, + softmax_scale, + False, + causal, + False, + None, + ) + + return out + + +def reshape_and_cache( + key: torch.Tensor, + value: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + slots: torch.Tensor, +): + ipex.llm.modules.PagedAttention.reshape_and_cache( + key, value, key_cache, value_cache, slots + ) + + +def paged_attention( + query: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + kv_head_mapping: torch.Tensor, + softmax_scale: float, + block_tables: torch.Tensor, + seqlen: Seqlen, + max_s: int, + softcap: Optional[float] = None, +): + out = torch.empty_like(query) + ipex.llm.modules.PagedAttention.single_query_cached_kv_attention( + out, + query, + key_cache, + value_cache, + kv_head_mapping, + softmax_scale, + block_tables, + seqlen.input_lengths, + BLOCK_SIZE, + max_s, + None, + ) + return out diff --git a/backends/gaudi/server/text_generation_server/layers/attention/rocm.py b/backends/gaudi/server/text_generation_server/layers/attention/rocm.py new file mode 100644 index 000000000..646a763d3 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/layers/attention/rocm.py @@ -0,0 +1,308 @@ +import os +from typing import Optional +import torch +from text_generation_server.utils.import_utils import SYSTEM +from text_generation_server.models.globals import ATTENTION +from text_generation_server.layers.attention import Seqlen +from text_generation_server.utils.log import log_master +from loguru import logger + +major, minor = torch.cuda.get_device_capability() +is_sm75 = major == 7 and minor == 5 + +_PARTITION_SIZE_V1V2 = 512 +_PARTITION_SIZE_CUSTOM = 256 + +use_triton = os.getenv("ROCM_USE_FLASH_ATTN_V2_TRITON", "").lower() in {"true", "1"} +ENGINE = "triton" if use_triton else "ck" + +PREFILL_IN_KV_CACHE = False + +use_rocm_custom_paged_attn = os.getenv("ROCM_USE_CUSTOM_PAGED_ATTN", "1") != "0" +try: + if use_rocm_custom_paged_attn: + from vllm._custom_C import paged_attention_custom +except ImportError as e: + log_master( + logger.info, + f"Custom Paged Attention not available. Complete error: {e}", + ) + use_rocm_custom_paged_attn = False + +try: + import vllm._custom_ops as ops +except Exception as e: + raise ImportError( + f"Could not import vllm paged attention. Make sure your installation is correct. Complete error: {e}" + ) + + +def reshape_and_cache( + key: torch.Tensor, + value: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + slots: torch.Tensor, +): + if ATTENTION == "flashdecoding": + shape = key_cache.shape + key_cache.view(-1, shape[-2], shape[-1])[slots] = key + value_cache.view(-1, shape[-2], shape[-1])[slots] = value + else: + ops.reshape_and_cache(key, value, key_cache, value_cache, slots, "auto", 1.0) + + +def paged_attention( + query: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + kv_head_mapping: torch.Tensor, + softmax_scale: float, + block_tables: torch.Tensor, + seqlen: Seqlen, + max_s: int, + softcap: Optional[float] = None, +): + # Adapted from: https://github.com/vllm-project/vllm/blob/f8a1e39fae05ca610be8d5a78be9d40f5274e5fc/vllm/model_executor/layers/attention.py + # Copyright 2023 The vLLM team. All rights + # reserved. 
+ # + # Licensed under the Apache License, Version 2.0 (the "License"); + # you may not use this file except in compliance with the License. + # You may obtain a copy of the License at + # + # http://www.apache.org/licenses/LICENSE-2.0 + # + # Unless required by applicable law or agreed to in writing, software + # distributed under the License is distributed on an "AS IS" BASIS, + # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + # See the License for the specific language governing permissions and + # limitations under the License. + # + + if softcap is not None: + raise RuntimeError("Paged attention doesn't support softcapping") + + # value_cache => [num_blocks, num_heads, head_size, block_size] + block_size = value_cache.shape[3] + num_seqs, num_heads, head_size = query.shape + + num_kv_heads = key_cache.shape[1] + gqa_ratio = num_heads // num_kv_heads + use_custom = ( + use_rocm_custom_paged_attn + and (query.dtype == torch.half or query.dtype == torch.bfloat16) + and (head_size == 128 or head_size == 64) + and (block_size == 16 or block_size == 32) + and (gqa_ratio >= 1 and gqa_ratio <= 16) + and max_s <= 32768 + ) + + if not use_custom: + _PARTITION_SIZE = _PARTITION_SIZE_V1V2 + else: + _PARTITION_SIZE = _PARTITION_SIZE_CUSTOM + + max_num_partitions = (max_s + _PARTITION_SIZE - 1) // _PARTITION_SIZE + input_lengths = seqlen.input_lengths + + out = torch.empty_like(query) + + # NOTE(woosuk): We use a simple heuristic to decide whether to use + # PagedAttention V1 or V2. If the number of partitions is 1, we use + # V1 to avoid the overhead of reduction. Also, if the number of + # sequences or heads is large, we use V1 since there is enough work + # to parallelize. + import vllm._custom_ops as ops + + use_v1 = ( + max_s <= 8192 + and (max_num_partitions == 1 or num_seqs * num_heads > 512) + and not use_custom + ) + if use_v1: + ops.paged_attention_v1( + out, + query, + key_cache, + value_cache, + kv_head_mapping, + softmax_scale, + block_tables, + input_lengths, + block_size, + max_s, + None, + "auto", + 1.0, + ) + else: + # Run PagedAttention V2. 
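+        # V2 splits each sequence into max_num_partitions chunks of
+        # _PARTITION_SIZE tokens, writes partial results into tmp_output with
+        # per-partition exp_sums / max_logits, and reduces them afterwards.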
+ assert _PARTITION_SIZE % block_size == 0 + tmp_output = torch.empty( + size=(num_seqs, num_heads, max_num_partitions, head_size), + dtype=out.dtype, + device=out.device, + ) + exp_sums = torch.empty( + size=(num_seqs, num_heads, max_num_partitions), + dtype=torch.float32, + device=out.device, + ) + max_logits = torch.empty_like(exp_sums) + + if not use_custom: + ops.paged_attention_v2( + out, + exp_sums, + max_logits, + tmp_output, + query, + key_cache, + value_cache, + kv_head_mapping, + softmax_scale, + block_tables, + input_lengths, + block_size, + max_s, + None, + "auto", + 1.0, + ) + else: + paged_attention_custom( + out, + exp_sums, + max_logits, + tmp_output, + query, + key_cache, + value_cache, + num_kv_heads, + softmax_scale, + block_tables, + input_lengths, + block_size, + max_s, + None, + "auto", + ) + + return out + + +if ENGINE != "triton": + try: + import flash_attn_2_cuda + + log_master( + logger.info, + "ROCm: using Flash Attention 2 Composable Kernel implementation.", + ) + except ImportError as e: + if major >= 8: + architecture_suffix = f"-{SYSTEM}" + raise ImportError( + "Flash Attention V2 is not installed.\n" + "Use the official Docker image (ghcr.io/huggingface/text-generation-inference:latest) " + f"or install flash attention v2 with `cd server && make install install-flash-attention-v2{architecture_suffix}`" + ) + elif is_sm75: + raise ImportError( + "Flash Attention is not installed.\n" + "Use the official Docker image (ghcr.io/huggingface/text-generation-inference:latest) " + "or install flash attention with `cd server && make install install-flash-attention`" + ) from e + else: + for idx in range(torch.cuda.device_count()): + name = torch.cuda.get_device_name(idx) + if "MI210" not in name and "MI250" not in name: + raise ImportError( + f"AMD GPU {torch.cuda.get_device_name(idx)} does not support flash-attention" + ) + raise ImportError( + f"AMD GPU with ROCm capability {major} {minor} is not supported" + ) from e + + +SUPPORTS_WINDOWING = False +if ENGINE == "ck": + + def attention( + q, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + seqlen: Seqlen, + block_tables: torch.Tensor, + softmax_scale: float, + window_size_left: int = -1, + causal: bool = True, + softcap: float = 0.0, + ): + if window_size_left <= 0 and window_size_left != -1: + raise ValueError("`window_size_left` must be > 0 or -1") + + out = torch.empty_like(q) + + # We do not need to check window_size_left (not supported) here, so it is already checked ahead of time at model load. + return flash_attn_2_cuda.varlen_fwd( + q, + key_cache, + value_cache, + out, + seqlen.cu_seqlen_q, + seqlen.cu_seqlen_q, + None, + None, + None, + None, + seqlen.max_q, + seqlen.max_k, + 0.0, + softmax_scale, + False, + causal, + window_size_left, + 0, + softcap, + False, + None, + )[0] + +elif ENGINE == "triton": + from .flash_attn_triton import triton_attention + + def attention( + q, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + seqlen: Seqlen, + block_tables: torch.Tensor, + softmax_scale: float, + window_size_left: int = -1, + causal: bool = True, + softcap: Optional[float] = None, + ): + if softcap is not None: + raise NotImplementedError("softcap is only available with CK flash attn") + + out = torch.empty_like(q) + + # We do not need to check window_size_left (not supported) here, so it is already checked ahead of time at model load. 
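+        # triton_attention returns (attention_output, encoded_softmax); only the
+        # first element is used.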
+ output, _ = triton_attention( + q, + key_cache, + value_cache, + out, + seqlen.cu_seqlen_q, + seqlen.cu_seqlen_q, + seqlen.max_q, + seqlen.max_k, + causal, + softmax_scale, + ) + return output + +else: + raise RuntimeError(f"Unknown attention engine {ENGINE}") diff --git a/backends/gaudi/server/text_generation_server/layers/awq/conversion_utils.py b/backends/gaudi/server/text_generation_server/layers/awq/conversion_utils.py new file mode 100644 index 000000000..b19eafbbe --- /dev/null +++ b/backends/gaudi/server/text_generation_server/layers/awq/conversion_utils.py @@ -0,0 +1,97 @@ +import torch +from typing import List + + +AWQ_PACK_ORDER = [0, 2, 4, 6, 1, 3, 5, 7] +REVERSE_AWQ_PACK_ORDER = [0, 4, 1, 5, 2, 6, 3, 7] + + +def pack(imatrix: torch.Tensor, direction: str = "column"): + """ + Packs a 4-bit integer matrix into a packed 32-bit integer matrix. + Args: + imatrix (torch.Tensor): matrix of integers + direction (str): direction of packing, either "column" or "row" + Returns: + qmatrix (torch.Tensor): packed matrix of integers + """ + shifts = torch.arange(0, 32, 4, dtype=torch.int32, device=imatrix.device) + + imatrix = imatrix.to(torch.int8) & 0x0F # eventually correct overflow + + if direction == "column": + imatrix = imatrix.view(-1, imatrix.shape[1] // (32 // 4), (32 // 4)) + qmatrix = torch.bitwise_left_shift(imatrix, shifts[None, None, :]).sum(dim=-1) + + elif direction == "row": + imatrix = imatrix.view(imatrix.shape[0] // (32 // 4), (32 // 4), -1) + qmatrix = torch.bitwise_left_shift(imatrix, shifts[None, :, None]).sum(dim=1) + + qmatrix = qmatrix.to(torch.int32) + + return qmatrix + + +def unpack(qmatrix: torch.Tensor, direction: str = "column"): + """ + Unpacks a 32-bit packed integer matrix into a 4-bit integer matrix. + Args: + qmatrix (torch.Tensor): matrix of packed integers + direction (str): direction of unpacking, either "column" or "row" + Returns: + imatrix (torch.Tensor): matrix of integers + """ + shifts = torch.arange(0, 32, 4, device=qmatrix.device) + + if direction == "column": + imatrix = torch.bitwise_right_shift( + qmatrix[:, :, None], shifts[None, None, :] + ).view(qmatrix.shape[0], -1) + + elif direction == "row": + imatrix = torch.bitwise_right_shift( + qmatrix[:, None, :], shifts[None, :, None] + ).view(-1, qmatrix.shape[-1]) + + imatrix = imatrix.to(torch.int8) & 0x0F # eventually correct overflow + + return imatrix + + +def apply_order( + imatrix: torch.Tensor, + direction: str = "column", + order: List[int] = AWQ_PACK_ORDER, +): + """ + Applies the order to a 4-bit integer matrix. 
+ Args: + imatrix (torch.Tensor): matrix of integers + direction (str): direction of applying order, either "column" or "row" + order (List[int]): order to apply, default is AWQ_PACK_ORDER + Returns: + imatrix (torch.Tensor): matrix of integers + """ + if direction == "column": + imatrix = imatrix.view(-1, (32 // 4))[:, order].view(imatrix.shape) + elif direction == "row": + imatrix = imatrix.view((32 // 4), -1)[order, :].view(imatrix.shape) + + return imatrix + + +def fast_awq_to_gptq(qweight, qzeros): + # awq uses column packing for both weights and zeros + izeros = unpack(qzeros, direction="column") + iweights = unpack(qweight, direction="column") + + # Reverse the order of the iweight and izeros tensors + izeros = apply_order(izeros, direction="column", order=REVERSE_AWQ_PACK_ORDER) + iweights = apply_order(iweights, direction="column", order=REVERSE_AWQ_PACK_ORDER) + # Subtract 1 from the izeros tensor (gptq adds 1 to the zeros) + izeros = izeros - 1 + # exllama uses row packing for weights and column packing for zeros + qzeros = pack(izeros, direction="column") + qweight = pack(iweights, direction="row") + + return qweight, qzeros diff --git a/backends/gaudi/server/text_generation_server/layers/awq/quantize/qmodule.py b/backends/gaudi/server/text_generation_server/layers/awq/quantize/qmodule.py new file mode 100644 index 000000000..391371a55 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/layers/awq/quantize/qmodule.py @@ -0,0 +1,49 @@ +# Copied logic from https://github.com/mit-han-lab/llm-awq/blob/f084f40bd996f3cf3a0633c1ad7d9d476c318aaa/awq/quantize/qmodule.py + +from typing import Optional +import torch +import torch.nn as nn +import awq_inference_engine # with CUDA kernels + + +# class ScaledActivation(nn.Module): +# def __init__(self, module, scales): +# super().__init__() +# self.act = module +# self.scales = nn.Parameter(scales.data) +# +# def forward(self, x): +# return self.act(x) / self.scales.view(1, 1, -1).to(x.device) + + +class WQLinear(nn.Module): + def __init__( + self, w_bit, group_size, qweight, qzeros, scales, bias: Optional[torch.Tensor] + ): + super().__init__() + + if w_bit not in [4]: + raise NotImplementedError("Only 4-bit are supported for now.") + + self.in_features = qweight.shape[0] + self.out_features = qweight.shape[1] * 32 // w_bit + + self.w_bit = w_bit + self.group_size = group_size if group_size != -1 else self.in_features + # quick sanity check (make sure aligment) + assert self.in_features % self.group_size == 0 + assert self.out_features % (32 // self.w_bit) == 0 + + self.qweight = qweight + self.qzeros = qzeros + self.scales = scales + self.bias = bias + + @torch.no_grad() + def forward(self, x): + out_shape = x.shape[:-1] + (self.out_features,) + out = awq_inference_engine.gemm_forward_cuda( + x.reshape(-1, x.shape[-1]), self.qweight, self.scales, self.qzeros, 8 + ) + out = out + self.bias if self.bias is not None else out + return out.reshape(out_shape) diff --git a/backends/gaudi/server/text_generation_server/layers/bnb.py b/backends/gaudi/server/text_generation_server/layers/bnb.py new file mode 100644 index 000000000..791d9b6d8 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/layers/bnb.py @@ -0,0 +1,124 @@ +from dataclasses import dataclass + +import bitsandbytes as bnb +import torch +from bitsandbytes.nn import Int8Params, Params4bit +from text_generation_server.utils.weights import UnquantizedWeight + + +@dataclass +class BNBWeight(UnquantizedWeight): + weight: torch.Tensor + + def get_linear(self, bias: 
torch.Tensor): + return Linear8bitLt(self.weight, bias, has_fp16_weights=False, threshold=6.0) + + +class Linear8bitLt(torch.nn.Module): + def __init__( + self, + weight, + bias, + has_fp16_weights=True, + memory_efficient_backward=False, + threshold=0.0, + index=None, + ): + super().__init__() + assert ( + not memory_efficient_backward + ), "memory_efficient_backward is no longer required and the argument is deprecated in 0.37.0 and will be removed in 0.39.0" + self.state = bnb.MatmulLtState() + self.index = index + + # Necessary for stacked layers + self.state.threshold = threshold + self.state.has_fp16_weights = has_fp16_weights + self.state.memory_efficient_backward = memory_efficient_backward + if threshold > 0.0 and not has_fp16_weights: + self.state.use_pool = True + + self.weight = Int8Params( + weight.data, + has_fp16_weights=has_fp16_weights, + requires_grad=has_fp16_weights, + ) + self.weight.cuda(weight.device) + self.bias = bias + + def init_8bit_state(self): + self.state.CB = self.weight.CB + self.state.SCB = self.weight.SCB + self.weight.CB = None + self.weight.SCB = None + + def forward(self, x: torch.Tensor): + self.state.is_training = self.training + if self.weight.CB is not None: + self.init_8bit_state() + + # weights are cast automatically as Int8Params, but the bias has to be cast manually + if self.bias is not None and self.bias.dtype != x.dtype: + self.bias.data = self.bias.data.to(x.dtype) + + out = bnb.matmul(x, self.weight, bias=self.bias, state=self.state) + + if not self.state.has_fp16_weights: + if self.state.CB is not None and self.state.CxB is not None: + # we converted 8-bit row major to turing/ampere format in the first inference pass + # we no longer need the row-major weight + del self.state.CB + self.weight.data = self.state.CxB + return out + + +@dataclass +class BNBFP4Weight(UnquantizedWeight): + weight: torch.Tensor + + def get_linear(self, bias: torch.Tensor): + return Linear4bit(self.weight, bias, quant_type="fp4") + + +@dataclass +class BNBNF4Weight(UnquantizedWeight): + weight: torch.Tensor + + def get_linear(self, bias: torch.Tensor): + return Linear4bit(self.weight, bias, quant_type="nf4") + + +class Linear4bit(torch.nn.Module): + def __init__(self, weight, bias, quant_type): + super().__init__() + self.weight = Params4bit( + weight.data, + requires_grad=False, + compress_statistics=True, + quant_type=quant_type, + ) + self.compute_dtype = None + self.weight.cuda(weight.device) + self.bias = bias + + def forward(self, x: torch.Tensor): + # weights are cast automatically as Int8Params, but the bias has to be cast manually + if self.bias is not None and self.bias.dtype != x.dtype: + self.bias.data = self.bias.data.to(x.dtype) + + if getattr(self.weight, "quant_state", None) is None: + print( + "FP4 quantization state not initialized. Please call .cuda() or .to(device) on the LinearFP4 layer first." 
+ ) + inp_dtype = x.dtype + if self.compute_dtype is not None: + x = x.to(self.compute_dtype) + + bias = None if self.bias is None else self.bias.to(self.compute_dtype) + out = bnb.matmul_4bit( + x, self.weight.t(), bias=bias, quant_state=self.weight.quant_state + ) + + out = out.to(inp_dtype) + + return out diff --git a/backends/gaudi/server/text_generation_server/layers/conv.py b/backends/gaudi/server/text_generation_server/layers/conv.py new file mode 100644 index 000000000..7fb18ab3f --- /dev/null +++ b/backends/gaudi/server/text_generation_server/layers/conv.py @@ -0,0 +1,41 @@ +from accelerate import init_empty_weights +import torch + + +@classmethod +def load_conv2d(cls, prefix, weights, in_channels, out_channels, kernel_size, stride): + weight = weights.get_tensor(f"{prefix}.weight") + bias = weights.get_tensor(f"{prefix}.bias") + with init_empty_weights(): + conv2d = cls( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + ) + + conv2d.weight = torch.nn.Parameter(weight) + conv2d.bias = torch.nn.Parameter(bias) + return conv2d + + +@classmethod +def load_conv2d_no_bias( + cls, prefix, weights, in_channels, out_channels, kernel_size, stride +): + weight = weights.get_tensor(f"{prefix}.weight") + with init_empty_weights(): + conv2d = cls( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + ) + + conv2d.weight = torch.nn.Parameter(weight) + conv2d.bias = None + return conv2d + + +torch.nn.Conv2d.load = load_conv2d +torch.nn.Conv2d.load_no_bias = load_conv2d_no_bias diff --git a/backends/gaudi/server/text_generation_server/layers/eetq.py b/backends/gaudi/server/text_generation_server/layers/eetq.py new file mode 100644 index 000000000..b1e5235a0 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/layers/eetq.py @@ -0,0 +1,43 @@ +from dataclasses import dataclass + +import torch +from EETQ import quant_weights, w8_a16_gemm +from text_generation_server.utils.weights import UnquantizedWeight + + +@dataclass +class EETQWeight(UnquantizedWeight): + weight: torch.Tensor + + def get_linear(self, bias: torch.Tensor): + try: + from text_generation_server.layers.eetq import EETQLinear + + return EETQLinear(self.weight, bias) + except ImportError: + raise ImportError( + "Please install EETQ from https://github.com/NetEase-FuXi/EETQ" + ) + + +class EETQLinear(torch.nn.Module): + def __init__( + self, + weight, + bias, + ) -> None: + super().__init__() + device = weight.device + if weight.dtype != torch.float16: + weight = weight.to(dtype=torch.float16) + weight = torch.t(weight).contiguous().cpu() + weight, scale = quant_weights(weight, torch.int8, False) + + self.weight = weight.cuda(device) + self.scale = scale.cuda(device) + self.bias = bias.cuda(device) if bias is not None else None + + def forward(self, input: torch.Tensor) -> torch.Tensor: + output = w8_a16_gemm(input, self.weight, self.scale) + output = output + self.bias if self.bias is not None else output + return output diff --git a/backends/gaudi/server/text_generation_server/layers/exl2.py b/backends/gaudi/server/text_generation_server/layers/exl2.py new file mode 100644 index 000000000..a6e07f453 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/layers/exl2.py @@ -0,0 +1,78 @@ +from dataclasses import dataclass +from typing import List, Union + +import torch +from text_generation_server.utils.weights import Weight, Weights, WeightsLoader + + +@dataclass +class Exl2Weight(Weight): + """ + Exllama2 exl2 
quantized weights. + """ + + q_weight: torch.Tensor + q_scale: torch.Tensor + q_invperm: torch.Tensor + q_scale_max: torch.Tensor + q_groups: torch.Tensor + + def __post_init__(self): + self.q_scale_max /= 256 + self.q_invperm = self.q_invperm.short() + + @property + def device(self) -> torch.device: + return self.q_weight.device + + def get_linear(self, bias: torch.Tensor): + from text_generation_server.layers.gptq import ExllamaQuantLinear + + return ExllamaQuantLinear(self, bias) + + +class Exl2WeightsLoader(WeightsLoader): + """Loader for exl2-quantized weights.""" + + def get_weights(self, weights: "Weights", prefix: str): + """ + Get weights at the given prefix and apply without tensor paralllism. + """ + try: + q_weight = weights.get_tensor(f"{prefix}.q_weight") + except RuntimeError: + raise RuntimeError( + "Cannot load `exl2`-quantized weight, make sure the model is already quantized." + ) + + q_scale = weights.get_tensor(f"{prefix}.q_scale") + q_invperm = weights.get_tensor(f"{prefix}.q_invperm") + q_scale_max = weights.get_tensor(f"{prefix}.q_scale_max") + q_groups = weights.get_tensor(f"{prefix}.q_groups") + + return Exl2Weight( + q_weight=q_weight, + q_scale=q_scale, + q_invperm=q_invperm, + q_scale_max=q_scale_max, + q_groups=q_groups, + ) + + def get_weights_col_packed( + self, + weights: Weights, + prefix: str, + block_sizes: Union[int, List[int]], + ): + raise RuntimeError("Column-packed weights are not supported for exl") + + def get_weights_col(self, weights: Weights, prefix: str): + # Sharding is not yet supported, so we return the weights as-is. + return self.get_weights(weights, prefix) + + def get_multi_weights_col(self, weights: Weights, prefixes: List[str], dim: int): + raise ValueError("get_multi_weights_col is not supported for exl2") + + def get_weights_row(self, weights: Weights, prefix: str): + # Sharding is not yet supported, so we return the weights as-is. + return self.get_weights(weights, prefix) diff --git a/backends/gaudi/server/text_generation_server/layers/fp8.py b/backends/gaudi/server/text_generation_server/layers/fp8.py new file mode 100644 index 000000000..61dd51151 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/layers/fp8.py @@ -0,0 +1,285 @@ +import torch + +from dataclasses import dataclass +from typing import Optional, Union, List +from loguru import logger + +from text_generation_server.utils.import_utils import SYSTEM +from text_generation_server.utils.weights import ( + Weight, + WeightsLoader, + UnquantizedWeight, + Weights, +) +from text_generation_server.utils.log import log_master, log_once +import importlib.util + + +FBGEMM_MM_AVAILABLE = False +FBGEMM_DYN_AVAILABLE = False + + +def is_fbgemm_gpu_available(): + try: + return importlib.util.find_spec("fbgemm_gpu.experimental.gen_ai") is not None + except ModuleNotFoundError: + return False + + +if is_fbgemm_gpu_available(): + if SYSTEM == "cuda": + major, _ = torch.cuda.get_device_capability() + FBGEMM_MM_AVAILABLE = major == 9 + FBGEMM_DYN_AVAILABLE = major >= 8 +else: + log_master(logger.warning, "FBGEMM fp8 kernels are not installed.") + + +def get_fp8_linear() -> torch.nn.Module: + """ + Return an FP8 linear `Module` that is compatible with the current system. + """ + + if SYSTEM == "cuda": + major, _ = torch.cuda.get_device_capability() + if major == 8: + from text_generation_server.layers.marlin import GPTQMarlinFP8Linear + + return GPTQMarlinFP8Linear + + # On other systems let Torch decide if the hardware supports FP8. 
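+    # Fp8Linear below uses the FBGEMM row-wise kernel when it is available and
+    # otherwise falls back to torch._scaled_mm, so no explicit capability check
+    # is needed here.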
+ return Fp8Linear + + +def fp8_quantize( + weight, scale_upper_bound=None, qdtype=torch.float8_e4m3fn, scalar=False +): + if FBGEMM_DYN_AVAILABLE and not scalar: + qweight, scale = torch.ops.fbgemm.quantize_fp8_per_row( + weight, bs=None, scale_ub=scale_upper_bound, output_dtype=qdtype + ) + return qweight, scale + + # weight, scale = quant_weights(weight, torch.int8, False) + finfo = torch.finfo(qdtype) + # Calculate the scale as dtype max divided by absmax + scale = finfo.max / weight.abs().max().clamp(min=1e-12, max=scale_upper_bound) + # scale and clamp the tensor to bring it to + # the representative range of float8 data type + # (as default cast is unsaturated) + qweight = (weight * scale).clamp(min=finfo.min, max=finfo.max) + # Return both float8 data and the inverse scale (as float), + # as both required as inputs to torch._scaled_mm + qweight = qweight.to(qdtype) + scale = scale.float().reciprocal() + return qweight, scale + + +class HybridFP8UnquantLoader(WeightsLoader): + """Weight loader that loads FP8 and unquantized Torch tensors.""" + + def __init__(self, activation_scale_ub: Optional[float], to_fp8: bool): + self.activation_scale_ub = activation_scale_ub + self.to_fp8 = to_fp8 + + def get_weights(self, weights: "Weights", prefix: str): + w = weights.get_tensor(f"{prefix}.weight") + + if w.dtype == torch.float8_e4m3fn: + # FP8 branch + scale = ( + weights.get_tensor(f"{prefix}.weight_scale", to_dtype=False) + .reshape(-1) + .expand(w.shape[0]) + ) + return Fp8Weight( + weight=w, + weight_scale=scale, + activation_scale_ub=self.activation_scale_ub, + dtype=weights.dtype, + ) + if self.to_fp8: + return Fp8Weight(weight=w, dtype=weights.dtype) + + return UnquantizedWeight(w) + + def get_weights_col_packed( + self, + weights: Weights, + prefix: str, + block_sizes: Union[int, List[int]], + ): + w = weights.get_packed_sharded( + f"{prefix}.weight", dim=0, block_sizes=block_sizes + ) + + if w.dtype == torch.float8_e4m3fn: + # FP8 branch + scale = weights.get_tensor(f"{prefix}.weight_scale", to_dtype=False) + if scale.numel() > 1: + scale = weights.get_packed_sharded( + f"{prefix}.weight_scale", + dim=0, + block_sizes=block_sizes, + to_dtype=False, + ) + scale = scale.reshape(-1).expand(w.shape[0]) + + return Fp8Weight( + weight=w, + weight_scale=scale, + activation_scale_ub=self.activation_scale_ub, + dtype=weights.dtype, + ) + if self.to_fp8: + return Fp8Weight(weight=w, dtype=weights.dtype) + + return UnquantizedWeight(w) + + def get_multi_weights_col(self, weights: "Weights", prefixes: List[str], dim: int): + # FIXME: Force to_device to false as fp8 weights do not support torch.cat on device yet + w = [ + weights.get_sharded(f"{p}.weight", dim=0, to_device=False) for p in prefixes + ] + shapes = [x.shape for x in w] + + # Concat then send to the device + w = torch.cat(w, dim=dim).to(weights.device) + + # FP8 branch + if w.dtype == torch.float8_e4m3fn: + scale = [ + _load_scalar_or_matrix_scale(weights, f"{p}.weight_scale", shape) + for p, shape in zip(prefixes, shapes) + ] + scale = torch.cat(scale, dim=0).reshape(-1) + + return Fp8Weight( + weight=w, + weight_scale=scale, + activation_scale_ub=self.activation_scale_ub, + dtype=weights.dtype, + ) + if self.to_fp8: + return Fp8Weight(weight=w, dtype=weights.dtype) + + return UnquantizedWeight(w) + + def get_weights_row(self, weights: "Weights", prefix: str): + w = weights.get_sharded(f"{prefix}.weight", dim=1) + # FP8 branch + if w.dtype == torch.float8_e4m3fn: + scale = ( + weights.get_tensor(f"{prefix}.weight_scale", 
to_dtype=False) + .reshape(-1) + .expand(w.shape[0]) + ) + return Fp8Weight( + weight=w, + weight_scale=scale, + activation_scale_ub=self.activation_scale_ub, + dtype=weights.dtype, + ) + if self.to_fp8: + return Fp8Weight(weight=w, dtype=weights.dtype) + + return UnquantizedWeight(w) + + +@dataclass +class Fp8Weight(Weight): + weight: torch.Tensor + dtype: torch.dtype + weight_scale: Optional[torch.Tensor] = None + activation_scale_ub: Optional[float] = None + + def get_linear(self, bias: torch.Tensor): + if self.weight_scale is None: + return get_fp8_linear().from_unquant(self.weight, bias, self.dtype) + # This is not checked by the fbgemm kernels, but they require contiguous + # memory. Can be non-contiguous when we e.g. expand from scalars. + self.weight_scale = self.weight_scale.contiguous() + return get_fp8_linear().from_fp8( + self.weight, self.weight_scale, self.activation_scale_ub, bias, self.dtype + ) + + +class Fp8Linear(torch.nn.Module): + def __init__( + self, + qweight, + scale, + scale_upper_bound, + bias, + dtype, + ) -> None: + super().__init__() + if FBGEMM_MM_AVAILABLE: + log_once(logger.info, "Using FBGEMM fp8 optimized kernels") + + self.dtype = dtype + self.qweight = qweight + self.scale = scale + self.scale_upper_bound = ( + torch.tensor( + [scale_upper_bound], dtype=torch.float32, device=qweight.device + ) + if scale_upper_bound is not None + else None + ) + + self.bias = bias if bias is not None else None + + @classmethod + def from_unquant(cls, weight, bias, dtype): + qweight, scale = fp8_quantize(weight, scalar=not FBGEMM_MM_AVAILABLE) + return cls( + qweight=qweight, scale=scale, scale_upper_bound=None, bias=bias, dtype=dtype + ) + + @classmethod + def from_fp8(cls, weight, scale, input_scale, bias, dtype): + if FBGEMM_DYN_AVAILABLE: + # fbgemm needs float32 scales. 
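+            # (checkpoints may store the per-row scale in float16/bfloat16, while
+            # the f8f8bf16_rowwise kernel used in forward() expects float32)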
+ scale = scale.float() + return cls( + qweight=weight, + scale=scale, + scale_upper_bound=input_scale, + bias=bias, + dtype=dtype, + ) + + def forward(self, input: torch.Tensor) -> torch.Tensor: + if FBGEMM_MM_AVAILABLE: + qinput, scale = fp8_quantize( + input, scale_upper_bound=self.scale_upper_bound + ) + + y = torch.ops.fbgemm.f8f8bf16_rowwise( + qinput, + self.qweight, + scale, + self.scale, + use_fast_accum=True, + bias=self.bias, + ) + return y.to(self.dtype) + + qinput, scale = fp8_quantize(input, scalar=True) + output, _ = torch._scaled_mm( + qinput, + self.qweight.t(), + out_dtype=self.dtype, + scale_a=scale, + scale_b=self.scale, + bias=self.bias, + ) + return output + + +def _load_scalar_or_matrix_scale(weights: Weights, prefix: str, shape: torch.Size): + scale = weights.get_tensor(prefix, to_dtype=False) + if scale.numel() > 1: + scale = weights.get_sharded(prefix, dim=0, to_dtype=False) + return scale.reshape(-1).expand(shape[0]) diff --git a/backends/gaudi/server/text_generation_server/layers/gptq/__init__.py b/backends/gaudi/server/text_generation_server/layers/gptq/__init__.py new file mode 100644 index 000000000..505caa59a --- /dev/null +++ b/backends/gaudi/server/text_generation_server/layers/gptq/__init__.py @@ -0,0 +1,440 @@ +import os +from dataclasses import dataclass +from typing import List, Optional, Union + +import torch +from loguru import logger +from text_generation_server.utils.import_utils import SYSTEM +from text_generation_server.utils.log import log_once +from text_generation_server.utils.weights import Weight, Weights, WeightsLoader + + +@dataclass +class GPTQWeight(Weight): + qweight: torch.Tensor + qzeros: torch.Tensor + scales: torch.Tensor + g_idx: Optional[torch.Tensor] + bits: int + groupsize: int + use_awq_kernel: bool + use_exllama: bool + + def __post_init__(self): + if self.scales.dtype == torch.float: + self.scales = self.scales.half() + + @property + def device(self) -> torch.device: + return self.qweight.device + + def get_linear(self, bias: torch.Tensor): + if self.use_awq_kernel: + if SYSTEM == "rocm": + raise NotImplementedError( + "AWQ GEMM kernel can't be used on ROCm systems, please use `--quantize gptq` instead " + "to use Exllama/GPTQ kernels for AWQ inference." + ) + try: + from text_generation_server.layers.awq.quantize.qmodule import WQLinear + + return WQLinear( + w_bit=self.bits, + group_size=self.groupsize, + qweight=self.qweight, + qzeros=self.qzeros, + scales=self.scales, + bias=bias, + ) + except ImportError: + raise NotImplementedError( + "You do not seem to have awq installed, either install it (cd server && make install-awq), or try using GPTQ `---quantize gptq` a conversion AWQ->GPTQ will happen on the fly" + ) + elif self.use_exllama: + try: + from text_generation_server.layers.gptq import ExllamaQuantLinear + except ImportError: + raise NotImplementedError( + "Exllama gptq kernels are not installed. Install them `cd server/exllama_kernels && python setup.py install && cd ../exllamav2_kernels && python setup.py install`" + ) + + return ExllamaQuantLinear(self, bias) + else: + from text_generation_server.layers.gptq.quant_linear import QuantLinear + + return QuantLinear( + self.qweight, + self.qzeros, + self.scales, + self.g_idx, + bias, + self.bits, + self.groupsize, + ) + + +class GPTQWeightsLoader(WeightsLoader): + """ + Loader for GPTQ- and AWQ-quantized weights. 
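+
+    Reads the `qweight`, `qzeros`, `scales` and, for act-order checkpoints,
+    `g_idx` tensors and dispatches to the AWQ, exllama or Triton QuantLinear
+    kernels depending on the configuration.
+
+    A minimal usage sketch (the weight prefix below is hypothetical):
+
+        loader = GPTQWeightsLoader(
+            bits=4, desc_act=False, groupsize=128,
+            quant_method="gptq", quantize="gptq", sym=True,
+        )
+        weight = loader.get_weights_row(weights, "model.layers.0.mlp.down_proj")
+        linear = weight.get_linear(bias=None)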
+ """ + + def __init__( + self, + *, + bits: int, + desc_act: bool, + groupsize: int, + quant_method: str, + quantize: str, + sym: bool, + ): + self.bits = bits + self.desc_act = desc_act + self.groupsize = groupsize + self.quant_method = quant_method + self.quantize = quantize + self.sym = sym + + def get_weights(self, weights: Weights, prefix: str): + self._get_gptq_params(weights) + + use_exllama = True + if self.bits != 4: + use_exllama = False + + if self.desc_act: + log_once(logger.warning, "Disabling exllama because desc_act=True") + use_exllama = False + + try: + qweight = weights.get_tensor(f"{prefix}.qweight") + except RuntimeError: + raise RuntimeError( + "Cannot load `gptq` weight, make sure the model is already quantized, or quantize it with `text-generation-server quantize ORIGINAL_MODEL_ID NEW_MODEL_ID`" + ) + + if self.quantize == "gptq" and self.quant_method == "gptq": + g_idx = weights.get_tensor(f"{prefix}.g_idx") + else: + g_idx = None + + from text_generation_server.layers.gptq import ( + HAS_EXLLAMA, + CAN_EXLLAMA, + GPTQWeight, + ) + + if use_exllama: + if not HAS_EXLLAMA: + if CAN_EXLLAMA: + log_once( + logger.warning, + "Exllama GPTQ cuda kernels (which are faster) could have been used, but are not currently installed, try using BUILD_EXTENSIONS=True", + ) + use_exllama = False + else: + log_once(logger.info, f"Using exllama kernels v{HAS_EXLLAMA}") + + qzeros = weights.get_tensor(f"{prefix}.qzeros") + scales = weights.get_tensor(f"{prefix}.scales") + + if use_exllama and g_idx is not None: + g_idx = g_idx - g_idx[0] + + if self.quantize == "gptq" and self.quant_method == "awq": + log_once( + logger.info, "Converting AWQ model to Exllama/GPTQ packing format." + ) + from text_generation_server.layers.awq.conversion_utils import ( + fast_awq_to_gptq, + ) + + qweight, qzeros = fast_awq_to_gptq(qweight, qzeros) + if use_exllama: + g_idx = None + else: + g_idx = ( + torch.arange( + qweight.shape[0] * (32 // self.bits), + device=qweight.device, + ) + // self.groupsize + ).to(dtype=torch.int32) + + return GPTQWeight( + qweight=qweight, + qzeros=qzeros, + scales=scales, + g_idx=g_idx, + bits=self.bits, + groupsize=self.groupsize, + use_exllama=use_exllama, + ) + + def get_weights_col_packed( + self, + weights: Weights, + prefix: str, + block_sizes: Union[int, List[int]], + ): + try: + qweight = weights.get_packed_sharded( + f"{prefix}.qweight", dim=1, block_sizes=block_sizes + ) + except RuntimeError: + raise RuntimeError( + f"Cannot load `{self.quantize}` weight, make sure the model is already quantized." + ) + scales = weights.get_packed_sharded( + f"{prefix}.scales", dim=1, block_sizes=block_sizes + ) + scales = scales.to(dtype=weights.dtype) + + self._get_gptq_params(weights) + + qzeros = weights.get_packed_sharded( + f"{prefix}.qzeros", dim=1, block_sizes=block_sizes + ) + if self.quantize == "gptq" and self.quant_method == "gptq": + g_idx = weights.get_tensor(f"{prefix}.g_idx") + elif self.quantize == "gptq" and self.quant_method == "awq": + log_once( + logger.info, "Converting AWQ model to Exllama/GPTQ packing format." 
+ ) + from text_generation_server.layers.awq.conversion_utils import ( + fast_awq_to_gptq, + ) + + qweight, qzeros = fast_awq_to_gptq(qweight, qzeros) + g_idx = ( + torch.arange( + qweight.shape[0] * (32 // self.bits), + device=qweight.device, + ) + // self.groupsize + ).to(dtype=torch.int32) + else: + g_idx = None + + return GPTQWeight( + qweight=qweight, + qzeros=qzeros, + scales=scales, + g_idx=g_idx, + bits=self.bits, + groupsize=self.groupsize, + use_awq_kernel=self.quantize == "awq", + use_exllama=False, + ) + + def get_multi_weights_col(self, weights: Weights, prefixes: List[str], dim: int): + try: + qweight = torch.cat( + [weights.get_sharded(f"{p}.qweight", dim=1) for p in prefixes], dim=1 + ) + except RuntimeError: + raise RuntimeError( + f"Cannot load `{self.quantize}` weight, make sure the model is already quantized" + ) + + scales = torch.cat( + [weights.get_sharded(f"{p}.scales", dim=1) for p in prefixes], dim=1 + ) + + self._get_gptq_params(weights) + + qzeros = torch.cat( + [weights.get_sharded(f"{p}.qzeros", dim=1) for p in prefixes], dim=1 + ) + + from text_generation_server.layers.gptq import HAS_EXLLAMA + + use_exllama = ( + self.bits == 4 + and HAS_EXLLAMA + and self.quantize == "gptq" + and not self.desc_act + ) + + if self.quantize == "gptq" and self.quant_method == "gptq": + w = [weights.get_tensor(f"{p}.g_idx") for p in prefixes] + for w2 in w[1:]: + torch.testing.assert_close(w2, w[0]) + g_idx = w[0] + elif self.quantize == "gptq" and self.quant_method == "awq": + log_once( + logger.info, "Converting AWQ model to Exllama/GPTQ packing format." + ) + from text_generation_server.layers.awq.conversion_utils import ( + fast_awq_to_gptq, + ) + + qweight, qzeros = fast_awq_to_gptq(qweight, qzeros) + if use_exllama: + g_idx = None + else: + g_idx = ( + torch.arange( + qweight.shape[0] * (32 // self.bits), + device=qweight.device, + ) + // self.groupsize + ).to(dtype=torch.int32) + else: + g_idx = None + + return GPTQWeight( + qweight=qweight, + qzeros=qzeros, + scales=scales, + g_idx=g_idx, + bits=self.bits, + groupsize=self.groupsize, + use_awq_kernel=self.quantize == "awq", + use_exllama=use_exllama, + ) + + def get_weights_row(self, weights: Weights, prefix: str): + self._get_gptq_params(weights) + + use_exllama = True + if self.bits != 4: + use_exllama = False + + if self.desc_act: + log_once(logger.warning, "Disabling exllama because desc_act=True") + use_exllama = False + + try: + qweight = weights.get_sharded(f"{prefix}.qweight", dim=0) + except RuntimeError: + raise RuntimeError( + "Cannot load `gptq` weight, make sure the model is already quantized, or quantize it with `text-generation-server quantize ORIGINAL_MODEL_ID NEW_MODEL_ID`" + ) + + if self.quantize == "gptq" and self.quant_method == "gptq": + g_idx = weights.get_sharded(f"{prefix}.g_idx", dim=0) + else: + g_idx = None + + if weights.process_group.size() > 1: + if g_idx is not None: + if ( + not torch.equal( + g_idx.cpu(), + torch.tensor( + [i // self.groupsize for i in range(g_idx.shape[0])], + dtype=torch.int32, + ), + ) + and not (g_idx == 0).all() + ): + # Exllama implementation does not support row tensor parallelism with act-order, as + # it would require to reorder input activations that are split unto several GPUs + use_exllama = False + + from text_generation_server.layers.gptq import ( + CAN_EXLLAMA, + HAS_EXLLAMA, + GPTQWeight, + ) + + if use_exllama: + if not HAS_EXLLAMA: + if CAN_EXLLAMA: + log_once( + logger.warning, + "Exllama GPTQ cuda kernels (which are faster) could have been used, but 
are not currently installed, try using BUILD_EXTENSIONS=True", + ) + use_exllama = False + else: + log_once(logger.info, f"Using exllama kernels v{HAS_EXLLAMA}") + + if use_exllama and self.groupsize != -1: + qzeros = weights.get_sharded(f"{prefix}.qzeros", dim=0) + scales = weights.get_sharded(f"{prefix}.scales", dim=0) + else: + qzeros = weights.get_tensor(f"{prefix}.qzeros") + scales = weights.get_tensor(f"{prefix}.scales") + + if use_exllama and g_idx is not None: + g_idx = g_idx - g_idx[0] + + if self.quantize == "gptq" and self.quant_method == "awq": + log_once( + logger.info, "Converting AWQ model to Exllama/GPTQ packing format." + ) + from text_generation_server.layers.awq.conversion_utils import ( + fast_awq_to_gptq, + ) + + qweight, qzeros = fast_awq_to_gptq(qweight, qzeros) + if use_exllama: + g_idx = None + else: + g_idx = ( + torch.arange( + qweight.shape[0] * (32 // self.bits), + device=qweight.device, + ) + // self.groupsize + ).to(dtype=torch.int32) + + return GPTQWeight( + qweight=qweight, + qzeros=qzeros, + scales=scales, + g_idx=g_idx, + bits=self.bits, + groupsize=self.groupsize, + use_awq_kernel=self.quantize == "awq", + use_exllama=use_exllama, + ) + + def _get_gptq_params(self, weights: Weights): + if weights._has_tensor("gptq_bits") and weights._has_tensor("gptq_groupsize"): + self.bits = weights.get_tensor("gptq_bits").item() + self.groupsize = weights.get_tensor("gptq_groupsize").item() + self.desc_act = False + # `server quantize` used asymmetric quantization unconditionally + # before the `gptq_sym` setting tensor was added. + self.sym = ( + weights.get_tensor("gptq_sym").item() + if weights._has_tensor("gptq_sym") + else False + ) + self.quant_method = "gptq" + + +# Needs to be at the end because circular import. +try: + major, _minor = torch.cuda.get_device_capability() +except Exception: + major = 1 + +HAS_EXLLAMA = False +CAN_EXLLAMA = major >= 8 or SYSTEM == "rocm" +V2 = os.getenv("EXLLAMA_VERSION", "2") == "2" +if os.getenv("DISABLE_EXLLAMA") == "True": + HAS_EXLLAMA = False +elif CAN_EXLLAMA: + try: + if V2: + from text_generation_server.layers.gptq.exllamav2 import ( + QuantLinear as ExllamaQuantLinear, # noqa: F401 + create_exllama_buffers, # noqa: F401 + set_device, # noqa: F401 + ) + + HAS_EXLLAMA = "2" + else: + from text_generation_server.layers.gptq.exllama import ( + Ex4bitLinear as ExllamaQuantLinear, # noqa: F401 + create_exllama_buffers, # noqa: F401 + set_device, # noqa: F401 + ) + + HAS_EXLLAMA = "1" + + except ImportError: + pass diff --git a/backends/gaudi/server/text_generation_server/layers/gptq/custom_autotune.py b/backends/gaudi/server/text_generation_server/layers/gptq/custom_autotune.py new file mode 100644 index 000000000..0388ef20b --- /dev/null +++ b/backends/gaudi/server/text_generation_server/layers/gptq/custom_autotune.py @@ -0,0 +1,261 @@ +# https://github.com/fpgaminer/GPTQ-triton +""" +Mostly the same as the autotuner in Triton, but with a few changes like using 40 runs instead of 100. 
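+The other differences from upstream: tuning keys can optionally be rounded to the
+nearest power of two to reduce re-tuning, and a `prune_configs_by` hook can drop
+configurations before they are benchmarked.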
+""" + +import builtins +import math +import time +from typing import Dict + +import triton + + +class Autotuner(triton.KernelInterface): + def __init__( + self, + fn, + arg_names, + configs, + key, + reset_to_zero, + prune_configs_by: Dict = None, + nearest_power_of_two: bool = False, + ): + """ + :param prune_configs_by: a dict of functions that are used to prune configs, fields: + 'perf_model': performance model used to predicate running time with different configs, returns running time + 'top_k': number of configs to bench + 'prune_num_stages_by'(optional): a function used to prune num_stages. It take configs:List[Config] as its input, and returns pruned configs. + 'nearest_power_of_two'(optional): whether to round key arguments to the nearest power of two when caching tuning results + """ + if not configs: + self.configs = [triton.Config({}, num_warps=4, num_stages=2)] + else: + self.configs = configs + self.key_idx = [arg_names.index(k) for k in key] + self.nearest_power_of_two = nearest_power_of_two + self.cache = {} + # hook to reset all required tensor to zeros before relaunching a kernel + self.hook = lambda args: 0 + if reset_to_zero is not None: + self.reset_idx = [arg_names.index(k) for k in reset_to_zero] + + def _hook(args): + for i in self.reset_idx: + args[i].zero_() + + self.hook = _hook + self.arg_names = arg_names + # prune configs + if prune_configs_by: + perf_model, top_k = ( + prune_configs_by["perf_model"], + prune_configs_by["top_k"], + ) + if "early_config_prune" in prune_configs_by: + early_config_prune = prune_configs_by["early_config_prune"] + else: + perf_model, top_k, early_config_prune = None, None, None + self.perf_model, self.configs_top_k = perf_model, top_k + self.early_config_prune = early_config_prune + self.fn = fn + + def _bench(self, *args, config, **meta): + # check for conflicts, i.e. meta-parameters both provided + # as kwargs and by the autotuner + conflicts = meta.keys() & config.kwargs.keys() + if conflicts: + raise ValueError( + f"Conflicting meta-parameters: {', '.join(conflicts)}." + " Make sure that you don't re-define auto-tuned symbols." 
+ ) + # augment meta-parameters with tunable ones + current = dict(meta, **config.kwargs) + + def kernel_call(): + if config.pre_hook: + config.pre_hook(self.nargs) + self.hook(args) + self.fn.run( + *args, + num_warps=config.num_warps, + num_stages=config.num_stages, + **current, + ) + + try: + # In testings using only 40 reps seems to be close enough and it appears to be what PyTorch uses + # PyTorch also sets fast_flush to True, but I didn't see any speedup so I'll leave the default + return triton.testing.do_bench( + kernel_call, quantiles=(0.5, 0.2, 0.8), rep=40 + ) + except triton.OutOfResources: + return [float("inf"), float("inf"), float("inf")] + + def run(self, *args, **kwargs): + self.nargs = dict(zip(self.arg_names, args)) + if len(self.configs) > 1: + key = tuple(args[i] for i in self.key_idx) + + # This reduces the amount of autotuning by rounding the keys to the nearest power of two + # In my testing this gives decent results, and greatly reduces the amount of tuning required + if self.nearest_power_of_two: + key = tuple([2 ** int(math.log2(x) + 0.5) for x in key]) + + if key not in self.cache: + # prune configs + pruned_configs = self.prune_configs(kwargs) + bench_start = time.time() + timings = { + config: self._bench(*args, config=config, **kwargs) + for config in pruned_configs + } + bench_end = time.time() + self.bench_time = bench_end - bench_start + self.cache[key] = builtins.min(timings, key=timings.get) + self.hook(args) + self.configs_timings = timings + config = self.cache[key] + else: + config = self.configs[0] + self.best_config = config + if config.pre_hook is not None: + config.pre_hook(self.nargs) + return self.fn.run( + *args, + num_warps=config.num_warps, + num_stages=config.num_stages, + **kwargs, + **config.kwargs, + ) + + def prune_configs(self, kwargs): + pruned_configs = self.configs + if self.early_config_prune: + pruned_configs = self.early_config_prune(self.configs, self.nargs) + if self.perf_model: + top_k = self.configs_top_k + if isinstance(top_k, float) and top_k <= 1.0: + top_k = int(len(self.configs) * top_k) + if len(pruned_configs) > top_k: + est_timing = { + config: self.perf_model( + **self.nargs, + **kwargs, + **config.kwargs, + num_stages=config.num_stages, + num_warps=config.num_warps, + ) + for config in pruned_configs + } + pruned_configs = sorted(est_timing.keys(), key=lambda x: est_timing[x])[ + :top_k + ] + return pruned_configs + + def warmup(self, *args, **kwargs): + self.nargs = dict(zip(self.arg_names, args)) + for config in self.prune_configs(kwargs): + self.fn.warmup( + *args, + num_warps=config.num_warps, + num_stages=config.num_stages, + **kwargs, + **config.kwargs, + ) + self.nargs = None + + +def autotune( + configs, key, prune_configs_by=None, reset_to_zero=None, nearest_power_of_two=False +): + """ + Decorator for auto-tuning a :code:`triton.jit`'d function. + .. highlight:: python + .. code-block:: python + @triton.autotune(configs=[ + triton.Config(meta={'BLOCK_SIZE': 128}, num_warps=4), + triton.Config(meta={'BLOCK_SIZE': 1024}, num_warps=8), + ], + key=['x_size'] # the two above configs will be evaluated anytime + # the value of x_size changes + ) + @triton.jit + def kernel(x_ptr, x_size, **META): + BLOCK_SIZE = META['BLOCK_SIZE'] + :note: When all the configurations are evaluated, the kernel will run multiple time. + This means that whatever value the kernel updates will be updated multiple times. 
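+        (For example, an output buffer that the kernel accumulates into would
+        receive one extra accumulation per benchmarked configuration.)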
+ To avoid this undesired behavior, you can use the `reset_to_zero` argument, which + reset the value of the provided tensor to `zero` before running any configuration. + :param configs: a list of :code:`triton.Config` objects + :type configs: list[triton.Config] + :param key: a list of argument names whose change in value will trigger the evaluation of all provided configs. + :type key: list[str] + :param prune_configs_by: a dict of functions that are used to prune configs, fields: + 'perf_model': performance model used to predicate running time with different configs, returns running time + 'top_k': number of configs to bench + 'early_config_prune'(optional): a function used to do early prune (eg, num_stages). It take configs:List[Config] as its input, and returns pruned configs. + :param reset_to_zero: a list of argument names whose value will be reset to zero before evaluating any configs. + :type reset_to_zero: list[str] + """ + + def decorator(fn): + return Autotuner( + fn, + fn.arg_names, + configs, + key, + reset_to_zero, + prune_configs_by, + nearest_power_of_two, + ) + + return decorator + + +def matmul248_kernel_config_pruner(configs, nargs): + """ + The main purpose of this function is to shrink BLOCK_SIZE_* when the corresponding dimension is smaller. + """ + m = max(2 ** int(math.ceil(math.log2(nargs["M"]))), 16) + n = max(2 ** int(math.ceil(math.log2(nargs["N"]))), 16) + k = max(2 ** int(math.ceil(math.log2(nargs["K"]))), 16) + + used = set() + for config in configs: + block_size_m = min(m, config.kwargs["BLOCK_SIZE_M"]) + block_size_n = min(n, config.kwargs["BLOCK_SIZE_N"]) + block_size_k = min(k, config.kwargs["BLOCK_SIZE_K"]) + group_size_m = config.kwargs["GROUP_SIZE_M"] + + if ( + block_size_m, + block_size_n, + block_size_k, + group_size_m, + config.num_stages, + config.num_warps, + ) in used: + continue + + used.add( + ( + block_size_m, + block_size_n, + block_size_k, + group_size_m, + config.num_stages, + config.num_warps, + ) + ) + yield triton.Config( + { + "BLOCK_SIZE_M": block_size_m, + "BLOCK_SIZE_N": block_size_n, + "BLOCK_SIZE_K": block_size_k, + "GROUP_SIZE_M": group_size_m, + }, + num_stages=config.num_stages, + num_warps=config.num_warps, + ) diff --git a/backends/gaudi/server/text_generation_server/layers/gptq/exllama.py b/backends/gaudi/server/text_generation_server/layers/gptq/exllama.py new file mode 100644 index 000000000..f27666b77 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/layers/gptq/exllama.py @@ -0,0 +1,134 @@ +from text_generation_server.layers.gptq import GPTQWeight +import torch +from exllama_kernels import make_q4, q4_matmul, prepare_buffers, set_tuning_params + +# Dummy tensor to pass instead of g_idx since there is no way to pass "None" to a C++ extension +none_tensor = torch.empty((1, 1), device="meta") + + +def ext_make_q4(qweight, qzeros, scales, g_idx, device): + """Construct Q4Matrix, return handle""" + return make_q4( + qweight, qzeros, scales, g_idx if g_idx is not None else none_tensor, device + ) + + +def ext_q4_matmul(x, q4, q4_width): + """Matrix multiplication, returns x @ q4""" + outshape = x.shape[:-1] + (q4_width,) + x = x.view(-1, x.shape[-1]) + output = torch.empty((x.shape[0], q4_width), dtype=torch.float16, device=x.device) + + q4_matmul(x, q4, output) + + return output.view(outshape) + + +MAX_DQ = 1 +MAX_INNER = 1 +ACT_ORDER = False +DEVICE = None + +TEMP_STATE = None +TEMP_DQ = None + + +def set_device(device): + global DEVICE + DEVICE = device + + +def create_exllama_buffers(max_total_tokens: int): 
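+    # Allocates the global scratch tensors used by the exllama v1 kernels and
+    # registers them with the CUDA extension via prepare_buffers(); set_device()
+    # must have been called beforehand so the buffers land on the right GPU.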
+ global MAX_DQ, MAX_INNER, ACT_ORDER, DEVICE, TEMP_STATE, TEMP_DQ + + assert DEVICE is not None, "call set_device first" + + if not ACT_ORDER: + max_total_tokens = 1 + + # This temp_state buffer is required to reorder X in the act-order case. + temp_state = torch.zeros( + (max_total_tokens, MAX_INNER), dtype=torch.float16, device=DEVICE + ) + temp_dq = torch.zeros((1, MAX_DQ), dtype=torch.float16, device=DEVICE) + + # This temp_dq buffer is required to dequantize weights when using cuBLAS, typically for the prefill. + prepare_buffers(DEVICE, temp_state, temp_dq) + + matmul_recons_thd = 8 + matmul_fused_remap = False + matmul_no_half2 = False + set_tuning_params(matmul_recons_thd, matmul_fused_remap, matmul_no_half2) + + TEMP_STATE, TEMP_DQ = temp_state, temp_dq + + +class Ex4bitLinear(torch.nn.Module): + """Linear layer implementation with per-group 4-bit quantization of the weights""" + + def __init__(self, weight: GPTQWeight, bias): + super().__init__() + global MAX_DQ, MAX_INNER, ACT_ORDER, DEVICE + assert weight.bits == 4 + + self.device = weight.qweight.device + self.qweight = weight.qweight + self.qzeros = weight.qzeros + self.scales = weight.scales + self.g_idx = weight.g_idx.cpu() if weight.g_idx is not None else None + self.bias = bias if bias is not None else None + + if self.g_idx is not None and ( + (self.g_idx == 0).all() + or torch.equal( + weight.g_idx.cpu(), + torch.tensor( + [i // weight.groupsize for i in range(weight.g_idx.shape[0])], + dtype=torch.int32, + ), + ) + ): + self.empty_g_idx = True + self.g_idx = None + + assert self.device.type == "cuda" + assert self.device.index is not None + + self.q4 = ext_make_q4( + self.qweight, self.qzeros, self.scales, self.g_idx, self.device.index + ) + + self.height = weight.qweight.shape[0] * 8 + self.width = weight.qweight.shape[1] + + # Infer groupsize from height of qzeros + self.groupsize = None + if self.qzeros.shape[0] > 1: + self.groupsize = (self.qweight.shape[0] * 8) // (self.qzeros.shape[0]) + + if self.groupsize is not None: + assert weight.groupsize == self.groupsize + + # Handle act-order matrix + if self.g_idx is not None: + if self.groupsize is None: + raise ValueError("Found group index but no groupsize. 
What do?") + self.act_order = True + else: + self.act_order = False + + DEVICE = self.qweight.device + + MAX_DQ = max(MAX_DQ, self.qweight.numel() * 8) + + if self.act_order: + MAX_INNER = max(MAX_INNER, self.height, self.width) + + ACT_ORDER = True + + def forward(self, x): + out = ext_q4_matmul(x, self.q4, self.width) + + if self.bias is not None: + out.add_(self.bias) + return out diff --git a/backends/gaudi/server/text_generation_server/layers/gptq/exllamav2.py b/backends/gaudi/server/text_generation_server/layers/gptq/exllamav2.py new file mode 100644 index 000000000..920a6adf4 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/layers/gptq/exllamav2.py @@ -0,0 +1,267 @@ +# Adapted from turboderp exllama: https://github.com/turboderp/exllamav2 + +from dataclasses import dataclass +from typing import Optional +import torch +import torch.nn as nn + +from loguru import logger + +from text_generation_server.layers.exl2 import Exl2Weight +from text_generation_server.layers.gptq import GPTQWeight +from text_generation_server.utils.log import log_master + +try: + from exllamav2.ext import exllamav2_ext + + make_q_matrix = exllamav2_ext.make_q_matrix + gemm_half_q_half = exllamav2_ext.gemm_half_q_half +except ImportError: + log_master(logger.warning, "exllamav2_kernels not installed.") + raise + +# Dummy tensor to pass instead of g_idx since there is no way to pass "None" to a C++ extension +none_tensor = torch.empty((1, 1), device="meta") + + +@dataclass +class _ExtraTensors: + """Additional generated quantizer tensors.""" + + q_group_map: Optional[torch.Tensor] = None + q_invperm: Optional[torch.Tensor] = None + q_perm: Optional[torch.Tensor] = None + + +def ext_gemm_half_q_half(x, q_handle, q4_width, force_cuda): + """Matrix multiplication, returns x @ q4""" + output_shape = x.shape[:-1] + (q4_width,) + x = x.view(-1, x.shape[-1]) + output = torch.empty((x.shape[0], q4_width), dtype=torch.half, device=x.device) + gemm_half_q_half(x, q_handle, output, force_cuda) + return output.view(output_shape) + + +def make_group_map(q_groups: torch.Tensor, num_qrows: int): + gr = q_groups.tolist() + group_map = [] + num_groups = len(gr) // 2 + + for i in range(num_groups): + bits = gr[i * 2] + if i < num_groups - 1: + qrows = gr[i * 2 + 3] - gr[i * 2 + 1] + else: + qrows = num_qrows - gr[i * 2 + 1] + rows = qrows * 32 // bits + for j in range(rows): + group_map += [i] + group_map += [rows - j] + + return torch.tensor(group_map, dtype=torch.short, device=q_groups.device) + + +# Create Q matrix + + +def ext_make_q_matrix( + w: Exl2Weight | GPTQWeight, + extra: _ExtraTensors, + temp_dq, + key: Optional[str] = None, +): + """ + Create Q matrix + """ + # max_dq_size = 512*(1024**2) + # max_dq_rows = max_dq_size // out_features[0] + max_dq_rows = 0 + + # EXL2 + if isinstance(w, Exl2Weight): + extra.q_group_map = make_group_map(w.q_groups, w.q_weight.shape[0]) + extra.q_perm = torch.argsort(w.q_invperm).short() + + return make_q_matrix( + w.q_weight, + extra.q_perm, + w.q_invperm, + w.q_scale, + w.q_scale_max, + w.q_groups, + extra.q_group_map, + none_tensor, # zeros + none_tensor, # scales + none_tensor, # g_idx + none_tensor, # bias + temp_dq, + max_dq_rows, + ) + # GPTQ + elif isinstance(w, GPTQWeight): + if w.scales.dtype == torch.float: + w.scales = w.scales.half() + + # GPTQ with g_idx (act_order) + if w.g_idx is not None and not (w.g_idx == 0).all().item(): + extra.q_perm = torch.empty( + (w.qweight.shape[0] * 8,), + dtype=torch.short, + device=w.qweight.device, + ) + extra.q_invperm = 
torch.empty_like(extra.q_perm) + # make_q4 segfaults if g_idx is not on cpu in the act-order case. In the non act-order case, None needs to be passed for g_idx. + return make_q_matrix( + w.qweight, + extra.q_perm, + extra.q_invperm, + none_tensor, # q_scale + none_tensor, # q_scale_max + none_tensor, # q_groups + none_tensor, # q_group_map + w.qzeros, + w.scales, + w.g_idx.cpu(), + none_tensor, # bias + temp_dq, + max_dq_rows, + ) + # GPTQ without g_idx + else: + return make_q_matrix( + w.qweight, + none_tensor, # q_perm + none_tensor, # q_invperm + none_tensor, # q_scale + none_tensor, # q_scale_max + none_tensor, # q_groups + none_tensor, # q_group_map + w.qzeros, + w.scales, + none_tensor, # g_idx + none_tensor, # bias + temp_dq, + max_dq_rows, + ) + else: + RuntimeError("Cannot create handle") + + +DEVICE = None +LAYERS = [] + + +def set_device(device): + global DEVICE + DEVICE = device + + +def create_exllama_buffers(max_total_tokens: int): + global LAYERS, DEVICE + + # No need to initialize scratch space if there are no layers + # that use ExLLamav2. + if len(LAYERS) == 0: + return + + # Find the size of the scratch space. + scratch_bytes = max( + layer.scratch_space_fixed(max_input_len=max_total_tokens, max_batch_size=1) + for layer in LAYERS + ) + temp_dq = ExLlamaV2DeviceTensors(DEVICE, scratch_bytes) + + for layer in LAYERS: + layer.post_init(temp_dq) + + +class QuantLinear(nn.Module): + QUANT_TYPE = "exllamav2" + + """Linear layer implementation with per-group 4-bit quantization of the weights""" + + def __init__( + self, + weight: Exl2Weight | GPTQWeight, + bias: torch.Tensor, + ): + super().__init__() + + self.q_handle = None + self.q_tensors = weight + self.extra_tensors = _ExtraTensors() + + if isinstance(weight, Exl2Weight): + self.infeatures = weight.q_invperm.shape[0] + self.outfeatures = weight.q_weight.shape[1] + elif isinstance(weight, GPTQWeight): + if weight.bits != 4: + raise ValueError( + f"Exllamav2 kernel supports only bits=4, requested bits={weight.bits}. Something is wrong in the model initialization." + ) + + self.infeatures = weight.qweight.shape[0] // weight.bits * 32 + self.outfeatures = weight.qweight.shape[1] + + self.padding = -self.outfeatures % 32 + self.outfeatures = self.outfeatures + self.padding + + self.device = weight.device + self.bias = bias if bias is not None else None + + global LAYERS + LAYERS.append(self) + + def post_init(self, temp_dq): + device = self.q_tensors.device + assert device.type == "cuda" + assert device.index is not None + temp_dq = temp_dq.get_scratch_slice(self.temp_dq_size()) + + # We NEED to keep a pointer on Python side, otherwise the garbage collector will mess with us, + # and `Memory access fault by GPU node-2` will EAT you. 
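+        # (the handle built by ext_make_q_matrix keeps a raw device pointer into
+        # temp_dq's scratch storage, so the tensor must stay referenced on self
+        # for as long as the handle is in use)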
+ self.temp_dq = temp_dq + self.q_handle = ext_make_q_matrix(self.q_tensors, self.extra_tensors, temp_dq) + + def forward(self, x, force_cuda=False): + output = ext_gemm_half_q_half(x, self.q_handle, self.outfeatures, force_cuda) + + if self.bias is not None: + output.add_(self.bias) + return output + + def temp_dq_size(self): + return self.infeatures * self.outfeatures * 2 + 128 + + def temp_fwd_size(self, max_input_len, max_batch_size): + return self.outfeatures * max_input_len * max_batch_size * 4 + 128 + + def scratch_space_fixed(self, max_input_len, max_batch_size): + return self.temp_dq_size() + self.temp_fwd_size(max_input_len, max_batch_size) + + +class ExLlamaV2DeviceTensors: + + device_idx: int + scratch_bytes: int + scratch_idx: int + scratch: torch.tensor = None + + def __init__(self, device, scratch_bytes): + self.device = device + self.scratch_bytes = scratch_bytes + + def prepare(self): + self.scratch = torch.empty( + (self.scratch_bytes // 2,), dtype=torch.half, device=self.device + ) + + def get_scratch_slice(self, size_bytes): + + if self.scratch is None: + self.prepare() + + size_bytes = ((size_bytes + 127) // 128) * 128 + size_half = size_bytes // 2 + scratch_slice = self.scratch.narrow(0, 0, size_half) + return scratch_slice diff --git a/backends/gaudi/server/text_generation_server/layers/gptq/quant_linear.py b/backends/gaudi/server/text_generation_server/layers/gptq/quant_linear.py new file mode 100644 index 000000000..736c357b0 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/layers/gptq/quant_linear.py @@ -0,0 +1,359 @@ +import math +import numpy as np +import torch +import torch.nn as nn +from torch.cuda.amp import custom_fwd + +import triton +import triton.language as tl +from . import custom_autotune + + +# code based https://github.com/fpgaminer/GPTQ-triton +@custom_autotune.autotune( + configs=[ + triton.Config( + { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 8, + }, + num_stages=4, + num_warps=4, + ), + triton.Config( + { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 8, + }, + num_stages=4, + num_warps=4, + ), + triton.Config( + { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 8, + }, + num_stages=4, + num_warps=4, + ), + triton.Config( + { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 8, + }, + num_stages=4, + num_warps=4, + ), + triton.Config( + { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 8, + }, + num_stages=4, + num_warps=4, + ), + triton.Config( + { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 8, + }, + num_stages=2, + num_warps=8, + ), + triton.Config( + { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + }, + num_stages=3, + num_warps=8, + ), + triton.Config( + { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + }, + num_stages=2, + num_warps=4, + ), + ], + key=["M", "N", "K"], + nearest_power_of_two=True, + prune_configs_by={ + "early_config_prune": custom_autotune.matmul248_kernel_config_pruner, + "perf_model": None, + "top_k": None, + }, +) +@triton.jit +def matmul_248_kernel( + a_ptr, + b_ptr, + c_ptr, + scales_ptr, + zeros_ptr, + g_ptr, + M, + N, + K, + bits, + maxq, + stride_am, + stride_ak, + stride_bk, + stride_bn, + stride_cm, + stride_cn, + stride_scales, + stride_zeros, + BLOCK_SIZE_M: tl.constexpr, + 
BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, +): + """ + Compute the matrix multiplication C = A x B. + A is of shape (M, K) float16 + B is of shape (K//8, N) int32 + C is of shape (M, N) float16 + scales is of shape (G, N) float16 + zeros is of shape (G, N) float16 + g_ptr is of shape (K) int32 + """ + infearure_per_bits = 32 // bits + + pid = tl.program_id(axis=0) + num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) + num_pid_n = tl.cdiv(N, BLOCK_SIZE_N) + num_pid_k = tl.cdiv(K, BLOCK_SIZE_K) + num_pid_in_group = GROUP_SIZE_M * num_pid_n + group_id = pid // num_pid_in_group + first_pid_m = group_id * GROUP_SIZE_M + group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M) + pid_m = first_pid_m + (pid % group_size_m) + pid_n = (pid % num_pid_in_group) // group_size_m + + offs_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + offs_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + offs_k = tl.arange(0, BLOCK_SIZE_K) + a_ptrs = a_ptr + ( + offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak + ) # (BLOCK_SIZE_M, BLOCK_SIZE_K) + a_mask = offs_am[:, None] < M + # b_ptrs is set up such that it repeats elements along the K axis 8 times + b_ptrs = b_ptr + ( + (offs_k[:, None] // infearure_per_bits) * stride_bk + + offs_bn[None, :] * stride_bn + ) # (BLOCK_SIZE_K, BLOCK_SIZE_N) + g_ptrs = g_ptr + offs_k + # shifter is used to extract the N bits of each element in the 32-bit word from B + scales_ptrs = scales_ptr + offs_bn[None, :] + zeros_ptrs = zeros_ptr + (offs_bn[None, :] // infearure_per_bits) + + shifter = (offs_k % infearure_per_bits) * bits + zeros_shifter = (offs_bn % infearure_per_bits) * bits + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + + for k in range(0, num_pid_k): + g_idx = tl.load(g_ptrs) + + # Fetch scales and zeros; these are per-outfeature and thus reused in the inner loop + scales = tl.load( + scales_ptrs + g_idx[:, None] * stride_scales + ) # (BLOCK_SIZE_K, BLOCK_SIZE_N,) + zeros = tl.load( + zeros_ptrs + g_idx[:, None] * stride_zeros + ) # (BLOCK_SIZE_K, BLOCK_SIZE_N,) + + zeros = (zeros >> zeros_shifter[None, :]) & maxq + zeros = (zeros + 1) & maxq # eventually avoid overflow + + a = tl.load(a_ptrs, mask=a_mask, other=0.0) # (BLOCK_SIZE_M, BLOCK_SIZE_K) + b = tl.load(b_ptrs) # (BLOCK_SIZE_K, BLOCK_SIZE_N), but repeated + + # Now we need to unpack b (which is N-bit values) into 32-bit values + b = (b >> shifter[:, None]) & maxq # Extract the N-bit values + b = (b - zeros) * scales # Scale and shift + + accumulator += tl.dot(a, b) + a_ptrs += BLOCK_SIZE_K + b_ptrs += (BLOCK_SIZE_K // infearure_per_bits) * stride_bk + g_ptrs += BLOCK_SIZE_K + + c_ptrs = c_ptr + stride_cm * offs_am[:, None] + stride_cn * offs_bn[None, :] + c_mask = (offs_am[:, None] < M) & (offs_bn[None, :] < N) + tl.store(c_ptrs, accumulator, mask=c_mask) + + +def matmul248(input, qweight, scales, qzeros, g_idx, bits, maxq): + with torch.cuda.device(input.device): + output = torch.empty( + (input.shape[0], qweight.shape[1]), device=input.device, dtype=torch.float16 + ) + + def grid(META): + return ( + triton.cdiv(input.shape[0], META["BLOCK_SIZE_M"]) + * triton.cdiv(qweight.shape[1], META["BLOCK_SIZE_N"]), + ) + + matmul_248_kernel[grid]( + input, + qweight, + output, + scales, + qzeros, + g_idx, + input.shape[0], + qweight.shape[1], + input.shape[1], + bits, + maxq, + input.stride(0), + input.stride(1), + qweight.stride(0), + qweight.stride(1), + output.stride(0), + output.stride(1), + scales.stride(0), + qzeros.stride(0), + ) + 
return output + + +class QuantLinearFunction(torch.autograd.Function): + @staticmethod + @custom_fwd(cast_inputs=torch.float16) + def forward(ctx, input, qweight, scales, qzeros, g_idx, bits, maxq): + output = matmul248(input, qweight, scales, qzeros, g_idx, bits, maxq) + return output + + +class QuantLinear(nn.Module): + def __init__(self, qweight, qzeros, scales, g_idx, bias, bits, groupsize): + super().__init__() + self.register_buffer("qweight", qweight) + self.register_buffer("qzeros", qzeros) + self.register_buffer("scales", scales) + self.register_buffer("g_idx", g_idx) + if bias is not None: + self.register_buffer("bias", bias) + else: + self.bias = None + if bits not in [2, 4, 8]: + raise NotImplementedError("Only 2,4,8 bits are supported.") + self.bits = bits + self.maxq = 2**self.bits - 1 + self.groupsize = groupsize + + self.outfeatures = qweight.shape[1] + self.infeatures = qweight.shape[0] * 32 // bits + + @classmethod + def new(cls, bits, groupsize, infeatures, outfeatures, bias): + if bits not in [2, 4, 8]: + raise NotImplementedError("Only 2,4,8 bits are supported.") + + qweight = torch.zeros((infeatures // 32 * bits, outfeatures), dtype=torch.int32) + qzeros = torch.zeros( + (math.ceil(infeatures / groupsize), outfeatures // 32 * bits), + dtype=torch.int32, + ) + scales = torch.zeros( + (math.ceil(infeatures / groupsize), outfeatures), dtype=torch.float16 + ) + g_idx = torch.tensor( + [i // groupsize for i in range(infeatures)], dtype=torch.int32 + ) + if bias: + bias = torch.zeros((outfeatures), dtype=torch.float16) + else: + bias = None + return cls(qweight, qzeros, scales, g_idx, bias, bits, groupsize) + + def pack(self, linear, scales, zeros, g_idx=None): + self.g_idx = g_idx.clone() if g_idx is not None else self.g_idx + + scales = scales.t().contiguous() + zeros = zeros.t().contiguous() + scale_zeros = zeros * scales + self.scales = scales.clone().half() + if linear.bias is not None: + self.bias = linear.bias.clone().half() + + intweight = [] + for idx in range(self.infeatures): + intweight.append( + torch.round( + (linear.weight.data[:, idx] + scale_zeros[self.g_idx[idx]]) + / self.scales[self.g_idx[idx]] + ).to(torch.int)[:, None] + ) + intweight = torch.cat(intweight, dim=1) + intweight = intweight.t().contiguous() + intweight = intweight.numpy().astype(np.uint32) + qweight = np.zeros( + (intweight.shape[0] // 32 * self.bits, intweight.shape[1]), dtype=np.uint32 + ) + i = 0 + row = 0 + while row < qweight.shape[0]: + if self.bits in [2, 4, 8]: + for j in range(i, i + (32 // self.bits)): + qweight[row] |= intweight[j] << (self.bits * (j - i)) + i += 32 // self.bits + row += 1 + else: + raise NotImplementedError("Only 2,4,8 bits are supported.") + + qweight = qweight.astype(np.int32) + self.qweight = torch.from_numpy(qweight) + + zeros -= 1 + zeros = zeros.numpy().astype(np.uint32) + qzeros = np.zeros( + (zeros.shape[0], zeros.shape[1] // 32 * self.bits), dtype=np.uint32 + ) + i = 0 + col = 0 + while col < qzeros.shape[1]: + if self.bits in [2, 4, 8]: + for j in range(i, i + (32 // self.bits)): + qzeros[:, col] |= zeros[:, j] << (self.bits * (j - i)) + i += 32 // self.bits + col += 1 + else: + raise NotImplementedError("Only 2,4,8 bits are supported.") + + qzeros = qzeros.astype(np.int32) + self.qzeros = torch.from_numpy(qzeros) + + def forward(self, x): + out_shape = x.shape[:-1] + (self.outfeatures,) + out = QuantLinearFunction.apply( + x.reshape(-1, x.shape[-1]), + self.qweight, + self.scales, + self.qzeros, + self.g_idx, + self.bits, + self.maxq, + ) + out = 
out + self.bias if self.bias is not None else out + return out.reshape(out_shape) diff --git a/backends/gaudi/server/text_generation_server/layers/gptq/quantize.py b/backends/gaudi/server/text_generation_server/layers/gptq/quantize.py new file mode 100644 index 000000000..b0086ea08 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/layers/gptq/quantize.py @@ -0,0 +1,1017 @@ +import time +import torch.nn as nn +import math +import json +import os +import torch +import transformers + +from texttable import Texttable +from transformers import AutoModelForCausalLM, AutoConfig, AutoTokenizer +from huggingface_hub import HfApi +from accelerate import init_empty_weights +from text_generation_server.utils import initialize_torch_distributed, Weights +from text_generation_server.utils.hub import weight_files +from text_generation_server.layers.gptq.quant_linear import QuantLinear +from loguru import logger +from typing import Optional +from text_generation_server.layers.gptq.utils import torch_snr_error + +from text_generation_server.utils.weights import DefaultWeightsLoader, UnquantizedWeight + +DEV = torch.device("cuda:0") + + +class Quantizer(nn.Module): + def __init__(self, shape=1): + super(Quantizer, self).__init__() + self.register_buffer("maxq", torch.tensor(0)) + self.register_buffer("scale", torch.zeros(shape)) + self.register_buffer("zero", torch.zeros(shape)) + + def configure( + self, + bits, + perchannel=False, + sym=True, + mse=False, + norm=2.4, + grid=100, + maxshrink=0.8, + trits=False, + ): + self.maxq = torch.tensor(2**bits - 1) + self.perchannel = perchannel + self.sym = sym + self.mse = mse + self.norm = norm + self.grid = grid + self.maxshrink = maxshrink + if trits: + self.maxq = torch.tensor(-1) + self.scale = torch.zeros_like(self.scale) + + def _quantize(self, x, scale, zero, maxq): + if maxq < 0: + return (x > scale / 2).float() * scale + (x < zero / 2).float() * zero + q = torch.clamp(torch.round(x / scale) + zero, 0, maxq) + return scale * (q - zero) + + def find_params(self, x, weight=False): + dev = x.device + self.maxq = self.maxq.to(dev) + + shape = x.shape + if self.perchannel: + if weight: + x = x.flatten(1) + else: + if len(shape) == 4: + x = x.permute([1, 0, 2, 3]) + x = x.flatten(1) + if len(shape) == 3: + x = x.reshape((-1, shape[-1])).t() + if len(shape) == 2: + x = x.t() + else: + x = x.flatten().unsqueeze(0) + + tmp = torch.zeros(x.shape[0], device=dev) + xmin = torch.minimum(x.min(1)[0], tmp) + xmax = torch.maximum(x.max(1)[0], tmp) + + if self.sym: + xmax = torch.maximum(torch.abs(xmin), xmax) + tmp = xmin < 0 + if torch.any(tmp): + xmin[tmp] = -xmax[tmp] + tmp = (xmin == 0) & (xmax == 0) + xmin[tmp] = -1 + xmax[tmp] = +1 + + if self.maxq < 0: + self.scale = xmax + self.zero = xmin + else: + self.scale = (xmax - xmin) / self.maxq + if self.sym: + self.zero = torch.full_like(self.scale, (self.maxq + 1) / 2) + else: + self.zero = torch.round(-xmin / self.scale) + + if self.mse: + best = torch.full([x.shape[0]], float("inf"), device=dev) + for i in range(int(self.maxshrink * self.grid)): + p = 1 - i / self.grid + xmin1 = p * xmin + xmax1 = p * xmax + scale1 = (xmax1 - xmin1) / self.maxq + zero1 = torch.round(-xmin1 / scale1) if not self.sym else self.zero + q = self._quantize( + x, scale1.unsqueeze(1), zero1.unsqueeze(1), self.maxq + ) + q -= x + q.abs_() + q.pow_(self.norm) + err = torch.sum(q, 1) + tmp = err < best + if torch.any(tmp): + best[tmp] = err[tmp] + self.scale[tmp] = scale1[tmp] + self.zero[tmp] = zero1[tmp] + if not 
self.perchannel: + if weight: + tmp = shape[0] + else: + tmp = shape[1] if len(shape) != 3 else shape[2] + self.scale = self.scale.repeat(tmp) + self.zero = self.zero.repeat(tmp) + + if weight: + shape = [-1] + [1] * (len(shape) - 1) + self.scale = self.scale.reshape(shape) + self.zero = self.zero.reshape(shape) + return + if len(shape) == 4: + self.scale = self.scale.reshape((1, -1, 1, 1)) + self.zero = self.zero.reshape((1, -1, 1, 1)) + if len(shape) == 3: + self.scale = self.scale.reshape((1, 1, -1)) + self.zero = self.zero.reshape((1, 1, -1)) + if len(shape) == 2: + self.scale = self.scale.unsqueeze(0) + self.zero = self.zero.unsqueeze(0) + + def quantize(self, x): + if self.ready(): + return self._quantize(x, self.scale, self.zero, self.maxq) + + return x + + def enabled(self): + return self.maxq > 0 + + def ready(self): + return torch.all(self.scale != 0) + + +class GPTQ: + def __init__(self, layer, observe=False): + self.layer = layer + self.dev = self.layer.weight.device + W = layer.weight.data.clone() + if isinstance(self.layer, nn.Conv2d): + W = W.flatten(1) + if isinstance(self.layer, transformers.Conv1D): + W = W.t() + self.rows = W.shape[0] + self.columns = W.shape[1] + self.H = torch.zeros((self.columns, self.columns), device=self.dev) + self.nsamples = 0 + self.quantizer = Quantizer() + self.observe = observe + + def add_batch(self, inp, out): + # Hessian H = 2 X XT + λ I + if self.observe: + self.inp1 = inp + self.out1 = out + else: + self.inp1 = None + self.out1 = None + + if len(inp.shape) == 2: + inp = inp.unsqueeze(0) + tmp = inp.shape[0] + if isinstance(self.layer, nn.Linear) or isinstance( + self.layer, transformers.Conv1D + ): + if len(inp.shape) == 3: + inp = inp.reshape((-1, inp.shape[-1])) + inp = inp.t() + if isinstance(self.layer, nn.Conv2d): + unfold = nn.Unfold( + self.layer.kernel_size, + dilation=self.layer.dilation, + padding=self.layer.padding, + stride=self.layer.stride, + ) + inp = unfold(inp) + inp = inp.permute([1, 0, 2]) + inp = inp.flatten(1) + self.H *= self.nsamples / (self.nsamples + tmp) + self.nsamples += tmp + # inp = inp.float() + inp = math.sqrt(2 / self.nsamples) * inp.float() + # self.H += 2 / self.nsamples * inp.matmul(inp.t()) + self.H += inp.matmul(inp.t()) + + def print_loss(self, name, q_weight, weight_error, timecost): + table = Texttable() + length = 28 + name = ( + (name + " " * (length - len(name))) + if len(name) <= length + else name[:length] + ) + + table.header(["name", "weight_error", "fp_inp_SNR", "q_inp_SNR", "time"]) + + # assign weight + self.layer.weight.data = q_weight.reshape(self.layer.weight.shape).to( + self.layer.weight.data.dtype + ) + + if self.inp1 is not None: + # quantize input to int8 + quantizer = Quantizer() + quantizer.configure(8, perchannel=False, sym=True, mse=False) + quantizer.find_params(self.inp1) + q_in = quantizer.quantize(self.inp1).type(torch.float16) + q_out = self.layer(q_in) + + # get kinds of SNR + q_SNR = torch_snr_error(q_out, self.out1).item() + fp_SNR = torch_snr_error(self.layer(self.inp1), self.out1).item() + else: + q_SNR = "-" + fp_SNR = "-" + + table.add_row([name, weight_error, fp_SNR, q_SNR, timecost]) + print(table.draw().split("\n")[-2]) + + def fasterquant( + self, blocksize=128, percdamp=0.01, groupsize=-1, act_order=False, name="" + ): + self.layer.to(self.dev) + + W = self.layer.weight.data.clone() + if isinstance(self.layer, nn.Conv2d): + W = W.flatten(1) + if isinstance(self.layer, transformers.Conv1D): + W = W.t() + W = W.float() + + tick = time.time() + + if not 
self.quantizer.ready(): + self.quantizer.find_params(W, weight=True) + + H = self.H + if not self.observe: + del self.H + dead = torch.diag(H) == 0 + H[dead, dead] = 1 + W[:, dead] = 0 + + if act_order: + perm = torch.argsort(torch.diag(H), descending=True) + W = W[:, perm] + H = H[perm][:, perm] + + Losses = torch.zeros_like(W) + Q = torch.zeros_like(W) + + damp = percdamp * torch.mean(torch.diag(H)) + diag = torch.arange(self.columns, device=self.dev) + H[diag, diag] += damp + H = torch.linalg.cholesky(H) + H = torch.cholesky_inverse(H) + try: + H = torch.linalg.cholesky(H, upper=True) + except Exception: + # Addition because Falcon fails on h_to_4h + H = torch.linalg.cholesky( + H + 1e-5 * torch.eye(H.shape[0]).to(H.device), upper=True + ) + Hinv = H + + g_idx = [] + scale = [] + zero = [] + now_idx = 1 + + for i1 in range(0, self.columns, blocksize): + i2 = min(i1 + blocksize, self.columns) + count = i2 - i1 + + W1 = W[:, i1:i2].clone() + Q1 = torch.zeros_like(W1) + Err1 = torch.zeros_like(W1) + Losses1 = torch.zeros_like(W1) + Hinv1 = Hinv[i1:i2, i1:i2] + + for i in range(count): + w = W1[:, i] + d = Hinv1[i, i] + + if groupsize != -1: + if (i1 + i) % groupsize == 0: + self.quantizer.find_params( + W[:, (i1 + i) : (i1 + i + groupsize)], weight=True + ) + + if ((i1 + i) // groupsize) - now_idx == -1: + scale.append(self.quantizer.scale) + zero.append(self.quantizer.zero) + now_idx += 1 + + q = self.quantizer.quantize(w.unsqueeze(1)).flatten() + Q1[:, i] = q + Losses1[:, i] = (w - q) ** 2 / d**2 + + err1 = (w - q) / d + W1[:, i:] -= err1.unsqueeze(1).matmul(Hinv1[i, i:].unsqueeze(0)) + Err1[:, i] = err1 + + Q[:, i1:i2] = Q1 + Losses[:, i1:i2] = Losses1 / 2 + + W[:, i2:] -= Err1.matmul(Hinv[i1:i2, i2:]) + + torch.cuda.synchronize() + error = torch.sum(Losses).item() + + groupsize = groupsize if groupsize != -1 else self.columns + g_idx = [i // groupsize for i in range(self.columns)] + g_idx = torch.tensor(g_idx, dtype=torch.int32, device=Q.device) + if act_order: + invperm = torch.argsort(perm) + Q = Q[:, invperm] + g_idx = g_idx[invperm] + + if isinstance(self.layer, transformers.Conv1D): + Q = Q.t() + + self.print_loss( + name=name, q_weight=Q, weight_error=error, timecost=(time.time() - tick) + ) + + if scale == []: + scale.append(self.quantizer.scale) + zero.append(self.quantizer.zero) + scale = torch.cat(scale, dim=1) + zero = torch.cat(zero, dim=1) + return scale, zero, g_idx, error + + def free(self): + self.inp1 = None + self.out1 = None + self.H = None + self.Losses = None + self.Trace = None + torch.cuda.empty_cache() + + +def get_wikitext2(nsamples, seed, seqlen, model_id, trust_remote_code): + from datasets import load_dataset + + traindata = load_dataset("wikitext", "wikitext-2-raw-v1", split="train") + testdata = load_dataset("wikitext", "wikitext-2-raw-v1", split="test") + + try: + tokenizer = AutoTokenizer.from_pretrained( + model_id, use_fast=False, trust_remote_code=trust_remote_code + ) + except Exception: + tokenizer = AutoTokenizer.from_pretrained( + model_id, use_fast=True, trust_remote_code=trust_remote_code + ) + + trainenc = tokenizer("\n\n".join(traindata["text"]), return_tensors="pt") + testenc = tokenizer("\n\n".join(testdata["text"]), return_tensors="pt") + + import random + + random.seed(seed) + trainloader = [] + for _ in range(nsamples): + i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1) + j = i + seqlen + inp = trainenc.input_ids[:, i:j] + tar = inp.clone() + tar[:, :-1] = -100 + trainloader.append((inp, tar)) + return trainloader, testenc 
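All of the calibration loaders in this file (wikitext2, ptb, c4 and their `_new` variants below) share the same sampling recipe: tokenize the corpus once, draw `nsamples` random windows of `seqlen` tokens, and set every target position except the last one to -100, the conventional ignore index for PyTorch's cross-entropy loss. A minimal standalone sketch of that sampling step, assuming a pre-tokenized `input_ids` tensor of shape (1, total_len) and using a hypothetical helper name purely for illustration:

    import random

    import torch


    def sample_calibration_windows(input_ids: torch.Tensor, nsamples: int, seqlen: int, seed: int = 0):
        # Hypothetical helper for illustration; the loaders in this file inline this logic.
        # input_ids: (1, total_len) tensor, e.g. tokenizer(text, return_tensors="pt").input_ids
        random.seed(seed)
        samples = []
        for _ in range(nsamples):
            start = random.randint(0, input_ids.shape[1] - seqlen - 1)
            inp = input_ids[:, start : start + seqlen]
            tar = inp.clone()
            tar[:, :-1] = -100  # only the last position keeps a real target
            samples.append((inp, tar))
        return samples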
+ + +def get_ptb(nsamples, seed, seqlen, model_id, trust_remote_code): + from datasets import load_dataset + + traindata = load_dataset("ptb_text_only", "penn_treebank", split="train") + valdata = load_dataset("ptb_text_only", "penn_treebank", split="validation") + + try: + tokenizer = AutoTokenizer.from_pretrained( + model_id, use_fast=False, trust_remote_code=trust_remote_code + ) + except Exception: + tokenizer = AutoTokenizer.from_pretrained( + model_id, use_fast=True, trust_remote_code=trust_remote_code + ) + + trainenc = tokenizer("\n\n".join(traindata["sentence"]), return_tensors="pt") + testenc = tokenizer("\n\n".join(valdata["sentence"]), return_tensors="pt") + + import random + + random.seed(seed) + trainloader = [] + for _ in range(nsamples): + i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1) + j = i + seqlen + inp = trainenc.input_ids[:, i:j] + tar = inp.clone() + tar[:, :-1] = -100 + trainloader.append((inp, tar)) + return trainloader, testenc + + +def get_c4(nsamples, seed, seqlen, model_id, trust_remote_code): + from datasets import load_dataset + + traindata = load_dataset( + "allenai/c4", + "allenai--c4", + data_files={"train": "en/c4-train.00000-of-01024.json.gz"}, + split="train", + use_auth_token=False, + ) + valdata = load_dataset( + "allenai/c4", + "allenai--c4", + data_files={"validation": "en/c4-validation.00000-of-00008.json.gz"}, + split="validation", + use_auth_token=False, + ) + + try: + tokenizer = AutoTokenizer.from_pretrained( + model_id, use_fast=False, trust_remote_code=trust_remote_code + ) + except Exception: + tokenizer = AutoTokenizer.from_pretrained( + model_id, use_fast=True, trust_remote_code=trust_remote_code + ) + + import random + + random.seed(seed) + trainloader = [] + for _ in range(nsamples): + while True: + i = random.randint(0, len(traindata) - 1) + trainenc = tokenizer(traindata[i]["text"], return_tensors="pt") + if trainenc.input_ids.shape[1] >= seqlen: + break + i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1) + j = i + seqlen + inp = trainenc.input_ids[:, i:j] + tar = inp.clone() + tar[:, :-1] = -100 + trainloader.append((inp, tar)) + + import random + + random.seed(0) + valenc = [] + for _ in range(256): + while True: + i = random.randint(0, len(valdata) - 1) + tmp = tokenizer(valdata[i]["text"], return_tensors="pt") + if tmp.input_ids.shape[1] >= seqlen: + break + i = random.randint(0, tmp.input_ids.shape[1] - seqlen - 1) + j = i + seqlen + valenc.append(tmp.input_ids[:, i:j]) + valenc = torch.hstack(valenc) + + class TokenizerWrapper: + def __init__(self, input_ids): + self.input_ids = input_ids + + valenc = TokenizerWrapper(valenc) + + return trainloader, valenc + + +def get_ptb_new(nsamples, seed, seqlen, model_id, trust_remote_code): + from datasets import load_dataset + + traindata = load_dataset("ptb_text_only", "penn_treebank", split="train") + testdata = load_dataset("ptb_text_only", "penn_treebank", split="test") + + try: + tokenizer = AutoTokenizer.from_pretrained( + model_id, use_fast=False, trust_remote_code=trust_remote_code + ) + except Exception: + tokenizer = AutoTokenizer.from_pretrained( + model_id, use_fast=True, trust_remote_code=trust_remote_code + ) + + trainenc = tokenizer(" ".join(traindata["sentence"]), return_tensors="pt") + testenc = tokenizer(" ".join(testdata["sentence"]), return_tensors="pt") + + import random + + random.seed(seed) + trainloader = [] + for _ in range(nsamples): + i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1) + j = i + seqlen + inp = 
trainenc.input_ids[:, i:j] + tar = inp.clone() + tar[:, :-1] = -100 + trainloader.append((inp, tar)) + return trainloader, testenc + + +def get_c4_new(nsamples, seed, seqlen, model_id, trust_remote_code): + from datasets import load_dataset + + traindata = load_dataset( + "allenai/c4", + "allenai--c4", + data_files={"train": "en/c4-train.00000-of-01024.json.gz"}, + split="train", + ) + valdata = load_dataset( + "allenai/c4", + "allenai--c4", + data_files={"validation": "en/c4-validation.00000-of-00008.json.gz"}, + split="validation", + ) + + try: + tokenizer = AutoTokenizer.from_pretrained( + model_id, use_fast=False, trust_remote_code=trust_remote_code + ) + except Exception: + tokenizer = AutoTokenizer.from_pretrained( + model_id, use_fast=True, trust_remote_code=trust_remote_code + ) + + import random + + random.seed(seed) + trainloader = [] + for _ in range(nsamples): + while True: + i = random.randint(0, len(traindata) - 1) + trainenc = tokenizer(traindata[i]["text"], return_tensors="pt") + if trainenc.input_ids.shape[1] >= seqlen: + break + i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1) + j = i + seqlen + inp = trainenc.input_ids[:, i:j] + tar = inp.clone() + tar[:, :-1] = -100 + trainloader.append((inp, tar)) + + valenc = tokenizer(" ".join(valdata[:1100]["text"]), return_tensors="pt") + valenc = valenc.input_ids[:, : (256 * seqlen)] + + class TokenizerWrapper: + def __init__(self, input_ids): + self.input_ids = input_ids + + valenc = TokenizerWrapper(valenc) + + return trainloader, valenc + + +def get_loaders( + name, nsamples=128, seed=0, seqlen=2048, model_id="", trust_remote_code=False +): + if "wikitext2" in name: + return get_wikitext2(nsamples, seed, seqlen, model_id, trust_remote_code) + if "ptb" in name: + if "new" in name: + return get_ptb_new(nsamples, seed, seqlen, model_id, trust_remote_code) + return get_ptb(nsamples, seed, seqlen, model_id, trust_remote_code) + if "c4" in name: + if "new" in name: + return get_c4_new(nsamples, seed, seqlen, model_id, trust_remote_code) + return get_c4(nsamples, seed, seqlen, model_id, trust_remote_code) + + +def find_layers(module, layers=(nn.Conv2d, nn.Linear), name=""): + # Skip last lm_head linear + # Need isintance Falcon is inheriting Linear. + if isinstance(module, layers) and "lm_head" not in name: + return {name: module} + res = {} + for name1, child in module.named_children(): + res.update( + find_layers( + child, layers=layers, name=name + "." 
+ name1 if name != "" else name1 + ) + ) + return res + + +@torch.no_grad() +def sequential( + model, + dataloader, + dev, + nsamples, + bits, + groupsize, + *, + hooks, + percdamp=0.01, + sym: bool = False, + act_order: bool = False, +): + print("Starting ...") + + use_cache = model.config.use_cache + model.config.use_cache = False + try: + layers = model.model.layers + prefix = "model.layers" + except Exception: + layers = model.transformer.h + prefix = "transformer.h" + + dtype = next(iter(model.parameters())).dtype + inps = torch.zeros( + (nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev + ) + + cache = {"i": 0} + extra = {} + + class Catcher(nn.Module): + def __init__(self, module): + super().__init__() + self.module = module + + def forward(self, inp, **kwargs): + inps[cache["i"]] = inp + cache["i"] += 1 + extra.update(kwargs.copy()) + raise ValueError + + layers[0] = Catcher(layers[0]) + for batch in dataloader: + try: + model(batch[0].cuda()) + except ValueError: + pass + layers[0] = layers[0].module + + # layers[0] = layers[0].cpu() + # model.model.embed_tokens = model.model.embed_tokens.cpu() + # model.model.norm = model.model.norm.cpu() + torch.cuda.empty_cache() + for hook in hooks: + hook.remove() + + outs = torch.zeros_like(inps) + + extra = { + k: v.to(dev) if isinstance(v, torch.Tensor) else v for k, v in extra.items() + } + + print("Ready.") + + quantizers = {} + for i in range(len(layers)): + print(f"Quantizing layer {i+1}/{len(layers)}..") + print("+------------------+--------------+------------+-----------+-------+") + print("| name | weight_error | fp_inp_SNR | q_inp_SNR | time |") + print("+==================+==============+============+===========+=======+") + + layer = layers[i] + layer.load() + full = find_layers(layer) + sequential = [list(full.keys())] + + for names in sequential: + subset = {n: full[n] for n in names} + gptq = {} + for name in subset: + gptq[name] = GPTQ(subset[name]) + gptq[name].quantizer.configure( + bits, perchannel=True, sym=sym, mse=False + ) + pass + + def add_batch(name): + nonlocal gptq + + def tmp(_, inp, out): + gptq[name].add_batch(inp[0].data, out.data) + + return tmp + + handles = [] + for name in subset: + handles.append(subset[name].register_forward_hook(add_batch(name))) + for j in range(nsamples): + outs[j] = layer(inps[j].unsqueeze(0), **extra)[0] + for h in handles: + h.remove() + + for name in subset: + scale, zero, g_idx, error = gptq[name].fasterquant( + percdamp=percdamp, + groupsize=groupsize, + act_order=act_order, + name=name, + ) + quantizers[f"{prefix}.{i}.{name}"] = ( + gptq[name].quantizer.cpu(), + scale.cpu(), + zero.cpu(), + g_idx.cpu(), + bits, + groupsize, + ) + + gptq[name].free() + + for j in range(nsamples): + outs[j] = layer(inps[j].unsqueeze(0), **extra)[0] + + layer.unload() + del layer + del gptq + torch.cuda.empty_cache() + + inps, outs = outs, inps + print("+------------------+--------------+------------+-----------+-------+") + print("\n") + + model.config.use_cache = use_cache + + return quantizers + + +def make_quant_linear(module, names, bits, groupsize, name=""): + if isinstance(module, QuantLinear): + return + for attr in dir(module): + tmp = getattr(module, attr) + name1 = name + "." 
+ attr if name != "" else attr + if name1 in names: + delattr(module, attr) + setattr( + module, + attr, + QuantLinear.new( + bits, + groupsize, + tmp.in_features, + tmp.out_features, + tmp.bias is not None, + ), + ) + for name1, child in module.named_children(): + make_quant_linear( + child, names, bits, groupsize, name + "." + name1 if name != "" else name1 + ) + + +# TODO: perform packing on GPU +def pack(model, quantizers, bits, groupsize): + layers = find_layers(model) + layers = {n: layers[n] for n in quantizers} + make_quant_linear(model, quantizers, bits, groupsize) + qlayers = find_layers(model, (QuantLinear,)) + print("Packing ...") + for name in qlayers: + print(name) + quantizers[name], scale, zero, g_idx, _, _ = quantizers[name] + qlayers[name].pack(layers[name], scale, zero, g_idx) + print("Done.") + return model + + +def setdeepattr(module, full_name, tensor): + current = module + tokens = full_name.split(".") + for token in tokens[:-1]: + current = getattr(current, token) + setattr(current, tokens[-1], tensor) + + +def getdeepattr(module, full_name): + current = module + tokens = full_name.split(".") + for token in tokens: + current = getattr(current, token) + return current + + +def load_weights_pre_hook(module_name, weights, recursive=False): + def inner(module, args): + print(f"Pre hook {module_name}") + local_params = {} + for k, v in module.named_parameters(): + if not recursive and k.count(".") != 1: + continue + local_params[k] = v + for k, v in module.named_buffers(): + if not recursive and k.count(".") != 1: + continue + local_params[k] = v + + for local_param in local_params: + current_tensor = getdeepattr(module, local_param) + if current_tensor.device == torch.device("meta"): + # print(f"Loading {local_param}") + if module_name: + tensor_name = f"{module_name}.{local_param}" + else: + tensor_name = local_param + tensor = weights.get_tensor(tensor_name) + setdeepattr(module, local_param, nn.Parameter(tensor)) + else: + tensor = current_tensor.to(device=torch.device("cuda:0")) + if current_tensor.requires_grad: + tensor = nn.Parameter(tensor) + setdeepattr(module, local_param, tensor) + + return inner + + +def load_weights_post_hook(module_name, weights, recursive=False): + def inner(module, args, output): + print(f"Post hook {module_name}") + local_params = {} + for k, v in module.named_parameters(): + if not recursive and k.count(".") != 1: + continue + local_params[k] = v + for k, v in module.named_buffers(): + if not recursive and k.count(".") != 1: + continue + local_params[k] = v + for local_param in local_params: + # print(f"Unloading {local_param}") + current_tensor = getdeepattr(module, local_param) + setdeepattr( + module, + local_param, + nn.Parameter(current_tensor.to(device=torch.device("cpu"))), + ) + return output + + return inner + + +def quantize( + model_id: str, + bits: int, + groupsize: int, + output_dir: str, + revision: str, + trust_remote_code: bool, + upload_to_model_id: Optional[str], + percdamp: float, + act_order: bool, + sym: bool, +): + print("loading model") + config = AutoConfig.from_pretrained( + model_id, + trust_remote_code=trust_remote_code, + ) + + with init_empty_weights(): + model = AutoModelForCausalLM.from_config( + config, torch_dtype=torch.float16, trust_remote_code=trust_remote_code + ) + model = model.eval() + + print("LOADED model") + files = weight_files(model_id, revision, extension=".safetensors") + process_group, _, _ = initialize_torch_distributed() + weights = Weights( + files, + device=torch.device("cuda:0"), + 
dtype=torch.float16, + process_group=process_group, + aliases={"embed_tokens.weight": ["lm_head.weight"]}, + weights_loader=DefaultWeightsLoader(UnquantizedWeight), + ) + hooks = [] + for name, module in model.named_modules(): + + def load(module, name): + def _load(): + load_weights_pre_hook(name, weights, recursive=True)(module, None) + + return _load + + def unload(module, name): + def _unload(): + load_weights_post_hook(name, weights, recursive=True)( + module, None, None + ) + + return _unload + + module.load = load(module, name) + module.unload = unload(module, name) + hooks.append( + module.register_forward_pre_hook(load_weights_pre_hook(name, weights)) + ) + hooks.append( + module.register_forward_hook(load_weights_post_hook(name, weights)) + ) + model.seqlen = 2048 + + dataset = "wikitext2" + nsamples = 128 + seed = None + + dataloader, testloader = get_loaders( + dataset, + nsamples=nsamples, + seed=seed, + model_id=model_id, + seqlen=model.seqlen, + trust_remote_code=trust_remote_code, + ) + + tick = time.time() + quantizers = sequential( + model, + dataloader, + DEV, + nsamples, + bits, + groupsize, + percdamp=percdamp, + act_order=act_order, + hooks=hooks, + sym=sym, + ) + print(time.time() - tick) + + pack(model, quantizers, bits, groupsize) + from safetensors.torch import save_file + from transformers.modeling_utils import shard_checkpoint + + state_dict = model.state_dict() + state_dict = {k: v.cpu().contiguous() for k, v in state_dict.items()} + + max_shard_size = "10GB" + shards, index = shard_checkpoint( + state_dict, max_shard_size=max_shard_size, weights_name="model.safetensors" + ) + os.makedirs(output_dir, exist_ok=True) + for shard_file, shard in shards.items(): + save_file( + shard, + os.path.join(output_dir, shard_file), + metadata={ + "format": "pt", + "quantized": "gptq", + "origin": "text-generation-inference", + }, + ) + if index is None: + path_to_weights = os.path.join(output_dir, "model.safetensors") + logger.info(f"Model weights saved in {path_to_weights}") + else: + save_index_file = "model.safetensors.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + logger.info( + f"The model is bigger than the maximum size per checkpoint ({max_shard_size}) and is going to be " + f"split in {len(shards)} checkpoint shards. You can find where each parameters has been saved in the " + f"index located at {save_index_file}." 
+ ) + config = AutoConfig.from_pretrained(model_id, trust_remote_code=trust_remote_code) + config.quantization_config = { + "bits": bits, + "group_size": groupsize, + "damp_percent": percdamp, + "desc_act": act_order, + "static_groups": False, + "sym": sym, + "quant_method": "gptq", + } + config.save_pretrained(output_dir) + logger.info("Saved config") + logger.info("Saving tokenizer") + tokenizer = AutoTokenizer.from_pretrained( + model_id, trust_remote_code=trust_remote_code + ) + tokenizer.save_pretrained(output_dir) + logger.info("Saved tokenizer") + + if upload_to_model_id: + api = HfApi() + + api.upload_folder( + folder_path=output_dir, repo_id=upload_to_model_id, repo_type="model" + ) diff --git a/backends/gaudi/server/text_generation_server/layers/gptq/utils.py b/backends/gaudi/server/text_generation_server/layers/gptq/utils.py new file mode 100644 index 000000000..cbc0f391f --- /dev/null +++ b/backends/gaudi/server/text_generation_server/layers/gptq/utils.py @@ -0,0 +1,56 @@ +import torch + + +# copied from https://github.com/openppl-public/ppq/blob/master/ppq/quantization/measure/norm.py +def torch_snr_error( + y_pred: torch.Tensor, y_real: torch.Tensor, reduction: str = "mean" +) -> torch.Tensor: + """ + Compute SNR between y_pred(tensor) and y_real(tensor) + + SNR can be calcualted as following equation: + + SNR(pred, real) = (pred - real) ^ 2 / (real) ^ 2 + + if x and y are matrixs, SNR error over matrix should be the mean value of SNR error over all elements. + + SNR(pred, real) = mean((pred - real) ^ 2 / (real) ^ 2) + + Args: + y_pred (torch.Tensor): _description_ + y_real (torch.Tensor): _description_ + reduction (str, optional): _description_. Defaults to 'mean'. + + Raises: + ValueError: _description_ + ValueError: _description_ + + Returns: + torch.Tensor: _description_ + """ + if y_pred.shape != y_real.shape: + raise ValueError( + f"Can not compute snr loss for tensors with different shape. 
" + f"({y_pred.shape} and {y_real.shape})" + ) + reduction = str(reduction).lower() + + if y_pred.ndim == 1: + y_pred = y_pred.unsqueeze(0) + y_real = y_real.unsqueeze(0) + + y_pred = y_pred.flatten(start_dim=1) + y_real = y_real.flatten(start_dim=1) + + noise_power = torch.pow(y_pred - y_real, 2).sum(dim=-1) + signal_power = torch.pow(y_real, 2).sum(dim=-1) + snr = (noise_power) / (signal_power + 1e-7) + + if reduction == "mean": + return torch.mean(snr) + elif reduction == "sum": + return torch.sum(snr) + elif reduction == "none": + return snr + else: + raise ValueError("Unsupported reduction method.") diff --git a/backends/gaudi/server/text_generation_server/layers/layernorm.py b/backends/gaudi/server/text_generation_server/layers/layernorm.py new file mode 100644 index 000000000..ce5289f93 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/layers/layernorm.py @@ -0,0 +1,184 @@ +import torch +from torch import nn +from accelerate import init_empty_weights +from text_generation_server.utils.import_utils import ( + SYSTEM, +) + + +# Monkey patching +@classmethod +def load_layer_norm(cls, prefix, weights, eps): + weight = weights.get_tensor(f"{prefix}.weight") + bias = weights.get_tensor(f"{prefix}.bias") + with init_empty_weights(): + ln = cls(weight.shape, eps=eps) + + ln.weight = torch.nn.Parameter(weight) + ln.bias = torch.nn.Parameter(bias) + return ln + + +@classmethod +def load_layer_norm_no_bias(cls, prefix, weights, eps): + weight = weights.get_tensor(f"{prefix}.weight") + with init_empty_weights(): + ln = cls(weight.shape, eps=eps) + + ln.weight = torch.nn.Parameter(weight) + ln.bias = None + return ln + + +torch.nn.LayerNorm.load = load_layer_norm +torch.nn.LayerNorm.load_no_bias = load_layer_norm_no_bias + +if SYSTEM == "cuda": + import dropout_layer_norm + + class FastLayerNorm(nn.LayerNorm): + def forward(self, hidden_states, residual=None): + if hidden_states.shape[-1] > 8192: + if residual is not None: + hidden_states += residual + residual = hidden_states + + return super(FastLayerNorm, self).forward(hidden_states), residual + else: + ( + normed_hidden_states, + residual, + *rest, + ) = dropout_layer_norm.dropout_add_ln_fwd( + hidden_states, + residual, + self.weight, + self.bias, + None, + None, + None, + None, + 0.0, + self.eps, + 1.0, + 0, + None, + False, + False, + ) + if residual is None: + residual = hidden_states + + return normed_hidden_states, residual + +elif SYSTEM == "rocm": + from vllm._C import ops + + class FastLayerNorm(nn.LayerNorm): + def forward(self, hidden_states, residual=None): + if residual is not None: + hidden_states += residual + residual = hidden_states + + return super().forward(hidden_states), residual + +elif SYSTEM == "ipex": + import intel_extension_for_pytorch as ipex + + class FastLayerNorm(nn.LayerNorm): + def forward(self, hidden_states, residual=None): + out = ipex.llm.functional.add_layer_norm( + residual, + hidden_states, + self.weight, + self.bias, + self.eps, + residual is not None, + ) + return out, residual if residual is not None else hidden_states + + +class FastRMSNorm(nn.Module): + def __init__(self, weight: torch.Tensor, eps: float): + super().__init__() + + self.weight = nn.Parameter(weight) + self.variance_epsilon = eps + + @classmethod + def load(cls, prefix, weights, eps=1e-6): + weight = weights.get_tensor(f"{prefix}.weight") + return cls(weight, eps) + + def forward(self, hidden_states, residual=None): + if SYSTEM == "ipex": + out = ipex.llm.functional.add_rms_norm( + residual, + hidden_states, + 
self.weight, + None, + self.variance_epsilon, + residual is not None, + ) + return out, residual if residual is not None else hidden_states + elif hidden_states.shape[-1] > 8192: + if residual is not None: + hidden_states += residual + residual = hidden_states + + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt( + variance + self.variance_epsilon + ) + + # convert into half-precision if necessary + if self.weight.dtype in [torch.float16, torch.bfloat16]: + hidden_states = hidden_states.to(self.weight.dtype) + + return self.weight * hidden_states, residual + elif SYSTEM == "cuda": + # faster post attention rms norm + ( + normed_hidden_states, + res, + *rest, + ) = dropout_layer_norm.dropout_add_ln_fwd( + hidden_states, + residual, + self.weight, + None, + None, + None, + None, + None, + 0.0, + self.variance_epsilon, + 1.0, + 0, + None, + False, + True, # Activate RMSNorm + ) + if res is None: + res = hidden_states + + return normed_hidden_states, res + elif SYSTEM == "rocm": + # We use VLLM RMSNorm kernel that can be compiled for RoCm, instead of Flash Attention ones that can not. + if residual is not None: + hidden_states += residual + residual = hidden_states + + out = torch.empty_like(hidden_states) + ops.rms_norm( + out, + hidden_states, + self.weight.data, + self.variance_epsilon, + ) + return out, residual + else: + raise ValueError( + "Your system seem to be not supported. Please check your install or open an issue at https://github.com/huggingface/text-generation-inference/issues with a clear reproduction." + ) diff --git a/backends/gaudi/server/text_generation_server/layers/linear.py b/backends/gaudi/server/text_generation_server/layers/linear.py new file mode 100644 index 000000000..08306d579 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/layers/linear.py @@ -0,0 +1,126 @@ +import torch +from text_generation_server.utils.import_utils import SYSTEM +from torch.nn import functional as F +import os + +if SYSTEM == "rocm": + ROCM_USE_SKINNY_GEMM = os.getenv("ROCM_USE_SKINNY_GEMM", "True").lower() in ( + "true", + "1", + ) + + if ROCM_USE_SKINNY_GEMM: + try: + from vllm import _custom_C + except Exception as e: + raise ImportError( + f"Could not load `vllm._custom_C` for ROCm skinny gemm. 
Full error: {e}" + ) + + +class FastLinear(torch.nn.Module): + def __init__( + self, + weight, + bias, + ) -> None: + super().__init__() + self.weight = torch.nn.Parameter(weight, requires_grad=False) + if bias is not None: + self.bias = torch.nn.Parameter(bias, requires_grad=False) + else: + self.bias = None + + @classmethod + def load(cls, config, prefix: str, weights, bias: bool): + weight = weights.get_tensor(f"{prefix}.weight") + if bias: + bias = weights.get_tensor(f"{prefix}.bias") + else: + bias = None + return cls(weight, bias) + + def forward(self, input: torch.Tensor) -> torch.Tensor: + return F.linear(input, self.weight, self.bias) + + +class FastLinearROCm(torch.nn.Module): + def __init__( + self, + weight, + bias, + ) -> None: + super().__init__() + self.weight = torch.nn.Parameter(weight) + if bias is not None: + self.bias = torch.nn.Parameter(bias) + else: + self.bias = None + + self.cu_count = torch.cuda.get_device_properties( + device="cuda" + ).multi_processor_count + self.use_skinny_gemm = ( + ROCM_USE_SKINNY_GEMM + and "gfx1" not in torch.cuda.get_device_properties("cuda").gcnArchName + ) + + @classmethod + def load(cls, config, prefix: str, weights, bias: bool): + weight = weights.get_tensor(f"{prefix}.weight") + if bias: + bias = weights.get_tensor(f"{prefix}.bias") + else: + bias = None + return cls(weight, bias) + + def forward(self, inp: torch.Tensor) -> torch.Tensor: + weight = self.weight + bias = self.bias + + if ( + self.use_skinny_gemm + and inp.dtype == torch.float16 + and inp.shape[-1] % 8 == 0 + ): + batched = False + inp_shape = inp.shape + + if inp.dim() == 3: + inp = inp.view(-1, inp_shape[-1]) + batched = True + + m, n, k = weight.shape[0], inp_shape[0], inp_shape[1] + if m > 8 and n <= 4: + out = torch.empty( + inp_shape[0], weight.shape[0], dtype=inp.dtype, device=weight.device + ) + _custom_C.wvSpltK(weight, inp, out, n, self.cu_count) + elif m % 4 == 0 and n == 1 and k <= 8192: + out = torch.empty( + inp_shape[0], weight.shape[0], dtype=inp.dtype, device=weight.device + ) + _custom_C.LLMM1(weight, inp, out, 4) + else: + out = F.linear(inp, weight) + + if batched: + out.view(*inp_shape[:-1], out.shape[-1]) + + if bias is not None: + out = out + bias + return out + return F.linear(inp, self.weight, self.bias) + + +def get_linear(weight, bias): + # Weights that are loaded through methods that are not + # quantization-aware are still bare tensors. We may want + # to change this in the future. 
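+    # Bare tensors fall through to the plain FastLinear / FastLinearROCm path
+    # below; quantized weight objects are expected to build their own layer via
+    # weight.get_linear(bias).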
+ if isinstance(weight, torch.Tensor): + if SYSTEM == "rocm": + return FastLinearROCm(weight, bias) + else: + return FastLinear(weight, bias) + + return weight.get_linear(bias) diff --git a/backends/gaudi/server/text_generation_server/layers/lora.py b/backends/gaudi/server/text_generation_server/layers/lora.py new file mode 100644 index 000000000..a4537b55b --- /dev/null +++ b/backends/gaudi/server/text_generation_server/layers/lora.py @@ -0,0 +1,279 @@ +from typing import TYPE_CHECKING, Optional, List + +import torch +import torch.distributed +from torch import nn +from torch.distributed import ProcessGroup + +from text_generation_server.utils.sgmv import ( + add_lora_a_bgmv, + add_lora_b_bgmv, + has_sgmv, + lora_a_sgmv_cutlass, + lora_b_sgmv_cutlass, + orient_for_rank, +) + +if TYPE_CHECKING: + from text_generation_server.adapters import AdapterBatchData + from text_generation_server.adapters.lora import BatchLoraWeights + + +class LoraLinear(nn.Module): + def __init__( + self, base_layer: nn.Module, layer_id: int, process_group: ProcessGroup + ): + super().__init__() + self.base_layer = base_layer + self.layer_id = layer_id + self.process_group = process_group + + def forward_layer_type( + self, + result: torch.Tensor, + input: torch.Tensor, + adapter_data: "AdapterBatchData", + layer_type: str, + start_idx: int, + end_idx: int, + ) -> torch.Tensor: + if adapter_data is None: + return result + data: Optional["BatchLoraWeights"] = adapter_data.data.get(layer_type) + + if has_sgmv() and data is not None and data.can_vectorize(self.process_group): + # In tensor-parallel configurations, each GPU processes a specific segment of the output. + # The 'result' tensor represents the full output, which can vary in size based on + # the layer type (e.g., attention vs. feed-forward layers). We define the current + # segment using start_idx and end_idx. If the segment size doesn't match this GPU's + # slice of 'result', we create a zero tensor of the correct size for LoRA computation. + # This approach ensures accurate LoRA application across various layer sizes and + # configurations, adapting to different model architectures and parallelization strategies. + # + # Example scenarios where this is necessary: + # 1. The adapter's size doesn't evenly divide across GPUs. + # 2. We're processing the last segment which might be smaller. + # 3. Different projection layers (q, k, v) have different sizes. 
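+            # If this GPU's slice of `result` matches [start_idx, end_idx), the LoRA
+            # contribution is accumulated into it directly; otherwise a temporary
+            # zero buffer is used and added back into result[:, start_idx:end_idx]
+            # after the LoRA matmuls below.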
+ if end_idx - start_idx != result.shape[1]: + proj = torch.zeros_like(result[:, start_idx:end_idx]) + else: + proj = result + + for r, rank_segments in data.rank_data.items(): + lora_a_ptr = rank_segments.lora_a_ptr + lora_b_ptr = rank_segments.lora_b_ptr + + if lora_a_ptr is None or lora_b_ptr is None: + raise ValueError("LoRA data is missing") + + if data.use_sgmv: + # Use SGMV for prefill + v = lora_a_sgmv_cutlass( + input, + rank_segments.tmp_shrink, + lora_a_ptr, + rank_segments.segment_starts, + rank_segments.segment_ends, + self.layer_id, + r, + ) + + if self.process_group.size() > 1: + v = self.collect_lora_a(v) + + lora_b_sgmv_cutlass( + proj, + v, + rank_segments.tmp_expand, + lora_b_ptr, + rank_segments.segment_starts, + rank_segments.segment_ends, + self.layer_id, + ) + else: + # Use BGMV for decode + v = torch.zeros( + (input.size(0), r), dtype=input.dtype, device=input.device + ) + # TODO: error with [-1, 0], but not [0, -1] + add_lora_a_bgmv( + v, + input, + lora_a_ptr, + rank_segments.indices, + self.layer_id, + ) + + if self.process_group.size() > 1: + v = self.collect_lora_a(v) + + add_lora_b_bgmv( + proj, + v, + lora_b_ptr, + rank_segments.indices, + self.layer_id, + ) + + if end_idx - start_idx != result.shape[1]: + result[:, start_idx:end_idx] += proj + else: + for adapter_index in adapter_data.meta.adapter_set: + if data is not None and data.has_adapter(adapter_index): + adapter_mask = ( + (adapter_data.meta.adapter_indices == adapter_index) + .to(input.dtype) + .view(-1, 1) + ) + layer_result = self.forward_lora( + input, data, adapter_index, adapter_mask + ) + result[:, start_idx:end_idx] += layer_result + + return result + + def forward_lora( + self, + input: torch.Tensor, + data: "BatchLoraWeights", + adapter_index: int, + adapter_mask: torch.Tensor, + ) -> torch.Tensor: + lora_a = data.lora_a[adapter_index][self.layer_id, :, :] + lora_b = data.lora_b[adapter_index][self.layer_id, :, :] + + lora_a = orient_for_rank(lora_a, lora_b.size(0)) + + a_out = input @ lora_a + if self.process_group.size() > 1: + a_out = self.collect_lora_a(a_out) + + result = (a_out @ lora_b) * adapter_mask + return result + + def collect_lora_a(self, a_out: torch.Tensor) -> torch.Tensor: + raise NotImplementedError("Implemented in subclasses") + + +class TensorParallelMultiAdapterLinear(LoraLinear): + def __init__( + self, + base_layer: nn.Module, + layer_id: int, + layer_names: List[str], + sizes: List[int], + process_group: ProcessGroup, + ): + super().__init__(base_layer, layer_id, process_group) + self.layer_names = layer_names + self.sizes = sizes + + @classmethod + def load( + cls, + base_layer: nn.Module, + layer_id: int, + layer_names: List[str], + sizes: List[int], + process_group: ProcessGroup, + ): + return TensorParallelMultiAdapterLinear( + base_layer, layer_id, layer_names, sizes, process_group + ) + + def forward( + self, input: torch.Tensor, adapter_data: "AdapterBatchData" + ) -> torch.Tensor: + result = self.base_layer(input) + + # noop if no layer names are provided (e.g. 
for models without adapters) + if self.layer_names is None: + return result + + # handle models like Bloom that have inputs of shape + # (batch_size, sequence_length, hidden_size) + # we need to reshape them to (batch_size * sequence_length, hidden_size) + # for the LoRA computation, then reshape back + prev_shape = result.shape + is_3d = len(input.shape) >= 3 + if is_3d: + input = input.reshape(-1, input.shape[-1]) + result = result.reshape(-1, result.shape[-1]) + + offset = 0 + for i, layer_name in enumerate(self.layer_names): + start_idx = offset // self.process_group.size() + # The 'sizes' parameter is essential in tensor-parallel setups for handling multiple + # projection layers (q_proj, k_proj, v_proj) by defining their output dimensions. It + # ensures correct slicing of the result tensor, accommodating variations like grouped-query + # attention where k_proj and v_proj differ from q_proj. This allows precise application of + # LoRA adapters to each sub-component of the multi-head attention mechanism, managing the + # different projection sizes across layers and model architectures. + if self.sizes is not None: + offset += self.sizes[i] + end_idx = offset // self.process_group.size() + else: + end_idx = result.shape[1] + + result = self.forward_layer_type( + result, input, adapter_data, layer_name, start_idx, end_idx + ) + + if is_3d: + result = result.reshape(prev_shape) + + return result + + def collect_lora_a(self, a_out: torch.Tensor) -> torch.Tensor: + # Tensor parallel implementation of X @ A@B, where A and B are sharded column-wise. + # We use an all-gather between X@A and (X@A)@B to ensure alignment across ranks. + # + # TODO(travis): this is not very efficient as we do an all-gather for every adapter, + # instead we could pre-allocate a (B, a, r) tensor for all adapters with the same + # rank, compute `a_out` on each, and then slice them into the buffer as shown here: + # https://discuss.pytorch.org/t/concatenate-tensors-without-memory-copying/34609 + gathered_tensors = [ + torch.empty_like(a_out) for _ in range(self.process_group.size()) + ] + torch.distributed.all_gather(gathered_tensors, a_out) + return torch.cat(gathered_tensors, dim=1) + + +class TensorParallelAdapterRowLinear(LoraLinear): + def __init__(self, base_layer, layer_id, layer_name, process_group): + super().__init__(base_layer, layer_id, process_group) + self.layer_name = layer_name + + @classmethod + def load(cls, base_layer, layer_id, layer_name, process_group): + return cls(base_layer, layer_id, layer_name, process_group) + + def forward( + self, input: torch.Tensor, adapter_data: "AdapterBatchData" + ) -> torch.Tensor: + result = self.base_layer(input) + + if self.layer_name is None: + return result + + # Fused all-gather + all-reduce from S-LoRA paper: https://arxiv.org/abs/2311.03285 + stride = result.shape[-1] // self.process_group.size() + start_idx = self.process_group.rank() * stride + end_idx = (self.process_group.rank() + 1) * stride + + self.forward_layer_type( + result, input, adapter_data, self.layer_name, start_idx, end_idx + ) + + return result + + def collect_lora_a(self, a_out: torch.Tensor) -> torch.Tensor: + # Tensor parallel implementation of X @ A@B, where A and B are sharded row-wise. + # We use an all-reduce between X@A and (X@A)@B to ensure alignment across ranks. 
+ # + # TODO(travis): this is not very efficient as we do an all-reduce for every adapter, + # instead we could pre-allocate a (B, a, r) tensor for all adapters with the same + # rank, compute `a_out` on each, and then slice them into the buffer as shown here: + # https://discuss.pytorch.org/t/concatenate-tensors-without-memory-copying/34609 + torch.distributed.all_reduce(a_out, group=self.process_group) + return a_out diff --git a/backends/gaudi/server/text_generation_server/layers/marlin/__init__.py b/backends/gaudi/server/text_generation_server/layers/marlin/__init__.py new file mode 100644 index 000000000..3ff3ed58f --- /dev/null +++ b/backends/gaudi/server/text_generation_server/layers/marlin/__init__.py @@ -0,0 +1,15 @@ +from text_generation_server.layers.marlin.fp8 import GPTQMarlinFP8Linear +from text_generation_server.layers.marlin.gptq import ( + GPTQMarlinWeightsLoader, + can_use_gptq_marlin, + repack_gptq_for_marlin, +) +from text_generation_server.layers.marlin.marlin import MarlinWeightsLoader + +__all__ = [ + "GPTQMarlinFP8Linear", + "GPTQMarlinWeightsLoader", + "MarlinWeightsLoader", + "can_use_gptq_marlin", + "repack_gptq_for_marlin", +] diff --git a/backends/gaudi/server/text_generation_server/layers/marlin/fp8.py b/backends/gaudi/server/text_generation_server/layers/marlin/fp8.py new file mode 100644 index 000000000..fe55a58a3 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/layers/marlin/fp8.py @@ -0,0 +1,140 @@ +from typing import Optional + +import torch +import torch.nn as nn +from loguru import logger +from text_generation_server.layers.fp8 import fp8_quantize +from text_generation_server.layers.marlin.gptq import _check_valid_shape +from text_generation_server.layers.marlin.util import ( + _check_marlin_kernels, + permute_scales, +) +from text_generation_server.utils.log import log_once + +try: + import marlin_kernels +except ImportError: + marlin_kernels = None + + +MARLIN_TILE_SIZE = 16 + + +class GPTQMarlinFP8Linear(nn.Module): + """ + FP8 GPTQ-Marlin linear layer. 
+ """ + + def __init__( + self, + qweight: torch.Tensor, + scales: torch.Tensor, + bias: Optional[torch.Tensor], + ) -> None: + super().__init__() + + _check_marlin_kernels() + assert marlin_kernels is not None + + log_once(logger.info, "GPU does not support FP8, using Marlin FP8 kernel") + + scales = scales.unsqueeze(0) + if scales.shape[1] == 1: + out_features, in_features = qweight.shape + scales = scales.repeat(1, out_features) + qweight, scales = repack_fp8_for_marlin(qweight, scales) + + in_features = qweight.shape[0] * MARLIN_TILE_SIZE + out_features = scales.shape[1] + _check_valid_shape(in_features=in_features, out_features=out_features) + + self.qweight = qweight + self.scales = scales + self.bias = bias if bias is not None else None + + self.workspace = torch.zeros( + out_features // 64 * 16, dtype=torch.int, device=qweight.device + ) + + @classmethod + def from_unquant(cls, weight, bias, dtype): + qweight, scales = fp8_quantize(weight) + return cls(qweight=qweight, scales=scales.to(dtype), bias=bias) + + @classmethod + def from_fp8(cls, weight, scale, _input_scale, bias, dtype): + return cls(qweight=weight, scales=scale.to(dtype), bias=bias) + + def forward(self, A: torch.Tensor) -> torch.Tensor: + assert marlin_kernels is not None + + A_flat = A.view(-1, A.shape[-1]) + C = marlin_kernels.fp8_marlin_gemm( + A_flat, + self.qweight, + self.scales, + self.workspace, + 8, + A_flat.shape[0], + self.scales.shape[1], + A_flat.shape[1], + ) + C = C.reshape(A.shape[:-1] + (self.scales.shape[1],)) + + if self.bias is not None: + C += self.bias + + return C + + +def pack_fp8_as_int32(fp8_tensor: torch.Tensor) -> torch.Tensor: + """ + Repack FP8 weights to gptq format (packed int32 elements). + """ + assert fp8_tensor.dtype == torch.float8_e4m3fn + + if fp8_tensor.shape[0] % 4 != 0: + raise ValueError( + f"Leading tensor dimension is not divisable by 4: {fp8_tensor.shape[0]}" + ) + + # Reshape to prepare for packing + reshaped = fp8_tensor.reshape(-1, 4, *fp8_tensor.shape[1:]) + + # Convert fp8 to uint8 (byte) representation + byte_tensor = reshaped.view(torch.uint8) + + # Pack 4 uint8 values into one int32 + packed = torch.zeros( + fp8_tensor.shape[0] // 4, + fp8_tensor.shape[1], + dtype=torch.int32, + device=fp8_tensor.device, + ) + + for i in range(4): + packed.bitwise_or_(byte_tensor[:, i].to(torch.int32) << i * 8) + + return packed + + +def repack_fp8_for_marlin(weight: torch.Tensor, scales: torch.Tensor): + """ + Repack FP8 tensor for GPTQ-Marlin. + """ + + out_features, in_features = weight.shape + + # Torch linear layers weights with shape [out_features, in_features], + # GPTQ-quantized weights use [in_feateres/pack_factor, in_features], + # so transpose before packing. 
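+    # pack_fp8_as_int32 packs four FP8 bytes into one int32 along the first
+    # dimension, so the transposed weight comes out as
+    # (in_features // 4, out_features) before the Marlin repack below.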
+ qweight = pack_fp8_as_int32(weight.t()) + + perm = torch.empty(0, dtype=torch.int, device=qweight.device) + repacked = marlin_kernels.gptq_marlin_repack( + qweight, perm, in_features, out_features, 8 + ) + + scales = permute_scales(scales) + + return repacked, scales diff --git a/backends/gaudi/server/text_generation_server/layers/marlin/gptq.py b/backends/gaudi/server/text_generation_server/layers/marlin/gptq.py new file mode 100644 index 000000000..0a785d944 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/layers/marlin/gptq.py @@ -0,0 +1,464 @@ +from dataclasses import dataclass +from typing import List, Optional, Union + +import numpy +import torch +import torch.nn as nn +from loguru import logger +from text_generation_server.layers.marlin.util import ( + _check_marlin_kernels, + marlin_zero_points, + permute_scales, + unpack_cols, +) +from text_generation_server.utils.import_utils import SYSTEM +from text_generation_server.utils.log import log_once +from text_generation_server.utils.weights import Weight, Weights, WeightsLoader + +try: + import marlin_kernels +except ImportError: + marlin_kernels = None + +try: + major, _minor = torch.cuda.get_device_capability() + has_sm_8_0 = major >= 8 +except Exception: + has_sm_8_0 = False + + +GPTQ_MARLIN_BITS = [4, 8] +GPTQ_MARLIN_GROUP_SIZES = [-1, 32, 64, 128] +MARLIN_TILE_SIZE = 16 + + +def can_use_gptq_marlin( + *, bits: int, groupsize: int, quant_method: str, quantize: str, sym: bool +) -> bool: + return ( + SYSTEM == "cuda" + and marlin_kernels is not None + and has_sm_8_0 + and quantize in {"awq", "gptq"} + and quant_method in {"awq", "gptq"} + and bits in GPTQ_MARLIN_BITS + and groupsize in GPTQ_MARLIN_GROUP_SIZES + # We only suppord asymmetric quantization for AWQ. + and (sym or quant_method == "awq") + ) + + +class GPTQMarlinWeightsLoader(WeightsLoader): + """ + Loader for using GPTQ- and AWQ-quantized weights with Marlin kernels. + """ + + def __init__( + self, + *, + bits: int, + desc_act: bool, + groupsize: int, + quant_method: str, + quantize: str, + sym: bool, + ): + self.bits = bits + self.desc_act = desc_act + self.groupsize = groupsize + self.quant_method = quant_method + self.quantize = quantize + self.sym = sym + + def get_weights(self, weights: Weights, prefix: str): + log_once(logger.info, "Using GPTQ-Marlin kernels") + try: + qweight = weights.get_tensor(f"{prefix}.qweight") + except RuntimeError: + raise RuntimeError( + f"Cannot load `{self.quantize}` weight for GPTQ -> Marlin repacking, make sure the model is already quantized" + ) + + if not self.sym: + qzeros = weights.get_tensor(f"{prefix}.qzeros") + else: + qzeros = None + + if self.quant_method == "awq": + g_idx = None + else: + g_idx = weights.get_tensor(f"{prefix}.g_idx") + scales = weights.get_tensor(f"{prefix}.scales") + + return repack_gptq_for_marlin( + qweight=qweight, + scales=scales, + qzeros=qzeros, + g_idx=g_idx, + bits=self.bits, + desc_act=self.desc_act, + groupsize=self.groupsize, + quant_method=self.quant_method, + sym=self.sym, + sharded_infeatures=False, + ) + + def get_weights_col_packed( + self, + weights: Weights, + prefix: str, + block_sizes: Union[int, List[int]], + ): + try: + qweight = weights.get_packed_sharded( + f"{prefix}.qweight", dim=1, block_sizes=block_sizes + ) + except RuntimeError: + raise RuntimeError( + f"Cannot load `{self.quantize}` weight, make sure the model is already quantized." 
+ ) + scales = weights.get_packed_sharded( + f"{prefix}.scales", dim=1, block_sizes=block_sizes + ) + scales = scales.to(dtype=weights.dtype) + + if not self.sym: + qzeros = weights.get_packed_sharded( + f"{prefix}.qzeros", dim=1, block_sizes=block_sizes + ) + else: + qzeros = None + + if self.quant_method == "awq": + g_idx = None + else: + g_idx = weights.get_tensor(f"{prefix}.g_idx") + return repack_gptq_for_marlin( + qweight=qweight, + scales=scales, + qzeros=qzeros, + g_idx=g_idx, + bits=self.bits, + desc_act=self.desc_act, + groupsize=self.groupsize, + quant_method=self.quant_method, + sym=self.sym, + sharded_infeatures=False, + ) + + def get_multi_weights_col(self, weights: Weights, prefixes: List[str], dim: int): + try: + qweight = torch.cat( + [weights.get_sharded(f"{p}.qweight", dim=1) for p in prefixes], dim=1 + ) + except RuntimeError: + raise RuntimeError( + f"Cannot load `{self.quantize}` weight, make sure the model is already quantized" + ) + + scales = torch.cat( + [weights.get_sharded(f"{p}.scales", dim=1) for p in prefixes], dim=1 + ) + + if not self.sym: + qzeros = torch.cat( + [weights.get_sharded(f"{p}.qzeros", dim=1) for p in prefixes], dim=1 + ) + else: + qzeros = None + + if self.quant_method == "awq": + g_idx = None + else: + w = [weights.get_tensor(f"{p}.g_idx") for p in prefixes] + for w2 in w[1:]: + torch.testing.assert_close(w2, w[0]) + g_idx = w[0] + + return repack_gptq_for_marlin( + qweight=qweight, + scales=scales, + qzeros=qzeros, + g_idx=g_idx, + bits=self.bits, + desc_act=self.desc_act, + groupsize=self.groupsize, + quant_method=self.quant_method, + sym=self.sym, + sharded_infeatures=False, + ) + + def get_weights_row(self, weights: Weights, prefix: str): + log_once(logger.info, "Using GPTQ-Marlin kernels") + try: + qweight = weights.get_sharded(f"{prefix}.qweight", dim=0) + except RuntimeError: + raise RuntimeError( + f"Cannot load `{self.quantize}` weight for GPTQ -> Marlin repacking, make sure the model is already quantized" + ) + + if not self.sym: + if self.desc_act or self.groupsize == -1: + qzeros = weights.get_tensor(f"{prefix}.qzeros") + else: + qzeros = weights.get_sharded(f"{prefix}.qzeros", dim=0) + else: + qzeros = None + + if self.quant_method == "awq": + g_idx = None + else: + g_idx = weights.get_sharded(f"{prefix}.g_idx", dim=0) + + if self.desc_act or self.groupsize == -1: + scales = weights.get_tensor(f"{prefix}.scales") + else: + scales = weights.get_sharded(f"{prefix}.scales", dim=0) + + sharded_in_features = weights.process_group.size() > 1 + + return repack_gptq_for_marlin( + qweight=qweight, + scales=scales, + qzeros=qzeros, + g_idx=g_idx, + bits=self.bits, + desc_act=self.desc_act, + groupsize=self.groupsize, + quant_method=self.quant_method, + sym=self.sym, + sharded_infeatures=sharded_in_features, + ) + + def _get_gptq_params(self, weights: Weights): + if weights._has_tensor("gptq_bits") and weights._has_tensor("gptq_groupsize"): + self.bits = weights.get_tensor("gptq_bits").item() + self.groupsize = weights.get_tensor("gptq_groupsize").item() + self.desc_act = False + # `server quantize` used asymmetric quantization unconditionally + # before the `gptq_sym` setting tensor was added. + self.sym = ( + weights.get_tensor("gptq_sym").item() + if weights._has_tensor("gptq_sym") + else False + ) + self.quant_method = "gptq" + + +@dataclass +class GPTQMarlinWeight(Weight): + """ + Repacked GPTQ Marlin weights. 
+ """ + + qweight: torch.Tensor + qzeros: torch.Tensor + scales: torch.Tensor + g_idx: torch.Tensor + perm: torch.Tensor + bits: int + is_full_k: bool + + def __post_init__(self): + assert self.qweight.dtype == torch.int32 + assert self.scales.dtype == torch.float16 + assert self.g_idx.dtype == torch.int32 + assert self.perm.dtype == torch.int32 + + def get_linear(self, bias: torch.Tensor): + return GPTQMarlinLinear( + weight=self, + bias=bias, + ) + + +def repack_gptq_for_marlin( + *, + qweight: torch.Tensor, + qzeros: Optional[torch.Tensor], + scales: torch.Tensor, + g_idx: Optional[torch.Tensor], + bits: int, + desc_act: bool, + groupsize: int, + quant_method: str, + sym: bool, + sharded_infeatures: bool, +) -> GPTQMarlinWeight: + """Convert GPTQ weights to a layout that's compatible with GPTQ-Marlin kernels.""" + _check_marlin_kernels() + assert marlin_kernels is not None + + if bits not in GPTQ_MARLIN_BITS: + supported_bits = ", ".join(str(b) for b in GPTQ_MARLIN_BITS) + raise RuntimeError( + f"Repacking {bits}-bit GPTQ weights as Marlin is not supported, must be one of: {supported_bits}" + ) + + if groupsize not in GPTQ_MARLIN_GROUP_SIZES: + supported_sizes = ", ".join(str(b) for b in GPTQ_MARLIN_GROUP_SIZES) + raise RuntimeError( + f"Repacking GPTQ weights with group size {groupsize} as Marlin is not supported, must be one of: {supported_sizes}" + ) + if not (sym or quant_method == "awq"): + raise RuntimeError( + "Repacking GPTQ weights with asymmetric quantization as Marlin is not supported." + ) + + log_once(logger.info, f"Converting {quant_method} model to Marlin packing format.") + + weights_per_int = 32 // bits + in_features = qweight.shape[0] + out_features = qweight.shape[1] + + # AWQ uses column packing, GPTQ uses row packing + if quant_method == "awq": + out_features *= weights_per_int + else: + in_features *= weights_per_int + + if in_features % groupsize != 0: + raise ValueError( + f"Number of input features ({in_features}) not divisible by group size ({groupsize})" + ) + + if g_idx is not None and desc_act and groupsize != -1: + perm = torch.argsort(g_idx).to(torch.int) + g_idx = g_idx[perm] + else: + perm = torch.empty(0, dtype=torch.int, device=qweight.device) + g_idx = torch.empty(0, dtype=torch.int, device=qweight.device) + + if quant_method == "awq": + repacked = marlin_kernels.awq_marlin_repack( + qweight, in_features, out_features, bits + ) + if qzeros is not None: + qzeros = awq_to_marlin_zero_points( + qzeros, + in_features // groupsize, + out_features, + bits, + ) + + else: + repacked = marlin_kernels.gptq_marlin_repack( + qweight, perm, in_features, out_features, bits + ) + + if qzeros is None: + qzeros = torch.empty(0, dtype=torch.int, device=qweight.device) + + scales = permute_scales(scales) + + is_full_k = not (desc_act and groupsize != -1 and sharded_infeatures) + + return GPTQMarlinWeight( + qweight=repacked, + qzeros=qzeros, + scales=scales, + g_idx=g_idx, + perm=perm, + bits=bits, + is_full_k=is_full_k, + ) + + +class GPTQMarlinLinear(nn.Module): + """ + Linear layer for GPTQ weights that were converted for the GPTQ-Marlin + kernels. 
+ """ + + def __init__( + self, + *, + weight: GPTQMarlinWeight, + bias: Optional[torch.Tensor], + ): + super().__init__() + + _check_marlin_kernels() + assert marlin_kernels is not None + + in_features = weight.qweight.shape[0] * MARLIN_TILE_SIZE + out_features = weight.scales.shape[1] + _check_valid_shape(in_features=in_features, out_features=out_features) + + self.bits = weight.bits + self.is_full_k = weight.is_full_k + + self.qweight = weight.qweight + self.qzeros = weight.qzeros + self.scales = weight.scales + self.g_idx = weight.g_idx + self.perm = weight.perm + if bias is not None: + self.bias = bias + else: + self.bias = None + + self.workspace = torch.zeros( + out_features // 64 * 16, dtype=torch.int, device=weight.qweight.device + ) + + def forward(self, A: torch.Tensor) -> torch.Tensor: + assert marlin_kernels is not None + + A_flat = A.view(-1, A.shape[-1]) + C = marlin_kernels.gptq_marlin_gemm( + A_flat, + self.qweight, + self.scales, + self.qzeros, + self.g_idx, + self.perm, + self.workspace, + self.bits, + A_flat.shape[0], + self.scales.shape[1], + A_flat.shape[1], + self.is_full_k, + self.qzeros.numel() > 0, + True, + ) + C = C.reshape(A.shape[:-1] + (self.scales.shape[1],)) + + if self.bias is not None: + C += self.bias + + return C + + +def awq_to_marlin_zero_points( + q_zp_packed: torch.Tensor, size_k: int, size_n: int, num_bits: int +) -> torch.Tensor: + # AWQ zero-points are quantized and packed on the column dim. + # In addition, the values are permuted based on dequantizer. + # Here we undo both of these, and then apply marlin permutation + # and pack it back. + q_zp = unpack_cols(q_zp_packed, num_bits, size_k, size_n) + + # Undo interleaving (use argsort(..) to get inverse perm) + if num_bits == 4: + undo_interleave = numpy.argsort(numpy.array([0, 2, 4, 6, 1, 3, 5, 7])) + elif num_bits == 8: + undo_interleave = numpy.argsort(numpy.array([0, 2, 1, 3])) + else: + raise Exception("num_bits must be 4 or 8, got {}".format(num_bits)) + + q_zp = q_zp.reshape((-1, len(undo_interleave)))[:, undo_interleave].ravel() + q_zp = q_zp.reshape((-1, size_n)).contiguous() + + marlin_zp = marlin_zero_points(q_zp, size_k, size_n, num_bits) + return marlin_zp + + +def _check_valid_shape(in_features: int, out_features: int): + if (in_features % 128 != 0 or out_features % 64 != 0) and ( + in_features % 64 != 0 or out_features % 128 != 0 + ): + raise ValueError( + f"The GPTQ Marlin kernel does not have a valid thread configuration for weight matrix with shape ({out_features}, {in_features})." + " The shape elements must be divisible by (128, 64) or (64, 128)." 
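# A self-contained check (toy values only) of the inverse-permutation trick used by
# awq_to_marlin_zero_points above: numpy.argsort of a permutation yields its inverse,
# so applying the AWQ interleave followed by its argsort restores the original order.
import numpy

interleave = numpy.array([0, 2, 4, 6, 1, 3, 5, 7])  # the 4-bit pattern used above
undo_interleave = numpy.argsort(interleave)

original = numpy.arange(8)
interleaved = original[interleave]
assert (interleaved[undo_interleave] == original).all()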
+ ) diff --git a/backends/gaudi/server/text_generation_server/layers/marlin/marlin.py b/backends/gaudi/server/text_generation_server/layers/marlin/marlin.py new file mode 100644 index 000000000..89ebaca62 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/layers/marlin/marlin.py @@ -0,0 +1,346 @@ +from dataclasses import dataclass +from typing import List, Optional, Union + +import torch +import torch.nn as nn +from text_generation_server.layers.marlin.util import _check_marlin_kernels +from text_generation_server.utils.weights import Weight, Weights, WeightsLoader + +try: + import marlin_kernels +except ImportError: + marlin_kernels = None + + +class MarlinWeightsLoader(WeightsLoader): + """Loader for Marlin-quantized weights.""" + + def __init__(self, *, bits: int, is_marlin_24: bool): + self.bits = bits + self.is_marlin_24 = is_marlin_24 + + def get_weights(self, weights: "Weights", prefix: str): + """ + Get weights at the given prefix and apply without tensor paralllism. + """ + is_marlin_24 = getattr(self, "gptq_checkpoint_format", None) == "marlin_24" + if is_marlin_24: + try: + B = weights.get_tensor(f"{prefix}.B_24") + except RuntimeError: + raise RuntimeError( + "Cannot load `marlin` 2:4 sparsity weight, make sure the model is already quantized." + ) + + B_meta = weights.get_tensor(f"{prefix}.B_meta") + s = weights.get_tensor(f"{prefix}.s") + weight = GPTQMarlin24Weight(B=B, B_meta=B_meta, s=s, bits=self.bits) + else: + try: + B = weights.get_tensor(f"{prefix}.B") + except RuntimeError: + raise RuntimeError( + "Cannot load `marlin` weight, make sure the model is already quantized." + ) + + s = weights.get_tensor(f"{prefix}.s") + weight = MarlinWeight(B=B, s=s) + + return weight + + def get_weights_col_packed( + self, + weights: Weights, + prefix: str, + block_sizes: Union[int, List[int]], + ): + if self.is_marlin_24: + B = weights.get_packed_sharded( + f"{prefix}.B_24", dim=1, block_sizes=block_sizes + ) + B_meta = weights.get_packed_sharded( + f"{prefix}.B_meta", dim=1, block_sizes=block_sizes + ) + s = weights.get_packed_sharded( + f"{prefix}.s", dim=1, block_sizes=block_sizes + ) + + weight = GPTQMarlin24Weight(B=B, B_meta=B_meta, s=s, bits=self.bits) + else: + B = weights.get_packed_sharded( + f"{prefix}.B", dim=1, block_sizes=block_sizes + ) + s = weights.get_packed_sharded( + f"{prefix}.s", dim=1, block_sizes=block_sizes + ) + weight = MarlinWeight(B=B, s=s) + + return weight + + def get_multi_weights_col(self, weights: Weights, prefixes: List[str], dim: int): + if self.is_marlin_24: + try: + B = torch.cat( + [weights.get_sharded(f"{p}.B_24", dim=1) for p in prefixes], dim=1 + ) + except RuntimeError: + raise RuntimeError( + "Cannot load `marlin` weight, make sure the model is already quantized" + ) + + B_meta = torch.cat( + [weights.get_sharded(f"{p}.B_meta", dim=1) for p in prefixes], dim=1 + ) + + s = torch.cat( + [weights.get_sharded(f"{p}.s", dim=1) for p in prefixes], dim=1 + ) + + weight = GPTQMarlin24Weight(B=B, B_meta=B_meta, s=s, bits=self.bits) + else: + try: + B = torch.cat( + [weights.get_sharded(f"{p}.B", dim=1) for p in prefixes], dim=1 + ) + except RuntimeError: + raise RuntimeError( + "Cannot load `marlin` weight, make sure the model is already quantized" + ) + s = torch.cat( + [weights.get_sharded(f"{p}.s", dim=1) for p in prefixes], dim=1 + ) + + weight = MarlinWeight(B=B, s=s) + + return weight + + def get_weights_row(self, weights: Weights, prefix: str): + if self.is_marlin_24: + try: + B = weights.get_sharded(f"{prefix}.B_24", dim=0) + 
except RuntimeError: + raise RuntimeError( + "Cannot load `marlin` 2:4 sparsity weight, make sure the model is already quantized." + ) + + B_meta = weights.get_sharded(f"{prefix}.B_meta", dim=0) + num_groups = weights._get_slice(f"{prefix}.s").get_shape()[0] + if num_groups == 1: + # The number of groups is 1 when groupsize == -1. share + # scales between all shards in this case. + s = weights.get_tensor(f"{prefix}.s") + else: + s = weights.get_sharded(f"{prefix}.s", dim=0) + + weight = GPTQMarlin24Weight(B=B, B_meta=B_meta, s=s, bits=self.bits) + else: + try: + B = weights.get_sharded(f"{prefix}.B", dim=0) + except RuntimeError: + raise RuntimeError( + "Cannot load `marlin` weight, make sure the model is already quantized." + ) + + num_groups = weights._get_slice(f"{prefix}.s").get_shape()[0] + if num_groups == 1: + # The number of groups is 1 when groupsize == -1. share + # scales between all shards in this case. + s = weights.get_tensor(f"{prefix}.s") + else: + s = weights.get_sharded(f"{prefix}.s", dim=0) + weight = MarlinWeight(B=B, s=s) + + return weight + + +@dataclass +class MarlinWeight(Weight): + """ + Marlin weights. + + Attributes: + B (torch.Tensor): int4-quantized weights packed into int32. + s (torch.Tensor): bfloat16/float16 scales. + """ + + B: torch.Tensor + s: torch.Tensor + + def __post_init__(self): + assert self.B.dtype == torch.int32 + assert self.s.dtype in [torch.float16, torch.bfloat16] + + def get_linear(self, bias: torch.Tensor): + return MarlinLinear(weight=self, bias=bias) + + +class MarlinLinear(nn.Module): + def __init__(self, *, weight: MarlinWeight, bias: Optional[torch.Tensor]): + super().__init__() + + _check_marlin_kernels() + assert marlin_kernels is not None + + in_features = weight.B.shape[0] * MARLIN_TILE_SIZE + out_features = weight.s.shape[1] + assert ( + in_features % 128 == 0 + ), f"Number of input features ({in_features}) not divisable by 128" + assert ( + out_features % 256 == 0 + ), f"Number of output features ({out_features}) not divisable by 256" + + groupsize = -1 if weight.s.shape[0] == 1 else in_features // weight.s.shape[0] + assert groupsize in { + -1, + 128, + }, f"Group size must be -1 or 128, was {groupsize}" + + self.B = weight.B + self.s = weight.s + if bias is not None: + self.bias = bias + else: + self.bias = None + + self.workspace = torch.zeros( + out_features // 64 * 16, dtype=torch.int, device=weight.B.device + ) + + def forward(self, A: torch.Tensor) -> torch.Tensor: + assert marlin_kernels is not None + + C = marlin_kernels.marlin_gemm( + A.view(-1, A.shape[-1]), + self.B, + self.s, + self.workspace, + A.shape[0], + self.s.shape[1], + A.shape[1], + ) + C = C.reshape(A.shape[:-1] + (self.s.shape[1],)) + + if self.bias is not None: + C += self.bias + + return C + + +GPTQ_MARLIN_24_MIN_THREAD_N = 128 +GPTQ_MARLIN_24_MIN_THREAD_K = 128 +GPTQ_MARLIN_24_MAX_PARALLEL = 64 +GPTQ_MARLIN_24_SUPPORTED_NUM_BITS = [4, 8] +GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES = [-1, 128] +MARLIN_TILE_SIZE = 16 + + +@dataclass +class GPTQMarlin24Weight: + """ + GPTQ-Marlin 2:4 weights. + + Attributes: + B (torch.Tensor): int4-quantized weights packed into int32. + B_meta (torch.Tensor): metadata for 2:4 sparsity. + s (torch.Tensor): float16 scales. + bits: quantized weight size. 
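# A minimal sketch of how MarlinLinear above recovers logical dimensions from the
# packed tensors: rows of `B` count input features in Marlin tiles of 16, columns of
# `s` give the output size, and the number of scale rows fixes the group size. The
# shapes used here are hypothetical examples, not taken from a real model.
MARLIN_TILE_SIZE = 16

def marlin_dims(b_shape, s_shape):
    in_features = b_shape[0] * MARLIN_TILE_SIZE
    out_features = s_shape[1]
    groupsize = -1 if s_shape[0] == 1 else in_features // s_shape[0]
    return in_features, out_features, groupsize

# 4096 input features stored as 256 tile rows, with scales for 32 groups of 128:
assert marlin_dims((256, 2048), (32, 4096)) == (4096, 4096, 128)
# A single row of scales corresponds to groupsize == -1 (one scale per output channel):
assert marlin_dims((256, 2048), (1, 4096)) == (4096, 4096, -1)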
+ """ + + B: torch.Tensor + B_meta: torch.Tensor + s: torch.Tensor + bits: int + + def __post_init__(self): + assert self.B.dtype == torch.int32 + assert self.B_meta.dtype == torch.int16 + assert self.s.dtype == torch.float16 + + def get_linear(self, bias: torch.Tensor): + return GPTQMarlin24Linear( + weight=self, + bias=bias, + ) + + +class GPTQMarlin24Linear(nn.Module): + def __init__(self, *, weight: GPTQMarlin24Weight, bias: Optional[torch.Tensor]): + super().__init__() + + _check_marlin_kernels() + assert marlin_kernels is not None + + if weight.bits not in GPTQ_MARLIN_24_SUPPORTED_NUM_BITS: + supported_bits = ", ".join( + str(b) for b in GPTQ_MARLIN_24_SUPPORTED_NUM_BITS + ) + raise RuntimeError( + f"{weight.bits}-bit GPTQ Sparse 2:4 Marlin is not supported, must be one of: {supported_bits}" + ) + + in_features = weight.B.shape[0] * MARLIN_TILE_SIZE * 2 + out_features = weight.s.shape[1] + groupsize = -1 if weight.s.shape[0] == 1 else in_features // weight.s.shape[0] + + if groupsize not in GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES: + supported_sizes = ", ".join( + str(b) for b in GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES + ) + raise RuntimeError( + f"Group size {groupsize} is not supported, must be one of: {supported_sizes}" + ) + + self.bits = weight.bits + weights_per_int32 = 32 // self.bits + + assert ( + out_features % GPTQ_MARLIN_24_MIN_THREAD_N == 0 + ), f"Number of output features ({out_features}) not divisable by {GPTQ_MARLIN_24_MIN_THREAD_N} threads" + assert ( + out_features % weights_per_int32 == 0 + ), f"Number of output features ({out_features}) not divisable by weights per int32 ({weights_per_int32})" + + assert ( + in_features % GPTQ_MARLIN_24_MIN_THREAD_K == 0 + ), f"Number of output features ({out_features}) not divisable by {GPTQ_MARLIN_24_MIN_THREAD_K} threads" + if groupsize != -1 and in_features % groupsize != 0: + raise ValueError( + f"Number of input features ({in_features}) not divisable by group size ({groupsize})" + ) + + self.B = weight.B + self.B_meta = weight.B_meta + self.s = weight.s + if bias is not None: + self.bias = bias + else: + self.bias = None + + self.workspace = torch.zeros( + (out_features // GPTQ_MARLIN_24_MIN_THREAD_N) * GPTQ_MARLIN_24_MAX_PARALLEL, + dtype=torch.int, + device=weight.B.device, + ) + + def forward(self, A: torch.Tensor) -> torch.Tensor: + assert marlin_kernels is not None + + C = marlin_kernels.gptq_marlin_24_gemm( + A.view(-1, A.shape[-1]), + self.B, + self.B_meta, + self.s, + self.workspace, + self.bits, + A.shape[0], + self.s.shape[1], + A.shape[1], + ) + + C = C.reshape(A.shape[:-1] + (self.s.shape[1],)) + + if self.bias is not None: + C += self.bias + + return C diff --git a/backends/gaudi/server/text_generation_server/layers/marlin/util.py b/backends/gaudi/server/text_generation_server/layers/marlin/util.py new file mode 100644 index 000000000..250d17141 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/layers/marlin/util.py @@ -0,0 +1,141 @@ +import functools +from typing import List, Tuple + +import numpy +import torch +from text_generation_server.utils.import_utils import SYSTEM + +try: + import marlin_kernels +except ImportError: + marlin_kernels = None + +try: + major, _minor = torch.cuda.get_device_capability() + has_sm_8_0 = major >= 8 +except Exception: + has_sm_8_0 = False + + +def _check_marlin_kernels(): + if not (SYSTEM == "cuda" and has_sm_8_0): + raise NotImplementedError( + "Using quantized Marlin models requires a GPU with CUDA capability 8.0 or later." 
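# A toy illustration (hypothetical weights) of the 2:4 structured sparsity that
# GPTQMarlin24Linear above relies on: within every contiguous group of four weights
# at most two are non-zero, so only half of the values are stored (plus the B_meta
# metadata that records their positions), which is why in_features is reconstructed
# as B.shape[0] * MARLIN_TILE_SIZE * 2.
import torch

def prune_2_4(row: torch.Tensor) -> torch.Tensor:
    # Keep the two largest-magnitude values in each group of four (toy pruning rule).
    groups = row.view(-1, 4)
    drop = groups.abs().topk(2, dim=1, largest=False).indices
    pruned = groups.clone()
    pruned.scatter_(1, drop, 0.0)
    return pruned.view(-1)

dense = torch.tensor([0.9, -0.1, 0.05, 0.7, -0.4, 0.3, 0.02, -0.01])
sparse = prune_2_4(dense)
assert (sparse.view(-1, 4) != 0).sum(dim=1).max() <= 2  # at most 2 non-zeros per 4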
+ ) + + if marlin_kernels is None: + raise NotImplementedError( + "marlin is not installed, install it with: pip install server/marlin" + ) + + +# https://github.com/IST-DASLab/marlin/blob/2f6d7c10e124b3c5fa29ff8d77d568bd7af3274c/marlin/__init__.py#L40C1-L68C54 +@functools.cache +def get_perms() -> Tuple[List[int], List[int]]: + scale_perm = [] + for i in range(8): + scale_perm.extend([i + 8 * j for j in range(8)]) + scale_perm_single = [] + for i in range(4): + scale_perm_single.extend([2 * i + j for j in [0, 1, 8, 9, 16, 17, 24, 25]]) + return scale_perm, scale_perm_single + + +def permute_scales(scales: torch.Tensor): + scale_perm, scale_perm_single = get_perms() + out_features = scales.shape[1] + if scales.shape[0] == 1: + scales = scales.reshape((-1, len(scale_perm_single)))[:, scale_perm_single] + else: + scales = scales.reshape((-1, len(scale_perm)))[:, scale_perm] + return scales.reshape((-1, out_features)).contiguous() + + +# Functions below are from vLLM + + +def get_pack_factor(bits: int) -> int: + if 32 % bits != 0: + raise ValueError(f"Cannot {bits} bit values into uint32") + return 32 // bits + + +def pack_cols( + q_w: torch.Tensor, + num_bits: int, + size_k: int, + size_n: int, +): + assert q_w.shape == (size_k, size_n) + + pack_factor = get_pack_factor(num_bits) + assert size_n % pack_factor == 0 + + orig_device = q_w.device + + q_w = q_w.cpu().numpy().astype(numpy.uint32) + + q_res = numpy.zeros((size_k, size_n // pack_factor), dtype=numpy.uint32) + + for i in range(pack_factor): + q_res |= q_w[:, i::pack_factor] << num_bits * i + + q_res = torch.from_numpy(q_res.astype(numpy.int32)).to(orig_device) + q_res = q_res.contiguous() + + return q_res + + +def unpack_cols( + packed_q_w: torch.Tensor, + num_bits: int, + size_k: int, + size_n: int, +): + pack_factor = get_pack_factor(num_bits) + assert size_n % pack_factor == 0 + assert packed_q_w.shape == ( + size_k, + size_n // pack_factor, + ), "packed_q_w.shape = {} size_k = {}, size_n = {} pack_Factor = {}".format( + packed_q_w.shape, size_k, size_n, pack_factor + ) + + orig_device = packed_q_w.device + + packed_q_w_cpu = packed_q_w.cpu().numpy().astype(numpy.uint32) + q_res = numpy.zeros((size_k, size_n), dtype=numpy.uint32) + + mask = (1 << num_bits) - 1 + for i in range(pack_factor): + vals = packed_q_w_cpu & mask + packed_q_w_cpu >>= num_bits + q_res[:, i::pack_factor] = vals + + q_res = torch.from_numpy(q_res.astype(numpy.int32)).to(orig_device) + q_res = q_res.contiguous() + + return q_res + + +def marlin_zero_points( + zp: torch.Tensor, size_k: int, size_n: int, num_bits: int +) -> torch.Tensor: + scale_perm, _ = get_perms() + # Permute zero-points in a similar way to scales, but do not use the + # "single" permutation, since zero-points are applied on every MMA + zp = zp.reshape((-1, len(scale_perm)))[:, scale_perm] + + # Interleave column dim (for the dequantize code) and pack it to int32 + if num_bits == 4: + interleave = numpy.array([0, 2, 4, 6, 1, 3, 5, 7]) + elif num_bits == 8: + interleave = numpy.array([0, 2, 1, 3]) + else: + raise Exception("num_bits must be 4 or 8, got {}".format(num_bits)) + + zp = zp.reshape((-1, len(interleave)))[:, interleave].ravel() + zp = zp.reshape((-1, size_n)).contiguous() + zp = pack_cols(zp, num_bits, size_k, size_n) + + return zp diff --git a/backends/gaudi/server/text_generation_server/layers/medusa.py b/backends/gaudi/server/text_generation_server/layers/medusa.py new file mode 100644 index 000000000..139c4dc25 --- /dev/null +++ 
b/backends/gaudi/server/text_generation_server/layers/medusa.py @@ -0,0 +1,191 @@ +import torch +from torch import nn +from typing import Tuple, Optional +from text_generation_server.utils.speculate import get_speculate +from text_generation_server.layers.linear import FastLinear +from text_generation_server.layers.tensor_parallel import ( + TensorParallelHead, + TensorParallelColumnLinear, +) + + +class ResBlock(torch.nn.Module): + def __init__(self, config, prefix, weights): + super().__init__() + self.linear = FastLinear.load( + config, prefix=f"{prefix}.linear", weights=weights, bias=True + ) + self.act = torch.nn.SiLU() + + def forward(self, x): + return x + self.act(self.linear(x)) + + +class MedusaModel(torch.nn.Module): + def __init__(self, config, medusa_config, weights): + super().__init__() + self.heads = torch.nn.ModuleList( + [ + MedusaHead(config, medusa_config, prefix=f"{i}", weights=weights) + for i in range(get_speculate()) + ] + ) + + def forward(self, x): + if not self.heads: + return None + speculative_logits = torch.stack([head(x) for head in self.heads], dim=1) + return speculative_logits + + +class MedusaHead(torch.nn.Module): + def __init__(self, config, medusa_config, prefix, weights): + super().__init__() + self.blocks = torch.nn.ModuleList( + [ + ResBlock(config, prefix=f"{prefix}.{i}", weights=weights) + for i in range(medusa_config["medusa_num_layers"]) + ] + ) + n = len(self.blocks) + self.out = FastLinear.load( + config, prefix=f"{prefix}.{n}", weights=weights, bias=False + ) + + def forward(self, x): + for block in self.blocks: + x = block(x) + x = self.out(x) + return x + + +class MedusaHeadV1(nn.Module): + def __init__(self, lm_head, medusa): + super().__init__() + self.lm_head = lm_head + self.medusa = medusa + + @staticmethod + def load(config, prefix: str, weights): + from pathlib import Path + from safetensors import safe_open + import json + + speculator = config.speculator + + path = speculator["path"] + medusa_config = str(Path(path) / "config.json") + + for fname in speculator["model_paths"]: + filename = str(Path(path) / fname) + + with open(medusa_config, "r") as f: + medusa_config = json.load(f) + routing = weights.routing + with safe_open(filename, framework="pytorch") as f: + for k in f.keys(): + if k in routing and routing[k] != filename: + raise RuntimeError( + f"Key {k} was found in multiple files: {filename} and {routing[k]}" + ) + routing[k] = filename + + medusa = MedusaModel(config, medusa_config, weights) + lm_head = TensorParallelHead.load(config, prefix, weights) + return MedusaHeadV1(lm_head, medusa) + + def forward( + self, input: torch.Tensor + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + logits = self.lm_head(input) + # If we have too many tokens, we skip speculative logits + if input.shape[0] > 128: + return logits, None + + speculative_logits = self.medusa(input) + return logits, speculative_logits + + +class MedusaHeadV2(nn.Module): + def __init__(self, config, prefix, weights): + super().__init__() + from pathlib import Path + from safetensors import safe_open + import json + + speculator_path = config.speculator["path"] + + medusa_config = str(Path(speculator_path) / "config.json") + filename = str(Path(speculator_path) / "medusa_lm_head.safetensors") + + with open(medusa_config, "r") as f: + medusa_config = json.load(f) + routing = weights.routing + with safe_open(filename, framework="pytorch") as f: + for k in f.keys(): + if k in routing and routing[k] != filename: + raise RuntimeError( + f"Key {k} was found in 
multiple files: {filename} and {routing[k]}" + ) + routing[k] = filename + + self.n_medusa_heads = get_speculate() + + assert medusa_config["medusa_num_layers"] == 1 + self.linear = TensorParallelColumnLinear.load_multi( + config, + prefixes=[f"{i}.0.linear" for i in range(self.n_medusa_heads)], + dim=0, + weights=weights, + bias=True, + ) + self.process_group = weights.process_group + self.world_size = self.process_group.size() + self.rank = self.process_group.rank() + + self.act = torch.nn.SiLU() + + self.lm_head = TensorParallelHead.load(config, prefix, weights) + + def forward(self, x): + # If we have too many tokens, we skip speculative logits + if x.shape[0] > 128: + logits = self.lm_head(x) + return logits, None + + size = x.shape[-1] + block_size = (size + self.world_size - 1) // self.world_size + start = self.rank * block_size + stop = (self.rank + 1) * block_size + + x_block = x[:, start:stop] + + # Compute all medusa heads at the same time, then reshape and move the n_medusa_heads dim to dim 1 + medusa_res = self.act(self.linear(x)).reshape( + *x_block.shape[:-1], self.n_medusa_heads, x_block.shape[-1] + ) + + # Apply all residual medusa heads + output = x[:, start:stop].unsqueeze(-2) + medusa_res + + # Gather medusa heads + world_output = [ + torch.empty_like(output) for _ in range(self.process_group.size()) + ] + torch.distributed.all_gather(world_output, output, group=self.process_group) + world_output = torch.cat(world_output, dim=-1) + + # Stack x and medusa residual x + stacked_x = torch.cat([x.unsqueeze(-2), world_output], dim=-2) + + # Compute lm head on x + medusa residual x + logits = self.lm_head(stacked_x) + + # Finally, split logits from speculative logits + logits, speculative_logits = torch.split( + logits, [1, self.n_medusa_heads], dim=-2 + ) + # Squeeze added dimension + logits = logits.squeeze(-2) + + return logits, speculative_logits diff --git a/backends/gaudi/server/text_generation_server/layers/mlp.py b/backends/gaudi/server/text_generation_server/layers/mlp.py new file mode 100644 index 000000000..d33b41f32 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/layers/mlp.py @@ -0,0 +1,282 @@ +import torch +import math +from torch import nn +from torch.nn import functional as F +from typing import Optional, Tuple +from text_generation_server.layers import TensorParallelEmbedding, FastLinear +from text_generation_server.layers.tensor_parallel import TensorParallelHead +from text_generation_server.utils.speculate import get_speculate + + +class MLPSpeculatorLayerNorm(nn.Module): + """ + A L2 normalization implementation + ... + Args + ---- + normalized_shape : int + Dimensionality of input data (size of final tensor axis) + elementwise_scale_weight : torch.Tensor + learned scaling term after normalization? + elementwise_shift_bias : torch.Tensor + learned bias term after normalization? + eps : float + Safety term to prevent division by zero. Make sure the chosen value fits in the range of your encoding scheme (i.e. fp16 requires eps >= 6e-8). 
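# A functional sketch of the normalization that MLPSpeculatorLayerNorm in this file
# applies: the input is rescaled by the root mean square of its last dimension (no
# mean subtraction, unlike nn.LayerNorm) and then given a learned scale and shift.
# The tensors below are arbitrary example values.
import torch

def rms_layernorm(x, weight, bias, eps=1e-6):
    xf = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps)
    return weight * xf.type_as(x) + bias

x = torch.randn(2, 8)
y = rms_layernorm(x, torch.ones(8), torch.zeros(8))
# After normalization each row has (approximately) unit root mean square.
assert torch.allclose(y.pow(2).mean(-1).sqrt(), torch.ones(2), atol=1e-3)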
+ """ + + def __init__( + self, + prefix, + config, + weights, + eps=1e-06, + ): + super(MLPSpeculatorLayerNorm, self).__init__() + self.weight = weights.get_tensor(f"{prefix}.weight") + self.bias = weights.get_tensor(f"{prefix}.bias") + self.eps = eps + + def forward(self, x): + xf = x + xf = xf * torch.rsqrt(xf.pow(2).mean(-1, keepdim=True) + self.eps) + x = xf.type_as(x) + x = self.weight * x + x = x + self.bias + return x + + +INV_SQRT2 = 2**-0.5 + + +def simple_norm(x: torch.Tensor, eps=1e-06): + xf = x + xf = xf * torch.rsqrt(xf.pow(2).mean(-1, keepdim=True) + eps) + x = xf.type_as(x) + return x * INV_SQRT2 + + +class MLPSpeculatorModelTied(torch.nn.Module): + def __init__(self, config, prefix, weights): + super().__init__() + self.config = config + self.n_predict = get_speculate() + self.hidden_size = config.hidden_size + + self.emb = TensorParallelEmbedding(f"{prefix}.emb.0", weights) + self.proj0 = FastLinear.load( + config, + prefix=f"{prefix}.proj.0", + weights=weights, + bias=False, + ) + self.proj1 = FastLinear.load( + config, + prefix=f"{prefix}.proj.1", + weights=weights, + bias=False, + ) + self.head = FastLinear.load(config, f"{prefix}.head.0", weights, bias=False) + self.ln = MLPSpeculatorLayerNorm( + prefix=f"{prefix}.ln.0", + config=config, + weights=weights, + ) + + # Weights ensure that state_0 accounts for 50% of state magnitude by final head in expectation + self.state_weight = 0.5 ** (0.5 / self.n_predict) if self.n_predict > 0 else 1 + self.activation = nn.GELU() + self.vsize = config.vocab_size + self.inner_dim = config.speculator_config["inner_dim"] + self.top_k_tokens_per_head = [1] * self.n_predict + self.emb_weight = math.sqrt(1 - self.state_weight**2) * math.sqrt( + self.inner_dim / 2 + ) + self.emb.weight *= self.emb_weight + + def forward( + self, + hidden_states: torch.Tensor, + input_ids: torch.Tensor, + ): + top_k_tokens_per_head = self.top_k_tokens_per_head + + # k indicates # of candidates + # h indicates # of generated tokens + state = hidden_states + b = state.size(0) + ind = input_ids.unsqueeze(0) + all_probs = torch.empty( + b, self.n_predict, self.vsize, device=state.device + ) # b k h v + assert ( + len(top_k_tokens_per_head) == self.n_predict + ), f"You must provide a topk number for each head ({self.n_predict} heads, {len(top_k_tokens_per_head)} provided)" + for i in range(self.n_predict): + # Project and predict + z = self.emb(ind) + # z = z.mul(self.emb_weight) # b k d + if i == 0: + state = self.proj0(state) * self.state_weight + z + else: + state = self.proj1(state) * self.state_weight + z + state = self.activation(self.ln(state)) # b k d + probs = F.log_softmax(self.head(state), dim=-1) # b k v + _probs, preds = probs.topk(top_k_tokens_per_head[i], dim=-1) # b k k' + + # Update candidate set with new predictions + + # Update distribution set with new logits + all_probs[:, i] = probs.exp() + + # Update state, log_probs and ind for new predictions + state = state.unsqueeze(2).expand( + -1, -1, top_k_tokens_per_head[i], -1 + ) # b k k' d + state = state.reshape(-1, b, state.size(3)) # b kk' d + ind = preds.view(-1, b) # b kk' + + speculative_logits = all_probs + return speculative_logits + + +class MLPSpeculatorModel(torch.nn.Module): + def __init__(self, config, prefix, weights): + super().__init__() + self.config = config + self.n_predict = get_speculate() + self.hidden_size = config.hidden_size + + self.emb = nn.ModuleList( + [ + TensorParallelEmbedding(f"{prefix}.emb.{i}", weights) + for i in range(self.n_predict) + ] + ) + self.proj = [ 
+ FastLinear.load( + config, + prefix=f"{prefix}.proj.{i}", + weights=weights, + bias=False, + ) + for i in range(self.n_predict) + ] + self.head = nn.ModuleList( + [ + FastLinear.load(config, f"{prefix}.head.{i}", weights, bias=False) + for i in range(self.n_predict) + ] + ) + self.ln = nn.ModuleList( + [ + MLPSpeculatorLayerNorm( + prefix=f"{prefix}.ln.{i}", + config=config, + weights=weights, + ) + for i in range(self.n_predict) + ] + ) + + # Weights ensure that state_0 accounts for 50% of state magnitude by final head in expectation + self.state_weight = 0.5 ** (0.5 / self.n_predict) if self.n_predict > 0 else 1 + self.activation = nn.GELU() + self.vsize = config.vocab_size + self.inner_dim = config.speculator_config["inner_dim"] + self.top_k_tokens_per_head = [1] * self.n_predict + self.emb_weight = math.sqrt(1 - self.state_weight**2) * math.sqrt( + self.inner_dim / 2 + ) + self.emb.weight *= self.emb_weight + + def forward( + self, + hidden_states: torch.Tensor, + input_ids: torch.Tensor, + ): + top_k_tokens_per_head = self.top_k_tokens_per_head + + # k indicates # of candidates + # h indicates # of generated tokens + state = hidden_states + b = state.size(0) + ind = input_ids.unsqueeze(0) + all_probs = torch.empty( + b, self.n_predict, self.vsize, device=state.device + ) # b k h v + assert ( + len(top_k_tokens_per_head) == self.n_predict + ), f"You must provide a topk number for each head ({self.n_predict} heads, {len(top_k_tokens_per_head)} provided)" + for i in range(self.n_predict): + # Project and predict + z = self.emb[i](ind) + # z = z.mul(self.emb_weight) # b k d + state = self.proj[i](state) * self.state_weight + z + state = self.activation(self.ln[i](state)) # b k d + probs = F.log_softmax(self.head[i](state), dim=-1) # b k v + _probs, preds = probs.topk(top_k_tokens_per_head[i], dim=-1) # b k k' + + # Update candidate set with new predictions + + # Update distribution set with new logits + all_probs[:, i] = probs.exp() + + # Update state, log_probs and ind for new predictions + state = state.unsqueeze(2).expand( + -1, -1, top_k_tokens_per_head[i], -1 + ) # b k k' d + state = state.reshape(-1, b, state.size(3)) # b kk' d + ind = preds.view(-1, b) # b kk' + + speculative_logits = all_probs + return speculative_logits + + +class MLPSpeculatorHead(nn.Module): + def __init__(self, lm_head, mlp_speculator, scale_input: bool): + super().__init__() + self.lm_head = lm_head + self.mlp_speculator = mlp_speculator + self.scale_input = scale_input + + def forward( + self, input: torch.Tensor + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + logits = self.lm_head(input) + # If we have too many tokens, we skip speculative logits + if input.shape[0] > 128: + return logits, None + + input_ids = logits.argmax(dim=-1) + if self.scale_input: + input = simple_norm(input) + speculative_logits = self.mlp_speculator(input, input_ids) + return logits, speculative_logits + + @staticmethod + def load(config, prefix: str, weights): + from pathlib import Path + from safetensors import safe_open + + speculator_path = config.speculator["path"] + + for fname in config.speculator["model_paths"]: + filename = str(Path(speculator_path) / fname) + routing = weights.routing + with safe_open(filename, framework="pytorch") as f: + for k in f.keys(): + if k in routing and routing[k] != filename: + raise RuntimeError( + f"Key {k} was found in multiple files: {filename} and {routing[k]}" + ) + routing[k] = filename + + tie_weights = config.speculator_config.get("tie_weights", False) + if tie_weights: + 
mlp_speculator = MLPSpeculatorModelTied(config, "speculator", weights) + else: + mlp_speculator = MLPSpeculatorModel(config, "speculator", weights) + # This is used in https://huggingface.co/ibm-fms/llama3-70b-accelerator + scale_input = config.speculator_config.get("scale_input", False) + lm_head = TensorParallelHead.load(config, prefix, weights) + return MLPSpeculatorHead(lm_head, mlp_speculator, scale_input) diff --git a/backends/gaudi/server/text_generation_server/layers/moe/__init__.py b/backends/gaudi/server/text_generation_server/layers/moe/__init__.py new file mode 100644 index 000000000..2c46ca02a --- /dev/null +++ b/backends/gaudi/server/text_generation_server/layers/moe/__init__.py @@ -0,0 +1,257 @@ +from typing import Optional, Protocol, runtime_checkable + +import torch +import torch.nn as nn +from loguru import logger +from transformers.activations import ACT2FN + +from text_generation_server.layers import ( + TensorParallelColumnLinear, + TensorParallelRowLinear, +) +from text_generation_server.layers.fp8 import HybridFP8UnquantLoader +from text_generation_server.layers.marlin import GPTQMarlinWeightsLoader +from text_generation_server.layers.moe.gptq_marlin import ( + GPTQMarlinSparseMoELayer, + can_use_marlin_moe_gemm, +) +from text_generation_server.layers.moe.unquantized import UnquantizedSparseMoELayer +from text_generation_server.utils.import_utils import SYSTEM +from text_generation_server.utils.log import log_once +from text_generation_server.utils.weights import ( + DefaultWeightsLoader, + Weights, + UnquantizedWeight, +) + +if SYSTEM == "rocm": + from .fused_moe_rocm import grouped_topk + from vllm.model_executor.layers.fused_moe import fused_topk +elif SYSTEM != "ipex": + from moe_kernels.fused_moe import fused_topk, grouped_topk + + +# NOTE: we are using a protocol here, because multiple inherance is not nice. +# We need `Module`, and `Module` -> some abstract class -> some concrete +# class inheritance is whacky. + + +@runtime_checkable +class MoELayer(Protocol): + def __init__( + self, + *, + n_expert_group: Optional[int], + n_experts: int, + prefix: str, + renormalize: bool, + topk: int, + topk_group: Optional[int], + weights: Weights, + gate_proj_name: str = "gate_proj", + up_proj_name: str = "up_proj", + down_proj_name: str = "down_proj", + hidden_act: str = "silu", + ): ... + + def forward( + self, x: torch.Tensor, *, gating_output: torch.Tensor + ) -> torch.Tensor: ... + + +class DenseMoELayer(nn.Module): + """ + Layer for MoE that applies *all* experts to each tokens and then weights + their outputs based on the calculated routing. This layer is much slower + than `SparseMoELayer` and should only be used when no fused kernels are + available (e.g. for unsupported quantizers). 
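# A minimal sketch of why MoELayer above is declared as a runtime_checkable Protocol
# rather than a base class: any object exposing the expected methods passes an
# isinstance() check structurally (the check verifies that the methods exist, not
# their signatures), so concrete MoE layers never need to inherit from it.
# SupportsGatedForward and ToyMoE below are purely illustrative names.
from typing import Protocol, runtime_checkable

@runtime_checkable
class SupportsGatedForward(Protocol):
    def forward(self, x, *, gating_output): ...

class ToyMoE:
    def forward(self, x, *, gating_output):
        return x

assert isinstance(ToyMoE(), SupportsGatedForward)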
+ """ + + def __init__( + self, + *, + n_expert_group: Optional[int], + n_experts: int, + prefix: str, + renormalize: bool, + topk: int, + topk_group: Optional[int], + weights: Weights, + gate_proj_name: str = "gate_proj", + up_proj_name: str = "up_proj", + down_proj_name: str = "down_proj", + hidden_act: str = "silu", + ): + super().__init__() + + log_once( + logger.info, + "No fused layers are available for this model type, using (slower) dense MoE layer", + ) + + assert (n_expert_group is None) == ( + topk_group is None + ), "n_expert_group and topk_group must both be None or have some value" + + self.n_expert_group = n_expert_group + self.n_experts = n_experts + self.renormalize = renormalize + self.topk = topk + self.topk_group = topk_group + + if "gelu" in hidden_act: + self.act = lambda x: torch.nn.functional.gelu( + x, + approximate=( + "tanh" + if hidden_act in ["gelu_fast", "gelu_pytorch_tanh"] + else "none" + ), + ) + elif "silu" in hidden_act: + self.act = torch.nn.functional.silu + else: + self.act = ACT2FN[hidden_act] + + self.gate_proj = [ + TensorParallelColumnLinear.load( + None, + prefix=f"{prefix}.{i}.{gate_proj_name}", + weights=weights, + bias=False, + ) + for i in range(self.n_experts) + ] + self.up_proj = [ + TensorParallelColumnLinear.load( + None, + prefix=f"{prefix}.{i}.{up_proj_name}", + weights=weights, + bias=False, + ) + for i in range(self.n_experts) + ] + self.down_proj = [ + TensorParallelRowLinear.load( + None, + prefix=f"{prefix}.{i}.{down_proj_name}", + weights=weights, + bias=False, + ) + for i in range(self.n_experts) + ] + + self.process_group = weights.process_group + + def forward(self, x: torch.Tensor, *, gating_output: torch.Tensor) -> torch.Tensor: + """ + x: (sequence_length, model_dim) + gating_output: (sequence_length, n_experts) + """ + # optional reshape + input_shape = x.shape + x = x.view(-1, input_shape[-1]) + + if self.n_expert_group is not None and self.topk_group is not None: + topk_weights, topk_ids = grouped_topk( + x, + gating_output, + self.topk, + renormalize=self.renormalize, + num_expert_group=self.n_expert_group, + topk_group=self.topk_group, + ) + else: + topk_weights, topk_ids = fused_topk( + x, gating_output, self.topk, self.renormalize + ) + topk_weights = topk_weights.to(x.dtype) + + weights = torch.zeros( + topk_ids.shape[0], self.n_experts, dtype=x.dtype, device=x.device + ) + + weights.scatter_(1, topk_ids.long(), topk_weights.to(weights.dtype)) + + out = torch.zeros_like(x) + for i in range(self.n_experts): + h = self.act(self.gate_proj[i](x)) * self.up_proj[i](x) + h = self.down_proj[i](h, reduce=False) + out += h * weights[:, i].view(-1, 1) + + return out + + +class SparseMoELayer(nn.Module): + """ + Layer for MoE that uses fused kernels to only apply the active experts + for each token (rather than applying all experts and selecting the + outputs of active experts). 
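# A small worked example (arbitrary scores) of the routing step in
# DenseMoELayer.forward above: take the top-k gate scores per token, renormalize
# them to sum to one, and scatter them into a dense (token, expert) weight matrix.
# fused_topk/grouped_topk do this with fused kernels; this plain-torch version is
# only illustrative.
import torch

gating_output = torch.tensor([[2.0, 0.5, 1.0, 0.1]])  # 1 token, 4 experts
scores = torch.softmax(gating_output, dim=-1)
topk_weights, topk_ids = scores.topk(2, dim=-1)
topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)  # renormalize

dense = torch.zeros(1, 4)
dense.scatter_(1, topk_ids, topk_weights)
# Only the two selected experts receive non-zero mixing weights, and they sum to 1.
assert torch.isclose(dense.sum(), torch.tensor(1.0))
assert (dense > 0).sum() == 2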
+ """ + + def __init__( + self, + *, + n_expert_group: Optional[int], + n_experts: int, + prefix: str, + renormalize: bool, + topk: int, + topk_group: Optional[int], + weights: Weights, + gate_proj_name: str = "gate_proj", + up_proj_name: str = "up_proj", + down_proj_name: str = "down_proj", + ): + super().__init__() + + if ( + isinstance(weights.loader, DefaultWeightsLoader) + and isinstance(weights.loader.weight_class, UnquantizedWeight) + ) or isinstance(weights.loader, HybridFP8UnquantLoader): + cls = UnquantizedSparseMoELayer + elif isinstance(weights.loader, GPTQMarlinWeightsLoader) and weights.loader.sym: + cls = GPTQMarlinSparseMoELayer + else: + raise ValueError( + f"Unsupported weights loader: {weights.loader}, sparse MoE is only supported for unquantized and GPTQ weights" + ) + + log_once( + logger.info, + "Using MoE layer wih fused gemm", + ) + + self.moe = cls( + n_expert_group=n_expert_group, + n_experts=n_experts, + prefix=prefix, + renormalize=renormalize, + topk=topk, + topk_group=topk_group, + weights=weights, + gate_proj_name=gate_proj_name, + up_proj_name=up_proj_name, + down_proj_name=down_proj_name, + ) + + def forward(self, x: torch.Tensor, *, gating_output: torch.Tensor) -> torch.Tensor: + return self.moe(x, gating_output=gating_output) + + @staticmethod + def is_supported(weights: Weights) -> bool: + return ( + ( + isinstance(weights.loader, DefaultWeightsLoader) + and isinstance(weights.loader.weight_class, UnquantizedWeight) + ) + or isinstance(weights.loader, HybridFP8UnquantLoader) + or ( + isinstance(weights.loader, GPTQMarlinWeightsLoader) + and can_use_marlin_moe_gemm( + quant_method=weights.loader.quant_method, + quantize=weights.loader.quantize, + sym=weights.loader.sym, + ) + ) + ) diff --git a/backends/gaudi/server/text_generation_server/layers/moe/fused_moe_rocm.py b/backends/gaudi/server/text_generation_server/layers/moe/fused_moe_rocm.py new file mode 100644 index 000000000..68accb990 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/layers/moe/fused_moe_rocm.py @@ -0,0 +1,52 @@ +# coding=utf-8 +# Copyright 2023, 2024 DeepSeek-AI and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import Tuple + +import torch +import torch.distributed + + +# TODO: Remove the functions once moe_kernel are built for ROCM +def grouped_topk( + hidden_states: torch.Tensor, + gating_output: torch.Tensor, + topk: int, + renormalize: bool, + num_expert_group: int = 0, + topk_group: int = 0, +) -> Tuple[torch.Tensor, torch.Tensor]: + scores = torch.softmax(gating_output, dim=-1) + num_token = scores.shape[0] + group_scores = ( + scores.view(num_token, num_expert_group, -1).max(dim=-1).values + ) # [n, n_group] + group_idx = torch.topk(group_scores, k=topk_group, dim=-1, sorted=False)[ + 1 + ] # [n, top_k_group] + group_mask = torch.zeros_like(group_scores) # [n, n_group] + group_mask.scatter_(1, group_idx, 1) # [n, n_group] + score_mask = ( + group_mask.unsqueeze(-1) + .expand(num_token, num_expert_group, scores.shape[-1] // num_expert_group) + .reshape(num_token, -1) + ) # [n, e] + tmp_scores = scores.masked_fill(~score_mask.bool(), 0.0) # [n, e] + topk_weights, topk_ids = torch.topk(tmp_scores, k=topk, dim=-1, sorted=False) + + if renormalize: + topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True) + + return topk_weights, topk_ids diff --git a/backends/gaudi/server/text_generation_server/layers/moe/gptq_marlin.py b/backends/gaudi/server/text_generation_server/layers/moe/gptq_marlin.py new file mode 100644 index 000000000..3217cdc22 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/layers/moe/gptq_marlin.py @@ -0,0 +1,215 @@ +from dataclasses import dataclass +from typing import List, Optional + +import torch +import torch.nn as nn + +from text_generation_server.utils.import_utils import SYSTEM +from text_generation_server.utils.weights import Weights +from text_generation_server.layers.marlin.gptq import ( + GPTQMarlinWeight, + GPTQMarlinWeightsLoader, +) + +if SYSTEM == "cuda": + from moe_kernels.fused_marlin_moe import fused_marlin_moe +else: + fused_marlin_moe = None + + +try: + major, _minor = torch.cuda.get_device_capability() + has_sm_8_0 = major >= 8 +except Exception: + has_sm_8_0 = False + + +def can_use_marlin_moe_gemm( + *, + quant_method: str, + quantize: str, + sym: bool, +): + return ( + SYSTEM == "cuda" + and fused_marlin_moe is not None + and has_sm_8_0 + and quantize == "gptq" + and quant_method == "gptq" + and sym + ) + + +@dataclass +class GPTQMarlinMoEWeight: + qweight: torch.Tensor + qzeros: torch.Tensor + scales: torch.Tensor + g_idx: torch.Tensor + perm: torch.Tensor + is_full_k: bool + + +class GPTQMarlinSparseMoELayer(nn.Module): + """ + MoE layer that uses a fused GPTQ-Marlin kernel. 
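# A worked example (toy numbers) of the grouped routing implemented by grouped_topk
# above: experts are split into groups, only the highest-scoring group(s) remain
# eligible, and the final top-k is taken over the surviving experts. Here: 4 experts
# in 2 groups, keeping topk_group=1 group and topk=2 experts. (The real function
# softmaxes the gating logits first; that step is skipped here for brevity.)
import torch

scores = torch.tensor([[0.1, 0.2, 0.4, 0.3]])                 # 1 token, 4 experts
group_scores = scores.view(1, 2, -1).max(dim=-1).values       # best expert per group
keep_group = group_scores.topk(1, dim=-1).indices             # keep a single group
group_mask = torch.zeros_like(group_scores).scatter_(1, keep_group, 1)
expert_mask = group_mask.unsqueeze(-1).expand(1, 2, 2).reshape(1, 4)
masked = scores.masked_fill(~expert_mask.bool(), 0.0)
topk_weights, topk_ids = masked.topk(2, dim=-1)
# The winning group is experts {2, 3}, so both selected experts come from it.
assert sorted(topk_ids[0].tolist()) == [2, 3]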
+ """ + + def __init__( + self, + *, + n_expert_group: Optional[int], + n_experts: int, + prefix: str, + renormalize: bool, + topk: int, + topk_group: Optional[int], + weights: Weights, + gate_proj_name: str = "gate_proj", + up_proj_name: str = "up_proj", + down_proj_name: str = "down_proj", + ): + super().__init__() + + if not ( + isinstance(weights.loader, GPTQMarlinWeightsLoader) and weights.loader.sym + ): + raise ValueError( + f"Unsupported weights loader: {weights.loader}, only GPTQMarlinWeightsLoader with symmetric quantization is supported" + ) + + assert (n_expert_group is None) == ( + topk_group is None + ), "n_expert_group and topk_group must both be None or have some value" + + self.n_expert_group = n_expert_group + self.topk = topk + self.topk_group = topk_group + self.renormalize = renormalize + + self.gate_up_proj = _load_expert_multi_weights_col( + prefix=prefix, + n_experts=n_experts, + names=[gate_proj_name, up_proj_name], + weights=weights, + ) + + self.down_proj = _load_expert_weights_row( + prefix=prefix, n_experts=n_experts, name=down_proj_name, weights=weights + ) + + self.bits = weights.loader.bits + + def forward(self, x: torch.Tensor, *, gating_output: torch.Tensor) -> torch.Tensor: + return fused_marlin_moe( + x, + w1=self.gate_up_proj.qweight, + w2=self.down_proj.qweight, + g_idx1=self.gate_up_proj.g_idx, + g_idx2=self.down_proj.g_idx, + perm1=self.gate_up_proj.perm, + perm2=self.down_proj.perm, + w1_scale=self.gate_up_proj.scales, + w2_scale=self.down_proj.scales, + is_full_k1=self.gate_up_proj.is_full_k, + is_full_k2=self.down_proj.is_full_k, + gating_output=gating_output, + topk=self.topk, + renormalize=self.renormalize, + use_grouped_topk=self.n_expert_group is not None, + num_expert_group=self.n_expert_group, + topk_group=self.topk_group, + num_bits=self.bits, + ) + + +def _load_expert_multi_weights_col( + *, + prefix: str, + n_experts: int, + names: List[str], + weights: Weights, +) -> GPTQMarlinMoEWeight: + moe_weight = None + for i in range(n_experts): + weight = weights.get_multi_weights_col( + [f"{prefix}.{i}.{name}" for name in names], 0 + ) + assert isinstance(weight, GPTQMarlinWeight) + moe_weight = _pack_weight( + n_experts=n_experts, expert=i, weight=weight, moe_weight=moe_weight + ) + assert moe_weight is not None + return moe_weight + + +def _load_expert_weights_row( + *, + prefix: str, + n_experts: int, + name: str, + weights: Weights, +) -> GPTQMarlinMoEWeight: + moe_weight = None + for i in range(n_experts): + weight = weights.get_weights_row( + f"{prefix}.{i}.{name}", + ) + assert isinstance(weight, GPTQMarlinWeight) + moe_weight = _pack_weight( + n_experts=n_experts, expert=i, weight=weight, moe_weight=moe_weight + ) + assert moe_weight is not None + return moe_weight + + +def _pack_weight( + *, + n_experts: int, + expert: int, + moe_weight: Optional[GPTQMarlinMoEWeight], + weight: GPTQMarlinWeight, +) -> GPTQMarlinMoEWeight: + if moe_weight is None: + qweight = torch.empty( + (n_experts,) + weight.qweight.shape, + dtype=weight.qweight.dtype, + device=weight.qweight.device, + ) + qzeros = torch.empty( + (n_experts,) + weight.qzeros.shape, + dtype=weight.qzeros.dtype, + device=weight.qzeros.device, + ) + scales = torch.empty( + (n_experts,) + weight.scales.shape, + dtype=weight.scales.dtype, + device=weight.scales.device, + ) + g_idx = torch.empty( + (n_experts,) + weight.g_idx.shape, + dtype=weight.g_idx.dtype, + device=weight.g_idx.device, + ) + perm = torch.empty( + (n_experts,) + weight.perm.shape, + dtype=weight.perm.dtype, + 
device=weight.perm.device, + ) + + moe_weight = GPTQMarlinMoEWeight( + qweight=qweight, + qzeros=qzeros, + scales=scales, + g_idx=g_idx, + perm=perm, + is_full_k=weight.is_full_k, + ) + + moe_weight.qweight[expert] = weight.qweight + moe_weight.qzeros[expert] = weight.qzeros + moe_weight.scales[expert] = weight.scales + moe_weight.g_idx[expert] = weight.g_idx + moe_weight.perm[expert] = weight.perm + + return moe_weight diff --git a/backends/gaudi/server/text_generation_server/layers/moe/unquantized.py b/backends/gaudi/server/text_generation_server/layers/moe/unquantized.py new file mode 100644 index 000000000..d9d62c0ef --- /dev/null +++ b/backends/gaudi/server/text_generation_server/layers/moe/unquantized.py @@ -0,0 +1,138 @@ +from typing import Optional + +import torch +import torch.nn as nn + +from text_generation_server.utils.import_utils import SYSTEM +from text_generation_server.utils.weights import UnquantizedWeight, Weights + +if SYSTEM == "rocm": + from vllm.model_executor.layers.fused_moe import fused_moe +elif SYSTEM != "ipex": + from moe_kernels.fused_moe import fused_moe + + +class UnquantizedSparseMoELayer(nn.Module): + def __init__( + self, + *, + n_expert_group: Optional[int], + n_experts: int, + prefix: str, + renormalize: bool, + topk: int, + topk_group: Optional[int], + weights: Weights, + gate_proj_name: str = "gate_proj", + up_proj_name: str = "up_proj", + down_proj_name: str = "down_proj", + ): + super().__init__() + + assert (n_expert_group is None) == ( + topk_group is None + ), "n_expert_group and topk_group must both be None or have some value" + + self.n_expert_group = n_expert_group + self.topk = topk + self.topk_group = topk_group + self.renormalize = renormalize + + self.gate_up_proj = _load_expert_multi_weights_col( + prefix=prefix, + n_experts=n_experts, + gate_proj_name=gate_proj_name, + up_proj_name=up_proj_name, + weights=weights, + ) + + self.down_proj = _load_expert_weights_row( + prefix=prefix, + n_experts=n_experts, + name=down_proj_name, + weights=weights, + ) + + def forward(self, x: torch.Tensor, *, gating_output: torch.Tensor) -> torch.Tensor: + if SYSTEM == "rocm": + return fused_moe( + x, + self.gate_up_proj, + self.down_proj, + gating_output, + self.topk, + renormalize=self.renormalize, + inplace=True, + ) + + return fused_moe( + x, + w1=self.gate_up_proj, + w2=self.down_proj, + gating_output=gating_output, + topk=self.topk, + renormalize=self.renormalize, + inplace=True, + use_grouped_topk=self.n_expert_group is not None, + num_expert_group=self.n_expert_group, + topk_group=self.topk_group, + ) + + +def _load_expert_multi_weights_col( + *, + prefix: str, + n_experts: int, + gate_proj_name: str, + up_proj_name: str, + weights: Weights, +) -> torch.Tensor: + all_weight = None + for i in range(n_experts): + weight = weights.get_multi_weights_col( + [f"{prefix}.{i}.{gate_proj_name}", f"{prefix}.{i}.{up_proj_name}"], 0 + ) + + assert isinstance(weight, UnquantizedWeight) + + if all_weight is None: + all_weight = torch.empty( + (n_experts,) + weight.weight.shape, + dtype=weight.weight.dtype, + device=weight.weight.device, + ) + + all_weight[i] = weight.weight + + assert all_weight is not None + + return all_weight + + +def _load_expert_weights_row( + *, + prefix: str, + n_experts: int, + name: str, + weights: Weights, +) -> torch.Tensor: + all_weight = None + for i in range(n_experts): + weight = weights.get_weights_row( + f"{prefix}.{i}.{name}", + ) + + assert isinstance(weight, UnquantizedWeight) + + if all_weight is None: + all_weight = 
torch.empty( + (n_experts,) + weight.weight.shape, + dtype=weight.weight.dtype, + device=weight.weight.device, + ) + + all_weight[i] = weight.weight + + assert all_weight is not None + + return all_weight diff --git a/backends/gaudi/server/text_generation_server/layers/rotary.py b/backends/gaudi/server/text_generation_server/layers/rotary.py new file mode 100644 index 000000000..a2076bb20 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/layers/rotary.py @@ -0,0 +1,548 @@ +import os +import math +import torch +from torch import nn +from text_generation_server.utils.import_utils import SYSTEM + +if SYSTEM == "cuda": + import rotary_emb +elif SYSTEM == "rocm": + from vllm._C import ops +elif SYSTEM == "ipex": + import intel_extension_for_pytorch as ipex + + +def _create_inv_freq(dim, base, device): + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, device=device, dtype=torch.float32) / dim) + ) + return inv_freq + + +def _get_rope_config(config): + if os.getenv("ROPE_SCALING", None) is not None: + rope_scaling = { + "type": os.environ["ROPE_SCALING"], + "factor": float(os.environ["ROPE_FACTOR"]), + } + return rope_scaling + return getattr(config, "rope_scaling", None) + + +class PositionRotaryEmbedding(nn.Module): + def __init__(self, inv_freq, scaling_factor): + super().__init__() + self.inv_freq = inv_freq + self._seq_len_cached = 0 + self._cos_cached = None + self._sin_cached = None + self._cos_k_cached = None + self._sin_k_cached = None + self.scaling_factor = scaling_factor + self.dynamic_args = None + + def forward( + self, + query: torch.Tensor, + key: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, + ): + # Such controlflows may add some overhead. + if SYSTEM == "cuda": + rotary_dim = cos.shape[-1] + q1 = query[..., :rotary_dim] + q2 = query[..., rotary_dim : 2 * rotary_dim] + + rotary_emb.apply_rotary(q1, q2, cos, sin, q1, q2, False) + + k1 = key[..., :rotary_dim] + k2 = key[..., rotary_dim : 2 * rotary_dim] + + rotary_emb.apply_rotary(k1, k2, cos, sin, k1, k2, False) + elif SYSTEM == "rocm": + # NOTE: On RoCm systems, we use a ROPE implementatation adapted from VLLM which launches a single kernel for both query/key, contrary to flash-attn implementation used on NVIDIA systems. + # Compiling flash-attn rotary on RoCm, it appears hipcc is unable to unroll loops, resulting in an even slower inference compared to eager: https://github.com/pytorch/pytorch/issues/113773 + + head_size = query.shape[-1] + + # Inplace operation, updating query and key. + ops.rotary_embedding(query, key, head_size, cos, sin, True) + elif SYSTEM == "ipex": + ipex.llm.functional.rotary_embedding( + query, key, sin, cos, query.size(-1), True + ) + else: + raise ValueError( + "Your system seem to be not supported. Please check your install or open an issue at https://github.com/huggingface/text-generation-inference/issues with a clear reproduction." + ) + + @classmethod + def static(cls, config, dim, base, device): + inv_freq = _create_inv_freq(dim, base, device) + scaling_factor = None + rope_scaling = _get_rope_config(config) + if rope_scaling is not None: + # `rope_type` is now standard in transformers, but some existing models + # have `type` instead. 
+ rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None)) + + if rope_type == "linear": + pass + elif rope_type == "dynamic": + scaling_factor = rope_scaling["factor"] + return DynamicPositionRotaryEmbedding( + dim=dim, + max_position_embeddings=config.max_position_embeddings, + base=base, + device=inv_freq.device, + scaling_factor=scaling_factor, + ) + elif rope_type == "llama3": + inv_freq = apply_llama3_scaling( + inv_freq, + scaling_factor=rope_scaling["factor"], + low_freq_factor=rope_scaling["low_freq_factor"], + high_freq_factor=rope_scaling["high_freq_factor"], + original_max_position_embeddings=rope_scaling[ + "original_max_position_embeddings" + ], + ) + + return cls(inv_freq, scaling_factor) + + elif rope_type == "yarn": + scaling_factor = rope_scaling["factor"] + mscale = rope_scaling.get("mscale", 1.0) + mscale_all_dim = rope_scaling.get("mscale_all_dim", 0.0) + return YarnPositionRotaryEmbedding( + dim=2 * inv_freq.shape[0], + max_position_embeddings=rope_scaling[ + "original_max_position_embeddings" + ], + base=base, + device=inv_freq.device, + scaling_factor=scaling_factor, + extrapolation_factor=1, + attn_factor=1, + beta_fast=32, + beta_slow=1, + mscale=mscale, + mscale_all_dim=mscale_all_dim, + ) + elif rope_type in ["su", "longrope"]: + short_factor = torch.tensor( + rope_scaling["short_factor"], dtype=torch.float32, device=device + ) + short_inv_freq = 1.0 / ( + short_factor + * base + ** ( + torch.arange(0, dim, 2, device=device, dtype=torch.float32) + / dim + ) + ) + long_factor = torch.tensor( + rope_scaling["long_factor"], dtype=torch.float32, device=device + ) + long_inv_freq = 1.0 / ( + long_factor + * base + ** ( + torch.arange(0, dim, 2, device=device, dtype=torch.float32) + / dim + ) + ) + + original_max_position_embeddings = ( + config.original_max_position_embeddings + ) + max_position_embeddings = config.max_position_embeddings + if max_position_embeddings <= original_max_position_embeddings: + scaling_factor = 1.0 + else: + scale = max_position_embeddings / original_max_position_embeddings + scaling_factor = math.sqrt( + 1 + math.log(scale) / math.log(original_max_position_embeddings) + ) + + # if short_mscale and long_mscale are provided we need to scale the freqs + # using the Phi3LongRoPEScaledRotaryEmbedding + if ("short_mscale" in rope_scaling) and ("long_mscale" in rope_scaling): + short_mscale = rope_scaling["short_mscale"] + long_mscale = rope_scaling["long_mscale"] + return Phi3LongRoPEScaledRotaryEmbedding( + short_inv_freq=short_inv_freq, + long_inv_freq=long_inv_freq, + max_position_embeddings=config.max_position_embeddings, + short_mscale=short_mscale, + long_mscale=long_mscale, + original_max_position_embeddings=original_max_position_embeddings, + ) + + return SuRotaryEmbedding( + short_inv_freq=short_inv_freq, + long_inv_freq=long_inv_freq, + scaling_factor=scaling_factor, + original_max_position_embeddings=original_max_position_embeddings, + ) + else: + raise NotImplementedError( + f"rope scaling type {rope_scaling['type']} is not implemented or invalid" + ) + return cls(inv_freq, scaling_factor) + + @classmethod + def load(cls, config, prefix, weights): + # XXX: Always load this in float32 ! 
+ dtype = weights.dtype + weights.dtype = torch.float32 + inv_freq = weights.get_tensor(f"{prefix}.inv_freq") + weights.dtype = dtype + + scaling_factor = None + rope_scaling = _get_rope_config(config) + if rope_scaling is not None: + scaling_factor = rope_scaling["factor"] + if rope_scaling["type"] == "linear": + pass + elif rope_scaling["type"] == "dynamic": + return DynamicPositionRotaryEmbedding( + dim=2 * inv_freq.shape[0], + max_position_embeddings=config.max_position_embeddings, + base=10000.0, + device=inv_freq.device, + scaling_factor=scaling_factor, + ) + elif rope_scaling["type"] == "yarn": + mscale = rope_scaling.get("mscale", 1.0) + mscale_all_dim = rope_scaling.get("mscale_all_dim", 0.0) + return YarnPositionRotaryEmbedding( + dim=2 * inv_freq.shape[0], + max_position_embeddings=rope_scaling[ + "original_max_position_embeddings" + ], + base=10000.0, + device=inv_freq.device, + scaling_factor=scaling_factor, + extrapolation_factor=1, + attn_factor=1, + beta_fast=32, + beta_slow=1, + mscale=mscale, + mscale_all_dim=mscale_all_dim, + ) + else: + raise NotImplementedError( + f"rope scaling type {rope_scaling['type']} is not implemented or invalid" + ) + return cls(inv_freq, scaling_factor) + + def _update_cos_sin_cache(self, dtype, device, seqlen): + # Reset the tables if the sequence length has changed, + # or if we're on a new device (possibly due to tracing for instance) + if ( + seqlen > self._seq_len_cached + or self._cos_cached.device != device + or self._cos_cached.dtype != dtype + ): + self._seq_len_cached = seqlen + t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype) + if self.scaling_factor is not None: + t /= self.scaling_factor + # Don't do einsum, it converts fp32 to fp16 + # freqs = torch.einsum("i,j->ij", t, self.inv_freq) + + freqs = torch.outer(t, self.inv_freq.to(device=t.device)) + self._cos_cached = torch.cos(freqs).to(dtype) + self._sin_cached = torch.sin(freqs).to(dtype) + + def get_cos_sin(self, position_ids: torch.Tensor, max_s: int, dtype: torch.dtype): + """ + Return cos and sin for the asked position ids + """ + if SYSTEM == "rocm": + # For RoCm, we always use float cos/sin to avoid a cast. + # For NVIDIA, for some reason, the flash-attn rotary kernel requires cos/sin and query/key to be of same dtype: https://github.com/Dao-AILab/flash-attention/blob/017716451d446e464dde9aca3a3c1ed2209caaa9/csrc/rotary/rotary.cpp#L26 + # But later on goes and cast cos/sin to float anyway: https://github.com/Dao-AILab/flash-attention/blob/017716451d446e464dde9aca3a3c1ed2209caaa9/csrc/rotary/rotary_cuda.cu#L29, which looks suboptimal. + dtype = torch.float32 + + self._update_cos_sin_cache(dtype, position_ids.device, max_s) + + cos = torch.index_select(self._cos_cached, 0, position_ids) + sin = torch.index_select(self._sin_cached, 0, position_ids) + + # Note: this unsqueeze is not necessary on RoCm + VLLM ROPE implementation, but we leave it as is to avoid yet an other controlflow. 
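# A compact sketch (toy sizes) of the cache built by _update_cos_sin_cache above:
# the rotation angle for position t and frequency index j is t * inv_freq[j],
# computed with torch.outer, and linear rope scaling simply divides the positions
# by the scaling factor before the product.
import torch

dim, base, seqlen, scaling_factor = 8, 10000.0, 4, 2.0
inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
t = torch.arange(seqlen, dtype=torch.float32) / scaling_factor
freqs = torch.outer(t, inv_freq)              # (seqlen, dim // 2)
cos, sin = torch.cos(freqs), torch.sin(freqs)
assert cos.shape == (seqlen, dim // 2)
# get_cos_sin then just gathers the cached rows for the requested position ids:
position_ids = torch.tensor([0, 2])
assert torch.equal(torch.index_select(cos, 0, position_ids), cos[position_ids])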
+ return cos.unsqueeze(1), sin.unsqueeze(1) + + +class SuRotaryEmbedding(PositionRotaryEmbedding): + def __init__( + self, + short_inv_freq, + long_inv_freq, + scaling_factor, + original_max_position_embeddings, + ): + super(PositionRotaryEmbedding, self).__init__() + self.short_inv_freq = short_inv_freq + self.long_inv_freq = long_inv_freq + self.scaling_factor = scaling_factor + self.original_max_position_embeddings = original_max_position_embeddings + self._seq_len_cached = 0 + self._cos_cached = None + self._sin_cached = None + self._cos_k_cached = None + self._sin_k_cached = None + self.dynamic_args = None + + def _update_cos_sin_cache(self, dtype, device, seqlen): + # Reset the tables if the sequence length has changed, + # or if we're on a new device (possibly due to tracing for instance) + if ( + seqlen > self._seq_len_cached + or self._cos_cached is None + or self._cos_cached.device != device + or self._cos_cached.dtype != dtype + ): + self._seq_len_cached = seqlen + + t = torch.arange(seqlen, device=device, dtype=self.short_inv_freq.dtype) + short_freqs = torch.outer( + t[: self.original_max_position_embeddings], + self.short_inv_freq.to(device=t.device), + ) + long_freqs = torch.outer( + t[self.original_max_position_embeddings :], + self.long_inv_freq.to(device=t.device), + ) + + freqs = torch.cat([short_freqs, long_freqs]) + + self._cos_cached = (torch.cos(freqs) * self.scaling_factor).to(dtype) + self._sin_cached = (torch.sin(freqs) * self.scaling_factor).to(dtype) + + +class Phi3LongRoPEScaledRotaryEmbedding(PositionRotaryEmbedding): + def __init__( + self, + short_inv_freq: torch.Tensor, + long_inv_freq: torch.Tensor, + max_position_embeddings: int, + short_mscale: float, + long_mscale: float, + original_max_position_embeddings: int, + ): + super(PositionRotaryEmbedding, self).__init__() + self.short_inv_freq = short_inv_freq + self.long_inv_freq = long_inv_freq + self.max_position_embeddings = max_position_embeddings + self.short_mscale = short_mscale + self.long_mscale = long_mscale + self.original_max_position_embeddings = original_max_position_embeddings + + # cache + self._seq_len_cached = 0 + self._cos_cached = None + self._sin_cached = None + self._cos_k_cached = None + self._sin_k_cached = None + self.dynamic_args = None + + def _update_cos_sin_cache(self, dtype, device, seqlen): + if ( + seqlen > self._seq_len_cached + or self._cos_cached is None + or self._cos_cached.device != device + or self._cos_cached.dtype != dtype + ): + self._seq_len_cached = seqlen + t = torch.arange(seqlen, device=device, dtype=self.short_inv_freq.dtype) + + short_freqs = torch.outer( + t[: self.original_max_position_embeddings], + self.short_inv_freq.to(device=t.device), + ) + + long_freqs = torch.outer( + t[self.original_max_position_embeddings :], + self.long_inv_freq.to(device=t.device), + ) + + short_freqs = short_freqs * self.short_mscale + long_freqs = long_freqs * self.long_mscale + + freqs = torch.empty((seqlen, short_freqs.shape[1]), device=device) + freqs[: self.original_max_position_embeddings] = short_freqs + freqs[self.original_max_position_embeddings :] = long_freqs + + self._cos_cached = torch.cos(freqs).to(dtype) + self._sin_cached = torch.sin(freqs).to(dtype) + + +class DynamicPositionRotaryEmbedding(PositionRotaryEmbedding): + def __init__(self, dim, max_position_embeddings, base, device, scaling_factor): + inv_freq = _create_inv_freq(dim, base, device) + super().__init__(inv_freq, scaling_factor) + self.dim = dim + self.max_position_embeddings = max_position_embeddings 
+ self.base = base + + def _update_cos_sin_cache(self, dtype, device, seqlen): + # Reset the tables if the sequence length has changed, + # or if we're on a new device (possibly due to tracing for instance) + if ( + seqlen > self._seq_len_cached + or self._cos_cached.device != device + or self._cos_cached.dtype != dtype + ): + if seqlen > self.max_position_embeddings: + newbase = self.base * ( + (self.scaling_factor * seqlen / self.max_position_embeddings) + - (self.scaling_factor - 1) + ) ** (self.dim / (self.dim - 2)) + self.inv_freq = _create_inv_freq( + self.dim, newbase, self.inv_freq.device + ) + self._seq_len_cached = seqlen + t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype) + # Don't do einsum, it converts fp32 to fp16 + # freqs = torch.einsum("i,j->ij", t, self.inv_freq) + + freqs = torch.outer(t, self.inv_freq.to(device=t.device)) + self._cos_cached = torch.cos(freqs).to(dtype) + self._sin_cached = torch.sin(freqs).to(dtype) + + +def find_correction_dim(num_rotations, dim, base=10000, max_position_embeddings=2048): + return (dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))) / ( + 2 * math.log(base) + ) + + +# Find dim range bounds based on rotations +def find_correction_range( + low_rot, high_rot, dim, base=10000, max_position_embeddings=2048 +): + low = math.floor(find_correction_dim(low_rot, dim, base, max_position_embeddings)) + high = math.ceil(find_correction_dim(high_rot, dim, base, max_position_embeddings)) + return max(low, 0), min(high, dim - 1) # Clamp values just in case + + +def linear_ramp_mask(min, max, dim): + if min == max: + max += 0.001 # Prevent singularity + + linear_func = (torch.arange(dim, dtype=torch.float32) - min) / (max - min) + ramp_func = torch.clamp(linear_func, 0, 1) + return ramp_func + + +def get_mscale(scale: float = 1.0, mscale: float = 1.0): + if scale <= 1: + return 1.0 + return 0.1 * mscale * math.log(scale) + 1.0 + + +class YarnPositionRotaryEmbedding(PositionRotaryEmbedding): + def __init__( + self, + dim, + max_position_embeddings, + base, + device, + scaling_factor, + *, + extrapolation_factor, + attn_factor, + beta_fast, + beta_slow, + mscale: float, + mscale_all_dim: float, + ): + inv_freq = _create_inv_freq(dim, base, device) + super().__init__(inv_freq, scaling_factor) + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + self.extrapolation_factor = extrapolation_factor + self.attn_factor = attn_factor + self.beta_fast = beta_fast + self.beta_slow = beta_slow + self.mscale_all_dim = mscale_all_dim + self.scaling_factor = scaling_factor + self.mscale = float( + get_mscale(self.scaling_factor, mscale) + / get_mscale(self.scaling_factor, mscale_all_dim) + * self.attn_factor + ) # Get n-d magnitude scaling corrected for interpolation + + def _update_cos_sin_cache(self, dtype, device, seqlen): + # Reset the tables if the sequence length has changed, + # or if we're on a new device (possibly due to tracing for instance) + if ( + seqlen > self._seq_len_cached + or self._cos_cached.device != device + or self._cos_cached.dtype != dtype + ): + if seqlen > self.max_position_embeddings or True: + inv_freq_extrapolation = _create_inv_freq( + self.dim, self.base, self.inv_freq.device + ) + freqs = 1.0 / inv_freq_extrapolation + inv_freq_interpolation = 1.0 / (self.scaling_factor * freqs) + low, high = find_correction_range( + self.beta_fast, + self.beta_slow, + self.dim, + self.base, + self.max_position_embeddings, + ) + + inv_freq_mask = ( + 1 - linear_ramp_mask(low, 
high, self.dim // 2).float().to(device) + ) * self.extrapolation_factor # Get n-d rotational scaling corrected for extrapolation + inv_freq = ( + inv_freq_interpolation * (1 - inv_freq_mask) + + inv_freq_extrapolation * inv_freq_mask + ) + + self.inv_freq = inv_freq + + self._seq_len_cached = seqlen + t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype) + # Don't do einsum, it converts fp32 to fp16 + # freqs = torch.einsum("i,j->ij", t, self.inv_freq) + + freqs = torch.outer(t, self.inv_freq.to(device=t.device)) + self._cos_cached = (torch.cos(freqs) * self.mscale).to(dtype) + self._sin_cached = (torch.sin(freqs) * self.mscale).to(dtype) + + +def apply_llama3_scaling( + freqs: torch.Tensor, + *, + scaling_factor: int, + low_freq_factor: int, + high_freq_factor: int, + original_max_position_embeddings: int, +): + low_freq_wavelen = original_max_position_embeddings / low_freq_factor + high_freq_wavelen = original_max_position_embeddings / high_freq_factor + new_freqs = [] + + for freq in freqs: + wavelen = 2 * math.pi / freq + + if wavelen < high_freq_wavelen: + new_freqs.append(freq) + elif wavelen > low_freq_wavelen: + new_freqs.append(freq / scaling_factor) + else: + assert low_freq_wavelen != high_freq_wavelen + smooth = (original_max_position_embeddings / wavelen - low_freq_factor) / ( + high_freq_factor - low_freq_factor + ) + new_freqs.append((1 - smooth) * freq / scaling_factor + smooth * freq) + + return torch.tensor(new_freqs, dtype=freqs.dtype, device=freqs.device) diff --git a/backends/gaudi/server/text_generation_server/layers/speculative.py b/backends/gaudi/server/text_generation_server/layers/speculative.py new file mode 100644 index 000000000..cf8469b53 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/layers/speculative.py @@ -0,0 +1,52 @@ +import torch +import json +from typing import Tuple, Optional +from text_generation_server.layers.tensor_parallel import TensorParallelHead +from text_generation_server.layers.medusa import MedusaHeadV1, MedusaHeadV2 +from text_generation_server.layers.mlp import MLPSpeculatorHead + + +class SpeculativeHead(torch.nn.Module): + def __init__(self, lm_head, speculator): + super().__init__() + self.head = lm_head + self.speculator = speculator + + @staticmethod + def load(config, prefix: str, weights): + speculator = config.speculator + if speculator: + speculator_path = config.speculator["path"] + speculator_config = str(speculator_path / "config.json") + + with open(speculator_config, "r") as f: + speculator_config = json.load(f) + + config.speculator_config = speculator_config + try: + architecture = speculator_config["architectures"][0] + + if architecture == "MLPSpeculatorPreTrainedModel": + speculator = MLPSpeculatorHead.load(config, prefix, weights) + else: + speculator = None + except KeyError: + try: + speculator = MedusaHeadV1.load(config, prefix, weights) + except Exception: + speculator = MedusaHeadV2(config, prefix, weights) + lm_head = None + else: + lm_head = TensorParallelHead.load(config, prefix, weights) + speculator = None + return SpeculativeHead(lm_head, speculator) + + def forward( + self, input: torch.Tensor + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + if self.speculator is not None: + return self.speculator(input) + + assert self.head is not None + logits = self.head(input) + return logits, None diff --git a/backends/gaudi/server/text_generation_server/layers/tensor_parallel.py b/backends/gaudi/server/text_generation_server/layers/tensor_parallel.py new file mode 100644 index 
000000000..13f12ef1e --- /dev/null +++ b/backends/gaudi/server/text_generation_server/layers/tensor_parallel.py @@ -0,0 +1,249 @@ +import torch +from torch.nn import functional as F +from typing import Iterable, List +from text_generation_server.layers.linear import get_linear, FastLinear +from text_generation_server.utils.import_utils import SYSTEM + +if SYSTEM == "ipex": + import intel_extension_for_pytorch as ipex + + +class LayerConcat(torch.nn.Module): + """ + Apply multiple layers to the input and concatenate their + outputs. + """ + + def __init__(self, layers: Iterable[torch.nn.Module], dim: int = -1): + """ + `dim` is the dimension along which layer outputs are concatenated. + """ + super().__init__() + self.layers = layers + self.dim = dim + + def forward(self, x: torch.Tensor): + outputs = [layer(x) for layer in self.layers] + return torch.cat(outputs, self.dim) + + +class SuperLayer(torch.nn.Module): + def __init__(self, linear): + super().__init__() + self.linear = linear + + def forward(self, x): + return self.linear.forward(x) + + +class TensorParallelHead(SuperLayer): + def __init__(self, linear, process_group, should_gather: bool): + super().__init__(linear) + self.process_group = process_group + self.should_gather = should_gather + + @staticmethod + def load(config, prefix: str, weights): + if config.quantize == "exl2": + try: + # If the piece and LM head embeddings are shared, we have + # non-quantized weights... + weight = weights.get_tensor(f"{prefix}.weight") + except Exception: + # ...otherwise they are quantized. + weight = weights.get_weights_col(prefix) + should_gather = weights.process_group.size() > 1 + elif weights.process_group.size() > 1: + try: + weight = weights.get_sharded(f"{prefix}.weight", dim=0) + should_gather = True + except AssertionError: + # If the vocab size is not divisible by number of shards + # just load the entire thing. 
+ weight = weights.get_tensor(f"{prefix}.weight") + should_gather = False + else: + weight = weights.get_tensor(f"{prefix}.weight") + should_gather = False + + return TensorParallelHead( + get_linear(weight, bias=None), + process_group=weights.process_group, + should_gather=should_gather, + ) + + def forward(self, input: torch.Tensor) -> torch.Tensor: + if not self.should_gather: + return super().forward(input) + + world_size = self.process_group.size() + if len(input.shape) == 2 and isinstance(self.linear, FastLinear): + out_dim = self.linear.weight.shape[0] + + if input.shape[0] == 1: + world_out = input.new_empty(1, out_dim * world_size) + local_out = input.new_empty(1, out_dim) + gather_input = local_out + else: + world_out = input.new_empty(out_dim * world_size, input.shape[0]) + gather_input = input.new_empty(out_dim, input.shape[0]) + local_out = gather_input.T + + torch.mm(input, self.linear.weight.T, out=local_out) + if SYSTEM == "ipex": + ipex.distributed.all_gather_into_tensor( + world_out, gather_input, group=self.process_group + ) + else: + torch.distributed.all_gather_into_tensor( + world_out, gather_input, group=self.process_group + ) + + if input.shape[0] == 1: + return world_out + return world_out.T + + output = super().forward(input) + world_output = [ + torch.empty_like(output) for _ in range(self.process_group.size()) + ] + if SYSTEM == "ipex": + ipex.distributed.all_gather(world_output, output, group=self.process_group) + else: + torch.distributed.all_gather(world_output, output, group=self.process_group) + world_output = torch.cat(world_output, dim=-1) + return world_output + + +class TensorParallelColumnLinear(SuperLayer): + @classmethod + def load_gate_up(cls, config, prefix: str, weights, bias: bool): + """Specific method when the QKV was joined after the fact""" + weight = weights.get_weights_col_packed_gate_up(prefix) + if bias: + raise NotImplementedError("packed_gate_up only implemented without bias") + else: + bias = None + linear = get_linear(weight, bias) + return cls(linear) + + @classmethod + def load_qkv( + cls, + config, + prefix: str, + weights, + bias: bool, + num_heads: int, + num_key_value_heads: int, + ): + """Specific method when the QKV was joined after the fact""" + weight = weights.get_weights_col_packed_qkv( + prefix, + num_heads=num_heads, + num_key_value_heads=num_key_value_heads, + ) + if bias: + raise NotImplementedError("packed_qkv only implemented for baichuan") + else: + bias = None + linear = get_linear(weight, bias) + return cls(linear) + + @classmethod + def load(cls, config, prefix: str, weights, bias: bool): + weight = weights.get_weights_col(prefix) + if bias: + bias = weights.get_sharded(f"{prefix}.bias", dim=0) + else: + bias = None + linear = get_linear(weight, bias) + return cls(linear) + + @classmethod + def load_multi(cls, config, prefixes: List[str], weights, bias: bool, dim: int): + if config.quantize == "exl2": + linears = [] + for prefix in prefixes: + weight = weights.get_weights_col(prefix) + b = weights.get_tensor(f"{prefix}.bias") if bias else None + linears.append(get_linear(weight, b)) + linear = LayerConcat(linears) + else: + weight = weights.get_multi_weights_col(prefixes, dim=dim) + if bias: + b = [weights.get_sharded(f"{p}.bias", dim=0) for p in prefixes] + bias = torch.cat(b, dim=dim) + else: + bias = None + linear = get_linear(weight, bias) + return cls(linear) + + +class TensorParallelRowLinear(SuperLayer): + def __init__(self, linear, process_group): + super().__init__(linear) + self.process_group = 
process_group + + @classmethod + def load(cls, config, prefix: str, weights, bias: bool): + weight = weights.get_weights_row(prefix) + + if bias and weights.process_group.rank() == 0: + # Rank is only on the first rank process + bias = weights.get_tensor(f"{prefix}.bias") + else: + bias = None + return cls( + get_linear(weight, bias), + process_group=weights.process_group, + ) + + def forward(self, input: torch.Tensor, reduce: bool = True) -> torch.Tensor: + out = super().forward(input) + if self.process_group.size() > 1 and reduce: + if SYSTEM == "ipex": + ipex.distributed.all_reduce(out, group=self.process_group) + else: + torch.distributed.all_reduce(out, group=self.process_group) + return out + + +class TensorParallelEmbedding(torch.nn.Module): + def __init__(self, prefix: str, weights, reduce=True): + super().__init__() + weight = weights.get_partial_sharded(f"{prefix}.weight", dim=0) + num_embeddings = weights.get_shape(f"{prefix}.weight")[0] + + process_group = weights.process_group + + world_size = process_group.size() + rank = process_group.rank() + + block_size = (num_embeddings + world_size - 1) // world_size + self.min_id = rank * block_size + self.max_id = min(num_embeddings, (rank + 1) * block_size) + self.null_idx = weight.shape[ + 0 + ] # Usually block_size, might be less in non even vocab_size. + self.process_group = weights.process_group + self.reduce = reduce + + """Additional 0 entry used for masking""" + self.weight = torch.nn.Parameter(F.pad(weight, (0, 0, 0, 1))) + + def forward(self, input: torch.Tensor) -> torch.Tensor: + # default all out of bounds values to `self.null_idx` that will then be mapped to 0 + # translate for [0, self.max_id - self.min_id[ + input = torch.where( + (self.min_id > input) | (input >= self.max_id), + self.null_idx, + input - self.min_id, + ) + out = torch.nn.functional.embedding(input, self.weight) + if self.reduce and self.process_group.size() > 1: + if SYSTEM == "ipex": + ipex.distributed.all_reduce(out, group=self.process_group) + else: + torch.distributed.all_reduce(out, group=self.process_group) + return out diff --git a/backends/gaudi/server/text_generation_server/models/__init__.py b/backends/gaudi/server/text_generation_server/models/__init__.py new file mode 100644 index 000000000..651b71ecf --- /dev/null +++ b/backends/gaudi/server/text_generation_server/models/__init__.py @@ -0,0 +1,326 @@ +import torch +import os + +from loguru import logger +from transformers.configuration_utils import PretrainedConfig +from transformers.models.auto import modeling_auto +from huggingface_hub import hf_hub_download, HfApi +from typing import Optional +from pathlib import Path +from typing import List, Dict + +# Needed to properly setup habana_frameworks + +from text_generation_server.utils.speculate import get_speculate, set_speculate +from text_generation_server.models.model import Model +from text_generation_server.models.causal_lm import CausalLM +from text_generation_server.models.bloom import BLOOM +from text_generation_server.models.starcoder import StarCoder +from text_generation_server.models.vlm_causal_lm import VlmCausalLM + +# from text_generation_server.models.mllama_causal_lm import MllamaCausalLM +from text_generation_server.models.custom_modeling.llava_next import ( + LlavaNextForConditionalGeneration, +) + +# from text_generation_server.models.mllama_causal_lm import MllamaCausalLMBatch +# from text_generation_server.models.custom_modeling.mllama import ( +# MllamaForConditionalGeneration, +# ) +from 
text_generation_server.utils.adapter import ( + AdapterParameters, + build_layer_weight_lookup, + load_and_merge_adapters, + AdapterInfo, +) +from text_generation_server.adapters.lora import LoraWeights + + +from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi + + +# Disable gradients +torch.set_grad_enabled(False) + + +def get_model( + model_id: str, + lora_adapter_ids: Optional[List[str]], + revision: Optional[str], + sharded: bool, + quantize: Optional[str], + speculate: Optional[int], + dtype: Optional[torch.dtype], + trust_remote_code: bool, + max_input_tokens: int, +) -> Model: + adapt_transformers_to_gaudi() + + if speculate is not None: + set_speculate(speculate) + else: + set_speculate(0) + + config_dict, _ = PretrainedConfig.get_config_dict( + model_id, revision=revision, trust_remote_code=trust_remote_code + ) + model_type = config_dict.get("model_type", None) + + speculator = None + if "medusa_num_heads" in config_dict: + medusa_model_id = model_id + medusa_revision = revision + model_id = config_dict["base_model_name_or_path"] + revision = "main" + speculate_medusa = config_dict["medusa_num_heads"] + if speculate is not None: + if speculate > speculate_medusa: + raise RuntimeError( + f"Speculate is set to `{speculate}` but this medusa models only has `{speculate_medusa}` heads, please make them match" + ) + else: + set_speculate(speculate) + else: + set_speculate(speculate_medusa) + + config_dict, _ = PretrainedConfig.get_config_dict( + model_id, revision=revision, trust_remote_code=trust_remote_code + ) + # Reload model type from parent. + model_type = config_dict.get("model_type", None) + is_local = Path(medusa_model_id).exists() + if not is_local: + medusa_config = hf_hub_download( + medusa_model_id, revision=medusa_revision, filename="config.json" + ) + hf_hub_download( + medusa_model_id, + revision=medusa_revision, + filename="medusa_lm_head.safetensors", + ) + speculator = { + "path": Path(medusa_config).parent, + "model_paths": ["medusa_lm_head.safetensors"], + } + else: + speculator = { + "path": Path(medusa_model_id), + "model_paths": ["medusa_lm_head.safetensors"], + } + + method = "medusa" + elif model_type == "mlp_speculator": + mlp_model_id = model_id + mlp_revision = revision + model_id = config_dict["base_model_name_or_path"] + revision = "main" + speculate_mlp = config_dict["n_predict"] + if speculate is not None: + if speculate > speculate_mlp: + raise RuntimeError( + f"Speculate is set to `{speculate}` but this mlp_speculator models only has `{speculate_mlp}` heads, please make them match" + ) + else: + set_speculate(speculate) + else: + set_speculate(speculate_mlp) + + config_dict, _ = PretrainedConfig.get_config_dict( + model_id, revision=revision, trust_remote_code=trust_remote_code + ) + # Reload model type from parent. 
+ model_type = config_dict.get("model_type", None) + is_local = Path(mlp_model_id).exists() + extension = ".safetensors" + if not is_local: + mlp_speculator_config = hf_hub_download( + mlp_model_id, revision=mlp_revision, filename="config.json" + ) + api = HfApi() + info = api.model_info(mlp_model_id, revision=mlp_revision) + filenames = [ + s.rfilename + for s in info.siblings + if s.rfilename.endswith(extension) + and len(s.rfilename.split("/")) == 1 + and "arguments" not in s.rfilename + and "args" not in s.rfilename + and "training" not in s.rfilename + ] + for filename in filenames: + hf_hub_download( + mlp_model_id, + revision=mlp_revision, + filename=filename, + ) + speculator_dir_path = Path(mlp_speculator_config).parent + # if these are downloaded, they get converted to safetensors + filenames.extend( + [p for p in os.listdir(speculator_dir_path) if p.endswith(extension)] + ) + speculator = { + "path": Path(mlp_speculator_config).parent, + "model_paths": filenames, + } + else: + speculator = Path(mlp_model_id) + filenames = [p for p in os.listdir(speculator) if p.endswith(extension)] + speculator = {"path": speculator, "model_paths": filenames} + method = "mlp_speculator" + else: + method = "n-gram" + + speculate = get_speculate() + if speculate > 0: + logger.info(f"Using speculation {method} with {speculate} input ids.") + + model_type = config_dict["model_type"] + + if model_type == "gpt_bigcode": + return StarCoder(model_id=model_id, revision=revision, dtype=dtype) + + if model_type == "bloom": + return BLOOM( + model_id=model_id, + revision=revision, + speculator=speculator, + dtype=dtype, + trust_remote_code=trust_remote_code, + ) + + if model_type == "llava_next": + return VlmCausalLM( + model_class=LlavaNextForConditionalGeneration, + model_id=model_id, + revision=revision, + quantize=None, + speculator=speculator, + dtype=dtype, + trust_remote_code=trust_remote_code, + ) + + if model_type in modeling_auto.MODEL_FOR_CAUSAL_LM_MAPPING_NAMES: + return CausalLM( + model_id, + revision, + quantize=quantize, + speculator=speculator, + dtype=dtype, + trust_remote_code=trust_remote_code, + ) + + raise ValueError(f"Unsupported model type {model_type}") + + +# get_model_with_lora_adapters wraps the internal get_model function and adds support for loading adapters +# this provides a post model loading hook to load adapters into the model after the model has been loaded +def get_model_with_lora_adapters( + model_id: str, + lora_adapters: Optional[List[AdapterInfo]], + revision: Optional[str], + sharded: bool, + quantize: Optional[str], + speculate: Optional[int], + dtype: Optional[torch.dtype], + trust_remote_code: bool, + max_input_tokens: int, + adapter_to_index: Dict[str, int], +): + lora_adapter_ids = [adapter.id for adapter in lora_adapters] + model = get_model( + model_id, + lora_adapter_ids, + revision, + sharded, + quantize, + speculate, + dtype, + trust_remote_code, + max_input_tokens, + ) + + if len(lora_adapters) > 0: + target_to_layer = build_layer_weight_lookup(model.model) + + for index, adapter in enumerate(lora_adapters): + # The AdapterParameters object allows for merging multiple adapters into a single adapter. + # At the moment, we only support loading a single adapter into the model, but we keep the + # AdapterParameters object for easier extension in the future. 
+ adapter_parameters = AdapterParameters( + adapter_info=[adapter], + # when merging multiple adapters we can weight them differently + # if this is not set, all adapters will be weighted equally + # see: text_generation_server.utils.merges.strategies for impl + weights=None, + merge_strategy=0, + density=1.0, + majority_sign_method=0, + ) + + adapter_index = index + 1 + adapter_to_index[adapter.id] = adapter_index + + logger.info( + f"Loading adapter weights into model: {','.join([adapter.id for adapter in adapter_parameters.adapter_info])}" + ) + weight_names = tuple([v[0] for v in target_to_layer.values()]) + ( + module_map, + adapter_config, + adapter_weight_names, + adapter_tokenizer, + ) = load_and_merge_adapters( + model.model_id, + adapter_parameters, + adapter_index, + weight_names, + False, + ) + + unused_weight_names = adapter_weight_names.copy() + + adapter_layers = [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "up_proj", + "down_proj", + "qkv_proj", + ] + + for layer_name in adapter_layers: + nlayers = ( + 1 if layer_name == "lm_head" else len(model.model.model.layers) + ) + adapter_weights = LoraWeights.prepare_weights( + config=adapter_config, + module_map=module_map, + layer_type=layer_name, + unused_weight_names=unused_weight_names, + nlayers=nlayers, + dtype=model.dtype, + world_size=model.world_size, + process_group=model.process_group, + target_to_layer=target_to_layer, + ) + + if adapter_weights is None: + continue + + model.layer_to_adapter_weights[layer_name].add_adapter( + adapter_index, adapter_weights + ) + + if len(unused_weight_names) > 0: + logger.warning( + f"{','.join([a.id for a in lora_adapters])} unused adapter weights: {unused_weight_names}" + ) + + if adapter_tokenizer is not None: + model.tokenizers.add_tokenizer(adapter_index, adapter_tokenizer) + + model.loaded_adapters.add(adapter_index) + + return model diff --git a/backends/gaudi/server/text_generation_server/models/bloom.py b/backends/gaudi/server/text_generation_server/models/bloom.py new file mode 100644 index 000000000..6fe643748 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/models/bloom.py @@ -0,0 +1,52 @@ +# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company. 
+ +import torch + +from typing import Optional, Type + +from transformers import PreTrainedTokenizerBase + +from text_generation_server.models import CausalLM +from text_generation_server.models.causal_lm import CausalLMBatch +from text_generation_server.pb import generate_pb2 + + +class BloomCausalLMBatch(CausalLMBatch): + @classmethod + def from_pb( + cls, + pb: generate_pb2.Batch, + tokenizer: PreTrainedTokenizerBase, + dtype: torch.dtype, + device: torch.device, + ) -> "CausalLMBatch": + batch = super().from_pb( + pb=pb, + tokenizer=tokenizer, + dtype=dtype, + device=device, + ) + batch.keys_head_dim_last = False + return batch + + +class BLOOM(CausalLM): + def __init__( + self, + model_id: str, + revision: Optional[str] = None, + speculator: Optional[str] = None, + dtype: Optional[torch.dtype] = None, + trust_remote_code: bool = False, + ): + super(BLOOM, self).__init__( + model_id=model_id, + revision=revision, + speculator=speculator, + dtype=dtype, + trust_remote_code=trust_remote_code, + ) + + @property + def batch_type(self) -> Type[CausalLMBatch]: + return BloomCausalLMBatch diff --git a/backends/gaudi/server/text_generation_server/models/causal_lm.py b/backends/gaudi/server/text_generation_server/models/causal_lm.py new file mode 100644 index 000000000..8fda0517e --- /dev/null +++ b/backends/gaudi/server/text_generation_server/models/causal_lm.py @@ -0,0 +1,1421 @@ +# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company. + +import bisect +from dataclasses import dataclass +from functools import wraps +import itertools +import math +import os +import tempfile +import time +import copy +from typing import Dict, List, Optional, Tuple, Type + +import torch +import torch._dynamo +from loguru import logger +from opentelemetry import trace + +import text_generation_server.habana_quantization_env as hq_env +import habana_frameworks.torch as htorch +from optimum.habana.utils import HabanaProfile +from optimum.habana.transformers.generation import MODELS_OPTIMIZED_WITH_STATIC_SHAPES +from text_generation_server.utils.chunks import concat_text_chunks +from optimum.habana.checkpoint_utils import ( + get_repo_root, + model_on_meta, + write_checkpoints_json, +) +from transformers import ( + AutoTokenizer, + AutoModelForCausalLM, + PreTrainedTokenizerBase, + AutoConfig, +) + +from text_generation_server.utils.tokens import batch_top_tokens +from text_generation_server.models import Model +from text_generation_server.models.types import ( + Batch, + Tokens, + Generation, + GeneratedText, +) +from text_generation_server.pb import generate_pb2 +from text_generation_server.utils import ( + HeterogeneousNextTokenChooser, + StoppingCriteria, + is_tokenizer_transparent, + pad_next_token_chooser_parameters, +) +from optimum.habana.utils import get_hpu_memory_stats +from text_generation_server.utils.debug import dbg_trace +from text_generation_server.utils.speculate import get_speculate + +tracer = trace.get_tracer(__name__) +MAX_TOTAL_TOKENS = int(os.environ.get("MAX_TOTAL_TOKENS", 2048)) +PAD_SEQUENCE_TO_MULTIPLE_OF = int(os.environ.get("PAD_SEQUENCE_TO_MULTIPLE_OF", 256)) +CHUNK_SIZES = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048] +LAZY_MODE = int(os.environ.get("PT_HPU_LAZY_MODE", 1)) +BATCH_BUCKET_SIZE = int(os.environ.get("BATCH_BUCKET_SIZE", 8)) +PREFILL_BATCH_BUCKET_SIZE = int(os.environ.get("PREFILL_BATCH_BUCKET_SIZE", 2)) +MAX_BATCH_SIZE = ( + int(os.environ.get("MAX_BATCH_SIZE")) + if os.environ.get("MAX_BATCH_SIZE") is not None + else None +) + + +def torch_compile_for_eager(func): 
+ if LAZY_MODE == 1: + return func + return torch.compile( + func, backend="hpu_backend", options={"keep_input_mutations": True} + ) + + +def round_up(number, k): + return (number + k - 1) // k * k + + +def to_tensor_indices(indices, device): + return torch.tensor(indices, dtype=torch.long, device=device) + + +def calculate_chunks(offset): + result = [] + while offset != 0: + sign = 1 if offset > 0 else -1 + best_chunk = min((abs(offset - sign * c), sign * c) for c in CHUNK_SIZES)[1] + result.append(best_chunk) + offset = offset - best_chunk + return result + + +def biggest_single_chunk(offset): + if offset != 0: + idx = bisect.bisect(CHUNK_SIZES, abs(offset)) + return int(math.copysign(CHUNK_SIZES[idx - 1], offset)) + else: + return 0 + + +@torch_compile_for_eager +def grouped_pad(tensor_groups, dims, values): + grouped_result = [] + for tensors, dim, value in zip(tensor_groups, dims, values): + padding = MAX_TOTAL_TOKENS - tensors[0].size(dim) if dim is not None else 0 + if padding > 0: + assert dim in [-1, -2], f"Only dims -1 and -2 are supported! {dim}" + pad_shape = (0, 0, 0, padding) if dim == -2 else (0, padding) + result = [ + torch.nn.functional.pad(t, pad_shape, value=value) for t in tensors + ] + else: + result = [t for t in tensors] + grouped_result.append(result) + htorch.core.mark_step() + return grouped_result + + +@torch_compile_for_eager +def roll(tensor, chunk, dim, merge_graphs): + if dim is None: + return tensor + tensor = torch.roll(tensor, chunk, dim) + if not merge_graphs: + htorch.core.mark_step() + return tensor + + +def grouped_roll(tensor_groups, chunk, dims, merge_graphs): + tensor_groups = [ + [roll(t, chunk, dim, merge_graphs) for t in tensors] + for tensors, dim in zip(tensor_groups, dims) + ] + if merge_graphs: + htorch.core.mark_step() + return tensor_groups + + +@torch_compile_for_eager +def grouped_shift(tensor_groups, dims, offset, merge_graphs): + chunks = calculate_chunks(offset) + for c in chunks: + tensor_groups = grouped_roll(tensor_groups, c, dims, merge_graphs) + return tensor_groups + + +def move(dst_tensors, dst_indices, src_tensors): + bs_dim = 0 + num_indices = dst_indices.size(0) + for i, (dst_t, src_t) in enumerate(zip(dst_tensors, src_tensors)): + if src_t.size(bs_dim) != num_indices: + src_t = torch.narrow(src_t, bs_dim, 0, num_indices) + dst_t.index_copy_(bs_dim, dst_indices, src_t) + htorch.core.mark_step() + + +def grouped_move(dst_tensor_groups, dst_indices, src_tensor_groups): + for dst_tensors, src_tensors in zip(dst_tensor_groups, src_tensor_groups): + move(dst_tensors, dst_indices, src_tensors) + + +@torch_compile_for_eager +def extend_tensor(tensor, padding, dim): + result = torch.cat([tensor, padding], dim=dim) + htorch.core.mark_step() + return result + + +@torch_compile_for_eager +def extend_batch(tensors, target_bs, dim): + diff = target_bs - tensors[0].size(dim) + # TODO: add support for shrinking bs + if diff <= 0: + return tensors + shape = list(tensors[0].shape) + shape[dim] = diff + padding = torch.empty(shape, device=tensors[0].device, dtype=tensors[0].dtype) + tensors = [extend_tensor(t, padding, dim) for t in tensors] + return tensors + + +def grouped_extend_batch(tensor_groups, target_bs, bs_dims): + tensor_groups = [ + extend_batch(tensors, target_bs, dim) + for tensors, dim in zip(tensor_groups, bs_dims) + ] + return tensor_groups + + +@torch_compile_for_eager +def merge(tensor_group): + tensor_group = [torch.stack(tensor_group)] + htorch.core.mark_step() + return tensor_group + + +@torch_compile_for_eager +def 
split(tensor_group, clone_data): + tensor_group = [t.squeeze(0) for t in torch.split(tensor_group[0], 1)] + if clone_data: + tensor_group = [t.clone() for t in tensor_group] + htorch.core.mark_step() + return tensor_group + + +def remove_kv_cache_from_output(module): + orig_fwd = module.forward + + @wraps(orig_fwd) + def forward(*args, **kwargs): + if kwargs["past_key_values"] is not None: + kwargs["return_dict"] = False + output = orig_fwd(*args, **kwargs) + first_value, second_value, *_ = output + if first_value.nelement() < 2: + return second_value + else: + return first_value + else: + kwargs["return_dict"] = True + return orig_fwd(*args, **kwargs) + + module.forward = forward + return module + + +@dataclass +class CausalLMRequest: + idx: int + data: generate_pb2.Request + input_length: int + prefix_offset: int + read_offset: int + stopping_criteria: StoppingCriteria + + all_input_ids: torch.Tensor + + @classmethod + def from_pb( + cls, idx: int, data: generate_pb2.Request, tokenizer: PreTrainedTokenizerBase + ): + return cls( + idx=idx, + data=data, + input_length=None, + prefix_offset=None, + read_offset=None, + stopping_criteria=StoppingCriteria.from_pb( + data.stopping_parameters, tokenizer + ), + all_input_ids=None, + ) + + def update_idx(self, new_idx): + prev = self.idx + self.idx = new_idx + return (new_idx, prev) + + +@dataclass +class CausalLMBatch(Batch): + batch_id: int + requests: List[CausalLMRequest] + + # Decoder values + input_ids: torch.Tensor + attention_mask: torch.Tensor + position_ids: torch.Tensor + past_key_values: Optional[List[Tuple]] + merged_kv_cache: bool + + # Lengths of all generations present in the batch + input_length: int + + # Generation helpers + next_token_chooser: HeterogeneousNextTokenChooser + top_n_tokens: List[int] + top_n_tokens_tensor: torch.Tensor + + input_length: int + + # Past metadata + logits = None + past = None + + keys_head_dim_last: bool = True + + def to_pb(self) -> generate_pb2.CachedBatch: + return generate_pb2.CachedBatch( + id=self.batch_id, + request_ids=[r.data.id for r in self.requests], + size=len(self), + max_tokens=self.max_tokens, + ) + + def detach_kv_cache(self): + past_keys = [past[0] for past in self.past_key_values] + past_values = [past[1] for past in self.past_key_values] + del self.past_key_values + return past_keys, past_values + + def attach_kv_cache(self, past_keys, past_values): + # TODO: Add support for models that don't store kv_cache in a list + self.past_key_values = list(zip(past_keys, past_values)) + + def merge_kv_cache_if_needed(self, target_bs, offset): + pad_needed = self.seq_length < MAX_TOTAL_TOKENS + shift_needed = offset != 0 + expand_needed = target_bs > self.batch_size + # Very simple heuristic to determine whether we should merge tensors + # this needs tuning for other models/scenarios + small_bs = len(self.past_key_values) > self.batch_size + if ( + not self.merged_kv_cache + and small_bs + and (pad_needed or shift_needed or expand_needed) + ): + past_keys, past_values = self.detach_kv_cache() + past_keys = merge(past_keys) + past_values = merge(past_values) + self.attach_kv_cache(past_keys, past_values) + self.merged_kv_cache = True + + def split_kv_cache_if_needed(self, clone_data): + if self.merged_kv_cache: + past_keys, past_values = self.detach_kv_cache() + past_keys = split(past_keys, clone_data) + past_values = split(past_values, clone_data) + self.attach_kv_cache(past_keys, past_values) + self.merged_kv_cache = False + + def get_tensor_groups(self): + past_keys, past_values = 
self.detach_kv_cache() + seq_dim = -1 + key_dim = -2 if self.keys_head_dim_last else -1 + value_dim = -2 + tensors = [ + [self.input_ids], + [self.attention_mask], + [self.position_ids], + past_keys, + past_values, + ] + # We don't need to align position_ids + seq_dims = [seq_dim, seq_dim, None, key_dim, value_dim] + bs_dims = [0, 0, 0] + ([1, 1] if self.merged_kv_cache else [0, 0]) + return tensors, seq_dims, bs_dims + + def set_tensor_groups(self, tensors): + self.input_ids = tensors.pop(0)[0] + self.attention_mask = tensors.pop(0)[0] + self.position_ids = tensors.pop(0)[0] + past_keys = tensors.pop(0) + past_values = tensors.pop(0) + self.attach_kv_cache(past_keys, past_values) + + def realign(self, target_bs, offset, pad_token_id): + tensors, seq_dims, _ = self.get_tensor_groups() + tensors = grouped_pad(tensors, seq_dims, [pad_token_id, 0, 0, 0, 0]) + tensors = grouped_shift(tensors, seq_dims, offset, self.merged_kv_cache) + self.set_tensor_groups(tensors) + + def expand_bs(self, target_bs): + tensors, _, bs_dims = self.get_tensor_groups() + tensors = grouped_extend_batch(tensors, target_bs, bs_dims) + self.set_tensor_groups(tensors) + + def used_indices(self): + return [req.idx for req in self.requests] + + def update_indices(self, new_indices): + for req, new_idx in zip(self.requests, new_indices): + req.idx = new_idx + return self.used_indices() + + def free_indices_generator(self): + used = set(req.idx for req in self.requests) + return (i for i in range(self.batch_size) if i not in used) + + def move_data(self, src_batches): + dst_tensors, _, dst_dims = self.get_tensor_groups() + free_indices_gen = self.free_indices_generator() + for src_b in src_batches: + dst_indices = to_tensor_indices( + src_b.update_indices(free_indices_gen), self.input_ids.device + ) + src_tensors, _, src_dims = src_b.get_tensor_groups() + grouped_move(dst_tensors, dst_indices, src_tensors) + self.set_tensor_groups(dst_tensors) + + @classmethod + def recombine( + cls, batches: List["CausalLMBatch"], pad_token_id: int + ) -> "CausalLMBatch": + if not all(b.past_key_values is not None for b in batches): + raise ValueError("KV cache not allocated! Cannot recombine before prefill!") + + total_requests = sum(len(b) for b in batches) + new_bs = total_requests + new_bs = round_up(total_requests, BATCH_BUCKET_SIZE) + + batch_id = batches[0].batch_id + device = batches[0].input_ids.device + + input_lengths = [b.input_length for b in batches] + max_input_length = max(input_lengths) + offsets = [max_input_length - b.input_length for b in batches] + + cur_padding = [b.right_padding for b in batches] + # For prefill there is a space allocated only for first token + # Need to add padding to the max total tokens before first decode + + moves_needed = [ + total_requests - len(b) if b.batch_size == new_bs else total_requests + for b in batches + ] + dst_batch_idx = min(enumerate(moves_needed), key=lambda idx_val: idx_val[1])[0] + reshape = batches[dst_batch_idx].batch_size < new_bs + + # TODO: Add support for changing max seq len, i.e. 
due to output length bucketing + # FIXME: max_seq_len for non optimized code + if len(batches) > 1: + scenario = "CONCAT" + elif reshape: + scenario = "RESHAPE" + elif cur_padding[dst_batch_idx] <= 0: + scenario = "SHIFT" + offsets = [ + biggest_single_chunk(b.max_input_length - max_input_length) + for b in batches + ] + max_input_length = max_input_length + offsets[dst_batch_idx] + else: + # Nothing to do + return batches[0] + + dbg_trace( + scenario, + f"bs:{[b.batch_size for b in batches]}->{new_bs}" + f" reqs:{[len(b) for b in batches]}" + f" offsets:{offsets}" + f" input_lengths:{input_lengths}" + f" cur_padding:{cur_padding}" + f" dst_batch:{dst_batch_idx}", + ) + + grouped_requests = [[req for req in batch.requests] for batch in batches] + flat_requests = list(itertools.chain(*grouped_requests)) + + for i in range(len(batches)): + target_bs = new_bs if i == dst_batch_idx else batches[i].batch_size + batches[i].merge_kv_cache_if_needed(target_bs, offsets[i]) + batches[i].realign(target_bs, offsets[i], pad_token_id) + batches[i].split_kv_cache_if_needed(i == dst_batch_idx) + batches[dst_batch_idx].expand_bs(new_bs) + batches[dst_batch_idx].move_data( + [batches[i] for i in range(len(batches)) if i != dst_batch_idx] + ) + + top_n_tokens = [r.data.top_n_tokens for r in flat_requests] + top_n_tokens.extend([-1] * (new_bs - total_requests)) + top_n_tokens_tensor = torch.tensor( + top_n_tokens, device=device, dtype=torch.int64 + ) + + parameters = [r.data.parameters for r in flat_requests] + # append the dummy parameters for dummy requests + batch_size = batches[dst_batch_idx].batch_size + parameters = pad_next_token_chooser_parameters(parameters, batch_size) + + # update past grammar states + fsm_grammar_states = [0] * batch_size + for batch in batches: + for i, req in enumerate(batch.requests): + fsm_grammar_states[req.idx] = ( + batch.next_token_chooser.fsm_grammar_states[i] + ) + + next_token_chooser = HeterogeneousNextTokenChooser.from_pb( + parameters, + batches[dst_batch_idx].next_token_chooser.dtype, + batches[dst_batch_idx].next_token_chooser.device, + batches[dst_batch_idx].next_token_chooser.tokenizer, + fsm_grammar_states, + quantization_enabled=hq_env.is_quantization_enabled, + ) + + input_ids = batches[dst_batch_idx].input_ids + attention_mask = batches[dst_batch_idx].attention_mask + position_ids = batches[dst_batch_idx].position_ids + past_key_values = batches[dst_batch_idx].past_key_values + input_length = max_input_length + + htorch.core.mark_step() + + return cls( + batch_id=batch_id, + requests=flat_requests, + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + merged_kv_cache=False, + next_token_chooser=next_token_chooser, + top_n_tokens=top_n_tokens, + top_n_tokens_tensor=top_n_tokens_tensor, + input_length=input_length, + ) + + @classmethod + def from_pb( + cls, + pb: generate_pb2.Batch, + tokenizer: PreTrainedTokenizerBase, + dtype: torch.dtype, + device: torch.device, + ) -> "CausalLMBatch": + dbg_trace("FROM_PB", f"num_reqs:{len(pb.requests)}") + requests = [ + CausalLMRequest.from_pb(idx, req, tokenizer) + for idx, req in enumerate(pb.requests) + ] + inputs = [] + top_n_tokens = [] + + # Parse batch + max_truncation = 0 + for i, r in enumerate(pb.requests): + inputs.append(concat_text_chunks(r.input_chunks.chunks)) + top_n_tokens.append(r.top_n_tokens) + max_truncation = max(max_truncation, r.truncate) + + max_input_length = max_truncation + if max_input_length < PAD_SEQUENCE_TO_MULTIPLE_OF: + 
max_input_length = PAD_SEQUENCE_TO_MULTIPLE_OF + max_new_tokens = max(r.stopping_criteria.max_new_tokens for r in requests) + + # TODO: by tokenizing all inputs at once we loose information on actual input lengths + # this means that we cannot shift inputs to the left after a long input sequence + # was filtered out + new_bs = round_up(len(requests), PREFILL_BATCH_BUCKET_SIZE) + missing_inputs = new_bs - len(inputs) + dummy_inputs = ["?"] * missing_inputs + parameters = [r.parameters for r in pb.requests] + # append the dummy parameters for dummy request + parameters = pad_next_token_chooser_parameters(parameters, new_bs) + + next_token_chooser = HeterogeneousNextTokenChooser.from_pb( + pb=parameters, + dtype=dtype, + device=device, + tokenizer=tokenizer, + quantization_enabled=hq_env.is_quantization_enabled, + ) + + tokenized_inputs = tokenizer( + inputs + dummy_inputs, + return_tensors="pt", + padding="longest", + return_token_type_ids=False, + truncation=True, + max_length=max_truncation, + ) + + input_len = tokenized_inputs["input_ids"].shape[1] + # Round up sequence length + bucket_size = max_input_length + left_padding = max_input_length - input_len + if input_len < max_input_length and PAD_SEQUENCE_TO_MULTIPLE_OF != 0: + assert ( + PAD_SEQUENCE_TO_MULTIPLE_OF <= max_input_length + ), "PAD_SEQUENCE_TO_MULTIPLE_OF cannot be higher than max_input_length" + rounded_seq_len = round_up(input_len + 1, PAD_SEQUENCE_TO_MULTIPLE_OF) + if rounded_seq_len <= max_input_length: + bucket_size = rounded_seq_len - 1 + else: + bucket_size = max_input_length - 1 + left_padding = bucket_size - input_len + + input_ids = tokenized_inputs["input_ids"] + attention_mask = tokenized_inputs["attention_mask"] + + # Allocate space for first token + input_ids = torch.nn.functional.pad( + input_ids, (left_padding, 1), value=tokenizer.pad_token_id + ) + attention_mask = torch.nn.functional.pad( + attention_mask, (left_padding, 1), value=0 + ) + all_input_ids = torch.nn.functional.pad( + input_ids, (0, max_new_tokens), value=tokenizer.pad_token_id + ).T.split(1, dim=1) + input_len = bucket_size + for r in requests: + r.input_length = input_len + r.prefix_offset = input_len - 5 + r.read_offset = input_len + r.all_input_ids = all_input_ids[r.idx] + + input_ids = input_ids.to(device) + attention_mask = attention_mask.to(device) + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + + old_bs = len(requests) + top_n_tokens.extend([-1] * (new_bs - old_bs)) + top_n_tokens_tensor = torch.tensor( + top_n_tokens, device=device, dtype=torch.int64 + ) + htorch.core.mark_step() + return cls( + batch_id=pb.id, + requests=requests, + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=None, + merged_kv_cache=False, + next_token_chooser=next_token_chooser, + top_n_tokens=top_n_tokens, + top_n_tokens_tensor=top_n_tokens_tensor, + input_length=input_len, + ) + + @tracer.start_as_current_span("filter") + def filter(self, request_ids: List[int]) -> Optional["CausalLMBatch"]: + dbg_trace("FILTER", f"num_reqs:{len(self.requests)} -> {len(request_ids)}") + request_ids = set(request_ids) + self.requests = [req for req in self.requests if req.data.id in request_ids] + return self + + @classmethod + @tracer.start_as_current_span("concatenate") + def concatenate( + cls, batches: List["CausalLMBatch"], pad_token_id: int = 0 + ) -> "CausalLMBatch": + return cls.recombine(batches, pad_token_id) + + def __len__(self): + return len(self.requests) 
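
The prefill path above pads both the request count and the tokenized length up to fixed buckets so that identically shaped graphs can be reused on HPU. A minimal sketch of that rounding, using illustrative bucket sizes rather than the environment-driven defaults:

def round_up(number: int, k: int) -> int:
    # Round `number` up to the next multiple of the bucket size `k`.
    return (number + k - 1) // k * k

# 5 requests are padded with one dummy request to reach a batch bucket of 6,
# and a 300-token prompt (plus the slot reserved for the first generated token)
# is left-padded up to the 512-token sequence bucket.
assert round_up(5, 2) == 6
assert round_up(300 + 1, 256) == 512
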
+ + @property + def max_input_length(self): + return max(req.input_length for req in self.requests) + + @property + def batch_size(self): + return self.attention_mask.size(0) + + @property + def seq_length(self): + return self.attention_mask.size(1) + + @property + def right_padding(self): + return self.seq_length - self.input_length + + # Maximum number of tokens this batch will grow to + @property + def max_tokens(self): + max_total_tokens = self.attention_mask.size(1) + return len(self.requests) * max_total_tokens + + +class CausalLM(Model): + def __init__( + self, + model_id: str, + model_class: Optional[Type[torch.nn.Module]] = None, + revision: Optional[str] = None, + quantize: Optional[str] = None, + speculator: Optional[str] = None, + dtype: Optional[torch.dtype] = None, + default_dtype=torch.float16, + trust_remote_code: bool = False, + tokenizer_class=AutoTokenizer, + config_class=AutoConfig, + batch_class=CausalLMBatch, + ): + if speculator: + raise RuntimeError("Speculator decoding is not enabled for AutoModel") + + self.prev_bs = 0 + self.quantize = quantize + + # Create tokenizer + tokenizer = AutoTokenizer.from_pretrained( + model_id, + revision=revision, + padding_side="left", + truncation_side="left", + trust_remote_code=trust_remote_code, + ) + + # Create model + world_size = int(os.getenv("WORLD_SIZE", "1")) + rank = int(os.getenv("RANK", "0")) + dtype = torch.bfloat16 if dtype is None else dtype + device = torch.device("hpu") + + if hq_env.is_quantization_enabled: + htorch.core.hpu_set_env() + + if world_size > 1: + model = self.get_deepspeed_model(model_id, dtype, revision) + model = hq_env.prepare_model_for_quantization(model) + else: + get_repo_root(model_id) + + # Check support for rope scaling + model_kwargs = {} + config = AutoConfig.from_pretrained(model_id) + if hasattr(config, "rope_scaling"): + model_kwargs["rope_scaling"] = self.get_rope_scaling() + + model = AutoModelForCausalLM.from_pretrained( + model_id, + revision=revision, + torch_dtype=dtype, + trust_remote_code=trust_remote_code, + **model_kwargs, + ) + model = hq_env.prepare_model_for_quantization(model) + model = model.eval().to(device) + + self.enable_hpu_graph = ( + os.getenv("ENABLE_HPU_GRAPH", "true").lower() == "true" and LAZY_MODE == 1 + ) + self.limit_hpu_graph = os.getenv("LIMIT_HPU_GRAPH", "true").lower() == "true" + + if model.config.model_type not in [ + "gpt_bigcode" + ]: # gpt_bigcode/starcoderbase-3b skips remove_kv_cache_from_output() + model = remove_kv_cache_from_output(model) + + if self.enable_hpu_graph: + from habana_frameworks.torch.hpu import wrap_in_hpu_graph + + model = wrap_in_hpu_graph(model, disable_tensor_cache=True) + else: + if LAZY_MODE == 0: + # It is said that "keep_input_mutations" is safe for inference to be done + dbg_trace("TORCH COMPILE", "Torch compiling of model") + model.model = torch.compile( + model.model, + backend="hpu_backend", + options={"keep_input_mutations": True}, + ) + + model = hq_env.setup_quantization(model) + + if model.config.model_type not in MODELS_OPTIMIZED_WITH_STATIC_SHAPES: + raise ValueError(f"Model type {model.config.model_type} is not supported!") + + if tokenizer.pad_token_id is None: + if model.config.pad_token_id is not None: + tokenizer.pad_token_id = model.config.pad_token_id + elif model.config.eos_token_id is not None: + if isinstance(model.config.eos_token_id, int): + tokenizer.pad_token_id = model.config.eos_token_id + elif isinstance(model.config.eos_token_id, list): + tokenizer.pad_token_id = model.config.eos_token_id[0] + 
else: + raise ValueError( + f"{type(model.config.eos_token_id)} type of eos_token_id in the model's config is not supported for tokenizer.pad_token_id" + ) + elif tokenizer.eos_token_id is not None: + tokenizer.pad_token_id = tokenizer.eos_token_id + else: + tokenizer.add_special_tokens({"pad_token": "[PAD]"}) + + self.kwargs = { + "use_cache": True, + "return_dict": True, + } + + if model.config.model_type in [ + "llama", + "mistral", + "starcoder2", + "qwen2", + "falcon", + "gpt_bigcode", + ]: + if model.config.model_type not in ["falcon", "gpt_bigcode"]: + self.kwargs["attn_softmax_bf16"] = True + + if model.config.model_type not in ["gpt_bigcode"]: + self.kwargs["trim_logits"] = True + + if os.getenv("USE_FLASH_ATTENTION", "true").lower() == "true": + self.kwargs["use_flash_attention"] = True + if os.getenv("FLASH_ATTENTION_RECOMPUTE", "true").lower() == "true": + self.kwargs["flash_attention_recompute"] = True + + self.speculate = get_speculate() + + super(CausalLM, self).__init__( + model_id=model_id, + model=model, + tokenizer=tokenizer, + requires_padding=True, + dtype=dtype, + device=device, + rank=rank, + ) + + # Create profiler + ranks_to_profile = [int(val) for val in os.getenv("PROF_RANKS", "0").split(",")] + record_shapes = os.getenv("PROF_RECORD_SHAPES", "false").lower() == "true" + output_dir = os.getenv("PROF_PATH", "/tmp/hpu_profile") + self.profiling_warmup_steps = ( + int(os.getenv("PROF_WARMUPSTEP", "0")) if rank in ranks_to_profile else 0 + ) + self.profiling_steps = ( + int(os.getenv("PROF_STEP", "0")) if rank in ranks_to_profile else 0 + ) + self.profiling_wait_steps = int(os.getenv("PROF_WAITSTEP", "0")) + if self.profiling_steps > 0: + self.hb_profiler = HabanaProfile( + wait=self.profiling_wait_steps, + warmup=self.profiling_warmup_steps, + active=self.profiling_steps, + output_dir=output_dir, + record_shapes=record_shapes, + ) + self.hb_profiler.start() + else: + self.hb_profiler = None + self.step = 0 + + def get_deepspeed_model( + self, model_id: str, dtype: torch.dtype, revision: Optional[str] = None + ) -> torch.nn.Module: + import deepspeed + from habana_frameworks.torch.distributed.hccl import initialize_distributed_hpu + + world_size, rank, local_rank = initialize_distributed_hpu() + model_kwargs = {"revision": revision} + + # Initialize process(es) for DeepSpeed + deepspeed.init_distributed(dist_backend="hccl") + logger.info( + "DeepSpeed is enabled. 
world_size {} rank {} local_rank {}".format( + world_size, rank, local_rank + ) + ) + config = AutoConfig.from_pretrained(model_id, **model_kwargs) + load_to_meta = model_on_meta(config) + + # Check support for rope scaling + if hasattr(config, "rope_scaling"): + config.rope_scaling = self.get_rope_scaling() + model_kwargs["rope_scaling"] = self.get_rope_scaling() + + if load_to_meta: + # Construct model with fake meta tensors, later will be replaced on devices during ds-inference ckpt load + with deepspeed.OnDevice(dtype=dtype, device="meta"): + model = AutoModelForCausalLM.from_config(config, torch_dtype=dtype) + else: + get_repo_root(model_id, local_rank=os.getenv("LOCAL_RANK")) + # TODO: revisit placement on CPU when auto-injection is possible + with deepspeed.OnDevice(dtype=dtype, device="cpu"): + model = AutoModelForCausalLM.from_pretrained( + model_id, torch_dtype=dtype, **model_kwargs + ) + model = model.eval() + + # Initialize the model + ds_inference_kwargs = {"dtype": dtype} + ds_inference_kwargs["tensor_parallel"] = {"tp_size": world_size} + ds_inference_kwargs["enable_cuda_graph"] = False + + if load_to_meta: + # model loaded to meta is managed differently + checkpoints_json = tempfile.NamedTemporaryFile(suffix=".json", mode="+w") + write_checkpoints_json(model_id, local_rank, checkpoints_json) + ds_inference_kwargs["checkpoint"] = checkpoints_json.name + model = deepspeed.init_inference(model, **ds_inference_kwargs) + + return model.module + + def get_rope_scaling(self) -> Optional[Dict]: + rope_scaling = os.getenv("ROPE_SCALING", None) + if rope_scaling is None: + return None + + rope_factor = float(os.getenv("ROPE_FACTOR", 1.0)) + return {"type": rope_scaling, "factor": float(rope_factor)} + + @property + def batch_type(self) -> Type[CausalLMBatch]: + return CausalLMBatch + + def decode(self, generated_ids: List[int]) -> str: + return self.tokenizer.decode( + generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False + ) + + def decode_token( + self, + all_input_ids: List[int], + prefix_offset: int = 0, + read_offset: int = 0, + ) -> Tuple[str, int, int]: + if is_tokenizer_transparent(self.tokenizer): + new_text = self.tokenizer.decode( + all_input_ids[read_offset:], skip_special_tokens=False + ) + return new_text, read_offset, len(all_input_ids) + else: + return super().decode_token(all_input_ids, prefix_offset, read_offset) + + def forward( + self, + input_ids, + attention_mask, + position_ids, + token_idx, + past_key_values: Optional[List[Tuple]] = None, + bypass_hpu_graph: Optional[bool] = None, + ) -> Tuple[torch.Tensor, List[Tuple[torch.Tensor, torch.Tensor]]]: + # Model Forward + kwargs = { + "input_ids": input_ids, + "attention_mask": attention_mask, + "past_key_values": past_key_values, + "token_idx": token_idx, + } + + # Optimum Habana got "lazy_mode" key-val only supported for llama type of models + if self.model.config.model_type == "llama": + kwargs["lazy_mode"] = LAZY_MODE == 1 + + if self.has_position_ids: + kwargs["position_ids"] = position_ids + + if bypass_hpu_graph is not None: + kwargs["bypass_hpu_graphs"] = bypass_hpu_graph + + kwargs.update(self.kwargs) + + if past_key_values is not None and self.model.config.model_type not in [ + "gpt_bigcode" + ]: + return self.model.forward(**kwargs) + else: + outputs = self.model.forward(**kwargs) + return outputs.logits, outputs.past_key_values + + @tracer.start_as_current_span("generate_token") + def generate_token( + self, batches: List[CausalLMBatch] + ) -> Tuple[List[Generation], 
Optional[CausalLMBatch], Tuple[int, int]]: + start = time.time_ns() + # Results + generations: List[Generation] = [] + prev_batches = [] + requests_to_generate = [] + # In order to pipeline any actions on CPU we perform the operation in 3 main stages: + # Stage 1. Collect next token ids of any previously started generations + for batch_id, batch in enumerate(batches): + if batch.logits is not None: + logits = batch.logits + past = batch.past + prefill = batch.past_key_values is None + if prefill: + # no right padding for prefill + token_idx_scalar = batch.attention_mask.shape[-1] - 1 + token_idx = torch.tensor(token_idx_scalar).to(self.device) + else: + token_idx_scalar = ( + batch.attention_mask.shape[-1] - batch.right_padding + ) + token_idx = torch.tensor(token_idx_scalar).to(self.device) + + # Select next token + input_length = batch.input_length + if logits.shape[-2] > 1: + next_token_ids, next_token_logprobs, logprobs, _, _ = ( + batch.next_token_chooser( + batch.input_ids, + logits[:, input_length - 1 : input_length, :].squeeze(-2), + self.speculate, + ) + ) + else: + next_token_ids, next_token_logprobs, logprobs, _, _ = ( + batch.next_token_chooser( + batch.input_ids, logits.squeeze(-2), self.speculate + ) + ) + # Speculation is not active for causal + accepted_ids = torch.ones_like(batch.input_ids)[:, 0] + batch_top_token_ids, batch_top_token_logprobs = batch_top_tokens( + batch.top_n_tokens, + batch.top_n_tokens_tensor, + logprobs, + accepted_ids, + ) + + prev_batches.append( + { + "next_token_ids": next_token_ids, + "next_token_logprobs": next_token_logprobs, + } + ) + + for req_idx, req in enumerate(batch.requests): + requests_to_generate.append( + { + "req": req, + "prev_req_idx": req.idx, + "batch_id": batch_id, + "seed": batch.next_token_chooser.seeds[req_idx], + "do_sample": batch.next_token_chooser.do_sample[req_idx], + "top_n_tokens": batch.top_n_tokens[req_idx], + "top_token_ids": batch_top_token_ids[req_idx], + "top_token_logprobs": batch_top_token_logprobs[req_idx], + "grammar_state": batch.next_token_chooser.fsm_grammar_states[ + req.idx + ], + } + ) + + htorch.core.mark_step() + + # Add new token into input_ids + batch.input_ids.index_copy_(1, token_idx, next_token_ids.unsqueeze(1)) + + # Update attention_mask as we added a new token to input_ids + batch.attention_mask.index_fill_(1, token_idx, 1) + + # Adjust lengths + batch.input_length += 1 + + # Update position_ids + if prefill: + batch.position_ids = ( + torch.index_select(batch.position_ids, 1, token_idx - 1) + 1 + ) + else: + batch.position_ids += 1 + # Update past key values + if prefill or self.model.config.model_type in ["gpt_bigcode"]: + batch.past_key_values = past + + htorch.core.mark_step() + + # Stage 2. 
Prepare new batch for speculative scheduling + if len(batches) > 1: + batch = self.batch_type.concatenate(batches, self.tokenizer.pad_token_id) + else: + batch = batches[0] + + prefill = batch.past_key_values is None + + # Check if we need to do any bookkeeping first + if not prefill: + batch = batch.__class__.recombine([batch], self.tokenizer.pad_token_id) + + scenario = "PREFILL" if prefill else "GENERATE" + if ( + self.enable_hpu_graph + and self.limit_hpu_graph + and round_up(batch.batch_size, BATCH_BUCKET_SIZE) != self.prev_bs + ): + self.model.clear_cache() + self.prev_bs = round_up(batch.batch_size, BATCH_BUCKET_SIZE) + dbg_trace( + scenario, + f"bs:{batch.batch_size} num_reqs:{len(batch.requests)} seq_len:{batch.seq_length} padding:{batch.right_padding}", + ) + assert batch.right_padding > 0, "No more room for next token!" + + # Execute batch + if prefill: + # no right padding for prefill + token_idx = torch.tensor(batch.attention_mask.shape[-1] - 1).to(self.device) + batch.logits, batch.past = self.forward( + batch.input_ids, + batch.attention_mask, + batch.position_ids, + token_idx, + batch.past_key_values, + bypass_hpu_graph=( + prefill and self.limit_hpu_graph if self.enable_hpu_graph else None + ), + ) + elif all([req.stopping_criteria.max_new_tokens == 1 for req in batch.requests]): + # Don't schedule next forward if max_new_tokens for all requests equals 1 + # - we've already generated the first and only needed token in the prefill phase + pass + else: + token_idx = torch.tensor( + batch.attention_mask.shape[-1] - batch.right_padding + ).to(self.device) + input_ids = torch.index_select(batch.input_ids, 1, token_idx - 1) + logits = self.forward( + input_ids, + batch.attention_mask, + batch.position_ids, + token_idx, + batch.past_key_values, + bypass_hpu_graph=( + prefill and self.limit_hpu_graph if self.enable_hpu_graph else None + ), + ) + if self.model.config.model_type in ["gpt_bigcode"]: + batch.logits, batch.past = logits + else: + batch.logits = logits + + htorch.core.mark_step() + + start_decode = time.time_ns() + + # Stage 3. 
Finish and return previous generations + stopped = len(requests_to_generate) > 0 + for prev_batch in prev_batches: + prev_batch["next_token_logprobs"] = prev_batch[ + "next_token_logprobs" + ].tolist() + prev_batch["next_token_ids_cpu"] = prev_batch["next_token_ids"].cpu() + htorch.core.mark_step() + + for req_data in requests_to_generate: + req = req_data["req"] + i = req_data["prev_req_idx"] + prev_batch_id = req_data["batch_id"] + assert len(prev_batches) > prev_batch_id + next_token_ids_cpu = prev_batches[prev_batch_id]["next_token_ids_cpu"] + next_token_logprobs = prev_batches[prev_batch_id]["next_token_logprobs"] + + request = req.data + input_length = req.input_length + prefix_offset = req.prefix_offset + read_offset = req.read_offset + do_sample = req_data["do_sample"] + seed = req_data["seed"] + stopping_criteria = req.stopping_criteria + all_input_ids = req.all_input_ids + next_token_id = next_token_ids_cpu[i] + next_token_logprob = next_token_logprobs[i] + top_n_tokens = req_data["top_n_tokens"] + top_token_ids = req_data["top_token_ids"] + top_token_logprobs = req_data["top_token_logprobs"] + grammar_state = req_data["grammar_state"] + + # Append next token to all tokens + all_input_ids[input_length] = next_token_id + new_input_length = input_length + 1 + + # Generated token + if ( + is_tokenizer_transparent(self.tokenizer) + and len(stopping_criteria.stop_sequence_criterias) == 0 + ): + next_token_text = "" + else: + next_token_text, prefix_offset, read_offset = self.decode_token( + all_input_ids[0:new_input_length, 0], prefix_offset, read_offset + ) + + # Evaluate stopping criteria + stop, reason = stopping_criteria( + next_token_id, + next_token_text, + ) + + if not stop: + stopped = False + + # Shard generations + # All generations will be appended in the rust sharded client + if i % self.world_size == self.rank: + if stop: + # Decode generated tokens + if is_tokenizer_transparent(self.tokenizer): + output_text = None + else: + output_text = self.decode( + all_input_ids[ + new_input_length + - stopping_criteria.current_tokens : new_input_length, + 0, + ] + ) + generated_text = GeneratedText( + output_text, + stopping_criteria.current_tokens, + reason, + seed if do_sample else None, + ) + else: + generated_text = None + + # Prefill + if stopping_criteria.current_tokens == 1 and request.prefill_logprobs: + # Remove generated token to only have prefill and add nan for first prompt token + prefill_logprobs = [float("nan")] + next_token_logprobs + prefill_token_ids = all_input_ids[0 : new_input_length - 1] + prefill_texts = self.tokenizer.batch_decode( + prefill_token_ids, + clean_up_tokenization_spaces=False, + skip_special_tokens=False, + ) + prefill_tokens = Tokens( + prefill_token_ids, + prefill_logprobs, + prefill_texts, + is_special=[], + ) + else: + prefill_tokens = None + + if top_n_tokens > 0: + all_top_tokens = [] + for top_token_ids, top_token_logprobs in zip( + top_token_ids, top_token_logprobs + ): + toptoken_texts = self.tokenizer.batch_decode( + top_token_ids, + clean_up_tokenization_spaces=False, + skip_special_tokens=False, + ) + special_toptokens = [ + token_id in self.all_special_ids + for token_id in top_token_ids + ] + top_tokens = Tokens( + top_token_ids, + top_token_logprobs, + toptoken_texts, + special_toptokens, + ) + all_top_tokens.append(top_tokens) + top_tokens = all_top_tokens + else: + top_tokens = None + + generation = Generation( + request.id, + prefill_tokens, + Tokens( + [next_token_id], + [next_token_logprob], + [next_token_text], + 
[next_token_id in self.all_special_ids], + ), + generated_text, + top_tokens, + ) + + generations.append(generation) + + batch.next_token_chooser = ( + batch.next_token_chooser.advance_grammar_single_with_past_state( + req.idx, next_token_id, grammar_state + ) + ) + + req.all_input_ids = all_input_ids + req.input_length = new_input_length + req.prefix_offset = prefix_offset + req.read_offset = read_offset + + htorch.core.mark_step() + self.step = self.step + 1 + if self.hb_profiler is not None: + if ( + self.step + > self.profiling_wait_steps + + self.profiling_warmup_steps + + self.profiling_steps + ): + self.hb_profiler.stop() + else: + self.hb_profiler.step() + + forward_ns = start_decode - start + decode_ns = time.time_ns() - start_decode + return generations, batch if not stopped else None, (forward_ns, decode_ns) + + def generate_warmup_batch(self, request, seq_len, batch_size): + batch = copy.deepcopy(request.batch) + for req in batch.requests: + req.truncate = seq_len + + for i in range(len(batch.requests) - batch_size): + batch.requests.pop() + + return self.batch_type.from_pb(batch, self.tokenizer, self.dtype, self.device) + + def warmup( + self, request: generate_pb2.WarmupRequest + ) -> Tuple[Optional[int], Optional[int], Optional[int]]: + assert ( + MAX_BATCH_SIZE is not None + ), "MAX_BATCH_SIZE is not set, it should be set in the launcher" + MAX_BATCH_TOTAL_TOKENS = MAX_BATCH_SIZE * request.max_total_tokens + logger.info(f"MAX_BATCH_SIZE: {MAX_BATCH_SIZE}") + logger.info(f"MAX_BATCH_TOTAL_TOKENS: {MAX_BATCH_TOTAL_TOKENS}") + MAX_TOTAL_TOKENS = request.max_total_tokens + + batch = self.batch_type.from_pb( + request.batch, self.tokenizer, self.dtype, self.device + ) + max_prefill_batch_size = batch.input_ids.shape[0] + try: + # max prefill batch size warmup + _, prefill_batch, _ = self.generate_token([batch]) + except Exception: + raise RuntimeError( + f"Not enough memory to handle {len(batch.input_ids)} prefill tokens. " + f"You need to decrease `--max-batch-prefill-tokens`" + ) + + del prefill_batch + + # Warmup prefill batch_size + max_input_tokens = request.max_input_tokens + prefill_batch_size_list = [ + batch + for batch in range( + PREFILL_BATCH_BUCKET_SIZE, + max_prefill_batch_size, + PREFILL_BATCH_BUCKET_SIZE, + ) + ] + prefill_batch_size_list.append(max_prefill_batch_size) + prefill_seqlen_list = [ + seq + for seq in range( + PAD_SEQUENCE_TO_MULTIPLE_OF, + max_input_tokens, + PAD_SEQUENCE_TO_MULTIPLE_OF, + ) + ] + prefill_seqlen_list.append(max_input_tokens) + prefill_batch_size_list.sort(reverse=True) + prefill_seqlen_list.sort(reverse=True) + try: + for batch_size in prefill_batch_size_list: + for seq_len in prefill_seqlen_list: + batch = self.generate_warmup_batch(request, seq_len - 1, batch_size) + _, prefill_batch, _ = self.generate_token([batch]) + except Exception: + prefill_batch_size_list.sort() + prefill_seqlen_list.sort() + raise RuntimeError( + f"Not enough memory to run following prefill batch_size." 
+ f"Prefill batch size list:{prefill_batch_size_list}" + f"Prefill sequence length list:{prefill_seqlen_list}" + f"You need to decrease `--max-batch-prefill-tokens`" + ) + prefill_seqlen_list.sort() + prefill_batch_size_list.sort() + mem_stats = get_hpu_memory_stats(self.device) + logger.info( + f"\nFollowing prefill warmup successfully.\n" + f"Prefill batch size list:{prefill_batch_size_list}\n" + f"Prefill sequence length list:{prefill_seqlen_list}\n" + f"Memory stats: {mem_stats} " + ) + + max_decode_batch_size = math.floor(MAX_BATCH_TOTAL_TOKENS / MAX_TOTAL_TOKENS) + max_decode_batch_size = round_up(max_decode_batch_size, BATCH_BUCKET_SIZE) + decode_batch_size_list = [ + i + for i in range(BATCH_BUCKET_SIZE, max_decode_batch_size, BATCH_BUCKET_SIZE) + ] + decode_batch_size_list.append(max_decode_batch_size) + decode_batch_size_list.sort(reverse=True) + + try: + for batch_size in decode_batch_size_list: + batches = [] + iters = math.floor(batch_size / max_prefill_batch_size) + for i in range(iters): + batch = self.generate_warmup_batch( + request, PAD_SEQUENCE_TO_MULTIPLE_OF - 1, max_prefill_batch_size + ) + _, prefill_batch, _ = self.generate_token([batch]) + batches.append(prefill_batch) + + if batch_size % max_prefill_batch_size != 0: + batch = self.generate_warmup_batch( + request, + PAD_SEQUENCE_TO_MULTIPLE_OF - 1, + batch_size % max_prefill_batch_size, + ) + _, prefill_batch, _ = self.generate_token([batch]) + batches.append(prefill_batch) + + _, decode_batch, _ = self.generate_token(batches) + _, decode_batch, _ = self.generate_token([decode_batch]) + del decode_batch + batches.clear() + + except Exception: + raise RuntimeError( + f"Not enough memory to warmup decode batch_sizes({decode_batch_size_list})." + f"You need to decrease `--max-batch-total-tokens`" + ) + + decode_batch_size_list.sort() + max_supported_total_tokens = MAX_TOTAL_TOKENS * decode_batch_size_list[-1] + mem_stats = get_hpu_memory_stats(self.device) + logger.info( + f"\nFollowing decode warmup successfully.\n" + f"Decode batch size list:{decode_batch_size_list}\n" + f"Memory stats: {mem_stats} " + ) + + max_input_tokens = max_input_tokens + max_total_tokens = MAX_TOTAL_TOKENS + + return max_supported_total_tokens, max_input_tokens, max_total_tokens diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/__init__.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/bloom_modeling.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/bloom_modeling.py new file mode 100644 index 000000000..e2719fad2 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/bloom_modeling.py @@ -0,0 +1,923 @@ +# coding=utf-8 +# Copyright 2022 HuggingFace Inc. team and BigScience workshop. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""PyTorch BLOOM model.""" + +import math +import os +import warnings +from typing import Optional, Tuple, Union + +import torch +import torch.distributed +import torch.utils.checkpoint +from torch import nn +from torch.nn import LayerNorm +from torch.nn import functional as F + +from transformers.modeling_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + CausalLMOutputWithCrossAttentions, +) +from transformers import BloomConfig, PreTrainedModel + +from text_generation_server.layers import ( + TensorParallelColumnLinear, + TensorParallelEmbedding, + TensorParallelRowLinear, + SpeculativeHead, +) + +CUSTOM_KERNELS_ENABLED = False +if ( + torch.cuda.is_available() + and not os.environ.get("DISABLE_CUSTOM_KERNELS", "False") == "True" +): + try: + from custom_kernels import fused_bloom_attention_cuda + + CUSTOM_KERNELS_ENABLED = True + except ImportError: + pass + +_CHECKPOINT_FOR_DOC = "bigscience/bloom-560m" +_CONFIG_FOR_DOC = "BloomConfig" + +BLOOM_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "bigscience/bigscience-small-testing", + "bigscience/bloom-560m", + "bigscience/bloom-1b1", + "bigscience/bloom-1b7", + "bigscience/bloom-3b", + "bigscience/bloom-7b1", + "bigscience/bloom", +] + + +def _make_causal_mask( + input_ids_shape: torch.Size, device: torch.device, past_key_values_length: int +) -> torch.BoolTensor: + """ + Make causal mask used for self-attention. + """ + batch_size, target_length = input_ids_shape + mask = torch.ones( + (target_length, target_length + past_key_values_length), + dtype=torch.bool, + device=device, + ) + mask = mask.triu(1 + past_key_values_length) + + expanded_mask = mask.unsqueeze(0).expand( + batch_size, target_length, target_length + past_key_values_length + ) + return expanded_mask + + +def _expand_mask(mask: torch.Tensor, tgt_length: int) -> torch.BoolTensor: + """ + Expands attention_mask from `[batch_size, src_length]` to `[batch_size, 1, tgt_length, src_length]`. + """ + batch_size, src_length = mask.shape + tgt_length = tgt_length if tgt_length is not None else src_length + + expanded_mask = ~(mask[:, None, :].to(torch.bool)) + return expanded_mask.expand(batch_size, tgt_length, src_length) + + +def build_alibi_tensor(attention_mask: torch.Tensor, num_heads: int) -> torch.Tensor: + """ + Link to paper: https://arxiv.org/abs/2108.12409 Alibi tensor is not causal as the original paper mentions, it + relies on a translation invariance of softmax for quick implementation: with l being a tensor, and a fixed value + `softmax(l+a) = softmax(l)`. Based on + https://github.com/ofirpress/attention_with_linear_biases/blob/a35aaca144e0eb6b789dfcb46784c4b8e31b7983/fairseq/models/transformer.py#L742 + TODO @thomasw21 this doesn't work as nicely due to the masking strategy, and so masking varies slightly. + + Args: + Returns tensor shaped (batch_size * num_heads, 1, max_seq_len) + attention_mask (`torch.Tensor`): + Token-wise attention mask, this should be of shape (batch_size, max_seq_len). 
+        num_heads (`int`, *required*):
+            number of heads
+        dtype (`torch.dtype`, *optional*, default=`torch.bfloat16`):
+            dtype of the output tensor
+    """
+    batch_size, seq_length = attention_mask.shape
+    closest_power_of_2 = 2 ** math.floor(math.log2(num_heads))
+    base = torch.tensor(
+        2 ** (-(2 ** -(math.log2(closest_power_of_2) - 3))),
+        device=attention_mask.device,
+        dtype=torch.float32,
+    )
+    powers = torch.arange(
+        1, 1 + closest_power_of_2, device=attention_mask.device, dtype=torch.int32
+    )
+    slopes = torch.pow(base, powers)
+
+    if closest_power_of_2 != num_heads:
+        extra_base = torch.tensor(
+            2 ** (-(2 ** -(math.log2(2 * closest_power_of_2) - 3))),
+            device=attention_mask.device,
+            dtype=torch.float32,
+        )
+        num_remaining_heads = min(closest_power_of_2, num_heads - closest_power_of_2)
+        extra_powers = torch.arange(
+            1,
+            1 + 2 * num_remaining_heads,
+            2,
+            device=attention_mask.device,
+            dtype=torch.int32,
+        )
+        slopes = torch.cat([slopes, torch.pow(extra_base, extra_powers)], dim=0)
+
+    # Note: alibi will be added to the attention bias that will be applied to the query, key product of attention
+    # => therefore alibi will have to be of shape (batch_size, num_heads, query_length, key_length)
+    # => here we set (batch_size=1, num_heads=num_heads, query_length=1, key_length=max_length)
+    # => the query_length dimension will then be broadcasted correctly
+    # This is more or less identical to T5's relative position bias:
+    # https://github.com/huggingface/transformers/blob/f681437203baa7671de3174b0fa583c349d9d5e1/src/transformers/models/t5/modeling_t5.py#L527
+    arange_tensor = ((attention_mask.cumsum(dim=-1) - 1) * attention_mask)[:, None, :]
+    alibi = slopes[..., None] * arange_tensor
+    return alibi
+
+
+# @torch.jit.script
+def dropout_add(
+    x: torch.Tensor, residual: torch.Tensor, prob: float, training: bool
+) -> torch.Tensor:
+    """
+    Dropout add function
+
+    Args:
+        x (`torch.tensor`, *required*):
+            input tensor
+        residual (`torch.tensor`, *required*):
+            residual tensor
+        prob (`float`, *required*):
+            dropout probability
+        training (`bool`, *required*):
+            training mode
+    """
+    out = F.dropout(x, p=prob, training=training)
+    out = residual + out
+    return out
+
+
+# @torch.jit.script  # scripting `_split_heads` below fails for unknown reasons, so it is left disabled.
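# [Editor's note — illustrative sketch, not part of this patch.]
# Quick shape check for build_alibi_tensor above: for num_heads=12 the closest power
# of two is 8, so 8 base slopes are extended with 4 extra slopes. In this variant the
# function returns a (batch_size, num_heads, seq_length) bias; the flattening to
# (batch_size * num_heads, 1, seq_length) happens later, in BloomModel.forward.
def _alibi_shape_example() -> torch.Tensor:
    attention_mask = torch.ones(2, 5)  # batch_size=2, seq_length=5, no padding
    alibi = build_alibi_tensor(attention_mask, num_heads=12)
    assert tuple(alibi.shape) == (2, 12, 5)
    return alibi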
+def _split_heads( + fused_qkv: torch.Tensor, num_heads: int, head_dim: int +) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Split the last dimension into (num_heads, head_dim) without making any copies, results share same memory + storage as `fused_qkv` + + Args: + fused_qkv (`torch.tensor`, *required*): [batch_size, seq_length, num_heads * 3 * head_dim] + + Returns: + query: [batch_size, seq_length, num_heads, head_dim] key: [batch_size, seq_length, num_heads, head_dim] + value: [batch_size, seq_length, num_heads, head_dim] + """ + batch_size, seq_length, three_times_hidden_size = fused_qkv.shape + fused_qkv = fused_qkv.view(batch_size, seq_length, num_heads, 3 * head_dim) + query_layer, key_layer, value_layer = fused_qkv.split(head_dim, dim=-1) + + query_layer = query_layer.transpose(1, 2).reshape( + batch_size * num_heads, seq_length, head_dim + ) + key_layer = key_layer.permute(0, 2, 3, 1).reshape( + batch_size * num_heads, head_dim, seq_length + ) + value_layer = value_layer.transpose(1, 2).reshape( + batch_size * num_heads, seq_length, head_dim + ) + + return query_layer, key_layer, value_layer + + +# @torch.jit.script +def _merge_heads(x: torch.Tensor, num_heads: int, head_dim: int) -> torch.Tensor: + """ + Merge heads together over the last dimenstion + + Args: + x: (`torch.tensor`, *required*): [batch_size * num_heads, seq_length, head_dim] + + Returns: + torch.tensor: [batch_size, seq_length, num_heads * head_dim] + """ + # What we want to achieve is: + # batch_size * num_heads, seq_length, head_dim -> batch_size, seq_length, num_heads * head_dim + batch_size_and_num_heads, seq_length, _ = x.shape + batch_size = batch_size_and_num_heads // num_heads + + # First view to decompose the batch size + # batch_size * num_heads, seq_length, head_dim -> batch_size, num_heads, seq_length, head_dim + x = x.view(batch_size, num_heads, seq_length, head_dim) + + # batch_size, num_heads, seq_length, head_dim -> batch_size, seq_length, num_heads, head_dim + x = x.permute(0, 2, 1, 3) + + # batch_size, seq_length, num_heads, head_dim -> batch_size, seq_length, num_heads * head_dim + return x.reshape(batch_size, seq_length, num_heads * head_dim) + + +class BloomAttention(nn.Module): + def __init__(self, prefix, config: BloomConfig, weights): + super().__init__() + + self.pretraining_tp = config.pretraining_tp + self.slow_but_exact = config.slow_but_exact + + self.process_group = weights.process_group + + self.hidden_size = config.hidden_size + self.num_heads = config.n_head + self.head_dim = self.hidden_size // self.num_heads + self.split_size = self.hidden_size + self.hidden_dropout = config.hidden_dropout + + if self.head_dim * self.num_heads != self.hidden_size: + raise ValueError( + f"`hidden_size` must be divisible by num_heads (got `hidden_size`: {self.hidden_size} and `num_heads`:" + f" {self.num_heads})." 
+ ) + + # Layer-wise attention scaling + self.inv_norm_factor = 1.0 / math.sqrt(self.head_dim) + self.beta = 1.0 + + process_group = weights.process_group + if self.num_heads % process_group.size() != 0: + raise ValueError( + f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} " + f"and `num_shards`: {process_group.size()}" + ) + self.num_heads = self.num_heads // process_group.size() + self.query_key_value = TensorParallelColumnLinear.load( + config=config, + prefix=f"{prefix}.query_key_value", + weights=weights, + bias=True, + ) + self.dense = TensorParallelRowLinear.load( + config=config, prefix=f"{prefix}.dense", weights=weights, bias=True + ) + self.attention_dropout = nn.Dropout(config.attention_dropout) + + @staticmethod + def compute_attention( + fused_qkv: torch.Tensor, + layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]], + alibi: torch.Tensor, + attention_mask: torch.Tensor, + head_mask: Optional[torch.Tensor], + beta: float, + inv_norm_factor: float, + num_heads: int, + use_cache: bool, + ): + batch_size, q_length, three_times_hidden_size = fused_qkv.shape + head_dim = three_times_hidden_size // (3 * num_heads) + batch_size * num_heads + + ### TODO @thomasw21: this takes quite a bit of time, how do I accelerate that? + # 3 x [batch_size, seq_length, num_heads, head_dim] + (query_layer, key_layer, value_layer) = _split_heads( + fused_qkv, num_heads=num_heads, head_dim=head_dim + ) + + if layer_past is not None: + past_key, past_value = layer_past + # concatenate along seq_length dimension: + # - key: [batch_size * self.num_heads, head_dim, kv_length] + # - value: [batch_size * self.num_heads, kv_length, head_dim] + past_key = past_key.view(-1, *past_key.shape[-2:]) + key_layer = torch.cat((past_key, key_layer), dim=2) + past_value = past_value.view(-1, *past_value.shape[-2:]) + value_layer = torch.cat((past_value, value_layer), dim=1) + + _, _, kv_length = key_layer.shape + + if use_cache is True: + present = (key_layer, value_layer) + else: + present = None + ### + + # [batch_size * num_heads, q_length, kv_length] + # we use `torch.Tensor.baddbmm` instead of `torch.baddbmm` as the latter isn't supported by TorchScript v1.11 + attention_scores = alibi.baddbmm( + batch1=query_layer, + batch2=key_layer, + beta=beta, + alpha=inv_norm_factor, + ) + + # cast attention scores to fp32, compute scaled softmax and cast back to initial dtype - [batch_size, num_heads, q_length, kv_length] + input_dtype = attention_scores.dtype + # `float16` has a minimum value of -65504.0, whereas `bfloat16` and `float32` have a minimum value of `-3.4e+38` + if input_dtype == torch.float16: + attention_scores = attention_scores.to(torch.float) + # torch.finfo not supported by torch.jit, we temporarily remplace with `-1e34` + attn_weights = attention_scores.masked_fill_( + attention_mask, torch.finfo(attention_scores.dtype).min + ) + attention_probs = F.softmax(attn_weights, dim=-1, dtype=torch.float32).to( + input_dtype + ) + + # # [batch_size, num_heads, q_length, kv_length] + # attention_probs = self.attention_dropout(attention_probs) + + if head_mask is not None: + attention_probs = attention_probs * head_mask + + # matmul: [batch_size * num_heads, q_length, head_dim] + context_layer = torch.bmm(attention_probs, value_layer, out=query_layer) + + # change view [batch_size, num_heads, q_length, head_dim] + context_layer = _merge_heads( + context_layer, num_heads=num_heads, head_dim=head_dim + ) + + return context_layer, present, attention_probs + + def forward( + self, 
+ hidden_states: torch.Tensor, + residual: torch.Tensor, + alibi: torch.Tensor, + attention_mask: torch.Tensor, + layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + head_mask: Optional[torch.Tensor] = None, + use_cache: bool = False, + output_attentions: bool = False, + ): + fused_qkv = self.query_key_value( + hidden_states + ) # [batch_size, seq_length, 3 x hidden_size] + batch_size, q_length, _ = fused_qkv.shape + + if layer_past is not None: + past_key, past_value = layer_past + layer_past = ( + past_key.view(-1, *past_key.shape[-2:]), + past_value.view(-1, *past_value.shape[-2:]), + ) + + if CUSTOM_KERNELS_ENABLED: + assert self.training is False, "Only foward pass was implemented" + assert ( + attention_mask.shape[-1] < 4096 + ), "Custom kernel support only up to 4096 tokens" + ( + context_layer, + present, + attention_probs, + ) = fused_bloom_attention_cuda.forward( + fused_qkv, + layer_past, + alibi, + attention_mask, + head_mask, + self.beta, + self.inv_norm_factor, + self.num_heads, + use_cache, + ) + else: + context_layer, present, attention_probs = self.compute_attention( + fused_qkv=fused_qkv, + layer_past=layer_past, + alibi=alibi, + attention_mask=attention_mask, + head_mask=head_mask, + beta=self.beta, + inv_norm_factor=self.inv_norm_factor, + num_heads=self.num_heads, + use_cache=use_cache, + ) + + # aggregate results across tp ranks. See here: https://github.com/pytorch/pytorch/issues/76232 + if self.pretraining_tp > 1 and self.slow_but_exact: + slices = self.hidden_size / self.pretraining_tp + output_tensor = torch.zeros_like(context_layer) + for i in range(self.pretraining_tp): + output_tensor = output_tensor + F.linear( + context_layer[:, :, int(i * slices) : int((i + 1) * slices)], + self.dense.weight[:, int(i * slices) : int((i + 1) * slices)], + ) + else: + output_tensor = self.dense(context_layer) + + # output_tensor = dropout_add(output_tensor, residual, self.hidden_dropout, self.training) + output_tensor += residual + + outputs = (output_tensor, present) + if output_attentions: + outputs += (attention_probs,) + + return outputs + + +class BloomMLP(nn.Module): + def __init__(self, prefix, config: BloomConfig, weights): + super().__init__() + + self.pretraining_tp = config.pretraining_tp + self.slow_but_exact = config.slow_but_exact + self.dense_h_to_4h = TensorParallelColumnLinear.load( + config=config, prefix=f"{prefix}.dense_h_to_4h", weights=weights, bias=True + ) + self.dense_4h_to_h = TensorParallelRowLinear.load( + config=config, prefix=f"{prefix}.dense_4h_to_h", weights=weights, bias=True + ) + self.gelu_impl = torch.nn.GELU(approximate="tanh") + self.hidden_dropout = config.hidden_dropout + + def forward( + self, hidden_states: torch.Tensor, residual: torch.Tensor + ) -> torch.Tensor: + hidden_states = self.gelu_impl(self.dense_h_to_4h(hidden_states)) + + if self.pretraining_tp > 1 and self.slow_but_exact: + intermediate_output = torch.zeros_like(residual) + slices = self.dense_4h_to_h.weight.shape[-1] / self.pretraining_tp + for i in range(self.pretraining_tp): + intermediate_output = intermediate_output + F.linear( + hidden_states[:, :, int(i * slices) : int((i + 1) * slices)], + self.dense_4h_to_h.weight[ + :, int(i * slices) : int((i + 1) * slices) + ], + ) + else: + intermediate_output = self.dense_4h_to_h(hidden_states) + + # output = dropout_add(intermediate_output, residual, self.hidden_dropout, self.training) + intermediate_output += residual + + return intermediate_output + + +class BloomBlock(nn.Module): + def __init__(self, 
layer_id: int, config: BloomConfig, weights): + super().__init__() + + prefix = f"h.{layer_id}" + self.input_layernorm = LayerNorm.load( + prefix=f"{prefix}.input_layernorm", + weights=weights, + eps=config.layer_norm_epsilon, + ) + self.num_heads = config.n_head + self.self_attention = BloomAttention( + prefix=f"{prefix}.self_attention", config=config, weights=weights + ) + self.post_attention_layernorm = LayerNorm.load( + prefix=f"{prefix}.post_attention_layernorm", + weights=weights, + eps=config.layer_norm_epsilon, + ) + + self.mlp = BloomMLP(prefix=f"{prefix}.mlp", config=config, weights=weights) + self.apply_residual_connection_post_layernorm = ( + config.apply_residual_connection_post_layernorm + ) + self.hidden_dropout = config.hidden_dropout + + def forward( + self, + hidden_states: torch.Tensor, + alibi: torch.Tensor, + attention_mask: torch.Tensor, + layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + head_mask: Optional[torch.Tensor] = None, + use_cache: bool = False, + output_attentions: bool = False, + ): + # hidden_states: [batch_size, seq_length, hidden_size] + + # Layer norm at the beginning of the transformer layer. + layernorm_output = self.input_layernorm(hidden_states) + + # Layer norm post the self attention. + if self.apply_residual_connection_post_layernorm: + residual = layernorm_output + else: + residual = hidden_states + + # Self attention. + attn_outputs = self.self_attention( + layernorm_output, + residual, + layer_past=layer_past, + attention_mask=attention_mask, + alibi=alibi, + head_mask=head_mask, + use_cache=use_cache, + output_attentions=output_attentions, + ) + + attention_output = attn_outputs[0] + + outputs = attn_outputs[1:] + + layernorm_output = self.post_attention_layernorm(attention_output) + + # Get residual + if self.apply_residual_connection_post_layernorm: + residual = layernorm_output + else: + residual = attention_output + + # MLP. + output = self.mlp(layernorm_output, residual) + + if use_cache: + outputs = (output,) + outputs + else: + outputs = (output,) + outputs[1:] + + return outputs # hidden_states, present, attentions + + +class BloomPreTrainedModel(PreTrainedModel): + config_class = BloomConfig + base_model_prefix = "transformer" + _no_split_modules = ["BloomBlock"] + + @staticmethod + def _convert_to_standard_cache( + past_key_value: Tuple[Tuple[torch.Tensor, torch.Tensor]], batch_size: int + ) -> Tuple[Tuple[torch.Tensor, torch.Tensor]]: + """ + Standardizes the format of the cache so as to match most implementations, i.e. to tuple(tuple([batch_size, + num_heads, ...])) + """ + batch_size_times_num_heads, head_dim, seq_length = past_key_value[0][0].shape + num_heads = batch_size_times_num_heads // batch_size + # key: [batch_size * num_heads, head_dim, seq_length] -> [batch_size, num_heads, head_dim, seq_length] + # value: [batch_size * num_heads, seq_length, head_dim] -> [batch_size, num_heads, seq_length, head_dim] + return tuple( + ( + layer_past[0].view(batch_size, num_heads, head_dim, seq_length), + layer_past[1].view(batch_size, num_heads, seq_length, head_dim), + ) + for layer_past in past_key_value + ) + + @staticmethod + def _convert_to_bloom_cache( + past_key_value: Tuple[Tuple[torch.Tensor, torch.Tensor]] + ) -> Tuple[Tuple[torch.Tensor, torch.Tensor]]: + """ + Converts the cache to the format expected by Bloom, i.e. 
to tuple(tuple([batch_size * num_heads, ...])) + """ + batch_size, num_heads, head_dim, seq_length = past_key_value[0][0].shape + batch_size_times_num_heads = batch_size * num_heads + # key: [batch_size, num_heads, head_dim, seq_length] -> [batch_size * num_heads, head_dim, seq_length] + # value: [batch_size, num_heads, seq_length, head_dim] -> [batch_size * num_heads, seq_length, head_dim] + return tuple( + ( + layer_past[0].view(batch_size_times_num_heads, head_dim, seq_length), + layer_past[1].view(batch_size_times_num_heads, seq_length, head_dim), + ) + for layer_past in past_key_value + ) + + +class BloomModel(BloomPreTrainedModel): + def __init__(self, config: BloomConfig, weights): + super().__init__(config) + + self.embed_dim = config.hidden_size + self.num_heads = config.n_head + + process_group = weights.process_group + self.tp_rank = process_group.rank() + self.tp_world_size = process_group.size() + + self.word_embeddings = TensorParallelEmbedding( + prefix="word_embeddings", weights=weights + ) + + self.word_embeddings_layernorm = LayerNorm.load( + prefix="word_embeddings_layernorm", + weights=weights, + eps=config.layer_norm_epsilon, + ) + + # Transformer blocks + self.h = nn.ModuleList( + [ + BloomBlock(layer_id=layer_id, config=config, weights=weights) + for layer_id in range(config.num_hidden_layers) + ] + ) + + # Final Layer Norm + self.ln_f = LayerNorm.load( + prefix="ln_f", weights=weights, eps=config.layer_norm_epsilon + ) + + def _prepare_attn_mask( + self, + attention_mask: torch.Tensor, + input_shape: Tuple[int, int], + past_key_values_length: int, + ) -> torch.BoolTensor: + # create causal mask + # [batch_size, seq_length] -> [batch_size, tgt_length, src_length] + combined_attention_mask = None + device = attention_mask.device + _, src_length = input_shape + + if src_length > 1: + combined_attention_mask = _make_causal_mask( + input_shape, + device=device, + past_key_values_length=past_key_values_length, + ) + + # [batch_size, seq_length] -> [batch_size, tgt_length, src_length] + expanded_attn_mask = _expand_mask(attention_mask, tgt_length=src_length) + combined_attention_mask = ( + expanded_attn_mask + if combined_attention_mask is None + else expanded_attn_mask | combined_attention_mask + ) + + return combined_attention_mask + + def set_input_embeddings(self, new_embeddings: torch.Tensor): + self.word_embeddings = new_embeddings + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **deprecated_arguments, + ) -> Union[Tuple[torch.Tensor, ...], BaseModelOutputWithPastAndCrossAttentions]: + if deprecated_arguments.pop("position_ids", False) is not False: + # `position_ids` could have been `torch.Tensor` or `None` so defaulting pop to `False` allows to detect if users were passing explicitly `None` + warnings.warn( + "`position_ids` have no functionality in BLOOM and will be removed in v5.0.0. 
You can safely ignore" + " passing `position_ids`.", + FutureWarning, + ) + if len(deprecated_arguments) > 0: + raise ValueError(f"Got unexpected arguments: {deprecated_arguments}") + + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + if input_ids is not None and inputs_embeds is not None: + raise ValueError( + "You cannot specify both input_ids and inputs_embeds at the same time" + ) + elif input_ids is not None: + batch_size, seq_length = input_ids.shape + elif inputs_embeds is not None: + batch_size, seq_length, _ = inputs_embeds.shape + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if past_key_values is None: + past_key_values = tuple([None] * len(self.h)) + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape batch_size x num_heads x N x N + # head_mask has shape n_layer x batch x num_heads x N x N + head_mask = self.get_head_mask(head_mask, self.config.n_layer) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + + hidden_states = self.word_embeddings_layernorm(inputs_embeds) + + presents = () if use_cache else None + all_self_attentions = () if output_attentions else None + all_hidden_states = () if output_hidden_states else None + + # Compute alibi tensor: check build_alibi_tensor documentation + seq_length_with_past = seq_length + past_key_values_length = 0 + if past_key_values[0] is not None: + past_key_values_length = past_key_values[0][0].shape[-1] + seq_length_with_past = seq_length_with_past + past_key_values_length + if attention_mask is None: + attention_mask = torch.ones( + (batch_size, seq_length_with_past), device=hidden_states.device + ) + else: + attention_mask = attention_mask.to(hidden_states.device) + + alibi = build_alibi_tensor(attention_mask, self.num_heads) + + causal_mask = self._prepare_attn_mask( + attention_mask, + input_shape=(batch_size, seq_length), + past_key_values_length=past_key_values_length, + ) + + if hasattr(self, "tp_rank"): + assert self.num_heads % self.tp_world_size == 0 + block_size = self.num_heads // self.tp_world_size + alibi = alibi[ + :, self.tp_rank * block_size : (self.tp_rank + 1) * block_size + ] + alibi = alibi.reshape(batch_size * block_size, 1, seq_length_with_past) + causal_mask = torch.repeat_interleave(causal_mask, block_size, dim=0) + else: + alibi = alibi.reshape(batch_size * self.num_heads, 1, seq_length_with_past) + causal_mask = torch.repeat_interleave(causal_mask, self.num_heads, dim=0) + + alibi = alibi.to(hidden_states.dtype) + + for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + outputs = block( + hidden_states, + layer_past=layer_past, + attention_mask=causal_mask, + head_mask=head_mask[i], + use_cache=use_cache, + output_attentions=output_attentions, + alibi=alibi, + ) + + hidden_states = outputs[0] + if use_cache is True: + presents = presents + (outputs[1],) + + if output_attentions: + all_self_attentions = all_self_attentions + ( + outputs[2 if use_cache else 1], + ) + + # Add last hidden state + hidden_states = 
self.ln_f(hidden_states) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + presents, + all_hidden_states, + all_self_attentions, + ] + if v is not None + ) + + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=presents, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +class BloomForCausalLM(BloomPreTrainedModel): + def __init__(self, prefix: str, config, weights): + super().__init__(config) + self.transformer = BloomModel(config, weights) + + self.lm_head = SpeculativeHead.load( + config, + prefix="word_embeddings", + weights=weights, + ) + + def prepare_inputs_for_generation( + self, + input_ids: torch.LongTensor, + past_key_values: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + **kwargs, + ) -> dict: + # only last token for input_ids if past is not None + if past_key_values: + input_ids = input_ids[:, -1].unsqueeze(-1) + + # the cache may be in the stardard format (e.g. in contrastive search), convert to bloom's format if needed + if past_key_values[0][0].shape[0] == input_ids.shape[0]: + past_key_values = self._convert_to_bloom_cache(past_key_values) + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + } + ) + return model_inputs + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **deprecated_arguments, + ) -> Union[Tuple, CausalLMOutputWithCrossAttentions]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set + `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100` + are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]` + """ + if deprecated_arguments.pop("position_ids", False) is not False: + # `position_ids` could have been `torch.Tensor` or `None` so defaulting pop to `False` allows to detect if users were passing explicitly `None` + warnings.warn( + "`position_ids` have no functionality in BLOOM and will be removed in v5.0.0. 
You can safely ignore" + " passing `position_ids`.", + FutureWarning, + ) + if len(deprecated_arguments) > 0: + raise ValueError(f"Got unexpected arguments: {deprecated_arguments}") + + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + transformer_outputs = self.transformer( + input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + + logits, speculative_logits = self.lm_head(hidden_states) + loss = None + + if not return_dict: + output = (logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return ( + CausalLMOutputWithCrossAttentions( + loss=loss, + logits=logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ), + speculative_logits, + ) diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/clip.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/clip.py new file mode 100644 index 000000000..ab824da5b --- /dev/null +++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/clip.py @@ -0,0 +1,817 @@ +from typing import Optional, Tuple + +import torch +from torch import nn + +from transformers.activations import ACT2FN +from transformers.modeling_attn_mask_utils import ( + _create_4d_causal_attention_mask, + _prepare_4d_attention_mask, +) +from transformers.modeling_outputs import ( + BaseModelOutputWithPooling, +) +from transformers import CLIPConfig, CLIPTextConfig, CLIPVisionConfig + +from text_generation_server.layers import ( + TensorParallelEmbedding, + TensorParallelColumnLinear, + TensorParallelRowLinear, +) + + +class CLIPVisionEmbeddings(nn.Module): + def __init__(self, prefix, config: CLIPVisionConfig, weights): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + + # TODO Should we TP this ? 
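        # [Editor's note] The class (CLS) token embedding below is loaded as a plain,
        # replicated tensor, and the Conv2d patch embedding likewise keeps its full
        # weight on every rank; only position_embedding is sharded via
        # TensorParallelEmbedding.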
+ self.class_embedding = weights.get_tensor(f"{prefix}.class_embedding") + + self.patch_embedding = nn.Conv2d( + in_channels=config.num_channels, + out_channels=self.embed_dim, + kernel_size=self.patch_size, + stride=self.patch_size, + bias=False, + ) + self.patch_embedding.weight = nn.Parameter( + weights.get_tensor(f"{prefix}.patch_embedding.weight"), requires_grad=False + ) + + self.num_patches = (self.image_size // self.patch_size) ** 2 + self.num_positions = self.num_patches + 1 + self.position_embedding = TensorParallelEmbedding( + prefix=f"{prefix}.position_embedding", weights=weights + ) + self.register_buffer( + "position_ids", + torch.arange(self.num_positions, device=weights.device).expand((1, -1)), + persistent=False, + ) + + def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: + batch_size = pixel_values.shape[0] + target_dtype = self.patch_embedding.weight.dtype + patch_embeds = self.patch_embedding( + pixel_values.to(dtype=target_dtype) + ) # shape = [*, width, grid, grid] + patch_embeds = patch_embeds.flatten(2).transpose(1, 2) + + class_embeds = self.class_embedding.expand(batch_size, 1, -1) + embeddings = torch.cat([class_embeds, patch_embeds], dim=1) + embeddings = embeddings + self.position_embedding(self.position_ids) + return embeddings + + +class CLIPTextEmbeddings(nn.Module): + def __init__(self, config: CLIPTextConfig): + super().__init__() + embed_dim = config.hidden_size + + self.token_embedding = nn.Embedding(config.vocab_size, embed_dim) + self.position_embedding = nn.Embedding( + config.max_position_embeddings, embed_dim + ) + + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.register_buffer( + "position_ids", + torch.arange(config.max_position_embeddings).expand((1, -1)), + persistent=False, + ) + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + ) -> torch.Tensor: + seq_length = ( + input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2] + ) + + if position_ids is None: + position_ids = self.position_ids[:, :seq_length] + + if inputs_embeds is None: + inputs_embeds = self.token_embedding(input_ids) + + position_embeddings = self.position_embedding(position_ids) + embeddings = inputs_embeds + position_embeddings + + return embeddings + + +class CLIPAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, prefix, config, weights): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_size = self.embed_dim // self.num_heads + if self.head_size * self.num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {self.num_heads})." 
+ ) + self.num_heads = self.num_heads // weights.process_group.size() + self.embed_dim = self.embed_dim // weights.process_group.size() + self.scale = self.head_size**-0.5 + self.dropout = config.attention_dropout + + self.qkv = TensorParallelColumnLinear.load_multi( + config, + prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"], + dim=0, + weights=weights, + bias=True, + ) + self.out_proj = TensorParallelRowLinear.load( + config, + prefix=f"{prefix}.out_proj", + weights=weights, + bias=True, + ) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return ( + tensor.view(bsz, seq_len, self.num_heads, self.head_size) + .transpose(1, 2) + .contiguous() + ) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + causal_attention_mask: Optional[torch.Tensor] = None, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + bsz, tgt_len, _ = hidden_states.size() + + # get query proj + + qkv = self.qkv(hidden_states) + query_states, key_states, value_states = qkv.split( + [ + self.head_size * self.num_heads, + ] + * 3, + dim=2, + ) + query_states = query_states * self.scale + key_states = self._shape(key_states, -1, bsz) + value_states = self._shape(value_states, -1, bsz) + + proj_shape = (bsz * self.num_heads, -1, self.head_size) + query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) + key_states = key_states.view(*proj_shape) + value_states = value_states.view(*proj_shape) + + src_len = key_states.size(1) + attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) + + if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): + raise ValueError( + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is" + f" {attn_weights.size()}" + ) + + # apply the causal_attention_mask first + if causal_attention_mask is not None: + if causal_attention_mask.size() != (bsz, 1, tgt_len, src_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is" + f" {causal_attention_mask.size()}" + ) + attn_weights = ( + attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + + causal_attention_mask + ) + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, tgt_len, src_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" + ) + attn_weights = ( + attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + + attention_mask + ) + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + attn_probs = nn.functional.dropout( + attn_weights, p=self.dropout, training=self.training + ) + + attn_output = torch.bmm(attn_probs, value_states) + + if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_size): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_size)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_size) + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, None + + +class CLIPMLP(nn.Module): + def __init__(self, prefix, config, weights): + super().__init__() + 
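        # [Editor's note] The two projections defined below follow the usual
        # Megatron-style sharding: fc1 (hidden -> intermediate) is column-parallel and
        # fc2 (intermediate -> hidden) is row-parallel, so partial results are combined
        # once on the way out of the MLP.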
self.config = config + self.activation_fn = ACT2FN[config.hidden_act] + self.fc1 = TensorParallelColumnLinear.load( + prefix=f"{prefix}.fc1", config=config, weights=weights, bias=True + ) + self.fc2 = TensorParallelRowLinear.load( + prefix=f"{prefix}.fc2", config=config, weights=weights, bias=True + ) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states = self.fc2(hidden_states) + return hidden_states + + +class CLIPEncoderLayer(nn.Module): + def __init__(self, prefix, config: CLIPConfig, weights): + super().__init__() + self.embed_dim = config.hidden_size + self.self_attn = CLIPAttention( + prefix=f"{prefix}.self_attn", config=config, weights=weights + ) + self.layer_norm1 = nn.LayerNorm.load( + prefix=f"{prefix}.layer_norm1", weights=weights, eps=config.layer_norm_eps + ) + self.mlp = CLIPMLP(prefix=f"{prefix}.mlp", config=config, weights=weights) + self.layer_norm2 = nn.LayerNorm.load( + prefix=f"{prefix}.layer_norm2", weights=weights, eps=config.layer_norm_eps + ) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + causal_attention_mask: torch.Tensor, + ): + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + `(config.encoder_attention_heads,)`. + """ + residual = hidden_states + + hidden_states = self.layer_norm1(hidden_states) + hidden_states, attn_weights = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + causal_attention_mask=causal_attention_mask, + ) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.layer_norm2(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + return hidden_states + + +class CLIPPreTrainedModel(nn.Module): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = CLIPConfig + base_model_prefix = "clip" + supports_gradient_checkpointing = True + + +CLIP_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`CLIPConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +CLIP_TEXT_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. 
+ + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) +""" + +CLIP_VISION_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using + [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details. +""" + +CLIP_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using + [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details. + return_loss (`bool`, *optional*): + Whether or not to return the contrastive loss. +""" + + +class CLIPEncoder(nn.Module): + """ + Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a + [`CLIPEncoderLayer`]. + + Args: + config: CLIPConfig + """ + + def __init__(self, prefix, config: CLIPConfig, weights): + super().__init__() + self.config = config + self.layers = nn.ModuleList( + [ + CLIPEncoderLayer( + prefix=f"{prefix}.layers.{i}", config=config, weights=weights + ) + for i in range(config.num_hidden_layers) + ] + ) + + def forward( + self, + inputs_embeds, + attention_mask: Optional[torch.Tensor] = None, + causal_attention_mask: Optional[torch.Tensor] = None, + ): + r""" + Args: + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. 
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Causal mask for the text model. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + """ + + hidden_states = inputs_embeds + for idx, encoder_layer in enumerate(self.layers): + hidden_states = encoder_layer( + hidden_states, + attention_mask, + causal_attention_mask, + ) + + return hidden_states + + +class CLIPTextTransformer(nn.Module): + def __init__(self, prefix: str, config: CLIPTextConfig, weights=None): + super().__init__() + self.config = config + embed_dim = config.hidden_size + self.embeddings = CLIPTextEmbeddings(config) + # Initialize weights and apply final processing with `self.post_init()` + self.encoder = CLIPEncoder( + prefix=f"{prefix}.encoder", config=config, weights=weights + ) + self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) + + # For `pooled_output` computation + self.eos_token_id = config.eos_token_id + + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + ): + r""" + Returns: + + """ + if input_ids is None: + raise ValueError("You have to specify input_ids") + + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + + hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids) + + # CLIP's text model uses causal mask, prepare it here. + # https://github.com/openai/CLIP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clip/model.py#L324 + causal_attention_mask = _create_4d_causal_attention_mask( + input_shape, hidden_states.dtype, device=hidden_states.device + ) + # expand attention_mask + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + attention_mask = _prepare_4d_attention_mask( + attention_mask, hidden_states.dtype + ) + + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + attention_mask=attention_mask, + causal_attention_mask=causal_attention_mask, + ) + + last_hidden_state = encoder_outputs[0] + last_hidden_state = self.final_layer_norm(last_hidden_state) + + if self.eos_token_id == 2: + # The `eos_token_id` was incorrect before PR #24773: Let's keep what have been done here. 
+ # A CLIP model with such `eos_token_id` in the config can't work correctly with extra new tokens added + # ------------------------------------------------------------ + # text_embeds.shape = [batch_size, sequence_length, transformer.width] + # take features from the eot embedding (eot_token is the highest number in each sequence) + # casting to torch.int for onnx compatibility: argmax doesn't support int64 inputs with opset 14 + last_hidden_state[ + torch.arange( + last_hidden_state.shape[0], device=last_hidden_state.device + ), + input_ids.to(dtype=torch.int, device=last_hidden_state.device).argmax( + dim=-1 + ), + ] + else: + # The config gets updated `eos_token_id` from PR #24773 (so the use of exta new tokens is possible) + last_hidden_state[ + torch.arange( + last_hidden_state.shape[0], device=last_hidden_state.device + ), + # We need to get the first position of `eos_token_id` value (`pad_token_ids` might equal to `eos_token_id`) + ( + input_ids.to(dtype=torch.int, device=last_hidden_state.device) + == self.eos_token_id + ) + .int() + .argmax(dim=-1), + ] + + return last_hidden_state + + +class CLIPTextModel(CLIPPreTrainedModel): + config_class = CLIPTextConfig + + _no_split_modules = ["CLIPTextEmbeddings", "CLIPEncoderLayer"] + + def __init__(self, prefix, config: CLIPTextConfig): + super().__init__(config) + self.text_model = CLIPTextTransformer(prefix, config) + # Initialize weights and apply final processing + self.post_init() + + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + ): + r""" + Returns: + + Examples: + + ```python + >>> from transformers import AutoTokenizer, CLIPTextModel + + >>> model = CLIPTextModel.from_pretrained("openai/clip-vit-base-patch32") + >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32") + + >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt") + + >>> outputs = model(**inputs) + >>> last_hidden_state = outputs.last_hidden_state + >>> pooled_output = outputs.pooler_output # pooled (EOS token) states + ```""" + + return self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + ) + + +class CLIPVisionTransformer(nn.Module): + def __init__(self, prefix, config: CLIPVisionConfig, weights): + super().__init__() + self.config = config + + self.embeddings = CLIPVisionEmbeddings( + prefix=f"{prefix}.embeddings", config=config, weights=weights + ) + self.pre_layrnorm = nn.LayerNorm.load( + prefix=f"{prefix}.pre_layrnorm", weights=weights, eps=config.layer_norm_eps + ) + self.encoder = CLIPEncoder( + prefix=f"{prefix}.encoder", config=config, weights=weights + ) + # self.post_layernorm = nn.LayerNorm.load(prefix=f"{prefix}.post_layernorm", weights=weights, eps=config.layer_norm_eps) + + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + ): + r""" + Returns: + + """ + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + hidden_states = self.embeddings(pixel_values) + hidden_states = self.pre_layrnorm(hidden_states) + + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + ) + last_hidden_state = encoder_outputs + # pooled_output = last_hidden_state[:, 0, :] + # pooled_output = self.post_layernorm(pooled_output) + + return BaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + # pooler_output=pooled_output, + # hidden_states=encoder_outputs, + ) + + 
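The two `eos_token_id` branches in `CLIPTextTransformer.forward` above differ only in how the end-of-text position is located before indexing into `last_hidden_state`: legacy configs (with `eos_token_id == 2`) rely on the eot token carrying the largest id in each row, while updated configs look for the first occurrence of the configured id. A standalone toy sketch of that selection (the token ids and pad id below are illustrative assumptions, not values taken from this patch):

```python
import torch

batch, seq_len, hidden = 2, 5, 8
eos_token_id = 49407  # eot is the highest id in the original CLIP vocabulary

input_ids = torch.tensor(
    [
        [49406, 320, 1125, 49407, 0],  # "<bos> a photo <eot> <pad>"; toy pad id 0
        [49406, 320, 49407, 0, 0],
    ]
)
last_hidden_state = torch.randn(batch, seq_len, hidden)

# Legacy branch (config said eos_token_id == 2): argmax over the raw token ids
# finds the eot position because eot has the largest id in the vocabulary.
pos_legacy = input_ids.to(torch.int).argmax(dim=-1)

# Updated branch: first position equal to eos_token_id, which stays correct
# even if tokens with larger ids are appended to the vocabulary.
pos_eos = (input_ids.to(torch.int) == eos_token_id).int().argmax(dim=-1)

pooled = last_hidden_state[torch.arange(batch), pos_eos]
print(pos_legacy.tolist(), pos_eos.tolist(), pooled.shape)
# [3, 2] [3, 2] torch.Size([2, 8])
```

Once extra tokens with higher ids are added to the vocabulary, only the first-occurrence variant keeps pointing at the eot position, which is why the updated configs switched to it.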
+class CLIPVisionModel(CLIPPreTrainedModel): + config_class = CLIPVisionConfig + main_input_name = "pixel_values" + _no_split_modules = ["CLIPEncoderLayer"] + + def __init__(self, config: CLIPVisionConfig): + super().__init__(config) + self.vision_model = CLIPVisionTransformer(config) + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> nn.Module: + return self.vision_model.embeddings.patch_embedding + + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + ): + r""" + Returns: + + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, CLIPVisionModel + + >>> model = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32") + >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, return_tensors="pt") + + >>> outputs = model(**inputs) + >>> last_hidden_state = outputs.last_hidden_state + >>> pooled_output = outputs.pooler_output # pooled CLS states + ```""" + + return self.vision_model( + pixel_values=pixel_values, + ) + + +class CLIPModel(nn.Module): + def __init__(self, prefix, config: CLIPConfig, weights): + super().__init__() + text_config = config.text_config + vision_config = config.vision_config + + self.projection_dim = config.projection_dim + self.text_embed_dim = text_config.hidden_size + self.vision_embed_dim = vision_config.hidden_size + + self.text_model = CLIPTextTransformer(text_config) + self.vision_model = CLIPVisionTransformer(vision_config) + + self.visual_projection = nn.Linear( + self.vision_embed_dim, self.projection_dim, bias=False + ) + self.text_projection = nn.Linear( + self.text_embed_dim, self.projection_dim, bias=False + ) + self.logit_scale = nn.Parameter( + torch.tensor(self.config.logit_scale_init_value) + ) + + # Initialize weights and apply final processing + self.post_init() + + def get_text_features( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + ) -> torch.FloatTensor: + r""" + Returns: + text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by + applying the projection layer to the pooled output of [`CLIPTextModel`]. + + Examples: + + ```python + >>> from transformers import AutoTokenizer, CLIPModel + + >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32") + >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32") + + >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt") + >>> text_features = model.get_text_features(**inputs) + ```""" + text_outputs = self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + ) + + pooled_output = text_outputs[1] + text_features = self.text_projection(pooled_output) + + return text_features + + def get_image_features( + self, + pixel_values: Optional[torch.FloatTensor] = None, + ) -> torch.FloatTensor: + r""" + Returns: + image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by + applying the projection layer to the pooled output of [`CLIPVisionModel`]. 
+ + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, CLIPModel + + >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32") + >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, return_tensors="pt") + + >>> image_features = model.get_image_features(**inputs) + ```""" + # Use CLIP model's config for some fields (if specified) instead of those of vision & text components. + vision_outputs = self.vision_model( + pixel_values=pixel_values, + ) + + pooled_output = vision_outputs[1] # pooled_output + image_features = self.visual_projection(pooled_output) + + return image_features + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + ): + r""" + Returns: + + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, CLIPModel + + >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32") + >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor( + ... text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True + ... ) + + >>> outputs = model(**inputs) + >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score + >>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities + ```""" + # Use CLIP model's config for some fields (if specified) instead of those of vision & text components. + vision_outputs = self.vision_model( + pixel_values=pixel_values, + ) + + text_outputs = self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + ) + + image_embeds = vision_outputs[1] + image_embeds = self.visual_projection(image_embeds) + + text_embeds = text_outputs[1] + text_embeds = self.text_projection(text_embeds) + + # normalized features + image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True) + text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True) + + # cosine similarity as logits + logit_scale = self.logit_scale.exp() + logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * logit_scale + logits_per_image = logits_per_text.t() + + return logits_per_image, logits_per_text diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_cohere_modeling.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_cohere_modeling.py new file mode 100644 index 000000000..30656038b --- /dev/null +++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_cohere_modeling.py @@ -0,0 +1,543 @@ +# coding=utf-8 +# Copyright 2024 Cohere team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import torch.distributed
+
+from torch import nn
+from transformers.activations import ACT2FN
+from typing import Optional, List, Tuple
+
+from text_generation_server.layers.attention import (
+    paged_attention,
+    attention,
+    reshape_and_cache,
+    Seqlen,
+)
+from text_generation_server.utils.import_utils import SYSTEM
+from text_generation_server.layers import (
+    TensorParallelRowLinear,
+    TensorParallelColumnLinear,
+    TensorParallelEmbedding,
+    SpeculativeHead,
+    get_linear,
+)
+from text_generation_server.layers.attention import PREFILL_IN_KV_CACHE
+from text_generation_server.layers.layernorm import (
+    FastLayerNorm,
+)
+from text_generation_server.layers.rotary import (
+    PositionRotaryEmbedding,
+)
+from text_generation_server.utils.weights import UnquantizedWeight
+
+if SYSTEM == "cuda":
+    import dropout_layer_norm
+else:
+    dropout_layer_norm = None
+
+
+class CohereRotary(PositionRotaryEmbedding):
+    def forward(
+        self,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        cos: torch.Tensor,
+        sin: torch.Tensor,
+    ):
+        # Such control flow may add some overhead.
+        if SYSTEM == "cuda":
+            import rotary_emb
+
+            q1 = query[..., ::2]
+            q2 = query[..., 1::2]
+
+            rotary_emb.apply_rotary(q1, q2, cos, sin, q1, q2, False)
+
+            k1 = key[..., ::2]
+            k2 = key[..., 1::2]
+
+            rotary_emb.apply_rotary(k1, k2, cos, sin, k1, k2, False)
+        elif SYSTEM == "rocm":
+            from vllm._C import ops
+
+            # NOTE: On RoCm systems, we use a ROPE implementation adapted from VLLM, which launches a single kernel for both query and key, contrary to the flash-attn implementation used on NVIDIA systems.
+            # When compiling the flash-attn rotary kernel on RoCm, hipcc appears unable to unroll loops, resulting in even slower inference than eager mode: https://github.com/pytorch/pytorch/issues/113773
+
+            head_size = query.shape[-1]
+
+            # Inplace operation, updating query and key.
+            ops.rotary_embedding(query, key, head_size, cos, sin, False)
+        elif SYSTEM == "ipex":
+            import intel_extension_for_pytorch as ipex
+
+            ipex.llm.functional.rotary_embedding(
+                query, key, sin, cos, query.size(-1), False
+            )
+        else:
+            raise ValueError(
+                "Your system does not seem to be supported. Please check your installation or open an issue at https://github.com/huggingface/text-generation-inference/issues with a clear reproduction."
+ ) + + +class CohereLayerNorm(nn.Module): + def __init__(self, prefix, weights, eps): + super().__init__() + weight = weights.get_sharded(f"{prefix}.weight", dim=0) + self.weight = nn.Parameter(weight) + # Fake weights + self.ones = weight.new_ones(weight.shape[1]) + self.eps = eps + + def forward(self, hidden_states): + if hidden_states.shape[-1] > 8192 or SYSTEM != "cuda": + hidden_states = hidden_states.reshape( + -1, self.weight.shape[0], self.weight.shape[1] + ) + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + mean = hidden_states.mean(-1, keepdim=True) + hidden_states_minus_mean = hidden_states - mean + variance = hidden_states_minus_mean.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states_minus_mean * torch.rsqrt(variance + self.eps) + hidden_states = self.weight.to(torch.float32) * hidden_states + hidden_states = hidden_states.view(-1, self.weight.shape[1]) + return hidden_states.to(input_dtype) + + ( + hidden_states, + *rest, + ) = dropout_layer_norm.dropout_add_ln_fwd( + hidden_states, + None, + self.ones, + None, + None, + None, + None, + None, + 0.0, + self.eps, + 1.0, + 0, + None, + False, + False, + ) + + # Required to apply one weight matrix per head + hidden_states = hidden_states.view( + -1, self.weight.shape[0], self.weight.shape[1] + ) + hidden_states = self.weight * hidden_states + hidden_states = hidden_states.view(-1, self.weight.shape[1]) + + return hidden_states + + +def load_attention(config, prefix, weights): + if config.num_attention_heads != config.num_key_value_heads: + return _load_gqa(config, prefix, weights) + else: + return TensorParallelColumnLinear.load_multi( + config, + prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"], + dim=0, + weights=weights, + bias=config.attention_bias, + ) + + +def _load_gqa(config, prefix: str, weights): + assert config.hidden_size % config.num_attention_heads == 0 + assert config.num_attention_heads % weights.process_group.size() == 0 + + weight = weights.get_multi_weights_col( + prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"], + dim=0, + ) + + if isinstance(weight, UnquantizedWeight): + weight.weight = weight.weight.to(dtype=weights.dtype).to(device=weights.device) + + head_size = config.hidden_size // config.num_attention_heads + num_heads = config.num_attention_heads // weights.process_group.size() + num_key_value_heads = config.num_key_value_heads // weights.process_group.size() + assert list(weight.weight.shape) == [ + (num_heads + 2 * num_key_value_heads) * head_size, + config.hidden_size, + ], f"{list(weight.weight.shape)} != {[(num_heads + 2 * config.num_key_value_heads) * head_size, config.hidden_size]}" + + if config.attention_bias: + w = [ + weights.get_sharded(f"{p}.bias", dim=0) + for p in [f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"] + ] + bias = torch.cat(w, dim=0).to(dtype=weights.dtype).to(device=weights.device) + else: + bias = None + + return TensorParallelColumnLinear(get_linear(weight, bias=bias)) + + +class FlashCohereAttention(torch.nn.Module): + def __init__( + self, + prefix: str, + config, + weights, + ): + super().__init__() + self.num_heads = config.num_attention_heads + self.hidden_size = config.hidden_size + self.head_size = self.hidden_size // self.num_heads + + self.rotary_emb = CohereRotary.static( + config=config, + dim=self.head_size, + base=config.rope_theta, + device=weights.device, + ) + + self.softmax_scale = self.head_size**-0.5 + + if self.num_heads % weights.process_group.size() 
!= 0: + raise ValueError( + f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} " + f"and `num_shards`: {weights.process_group.size()}" + ) + self.num_heads = self.num_heads // weights.process_group.size() + self.num_key_value_heads = ( + config.num_key_value_heads // weights.process_group.size() + ) + + self.query_key_value = load_attention(config, prefix, weights) + + self.use_qk_norm = config.use_qk_norm + if self.use_qk_norm: + self.q_norm = CohereLayerNorm( + prefix=f"{prefix}.q_norm", + weights=weights, + eps=config.layer_norm_eps, + ) + self.k_norm = CohereLayerNorm( + prefix=f"{prefix}.k_norm", + weights=weights, + eps=config.layer_norm_eps, + ) + else: + self.q_norm = None + self.k_norm = None + + self.o_proj = TensorParallelRowLinear.load( + config, + prefix=f"{prefix}.o_proj", + weights=weights, + bias=config.attention_bias, + ) + self.num_groups = self.num_heads // self.num_key_value_heads + self.kv_head_mapping = torch.arange( + 0, self.num_key_value_heads, dtype=torch.int32, device=weights.device + ).repeat_interleave(self.num_groups) + + def forward( + self, + hidden_states, + cos, + sin, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + seqlen, + max_s, + ): + qkv = self.query_key_value(hidden_states) + query, key, value = qkv.split( + [ + self.head_size * self.num_heads, + self.head_size * self.num_key_value_heads, + self.head_size * self.num_key_value_heads, + ], + dim=1, + ) + + if self.use_qk_norm: + query = query.reshape(-1, self.head_size) + key = key.reshape(-1, self.head_size) + query = self.q_norm(query.contiguous()) + key = self.k_norm(key.contiguous()) + + query = query.view(-1, self.num_heads, self.head_size) + key = key.view(-1, self.num_key_value_heads, self.head_size) + value = value.view(-1, self.num_key_value_heads, self.head_size) + + self.rotary_emb(query, key, cos, sin) + + reshape_and_cache(key, value, kv_cache[0], kv_cache[1], slots) + + # Prefill + if cu_seqlen_prefill is not None: + # flash attention + attn_output = attention( + query, + kv_cache[0] if PREFILL_IN_KV_CACHE else key, + kv_cache[1] if PREFILL_IN_KV_CACHE else value, + seqlen, + block_tables, + self.softmax_scale, + ) + # Decode + else: + attn_output = paged_attention( + query, + kv_cache[0], + kv_cache[1], + self.kv_head_mapping, + self.softmax_scale, + block_tables, + seqlen, + max_s, + ) + + return self.o_proj( + attn_output.view(-1, self.num_heads * self.head_size), reduce=False + ) + + +class CohereMLP(nn.Module): + def __init__(self, prefix, config, weights): + super().__init__() + act = config.hidden_act + self.act = ( + ACT2FN[act] + if "gelu" not in act + else lambda x: torch.nn.functional.gelu( + x, + approximate=( + "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none" + ), + ) + ) + # Fuse gate and up proj + self.gate_up_proj = TensorParallelColumnLinear.load_multi( + config, + prefixes=[f"{prefix}.gate_proj", f"{prefix}.up_proj"], + weights=weights, + dim=0, + bias=False, + ) + self.down_proj = TensorParallelRowLinear.load( + config, + prefix=f"{prefix}.down_proj", + weights=weights, + bias=False, + ) + self.intermediate_size = ( + config.intermediate_size // weights.process_group.size() + ) + + def forward(self, hidden_states): + gate_up_states = self.gate_up_proj(hidden_states) + gate_up_states = gate_up_states.view(-1, 2, self.intermediate_size) + return self.down_proj( + self.act(gate_up_states[:, 0]) * gate_up_states[:, 1], reduce=False + ) + + +class FlashCohereLayer(nn.Module): + def __init__(self, prefix: str, 
layer_id, config, weights): + super().__init__() + prefix = f"{prefix}.layers.{layer_id}" + self.self_attn = FlashCohereAttention( + prefix=f"{prefix}.self_attn", config=config, weights=weights + ) + self.mlp = CohereMLP(prefix=f"{prefix}.mlp", config=config, weights=weights) + + self.input_layernorm = FastLayerNorm.load_no_bias( + prefix=f"{prefix}.input_layernorm", + weights=weights, + eps=config.layer_norm_eps, + ) + self.process_group = weights.process_group + + def forward( + self, + hidden_states, + residual, + cos, + sin, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + seqlen, + max_s, + ): + normed_hidden_states, res = self.input_layernorm(hidden_states, residual) + + # Self Attention + attn_output = self.self_attn( + normed_hidden_states, + cos, + sin, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + seqlen, + max_s, + ) + + mlp_output = self.mlp(normed_hidden_states) + output = attn_output + mlp_output + + if self.process_group.size() > 1: + torch.distributed.all_reduce(output, group=self.process_group) + + return output, res + + +class FlashCohereModel(torch.nn.Module): + def __init__(self, prefix: str, config, weights): + super().__init__() + + process_group = weights.process_group + self.tp_rank = process_group.rank() + self.tp_world_size = process_group.size() + self.embed_tokens = TensorParallelEmbedding( + prefix=f"{prefix}.embed_tokens", weights=weights + ) + self.layers = nn.ModuleList( + [ + FlashCohereLayer( + prefix, + layer_id, + config, + weights, + ) + for layer_id in range(config.num_hidden_layers) + ] + ) + self.norm = FastLayerNorm.load_no_bias( + prefix=f"{prefix}.norm", weights=weights, eps=config.layer_norm_eps + ) + + self.gradient_checkpointing = False + + self.head_size = self.layers[0].self_attn.head_size + self.num_heads = self.layers[0].self_attn.num_heads + self.num_key_value_heads = self.layers[0].self_attn.num_key_value_heads + + def forward( + self, + input_ids: torch.Tensor, + position_ids: torch.Tensor, + cu_seqlen_prefill: Optional[torch.Tensor], + kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], + block_tables: torch.Tensor, + slots: torch.Tensor, + seqlen: torch.Tensor, + max_s: int, + ) -> torch.Tensor: + hidden_states = self.embed_tokens(input_ids) + + # Get rotary cos and sin for this forward + # Avoid to index in each layer + cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin( + position_ids, max_s, hidden_states.dtype + ) + + residual = None + + for i, layer in enumerate(self.layers): + hidden_states, residual = layer( + hidden_states, + residual, + cos, + sin, + cu_seqlen_prefill, + kv_cache[i], + block_tables, + slots, + seqlen, + max_s, + ) + + hidden_states, _ = self.norm(hidden_states, residual) + + return hidden_states + + +class FlashCohereForCausalLM(torch.nn.Module): + def __init__(self, prefix: str, config, weights): + super().__init__() + + if not prefix: + prefix = "model" + else: + prefix = f"{prefix}.model" + + self.model = FlashCohereModel(prefix, config, weights) + try: + self.lm_head = SpeculativeHead.load( + config, + prefix="lm_head", + weights=weights, + ) + except RuntimeError: + self.lm_head = SpeculativeHead.load( + config, + prefix=f"{prefix}.embed_tokens", + weights=weights, + ) + self.logit_scale = config.logit_scale + + def forward( + self, + input_ids: torch.Tensor, + position_ids: torch.Tensor, + cu_seqlen_prefill: Optional[torch.Tensor], + kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], + block_tables: torch.Tensor, + slots: torch.Tensor, + seqlen: Seqlen, + max_s: int, + 
prefill_cache_indices: Optional[torch.Tensor], + lm_head_indices: Optional[torch.Tensor] = None, + adapter_data: Optional[torch.Tensor] = None, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + hidden_states = self.model( + input_ids, + position_ids, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + seqlen, + max_s, + ) + if lm_head_indices is not None: + hidden_states = hidden_states[lm_head_indices] + logits, speculative_logits = self.lm_head(hidden_states) + logits *= self.logit_scale + if speculative_logits is not None: + speculative_logits *= self.logit_scale + return logits, speculative_logits diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_dbrx_modeling.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_dbrx_modeling.py new file mode 100644 index 000000000..1137a453f --- /dev/null +++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_dbrx_modeling.py @@ -0,0 +1,756 @@ +# coding=utf-8 +# Copyright 2022 HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import torch.distributed + +from torch import nn +from transformers.activations import ACT2FN +from transformers.configuration_utils import PretrainedConfig +from typing import Optional, List, Tuple, Any +from text_generation_server.utils.import_utils import SYSTEM + +if SYSTEM != "ipex": + from vllm.model_executor.layers.fused_moe import fused_moe + +from text_generation_server.layers.attention import ( + paged_attention, + attention, + reshape_and_cache, + Seqlen, + PREFILL_IN_KV_CACHE, +) +from text_generation_server.layers import ( + FastLinear, + TensorParallelRowLinear, + TensorParallelColumnLinear, + TensorParallelEmbedding, + SpeculativeHead, + get_linear, +) +from text_generation_server.layers.rotary import ( + PositionRotaryEmbedding, +) +from text_generation_server.layers.layernorm import ( + FastLayerNorm, +) + + +class DbrxAttentionConfig(PretrainedConfig): + def __init__( + self, + attn_pdrop: float = 0, + clip_qkv: Optional[float] = None, + kv_n_heads: int = 1, + rope_theta: float = 10000.0, + **kwargs: Any, + ): + super().__init__(**kwargs) + self.attn_pdrop = attn_pdrop + self.clip_qkv = clip_qkv + self.kv_n_heads = kv_n_heads + self.rope_theta = rope_theta + + for k in ["model_type"]: + if k in kwargs: + kwargs.pop(k) + if len(kwargs) != 0: + raise ValueError(f"Found unknown {kwargs=}") + + +class DbrxFFNConfig(PretrainedConfig): + def __init__( + self, + ffn_act_fn: Optional[dict] = None, + ffn_hidden_size: int = 3584, + moe_num_experts: int = 4, + moe_top_k: int = 1, + moe_jitter_eps: Optional[float] = None, + moe_loss_weight: float = 0.01, + moe_normalize_expert_weights: Optional[float] = 1, + uniform_expert_assignment: bool = False, + **kwargs: Any, + ): + super().__init__() + if ffn_act_fn is None: + ffn_act_fn = {"name": "silu"} + self.ffn_act_fn = ffn_act_fn + self.ffn_hidden_size = ffn_hidden_size + self.moe_num_experts = moe_num_experts + 
self.moe_top_k = moe_top_k
+        self.moe_jitter_eps = moe_jitter_eps
+        self.moe_loss_weight = moe_loss_weight
+        self.moe_normalize_expert_weights = moe_normalize_expert_weights
+        self.uniform_expert_assignment = uniform_expert_assignment
+
+        if uniform_expert_assignment:
+            raise ValueError("`uniform_expert_assignment = True` is not supported")
+
+        for k in ["model_type"]:
+            if k in kwargs:
+                kwargs.pop(k)
+        if len(kwargs) != 0:
+            raise ValueError(f"Found unknown {kwargs=}")
+
+
+class DbrxConfig(PretrainedConfig):
+    attribute_map = {
+        "hidden_size": "d_model",
+        "num_attention_heads": "n_heads",
+        "num_hidden_layers": "n_layers",
+    }
+
+    def __init__(
+        self,
+        d_model: int = 2048,
+        n_heads: int = 16,
+        n_layers: int = 24,
+        max_seq_len: int = 2048,
+        vocab_size: int = 32000,
+        resid_pdrop: float = 0.0,
+        emb_pdrop: float = 0.0,
+        attn_config: Optional[DbrxAttentionConfig] = None,
+        ffn_config: Optional[DbrxFFNConfig] = None,
+        use_cache: bool = True,
+        initializer_range: float = 0.02,
+        output_router_logits: bool = False,
+        router_aux_loss_coef: float = 0.05,
+        **kwargs: Any,
+    ):
+        if attn_config is None:
+            self.attn_config = DbrxAttentionConfig()
+        elif isinstance(attn_config, dict):
+            self.attn_config = DbrxAttentionConfig(**attn_config)
+        else:
+            self.attn_config = attn_config
+
+        if ffn_config is None:
+            self.ffn_config = DbrxFFNConfig()
+        elif isinstance(ffn_config, dict):
+            self.ffn_config = DbrxFFNConfig(**ffn_config)
+        else:
+            self.ffn_config = ffn_config
+
+        self.d_model = d_model
+        self.n_heads = n_heads
+        self.n_layers = n_layers
+        self.max_seq_len = max_seq_len
+        self.vocab_size = vocab_size
+        self.resid_pdrop = resid_pdrop
+        self.emb_pdrop = emb_pdrop
+        self.use_cache = use_cache
+        self.initializer_range = initializer_range
+        self.output_router_logits = output_router_logits
+        self.router_aux_loss_coef = router_aux_loss_coef
+
+        tie_word_embeddings = kwargs.pop("tie_word_embeddings", False)
+        if tie_word_embeddings:
+            raise ValueError("tie_word_embeddings is not supported for Dbrx models.")
+
+        super().__init__(
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+    @property
+    def num_key_value_heads(self):
+        # We can't use the attribute map, since the number of KV
+        # heads is not top-level.
+ return self.attn_config.kv_n_heads + + +def promote_scalar(x: torch.Tensor) -> torch.Tensor: + return x.view(1) if len(x.size()) == 0 else x + + +def load_attention(config, prefix, weights): + return TensorParallelColumnLinear.load_qkv( + config, + prefix=f"{prefix}.Wqkv", + weights=weights, + bias=False, + num_heads=config.n_heads, + num_key_value_heads=config.attn_config.kv_n_heads, + ) + + +def _load_experts(config, prefix, weights): + world_size = weights.process_group.size() + rank = weights.process_group.rank() + + assert ( + config.ffn_config.ffn_hidden_size % world_size == 0 + ), f"The chosen size {config.ffn_config.ffn_hidden_size} is not compatible with sharding on {world_size} shards" + + expert_size = config.ffn_config.ffn_hidden_size + block_size = expert_size // world_size + start = rank * block_size + stop = (rank + 1) * block_size + + tensor = torch.empty( + (config.ffn_config.moe_num_experts * block_size, config.d_model), + dtype=weights.dtype, + device=weights.device, + ) + + slice_ = weights._get_slice(f"{prefix}") + + for i in range(config.ffn_config.moe_num_experts): + offset = i * expert_size + expert_slice = slice_[start + offset : stop + offset] + + tensor[i * block_size : (i + 1) * block_size] = expert_slice.to( + dtype=weights.dtype + ).to(device=weights.device) + return tensor + + +def _load_experts_quantized(config, prefix, weights, cls): + world_size = weights.process_group.size() + rank = weights.process_group.rank() + + assert ( + config.ffn_config.ffn_hidden_size % world_size == 0 + ), f"The chosen size {config.ffn_config.ffn_hidden_size} is not compatible with sharding on {world_size} shards" + + expert_size = config.ffn_config.ffn_hidden_size + block_size = expert_size // world_size + start = rank * block_size + stop = (rank + 1) * block_size + + slice_ = weights._get_slice(f"{prefix}") + + experts = [] + for i in range(config.ffn_config.moe_num_experts): + if config.quantize in ["gptq", "awq"]: + raise NotImplementedError( + "Dbrx does not support gptq/awq quantization yet." 
+ ) + else: + offset = i * expert_size + expert_slice = ( + slice_[start + offset : stop + offset] + .to(dtype=weights.dtype) + .to(device=weights.device) + ) + + if cls == TensorParallelRowLinear: + expert_slice = expert_slice.t().contiguous() + linear = get_linear(expert_slice, None) + experts.append(cls(linear, weights.process_group)) + else: + linear = get_linear(expert_slice, None) + experts.append(cls(linear)) + + return experts + + +class DbrxAttention(torch.nn.Module): + def __init__( + self, + prefix: str, + config, + weights, + ): + super().__init__() + self.clip_qkv = config.attn_config.clip_qkv + self.num_heads = config.n_heads + self.hidden_size = config.d_model + self.head_size = self.hidden_size // self.num_heads + + self.rotary_emb = PositionRotaryEmbedding.static( + config=config, + dim=self.head_size, + base=config.attn_config.rope_theta, + device=weights.device, + ) + + self.softmax_scale = self.head_size**-0.5 + + if self.num_heads % weights.process_group.size() != 0: + raise ValueError( + f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} " + f"and `num_shards`: {weights.process_group.size()}" + ) + self.num_heads = self.num_heads // weights.process_group.size() + self.num_key_value_heads = ( + config.attn_config.kv_n_heads // weights.process_group.size() + ) + + self.query_key_value = load_attention(config, prefix, weights) + + self.o_proj = TensorParallelRowLinear.load( + config, + prefix=f"{prefix}.out_proj", + weights=weights, + bias=False, + ) + self.num_groups = self.num_heads // self.num_key_value_heads + self.kv_head_mapping = torch.arange( + 0, self.num_key_value_heads, dtype=torch.int32, device=weights.device + ).repeat_interleave(self.num_groups) + + def forward( + self, + hidden_states, + cos, + sin, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + seqlen, + max_s, + ): + qkv = self.query_key_value(hidden_states) + if self.clip_qkv is not None: + qkv = qkv.clamp(min=-self.clip_qkv, max=self.clip_qkv) + + query, kv = qkv.split( + [ + self.head_size * self.num_heads, + 2 * self.head_size * self.num_key_value_heads, + ], + dim=1, + ) + query = query.view(-1, self.num_heads, self.head_size) + kv = kv.view(-1, 2, self.num_key_value_heads, self.head_size) + + self.rotary_emb(query, torch.select(kv, dim=1, index=0), cos, sin) + + reshape_and_cache(kv[:, 0], kv[:, 1], kv_cache[0], kv_cache[1], slots) + + # Prefill + if cu_seqlen_prefill is not None: + # flash attention + attn_output = attention( + query, + kv_cache[0] if PREFILL_IN_KV_CACHE else kv[:, 0], + kv_cache[1] if PREFILL_IN_KV_CACHE else kv[:, 1], + seqlen, + block_tables, + self.softmax_scale, + ) + # Decode + else: + attn_output = paged_attention( + query, + kv_cache[0], + kv_cache[1], + self.kv_head_mapping, + self.softmax_scale, + block_tables, + seqlen, + max_s, + ) + + return self.o_proj(attn_output.view(-1, self.num_heads * self.head_size)) + + +class DbrxNormAttentionNorm(nn.Module): + def __init__( + self, + prefix: str, + config, + weights, + ): + super().__init__() + self.norm_1 = FastLayerNorm.load_no_bias( + prefix=f"{prefix}.norm_1", weights=weights, eps=1e-5 + ) + self.self_attn = DbrxAttention( + prefix=f"{prefix}.attn", config=config, weights=weights + ) + self.norm_2 = FastLayerNorm.load_no_bias( + prefix=f"{prefix}.norm_2", + weights=weights, + eps=1e-5, + ) + + def forward( + self, + hidden_states, + residual, + cos, + sin, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + seqlen, + max_s, + ): + normed_hidden_states, res = 
self.norm_1(hidden_states, residual) + + # Self Attention + attn_output = self.self_attn( + normed_hidden_states, + cos, + sin, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + seqlen, + max_s, + ) + + # faster post attention rms norm + normed_attn_res_output, attn_res = self.norm_2(attn_output, res) + + return normed_attn_res_output, attn_res + + +@torch.jit.script +def select_experts( + gate_logits: torch.Tensor, top_k: int, moe_normalize_expert_weights: int +): + # all_probs: (sequence_length, n_experts) and upcast for softmax + all_probs = torch.nn.functional.softmax(gate_logits, dim=1, dtype=torch.float) + # weights, selected_experts: (sequence_length, top-k) + weights, selected_experts = torch.topk(all_probs, top_k, dim=-1) + if moe_normalize_expert_weights: + weights = weights / torch.norm( + weights, p=moe_normalize_expert_weights, dim=-1, keepdim=True + ) + weights = weights.view(-1) + selected_experts = selected_experts.view(-1) + + return selected_experts, weights + + +@torch.jit.script +def round_up(x: torch.Tensor, value: int): + return torch.div(x + (value - 1), value, rounding_mode="trunc") * value + + +class BlockSparseMoE(nn.Module): + def __init__(self, prefix, config: DbrxConfig, weights): + super().__init__() + self.moe_normalize_expert_weights = ( + config.ffn_config.moe_normalize_expert_weights + ) + self.hidden_dim = config.d_model + self.ffn_dim = config.ffn_config.ffn_hidden_size // weights.process_group.size() + self.num_experts = config.ffn_config.moe_num_experts + self.top_k = config.ffn_config.moe_top_k + + act = config.ffn_config.ffn_act_fn["name"] + if "gelu" in act: + self.act = lambda x: torch.nn.functional.gelu( + x, + approximate=( + "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none" + ), + ) + elif "silu" in act: + self.act = torch.nn.functional.silu + else: + self.act = ACT2FN[act] + + # gating + self.gate = FastLinear.load( + config, f"{prefix}.router.layer", weights, bias=False + ) + + # merged expert weights, all of size (n_experts * ffn_dim, hidden_dim) + w1 = _load_experts(config, f"{prefix}.experts.mlp.w1", weights).view( + self.num_experts, self.ffn_dim, self.hidden_dim + ) + v1 = _load_experts(config, f"{prefix}.experts.mlp.v1", weights).view( + self.num_experts, self.ffn_dim, self.hidden_dim + ) + self.wv1 = torch.cat([w1, v1], dim=1) + self.w2 = ( + _load_experts(config, f"{prefix}.experts.mlp.w2", weights) + .view(self.num_experts, self.ffn_dim, self.hidden_dim) + .transpose(1, 2) + .contiguous() + ) + + self.process_group = weights.process_group + + def forward(self, x: torch.Tensor) -> torch.Tensor: + # router_logits: (num_tokens, n_experts) + router_logits = self.gate(x) + out = fused_moe( + x, + self.wv1, + self.w2, + router_logits, + self.top_k, + renormalize=self.moe_normalize_expert_weights, + inplace=True, + ) + + # Reduce sum + if self.process_group.size() > 1: + torch.distributed.all_reduce(out, group=self.process_group) + + return out.view(*x.shape) + + +class DenseMoE(nn.Module): + def __init__(self, prefix, config: DbrxConfig, weights): + super().__init__() + + self.moe_normalize_expert_weights = ( + config.ffn_config.moe_normalize_expert_weights + ) + self.hidden_dim = config.d_model + self.ffn_dim = config.ffn_config.ffn_hidden_size // weights.process_group.size() + self.num_experts = config.ffn_config.moe_num_experts + self.top_k = config.ffn_config.moe_top_k + + act = config.ffn_config.ffn_act_fn["name"] + if "gelu" in act: + self.act = lambda x: torch.nn.functional.gelu( + x, + approximate=( + "tanh" 
if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none" + ), + ) + elif "silu" in act: + self.act = torch.nn.functional.silu + else: + self.act = ACT2FN[act] + + # gating + self.gate = FastLinear.load( + config, f"{prefix}.router.layer", weights, bias=False + ) + + self.w1 = _load_experts_quantized( + config, + prefix=f"{prefix}.experts.mlp.w1", + weights=weights, + cls=TensorParallelColumnLinear, + ) + self.w2 = _load_experts_quantized( + config, + prefix=f"{prefix}.experts.mlp.w2", + weights=weights, + cls=TensorParallelRowLinear, + ) + self.v1 = _load_experts_quantized( + config, + prefix=f"{prefix}.experts.mlp.v1", + weights=weights, + cls=TensorParallelColumnLinear, + ) + + self.process_group = weights.process_group + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """ + x: (sequence_length, model_dim) + gate_logits: (sequence_length, n_experts) + """ + # optional reshape + input_shape = x.shape + x = x.view(-1, input_shape[-1]) + + # gate_logits: (sequence_length, n_experts) + gate_logits = self.gate(x) + # all_probs: (sequence_length, n_experts) and upcast for softmax + weights = torch.nn.functional.softmax(gate_logits, dim=1, dtype=torch.float) + + if self.top_k < self.num_experts: + _, not_selected_experts = torch.topk( + weights, + self.num_experts - self.top_k, + largest=False, + sorted=False, + dim=1, + ) + # Mask not selected experts + weights.scatter_(1, not_selected_experts, 0) + + # Re-normalize + if self.moe_normalize_expert_weights: + weights = weights / torch.norm( + weights, p=self.moe_normalize_expert_weights, dim=-1, keepdim=True + ) + weights = weights.to(x.dtype) + + # Final output tensor + out = x.new_zeros(x.shape[0], self.hidden_dim) + for i in range(self.num_experts): + h = self.act(self.w1[i](x)) * self.v1[i](x) + h = self.w2[i](h, reduce=False) + # Add expert output to out with masking + out += h * weights[:, i].view(-1, 1) + + # Reduce sum + if self.process_group.size() > 1: + torch.distributed.all_reduce(out, group=self.process_group) + + return out + + +class DbrxLayer(nn.Module): + def __init__(self, prefix: str, layer_id, config, weights): + super().__init__() + prefix = f"{prefix}.blocks.{layer_id}" + + self.attn = DbrxNormAttentionNorm( + prefix=f"{prefix}.norm_attn_norm", config=config, weights=weights + ) + + moe_cls = BlockSparseMoE if config.quantize is None else DenseMoE + self.moe = moe_cls(f"{prefix}.ffn", config, weights) + + def forward( + self, + hidden_states, + residual, + cos, + sin, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + seqlen, + max_s, + ): + # Self Attention + attn_output, attn_res = self.attn( + hidden_states, + residual, + cos, + sin, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + seqlen, + max_s, + ) + + moe_output = self.moe(attn_output) + + return moe_output, attn_res + + +class DbrxModel(torch.nn.Module): + def __init__(self, prefix: str, config, weights): + super().__init__() + + self.embed_tokens = TensorParallelEmbedding( + prefix=f"{prefix}.wte", weights=weights + ) + + self.layers = nn.ModuleList( + [ + DbrxLayer( + prefix, + layer_id, + config, + weights, + ) + for layer_id in range(config.n_layers) + ] + ) + self.norm = FastLayerNorm.load_no_bias( + prefix=f"{prefix}.norm_f", weights=weights, eps=1e-5 + ) + + self.head_size = self.layers[0].attn.self_attn.head_size + self.num_heads = self.layers[0].attn.self_attn.num_heads + self.num_key_value_heads = self.layers[0].attn.self_attn.num_key_value_heads + + def forward( + self, + input_ids: torch.Tensor, + position_ids: torch.Tensor, 
+ cu_seqlen_prefill: Optional[torch.Tensor], + kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], + block_tables: torch.Tensor, + slots: torch.Tensor, + seqlen: Seqlen, + max_s: int, + ) -> torch.Tensor: + hidden_states = self.embed_tokens(input_ids) + + # Get rotary cos and sin for this forward + # Avoid to index in each layer + cos, sin = self.layers[0].attn.self_attn.rotary_emb.get_cos_sin( + position_ids, max_s, hidden_states.dtype + ) + + residual = None + for i, layer in enumerate(self.layers): + hidden_states, residual = layer( + hidden_states, + residual, + cos, + sin, + cu_seqlen_prefill, + kv_cache[i], + block_tables, + slots, + seqlen, + max_s, + ) + + hidden_states, _ = self.norm(hidden_states, residual) + + return hidden_states + + +class FlashDbrxForCausalLM(torch.nn.Module): + def __init__(self, prefix: str, config, weights): + super().__init__() + + if not prefix: + prefix = "transformer" + else: + prefix = f"{prefix}.transformer" + + self.model = DbrxModel(prefix, config, weights) + self.lm_head = SpeculativeHead.load( + config, + prefix="lm_head", + weights=weights, + ) + + def forward( + self, + input_ids: torch.Tensor, + position_ids: torch.Tensor, + cu_seqlen_prefill: Optional[torch.Tensor], + kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], + block_tables: torch.Tensor, + slots: torch.Tensor, + seqlen: Seqlen, + max_s: int, + prefill_cache_indices: Optional[torch.Tensor], + lm_head_indices: Optional[torch.Tensor] = None, + adapter_data: Optional[torch.Tensor] = None, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + hidden_states = self.model( + input_ids, + position_ids, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + seqlen, + max_s, + ) + if lm_head_indices is not None: + hidden_states = hidden_states[lm_head_indices] + logits, speculative_logits = self.lm_head(hidden_states) + return logits, speculative_logits diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_deepseek_v2_modeling.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_deepseek_v2_modeling.py new file mode 100644 index 000000000..88c2cf803 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_deepseek_v2_modeling.py @@ -0,0 +1,659 @@ +# coding=utf-8 +# Copyright 2023, 2024 DeepSeek-AI and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
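Looking back at the DBRX expert routing above, `select_experts` boils down to a softmax over the router logits, a top-k, and an optional p-norm renormalisation of the kept weights (`moe_normalize_expert_weights`). A self-contained toy sketch of that routing step, with made-up logits rather than anything from this patch:

```python
import torch


def route_tokens(gate_logits: torch.Tensor, top_k: int, norm_p: float):
    # gate_logits: (num_tokens, n_experts); upcast for a stable softmax.
    all_probs = torch.nn.functional.softmax(gate_logits, dim=1, dtype=torch.float)
    # Keep the top-k experts per token together with their routing weights.
    weights, selected_experts = torch.topk(all_probs, top_k, dim=-1)
    if norm_p:
        # p=1 turns this into a plain sum, so each token's kept weights sum to 1.
        weights = weights / torch.norm(weights, p=norm_p, dim=-1, keepdim=True)
    return selected_experts, weights


# Two tokens routed over four experts with top-2 routing.
logits = torch.tensor([[2.0, 0.1, 1.5, -1.0], [0.0, 3.0, 0.5, 0.2]])
experts, weights = route_tokens(logits, top_k=2, norm_p=1.0)
print(experts.tolist())     # [[0, 2], [1, 2]]
print(weights.sum(dim=-1))  # tensor([1., 1.])
```

The dense fallback (`DenseMoE`) implements the same idea the other way around: it zeroes the weights of the experts that are not selected and then mixes every expert's output with the resulting per-token weights.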
+ +from typing import List, Optional, Tuple, Type + +import torch +import torch.distributed +from torch import nn +from transformers.activations import ACT2FN +from transformers.configuration_utils import PretrainedConfig + +from text_generation_server.layers import ( + FastLinear, + SpeculativeHead, + TensorParallelColumnLinear, + TensorParallelEmbedding, + TensorParallelRowLinear, + get_linear, +) +from text_generation_server.layers.attention import ( + Seqlen, + attention, + paged_attention, + reshape_and_cache, +) +from text_generation_server.layers.attention import PREFILL_IN_KV_CACHE +from text_generation_server.layers.layernorm import FastRMSNorm +from text_generation_server.layers.moe import DenseMoELayer, MoELayer, SparseMoELayer +from text_generation_server.layers.rotary import PositionRotaryEmbedding, get_mscale +from text_generation_server.utils.import_utils import SYSTEM +from text_generation_server.utils.weights import Weights + +if SYSTEM == "rocm": + try: + from vllm import _custom_C + except Exception as e: + raise ImportError(f"Could not load `vllm._custom_C`. Full error: {e}") + + +class DeepseekV2Config(PretrainedConfig): + def __init__( + self, + vocab_size=102400, + hidden_size=4096, + intermediate_size=11008, + moe_intermediate_size=1407, + num_hidden_layers=30, + num_attention_heads=32, + num_key_value_heads=32, + n_shared_experts=2, + n_routed_experts=160, + ep_size=1, + routed_scaling_factor=1.0, + kv_lora_rank=512, + q_lora_rank=1536, + qk_rope_head_dim=64, + v_head_dim=128, + qk_nope_head_dim=128, + topk_method="gready", + n_group=8, + topk_group=3, + num_experts_per_tok=6, + moe_layer_freq=1, + first_k_dense_replace=0, + norm_topk_prob=False, + scoring_func="softmax", + aux_loss_alpha=0.001, + seq_aux=True, + hidden_act="silu", + max_position_embeddings=2048, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + pad_token_id=None, + bos_token_id=100000, + eos_token_id=100001, + pretraining_tp=1, + tie_word_embeddings=False, + rope_theta=10000.0, + rope_scaling=None, + attention_bias=False, + attention_dropout=0.0, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.moe_intermediate_size = moe_intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.n_shared_experts = n_shared_experts + self.n_routed_experts = n_routed_experts + self.ep_size = ep_size + self.routed_scaling_factor = routed_scaling_factor + self.kv_lora_rank = kv_lora_rank + self.q_lora_rank = q_lora_rank + self.qk_rope_head_dim = qk_rope_head_dim + self.v_head_dim = v_head_dim + self.qk_nope_head_dim = qk_nope_head_dim + self.topk_method = topk_method + self.n_group = n_group + self.topk_group = topk_group + self.num_experts_per_tok = num_experts_per_tok + self.moe_layer_freq = moe_layer_freq + self.first_k_dense_replace = first_k_dense_replace + self.norm_topk_prob = norm_topk_prob + self.scoring_func = scoring_func + self.aux_loss_alpha = aux_loss_alpha + self.seq_aux = seq_aux + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.pretraining_tp = pretraining_tp + self.use_cache = use_cache + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + 
self.attention_bias = attention_bias + self.attention_dropout = attention_dropout + + tie_word_embeddings = kwargs.pop("tie_word_embeddings", False) + if tie_word_embeddings: + raise ValueError( + "tie_word_embeddings is not supported for Deepseek V2 models." + ) + + if ep_size != 1: + raise ValueError( + f"Currently only ep_size == 1 is supported for Deepseek V2 models, was {ep_size}" + ) + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + +class DeepseekV2Attention(torch.nn.Module): + def __init__( + self, + prefix: str, + config, + weights: Weights, + ): + super().__init__() + self.num_heads = config.num_attention_heads + self.hidden_size = config.hidden_size + self.kv_lora_rank = config.kv_lora_rank + self.q_lora_rank = config.q_lora_rank + self.qk_nope_head_dim = config.qk_nope_head_dim + self.qk_rope_head_dim = config.qk_rope_head_dim + self.head_size = config.qk_nope_head_dim + config.qk_rope_head_dim + self.value_head_size = config.v_head_dim + self.head_pad_size = max(self.head_size, self.value_head_size) + + self.rotary_emb = PositionRotaryEmbedding.static( + config=config, + dim=self.qk_rope_head_dim, + base=config.rope_theta, + device=weights.device, + ) + + mscale = get_mscale( + self.rotary_emb.scaling_factor, self.rotary_emb.mscale_all_dim + ) + self.softmax_scale = self.head_size**-0.5 * mscale * mscale + + if self.num_heads % weights.process_group.size() != 0: + raise ValueError( + f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} " + f"and `num_shards`: {weights.process_group.size()}" + ) + self.num_heads = self.num_heads // weights.process_group.size() + self.num_key_value_heads = ( + config.num_key_value_heads // weights.process_group.size() + ) + + if self.q_lora_rank is None: + self.q_proj = TensorParallelColumnLinear.load( + config, + prefix=f"{prefix}.q_proj", + weights=weights, + bias=config.attention_bias, + ) + else: + self.q_a_proj = get_linear( + weight=weights.get_weights(f"{prefix}.q_a_proj"), + bias=( + weights.get_tensor(f"{prefix}.q_a_proj.bias") + if config.attention_bias + else None + ), + ) + self.q_a_layernorm = FastRMSNorm.load( + prefix=f"{prefix}.q_a_layernorm", + weights=weights, + eps=config.rms_norm_eps, + ) + self.q_b_proj = TensorParallelColumnLinear.load( + config, + prefix=f"{prefix}.q_b_proj", + weights=weights, + bias=config.attention_bias, + ) + + self.kv_a_proj_with_mqa = get_linear( + weight=weights.get_weights(f"{prefix}.kv_a_proj_with_mqa"), + bias=( + weights.get_tensor(f"{prefix}.kv_a_proj_with_mqa.bias") + if config.attention_bias + else None + ), + ) + + self.kv_a_layernorm = FastRMSNorm.load( + prefix=f"{prefix}.kv_a_layernorm", weights=weights, eps=config.rms_norm_eps + ) + + self.kv_b_proj = TensorParallelColumnLinear.load( + config, + prefix=f"{prefix}.kv_b_proj", + weights=weights, + bias=config.attention_bias, + ) + + self.o_proj = TensorParallelRowLinear.load( + config, + prefix=f"{prefix}.o_proj", + weights=weights, + bias=False, + ) + self.num_groups = self.num_heads // self.num_key_value_heads + self.kv_head_mapping = torch.arange( + 0, self.num_key_value_heads, dtype=torch.int32, device=weights.device + ).repeat_interleave(self.num_groups) + + def forward( + self, + hidden_states: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, + cu_seqlen_prefill: torch.Tensor, + kv_cache: Tuple[torch.Tensor, torch.Tensor], + block_tables: torch.Tensor, + slots: torch.Tensor, + 
seqlen: Seqlen, + max_s: int, + ): + if self.q_lora_rank is None: + query = self.q_proj(hidden_states) + else: + query = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states))[0]) + query = query.view(-1, self.num_heads, self.head_size) + + _, query_pe = torch.split( + query, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1 + ) + + compressed_kv = self.kv_a_proj_with_mqa(hidden_states) + compressed_kv, key_pe = torch.split( + compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1 + ) + + key_pe = key_pe.view(-1, 1, self.qk_rope_head_dim) + kv = self.kv_b_proj(self.kv_a_layernorm(compressed_kv.contiguous())[0]).view( + -1, self.num_key_value_heads, self.qk_nope_head_dim + self.value_head_size + ) + + key_nope, value = torch.split( + kv, [self.qk_nope_head_dim, self.value_head_size], dim=-1 + ) + + batch_size, heads, head_dim = query_pe.shape + query_pe = ( + query_pe.view(batch_size, heads, head_dim // 2, 2) + .transpose(2, 3) + .reshape(batch_size, heads, head_dim) + ) + batch_size, heads, head_dim = key_pe.shape + key_pe = ( + key_pe.view(batch_size, heads, head_dim // 2, 2) + .transpose(2, 3) + .reshape(batch_size, heads, head_dim) + ) + self.rotary_emb(query_pe, key_pe, cos, sin) + + query[..., self.qk_nope_head_dim :] = query_pe + key = torch.empty_like(query) + key[..., : self.qk_nope_head_dim] = key_nope + key[..., self.qk_nope_head_dim :] = key_pe + + # We need to pad the heads because Flash Attention does not support + # qk and v with different head sizes. + query = torch.nn.functional.pad( + query, (0, self.head_pad_size - self.head_size), value=0 + ) + key = torch.nn.functional.pad( + key, (0, self.head_pad_size - self.head_size), value=0 + ) + value = torch.nn.functional.pad( + value, (0, self.head_pad_size - self.value_head_size), value=0 + ) + + reshape_and_cache(key, value, kv_cache[0], kv_cache[1], slots) + + # Prefill + if cu_seqlen_prefill is not None: + # flash attention + attn_output = attention( + query, + kv_cache[0] if PREFILL_IN_KV_CACHE else key, + kv_cache[1] if PREFILL_IN_KV_CACHE else value, + seqlen, + block_tables, + self.softmax_scale, + ) + # Decode + else: + attn_output = paged_attention( + query, + kv_cache[0], + kv_cache[1], + self.kv_head_mapping, + self.softmax_scale, + block_tables, + seqlen, + max_s, + ) + + # Remove padding. + attn_output = attn_output[..., : self.value_head_size] + + return self.o_proj( + attn_output.reshape(-1, self.num_heads * self.value_head_size) + ) + + +class DeepseekV2MLP(nn.Module): + def __init__(self, prefix: str, config, weights, intermediate_size: int): + super().__init__() + self.hidden_act = config.hidden_act + if self.hidden_act != "silu": + # Bail out because MoE only supports silu. + raise NotImplementedError( + "Currently only `silu` is supported as an activation for Deepseek V2." + ) + self.act = ACT2FN[self.hidden_act] + + self.gate_up_proj = TensorParallelColumnLinear.load_multi( + config, + prefixes=[f"{prefix}.gate_proj", f"{prefix}.up_proj"], + weights=weights, + dim=0, + bias=False, + ) + + self.down_proj = TensorParallelRowLinear.load( + config, + prefix=f"{prefix}.down_proj", + weights=weights, + bias=False, + ) + + self.intermediate_size = intermediate_size // weights.process_group.size() + + # TODO: This is a hotfix to be removed & properly refactored. 
+ self.quantize = config.quantize + + def forward(self, hidden_states: torch.Tensor, reduce: bool = True): + if ( + SYSTEM == "rocm" + and self.hidden_act == "silu" + and hidden_states.dtype == torch.float16 + and hidden_states.shape[0] == 1 + and not self.quantize + ): + out = torch.empty( + hidden_states.shape[0], + self.intermediate_size, + dtype=hidden_states.dtype, + device="cuda", + ) + _custom_C.LLMM_Silu(self.gate_up_proj.linear.weight, hidden_states, out, 8) + return self.down_proj(out, reduce=reduce) + else: + gate_up_states = self.gate_up_proj(hidden_states) + gate_up_states = gate_up_states.view(-1, 2, self.intermediate_size) + return self.down_proj( + self.act(gate_up_states[:, 0]) * gate_up_states[:, 1], reduce=reduce + ) + + +class DeepseekV2MoE(nn.Module): + def __init__( + self, + prefix, + config: DeepseekV2Config, + moe_layer_cls: Type[MoELayer], + weights, + ): + super().__init__() + + self.hidden_dim = config.hidden_size + self.moe_intermediate_size = ( + config.moe_intermediate_size // weights.process_group.size() + ) + self.routed_scaling_factor = config.routed_scaling_factor + + # Gating + self.gate = FastLinear.load(config, f"{prefix}.gate", weights, bias=False) + + self.moe_layer = moe_layer_cls( + prefix=f"{prefix}.experts", + n_experts=config.n_routed_experts, + n_expert_group=config.n_group, + renormalize=config.norm_topk_prob, + topk=config.num_experts_per_tok, + topk_group=config.topk_group, + weights=weights, + ) + assert isinstance(self.moe_layer, MoELayer) + + if config.n_shared_experts is not None: + self.shared_experts = DeepseekV2MLP( + prefix=f"{prefix}.shared_experts", + config=config, + weights=weights, + intermediate_size=config.moe_intermediate_size + * config.n_shared_experts, + ) + else: + self.shared_experts = None + + self.process_group = weights.process_group + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if self.shared_experts is not None: + shared_output = self.shared_experts(x, reduce=False) + else: + shared_output = None + + router_logits = self.gate(x) + + out = self.moe_layer(x, gating_output=router_logits) + + if shared_output is not None: + out = out + shared_output + + # Reduce sum + if self.process_group.size() > 1: + torch.distributed.all_reduce(out, group=self.process_group) + + return out.view(*x.shape) + + +class DeepseekV2Layer(nn.Module): + def __init__(self, prefix, layer_id, config, weights): + super().__init__() + prefix = f"{prefix}.layers.{layer_id}" + + self.self_attn = DeepseekV2Attention( + prefix=f"{prefix}.self_attn", + config=config, + weights=weights, + ) + + if ( + config.n_routed_experts is not None + and layer_id >= config.first_k_dense_replace + and layer_id % config.moe_layer_freq == 0 + ): + moe_layer_cls = ( + SparseMoELayer + if SparseMoELayer.is_supported(weights) + else DenseMoELayer + ) + self.mlp = DeepseekV2MoE(f"{prefix}.mlp", config, moe_layer_cls, weights) + else: + self.mlp = DeepseekV2MLP( + prefix=f"{prefix}.mlp", + config=config, + weights=weights, + intermediate_size=config.intermediate_size, + ) + + self.input_layernorm = FastRMSNorm.load( + prefix=f"{prefix}.input_layernorm", weights=weights, eps=config.rms_norm_eps + ) + self.post_attention_layernorm = FastRMSNorm.load( + prefix=f"{prefix}.post_attention_layernorm", + weights=weights, + eps=config.rms_norm_eps, + ) + + def forward( + self, + hidden_states: torch.Tensor, + residual: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, + cu_seqlen_prefill: torch.Tensor, + kv_cache, + block_tables: torch.Tensor, + slots: 
torch.Tensor, + seqlen: Seqlen, + max_s: int, + ): + normed_hidden_states, residual = self.input_layernorm(hidden_states, residual) + + # Self Attention + attn_output = self.self_attn( + normed_hidden_states, + cos, + sin, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + seqlen, + max_s, + ) + + # faster post attention rms norm + normed_attn_res_output, residual = self.post_attention_layernorm( + attn_output, residual + ) + + output = self.mlp(normed_attn_res_output) + + return output, residual + + +class DeepseekV2Model(torch.nn.Module): + def __init__(self, prefix: str, config, weights: Weights): + super().__init__() + + self.embed_tokens = TensorParallelEmbedding( + prefix=f"{prefix}.embed_tokens", weights=weights + ) + + self.layers = nn.ModuleList( + [ + DeepseekV2Layer( + prefix, + layer_id, + config, + weights, + ) + for layer_id in range(config.num_hidden_layers) + ] + ) + self.norm = FastRMSNorm.load( + prefix=f"{prefix}.norm", weights=weights, eps=config.rms_norm_eps + ) + + self.head_size = self.layers[0].self_attn.head_size + self.num_heads = self.layers[0].self_attn.num_heads + self.num_key_value_heads = self.layers[0].self_attn.num_key_value_heads + + def forward( + self, + input_ids: torch.Tensor, + position_ids: torch.Tensor, + cu_seqlen_prefill: Optional[torch.Tensor], + kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], + block_tables: torch.Tensor, + slots: torch.Tensor, + seqlen: Seqlen, + max_s: int, + ) -> torch.Tensor: + hidden_states = self.embed_tokens(input_ids) + + # Get rotary cos and sin for this forward + # Avoid to index in each layer + cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin( + position_ids, max_s, hidden_states.dtype + ) + + residual = None + for i, layer in enumerate(self.layers): + hidden_states, residual = layer( + hidden_states, + residual, + cos, + sin, + cu_seqlen_prefill, + kv_cache[i], + block_tables, + slots, + seqlen, + max_s, + ) + + hidden_states, _ = self.norm(hidden_states, residual) + + return hidden_states + + +class FlashDeepseekV2ForCausalLM(torch.nn.Module): + def __init__(self, prefix: str, config, weights: Weights): + super().__init__() + + self.model = DeepseekV2Model( + "model" if not prefix else f"{prefix}.model", config, weights + ) + self.lm_head = SpeculativeHead.load( + config, + prefix="lm_head" if not prefix else f"{prefix}.lm_head", + weights=weights, + ) + + def forward( + self, + input_ids: torch.Tensor, + position_ids: torch.Tensor, + cu_seqlen_prefill: Optional[torch.Tensor], + kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], + block_tables: torch.Tensor, + slots: torch.Tensor, + seqlen: Seqlen, + max_s: int, + prefill_cache_indices: Optional[torch.Tensor], + lm_head_indices: Optional[torch.Tensor] = None, + adapter_data: Optional[torch.Tensor] = None, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + hidden_states = self.model( + input_ids, + position_ids, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + seqlen, + max_s, + ) + if lm_head_indices is not None: + hidden_states = hidden_states[lm_head_indices] + logits, speculative_logits = self.lm_head(hidden_states) + return logits, speculative_logits diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_gemma2_modeling.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_gemma2_modeling.py new file mode 100644 index 000000000..7a3d60c97 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_gemma2_modeling.py @@ -0,0 +1,560 @@ +# 
coding=utf-8 +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import torch.distributed + +from torch import nn +from transformers.activations import ACT2FN +from transformers.configuration_utils import PretrainedConfig +from typing import Optional, List, Tuple +from text_generation_server.layers.attention import ( + paged_attention, + attention, + reshape_and_cache, + Seqlen, +) +from text_generation_server.layers import ( + TensorParallelRowLinear, + TensorParallelColumnLinear, + TensorParallelEmbedding, + SpeculativeHead, + get_linear, + TensorParallelMultiAdapterLinear, + TensorParallelAdapterRowLinear, +) +from text_generation_server.layers.attention import PREFILL_IN_KV_CACHE +from text_generation_server.layers.rotary import PositionRotaryEmbedding +from text_generation_server.layers.layernorm import ( + FastRMSNorm, +) +from text_generation_server.utils.weights import UnquantizedWeight + + +class Gemma2Config(PretrainedConfig): + def __init__( + self, + vocab_size=256128, + hidden_size=3072, + intermediate_size=24576, + num_hidden_layers=28, + num_attention_heads=16, + num_key_value_heads=16, + head_dim=256, + hidden_act="gelu_pytorch_tanh", + max_position_embeddings=8192, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + pad_token_id=None, + bos_token_id=1, + eos_token_id=2, + tie_word_embeddings=True, + rope_theta=10000.0, + rope_scaling=None, + attention_bias=False, + attention_dropout=0.0, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.head_dim = head_dim + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + self.attention_bias = attention_bias + self.attention_dropout = attention_dropout + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + +class Gemma2FastRMSNorm(FastRMSNorm): + @classmethod + def load(cls, prefix: str, weights, eps=1e-6): + dtype = weights.dtype + weights.dtype = torch.float32 + weight = weights.get_tensor(f"{prefix}.weight") + 1 + weights.dtype = dtype + new = cls(weight, eps) + new.dtype = dtype + 
return new + + # perform the multiplication in full precision and downcast after + def forward(self, hidden_states, residual=None): + if residual is not None: + hidden_states += residual + residual = hidden_states + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + hidden_states = hidden_states * self.weight + return hidden_states.to(self.dtype), residual + + +def load_attention(config, prefix: str, weights): + if config.num_attention_heads != config.num_key_value_heads: + return _load_gqa(config, prefix, weights) + else: + return TensorParallelColumnLinear.load_multi( + config, + prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"], + dim=0, + weights=weights, + bias=False, + ) + + +def _load_gqa(config, prefix: str, weights): + assert config.num_attention_heads % weights.process_group.size() == 0 + + weight = weights.get_multi_weights_col( + prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"], + dim=0, + ) + + if isinstance(weight, UnquantizedWeight): + weight.weight = weight.weight.to(dtype=weights.dtype).to(device=weights.device) + + head_size = config.head_dim + num_heads = config.num_attention_heads // weights.process_group.size() + num_key_value_heads = config.num_key_value_heads // weights.process_group.size() + assert list(weight.weight.shape) == [ + (num_heads + 2 * num_key_value_heads) * head_size, + config.hidden_size, + ], f"{list(weight.weight.shape)} != {[(num_heads + 2 * config.num_key_value_heads) * head_size, config.hidden_size]}" + + return TensorParallelColumnLinear(get_linear(weight, bias=None)) + + +class FlashGemma2Attention(torch.nn.Module): + def __init__( + self, prefix: str, config, weights, layer_id, causal: bool, is_sliding: bool + ): + super().__init__() + self.num_heads = config.num_attention_heads + self.head_size = config.head_dim + self.causal = causal + if is_sliding: + self.window_size = config.sliding_window + else: + self.window_size = -1 + + self.rotary_emb = PositionRotaryEmbedding.static( + config=config, + dim=self.head_size, + base=config.rope_theta, + device=weights.device, + ) + + # self.softmax_scale = self.head_size**-0.5 + self.softmax_scale = config.query_pre_attn_scalar**-0.5 + + if self.num_heads % weights.process_group.size() != 0: + raise ValueError( + f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} " + f"and `num_shards`: {weights.process_group.size()}" + ) + self.num_heads = self.num_heads // weights.process_group.size() + self.num_key_value_heads = ( + config.num_key_value_heads // weights.process_group.size() + ) + self.softcap = config.attn_logit_softcapping + + query_key_value = load_attention(config, prefix, weights) + self.query_key_value = TensorParallelMultiAdapterLinear.load( + query_key_value, + layer_id, + ["q_proj", "k_proj", "v_proj"], + sizes=[ + self.head_size * config.num_attention_heads, + self.head_size * config.num_key_value_heads, + self.head_size * config.num_key_value_heads, + ], + process_group=weights.process_group, + ) + + o_proj = TensorParallelRowLinear.load( + config, + prefix=f"{prefix}.o_proj", + weights=weights, + bias=False, + ) + self.o_proj = TensorParallelAdapterRowLinear.load( + o_proj, + layer_id, + "o_proj", + process_group=weights.process_group, + ) + + self.num_groups = self.num_heads // self.num_key_value_heads + self.kv_head_mapping = torch.arange( + 0, self.num_key_value_heads, 
dtype=torch.int32, device=weights.device + ).repeat_interleave(self.num_groups) + + def forward( + self, + hidden_states, + cos, + sin, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + seqlen, + max_s, + adapter_data, + ): + qkv = self.query_key_value(hidden_states, adapter_data) + query, kv = qkv.split( + [ + self.head_size * self.num_heads, + 2 * self.head_size * self.num_key_value_heads, + ], + dim=1, + ) + query = query.view(-1, self.num_heads, self.head_size) + kv = kv.view(-1, 2, self.num_key_value_heads, self.head_size) + + self.rotary_emb(query, torch.select(kv, dim=1, index=0), cos, sin) + + reshape_and_cache(kv[:, 0], kv[:, 1], kv_cache[0], kv_cache[1], slots) + + # Prefill + if cu_seqlen_prefill is not None: + # flash attention + attn_output = attention( + query, + kv_cache[0] if PREFILL_IN_KV_CACHE else kv[:, 0], + kv_cache[1] if PREFILL_IN_KV_CACHE else kv[:, 1], + seqlen, + block_tables, + self.softmax_scale, + causal=self.causal, + window_size_left=self.window_size, + softcap=self.softcap, + ) + # Decode + else: + attn_output = paged_attention( + query, + kv_cache[0], + kv_cache[1], + self.kv_head_mapping, + self.softmax_scale, + block_tables, + seqlen, + max_s, + softcap=self.softcap, + ) + + return self.o_proj( + attn_output.view(-1, self.num_heads * self.head_size), adapter_data + ) + + +class Gemma2MLP(nn.Module): + def __init__(self, prefix, config, weights, layer_id): + super().__init__() + act = config.hidden_activation + self.act = ( + ACT2FN[act] + if "gelu" not in act + else lambda x: torch.nn.functional.gelu( + x, + approximate=( + "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none" + ), + ) + ) + # Fuse gate and up proj + gate_up_proj = TensorParallelColumnLinear.load_multi( + config, + prefixes=[f"{prefix}.gate_proj", f"{prefix}.up_proj"], + weights=weights, + dim=0, + bias=False, + ) + self.gate_up_proj = TensorParallelMultiAdapterLinear.load( + gate_up_proj, + layer_id, + ["gate_proj", "up_proj"], + sizes=[ + config.intermediate_size, + config.intermediate_size, + ], + process_group=weights.process_group, + ) + + down_proj = TensorParallelRowLinear.load( + config, + prefix=f"{prefix}.down_proj", + weights=weights, + bias=False, + ) + self.down_proj = TensorParallelAdapterRowLinear.load( + down_proj, + layer_id, + "down_proj", + process_group=weights.process_group, + ) + + self.intermediate_size = ( + config.intermediate_size // weights.process_group.size() + ) + + def forward(self, hidden_states, adapter_data): + gate_up_states = self.gate_up_proj(hidden_states, adapter_data) + gate_up_states = gate_up_states.view(-1, 2, self.intermediate_size) + return self.down_proj( + self.act(gate_up_states[:, 0]) * gate_up_states[:, 1], adapter_data + ) + + +class FlashGemma2Layer(nn.Module): + def __init__( + self, prefix: str, config, weights, layer_id, causal: bool, is_sliding: bool + ): + super().__init__() + self.self_attn = FlashGemma2Attention( + prefix=f"{prefix}.self_attn", + config=config, + weights=weights, + layer_id=layer_id, + causal=causal, + is_sliding=is_sliding, + ) + self.mlp = Gemma2MLP( + prefix=f"{prefix}.mlp", config=config, weights=weights, layer_id=layer_id + ) + + self.input_layernorm = Gemma2FastRMSNorm.load( + prefix=f"{prefix}.input_layernorm", weights=weights, eps=config.rms_norm_eps + ) + self.post_attention_layernorm = Gemma2FastRMSNorm.load( + prefix=f"{prefix}.post_attention_layernorm", + weights=weights, + eps=config.rms_norm_eps, + ) + self.pre_feedforward_layernorm = Gemma2FastRMSNorm.load( + 
prefix=f"{prefix}.pre_feedforward_layernorm", + weights=weights, + eps=config.rms_norm_eps, + ) + self.post_feedforward_layernorm = Gemma2FastRMSNorm.load( + prefix=f"{prefix}.post_feedforward_layernorm", + weights=weights, + eps=config.rms_norm_eps, + ) + + def forward( + self, + hidden_states, + residual, + cos, + sin, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + seqlen, + max_s, + adapter_data, + ): + normed_hidden_states, res = self.input_layernorm(hidden_states, residual) + + # Self Attention + attn_output = self.self_attn( + normed_hidden_states, + cos, + sin, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + seqlen, + max_s, + adapter_data, + ) + + # faster post attention rms norm + normed_attn_res_output, _ = self.post_attention_layernorm(attn_output) + normed_attn_res_output = normed_attn_res_output + res + res = normed_attn_res_output + + pre_normed, _ = self.pre_feedforward_layernorm(normed_attn_res_output) + mlp_output = self.mlp(pre_normed, adapter_data) + post_hidden_states, _ = self.post_feedforward_layernorm(mlp_output) + + return post_hidden_states, normed_attn_res_output + + +class FlashGemma2Model(torch.nn.Module): + def __init__(self, prefix: str, config, weights, causal: bool): + super().__init__() + + process_group = weights.process_group + self.tp_rank = process_group.rank() + self.tp_world_size = process_group.size() + self.layers = nn.ModuleList( + [ + FlashGemma2Layer( + prefix=f"{prefix}.layers.{layer_id}", + config=config, + weights=weights, + layer_id=layer_id, + causal=causal, + is_sliding=layer_id % 2 == 0, + ) + for layer_id in range(config.num_hidden_layers) + ] + ) + self.norm = Gemma2FastRMSNorm.load( + prefix=f"{prefix}.norm", weights=weights, eps=config.rms_norm_eps + ) + + self.head_size = self.layers[0].self_attn.head_size + self.num_heads = self.layers[0].self_attn.num_heads + self.num_key_value_heads = self.layers[0].self_attn.num_key_value_heads + + def forward( + self, + inputs_embeds: torch.Tensor, + position_ids: torch.Tensor, + cu_seqlen_prefill: Optional[torch.Tensor], + kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], + block_tables: torch.Tensor, + slots: torch.Tensor, + seqlen: Seqlen, + max_s: int, + adapter_data: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + hidden_states = inputs_embeds + + # Get rotary cos and sin for this forward + # Avoid to index in each layer + cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin( + position_ids, max_s, hidden_states.dtype + ) + + residual = None + for i, layer in enumerate(self.layers): + hidden_states, residual = layer( + hidden_states, + residual, + cos, + sin, + cu_seqlen_prefill, + kv_cache[i], + block_tables, + slots, + seqlen, + max_s, + adapter_data, + ) + + hidden_states, _ = self.norm(hidden_states, residual) + + return hidden_states + + +class FlashGemma2ForCausalLM(torch.nn.Module): + def __init__(self, prefix: str, config, weights, *, causal: bool = True): + super().__init__() + + embed_norm = config.hidden_size**0.5 + if not prefix: + prefix = "model" + else: + prefix = f"{prefix}.model" + + self.embed_tokens = TensorParallelEmbedding( + prefix=f"{prefix}.embed_tokens", weights=weights + ) + self.embed_tokens.weight *= embed_norm + + self.model = FlashGemma2Model( + prefix=prefix, config=config, weights=weights, causal=causal + ) + self.lm_head = SpeculativeHead.load( + prefix=( + f"{prefix}.embed_tokens" + if config.tie_word_embeddings + else f"{prefix}.lm_head" + ), + config=config, + weights=weights, + ) + self.softcap = 
config.final_logit_softcapping + assert isinstance(self.softcap, float) + + def forward( + self, + input_ids: torch.Tensor, + position_ids: torch.Tensor, + cu_seqlen_prefill: Optional[torch.Tensor], + kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], + block_tables: torch.Tensor, + slots: torch.Tensor, + seqlen: Seqlen, + max_s: int, + prefill_cache_indices: Optional[torch.Tensor], + lm_head_indices: Optional[torch.Tensor] = None, + adapter_data: Optional[torch.Tensor] = None, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + input_embeds = self.embed_tokens(input_ids) + hidden_states = self.model( + input_embeds, + position_ids, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + seqlen, + max_s, + adapter_data, + ) + if lm_head_indices is not None: + hidden_states = hidden_states[lm_head_indices] + logits, speculative_logits = self.lm_head(hidden_states) + + logits /= self.softcap + logits = torch.tanh(logits) + logits *= self.softcap + + return logits, speculative_logits diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_gemma_modeling.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_gemma_modeling.py new file mode 100644 index 000000000..4c1be6f60 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_gemma_modeling.py @@ -0,0 +1,471 @@ +# coding=utf-8 +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
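Note on the final logit softcapping applied at the end of `FlashGemma2ForCausalLM.forward` above (divide by `final_logit_softcapping`, apply `tanh`, multiply back): it bounds every logit to the open interval (-softcap, +softcap) while leaving small values nearly unchanged. A minimal, self-contained sketch of the same transform; the softcap value of 30.0 is only an illustrative assumption, the real value comes from the model config:

import torch

def softcap_logits(logits: torch.Tensor, softcap: float = 30.0) -> torch.Tensor:
    # Equivalent to the in-place sequence used above:
    #   logits /= softcap; logits = torch.tanh(logits); logits *= softcap
    # Large raw logits saturate near +/- softcap; small values pass through almost unchanged.
    return softcap * torch.tanh(logits / softcap)

# Example: large raw logits are squashed into (-30, 30).
print(softcap_logits(torch.tensor([0.5, 10.0, 100.0])))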
+ +import torch +import torch.distributed + +from torch import nn +from transformers.activations import ACT2FN +from transformers.configuration_utils import PretrainedConfig +from typing import Optional, List, Tuple +from text_generation_server.layers.attention import ( + paged_attention, + attention, + reshape_and_cache, + Seqlen, + PREFILL_IN_KV_CACHE, +) +from text_generation_server.layers import ( + TensorParallelRowLinear, + TensorParallelColumnLinear, + TensorParallelEmbedding, + SpeculativeHead, + get_linear, +) +from text_generation_server.layers.rotary import PositionRotaryEmbedding +from text_generation_server.layers.layernorm import ( + FastRMSNorm, +) +from text_generation_server.utils.weights import UnquantizedWeight + + +class GemmaConfig(PretrainedConfig): + def __init__( + self, + vocab_size=256128, + hidden_size=3072, + intermediate_size=24576, + num_hidden_layers=28, + num_attention_heads=16, + num_key_value_heads=16, + head_dim=256, + hidden_act="gelu_pytorch_tanh", + max_position_embeddings=8192, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + pad_token_id=None, + bos_token_id=1, + eos_token_id=2, + tie_word_embeddings=True, + rope_theta=10000.0, + rope_scaling=None, + attention_bias=False, + attention_dropout=0.0, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.head_dim = head_dim + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + self.attention_bias = attention_bias + self.attention_dropout = attention_dropout + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + +class GemmaFastRMSNorm(FastRMSNorm): + @classmethod + def load(cls, prefix: str, weights, eps=1e-6): + dtype = weights.dtype + weights.dtype = torch.float32 + weight = weights.get_tensor(f"{prefix}.weight") + 1 + weights.dtype = dtype + new = cls(weight, eps) + new.dtype = dtype + return new + + # perform the multiplication in full precision and downcast after + def forward(self, hidden_states, residual=None): + if residual is not None: + hidden_states += residual + residual = hidden_states + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + hidden_states = hidden_states * self.weight + return hidden_states.to(self.dtype), residual + + +def load_attention(config, prefix: str, weights): + if config.num_attention_heads != config.num_key_value_heads: + return _load_gqa(config, prefix, weights) + else: + return TensorParallelColumnLinear.load_multi( + config, + prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"], + dim=0, + weights=weights, + bias=False, + ) + + +def _load_gqa(config, prefix: str, weights): + assert config.num_attention_heads % weights.process_group.size() == 0 + + weight = weights.get_multi_weights_col( + prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", 
f"{prefix}.v_proj"], + dim=0, + ) + + if isinstance(weight, UnquantizedWeight): + weight.weight = weight.weight.to(dtype=weights.dtype).to(device=weights.device) + + head_size = config.head_dim + num_heads = config.num_attention_heads // weights.process_group.size() + num_key_value_heads = config.num_key_value_heads // weights.process_group.size() + assert list(weight.weight.shape) == [ + (num_heads + 2 * num_key_value_heads) * head_size, + config.hidden_size, + ], f"{list(weight.weight.shape)} != {[(num_heads + 2 * config.num_key_value_heads) * head_size, config.hidden_size]}" + + return TensorParallelColumnLinear(get_linear(weight, bias=None)) + + +class FlashGemmaAttention(torch.nn.Module): + def __init__(self, prefix: str, config, weights, causal: bool): + super().__init__() + self.num_heads = config.num_attention_heads + self.head_size = config.head_dim + self.causal = causal + + self.rotary_emb = PositionRotaryEmbedding.static( + config=config, + dim=self.head_size, + base=config.rope_theta, + device=weights.device, + ) + + self.softmax_scale = self.head_size**-0.5 + + if self.num_heads % weights.process_group.size() != 0: + raise ValueError( + f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} " + f"and `num_shards`: {weights.process_group.size()}" + ) + self.num_heads = self.num_heads // weights.process_group.size() + self.num_key_value_heads = ( + config.num_key_value_heads // weights.process_group.size() + ) + + self.query_key_value = load_attention(config, prefix, weights) + + self.o_proj = TensorParallelRowLinear.load( + config, + prefix=f"{prefix}.o_proj", + weights=weights, + bias=False, + ) + self.num_groups = self.num_heads // self.num_key_value_heads + self.kv_head_mapping = torch.arange( + 0, self.num_key_value_heads, dtype=torch.int32, device=weights.device + ).repeat_interleave(self.num_groups) + + def forward( + self, + hidden_states, + cos, + sin, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + seqlen, + max_s, + ): + qkv = self.query_key_value(hidden_states) + query, kv = qkv.split( + [ + self.head_size * self.num_heads, + 2 * self.head_size * self.num_key_value_heads, + ], + dim=1, + ) + query = query.view(-1, self.num_heads, self.head_size) + kv = kv.view(-1, 2, self.num_key_value_heads, self.head_size) + + self.rotary_emb(query, torch.select(kv, dim=1, index=0), cos, sin) + + reshape_and_cache(kv[:, 0], kv[:, 1], kv_cache[0], kv_cache[1], slots) + + # Prefill + if cu_seqlen_prefill is not None: + # flash attention + attn_output = attention( + query, + kv_cache[0] if PREFILL_IN_KV_CACHE else kv[:, 0], + kv_cache[1] if PREFILL_IN_KV_CACHE else kv[:, 1], + seqlen, + block_tables, + self.softmax_scale, + causal=self.causal, + ) + # Decode + else: + attn_output = paged_attention( + query, + kv_cache[0], + kv_cache[1], + self.kv_head_mapping, + self.softmax_scale, + block_tables, + seqlen, + max_s, + ) + + return self.o_proj(attn_output.view(-1, self.num_heads * self.head_size)) + + +class GemmaMLP(nn.Module): + def __init__(self, prefix: str, config, weights): + super().__init__() + act = config.hidden_act + self.act = ( + ACT2FN[act] + if "gelu" not in act + else lambda x: torch.nn.functional.gelu( + x, + approximate=( + "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none" + ), + ) + ) + # Fuse gate and up proj + self.gate_up_proj = TensorParallelColumnLinear.load_multi( + config, + prefixes=[f"{prefix}.gate_proj", f"{prefix}.up_proj"], + weights=weights, + dim=0, + bias=False, + ) + self.down_proj = 
TensorParallelRowLinear.load( + config, + prefix=f"{prefix}.down_proj", + weights=weights, + bias=False, + ) + self.intermediate_size = ( + config.intermediate_size // weights.process_group.size() + ) + + def forward(self, hidden_states): + gate_up_states = self.gate_up_proj(hidden_states) + gate_up_states = gate_up_states.view(-1, 2, self.intermediate_size) + return self.down_proj(self.act(gate_up_states[:, 0]) * gate_up_states[:, 1]) + + +class FlashGemmaLayer(nn.Module): + def __init__(self, prefix: str, config, weights, causal: bool): + super().__init__() + self.self_attn = FlashGemmaAttention( + prefix=f"{prefix}.self_attn", config=config, weights=weights, causal=causal + ) + self.mlp = GemmaMLP(prefix=f"{prefix}.mlp", config=config, weights=weights) + + self.input_layernorm = GemmaFastRMSNorm.load( + prefix=f"{prefix}.input_layernorm", weights=weights, eps=config.rms_norm_eps + ) + self.post_attention_layernorm = GemmaFastRMSNorm.load( + prefix=f"{prefix}.post_attention_layernorm", + weights=weights, + eps=config.rms_norm_eps, + ) + + def forward( + self, + hidden_states, + residual, + cos, + sin, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + seqlen, + max_s, + ): + normed_hidden_states, res = self.input_layernorm(hidden_states, residual) + + # Self Attention + attn_output = self.self_attn( + normed_hidden_states, + cos, + sin, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + seqlen, + max_s, + ) + + # faster post attention rms norm + normed_attn_res_output, attn_res = self.post_attention_layernorm( + attn_output, res + ) + + mlp_output = self.mlp(normed_attn_res_output) + + return mlp_output, attn_res + + +class FlashGemmaModel(torch.nn.Module): + def __init__(self, prefix: str, config, weights, causal: bool): + super().__init__() + + process_group = weights.process_group + self.tp_rank = process_group.rank() + self.tp_world_size = process_group.size() + self.layers = nn.ModuleList( + [ + FlashGemmaLayer( + prefix=f"{prefix}.layers.{layer_id}", + config=config, + weights=weights, + causal=causal, + ) + for layer_id in range(config.num_hidden_layers) + ] + ) + self.norm = GemmaFastRMSNorm.load( + prefix=f"{prefix}.norm", weights=weights, eps=config.rms_norm_eps + ) + + self.head_size = self.layers[0].self_attn.head_size + self.num_heads = self.layers[0].self_attn.num_heads + self.num_key_value_heads = self.layers[0].self_attn.num_key_value_heads + + def forward( + self, + inputs_embeds: torch.Tensor, + position_ids: torch.Tensor, + cu_seqlen_prefill: Optional[torch.Tensor], + kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], + block_tables: torch.Tensor, + slots: torch.Tensor, + seqlen: Seqlen, + max_s: int, + ) -> torch.Tensor: + hidden_states = inputs_embeds + + # Get rotary cos and sin for this forward + # Avoid to index in each layer + cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin( + position_ids, max_s, hidden_states.dtype + ) + + residual = None + for i, layer in enumerate(self.layers): + hidden_states, residual = layer( + hidden_states, + residual, + cos, + sin, + cu_seqlen_prefill, + kv_cache[i], + block_tables, + slots, + seqlen, + max_s, + ) + + hidden_states, _ = self.norm(hidden_states, residual) + + return hidden_states + + +class FlashGemmaForCausalLM(torch.nn.Module): + def __init__(self, prefix: str, config, weights, *, causal: bool = True): + super().__init__() + + embed_norm = config.hidden_size**0.5 + if not prefix: + prefix = "model" + else: + prefix = f"{prefix}.model" + + self.embed_tokens = TensorParallelEmbedding( + 
prefix=f"{prefix}.embed_tokens", weights=weights + ) + self.embed_tokens.weight *= embed_norm + + self.model = FlashGemmaModel( + prefix=prefix, config=config, weights=weights, causal=causal + ) + self.lm_head = SpeculativeHead.load( + prefix=( + f"{prefix}.embed_tokens" + if config.tie_word_embeddings + else f"{prefix}.lm_head" + ), + config=config, + weights=weights, + ) + + def forward( + self, + input_ids: torch.Tensor, + position_ids: torch.Tensor, + cu_seqlen_prefill: Optional[torch.Tensor], + kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], + block_tables: torch.Tensor, + slots: torch.Tensor, + seqlen: Seqlen, + max_s: int, + prefill_cache_indices: Optional[torch.Tensor], + lm_head_indices: Optional[torch.Tensor] = None, + adapter_data: Optional[torch.Tensor] = None, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + input_embeds = self.embed_tokens(input_ids) + hidden_states = self.model( + input_embeds, + position_ids, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + seqlen, + max_s, + ) + if lm_head_indices is not None: + hidden_states = hidden_states[lm_head_indices] + logits, speculative_logits = self.lm_head(hidden_states) + return logits, speculative_logits diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_gpt2_modeling.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_gpt2_modeling.py new file mode 100644 index 000000000..44c015cf4 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_gpt2_modeling.py @@ -0,0 +1,461 @@ +# coding=utf-8 +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import torch +import torch.distributed + +from torch import nn +from transformers.activations import ACT2FN +from typing import Optional, List, Tuple +from text_generation_server.layers.attention import PREFILL_IN_KV_CACHE +from text_generation_server.layers.attention import ( + paged_attention, + attention, + reshape_and_cache, + Seqlen, +) +from text_generation_server.layers import ( + TensorParallelRowLinear, + TensorParallelColumnLinear, + TensorParallelEmbedding, + SpeculativeHead, + get_linear, +) + + +def load_qkv(config, prefix: str, weights, head_size, num_heads): + if config.quantize == "gptq": + return _load_qkv_gptq( + config, + prefix, + weights, + ) + elif config.quantize == "marlin": + raise RuntimeError( + "GPT-2 models with marlin quantization are not yet supported" + ) + else: + return _load_qkv(config, prefix, weights, head_size, num_heads) + + +def _load_qkv_gptq(config, prefix: str, weights): + world_size = weights.process_group.size() + rank = weights.process_group.rank() + + # Weights + weight = weights.get_weights_col_packed_qkv( + f"{prefix}.c_attn", + config.num_attention_heads, + config.num_attention_heads, + ) + + # Bias + slice_ = weights._get_slice(f"{prefix}.c_attn.bias") + shape = slice_.get_shape() + total_size = shape[0] + assert total_size % 3 == 0, f"Prepacked is not divisible by {3}" + single_size = total_size // 3 + assert single_size % world_size == 0 + block_size = single_size // world_size + start = rank * block_size + stop = (rank + 1) * block_size + tensors = [] + for i in range(3): + tensor = slice_[start + i * single_size : stop + i * single_size] + tensors.append(tensor) + bias = torch.cat(tensors, dim=0) + bias = bias.to(device=weights.device) + + return TensorParallelColumnLinear(get_linear(weight, bias)) + + +def _load_qkv(config, prefix: str, weights, head_size, num_heads): + """Load QKV from a single, transposed matrix.""" + + slice_ = weights._get_slice(f"{prefix}.c_attn.weight") + shape = slice_.get_shape() + total_size = shape[1] + assert total_size % 3 == 0, f"Prepacked is not divisible by {3}" + world_size = weights.process_group.size() + single_size = total_size // 3 + assert single_size % world_size == 0 + rank = weights.process_group.rank() + + # Weights + block_size = single_size // world_size + start = rank * block_size + stop = (rank + 1) * block_size + tensors = [] + for i in range(3): + tensor = slice_[:, start + i * single_size : stop + i * single_size] + tensors.append(tensor) + weight = torch.cat(tensors, dim=1).T + weight = weight.to(dtype=weights.dtype) + weight = weight.to(device=weights.device) + + # Bias + slice_ = weights._get_slice(f"{prefix}.c_attn.bias") + shape = slice_.get_shape() + total_size = shape[0] + single_size = total_size // 3 + block_size = single_size // world_size + assert single_size % world_size == 0 + start = rank * block_size + stop = (rank + 1) * block_size + b = [] + for i in range(3): + tensor = slice_[start + i * single_size : stop + i * single_size] + b.append(tensor) + bias = torch.cat(b, dim=0) + bias = bias.to(dtype=weights.dtype) + bias = bias.to(device=weights.device) + assert list(bias.shape) == [ + 3 * num_heads * head_size + ], f"{weight.shape} != {[3 * num_heads * head_size]}" + + return TensorParallelColumnLinear(get_linear(weight, bias)) + + +def load_row(config, prefix: str, weights, bias: bool): + """load_row, but with transposed weight matrices.""" + + if config.quantize == "gptq": + weight = weights.get_weights_row(prefix) + else: + weight = 
weights.get_sharded(f"{prefix}.weight", dim=0).T + + if bias and weights.process_group.rank() == 0: + # Rank is only on the first rank process + bias = weights.get_tensor(f"{prefix}.bias") + else: + bias = None + + return TensorParallelRowLinear( + get_linear(weight, bias), process_group=weights.process_group + ) + + +def load_col(config, prefix: str, weights, bias: bool): + """load_col, but with transposed weight matrices.""" + if config.quantize == "gptq": + weight = weights.get_multi_weights_col([prefix], dim=1) + else: + weight = weights.get_sharded(f"{prefix}.weight", dim=1).T + + if bias: + bias = weights.get_sharded(f"{prefix}.bias", dim=0) + else: + bias = None + + return TensorParallelColumnLinear(get_linear(weight, bias)) + + +class FlashGPT2Attention(torch.nn.Module): + def __init__( + self, + prefix: str, + config, + weights, + ): + super().__init__() + self.num_heads = config.num_attention_heads + self.hidden_size = config.hidden_size + + self.head_size = self.hidden_size // self.num_heads + self.softmax_scale = self.head_size**-0.5 + + if self.num_heads % weights.process_group.size() != 0: + raise ValueError( + f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} " + f"and `num_shards`: {weights.process_group.size()}" + ) + self.num_heads = self.num_heads // weights.process_group.size() + + self.query_key_value = load_qkv( + config, + prefix=prefix, + weights=weights, + head_size=self.head_size, + num_heads=self.num_heads, + ) + + self.o_proj = load_row( + config, + prefix=f"{prefix}.c_proj", + weights=weights, + bias=True, + ) + + self.kv_head_mapping = torch.arange( + 0, self.num_heads, dtype=torch.int32, device=weights.device + ) + + def forward( + self, + hidden_states, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + seqlen, + max_s, + ): + query, key, value = self.query_key_value(hidden_states).split( + self.head_size * self.num_heads, dim=1 + ) + query = query.view(-1, self.num_heads, self.head_size) + key = key.view(-1, self.num_heads, self.head_size) + value = value.view(-1, self.num_heads, self.head_size) + + reshape_and_cache(key, value, kv_cache[0], kv_cache[1], slots) + + # Prefill + if cu_seqlen_prefill is not None: + # flash attention + attn_output = attention( + query, + kv_cache[0] if PREFILL_IN_KV_CACHE else key, + kv_cache[1] if PREFILL_IN_KV_CACHE else value, + seqlen, + block_tables, + self.softmax_scale, + ) + # Decode + else: + attn_output = paged_attention( + query, + kv_cache[0], + kv_cache[1], + self.kv_head_mapping, + self.softmax_scale, + block_tables, + seqlen, + max_s, + ) + + return self.o_proj(attn_output.view(-1, self.num_heads * self.head_size)) + + +class GPT2MLP(nn.Module): + def __init__(self, prefix: str, config, weights): + super().__init__() + act = config.activation_function + self.act = ( + ACT2FN[act] + if "gelu" not in act + else lambda x: torch.nn.functional.gelu( + x, + approximate=( + "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none" + ), + ) + ) + + self.c_fc = load_col( + config, prefix=f"{prefix}.c_fc", weights=weights, bias=True + ) + self.c_proj = load_row( + config, + prefix=f"{prefix}.c_proj", + weights=weights, + bias=True, + ) + + intermediate_size = ( + config.n_inner if config.n_inner is not None else 4 * config.hidden_size + ) + + self.intermediate_size = intermediate_size // weights.process_group.size() + + def forward(self, hidden_states): + hidden_states = self.c_fc(hidden_states) + hidden_states = self.act(hidden_states) + return self.c_proj(hidden_states) + + 
+class FlashGPT2Layer(nn.Module): + def __init__(self, prefix: str, config, weights): + super().__init__() + self.self_attn = FlashGPT2Attention( + prefix=f"{prefix}.attn", config=config, weights=weights + ) + self.mlp = GPT2MLP(prefix=f"{prefix}.mlp", config=config, weights=weights) + + self.input_layernorm = nn.LayerNorm.load( + prefix=f"{prefix}.ln_1", weights=weights, eps=config.layer_norm_epsilon + ) + self.post_attention_layernorm = nn.LayerNorm.load( + prefix=f"{prefix}.ln_2", + weights=weights, + eps=config.layer_norm_epsilon, + ) + + def forward( + self, + hidden_states, + residual, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + seqlen, + max_s, + ): + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + attn_output = self.self_attn( + hidden_states, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + seqlen, + max_s, + ) + + hidden_states = attn_output + residual + residual = hidden_states + + hidden_states = self.post_attention_layernorm(hidden_states) + + mlp_output = self.mlp(hidden_states) + + return residual + mlp_output, residual + + +class FlashGPT2Model(torch.nn.Module): + def __init__(self, prefix: str, config, weights): + super().__init__() + + process_group = weights.process_group + self.tp_rank = process_group.rank() + self.tp_world_size = process_group.size() + self.layers = nn.ModuleList( + [ + FlashGPT2Layer( + prefix=( + f"h.{layer_id}" if not prefix else f"{prefix}.h.{layer_id}" + ), + config=config, + weights=weights, + ) + for layer_id in range(config.num_hidden_layers) + ] + ) + + self.norm = nn.LayerNorm.load( + prefix="ln_f" if not prefix else f"{prefix}.ln_f", + weights=weights, + eps=config.layer_norm_epsilon, + ) + + self.gradient_checkpointing = False + + self.head_size = self.layers[0].self_attn.head_size + self.num_heads = self.layers[0].self_attn.num_heads + + def forward( + self, + inputs_embeds: torch.Tensor, + position_ids: torch.Tensor, + cu_seqlen_prefill: Optional[torch.Tensor], + kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], + block_tables: torch.Tensor, + slots: torch.Tensor, + seqlen: Seqlen, + max_s: int, + true_max_s: int, + prefill_cache_indices: Optional[torch.Tensor], + ) -> torch.Tensor: + hidden_states = inputs_embeds + + residual = None + for i, layer in enumerate(self.layers): + hidden_states, residual = layer( + hidden_states, + residual, + cu_seqlen_prefill, + kv_cache[i], + block_tables, + slots, + seqlen, + max_s, + ) + + hidden_states = self.norm(hidden_states) + + return hidden_states + + +class FlashGPT2ForCausalLM(torch.nn.Module): + def __init__(self, prefix: str, config, weights): + super().__init__() + + self.embed_tokens = TensorParallelEmbedding( + prefix=("wte" if not prefix else f"{prefix}.wte"), + weights=weights, + ) + self.embed_positions = TensorParallelEmbedding( + prefix=("wpe" if not prefix else f"{prefix}.wpe"), + weights=weights, + ) + + self.model = FlashGPT2Model(prefix, config, weights) + self.lm_head = SpeculativeHead.load( + config, + prefix="wte" if not prefix else f"{prefix}.wte", + weights=weights, + ) + + def forward( + self, + input_ids: torch.Tensor, + position_ids: torch.Tensor, + cu_seqlen_prefill: Optional[torch.Tensor], + kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], + block_tables: torch.Tensor, + slots: torch.Tensor, + seqlen: Seqlen, + max_s: int, + prefill_cache_indices: Optional[torch.Tensor] = None, + lm_head_indices: Optional[torch.Tensor] = None, + adapter_data: Optional[torch.Tensor] = None, + ) -> 
Tuple[torch.Tensor, Optional[torch.Tensor]]: + token_embeds = self.embed_tokens(input_ids) + position_embeds = self.embed_positions(position_ids) + inputs_embeds = token_embeds + position_embeds + hidden_states = self.model( + inputs_embeds, + position_ids, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + seqlen, + max_s, + true_max_s=max_s, + prefill_cache_indices=prefill_cache_indices, + ) + if lm_head_indices is not None: + hidden_states = hidden_states[lm_head_indices] + logits, speculative_logits = self.lm_head(hidden_states) + return logits, speculative_logits diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_gptj_modeling.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_gptj_modeling.py new file mode 100644 index 000000000..aca970044 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_gptj_modeling.py @@ -0,0 +1,406 @@ +# coding=utf-8 +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import torch.distributed + +from torch import nn +from transformers.activations import ACT2FN +from typing import Optional, List, Tuple +from text_generation_server.utils.import_utils import SYSTEM +from text_generation_server.layers.attention import ( + paged_attention, + attention, + reshape_and_cache, + Seqlen, +) +from text_generation_server.layers import ( + TensorParallelRowLinear, + TensorParallelColumnLinear, + TensorParallelEmbedding, + SpeculativeHead, + get_linear, +) +from text_generation_server.layers.attention import PREFILL_IN_KV_CACHE +from text_generation_server.layers.rotary import ( + PositionRotaryEmbedding, +) +from text_generation_server.layers.layernorm import ( + FastLayerNorm, +) + + +def load_attention(config, prefix: str, weights): + return TensorParallelColumnLinear.load_multi( + config, + prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"], + dim=0, + weights=weights, + bias=False, + ) + + +def load_row(config, prefix: str, weights, bias: bool): + weight = weights.get_weights_row(prefix) + + if bias and weights.process_group.rank() == 0: + # Rank is only on the first rank process + bias = weights.get_tensor(f"{prefix}.bias") + else: + bias = None + + linear = get_linear(weight, bias) + return TensorParallelRowLinear(linear, process_group=weights.process_group) + + +class GPTJRotary(PositionRotaryEmbedding): + def forward( + self, + query: torch.Tensor, + key: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, + ): + # Such controlflows may add some overhead. 
+        if SYSTEM == "cuda":
+            import rotary_emb
+
+            q1 = query[..., ::2]
+            q2 = query[..., 1::2]
+
+            rotary_emb.apply_rotary(q1, q2, cos, sin, q1, q2, False)
+
+            k1 = key[..., ::2]
+            k2 = key[..., 1::2]
+
+            rotary_emb.apply_rotary(k1, k2, cos, sin, k1, k2, False)
+        elif SYSTEM == "rocm":
+            from vllm._C import ops
+
+            # NOTE: On ROCm systems, we use a RoPE implementation adapted from vLLM which launches a single kernel for both query/key, contrary to the flash-attn implementation used on NVIDIA systems.
+            # Compiling the flash-attn rotary kernel on ROCm, it appears hipcc is unable to unroll loops, resulting in even slower inference compared to eager: https://github.com/pytorch/pytorch/issues/113773
+
+            head_size = query.shape[-1]
+
+            # Inplace operation, updating query and key.
+            ops.rotary_embedding(query, key, head_size, cos, sin, False)
+        elif SYSTEM == "ipex":
+            import intel_extension_for_pytorch as ipex
+
+            ipex.llm.functional.rotary_embedding(
+                query, key, sin, cos, query.size(-1), False
+            )
+        else:
+            raise ValueError(
+                "Your system does not seem to be supported. Please check your install or open an issue at https://github.com/huggingface/text-generation-inference/issues with a clear reproduction."
+            )
+
+
+class FlashGPTJAttention(torch.nn.Module):
+    def __init__(
+        self,
+        prefix: str,
+        config,
+        weights,
+    ):
+        super().__init__()
+        self.num_heads = config.num_attention_heads
+        self.hidden_size = config.hidden_size
+
+        self.head_size = self.hidden_size // self.num_heads
+        self.softmax_scale = self.head_size**-0.5
+        self.rotary_dim = config.rotary_dim
+
+        if self.num_heads % weights.process_group.size() != 0:
+            raise ValueError(
+                f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} "
+                f"and `num_shards`: {weights.process_group.size()}"
+            )
+        self.num_heads = self.num_heads // weights.process_group.size()
+
+        self.query_key_value = load_attention(
+            config,
+            prefix=prefix,
+            weights=weights,
+        )
+
+        self.o_proj = load_row(
+            config,
+            prefix=f"{prefix}.out_proj",
+            weights=weights,
+            bias=False,
+        )
+
+        self.kv_head_mapping = torch.arange(
+            0, self.num_heads, dtype=torch.int32, device=weights.device
+        )
+
+        self.rotary_emb = GPTJRotary.static(
+            config=config,
+            dim=self.rotary_dim,
+            base=10000,
+            device=weights.device,
+        )
+
+    def forward(
+        self,
+        hidden_states,
+        cos,
+        sin,
+        cu_seqlen_prefill,
+        kv_cache,
+        block_tables,
+        slots,
+        seqlen,
+        max_s,
+    ):
+        query, key, value = self.query_key_value(hidden_states).split(
+            self.head_size * self.num_heads, dim=1
+        )
+        query = query.view(-1, self.num_heads, self.head_size)
+        key = key.view(-1, self.num_heads, self.head_size)
+        value = value.view(-1, self.num_heads, self.head_size)
+
+        # Compute rotary embeddings on rotary_ndims
+        if self.rotary_dim is not None:
+            self.rotary_emb(
+                query[..., : self.rotary_dim], key[..., : self.rotary_dim], cos, sin
+            )
+        else:
+            self.rotary_emb(query, key, cos, sin)
+
+        reshape_and_cache(key, value, kv_cache[0], kv_cache[1], slots)
+
+        # Prefill
+        if cu_seqlen_prefill is not None:
+            # flash attention
+            attn_output = attention(
+                query,
+                kv_cache[0] if PREFILL_IN_KV_CACHE else key,
+                kv_cache[1] if PREFILL_IN_KV_CACHE else value,
+                seqlen,
+                block_tables,
+                self.softmax_scale,
+            )
+        # Decode
+        else:
+            attn_output = paged_attention(
+                query,
+                kv_cache[0],
+                kv_cache[1],
+                self.kv_head_mapping,
+                self.softmax_scale,
+                block_tables,
+                seqlen,
+                max_s,
+            )
+
+        return self.o_proj(attn_output.view(-1, self.num_heads * self.head_size))
+
+
+class GPTJMLP(nn.Module):
+    def __init__(self, 
prefix: str, config, weights): + super().__init__() + act = config.activation_function + self.act = ( + ACT2FN[act] + if "gelu" not in act + else lambda x: torch.nn.functional.gelu( + x, + approximate=( + "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none" + ), + ) + ) + + self.fc_in = TensorParallelColumnLinear.load( + config, prefix=f"{prefix}.fc_in", weights=weights, bias=True + ) + + self.fc_out = load_row( + config, + prefix=f"{prefix}.fc_out", + weights=weights, + bias=True, + ) + + def forward(self, hidden_states): + hidden_states = self.fc_in(hidden_states) + hidden_states = self.act(hidden_states) + return self.fc_out(hidden_states) + + +class FlashGPTJLayer(nn.Module): + def __init__(self, prefix: str, config, weights): + super().__init__() + self.self_attn = FlashGPTJAttention( + prefix=f"{prefix}.attn", config=config, weights=weights + ) + self.mlp = GPTJMLP(prefix=f"{prefix}.mlp", config=config, weights=weights) + + self.input_layernorm = FastLayerNorm.load( + prefix=f"{prefix}.ln_1", weights=weights, eps=config.layer_norm_epsilon + ) + + def forward( + self, + hidden_states, + residual, + cos, + sin, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + seqlen, + max_s, + ): + hidden_states, residual = self.input_layernorm(hidden_states, residual) + # Self Attention + attn_output = self.self_attn( + hidden_states, + cos, + sin, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + seqlen, + max_s, + ) + + feed_forward_hidden_states = self.mlp(hidden_states) + + return attn_output + feed_forward_hidden_states, residual + + +class FlashGPTJModel(torch.nn.Module): + def __init__(self, prefix: str, config, weights): + super().__init__() + self.config = config + + self.wte = TensorParallelEmbedding(prefix=f"{prefix}.wte", weights=weights) + self.layers = nn.ModuleList( + [ + FlashGPTJLayer( + prefix=( + f"h.{layer_id}" if not prefix else f"{prefix}.h.{layer_id}" + ), + config=config, + weights=weights, + ) + for layer_id in range(config.num_hidden_layers) + ] + ) + + self.ln_f = FastLayerNorm.load( + prefix="ln_f" if not prefix else f"{prefix}.ln_f", + weights=weights, + eps=config.layer_norm_epsilon, + ) + + self.gradient_checkpointing = False + + self.head_size = self.layers[0].self_attn.head_size + self.num_heads = self.layers[0].self_attn.num_heads + + def forward( + self, + input_ids: Optional[torch.LongTensor], + position_ids: torch.Tensor, + cu_seqlen_prefill: Optional[torch.Tensor], + kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], + block_tables: torch.Tensor, + slots: torch.Tensor, + seqlen: Seqlen, + max_s: int, + prefill_cache_indices: Optional[torch.Tensor], + ) -> torch.Tensor: + hidden_states = self.wte(input_ids) + + # Get rotary cos and sin for this forward + # Avoid to index in each layer + cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin( + position_ids, max_s, hidden_states.dtype + ) + + residual = None + for i, layer in enumerate(self.layers): + hidden_states, residual = layer( + hidden_states, + residual, + cos, + sin, + cu_seqlen_prefill, + kv_cache[i], + block_tables, + slots, + seqlen, + max_s, + ) + + hidden_states, _ = self.ln_f(hidden_states, residual) + + return hidden_states + + +class FlashGPTJForCausalLM(torch.nn.Module): + def __init__(self, prefix: str, config, weights): + super().__init__() + if not prefix: + prefix = "transformer" + else: + prefix = f"{prefix}.transformer" + self.model = FlashGPTJModel(prefix, config, weights) + self.lm_head = SpeculativeHead.load( + config, + prefix="lm_head", + 
weights=weights, + ) + + def forward( + self, + input_ids: torch.Tensor, + position_ids: torch.Tensor, + cu_seqlen_prefill: Optional[torch.Tensor], + kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], + block_tables: torch.Tensor, + slots: torch.Tensor, + seqlen: Seqlen, + max_s: int, + prefill_cache_indices: Optional[torch.Tensor] = None, + lm_head_indices: Optional[torch.Tensor] = None, + adapter_data: Optional[torch.Tensor] = None, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + hidden_states = self.model( + input_ids, + position_ids, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + seqlen, + max_s, + prefill_cache_indices=prefill_cache_indices, + ) + if lm_head_indices is not None: + hidden_states = hidden_states[lm_head_indices] + logits, speculative_logits = self.lm_head(hidden_states) + return logits, speculative_logits diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py new file mode 100644 index 000000000..c9ec70cca --- /dev/null +++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py @@ -0,0 +1,668 @@ +# coding=utf-8 +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from contextlib import contextmanager +from typing import List, Optional, Tuple, Type + +import torch +import torch.distributed + +from torch import nn +from transformers.activations import ACT2FN + +from text_generation_server.layers.attention import PREFILL_IN_KV_CACHE +from text_generation_server.layers.moe import DenseMoELayer, MoELayer, SparseMoELayer +from text_generation_server.utils.import_utils import SYSTEM +from text_generation_server.layers.attention import ( + paged_attention, + attention, + reshape_and_cache, + Seqlen, +) +from text_generation_server.layers import ( + TensorParallelRowLinear, + TensorParallelColumnLinear, + TensorParallelEmbedding, + SpeculativeHead, + TensorParallelMultiAdapterLinear, + TensorParallelAdapterRowLinear, +) +from text_generation_server.layers.rotary import PositionRotaryEmbedding +from text_generation_server.layers.layernorm import ( + FastRMSNorm, + FastLayerNorm, +) +from text_generation_server.layers import ( + FastLinear, +) +from text_generation_server.utils.weights import ( + Weights, +) +from text_generation_server.layers.fp8 import HybridFP8UnquantLoader + +if SYSTEM != "ipex": + pass + +if SYSTEM == "rocm": + try: + from vllm import _custom_C + except Exception as e: + raise ImportError(f"Could not load `vllm._custom_C`. 
Full error: {e}") + + +def load_attention(config, prefix: str, weights, layer_id): + # Only defined in granite. + bias = getattr(config, "attention_bias", False) + head_size = config.hidden_size // config.num_attention_heads + sizes = None + prefixes = None + + if config.model_type == "phi3": + base_layer = TensorParallelColumnLinear.load_qkv( + config, + prefix=f"{prefix}.qkv_proj", + weights=weights, + bias=bias, + num_heads=config.num_attention_heads, + num_key_value_heads=config.num_key_value_heads, + ) + prefixes = ["qkv_proj"] + elif config.model_type == "baichuan": + prefix = f"{prefix}.W_pack" + base_layer = TensorParallelColumnLinear.load_qkv( + config, + prefix=prefix, + weights=weights, + bias=bias, + num_heads=config.num_attention_heads, + num_key_value_heads=config.num_key_value_heads, + ) + prefixes = [prefix] + else: + prefixes = ["q_proj", "k_proj", "v_proj"] + sizes = [ + head_size * config.num_attention_heads, + head_size * config.num_key_value_heads, + head_size * config.num_key_value_heads, + ] + base_layer = TensorParallelColumnLinear.load_multi( + config, + prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"], + dim=0, + weights=weights, + bias=bias, + ) + + return TensorParallelMultiAdapterLinear.load( + base_layer=base_layer, + layer_id=layer_id, + layer_names=prefixes, + sizes=sizes, + process_group=weights.process_group, + ) + + +@contextmanager +def no_fp8(weights: Weights): + """De-activate fp8 auto conversion for the duration of this context manager""" + weights_loader = weights.weights_loader + if isinstance(weights_loader, HybridFP8UnquantLoader) and weights_loader.to_fp8: + weights_loader = HybridFP8UnquantLoader( + weights_loader.activation_scale_ub, to_fp8=False + ) + + with weights.use_loader(weights_loader): + yield + + +class FlashLlamaAttention(torch.nn.Module): + def __init__( + self, + index: int, + prefix: str, + config, + weights, + ): + super().__init__() + self.num_heads = config.num_attention_heads + self.hidden_size = config.hidden_size + self.head_size = self.hidden_size // self.num_heads + + # Setting defaults for baichuan custom config which doesn't apply them. 
+ config.rope_theta = getattr(config, "rope_theta", 10000) + config.num_key_value_heads = getattr( + config, "num_key_value_heads", config.num_attention_heads + ) + self.rotary_emb = PositionRotaryEmbedding.static( + config=config, + dim=self.head_size, + base=config.rope_theta, + device=weights.device, + ) + + self.softmax_scale = self.head_size**-0.5 + + if self.num_heads % weights.process_group.size() != 0: + raise ValueError( + f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} " + f"and `num_shards`: {weights.process_group.size()}" + ) + if config.num_key_value_heads % weights.process_group.size() != 0: + raise ValueError( + f"`num_key_value_heads` must be divisible by `num_shards` (got `num_key_value_heads`: {config.num_key_value_heads} " + f"and `num_shards`: {weights.process_group.size()}" + ) + self.num_heads = self.num_heads // weights.process_group.size() + self.num_key_value_heads = ( + config.num_key_value_heads // weights.process_group.size() + ) + + self.query_key_value = load_attention(config, prefix, weights, index) + self.index = index + + o_proj = TensorParallelRowLinear.load( + config, + prefix=f"{prefix}.o_proj", + weights=weights, + bias=False, + ) + + self.o_proj = TensorParallelAdapterRowLinear.load( + o_proj, + index, + "o_proj", + process_group=weights.process_group, + ) + + self.num_groups = self.num_heads // self.num_key_value_heads + self.kv_head_mapping = torch.arange( + 0, self.num_key_value_heads, dtype=torch.int32, device=weights.device + ).repeat_interleave(self.num_groups) + + def forward( + self, + hidden_states, + cos, + sin, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + seqlen, + max_s, + adapter_data, + ): + qkv = self.query_key_value(hidden_states, adapter_data) + query, kv = qkv.split( + [ + self.head_size * self.num_heads, + 2 * self.head_size * self.num_key_value_heads, + ], + dim=1, + ) + query = query.view(-1, self.num_heads, self.head_size) + kv = kv.view(-1, 2, self.num_key_value_heads, self.head_size) + + self.rotary_emb(query, torch.select(kv, dim=1, index=0), cos, sin) + + reshape_and_cache(kv[:, 0], kv[:, 1], kv_cache[0], kv_cache[1], slots) + + # Prefill + if cu_seqlen_prefill is not None: + # flash attention + attn_output = attention( + query, + kv_cache[0] if PREFILL_IN_KV_CACHE else kv[:, 0], + kv_cache[1] if PREFILL_IN_KV_CACHE else kv[:, 1], + seqlen, + block_tables, + self.softmax_scale, + ) + # Decode + else: + attn_output = paged_attention( + query, + kv_cache[0], + kv_cache[1], + self.kv_head_mapping, + self.softmax_scale, + block_tables, + seqlen, + max_s, + ) + + return self.o_proj( + attn_output.view(-1, self.num_heads * self.head_size), adapter_data + ) + + +class Phi3MoE(nn.Module): + def __init__( + self, prefix: str, config, moe_layer_cls: Type[MoELayer], weights: Weights + ): + super().__init__() + + # gating + self.gate = FastLinear.load(config, f"{prefix}.gate", weights, bias=False) + + self.moe = moe_layer_cls( + prefix=f"{prefix}.experts", + n_experts=config.num_local_experts, + n_expert_group=None, + renormalize=True, + topk=config.num_experts_per_tok, + topk_group=None, + weights=weights, + gate_proj_name="w1", + up_proj_name="w3", + down_proj_name="w2", + ) + + self.process_group = weights.process_group + + def forward(self, x, adapter_data) -> torch.Tensor: + # router_logits: (num_tokens, n_experts) + router_logits = self.gate(x) + out = self.moe(x, gating_output=router_logits) + + # Reduce sum + if self.process_group.size() > 1: + torch.distributed.all_reduce(out, 
group=self.process_group) + + return out.view(*x.shape) + + +class LlamaMLP(nn.Module): + def __init__(self, prefix, config, weights, index): + super().__init__() + self.hidden_act = config.hidden_act + self.act = ( + ACT2FN[self.hidden_act] + if "gelu" not in self.hidden_act + else lambda x: torch.nn.functional.gelu( + x, + approximate=( + "tanh" + if self.hidden_act in ["gelu_fast", "gelu_pytorch_tanh"] + else "none" + ), + ) + ) + prefixes = None + sizes = None + + # Fuse gate and up proj + bias = getattr(config, "mlp_bias", False) + if config.model_type == "phi3": + gate_up_proj = TensorParallelColumnLinear.load_gate_up( + config, + prefix=f"{prefix}.gate_up_proj", + weights=weights, + bias=bias, + ) + else: + prefixes = ["gate_proj", "up_proj"] + sizes = [ + config.intermediate_size, + config.intermediate_size, + ] + gate_up_proj = TensorParallelColumnLinear.load_multi( + config, + prefixes=[f"{prefix}.gate_proj", f"{prefix}.up_proj"], + weights=weights, + dim=0, + bias=bias, + ) + + self.gate_up_proj = TensorParallelMultiAdapterLinear.load( + gate_up_proj, + index, + layer_names=prefixes, + sizes=sizes, + process_group=weights.process_group, + ) + + down_proj = TensorParallelRowLinear.load( + config, + prefix=f"{prefix}.down_proj", + weights=weights, + bias=bias, + ) + + self.down_proj = TensorParallelAdapterRowLinear.load( + down_proj, + index, + "down_proj", + process_group=weights.process_group, + ) + + self.intermediate_size = ( + config.intermediate_size // weights.process_group.size() + ) + + # TODO: This is a hotfix to be removed & properly refactored. + self.quantize = config.quantize + + self.hidden_size = config.hidden_size + + def forward(self, hidden_states, adapter_data): + if ( + SYSTEM == "rocm" + and self.hidden_act == "silu" + and hidden_states.dtype == torch.float16 + and hidden_states.shape[0] == 1 + and not self.quantize + and self.hidden_size + != 16384 # TODO: Temporary workaround for `LLMM_Silu` kernel not working with LLama3.1 405B; needs refactoring once fixed. 
+ ): + out = torch.empty( + hidden_states.shape[0], + self.intermediate_size, + dtype=hidden_states.dtype, + device="cuda", + ) + _custom_C.LLMM_Silu( + self.gate_up_proj.base_layer.linear.weight, hidden_states, out, 8 + ) + return self.down_proj(out, adapter_data) + else: + gate_up_states = self.gate_up_proj(hidden_states, adapter_data) + gate_up_states = gate_up_states.view(-1, 2, self.intermediate_size) + return self.down_proj( + self.act(gate_up_states[:, 0]) * gate_up_states[:, 1], adapter_data + ) + + +class FlashLlamaLayer(nn.Module): + def __init__(self, index, prefix, config, weights): + super().__init__() + + with no_fp8(weights): + self.self_attn = FlashLlamaAttention( + index=index, + prefix=f"{prefix}.self_attn", + config=config, + weights=weights, + ) + + if config.model_type == "phimoe": + moe_layer_cls = ( + SparseMoELayer + if SparseMoELayer.is_supported(weights) + else DenseMoELayer + ) + self.dense = Phi3MoE( + f"{prefix}.block_sparse_moe", config, moe_layer_cls, weights + ) + # with moe the layernorms are are not rmsnorms and they have bias + self.input_layernorm = FastLayerNorm.load( + prefix=f"{prefix}.input_layernorm", + weights=weights, + eps=config.rms_norm_eps, + ) + self.post_attention_layernorm = FastLayerNorm.load( + prefix=f"{prefix}.post_attention_layernorm", + weights=weights, + eps=config.rms_norm_eps, + ) + else: + self.dense = LlamaMLP( + prefix=f"{prefix}.mlp", config=config, weights=weights, index=index + ) + self.input_layernorm = FastRMSNorm.load( + prefix=f"{prefix}.input_layernorm", + weights=weights, + eps=config.rms_norm_eps, + ) + self.post_attention_layernorm = FastRMSNorm.load( + prefix=f"{prefix}.post_attention_layernorm", + weights=weights, + eps=config.rms_norm_eps, + ) + + def forward( + self, + hidden_states, + residual, + cos, + sin, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + seqlen, + max_s, + adapter_data, + cross_attention_states, + ): + normed_hidden_states, res = self.input_layernorm(hidden_states, residual) + + # Self Attention + attn_output = self.self_attn( + normed_hidden_states, + cos, + sin, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + seqlen, + max_s, + adapter_data, + ) + + # faster post attention rms norm + normed_attn_res_output, attn_res = self.post_attention_layernorm( + attn_output, res + ) + + mlp_output = self.dense(normed_attn_res_output, adapter_data) + + return mlp_output, attn_res + + +class FlashLlamaModel(torch.nn.Module): + def __init__(self, prefix, config, weights): + super().__init__() + + process_group = weights.process_group + self.tp_rank = process_group.rank() + self.tp_world_size = process_group.size() + + # Skip fp8 quant for first and last layers + self.layers = nn.ModuleList() + self.cross_attention_layers = getattr(config, "cross_attention_layers", []) + with no_fp8(weights): + self.layers.append( + FlashLlamaLayer( + index=0, + prefix=( + "model.layers.0" if not prefix else f"{prefix}.model.layers.0" + ), + config=config, + weights=weights, + ) + ) + + # Skip first and last layers + for layer_id in range(1, config.num_hidden_layers - 1): + if layer_id in self.cross_attention_layers: + from text_generation_server.models.custom_modeling.mllama import ( + FlashLlamaCrossLayer, + ) + + self.layers.append( + FlashLlamaCrossLayer( + index=layer_id, + prefix=( + f"model.layers.{layer_id}" + if not prefix + else f"{prefix}.model.layers.{layer_id}" + ), + config=config, + weights=weights, + ) + ) + else: + self.layers.append( + FlashLlamaLayer( + index=layer_id, + prefix=( + 
f"model.layers.{layer_id}" + if not prefix + else f"{prefix}.model.layers.{layer_id}" + ), + config=config, + weights=weights, + ) + ) + + with no_fp8(weights): + last_layer_id = config.num_hidden_layers - 1 + self.layers.append( + FlashLlamaLayer( + index=last_layer_id, + prefix=( + f"model.layers.{last_layer_id}" + if not prefix + else f"{prefix}.model.layers.{last_layer_id}" + ), + config=config, + weights=weights, + ) + ) + + self.norm = FastRMSNorm.load( + prefix="model.norm" if not prefix else f"{prefix}.model.norm", + weights=weights, + eps=config.rms_norm_eps, + ) + + self.gradient_checkpointing = False + + self.head_size = self.layers[0].self_attn.head_size + self.num_heads = self.layers[0].self_attn.num_heads + self.num_key_value_heads = self.layers[0].self_attn.num_key_value_heads + + def forward( + self, + inputs_embeds: torch.Tensor, + position_ids: torch.Tensor, + cu_seqlen_prefill: Optional[torch.Tensor], + kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], + block_tables: torch.Tensor, + slots: torch.Tensor, + seqlen: Seqlen, + max_s: int, + true_max_s: int, + prefill_cache_indices: Optional[torch.Tensor], + adapter_data, + cross_attention_states=None, + ) -> torch.Tensor: + hidden_states = inputs_embeds + + # Get rotary cos and sin for this forward + # Avoid to index in each layer + cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin( + position_ids, max_s, hidden_states.dtype + ) + + residual = None + for i, layer in enumerate(self.layers): + hidden_states, residual = layer( + hidden_states, + residual, + cos, + sin, + cu_seqlen_prefill, + kv_cache[i], + block_tables, + slots, + seqlen, + max_s, + adapter_data, + cross_attention_states, + ) + + hidden_states, _ = self.norm(hidden_states, residual) + + return hidden_states + + +class FlashLlamaForCausalLM(torch.nn.Module): + def __init__(self, prefix: str, config, weights): + super().__init__() + + with no_fp8(weights): + self.embed_tokens = TensorParallelEmbedding( + prefix=( + "model.embed_tokens" + if not prefix + else f"{prefix}.model.embed_tokens" + ), + weights=weights, + ) + self.model = FlashLlamaModel(prefix, config, weights) + if config.tie_word_embeddings: + suffix = "model.embed_tokens" + else: + suffix = "lm_head" + + with no_fp8(weights): + self.lm_head = SpeculativeHead.load( + config, + prefix=suffix if not prefix else f"{prefix}.{suffix}", + weights=weights, + ) + + def forward( + self, + input_ids: torch.Tensor, + position_ids: torch.Tensor, + cu_seqlen_prefill: Optional[torch.Tensor], + kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], + block_tables: torch.Tensor, + slots: torch.Tensor, + seqlen: Seqlen, + max_s: int, + prefill_cache_indices: Optional[torch.Tensor] = None, + lm_head_indices: Optional[torch.Tensor] = None, + adapter_data: Optional[torch.Tensor] = None, + cross_attention_states=None, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + inputs_embeds = self.embed_tokens(input_ids) + hidden_states = self.model( + inputs_embeds, + position_ids, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + seqlen, + max_s, + true_max_s=max_s, + prefill_cache_indices=prefill_cache_indices, + adapter_data=adapter_data, + cross_attention_states=cross_attention_states, + ) + if lm_head_indices is not None: + hidden_states = hidden_states[lm_head_indices] + logits, speculative_logits = self.lm_head(hidden_states) + return logits, speculative_logits diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py 
b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py new file mode 100644 index 000000000..341a23524 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py @@ -0,0 +1,535 @@ +# coding=utf-8 +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import torch.distributed + +from torch import nn +from transformers.activations import ACT2FN +from transformers.configuration_utils import PretrainedConfig +from typing import Optional, List, Tuple + +from text_generation_server.utils.import_utils import SYSTEM +from text_generation_server.layers.attention import ( + paged_attention, + attention, + reshape_and_cache, + Seqlen, +) +from text_generation_server.layers import ( + TensorParallelRowLinear, + TensorParallelColumnLinear, + TensorParallelEmbedding, + SpeculativeHead, + TensorParallelMultiAdapterLinear, + TensorParallelAdapterRowLinear, +) +from text_generation_server.layers.attention import PREFILL_IN_KV_CACHE +from text_generation_server.layers.rotary import PositionRotaryEmbedding +from text_generation_server.layers.layernorm import ( + FastRMSNorm, +) + + +if SYSTEM == "rocm": + try: + from vllm import _custom_C + except Exception as e: + raise ImportError(f"Could not load `vllm._custom_C`. 
Full error: {e}") + + +class MistralConfig(PretrainedConfig): + model_type = "mistral" + + def __init__( + self, + vocab_size=32000, + hidden_size=4096, + intermediate_size=14336, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=8, + hidden_act="silu", + max_position_embeddings=4096 * 32, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + pad_token_id=None, + bos_token_id=1, + eos_token_id=2, + pretraining_tp=1, + tie_word_embeddings=False, + rope_theta=10000.0, + sliding_window=None, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.sliding_window = sliding_window + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.pretraining_tp = pretraining_tp + self.use_cache = use_cache + self.rope_theta = rope_theta + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + +class MistralAttention(torch.nn.Module): + def __init__(self, prefix: str, config, weights, layer_id): + super().__init__() + self.max_past = ( + config.sliding_window if config.sliding_window is not None else -1 + ) + self.num_heads = config.num_attention_heads + self.hidden_size = config.hidden_size + if hasattr(config, "head_dim"): + self.head_size = config.head_dim + else: + self.head_size = self.hidden_size // self.num_heads + + self.rotary_emb = PositionRotaryEmbedding.static( + config=config, + dim=self.head_size, + base=config.rope_theta, + device=weights.device, + ) + + self.softmax_scale = self.head_size**-0.5 + + if self.num_heads % weights.process_group.size() != 0: + raise ValueError( + f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} " + f"and `num_shards`: {weights.process_group.size()}" + ) + self.num_heads = self.num_heads // weights.process_group.size() + self.num_key_value_heads = ( + config.num_key_value_heads // weights.process_group.size() + ) + + query_key_value = TensorParallelColumnLinear.load_multi( + config, + prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"], + dim=0, + weights=weights, + bias=False, + ) + + self.query_key_value = TensorParallelMultiAdapterLinear.load( + query_key_value, + layer_id, + ["q_proj", "k_proj", "v_proj"], + sizes=[ + self.head_size * config.num_attention_heads, + self.head_size * config.num_key_value_heads, + self.head_size * config.num_key_value_heads, + ], + process_group=weights.process_group, + ) + + o_proj = TensorParallelRowLinear.load( + config, + prefix=f"{prefix}.o_proj", + weights=weights, + bias=False, + ) + self.o_proj = TensorParallelAdapterRowLinear.load( + o_proj, + layer_id, + "o_proj", + process_group=weights.process_group, + ) + self.num_groups = self.num_heads // self.num_key_value_heads + self.kv_head_mapping = torch.arange( + 0, self.num_key_value_heads, dtype=torch.int32, device=weights.device + ).repeat_interleave(self.num_groups) + + def forward( + self, + hidden_states, + cos, + sin, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + seqlen, + max_s, + prefill_cache_indices, + 
adapter_data, + ): + qkv = self.query_key_value(hidden_states, adapter_data) + query, kv = qkv.split( + [ + self.head_size * self.num_heads, + 2 * self.head_size * self.num_key_value_heads, + ], + dim=1, + ) + query = query.view(-1, self.num_heads, self.head_size) + kv = kv.view(-1, 2, self.num_key_value_heads, self.head_size) + + self.rotary_emb(query, torch.select(kv, dim=1, index=0), cos, sin) + + if prefill_cache_indices is not None: + kv_to_cache = kv[prefill_cache_indices] + else: + kv_to_cache = kv + + reshape_and_cache( + kv_to_cache[:, 0], kv_to_cache[:, 1], kv_cache[0], kv_cache[1], slots + ) + + # Prefill + if cu_seqlen_prefill is not None: + # flash attention + attn_output = attention( + query, + kv_cache[0] if PREFILL_IN_KV_CACHE else kv_to_cache[:, 0], + kv_cache[1] if PREFILL_IN_KV_CACHE else kv_to_cache[:, 1], + seqlen, + block_tables, + self.softmax_scale, + window_size_left=self.max_past, + ) + # Decode + else: + attn_output = paged_attention( + query, + kv_cache[0], + kv_cache[1], + self.kv_head_mapping, + self.softmax_scale, + block_tables, + seqlen, + max_s, + ) + + return self.o_proj( + attn_output.view(-1, self.num_heads * self.head_size), adapter_data + ) + + +class MistralMLP(nn.Module): + def __init__(self, prefix: str, config, weights, layer_id): + super().__init__() + self.hidden_act = config.hidden_act + self.act = ( + ACT2FN[self.hidden_act] + if "gelu" not in self.hidden_act + else lambda x: torch.nn.functional.gelu( + x, + approximate=( + "tanh" + if self.hidden_act in ["gelu_fast", "gelu_pytorch_tanh"] + else "none" + ), + ) + ) + # Fuse gate and up proj + gate_up_proj = TensorParallelColumnLinear.load_multi( + config, + prefixes=[f"{prefix}.gate_proj", f"{prefix}.up_proj"], + weights=weights, + dim=0, + bias=False, + ) + self.gate_up_proj = TensorParallelMultiAdapterLinear.load( + gate_up_proj, + layer_id, + ["gate_proj", "up_proj"], + sizes=[ + config.intermediate_size, + config.intermediate_size, + ], + process_group=weights.process_group, + ) + + down_proj = TensorParallelRowLinear.load( + config, + prefix=f"{prefix}.down_proj", + weights=weights, + bias=False, + ) + + self.down_proj = TensorParallelAdapterRowLinear.load( + down_proj, + layer_id, + "down_proj", + process_group=weights.process_group, + ) + self.intermediate_size = ( + config.intermediate_size // weights.process_group.size() + ) + + # TODO: This is a hotfix to be removed & properly refactored. 
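+        # self.quantize is read in forward() to decide whether the fused ROCm
+        # LLMM_Silu kernel can be used; it is skipped for quantized weights.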
+ self.quantize = config.quantize + + def forward(self, hidden_states, adapter_data): + if ( + SYSTEM == "rocm" + and self.hidden_act == "silu" + and hidden_states.dtype == torch.float16 + and hidden_states.shape[0] == 1 + and not self.quantize + ): + out = torch.empty( + hidden_states.shape[0], + self.intermediate_size, + dtype=hidden_states.dtype, + device="cuda", + ) + _custom_C.LLMM_Silu( + self.gate_up_proj.base_layer.linear.weight, hidden_states, out, 8 + ) + return self.down_proj(out, adapter_data) + else: + gate_up_states = self.gate_up_proj(hidden_states, adapter_data) + gate_up_states = gate_up_states.view(-1, 2, self.intermediate_size) + return self.down_proj( + self.act(gate_up_states[:, 0]) * gate_up_states[:, 1], adapter_data + ) + + +class MistralLayer(nn.Module): + def __init__(self, prefix: str, config, weights, layer_id): + super().__init__() + self.self_attn = MistralAttention( + prefix=f"{prefix}.self_attn", + config=config, + weights=weights, + layer_id=layer_id, + ) + self.mlp = MistralMLP( + prefix=f"{prefix}.mlp", config=config, weights=weights, layer_id=layer_id + ) + + self.input_layernorm = FastRMSNorm.load( + prefix=f"{prefix}.input_layernorm", weights=weights, eps=config.rms_norm_eps + ) + self.post_attention_layernorm = FastRMSNorm.load( + prefix=f"{prefix}.post_attention_layernorm", + weights=weights, + eps=config.rms_norm_eps, + ) + + def forward( + self, + hidden_states, + residual, + cos, + sin, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + seqlen, + max_s, + prefill_cache_indices, + adapter_data, + ): + normed_hidden_states, res = self.input_layernorm(hidden_states, residual) + + # Self Attention + attn_output = self.self_attn( + normed_hidden_states, + cos, + sin, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + seqlen, + max_s, + prefill_cache_indices, + adapter_data, + ) + + # faster post attention rms norm + normed_attn_res_output, attn_res = self.post_attention_layernorm( + attn_output, res + ) + + mlp_output = self.mlp(normed_attn_res_output, adapter_data) + + return mlp_output, attn_res + + +class MistralModel(torch.nn.Module): + def __init__(self, prefix: str, config, weights): + super().__init__() + + process_group = weights.process_group + self.tp_rank = process_group.rank() + self.tp_world_size = process_group.size() + self.layers = nn.ModuleList( + [ + MistralLayer( + prefix=f"{prefix}.layers.{layer_id}", + config=config, + weights=weights, + layer_id=layer_id, + ) + for layer_id in range(config.num_hidden_layers) + ] + ) + self.norm = FastRMSNorm.load( + prefix=f"{prefix}.norm", weights=weights, eps=config.rms_norm_eps + ) + + self.gradient_checkpointing = False + + self.head_size = self.layers[0].self_attn.head_size + self.num_heads = self.layers[0].self_attn.num_heads + self.num_key_value_heads = self.layers[0].self_attn.num_key_value_heads + + def forward( + self, + inputs_embeds: torch.Tensor, + position_ids: torch.Tensor, + cu_seqlen_prefill: Optional[torch.Tensor], + kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], + block_tables: torch.Tensor, + slots: torch.Tensor, + seqlen: Seqlen, + max_s: int, + true_max_s: int, + prefill_cache_indices: Optional[torch.Tensor], + adapter_data: Optional[torch.Tensor] = None, + ): + hidden_states = inputs_embeds + # Get rotary cos and sin for this forward + # Avoid to index in each layer + cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin( + position_ids, true_max_s, hidden_states.dtype + ) + + residual = None + for i, layer in enumerate(self.layers): + 
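+            # Each decoder layer owns one (key, value) cache pair, indexed here as kv_cache[i].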
hidden_states, residual = layer( + hidden_states, + residual, + cos, + sin, + cu_seqlen_prefill, + kv_cache[i], + block_tables, + slots, + seqlen, + max_s, + prefill_cache_indices, + adapter_data, + ) + + hidden_states, _ = self.norm(hidden_states, residual) + return hidden_states + + +class FlashMistralForCausalLM(torch.nn.Module): + def __init__(self, prefix: str, config, weights, name=None): + if name is None: + name = "model" + super().__init__() + self.embed_tokens = TensorParallelEmbedding( + prefix=( + f"{name}.embed_tokens" + if not prefix + else f"{prefix}.{name}.embed_tokens" + ), + weights=weights, + ) + self.model = MistralModel( + prefix=name if not prefix else f"{prefix}.{name}", + config=config, + weights=weights, + ) + self.lm_head = SpeculativeHead.load( + config, + # TODO dirty hack for idefics2. + prefix=( + "lm_head" if not prefix or name != "model" else f"{prefix}.lm_head" + ), + weights=weights, + ) + self.max_past = config.sliding_window + self.max_past_tensor = ( + torch.tensor(config.sliding_window, device=weights.device) + if self.max_past is not None + else None + ) + + def forward( + self, + input_ids: torch.Tensor, + position_ids: torch.Tensor, + cu_seqlen_prefill: Optional[torch.Tensor], + kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], + block_tables: torch.Tensor, + slots: torch.Tensor, + seqlen: Seqlen, + max_s: int, + prefill_cache_indices: Optional[torch.Tensor], + lm_head_indices: Optional[torch.Tensor] = None, + adapter_data: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + true_max_s = max_s + if prefill_cache_indices is not None: + # Slots also need to be sliced as it has the same size as the whole kv tensor + slots = slots[prefill_cache_indices] + elif self.max_past is not None: + # Clamp in decode mode as paged attention requires clamped values whereas the flash attention + # kernel requires the true values + seqlen = seqlen.clamp(max=self.max_past_tensor) + + inputs_embeds = self.embed_tokens(input_ids) + hidden_states = self.model( + inputs_embeds, + position_ids, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + seqlen, + max_s, + true_max_s, + prefill_cache_indices, + adapter_data, + ) + if lm_head_indices is not None: + hidden_states = hidden_states[lm_head_indices] + logits = self.lm_head(hidden_states) + return logits diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py new file mode 100644 index 000000000..5836d30af --- /dev/null +++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py @@ -0,0 +1,542 @@ +# coding=utf-8 +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List, Optional, Tuple, Type + +import torch +import torch.distributed +from torch import nn +from transformers.configuration_utils import PretrainedConfig + +from text_generation_server.layers import ( + FastLinear, + SpeculativeHead, + TensorParallelColumnLinear, + TensorParallelEmbedding, + TensorParallelRowLinear, + get_linear, +) +from text_generation_server.layers.attention import ( + Seqlen, + attention, + paged_attention, + reshape_and_cache, +) +from text_generation_server.layers.attention import PREFILL_IN_KV_CACHE +from text_generation_server.layers.layernorm import FastRMSNorm +from text_generation_server.layers.moe import DenseMoELayer, MoELayer, SparseMoELayer +from text_generation_server.layers.rotary import PositionRotaryEmbedding +from text_generation_server.utils.weights import UnquantizedWeight + + +class MixtralConfig(PretrainedConfig): + model_type = "mixtral" + + def __init__( + self, + vocab_size=32000, + hidden_size=4096, + intermediate_size=14336, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=8, + hidden_act="silu", + max_position_embeddings=4096 * 32, + initializer_range=0.02, + rms_norm_eps=1e-05, + use_cache=True, + pad_token_id=None, + bos_token_id=1, + eos_token_id=2, + pretraining_tp=1, + tie_word_embeddings=False, + rope_theta=10000.0, + sliding_window=None, + num_experts_per_tok=2, + num_local_experts=8, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.sliding_window = sliding_window + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.pretraining_tp = pretraining_tp + self.use_cache = use_cache + self.rope_theta = rope_theta + self.num_experts_per_tok = num_experts_per_tok + self.num_local_experts = num_local_experts + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + +def promote_scalar(x: torch.Tensor) -> torch.Tensor: + return x.view(1) if len(x.size()) == 0 else x + + +def load_attention(config, prefix: str, weights): + if config.num_attention_heads != config.num_key_value_heads: + return _load_gqa(config, prefix, weights) + else: + return TensorParallelColumnLinear.load_multi( + config, + prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"], + dim=0, + weights=weights, + bias=False, + ) + + +def _load_gqa(config, prefix: str, weights): + assert config.hidden_size % config.num_attention_heads == 0 + assert config.num_attention_heads % weights.process_group.size() == 0 + + weight = weights.get_multi_weights_col( + prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"], + dim=0, + ) + + if isinstance(weight, UnquantizedWeight): + weight.weight = weight.weight.to(dtype=weights.dtype).to(device=weights.device) + + head_size = config.hidden_size // config.num_attention_heads + num_heads = config.num_attention_heads // weights.process_group.size() + num_key_value_heads = config.num_key_value_heads // 
weights.process_group.size() + assert list(weight.weight.shape) == [ + (num_heads + 2 * num_key_value_heads) * head_size, + config.hidden_size, + ], f"{list(weight.weight.shape)} != {[(num_heads + 2 * config.num_key_value_heads) * head_size, config.hidden_size]}" + + return TensorParallelColumnLinear(get_linear(weight, bias=None)) + + +def _load_experts(config, prefix: str, mat, weights): + if config.quantize is not None: + raise NotImplementedError("Mixtral does not support weight quantization yet.") + + assert mat in ["w1", "w2", "w3"] + + world_size = weights.process_group.size() + rank = weights.process_group.rank() + + assert ( + config.intermediate_size % world_size == 0 + ), f"The chosen size {config.intermediate_size} is not compatible with sharding on {world_size} shards" + + block_size = config.intermediate_size // world_size + start = rank * block_size + stop = (rank + 1) * block_size + + tensor = torch.empty( + (config.num_local_experts * block_size, config.hidden_size), + dtype=weights.dtype, + device=weights.device, + ) + + for i in range(config.num_local_experts): + slice_ = weights._get_slice(f"{prefix}.{i}.{mat}.weight") + + if mat == "w2": + expert_slice = slice_[:, start:stop].t().contiguous() + else: + expert_slice = slice_[start:stop] + tensor[i * block_size : (i + 1) * block_size] = expert_slice.to( + dtype=weights.dtype + ).to(device=weights.device) + return tensor + + +class MixtralAttention(torch.nn.Module): + def __init__( + self, + prefix: str, + config, + weights, + ): + super().__init__() + self.max_past = ( + config.sliding_window if config.sliding_window is not None else -1 + ) + self.num_heads = config.num_attention_heads + self.hidden_size = config.hidden_size + self.head_size = self.hidden_size // self.num_heads + + self.rotary_emb = PositionRotaryEmbedding.static( + config=config, + dim=self.head_size, + base=config.rope_theta, + device=weights.device, + ) + + self.softmax_scale = self.head_size**-0.5 + + if self.num_heads % weights.process_group.size() != 0: + raise ValueError( + f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} " + f"and `num_shards`: {weights.process_group.size()}" + ) + self.num_heads = self.num_heads // weights.process_group.size() + self.num_key_value_heads = ( + config.num_key_value_heads // weights.process_group.size() + ) + + self.query_key_value = load_attention(config, prefix, weights) + + self.o_proj = TensorParallelRowLinear.load( + config, + prefix=f"{prefix}.o_proj", + weights=weights, + bias=False, + ) + self.num_groups = self.num_heads // self.num_key_value_heads + self.kv_head_mapping = torch.arange( + 0, self.num_key_value_heads, dtype=torch.int32, device=weights.device + ).repeat_interleave(self.num_groups) + + def forward( + self, + hidden_states, + cos, + sin, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + seqlen, + max_s, + prefill_cache_indices, + ): + qkv = self.query_key_value(hidden_states) + query, kv = qkv.split( + [ + self.head_size * self.num_heads, + 2 * self.head_size * self.num_key_value_heads, + ], + dim=1, + ) + query = query.view(-1, self.num_heads, self.head_size) + kv = kv.view(-1, 2, self.num_key_value_heads, self.head_size) + + self.rotary_emb(query, torch.select(kv, dim=1, index=0), cos, sin) + + if prefill_cache_indices is not None: + kv_to_cache = kv[prefill_cache_indices] + else: + kv_to_cache = kv + + reshape_and_cache( + kv_to_cache[:, 0], kv_to_cache[:, 1], kv_cache[0], kv_cache[1], slots + ) + + # Prefill + if cu_seqlen_prefill is not None: + # 
flash attention + attn_output = attention( + query, + kv_cache[0] if PREFILL_IN_KV_CACHE else kv_to_cache[:, 0], + kv_cache[1] if PREFILL_IN_KV_CACHE else kv_to_cache[:, 1], + seqlen, + block_tables, + self.softmax_scale, + window_size_left=self.max_past, + ) + # Decode + else: + attn_output = paged_attention( + query, + kv_cache[0], + kv_cache[1], + self.kv_head_mapping, + self.softmax_scale, + block_tables, + seqlen, + max_s, + ) + + return self.o_proj(attn_output.view(-1, self.num_heads * self.head_size)) + + +@torch.jit.script +def select_experts(gate_logits: torch.Tensor, top_k: int): + # all_probs: (sequence_length, n_experts) and upcast for softmax + all_probs = torch.nn.functional.softmax(gate_logits, dim=1, dtype=torch.float) + # weights, selected_experts: (sequence_length, top-k) + weights, selected_experts = torch.topk(all_probs, top_k, dim=-1) + weights /= weights.sum(dim=-1, keepdim=True) + weights = weights.view(-1) + selected_experts = selected_experts.view(-1) + + return selected_experts, weights + + +@torch.jit.script +def round_up(x: torch.Tensor, value: int): + return torch.div(x + (value - 1), value, rounding_mode="trunc") * value + + +class MixtralMoE(nn.Module): + def __init__( + self, prefix, config: MixtralConfig, moe_layer_cls: Type[MoELayer], weights + ): + super().__init__() + + # gating + self.gate = FastLinear.load(config, f"{prefix}.gate", weights, bias=False) + + self.moe = moe_layer_cls( + n_expert_group=None, + n_experts=config.num_local_experts, + prefix=f"{prefix}.experts", + renormalize=True, + topk=config.num_experts_per_tok, + topk_group=None, + weights=weights, + gate_proj_name="w1", + up_proj_name="w3", + down_proj_name="w2", + ) + assert isinstance(self.moe, MoELayer) + + self.process_group = weights.process_group + + def forward(self, x: torch.Tensor) -> torch.Tensor: + # router_logits: (num_tokens, n_experts) + router_logits = self.gate(x) + out = self.moe(x, gating_output=router_logits) + + # Reduce sum + if self.process_group.size() > 1: + torch.distributed.all_reduce(out, group=self.process_group) + + return out.view(*x.shape) + + +class MixtralLayer(nn.Module): + def __init__(self, prefix: str, layer_id, config, weights): + super().__init__() + prefix = f"{prefix}.layers.{layer_id}" + + self.self_attn = MixtralAttention( + prefix=f"{prefix}.self_attn", config=config, weights=weights + ) + + moe_layer_cls = ( + SparseMoELayer if SparseMoELayer.is_supported(weights) else DenseMoELayer + ) + self.moe = MixtralMoE( + f"{prefix}.block_sparse_moe", config, moe_layer_cls, weights + ) + + self.input_layernorm = FastRMSNorm.load( + prefix=f"{prefix}.input_layernorm", weights=weights, eps=config.rms_norm_eps + ) + self.post_attention_layernorm = FastRMSNorm.load( + prefix=f"{prefix}.post_attention_layernorm", + weights=weights, + eps=config.rms_norm_eps, + ) + + def forward( + self, + hidden_states, + residual, + cos, + sin, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + seqlen, + max_s, + prefill_cache_indices, + ): + normed_hidden_states, res = self.input_layernorm(hidden_states, residual) + + # Self Attention + attn_output = self.self_attn( + normed_hidden_states, + cos, + sin, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + seqlen, + max_s, + prefill_cache_indices, + ) + + # faster post attention rms norm + normed_attn_res_output, attn_res = self.post_attention_layernorm( + attn_output, res + ) + + moe_output = self.moe(normed_attn_res_output) + + return moe_output, attn_res + + +class MixtralModel(torch.nn.Module): + def 
__init__(self, prefix: str, config, weights): + super().__init__() + + self.embed_tokens = TensorParallelEmbedding( + prefix=( + "model.embed_tokens" if not prefix else f"{prefix}.model.embed_tokens" + ), + weights=weights, + ) + + self.layers = nn.ModuleList( + [ + MixtralLayer( + "model" if not prefix else f"{prefix}.model", + layer_id, + config, + weights, + ) + for layer_id in range(config.num_hidden_layers) + ] + ) + self.norm = FastRMSNorm.load( + prefix="model.norm" if not prefix else f"{prefix}.model.norm", + weights=weights, + eps=config.rms_norm_eps, + ) + + self.head_size = self.layers[0].self_attn.head_size + self.num_heads = self.layers[0].self_attn.num_heads + self.num_key_value_heads = self.layers[0].self_attn.num_key_value_heads + + def forward( + self, + input_ids: torch.Tensor, + position_ids: torch.Tensor, + cu_seqlen_prefill: Optional[torch.Tensor], + kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], + block_tables: torch.Tensor, + slots: torch.Tensor, + seqlen: Seqlen, + max_s: int, + true_max_s: int, + prefill_cache_indices: Optional[torch.Tensor], + ) -> torch.Tensor: + hidden_states = self.embed_tokens(input_ids) + + # Get rotary cos and sin for this forward + # Avoid to index in each layer + cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin( + position_ids, true_max_s, hidden_states.dtype + ) + + residual = None + for i, layer in enumerate(self.layers): + hidden_states, residual = layer( + hidden_states, + residual, + cos, + sin, + cu_seqlen_prefill, + kv_cache[i], + block_tables, + slots, + seqlen, + max_s, + prefill_cache_indices, + ) + + hidden_states, _ = self.norm(hidden_states, residual) + + return hidden_states + + +class FlashMixtralForCausalLM(torch.nn.Module): + def __init__(self, prefix: str, config, weights): + super().__init__() + + self.model = MixtralModel(prefix, config, weights) + self.lm_head = SpeculativeHead.load( + config, + prefix="lm_head" if not prefix else f"{prefix}.lm_head", + weights=weights, + ) + self.max_past = config.sliding_window + self.max_past_tensor = ( + torch.tensor(config.sliding_window, device=weights.device) + if self.max_past is not None + else None + ) + + def forward( + self, + input_ids: torch.Tensor, + position_ids: torch.Tensor, + cu_seqlen_prefill: Optional[torch.Tensor], + kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], + block_tables: torch.Tensor, + slots: torch.Tensor, + seqlen: Seqlen, + max_s: int, + prefill_cache_indices: Optional[torch.Tensor], + lm_head_indices: Optional[torch.Tensor] = None, + adapter_data: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + true_max_s = max_s + if prefill_cache_indices is not None: + # Slots also need to be sliced as it has the same size as the whole kv tensor + slots = slots[prefill_cache_indices] + elif self.max_past is not None: + # Clamp in decode mode as paged attention requires clamped values whereas the flash attention + # kernel requires the true values + seqlen = seqlen.clamp(max=self.max_past_tensor) + + hidden_states = self.model( + input_ids, + position_ids, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + seqlen, + max_s, + true_max_s, + prefill_cache_indices, + ) + if lm_head_indices is not None: + hidden_states = hidden_states[lm_head_indices] + logits = self.lm_head(hidden_states) + return logits diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_neox_modeling.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_neox_modeling.py new file mode 100644 index 
000000000..ad4e382fe --- /dev/null +++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_neox_modeling.py @@ -0,0 +1,425 @@ +# coding=utf-8 +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import torch.distributed + +from torch import nn +from transformers.activations import ACT2FN +from transformers.modeling_utils import PreTrainedModel +from transformers.models.gpt_neox import GPTNeoXConfig as TransformersGPTNeoXConfig +from typing import Optional, List, Tuple +from text_generation_server.layers.attention import ( + paged_attention, + attention, + reshape_and_cache, + Seqlen, +) +from text_generation_server.layers import ( + TensorParallelRowLinear, + TensorParallelColumnLinear, + TensorParallelEmbedding, + SpeculativeHead, + get_linear, +) +from text_generation_server.layers.attention import PREFILL_IN_KV_CACHE +from text_generation_server.layers.layernorm import ( + FastLayerNorm, +) +from text_generation_server.layers.rotary import ( + PositionRotaryEmbedding, +) +from text_generation_server.utils.weights import UnquantizedWeight + + +class GPTNeoXConfig(TransformersGPTNeoXConfig): + attribute_map = { + "num_key_value_heads": "num_attention_heads", + } + + +def load_row(config, prefix: str, weights, bias: bool): + weight = weights.get_weights_row(prefix) + + if bias and weights.process_group.rank() == 0: + # Rank is only on the first rank process + bias = weights.get_tensor(f"{prefix}.bias") + else: + bias = None + + linear = get_linear(weight, bias) + if config.use_parallel_residual: + return linear + else: + return TensorParallelRowLinear(linear, process_group=weights.process_group) + + +def load_qkv(config, prefix: str, weights, num_heads, head_size, hidden_size): + weight = weights.get_multi_weights_col([prefix], dim=0) + if isinstance(weight, UnquantizedWeight): + # Only on non quantized versions + weight.weight = ( + weight.weight.view( + num_heads, + 3, + head_size, + hidden_size, + ) + .permute(1, 0, 2, 3) + .reshape(-1, hidden_size) + ) + + bias = weights.get_sharded(f"{prefix}.bias", dim=0) + bias = bias.view(num_heads, 3, head_size).permute(1, 0, 2).reshape(-1) + + linear = get_linear(weight, bias) + if config.use_parallel_residual: + return linear + else: + return TensorParallelColumnLinear(linear) + + +class FlashNeoxAttention(torch.nn.Module): + def __init__(self, config, prefix, weights): + super().__init__() + num_heads = config.num_attention_heads + hidden_size = config.hidden_size + + self.num_heads = num_heads + self.hidden_size = hidden_size + self.head_size = hidden_size // num_heads + + self.rotary_dim = int(config.rotary_pct * self.head_size) + + if 
self.num_heads % weights.process_group.size() != 0: + raise ValueError( + f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} " + f"and `num_shards`: {weights.process_group.size()}" + ) + self.num_heads = self.num_heads // weights.process_group.size() + + self.rotary_emb = PositionRotaryEmbedding.static( + config=config, + dim=self.rotary_dim, + base=config.rotary_emb_base, + device=weights.device, + ) + + self.softmax_scale = self.head_size ** (-0.5) + + self.query_key_value = load_qkv( + config, + prefix=f"{prefix}.query_key_value", + weights=weights, + num_heads=self.num_heads, + head_size=self.head_size, + hidden_size=self.hidden_size, + ) + self.dense = load_row( + config, prefix=f"{prefix}.dense", weights=weights, bias=True + ) + self.kv_head_mapping = torch.arange( + 0, self.num_heads, dtype=torch.int32, device=weights.device + ) + + def forward( + self, + hidden_states, + cos, + sin, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + seqlen, + max_s, + ): + qkv = self.query_key_value(hidden_states) + qkv = qkv.view(-1, 3, self.num_heads, self.head_size) + + # Compute rotary embeddings on rotary_ndims + query_rot = qkv[:, 0][..., : self.rotary_dim] + query_pass = qkv[:, 0][..., self.rotary_dim :] + key_rot = qkv[:, 1][..., : self.rotary_dim] + key_pass = qkv[:, 1][..., self.rotary_dim :] + + # Inplace rotary + self.rotary_emb(query_rot, key_rot, cos, sin) + qkv[:, 0] = torch.cat((query_rot, query_pass), dim=-1) + qkv[:, 1] = torch.cat((key_rot, key_pass), dim=-1) + + reshape_and_cache(qkv[:, 1], qkv[:, 2], kv_cache[0], kv_cache[1], slots) + + # Prefill + if cu_seqlen_prefill is not None: + # flash attention + attn_output = attention( + qkv[:, 0], + kv_cache[0] if PREFILL_IN_KV_CACHE else qkv[:, 1], + kv_cache[1] if PREFILL_IN_KV_CACHE else qkv[:, 2], + seqlen, + block_tables, + self.softmax_scale, + ) + # Decode + else: + attn_output = paged_attention( + qkv[:, 0], + kv_cache[0], + kv_cache[1], + self.kv_head_mapping, + self.softmax_scale, + block_tables, + seqlen, + max_s, + ) + + return self.dense(attn_output.view(-1, self.num_heads * self.head_size)) + + +class FlashMLP(nn.Module): + def __init__(self, config, prefix, weights): + super().__init__() + act = config.hidden_act + self.act = ( + ACT2FN[act] + if "gelu" not in act + else lambda x: torch.nn.functional.gelu( + x, + approximate=( + "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none" + ), + ) + ) + + self.dense_h_to_4h = TensorParallelColumnLinear.load( + config, prefix=f"{prefix}.dense_h_to_4h", weights=weights, bias=True + ) + self.dense_4h_to_h = load_row( + config, prefix=f"{prefix}.dense_4h_to_h", weights=weights, bias=True + ) + + def forward(self, hidden_states): + hidden_states = self.dense_h_to_4h(hidden_states) + hidden_states = self.act(hidden_states) + hidden_states = self.dense_4h_to_h(hidden_states) + return hidden_states + + +class FlashNeoXLayer(nn.Module): + def __init__(self, layer_id, config, weights): + super().__init__() + + layer_norm_eps = config.layer_norm_eps + + prefix = f"gpt_neox.layers.{layer_id}" + + self.use_parallel_residual = config.use_parallel_residual + self.input_layernorm = FastLayerNorm.load( + prefix=f"{prefix}.input_layernorm", weights=weights, eps=layer_norm_eps + ) + self.post_attention_layernorm = FastLayerNorm.load( + prefix=f"{prefix}.post_attention_layernorm", + weights=weights, + eps=layer_norm_eps, + ) + self.attention = FlashNeoxAttention( + config, prefix=f"{prefix}.attention", weights=weights + ) + + self.mlp = 
FlashMLP(config, prefix=f"{prefix}.mlp", weights=weights) + self.process_group = weights.process_group + + def forward( + self, + hidden_states, + residual, + cos, + sin, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + seqlen, + max_s, + ): + if self.use_parallel_residual: + ln1_hidden_states, _ = self.input_layernorm(hidden_states) + + attn_output = self.attention( + ln1_hidden_states, + cos, + sin, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + seqlen, + max_s, + ) + + ln2_hidden_states, _ = self.post_attention_layernorm(hidden_states) + + mlp_output = self.mlp(ln2_hidden_states) + intermediate = mlp_output + attn_output + + if self.process_group.size() > 1: + torch.distributed.all_reduce(intermediate, group=self.process_group) + + return intermediate + hidden_states, None + else: + hidden_states, residual = self.input_layernorm(hidden_states, residual) + + hidden_states = self.attention( + hidden_states, + cos, + sin, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + seqlen, + max_s, + ) + + hidden_states, residual = self.post_attention_layernorm( + hidden_states, residual + ) + + mlp_output = self.mlp(hidden_states) + + return mlp_output, residual + + +class FlashGPTNeoXPreTrainedModel(PreTrainedModel): + config_class = GPTNeoXConfig + base_model_prefix = "gpt_neox" + supports_gradient_checkpointing = False + _no_split_modules = None + + +class FlashGPTNeoXModel(FlashGPTNeoXPreTrainedModel): + def __init__(self, prefix: str, config, weights): + super().__init__(config) + self.config = config + + self.embed_in = TensorParallelEmbedding( + prefix=f"{prefix}.embed_in", weights=weights + ) + + self.layers = nn.ModuleList( + [ + FlashNeoXLayer(layer_id, config, weights) + for layer_id in range(config.num_hidden_layers) + ] + ) + self.final_layer_norm = FastLayerNorm.load( + prefix=f"{prefix}.final_layer_norm", + weights=weights, + eps=config.layer_norm_eps, + ) + + self.gradient_checkpointing = False + + self.head_size = self.layers[0].attention.head_size + self.num_heads = self.layers[0].attention.num_heads + + def forward( + self, + input_ids: torch.Tensor, + position_ids: torch.Tensor, + cu_seqlen_prefill: Optional[torch.Tensor], + kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], + block_tables: torch.Tensor, + slots: torch.Tensor, + seqlen: Seqlen, + max_s: int, + ) -> torch.Tensor: + hidden_states = self.embed_in(input_ids) + + # Get rotary cos and sin for this forward + # Avoid to index in each layer + cos, sin = self.layers[0].attention.rotary_emb.get_cos_sin( + position_ids, max_s, hidden_states.dtype + ) + + residual = None + for i, layer in enumerate(self.layers): + hidden_states, residual = layer( + hidden_states, + residual, + cos, + sin, + cu_seqlen_prefill, + kv_cache[i], + block_tables, + slots, + seqlen, + max_s, + ) + + hidden_states, _ = self.final_layer_norm(hidden_states, residual) + + return hidden_states + + +class FlashGPTNeoXForCausalLM(FlashGPTNeoXPreTrainedModel): + def __init__(self, prefix, config, weights): + super().__init__(config) + + if not prefix: + prefix = "gpt_neox" + else: + prefix = f"{prefix}.gpt_neox" + + self.gpt_neox = FlashGPTNeoXModel(prefix, config, weights) + + self.embed_out = SpeculativeHead.load( + config, prefix="embed_out", weights=weights + ) + + def forward( + self, + input_ids: torch.Tensor, + position_ids: torch.Tensor, + cu_seqlen_prefill: Optional[torch.Tensor], + kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], + block_tables: torch.Tensor, + slots: torch.Tensor, + seqlen: Seqlen, + max_s: int, 
+ prefill_cache_indices: Optional[torch.Tensor], + lm_head_indices: Optional[torch.Tensor] = None, + adapter_data: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + hidden_states = self.gpt_neox( + input_ids, + position_ids, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + seqlen, + max_s, + ) + if lm_head_indices is not None: + hidden_states = hidden_states[lm_head_indices] + logits = self.embed_out(hidden_states) + return logits diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_pali_gemma_modeling.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_pali_gemma_modeling.py new file mode 100644 index 000000000..0024f2bb9 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_pali_gemma_modeling.py @@ -0,0 +1,119 @@ +# coding=utf-8 +# Copyright 2024 HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import torch.distributed +from torch import nn +from typing import Optional, List, Tuple + +from text_generation_server.layers.tensor_parallel import TensorParallelColumnLinear +from text_generation_server.layers.attention import Seqlen +from text_generation_server.models.custom_modeling.vlm import ( + load_text_model, + load_vision_model, +) + + +class PaliGemmaForConditionalGeneration(nn.Module): + def __init__(self, prefix, config, weights): + super().__init__() + config.vision_config.quantize = config.quantize + self.vision_tower = load_vision_model( + prefix="vision_tower" if not prefix else f"{prefix}.vision_tower", + config=config.vision_config, + weights=weights, + ) + self.post_vision_tower_layernorm = nn.LayerNorm.load( + prefix="vision_tower.vision_model.post_layernorm", + weights=weights, + eps=config.vision_config.layer_norm_eps, + ) + + self.multi_modal_projector = TensorParallelColumnLinear.load( + config, + prefix="multi_modal_projector.linear", + weights=weights, + bias=True, + ) + + self.vocab_size = config.text_config.vocab_size + self.config = config + + text_config = config.text_config + text_config.speculator = config.speculator + text_config.quantize = config.quantize + self.text_model = load_text_model( + prefix="language_model" if not prefix else f"{prefix}.language_model", + config=config.text_config, + weights=weights, + ) + self.pad_token_id = ( + config.pad_token_id if config.pad_token_id is not None else -1 + ) + + def forward( + self, + input_ids: torch.Tensor, + position_ids: torch.Tensor, + cu_seqlen_prefill: Optional[torch.Tensor], + kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], + block_tables: torch.Tensor, + slots: torch.Tensor, + seqlen: Seqlen, + max_s: int, + prefill_cache_indices: Optional[torch.Tensor] = None, + lm_head_indices: Optional[torch.Tensor] = None, + pixel_values: torch.FloatTensor = None, + # Unused here + pixel_attention_mask: Optional[torch.BoolTensor] = None, + image_sizes: Optional[torch.Tensor] = None, + adapter_data: Optional[torch.Tensor] = None, + ) -> 
Tuple[torch.Tensor, Optional[torch.Tensor]]: + inputs_embeds = self.text_model.embed_tokens(input_ids) + # TODO This is odd but apparently pali gemma position ids start at 1. + if cu_seqlen_prefill is not None: + max_s += 1 + position_ids += 1 + + if pixel_values is not None: + pixel_values = pixel_values.to(dtype=inputs_embeds.dtype) + image_outputs = self.vision_tower(pixel_values) + last_hidden_state = self.post_vision_tower_layernorm( + image_outputs.last_hidden_state + ) + image_features = self.multi_modal_projector(last_hidden_state) + + # mask where image or padding tokens + mask = input_ids == self.config.image_token_index + + # insert image features into input embeddings + inputs_embeds[mask] = image_features.view(-1, image_features.shape[-1]) + + hidden_states = self.text_model.model( + inputs_embeds=inputs_embeds, + position_ids=position_ids, + cu_seqlen_prefill=cu_seqlen_prefill, + kv_cache=kv_cache, + block_tables=block_tables, + slots=slots, + seqlen=seqlen, + max_s=max_s, + ) + + if lm_head_indices is not None: + hidden_states = hidden_states[lm_head_indices] + logits, speculative_logits = self.text_model.lm_head(hidden_states) + + return logits, speculative_logits diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_phi_modeling.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_phi_modeling.py new file mode 100644 index 000000000..2a0dc6066 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_phi_modeling.py @@ -0,0 +1,418 @@ +import torch +import torch.distributed + +from torch import nn +from transformers.activations import ACT2FN +from transformers.configuration_utils import PretrainedConfig +from typing import Optional, List, Tuple + +from text_generation_server.layers.attention import ( + paged_attention, + attention, + reshape_and_cache, + Seqlen, +) +from text_generation_server.layers import ( + TensorParallelRowLinear, + TensorParallelColumnLinear, + TensorParallelEmbedding, + SpeculativeHead, + get_linear, +) +from text_generation_server.layers.attention import PREFILL_IN_KV_CACHE +from text_generation_server.layers.layernorm import ( + FastLayerNorm, +) +from text_generation_server.layers.rotary import ( + PositionRotaryEmbedding, +) + + +class PhiConfig(PretrainedConfig): + def __init__( + self, + vocab_size=51200, + hidden_size=2560, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=32, + hidden_act="gelu_fast", # llama uses silu + layer_norm_eps=1e-05, # rms in llama, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + tie_word_embeddings=False, + rope_theta=10000.0, + resid_pdrop=0.1, # llama doesn't have this + partial_rotary_factor=0.5, # important difference between llama and phi + **kwargs, + ): + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.layer_norm_eps = layer_norm_eps + self.rope_theta = rope_theta + self.resid_pdrop = resid_pdrop + self.partial_rotary_factor = partial_rotary_factor + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + +# this is the same as llama except for Phi uses bias=True +def load_attention(config, prefix, weights): + if config.num_attention_heads != config.num_key_value_heads: + return 
_load_gqa(config, prefix, weights) + else: + return TensorParallelColumnLinear.load_multi( + config, + prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"], + dim=0, + weights=weights, + bias=True, + ) + + +def _load_gqa(config, prefix: str, weights): + assert config.hidden_size % config.num_attention_heads == 0 + assert config.num_attention_heads % weights.process_group.size() == 0 + + weight = weights.get_multi_weights_col( + prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"], + dim=0, + ) + + if config.quantize not in ["gptq", "awq", "marlin"]: + weight = weight.to(dtype=weights.dtype).to(device=weights.device) + + head_size = config.hidden_size // config.num_attention_heads + num_heads = config.num_attention_heads // weights.process_group.size() + num_key_value_heads = config.num_key_value_heads // weights.process_group.size() + assert list(weight.shape) == [ + (num_heads + 2 * num_key_value_heads) * head_size, + config.hidden_size, + ], f"{list(weight.shape)} != {[(num_heads + 2 * config.num_key_value_heads) * head_size, config.hidden_size]}" + + # this is the same as llama except for Phi uses bias=True + return TensorParallelColumnLinear(get_linear(weight, bias=True)) + + +class FlashPhiAttention(torch.nn.Module): + def __init__( + self, + prefix: str, + config, + weights, + ): + super().__init__() + self.num_heads = config.num_attention_heads + self.hidden_size = config.hidden_size + self.head_size = self.hidden_size // self.num_heads + + self.softmax_scale = self.head_size**-0.5 + self.rotary_dim = int(config.partial_rotary_factor * self.head_size) + + self.rotary_emb = PositionRotaryEmbedding.static( + config=config, + dim=self.rotary_dim, + base=config.rope_theta, + device=weights.device, + ) + + if self.num_heads % weights.process_group.size() != 0: + raise ValueError( + f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} " + f"and `num_shards`: {weights.process_group.size()}" + ) + + self.num_heads = self.num_heads // weights.process_group.size() + self.num_key_value_heads = ( + config.num_key_value_heads // weights.process_group.size() + ) + + self.query_key_value = load_attention(config, prefix, weights) + + # in llama the dense layer is called "o_proj" and has bias=False + self.dense = TensorParallelRowLinear.load( + config, + prefix=f"{prefix}.dense", + weights=weights, + bias=True, + ) + self.num_groups = self.num_heads // self.num_key_value_heads + self.kv_head_mapping = torch.arange( + 0, self.num_key_value_heads, dtype=torch.int32, device=weights.device + ).repeat_interleave(self.num_groups) + + def forward( + self, + hidden_states, + cos, + sin, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + seqlen, + max_s, + ): + # Compute query, key, value and split + qkv = self.query_key_value(hidden_states) + query, kv = qkv.split( + [ + self.head_size * self.num_heads, + 2 * self.head_size * self.num_key_value_heads, + ], + dim=1, + ) + + # Reshape query and key for rotary embeddings + query = query.view(-1, self.num_heads, self.head_size) + kv = kv.view(-1, 2, self.num_key_value_heads, self.head_size) + + # NOTE: this is the main difference between Llama and Phi + # in llama the rotary embeddings are applied to the whole query and key. 
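+        # (i.e. the rotary embedding there spans the full head_size of every head).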
+ # Phi uses PARTIAL rotary embeddings, which are applied to the first 32 dimensions + # + # Apply partial positional embeddings in place + self.rotary_emb( + query[:, :, : self.rotary_dim], kv[:, 0, :, : self.rotary_dim], cos, sin + ) + + # Reshape key and value and cache + reshape_and_cache(kv[:, 0], kv[:, 1], kv_cache[0], kv_cache[1], slots) + + # Prefill + if cu_seqlen_prefill is not None: + attn_output = attention( + query, + kv_cache[0] if PREFILL_IN_KV_CACHE else kv[:, 0], + kv_cache[1] if PREFILL_IN_KV_CACHE else kv[:, 1], + seqlen, + block_tables, + self.softmax_scale, + ) + # Decode + else: + attn_output = paged_attention( + query, + kv_cache[0], + kv_cache[1], + self.kv_head_mapping, + self.softmax_scale, + block_tables, + seqlen, + max_s, + ) + + return self.dense(attn_output.view(-1, self.num_heads * self.head_size)) + + +class PhiMLP(nn.Module): + def __init__(self, prefix, config, weights): + super().__init__() + act = config.hidden_act + self.act = ( + ACT2FN[act] + if "gelu" not in act + else lambda x: torch.nn.functional.gelu( + x, + approximate=( + "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none" + ), + ) + ) + + # llama weights are up_proj and down_proj and bias=False + self.up_proj = TensorParallelColumnLinear.load( + config, + prefix=f"{prefix}.fc1", + weights=weights, + bias=True, + ) + self.down_proj = TensorParallelRowLinear.load( + config, + prefix=f"{prefix}.fc2", + weights=weights, + bias=True, + ) + + def forward(self, hidden_states): + # NOTE: Llama requires the gate up states to an intermediate size + # Phi does not and we can avoid the `view` operation + return self.down_proj(self.act(self.up_proj(hidden_states))) + + +class FlashPhiLayer(nn.Module): + def __init__(self, prefix: str, layer_id, config, weights): + super().__init__() + prefix = f"{prefix}.layers.{layer_id}" + self.self_attn = FlashPhiAttention( + prefix=f"{prefix}.self_attn", config=config, weights=weights + ) + self.mlp = PhiMLP(prefix=f"{prefix}.mlp", config=config, weights=weights) + self.input_layernorm = FastLayerNorm.load( + prefix=f"{prefix}.input_layernorm", + weights=weights, + eps=config.layer_norm_eps, + ) + self.resid_dropout = torch.nn.Dropout(config.resid_pdrop) + + def forward( + self, + hidden_states, + residual, + cos, + sin, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + seqlen, + max_s, + ): + hidden_states, res = self.input_layernorm(hidden_states, residual) + # Self Attention + attn_output = self.self_attn( + hidden_states, + cos, + sin, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + seqlen, + max_s, + ) + + hidden_states = self.resid_dropout(attn_output).add( + self.resid_dropout(self.mlp(hidden_states)) + ) + + return hidden_states, res + + +class FlashPhiModel(torch.nn.Module): + def __init__(self, prefix: str, config, weights): + super().__init__() + + process_group = weights.process_group + self.tp_rank = process_group.rank() + self.tp_world_size = process_group.size() + self.embed_tokens = TensorParallelEmbedding( + prefix=f"{prefix}.embed_tokens", weights=weights + ) + self.layers = nn.ModuleList( + [ + FlashPhiLayer( + prefix, + layer_id, + config, + weights, + ) + for layer_id in range(config.num_hidden_layers) + ] + ) + self.gradient_checkpointing = False + + self.head_size = self.layers[0].self_attn.head_size + self.num_heads = self.layers[0].self_attn.num_heads + self.num_key_value_heads = self.layers[0].self_attn.num_key_value_heads + + self.norm = FastLayerNorm.load( + prefix="model.final_layernorm", + weights=weights, 
+ eps=config.layer_norm_eps, + ) + + def forward( + self, + input_ids: torch.Tensor, + position_ids: torch.Tensor, + cu_seqlen_prefill: Optional[torch.Tensor], + kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], + block_tables: torch.Tensor, + slots: torch.Tensor, + seqlen: Seqlen, + max_s: int, + ) -> torch.Tensor: + hidden_states = self.embed_tokens(input_ids) + + # Get rotary cos and sin for this forward + # Avoid to index in each layer + cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin( + position_ids, max_s, hidden_states.dtype + ) + + residual = None + for i, layer in enumerate(self.layers): + hidden_states, residual = layer( + hidden_states, + residual, + cos, + sin, + cu_seqlen_prefill, + kv_cache[i], + block_tables, + slots, + seqlen, + max_s, + ) + + hidden_states, _ = self.norm(hidden_states, residual) + + return hidden_states + + +class FlashPhiForCausalLM(torch.nn.Module): + def __init__(self, prefix: str, config, weights): + super().__init__() + + if not prefix: + prefix = "model" + else: + prefix = f"{prefix}.model" + + self.model = FlashPhiModel(prefix, config, weights) + self.lm_head = SpeculativeHead.load( + config, + prefix="lm_head", + weights=weights, + ) + + def forward( + self, + input_ids: torch.Tensor, + position_ids: torch.Tensor, + cu_seqlen_prefill: Optional[torch.Tensor], + kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], + block_tables: torch.Tensor, + slots: torch.Tensor, + seqlen: Seqlen, + max_s: int, + prefill_cache_indices: Optional[torch.Tensor], + lm_head_indices: Optional[torch.Tensor] = None, + adapter_data: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + hidden_states = self.model( + input_ids, + position_ids, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + seqlen, + max_s, + ) + if lm_head_indices is not None: + hidden_states = hidden_states[lm_head_indices] + + return self.lm_head(hidden_states) diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_phi_moe_modeling.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_phi_moe_modeling.py new file mode 100644 index 000000000..bb585cc4b --- /dev/null +++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_phi_moe_modeling.py @@ -0,0 +1,254 @@ +# coding=utf-8 +# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""PyTorch Phi-MoE model.""" + +from transformers.configuration_utils import PretrainedConfig +from transformers.utils import logging + + +logger = logging.get_logger(__name__) + + +PHIMOE_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "microsoft/Phi-3.5-MoE-instruct": "https://huggingface.co/microsoft/Phi-3.5-MoE-instruct/resolve/main/config.json", +} + + +class PhiMoEConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`PhiMoEModel`]. It is used to instantiate a Phi-MoE + model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the + defaults will yield a similar configuration to that of the + [microsoft/Phi-3.5-MoE-instruct](https://huggingface.co/microsoft/Phi-3.5-MoE-instruct). + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 32064): + Vocabulary size of the PhiMoE model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`PhiMoEModel`] + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 6400): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer encoder. + num_key_value_heads (`int`, *optional*, defaults to 8): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details checkout [this + paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to `4096*32`): + The maximum sequence length that this model might ever be used with. Mixtral's sliding window attention + allows sequence of up to 4096*32 tokens. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + pad_token_id (`int`, *optional*): + The id of the padding token. + bos_token_id (`int`, *optional*, defaults to 1): + The id of the "beginning-of-sequence" token. + eos_token_id (`int`, *optional*, defaults to 2): + The id of the "end-of-sequence" token. + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether the model's input and output word embeddings should be tied. + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + rope_scaling (`dict`, *optional*): + The scaling strategy for the RoPE embeddings. If `None`, no scaling is applied. If a dictionary, it must + contain the following keys: `type`, `short_factor`, `long_factor`, `short_mscale`, `long_mscale` and + `original_max_position_embeddings`. 
The `type` must be `longrope`, the `short_mscale` and `long_mscale` must
+            be numbers, the `short_factor` and `long_factor` must be lists of numbers with the same length as half of
+            the attention head size and the `original_max_position_embeddings` must be an integer.
+        sliding_window (`int`, *optional*):
+            Sliding window attention window size. If not specified, will default to `262144`.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        num_experts_per_tok (`int`, *optional*, defaults to 2):
+            The number of experts to route per token; can also be interpreted as the `top-k` routing
+            parameter.
+        num_local_experts (`int`, *optional*, defaults to 16):
+            Number of experts per Sparse MLP layer.
+        output_router_logits (`bool`, *optional*, defaults to `False`):
+            Whether or not the router logits should be returned by the model. Enabling this will also
+            allow the model to output the auxiliary loss. See [here]() for more details.
+        router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
+            The auxiliary loss factor for the total loss.
+        router_jitter_noise (`float`, *optional*, defaults to 0.01):
+            Amount of noise to add to the router.
+
+    ```python
+    >>> from transformers import PhiMoEModel, PhiMoEConfig
+
+    >>> # Initializing a Phi-3 style configuration
+    >>> configuration = PhiMoEConfig.from_pretrained("microsoft/Phi-3.5-MoE-instruct")
+
+    >>> # Initializing a model from the configuration
+    >>> model = PhiMoEModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "phimoe"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=32064,
+        hidden_size=4096,
+        intermediate_size=6400,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=8,
+        hidden_act="silu",
+        max_position_embeddings=4096 * 32,
+        initializer_range=0.02,
+        rms_norm_eps=1e-5,
+        use_cache=True,
+        pad_token_id=None,
+        bos_token_id=1,
+        eos_token_id=2,
+        tie_word_embeddings=False,
+        rope_theta=1e6,
+        rope_scaling=None,
+        sliding_window=None,
+        attention_dropout=0.0,
+        num_experts_per_tok=2,
+        num_local_experts=16,
+        output_router_logits=False,
+        router_aux_loss_coef=0.001,
+        router_jitter_noise=0.01,
+        input_jitter_noise=0.0,
+        attention_bias=False,
+        lm_head_bias=False,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.sliding_window = sliding_window
+        self.attention_bias = attention_bias
+        self.lm_head_bias = lm_head_bias
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.attention_dropout = attention_dropout
+
+        self.num_experts_per_tok = num_experts_per_tok
+        self.num_local_experts = num_local_experts
+        self.output_router_logits = output_router_logits
+        self.router_aux_loss_coef = router_aux_loss_coef
+        self.router_jitter_noise = router_jitter_noise
+        self.input_jitter_noise = input_jitter_noise
+
+        self.rope_scaling = rope_scaling
+        self._rope_scaling_validation()
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+    def _rope_scaling_validation(self):
+        """
+        Validate the `rope_scaling` configuration.
+        """
+        if self.rope_scaling is None:
+            return
+
+        if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 6:
+            raise ValueError(
+                "`rope_scaling` must be a dictionary with six fields, `type`, `short_factor`, `long_factor`, "
+                f"`short_mscale`, `long_mscale` and `original_max_position_embeddings`, got {self.rope_scaling}"
+            )
+        rope_scaling_type = self.rope_scaling.get("type", None)
+        rope_scaling_short_factor = self.rope_scaling.get("short_factor", None)
+        rope_scaling_long_factor = self.rope_scaling.get("long_factor", None)
+        rope_scaling_short_mscale = self.rope_scaling.get("short_mscale", None)
+        rope_scaling_long_mscale = self.rope_scaling.get("long_mscale", None)
+        original_max_position_embeddings = self.rope_scaling.get(
+            "original_max_position_embeddings", None
+        )
+        if rope_scaling_type is None or rope_scaling_type not in ["longrope"]:
+            raise ValueError(
+                f"`rope_scaling`'s type field must be one of ['longrope'], got {rope_scaling_type}"
+            )
+        if not (
+            isinstance(rope_scaling_short_factor, list)
+            and all(isinstance(x, (int, float)) for x in rope_scaling_short_factor)
+        ):
+            raise ValueError(
+                f"`rope_scaling`'s short_factor field must be a list of numbers, got {rope_scaling_short_factor}"
+            )
+        if (
+            not len(rope_scaling_short_factor)
+            == self.hidden_size // self.num_attention_heads // 2
+        ):
+            raise ValueError(
+                f"`rope_scaling`'s short_factor field must have length {self.hidden_size // self.num_attention_heads // 2}, got {len(rope_scaling_short_factor)}"
+            )
+        if not (
+            isinstance(rope_scaling_long_factor, list)
+            and all(isinstance(x, (int, float)) for x in rope_scaling_long_factor)
+        ):
+            raise ValueError(
+                f"`rope_scaling`'s long_factor field must be a list of numbers, got {rope_scaling_long_factor}"
+            )
+        if (
+            not len(rope_scaling_long_factor)
+            == self.hidden_size // self.num_attention_heads // 2
+        ):
+            raise ValueError(
+                f"`rope_scaling`'s long_factor field must have length {self.hidden_size // self.num_attention_heads // 2}, got {len(rope_scaling_long_factor)}"
+            )
+        if not isinstance(rope_scaling_short_mscale, (int, float)):
+            raise ValueError(
+                f"`rope_scaling`'s short_mscale field must be a number, got {rope_scaling_short_mscale}"
+            )
+        if not isinstance(rope_scaling_long_mscale, (int, float)):
+            raise ValueError(
+                f"`rope_scaling`'s long_mscale field must be a number, got {rope_scaling_long_mscale}"
+            )
+        if not isinstance(original_max_position_embeddings, int):
+            raise ValueError(
+                f"`rope_scaling`'s original_max_position_embeddings field must be an integer, got {original_max_position_embeddings}"
+            )
diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py
new file mode 100644
index 000000000..02c788d3e
--- /dev/null
+++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py
@@ -0,0 +1,394 @@
+import torch
+import torch.distributed
+
+from torch import nn
+from transformers.activations import ACT2FN
+from typing import Optional, List, Tuple
+
+from text_generation_server.layers.attention import (
+    paged_attention,
+    attention,
+    reshape_and_cache,
+    Seqlen,
+)
+from text_generation_server.layers import (
+    TensorParallelRowLinear,
+    
TensorParallelColumnLinear, + TensorParallelEmbedding, + SpeculativeHead, +) +from text_generation_server.layers.attention import PREFILL_IN_KV_CACHE +from text_generation_server.layers.rotary import PositionRotaryEmbedding +from text_generation_server.layers.layernorm import ( + FastRMSNorm, +) + + +def load_attention(config, prefix, weights): + if config.num_attention_heads != config.num_key_value_heads: + return _load_gqa(config, prefix, weights) + else: + return TensorParallelColumnLinear.load_multi( + config, + prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"], + dim=0, + weights=weights, + bias=True, + ) + + +def _load_gqa(config, prefix: str, weights): + assert config.hidden_size % config.num_attention_heads == 0 + assert config.num_attention_heads % weights.process_group.size() == 0 + + return TensorParallelColumnLinear.load_multi( + config, + prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"], + dim=0, + weights=weights, + bias=True, + ) + + +class Qwen2Attention(torch.nn.Module): + def __init__( + self, + prefix: str, + config, + weights, + ): + super().__init__() + self.max_past = ( + config.sliding_window if config.sliding_window is not None else -1 + ) + self.num_heads = config.num_attention_heads + self.hidden_size = config.hidden_size + self.head_size = self.hidden_size // self.num_heads + + self.rotary_emb = PositionRotaryEmbedding.static( + config=config, + dim=self.head_size, + base=config.rope_theta, + device=weights.device, + ) + + self.softmax_scale = self.head_size**-0.5 + + if self.num_heads % weights.process_group.size() != 0: + raise ValueError( + f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} " + f"and `num_shards`: {weights.process_group.size()}" + ) + self.num_heads = self.num_heads // weights.process_group.size() + self.num_key_value_heads = ( + config.num_key_value_heads // weights.process_group.size() + ) + + self.query_key_value = load_attention(config, prefix, weights) + + self.o_proj = TensorParallelRowLinear.load( + config, + prefix=f"{prefix}.o_proj", + weights=weights, + bias=False, + ) + self.num_groups = self.num_heads // self.num_key_value_heads + self.kv_head_mapping = torch.arange( + 0, self.num_key_value_heads, dtype=torch.int32, device=weights.device + ).repeat_interleave(self.num_groups) + + def forward( + self, + hidden_states, + cos, + sin, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + seqlen, + max_s, + prefill_cache_indices, + ): + qkv = self.query_key_value(hidden_states) + query, kv = qkv.split( + [ + self.head_size * self.num_heads, + 2 * self.head_size * self.num_key_value_heads, + ], + dim=1, + ) + query = query.view(-1, self.num_heads, self.head_size) + kv = kv.view(-1, 2, self.num_key_value_heads, self.head_size) + + self.rotary_emb(query, torch.select(kv, dim=1, index=0), cos, sin) + + if prefill_cache_indices is not None: + kv_to_cache = kv[prefill_cache_indices] + else: + kv_to_cache = kv + + reshape_and_cache( + kv_to_cache[:, 0], kv_to_cache[:, 1], kv_cache[0], kv_cache[1], slots + ) + + # Prefill + if cu_seqlen_prefill is not None: + # flash attention + attn_output = attention( + query, + kv_cache[0] if PREFILL_IN_KV_CACHE else kv_to_cache[:, 0], + kv_cache[1] if PREFILL_IN_KV_CACHE else kv_to_cache[:, 1], + seqlen, + block_tables, + self.softmax_scale, + window_size_left=self.max_past, + ) + # Decode + else: + attn_output = paged_attention( + query, + kv_cache[0], + kv_cache[1], + self.kv_head_mapping, + self.softmax_scale, + block_tables, 
+ seqlen, + max_s, + ) + + return self.o_proj(attn_output.view(-1, self.num_heads * self.head_size)) + + +class Qwen2MLP(nn.Module): + def __init__(self, prefix, config, weights): + super().__init__() + act = config.hidden_act + self.act = ( + ACT2FN[act] + if "gelu" not in act + else lambda x: torch.nn.functional.gelu( + x, + approximate=( + "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none" + ), + ) + ) + # Fuse gate and up proj + self.gate_up_proj = TensorParallelColumnLinear.load_multi( + config, + prefixes=[f"{prefix}.gate_proj", f"{prefix}.up_proj"], + weights=weights, + dim=0, + bias=False, + ) + self.down_proj = TensorParallelRowLinear.load( + config, + prefix=f"{prefix}.down_proj", + weights=weights, + bias=False, + ) + self.intermediate_size = ( + config.intermediate_size // weights.process_group.size() + ) + + def forward(self, hidden_states): + gate_up_states = self.gate_up_proj(hidden_states) + gate_up_states = gate_up_states.view(-1, 2, self.intermediate_size) + return self.down_proj(self.act(gate_up_states[:, 0]) * gate_up_states[:, 1]) + + +class Qwen2Layer(nn.Module): + def __init__(self, prefix, layer_id, config, weights): + super().__init__() + prefix = f"{prefix}.layers.{layer_id}" + self.self_attn = Qwen2Attention( + prefix=f"{prefix}.self_attn", config=config, weights=weights + ) + self.mlp = Qwen2MLP(prefix=f"{prefix}.mlp", config=config, weights=weights) + self.input_layernorm = FastRMSNorm.load( + prefix=f"{prefix}.input_layernorm", weights=weights, eps=config.rms_norm_eps + ) + self.post_attention_layernorm = FastRMSNorm.load( + prefix=f"{prefix}.post_attention_layernorm", + weights=weights, + eps=config.rms_norm_eps, + ) + + def forward( + self, + hidden_states, + residual, + cos, + sin, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + seqlen, + max_s, + prefill_cache_indices, + ): + normed_hidden_states, res = self.input_layernorm(hidden_states, residual) + + # Self Attention + attn_output = self.self_attn( + normed_hidden_states, + cos, + sin, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + seqlen, + max_s, + prefill_cache_indices, + ) + + # faster post attention rms norm + normed_attn_res_output, attn_res = self.post_attention_layernorm( + attn_output, res + ) + + mlp_output = self.mlp(normed_attn_res_output) + + return mlp_output, attn_res + + +class Qwen2Model(torch.nn.Module): + def __init__(self, prefix: str, config, weights): + super().__init__() + + prefix = f"{prefix}.model" if prefix else "model" + + process_group = weights.process_group + self.tp_rank = process_group.rank() + self.tp_world_size = process_group.size() + self.embed_tokens = TensorParallelEmbedding( + prefix=f"{prefix}.embed_tokens", weights=weights + ) + self.layers = nn.ModuleList( + [ + Qwen2Layer( + prefix, + layer_id, + config, + weights, + ) + for layer_id in range(config.num_hidden_layers) + ] + ) + self.norm = FastRMSNorm.load( + prefix=f"{prefix}.norm", weights=weights, eps=config.rms_norm_eps + ) + + self.gradient_checkpointing = False + + self.head_size = self.layers[0].self_attn.head_size + self.num_heads = self.layers[0].self_attn.num_heads + self.num_key_value_heads = self.layers[0].self_attn.num_key_value_heads + + def forward( + self, + input_ids: torch.Tensor, + position_ids: torch.Tensor, + cu_seqlen_prefill: Optional[torch.Tensor], + kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], + block_tables: torch.Tensor, + slots: torch.Tensor, + seqlen: Seqlen, + max_s: int, + true_max_s: int, + prefill_cache_indices: Optional[torch.Tensor], 
+ ) -> torch.Tensor: + hidden_states = self.embed_tokens(input_ids) + + # Get rotary cos and sin for this forward + # Avoid to index in each layer + cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin( + position_ids, true_max_s, hidden_states.dtype + ) + + residual = None + for i, layer in enumerate(self.layers): + hidden_states, residual = layer( + hidden_states, + residual, + cos, + sin, + cu_seqlen_prefill, + kv_cache[i], + block_tables, + slots, + seqlen, + max_s, + prefill_cache_indices, + ) + + hidden_states, _ = self.norm(hidden_states, residual) + + return hidden_states + + +class Qwen2ForCausalLM(torch.nn.Module): + def __init__(self, prefix: str, config, weights): + super().__init__() + + self.model = Qwen2Model(prefix, config, weights) + + if config.tie_word_embeddings: + suffix = "model.embed_tokens" + else: + suffix = "lm_head" + + self.lm_head = SpeculativeHead.load( + config, + prefix=f"{prefix}.{suffix}" if prefix else suffix, + weights=weights, + ) + self.max_past = config.sliding_window + self.max_past_tensor = ( + torch.tensor(config.sliding_window, device=weights.device) + if self.max_past is not None + else None + ) + + def forward( + self, + input_ids: torch.Tensor, + position_ids: torch.Tensor, + cu_seqlen_prefill: Optional[torch.Tensor], + kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], + block_tables: torch.Tensor, + slots: torch.Tensor, + seqlen: Seqlen, + max_s: int, + prefill_cache_indices: Optional[torch.Tensor] = None, + lm_head_indices: Optional[torch.Tensor] = None, + adapter_data: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + true_max_s = max_s + if prefill_cache_indices is not None: + # Slots also need to be sliced as it has the same size as the whole kv tensor + slots = slots[prefill_cache_indices] + elif self.max_past is not None: + # Clamp in decode mode as paged attention requires clamped values whereas the flash attention + # kernel requires the true values + seqlen = seqlen.clamp(max=self.max_past_tensor) + + hidden_states = self.model( + input_ids, + position_ids, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + seqlen, + max_s, + true_max_s, + prefill_cache_indices, + ) + if lm_head_indices is not None: + hidden_states = hidden_states[lm_head_indices] + logits = self.lm_head(hidden_states) + return logits diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_rw_modeling.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_rw_modeling.py new file mode 100644 index 000000000..6671d85e2 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_rw_modeling.py @@ -0,0 +1,699 @@ +from typing import List, Optional, Tuple + +import torch +import torch.distributed +from torch import nn +from transformers.configuration_utils import PretrainedConfig +from transformers.modeling_utils import PreTrainedModel +from text_generation_server.layers import ( + SpeculativeHead, + TensorParallelColumnLinear, + TensorParallelEmbedding, + TensorParallelRowLinear, + get_linear, +) +from text_generation_server.layers.attention import PREFILL_IN_KV_CACHE +from text_generation_server.layers.layernorm import FastLayerNorm +from text_generation_server.layers.rotary import PositionRotaryEmbedding +from text_generation_server.layers.attention import ( + attention, + paged_attention, + reshape_and_cache, + Seqlen, +) + + +def load_row(config, prefix: str, weights, bias: bool): + weight = weights.get_weights_row(prefix) + + if bias and 
weights.process_group.rank() == 0: + # Rank is only on the first rank process + bias = weights.get_tensor(f"{prefix}.bias") + else: + bias = None + + linear = get_linear(weight, bias) + if config.parallel_attn: + return linear + else: + return TensorParallelRowLinear(linear, process_group=weights.process_group) + + +class RWConfig(PretrainedConfig): + attribute_map = { + "num_hidden_layers": "n_layer", + "num_attention_heads": "n_head", + "num_key_value_heads": "n_head_kv", + } + + def __init__( + self, + model_type="RefinedWeb", + vocab_size=250880, + hidden_size=64, + num_hidden_layers=None, + num_attention_heads=None, + num_ln_in_prallel_attention=None, + layer_norm_epsilon=1e-5, + initializer_range=0.02, + use_cache=True, + bos_token_id=1, + eos_token_id=2, + hidden_dropout=0.0, + attention_dropout=0.0, + num_kv_heads=None, + multi_query=False, + alibi=False, + new_decoder_architecture=None, + bias=False, + parallel_attn=False, + rope_theta=10_000.0, + **kwargs, + ): + if alibi: + raise NotImplementedError( + "alibi is not supported by this version of the model" + ) + + self.model_type = model_type + self.alibi = False + self.rotary = True + self.rope_theta = rope_theta + + self.vocab_size = vocab_size + # Backward compatibility with n_embed kwarg + n_embed = kwargs.pop("n_embed", None) + self.hidden_size = hidden_size if n_embed is None else n_embed + self.n_layer = ( + num_hidden_layers + if num_hidden_layers is not None + else kwargs.pop("n_layer", 2) + ) + self.n_head = ( + num_attention_heads + if num_attention_heads is not None + else kwargs.pop("n_head", 8) + ) + self.layer_norm_epsilon = layer_norm_epsilon + self.num_ln_in_parallel_attn = num_ln_in_prallel_attention + self.initializer_range = initializer_range + self.use_cache = use_cache + self.hidden_dropout = hidden_dropout + self.attention_dropout = attention_dropout + self.bias = bias + self.parallel_attn = parallel_attn + + self.bos_token_id = bos_token_id + self.eos_token_id = eos_token_id + + if num_kv_heads is not None: + self.n_head_kv = num_kv_heads + else: + old_n_head_kv = kwargs.pop("n_head_kv", None) + if old_n_head_kv is not None: + self.n_head_kv = old_n_head_kv + else: + self.n_head_kv = 1 if multi_query else self.n_head + + if new_decoder_architecture is not None: + self.new_decoder_architecture = new_decoder_architecture + elif model_type == "RefinedWeb": + self.new_decoder_architecture = True + else: + self.new_decoder_architecture = False + + super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) + + +class FlashRWAttention(torch.nn.Module): + def __init__( + self, + config, + prefix: str, + weights, + ): + super().__init__() + self.num_heads = config.n_head + self.num_heads_kv = config.n_head_kv + self.hidden_size = config.hidden_size + self.head_size = self.hidden_size // self.num_heads + self.rope_theta = config.rope_theta + + self.rotary_emb = PositionRotaryEmbedding.static( + config=config, + dim=self.head_size, + base=self.rope_theta, + device=weights.device, + ) + self.softmax_scale = self.head_size ** (-0.5) + + if self.num_heads % weights.process_group.size() != 0: + raise ValueError( + f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} " + f"and `num_shards`: {weights.process_group.size()}" + ) + self.num_heads = self.num_heads // weights.process_group.size() + + self.query_key_value = TensorParallelColumnLinear.load( + config, + prefix=f"{prefix}.query_key_value", + weights=weights, + bias=config.bias, + ) + self.dense = load_row( + 
config, prefix=f"{prefix}.dense", weights=weights, bias=config.bias + ) + + if self.num_heads_kv == 1: + self.kv_head_mapping = torch.zeros( + self.num_heads, dtype=torch.int32, device=weights.device + ) + else: + self.kv_head_mapping = torch.arange( + 0, self.num_heads, dtype=torch.int32, device=weights.device + ) + + def forward( + self, + hidden_states, + cos, + sin, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + seqlen, + max_s, + ): + qkv = self.query_key_value(hidden_states) + + # Split query from key_value + query, kv = qkv.split( + [self.head_size * self.num_heads, 2 * self.head_size * self.num_heads_kv], + dim=1, + ) + + # Prepare query and key_value for indexing + query = query.view(-1, self.num_heads, self.head_size) + kv = kv.view(-1, 2, self.num_heads_kv, self.head_size) + + # Inplace rotary + self.rotary_emb(query, torch.select(kv, dim=1, index=0), cos, sin) + + reshape_and_cache(kv[:, 0], kv[:, 1], kv_cache[0], kv_cache[1], slots) + + # Prefill + if cu_seqlen_prefill is not None: + # flash attention + attn_output = attention( + query, + kv_cache[0] if PREFILL_IN_KV_CACHE else kv[:, 0], + kv_cache[1] if PREFILL_IN_KV_CACHE else kv[:, 1], + seqlen, + block_tables, + self.softmax_scale, + ) + # Decode + else: + attn_output = paged_attention( + query, + kv_cache[0], + kv_cache[1], + self.kv_head_mapping, + self.softmax_scale, + block_tables, + seqlen, + max_s, + ) + + return self.dense(attn_output.view(-1, self.num_heads * self.head_size)) + + +class FlashRWLargeAttention(torch.nn.Module): + def __init__( + self, + config, + prefix: str, + weights, + ): + super().__init__() + + hidden_size = config.hidden_size + num_heads = config.n_head + # num_heads_kv = config.n_head_kv + num_groups = config.n_head_kv + + self.hidden_size = hidden_size + self.head_size = hidden_size // num_heads + self.num_groups = num_groups + self.rope_theta = config.rope_theta + + self.rotary_emb = PositionRotaryEmbedding.static( + config=config, + dim=self.head_size, + base=self.rope_theta, + device=weights.device, + ) + self.softmax_scale = self.head_size ** (-0.5) + + # self.num_groups = num_heads // (num_heads_kv * 2) + self.num_heads = num_heads // self.num_groups + # self.num_heads_kv = num_heads_kv // self.num_groups + process_group = weights.process_group + + if process_group.size() > self.num_groups: + raise NotImplementedError( + "Tensor Parallelism is not implemented for world_size > n groups" + ) + if self.num_groups % process_group.size() != 0: + raise NotImplementedError( + f"Tensor Parallelism is not implemented for {self.num_groups} not divisible by {process_group.size()}" + ) + + self.num_groups = self.num_groups // process_group.size() + + self.query_key_value = TensorParallelColumnLinear.load( + config, + prefix=f"{prefix}.query_key_value", + weights=weights, + bias=config.bias, + ) + self.dense = load_row( + config, prefix=f"{prefix}.dense", weights=weights, bias=config.bias + ) + + self.kv_head_mapping = torch.arange( + 0, self.num_groups, dtype=torch.int32, device=weights.device + ).repeat_interleave(self.num_heads) + + def forward( + self, + hidden_states, + cos, + sin, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + seqlen, + max_s, + ): + qkv = self.query_key_value(hidden_states) + qkv = qkv.view(-1, self.num_groups, self.num_heads + 2, self.head_size) + + # Split on group dimension + query, kv = qkv.split( + [self.num_heads, 2], + dim=2, + ) + # Merge groups and heads + query = query.reshape(-1, self.num_groups * self.num_heads, self.head_size) + + # 
Inplace rotary + self.rotary_emb(query, torch.select(kv, dim=2, index=0), cos, sin) + + reshape_and_cache( + kv[:, :, 0].contiguous(), + kv[:, :, 1].contiguous(), + kv_cache[0], + kv_cache[1], + slots, + ) + + # Prefill + if cu_seqlen_prefill is not None: + # flash attention + attn_output = attention( + query, + kv_cache[0] if PREFILL_IN_KV_CACHE else kv[:, :, 0].contiguous(), + kv_cache[1] if PREFILL_IN_KV_CACHE else kv[:, :, 1].contiguous(), + seqlen, + block_tables, + self.softmax_scale, + ) + # Decode + else: + attn_output = paged_attention( + query, + kv_cache[0], + kv_cache[1], + self.kv_head_mapping, + self.softmax_scale, + block_tables, + seqlen, + max_s, + ) + + return self.dense( + attn_output.view(-1, self.num_groups * self.num_heads * self.head_size) + ) + + +class FlashMLP(nn.Module): + def __init__(self, config, prefix: str, weights): + super().__init__() + self.act = torch.nn.functional.gelu + + self.dense_h_to_4h = TensorParallelColumnLinear.load( + config, prefix=f"{prefix}.dense_h_to_4h", weights=weights, bias=config.bias + ) + self.dense_4h_to_h = load_row( + config, prefix=f"{prefix}.dense_4h_to_h", weights=weights, bias=config.bias + ) + + def forward(self, hidden_states): + hidden_states = self.dense_h_to_4h(hidden_states) + hidden_states = self.act(hidden_states) + hidden_states = self.dense_4h_to_h(hidden_states) + return hidden_states + + +class FlashRWLayer(nn.Module): + def __init__( + self, + layer_id, + prefix: str, + config, + weights, + ): + super().__init__() + + parallel_attn = config.parallel_attn + self.parallel_attn = parallel_attn + + prefix = f"{prefix}.h.{layer_id}" + + # NOTE: Falcon 180B uses the ln_attn prefix + ln_prefix = "input_layernorm" + if config.num_hidden_layers == 80: + ln_prefix = "ln_attn" + + self.input_layernorm = FastLayerNorm.load( + prefix=f"{prefix}.{ln_prefix}", + weights=weights, + eps=config.layer_norm_epsilon, + ) + self.self_attention = FlashRWAttention( + config, + prefix=f"{prefix}.self_attention", + weights=weights, + ) + self.post_attention_layernorm = ( + FastLayerNorm.load( + prefix=f"{prefix}.post_attention_layernorm", + weights=weights, + eps=config.layer_norm_epsilon, + ) + if not parallel_attn + else None + ) + + self.mlp = FlashMLP( + config, + prefix=f"{prefix}.mlp", + weights=weights, + ) + + self.process_group = weights.process_group + + def forward( + self, + hidden_states, + residual, + cos, + sin, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + seqlen, + max_s, + ): + if self.parallel_attn: + ln_hidden_states, residual = self.input_layernorm(hidden_states, residual) + + attn_output = self.self_attention( + ln_hidden_states, + cos, + sin, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + seqlen, + max_s, + ) + + mlp_output = self.mlp(ln_hidden_states) + intermediate = mlp_output + attn_output + + if self.process_group.size() > 1: + torch.distributed.all_reduce(intermediate, group=self.process_group) + + return intermediate, residual + else: + hidden_states, residual = self.input_layernorm(hidden_states, residual) + + hidden_states = self.self_attention( + hidden_states, + cos, + sin, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + seqlen, + max_s, + ) + + if self.post_attention_layernorm is not None: + hidden_states, residual = self.post_attention_layernorm( + hidden_states, residual + ) + + mlp_output = self.mlp(hidden_states) + + return mlp_output, residual + + +class FlashRWLayerNorm(nn.Module): + def __init__(self, config, prefix: str, weights): + super().__init__() + # 
Falcon2 includes the number of layer norms in the config + # in the case no number of layer norms is provided, we default to 1 + self.num_ln = getattr(config, "num_ln_in_parallel_attn", 1) + + # Falcon 180B uses the ln_attn prefix and has 2 layer norms + if config.num_hidden_layers == 80: + self.num_ln = 2 + + if self.num_ln == 1: + self.input_ln = FastLayerNorm.load( + prefix=f"{prefix}.input_layernorm", + weights=weights, + eps=config.layer_norm_epsilon, + ) + elif self.num_ln == 2: + self.ln_attn = FastLayerNorm.load( + prefix=f"{prefix}.ln_attn", + weights=weights, + eps=config.layer_norm_epsilon, + ) + self.ln_mlp = FastLayerNorm.load( + prefix=f"{prefix}.ln_mlp", + weights=weights, + eps=config.layer_norm_epsilon, + ) + else: + raise ValueError("Number of layer norms can either be 1 or 2.") + + def forward( + self, + hidden_states, + residual, + ): + if self.num_ln == 1: + ln_hidden_states, residual = self.input_ln(hidden_states, residual) + return ln_hidden_states, ln_hidden_states, residual + elif self.num_ln == 2: + ln_attn, residual = self.ln_attn(hidden_states, residual) + ln_mlp, _ = self.ln_mlp(residual) + return ln_attn, ln_mlp, residual + + +class FlashRWLargeLayer(nn.Module): + def __init__(self, layer_id, prefix: str, config, weights): + super().__init__() + prefix = f"{prefix}.h.{layer_id}" + + self.ln_layer = FlashRWLayerNorm(config, prefix, weights) + + self.self_attention = FlashRWLargeAttention( + config, + prefix=f"{prefix}.self_attention", + weights=weights, + ) + assert config.parallel_attn, "This version doesn't support non parallel_attn" + + self.mlp = FlashMLP(config, prefix=f"{prefix}.mlp", weights=weights) + + self.process_group = weights.process_group + + def forward( + self, + hidden_states, + residual, + cos, + sin, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + seqlen, + max_s, + ): + # Layer norm. + ln_attn, ln_mlp, residual = self.ln_layer(hidden_states, residual) + + # Self attention. + attn_output = self.self_attention( + ln_attn, + cos, + sin, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + seqlen, + max_s, + ) + + # MLP. 
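+        # Parallel-attention layout: the MLP consumes its own layer norm output (ln_mlp)
+        # and its result is summed with the attention output below before the all-reduce.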
+ mlp_output = self.mlp(ln_mlp) + + intermediate = attn_output + mlp_output + + if self.process_group.size() > 1: + torch.distributed.all_reduce(intermediate, group=self.process_group) + + return intermediate, residual + + +class FlashRWPreTrainedModel(PreTrainedModel): + config_class = RWConfig + + +class FlashRWModel(FlashRWPreTrainedModel): + def __init__(self, prefix: str, config, weights): + super().__init__(config) + self.config = config + + self.word_embeddings = TensorParallelEmbedding( + prefix=f"{prefix}.word_embeddings", weights=weights + ) + + if config.new_decoder_architecture: + self.h = nn.ModuleList( + [ + FlashRWLargeLayer(layer_id, prefix, config, weights) + for layer_id in range(config.num_hidden_layers) + ] + ) + self.cache_size = self.h[0].self_attention.num_groups + else: + self.h = nn.ModuleList( + [ + FlashRWLayer(layer_id, prefix, config, weights) + for layer_id in range(config.num_hidden_layers) + ] + ) + self.cache_size = self.h[0].self_attention.num_heads_kv + + self.ln_f = FastLayerNorm.load( + prefix=f"{prefix}.ln_f", + weights=weights, + eps=config.layer_norm_epsilon, + ) + + self.head_size = self.h[0].self_attention.head_size + + def forward( + self, + input_ids: torch.Tensor, + position_ids: torch.Tensor, + cu_seqlen_prefill: Optional[torch.Tensor], + kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], + block_tables: torch.Tensor, + slots: torch.Tensor, + seqlen: Seqlen, + max_s: int, + ) -> torch.Tensor: + hidden_states = self.word_embeddings(input_ids) + + # Get rotary cos and sin for this forward + # Avoid to index in each layer + cos, sin = self.h[0].self_attention.rotary_emb.get_cos_sin( + position_ids, max_s, hidden_states.dtype + ) + + residual = None + for i, layer in enumerate(self.h): + hidden_states, residual = layer( + hidden_states, + residual, + cos, + sin, + cu_seqlen_prefill, + kv_cache[i], + block_tables, + slots, + seqlen, + max_s, + ) + + hidden_states, _ = self.ln_f(hidden_states, residual) + + return hidden_states + + +class FlashRWForCausalLM(FlashRWPreTrainedModel): + def __init__(self, prefix: str, config, weights): + super().__init__(config) + + if not prefix: + prefix = "transformer" + else: + prefix = f"{prefix}.transformer" + + self.transformer = FlashRWModel(prefix, config, weights) + + self.lm_head = SpeculativeHead.load(config, prefix="lm_head", weights=weights) + + def forward( + self, + input_ids: torch.Tensor, + position_ids: torch.Tensor, + cu_seqlen_prefill: Optional[torch.Tensor], + kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], + block_tables: torch.Tensor, + slots: torch.Tensor, + seqlen: Seqlen, + max_s: int, + prefill_cache_indices: Optional[torch.Tensor], + lm_head_indices: Optional[torch.Tensor] = None, + adapter_data: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + hidden_states = self.transformer( + input_ids, + position_ids, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + seqlen, + max_s, + ) + if lm_head_indices is not None: + hidden_states = hidden_states[lm_head_indices] + logits = self.lm_head(hidden_states) + return logits diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py new file mode 100644 index 000000000..43eb9687f --- /dev/null +++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py @@ -0,0 +1,508 @@ +import torch +import torch.distributed + +from torch import nn +from 
transformers.activations import ACT2FN +from typing import Optional, List, Tuple + +from text_generation_server.layers.attention import ( + paged_attention, + attention, + reshape_and_cache, + Seqlen, +) +from text_generation_server.layers import ( + TensorParallelRowLinear, + TensorParallelColumnLinear, + SpeculativeHead, + TensorParallelEmbedding, + get_linear, +) +from text_generation_server.layers.attention import PREFILL_IN_KV_CACHE +from text_generation_server.layers.gptq import GPTQWeightsLoader +from text_generation_server.layers.layernorm import ( + FastLayerNorm, +) + + +def load_multi_mqa( + config, prefix: str, weights, bias: bool, head_size, num_heads, hidden_size +): + if config.quantize == "gptq": + return _load_multi_mqa_gptq( + config, prefix, weights, bias, head_size, num_heads, hidden_size + ) + elif config.quantize == "marlin": + raise RuntimeError( + "santacoder models with marlin quantization are not yet supported" + ) + else: + return _load_multi_mqa( + config, prefix, weights, bias, head_size, num_heads, hidden_size + ) + + +def _load_multi_mqa_gptq( + config, prefix: str, weights, bias: bool, head_size, num_heads, hidden_size +): + from text_generation_server.layers.gptq import GPTQWeight + + if any("c_attn" in k for k in weights.routing.keys()) and not config.transpose: + world_size = weights.process_group.size() + rank = weights.process_group.rank() + + slice_ = weights._get_slice(f"{prefix}.c_attn.qweight") + shape = slice_.get_shape() + block_size = (shape[1] - 2 * head_size) // world_size + start = rank * block_size + stop = (rank + 1) * block_size + assert (shape[1] - 2 * head_size) % world_size == 0 + q_tensor = slice_[:, start:stop] + kv_tensor = slice_[:, -2 * head_size :] + qweight = torch.cat([q_tensor, kv_tensor], dim=1) + qweight = qweight.to(device=weights.device) + + slice_ = weights._get_slice(f"{prefix}.c_attn.scales") + shape = slice_.get_shape() + block_size = (shape[1] - 2 * head_size) // world_size + start = rank * block_size + stop = (rank + 1) * block_size + assert (shape[1] - 2 * head_size) % world_size == 0 + q_tensor = slice_[:, start:stop] + kv_tensor = slice_[:, -2 * head_size :] + scales = torch.cat([q_tensor, kv_tensor], dim=1) + scales = scales.to(device=weights.device) + + slice_ = weights._get_slice(f"{prefix}.c_attn.qzeros") + shape = slice_.get_shape() + block_size = (shape[1] - (2 * head_size) * 4 // 32) // world_size + start = rank * block_size + stop = (rank + 1) * block_size + assert 2 * head_size % (32 // 4) == 0 + q_tensor = slice_[:, start:stop] + kv_tensor = slice_[:, -2 * head_size * 4 // 32 :] + qzeros = torch.cat([q_tensor, kv_tensor], dim=1) + qzeros = qzeros.to(device=weights.device) + + loader = weights.weights_loader + assert isinstance(loader, GPTQWeightsLoader) + loader._get_gptq_params(weights) + if loader.quant_method == "gptq": + g_idx = weights.get_tensor(f"{prefix}.c_attn.g_idx") + g_idx = g_idx.to(device=weights.device) + elif loader.quant_method == "awq": + g_idx = None + from text_generation_server.layers.awq.conversion_utils import ( + fast_awq_to_gptq, + ) + + qweight, qzeros = fast_awq_to_gptq(qweight, qzeros) + + from text_generation_server.layers.gptq import HAS_EXLLAMA + + weight = GPTQWeight( + qweight=qweight, + qzeros=qzeros, + scales=scales, + g_idx=g_idx, + bits=loader.bits, + groupsize=loader.groupsize, + use_awq_kernel=loader.quantize == "awq", + use_exllama=HAS_EXLLAMA, + ) + + if bias: + slice_ = weights._get_slice(f"{prefix}.c_attn.bias") + shape = slice_.get_shape() + block_size = 
(shape[0] - 2 * head_size) // world_size + assert (shape[0] - 2 * head_size) % world_size == 0 + q_tensor = slice_[start:stop] + start = rank * block_size + stop = (rank + 1) * block_size + q_tensor = slice_[start:stop] + kv_tensor = slice_[-2 * head_size :] + bias = torch.cat([q_tensor, kv_tensor], dim=0) + bias = bias.to(device=weights.device) + + return TensorParallelColumnLinear(get_linear(weight, bias)) + else: + raise NotImplementedError("Gptq loading with santacoder is not implemented") + + +def _load_multi_mqa( + config, prefix: str, weights, bias: bool, head_size, num_heads, hidden_size +): + if any("c_attn" in k for k in weights.routing.keys()): + slice_ = weights._get_slice(f"{prefix}.c_attn.weight") + shape = slice_.get_shape() + world_size = weights.process_group.size() + rank = weights.process_group.rank() + if config.transpose: + block_size = (shape[1] - 2 * head_size) // world_size + start = rank * block_size + stop = (rank + 1) * block_size + assert (shape[1] - 2 * head_size) % world_size == 0 + q_tensor = slice_[:, start:stop] + kv_tensor = slice_[:, -2 * head_size :] + weight = torch.cat([q_tensor, kv_tensor], dim=1).T + else: + block_size = (shape[0] - 2 * head_size) // world_size + start = rank * block_size + stop = (rank + 1) * block_size + assert (shape[0] - 2 * head_size) % world_size == 0 + q_tensor = slice_[start:stop] + kv_tensor = slice_[-2 * head_size :] + weight = torch.cat([q_tensor, kv_tensor], dim=0) + if bias: + slice_ = weights._get_slice(f"{prefix}.c_attn.bias") + shape = slice_.get_shape() + block_size = (shape[0] - 2 * head_size) // world_size + assert (shape[0] - 2 * head_size) % world_size == 0 + start = rank * block_size + stop = (rank + 1) * block_size + q_tensor = slice_[start:stop] + kv_tensor = slice_[-2 * head_size :] + bias = torch.cat([q_tensor, kv_tensor], dim=0) + else: + if config.transpose: + w = [ + weights.get_sharded(f"{prefix}.q_attn.weight", dim=1).T, + weights.get_tensor(f"{prefix}.kv_attn.weight").T, + ] + weight = torch.cat(w, dim=0) + else: + w = [ + weights.get_sharded(f"{prefix}.q_attn.weight", dim=0), + weights.get_tensor(f"{prefix}.kv_attn.weight"), + ] + weight = torch.cat(w, dim=1) + + if bias: + b = [ + weights.get_sharded(f"{prefix}.q_attn.bias", dim=0), + weights.get_tensor(f"{prefix}.kv_attn.bias"), + ] + bias = torch.cat(b, dim=0) + else: + bias = None + + weight = weight.to(dtype=weights.dtype).to(device=weights.device) + assert list(weight.shape) == [ + (num_heads + 2) * head_size, + hidden_size, + ], f"{weight.shape} != {[(num_heads + 2) * head_size, hidden_size]}" + if bias is not None: + bias = bias.to(dtype=weights.dtype).to(device=weights.device) + assert list(bias.shape) == [ + (num_heads + 2) * head_size + ], f"{weight.shape} != {[(num_heads + 2) * head_size]}" + return TensorParallelColumnLinear(get_linear(weight, bias)) + + +def load_col(config, prefix: str, weights, bias: bool): + if config.transpose: + weight = weights.get_sharded(f"{prefix}.weight", dim=1).T + else: + weight = weights.get_multi_weights_col([prefix], dim=0) + + if bias: + bias = weights.get_sharded(f"{prefix}.bias", dim=0) + else: + bias = None + return TensorParallelColumnLinear(get_linear(weight, bias)) + + +def load_row(config, prefix: str, weights, bias: bool): + if config.transpose: + weight = weights.get_sharded(f"{prefix}.weight", dim=0).T + else: + weight = weights.get_weights_row(prefix) + + if bias and weights.process_group.rank() == 0: + # Rank is only on the first rank process + bias = weights.get_tensor(f"{prefix}.bias") + 
else: + bias = None + return TensorParallelRowLinear( + get_linear(weight, bias), process_group=weights.process_group + ) + + +class FlashMQAttention(torch.nn.Module): + def __init__(self, prefix, config, weights): + super().__init__() + num_heads = config.num_attention_heads + hidden_size = config.hidden_size + + self.num_heads = num_heads + self.hidden_size = hidden_size + self.head_size = hidden_size // num_heads + + if self.num_heads % weights.process_group.size() != 0: + raise ValueError( + f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} " + f"and `num_shards`: {weights.process_group.size()}" + ) + self.num_heads = self.num_heads // weights.process_group.size() + + self.softmax_scale = self.head_size ** (-0.5) + + self.c_attn = load_multi_mqa( + config, + prefix=prefix, + weights=weights, + bias=True, + head_size=self.head_size, + hidden_size=hidden_size, + num_heads=self.num_heads, + ) + self.c_proj = load_row( + config, prefix=f"{prefix}.c_proj", weights=weights, bias=True + ) + self.kv_head_mapping = torch.zeros( + self.num_heads, dtype=torch.int32, device=weights.device + ) + + def forward( + self, + hidden_states, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + seqlen, + max_s, + ): + qkv = self.c_attn(hidden_states) + + # Split query from key_value + query, key_value = qkv.split( + [self.head_size * self.num_heads, 2 * self.head_size], dim=1 + ) + + # Prepare query and key_value for indexing + query = query.view(-1, self.num_heads, self.head_size) + key_value = key_value.view(-1, 2, 1, self.head_size) + + reshape_and_cache( + key_value[:, 0], key_value[:, 1], kv_cache[0], kv_cache[1], slots + ) + + # Prefill + if cu_seqlen_prefill is not None: + # flash attention + attn_output = attention( + query, + kv_cache[0] if PREFILL_IN_KV_CACHE else key_value[:, 0], + kv_cache[1] if PREFILL_IN_KV_CACHE else key_value[:, 1], + seqlen, + block_tables, + self.softmax_scale, + ) + # Decode + else: + attn_output = paged_attention( + query, + kv_cache[0], + kv_cache[1], + self.kv_head_mapping, + self.softmax_scale, + block_tables, + seqlen, + max_s, + ) + + return self.c_proj(attn_output.view(-1, self.num_heads * self.head_size)) + + +class MLP(nn.Module): + def __init__(self, prefix, config, weights): + super().__init__() + act = config.activation_function + self.act = ( + ACT2FN[act] + if "gelu" not in act + else lambda x: torch.nn.functional.gelu( + x, + approximate=( + "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none" + ), + ) + ) + + self.c_fc = load_col( + config, prefix=f"{prefix}.c_fc", weights=weights, bias=True + ) + self.c_proj = load_row( + config, prefix=f"{prefix}.c_proj", weights=weights, bias=True + ) + + def forward(self, hidden_states): + hidden_states = self.c_fc(hidden_states) + hidden_states = self.act(hidden_states) + hidden_states = self.c_proj(hidden_states) + return hidden_states + + +class Block(nn.Module): + def __init__(self, prefix: str, layer_id, config, weights): + super().__init__() + prefix = f"{prefix}.h.{layer_id}" + self.ln_1 = FastLayerNorm.load( + prefix=f"{prefix}.ln_1", weights=weights, eps=config.layer_norm_epsilon + ) + self.ln_2 = FastLayerNorm.load( + prefix=f"{prefix}.ln_2", weights=weights, eps=config.layer_norm_epsilon + ) + self.self_attn = FlashMQAttention( + prefix=f"{prefix}.attn", + config=config, + weights=weights, + ) + self.mlp = MLP( + prefix=f"{prefix}.mlp", + config=config, + weights=weights, + ) + + def forward( + self, + hidden_states, + residual, + cu_seqlen_prefill, + 
kv_cache, + block_tables, + slots, + seqlen, + max_s, + ): + hidden_states, residual = self.ln_1(hidden_states, residual) + hidden_states = self.self_attn( + hidden_states, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + seqlen, + max_s, + ) + + hidden_states, residual = self.ln_2(hidden_states, residual) + + mlp_output = self.mlp(hidden_states) + + return mlp_output, residual + + +class FlashSantacoderModel(nn.Module): + def __init__(self, prefix: str, config, weights): + super().__init__() + self.config = config + + self.process_group = weights.process_group + self.wte = TensorParallelEmbedding( + prefix=f"{prefix}.wte", + weights=weights, + reduce=False, + ) + self.wpe = TensorParallelEmbedding( + prefix=f"{prefix}.wpe", + weights=weights, + reduce=False, + ) + + self.layers = nn.ModuleList( + [ + Block( + prefix, + layer_id, + config, + weights, + ) + for layer_id in range(config.num_hidden_layers) + ] + ) + self.ln_f = FastLayerNorm.load( + prefix="transformer.ln_f", weights=weights, eps=config.layer_norm_epsilon + ) + + self.head_size = self.layers[0].self_attn.head_size + self.num_heads = self.layers[0].self_attn.num_heads + + def forward( + self, + input_ids: torch.Tensor, + position_ids: torch.Tensor, + cu_seqlen_prefill: Optional[torch.Tensor], + kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], + block_tables: torch.Tensor, + slots: torch.Tensor, + seqlen: Seqlen, + max_s: int, + ) -> torch.Tensor: + hidden_states = self.wte(input_ids) + self.wpe(position_ids) + + if self.process_group.size() > 1: + torch.distributed.all_reduce(hidden_states, group=self.process_group) + + residual = None + for i, layer in enumerate(self.layers): + hidden_states, residual = layer( + hidden_states, + residual, + cu_seqlen_prefill, + kv_cache[i], + block_tables, + slots, + seqlen, + max_s, + ) + + hidden_states, _ = self.ln_f(hidden_states, residual) + + return hidden_states + + +class FlashSantacoderForCausalLM(nn.Module): + def __init__(self, prefix, config, weights): + super().__init__() + + if not prefix: + prefix = "transformer" + else: + prefix = f"{prefix}.transformer" + + config.transpose = config.architectures[0].startswith("GPT2") + self.model = FlashSantacoderModel(prefix, config, weights) + self.lm_head = SpeculativeHead.load( + config, prefix=f"{prefix}.wte", weights=weights + ) + + def forward( + self, + input_ids: torch.Tensor, + position_ids: torch.Tensor, + cu_seqlen_prefill: Optional[torch.Tensor], + kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], + block_tables: torch.Tensor, + slots: torch.Tensor, + seqlen: Seqlen, + max_s: int, + prefill_cache_indices: Optional[torch.Tensor], + lm_head_indices: Optional[torch.Tensor] = None, + adapter_data: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + hidden_states = self.model( + input_ids, + position_ids, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + seqlen, + max_s, + ) + if lm_head_indices is not None: + hidden_states = hidden_states[lm_head_indices] + logits = self.lm_head(hidden_states) + return logits diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_starcoder2_modeling.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_starcoder2_modeling.py new file mode 100644 index 000000000..4975cf225 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_starcoder2_modeling.py @@ -0,0 +1,554 @@ +# coding=utf-8 +# Copyright 2024 Starcoder2 AI and the HuggingFace Inc. team. All rights reserved. 
+# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import torch.distributed + +from torch import nn +from transformers.activations import ACT2FN +from transformers.configuration_utils import PretrainedConfig +from typing import Optional, List, Tuple + +from text_generation_server.layers.attention import ( + paged_attention, + attention, + reshape_and_cache, + Seqlen, +) +from text_generation_server.layers import ( + TensorParallelRowLinear, + TensorParallelColumnLinear, + TensorParallelEmbedding, + SpeculativeHead, + get_linear, +) +from text_generation_server.layers.attention import PREFILL_IN_KV_CACHE +from text_generation_server.layers.layernorm import ( + FastLayerNorm, + FastRMSNorm, +) +from text_generation_server.layers.rotary import ( + PositionRotaryEmbedding, +) +from text_generation_server.utils.weights import UnquantizedWeight + + +class Starcoder2Config(PretrainedConfig): + model_type = "starcoder2" + + def __init__( + self, + vocab_size=49152, + hidden_size=3072, + intermediate_size=12288, + num_hidden_layers=30, + num_attention_heads=24, + num_key_value_heads=2, + mlp_type="default", + hidden_act="gelu_pytorch_tanh", + max_position_embeddings=4096, + initializer_range=0.018042, + norm_type="layer_norm", + norm_epsilon=1e-5, + use_cache=True, + bos_token_id=50256, + eos_token_id=50256, + rope_theta=10000.0, + sliding_window=None, + attention_dropout=0.0, + residual_dropout=0.0, + embedding_dropout=0.0, + use_bias: bool = True, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.sliding_window = sliding_window + self.use_bias = use_bias + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.mlp_type = mlp_type + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.norm_type = norm_type + self.norm_epsilon = norm_epsilon + self.use_cache = use_cache + self.rope_theta = rope_theta + self.attention_dropout = attention_dropout + self.residual_dropout = residual_dropout + self.embedding_dropout = embedding_dropout + + super().__init__( + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + **kwargs, + ) + + +def load_attention(config, prefix, weights): + if config.num_attention_heads != config.num_key_value_heads: + return _load_gqa(config, prefix, weights) + else: + return TensorParallelColumnLinear.load_multi( + config, + prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"], + dim=0, + 
weights=weights, + bias=config.use_bias, + ) + + +def _load_gqa(config, prefix: str, weights): + assert config.hidden_size % config.num_attention_heads == 0 + assert config.num_attention_heads % weights.process_group.size() == 0 + + weight = weights.get_multi_weights_col( + prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"], + dim=0, + ) + + if isinstance(weight, UnquantizedWeight): + weight.weight = weight.weight.to(dtype=weights.dtype).to(device=weights.device) + + head_size = config.hidden_size // config.num_attention_heads + num_heads = config.num_attention_heads // weights.process_group.size() + num_key_value_heads = config.num_key_value_heads // weights.process_group.size() + assert list(weight.weight.shape) == [ + (num_heads + 2 * num_key_value_heads) * head_size, + config.hidden_size, + ], f"{list(weight.weight.shape)} != {[(num_heads + 2 * config.num_key_value_heads) * head_size, config.hidden_size]}" + + if config.use_bias: + w = [ + weights.get_sharded(f"{p}.bias", dim=0) + for p in [f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"] + ] + bias = torch.cat(w, dim=0).to(dtype=weights.dtype).to(device=weights.device) + else: + bias = None + + return TensorParallelColumnLinear(get_linear(weight, bias=bias)) + + +class Starcoder2Attention(torch.nn.Module): + def __init__( + self, + prefix: str, + config, + weights, + ): + super().__init__() + self.max_past = ( + config.sliding_window if config.sliding_window is not None else -1 + ) + self.num_heads = config.num_attention_heads + self.hidden_size = config.hidden_size + self.head_size = self.hidden_size // self.num_heads + + self.rotary_emb = PositionRotaryEmbedding.static( + config=config, + dim=self.head_size, + base=config.rope_theta, + device=weights.device, + ) + + self.softmax_scale = self.head_size**-0.5 + + if self.num_heads % weights.process_group.size() != 0: + raise ValueError( + f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} " + f"and `num_shards`: {weights.process_group.size()}" + ) + self.num_heads = self.num_heads // weights.process_group.size() + self.num_key_value_heads = ( + config.num_key_value_heads // weights.process_group.size() + ) + + self.query_key_value = load_attention(config, prefix, weights) + + self.o_proj = TensorParallelRowLinear.load( + config, + prefix=f"{prefix}.o_proj", + weights=weights, + bias=config.use_bias, + ) + self.num_groups = self.num_heads // self.num_key_value_heads + self.kv_head_mapping = torch.arange( + 0, self.num_key_value_heads, dtype=torch.int32, device=weights.device + ).repeat_interleave(self.num_groups) + + def forward( + self, + hidden_states, + cos, + sin, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + seqlen, + max_s, + prefill_cache_indices, + ): + qkv = self.query_key_value(hidden_states) + query, kv = qkv.split( + [ + self.head_size * self.num_heads, + 2 * self.head_size * self.num_key_value_heads, + ], + dim=1, + ) + query = query.view(-1, self.num_heads, self.head_size) + kv = kv.view(-1, 2, self.num_key_value_heads, self.head_size) + + self.rotary_emb(query, torch.select(kv, dim=1, index=0), cos, sin) + + if prefill_cache_indices is not None: + kv_to_cache = kv[prefill_cache_indices] + else: + kv_to_cache = kv + + reshape_and_cache( + kv_to_cache[:, 0], kv_to_cache[:, 1], kv_cache[0], kv_cache[1], slots + ) + + # Prefill + if cu_seqlen_prefill is not None: + # flash attention + attn_output = attention( + query, + kv_cache[0] if PREFILL_IN_KV_CACHE else kv_to_cache[:, 0], + kv_cache[1] if 
PREFILL_IN_KV_CACHE else kv_to_cache[:, 1], + seqlen, + block_tables, + self.softmax_scale, + window_size_left=self.max_past, + ) + # Decode + else: + attn_output = paged_attention( + query, + kv_cache[0], + kv_cache[1], + self.kv_head_mapping, + self.softmax_scale, + block_tables, + seqlen, + max_s, + ) + + return self.o_proj(attn_output.view(-1, self.num_heads * self.head_size)) + + +class Starcoder2MLP(nn.Module): + def __init__(self, prefix, config, weights): + super().__init__() + act = config.hidden_act + self.act = ( + ACT2FN[act] + if "gelu" not in act + else lambda x: torch.nn.functional.gelu( + x, + approximate=( + "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none" + ), + ) + ) + # Fuse gate and up proj + self.c_fc = TensorParallelColumnLinear.load( + config, + prefix=f"{prefix}.c_fc", + weights=weights, + bias=config.use_bias, + ) + self.c_proj = TensorParallelRowLinear.load( + config, + prefix=f"{prefix}.c_proj", + weights=weights, + bias=config.use_bias, + ) + + def forward(self, hidden_states): + hidden_states = self.c_fc(hidden_states) + hidden_states = self.act(hidden_states) + return self.c_proj(hidden_states) + + +class Starcoder2GatedMLP(nn.Module): + def __init__(self, prefix, config, weights): + super().__init__() + act = config.hidden_act + self.act = ( + ACT2FN[act] + if "gelu" not in act + else lambda x: torch.nn.functional.gelu( + x, + approximate=( + "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none" + ), + ) + ) + # Fuse gate and up proj + self.gate_up_proj = TensorParallelColumnLinear.load_multi( + config, + prefixes=[f"{prefix}.gate_proj", f"{prefix}.up_proj"], + weights=weights, + dim=0, + bias=config.use_bias, + ) + self.down_proj = TensorParallelRowLinear.load( + config, + prefix=f"{prefix}.down_proj", + weights=weights, + bias=config.use_bias, + ) + self.intermediate_size = ( + config.intermediate_size // weights.process_group.size() + ) + + def forward(self, hidden_states): + gate_up_states = self.gate_up_proj(hidden_states) + gate_up_states = gate_up_states.view(-1, 2, self.intermediate_size) + return self.down_proj(self.act(gate_up_states[:, 0]) * gate_up_states[:, 1]) + + +STARCODER2_NORMALIZATION_CLASSES = { + "layer_norm": FastLayerNorm, + "rms_norm": FastRMSNorm, +} + +STARCODER2_MLP_CLASSES = { + "default": Starcoder2MLP, + "gated": Starcoder2GatedMLP, +} + + +class Starcoder2Layer(nn.Module): + def __init__(self, layer_id, config, weights): + super().__init__() + prefix = f"model.layers.{layer_id}" + self.self_attn = Starcoder2Attention( + prefix=f"{prefix}.self_attn", config=config, weights=weights + ) + + self.mlp = STARCODER2_MLP_CLASSES[config.mlp_type]( + prefix=f"{prefix}.mlp", config=config, weights=weights + ) + + self.input_layernorm = STARCODER2_NORMALIZATION_CLASSES[config.norm_type].load( + prefix=f"{prefix}.input_layernorm", weights=weights, eps=config.norm_epsilon + ) + self.post_attention_layernorm = STARCODER2_NORMALIZATION_CLASSES[ + config.norm_type + ].load( + prefix=f"{prefix}.post_attention_layernorm", + weights=weights, + eps=config.norm_epsilon, + ) + + def forward( + self, + hidden_states, + residual, + cos, + sin, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + seqlen, + max_s, + prefill_cache_indices, + ): + normed_hidden_states, res = self.input_layernorm(hidden_states, residual) + + # Self Attention + attn_output = self.self_attn( + normed_hidden_states, + cos, + sin, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + seqlen, + max_s, + prefill_cache_indices, + ) + + # 
faster post attention rms norm + normed_attn_res_output, attn_res = self.post_attention_layernorm( + attn_output, res + ) + + mlp_output = self.mlp(normed_attn_res_output) + + return mlp_output, attn_res + + +class Starcoder2Model(torch.nn.Module): + def __init__(self, prefix, config, weights): + super().__init__() + + process_group = weights.process_group + self.tp_rank = process_group.rank() + self.tp_world_size = process_group.size() + self.embed_tokens = TensorParallelEmbedding( + prefix=f"{prefix}.embed_tokens", weights=weights + ) + self.layers = nn.ModuleList( + [ + Starcoder2Layer( + layer_id, + config, + weights, + ) + for layer_id in range(config.num_hidden_layers) + ] + ) + self.norm = STARCODER2_NORMALIZATION_CLASSES[config.norm_type].load( + prefix=f"{prefix}.norm", weights=weights, eps=config.norm_epsilon + ) + + self.gradient_checkpointing = False + + self.head_size = self.layers[0].self_attn.head_size + self.num_heads = self.layers[0].self_attn.num_heads + self.num_key_value_heads = self.layers[0].self_attn.num_key_value_heads + + def forward( + self, + input_ids: torch.Tensor, + position_ids: torch.Tensor, + cu_seqlen_prefill: Optional[torch.Tensor], + kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], + block_tables: torch.Tensor, + slots: torch.Tensor, + seqlen: Seqlen, + max_s: int, + true_max_s: int, + prefill_cache_indices: Optional[torch.Tensor], + ) -> torch.Tensor: + hidden_states = self.embed_tokens(input_ids) + + # Get rotary cos and sin for this forward + # Avoid to index in each layer + cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin( + position_ids, true_max_s, hidden_states.dtype + ) + + residual = None + for i, layer in enumerate(self.layers): + hidden_states, residual = layer( + hidden_states, + residual, + cos, + sin, + cu_seqlen_prefill, + kv_cache[i], + block_tables, + slots, + seqlen, + max_s, + prefill_cache_indices, + ) + + hidden_states, _ = self.norm(hidden_states, residual) + + return hidden_states + + +class FlashStarcoder2ForCausalLM(torch.nn.Module): + def __init__(self, prefix, config, weights): + super().__init__() + + if not prefix: + prefix = "model" + else: + prefix = f"{prefix}.model" + + self.model = Starcoder2Model(prefix, config, weights) + try: + self.lm_head = SpeculativeHead.load( + config, + prefix="lm_head", + weights=weights, + ) + except RuntimeError: + self.lm_head = SpeculativeHead.load( + config, + prefix=f"{prefix}.embed_tokens", + weights=weights, + ) + + self.max_past = config.sliding_window + self.max_past_tensor = ( + torch.tensor(config.sliding_window, device=weights.device) + if self.max_past is not None + else None + ) + + def forward( + self, + input_ids: torch.Tensor, + position_ids: torch.Tensor, + cu_seqlen_prefill: Optional[torch.Tensor], + kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], + block_tables: torch.Tensor, + slots: torch.Tensor, + seqlen: Seqlen, + max_s: int, + prefill_cache_indices: Optional[torch.Tensor], + lm_head_indices: Optional[torch.Tensor] = None, + adapter_data: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + true_max_s = max_s + if prefill_cache_indices is not None: + # Slots also need to be sliced as it has the same size as the whole kv tensor + slots = slots[prefill_cache_indices] + elif self.max_past is not None: + # Clamp in decode mode as paged attention requires clamped values whereas the flash attention + # kernel requires the true values + seqlen = seqlen.clamp(max=self.max_past_tensor) + + hidden_states = self.model( + input_ids, + position_ids, + 
cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + seqlen, + max_s, + true_max_s, + prefill_cache_indices, + ) + if lm_head_indices is not None: + hidden_states = hidden_states[lm_head_indices] + logits = self.lm_head(hidden_states) + return logits diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/idefics2.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/idefics2.py new file mode 100644 index 000000000..a829c3741 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/idefics2.py @@ -0,0 +1,839 @@ +# coding=utf-8 +# Copyright 2024 the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch Idefics2 model.""" + +from typing import List, Optional, Tuple + +import torch +import torch.utils.checkpoint +from torch import nn +import math + +from transformers.activations import ACT2FN +from text_generation_server.models.custom_modeling.vlm import ( + load_text_model, +) +from text_generation_server.layers.attention import Seqlen +from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask + +from text_generation_server.layers import ( + TensorParallelColumnLinear, + TensorParallelEmbedding, + TensorParallelRowLinear, +) +from text_generation_server.utils.weights import DefaultWeightsLoader, UnquantizedWeight + + +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand( + batch, num_key_value_heads, n_rep, slen, head_dim + ) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +class Idefics2VisionEmbeddings(nn.Module): + """ + This is a modified version of `siglip.modelign_siglip.SiglipVisionEmbeddings` to enable images of variable + resolution. + + The modifications are adapted from [Patch n' Pack: NaViT, a Vision Transformer for any Aspect Ratio and Resolution](https://arxiv.org/abs/2307.06304) + which allows treating images in their native aspect ratio and without the need to resize them to the same + fixed size. In particular, we start from the original pre-trained SigLIP model + (which uses images of fixed-size square images) and adapt it by training on images of variable resolutions. 
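+    Position ids are assigned per image in `forward` by bucketizing fractional patch coordinates
+    (derived from `patch_attention_mask`) against `num_patches_per_side` boundaries, so each image
+    keeps position embeddings consistent with its own height and width in patches.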
+ """ + + def __init__(self, prefix, config, weights): + super().__init__() + self.embed_dim = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + + self.patch_embedding = nn.Conv2d( + in_channels=config.num_channels, + out_channels=self.embed_dim, + kernel_size=self.patch_size, + stride=self.patch_size, + padding="valid", + ) + self.patch_embedding.weight = nn.Parameter( + weights.get_tensor(f"{prefix}.patch_embedding.weight"), requires_grad=False + ) + self.patch_embedding.bias = nn.Parameter( + weights.get_tensor(f"{prefix}.patch_embedding.bias"), requires_grad=False + ) + + self.num_patches_per_side = self.image_size // self.patch_size + self.num_patches = self.num_patches_per_side**2 + self.num_positions = self.num_patches + self.position_embedding = TensorParallelEmbedding( + prefix=f"{prefix}.position_embedding", weights=weights + ) + + def forward( + self, pixel_values: torch.FloatTensor, patch_attention_mask: torch.BoolTensor + ) -> torch.Tensor: + batch_size, _, max_im_h, max_im_w = pixel_values.shape + + patch_embeds = self.patch_embedding(pixel_values) + embeddings = patch_embeds.flatten(2).transpose(1, 2) + + max_nb_patches_h, max_nb_patches_w = ( + max_im_h // self.patch_size, + max_im_w // self.patch_size, + ) + boundaries = torch.arange( + 1 / self.num_patches_per_side, 1.0, 1 / self.num_patches_per_side + ) + position_ids = torch.full( + size=(batch_size, max_nb_patches_h * max_nb_patches_w), fill_value=0 + ) + + for batch_idx, p_attn_mask in enumerate(patch_attention_mask): + nb_patches_h = p_attn_mask[:, 0].sum() + nb_patches_w = p_attn_mask[0].sum() + + fractional_coords_h = torch.arange(0, 1 - 1e-6, 1 / nb_patches_h) + fractional_coords_w = torch.arange(0, 1 - 1e-6, 1 / nb_patches_w) + + bucket_coords_h = torch.bucketize( + fractional_coords_h, boundaries, right=True + ) + bucket_coords_w = torch.bucketize( + fractional_coords_w, boundaries, right=True + ) + + pos_ids = ( + bucket_coords_h[:, None] * self.num_patches_per_side + bucket_coords_w + ).flatten() + position_ids[batch_idx][p_attn_mask.view(-1).cpu()] = pos_ids + + position_ids = position_ids.to(self.position_embedding.weight.device) + embeddings = embeddings + self.position_embedding(position_ids) + return embeddings + + +class Idefics2VisionAttention(nn.Module): + def __init__(self, prefix, config, weights): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_size = self.embed_dim // self.num_heads + if self.head_size * self.num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {self.num_heads})." 
+ ) + self.scale = self.head_size**-0.5 + self.dropout = config.attention_dropout + + self.num_heads = self.num_heads // weights.process_group.size() + self.embed_dim = self.embed_dim // weights.process_group.size() + + self.qkv = TensorParallelColumnLinear.load_multi( + config, + prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"], + dim=0, + weights=weights, + bias=True, + ) + self.out_proj = TensorParallelRowLinear.load( + config=config, prefix=f"{prefix}.out_proj", weights=weights, bias=True + ) + self.is_causal = False + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + batch_size, q_len, _ = hidden_states.size() + + qkv = self.qkv(hidden_states) + query_states, key_states, value_states = qkv.split( + [ + self.head_size * self.num_heads, + self.head_size * self.num_heads, + self.head_size * self.num_heads, + ], + dim=2, + ) + + query_states = query_states.view( + batch_size, q_len, self.num_heads, self.head_size + ).transpose(1, 2) + key_states = key_states.view( + batch_size, q_len, self.num_heads, self.head_size + ).transpose(1, 2) + value_states = value_states.view( + batch_size, q_len, self.num_heads, self.head_size + ).transpose(1, 2) + + k_v_seq_len = key_states.shape[-2] + attn_weights = ( + torch.matmul(query_states, key_states.transpose(2, 3)) * self.scale + ) + + if attn_weights.size() != (batch_size, self.num_heads, q_len, k_v_seq_len): + raise ValueError( + f"Attention weights should be of size {(batch_size, self.num_heads, q_len, k_v_seq_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (batch_size, 1, q_len, k_v_seq_len): + raise ValueError( + f"Attention mask should be of size {(batch_size, 1, q_len, k_v_seq_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights + attention_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax( + attn_weights, dim=-1, dtype=torch.float32 + ).to(query_states.dtype) + attn_weights = nn.functional.dropout( + attn_weights, p=self.dropout, training=self.training + ) + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (batch_size, self.num_heads, q_len, self.head_size): + raise ValueError( + f"`attn_output` should be of size {(batch_size, self.num_heads, q_len, self.head_size)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(batch_size, q_len, self.embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output + + +class Idefics2VisionMLP(nn.Module): + def __init__(self, prefix, config, weights): + super().__init__() + self.config = config + self.activation_fn = ACT2FN[config.hidden_act] + self.fc1 = TensorParallelColumnLinear.load( + prefix=f"{prefix}.fc1", config=config, weights=weights, bias=True + ) + self.fc2 = TensorParallelRowLinear.load( + prefix=f"{prefix}.fc2", config=config, weights=weights, bias=True + ) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states = self.fc2(hidden_states) + return hidden_states + + +class Idefics2EncoderLayer(nn.Module): + def __init__(self, prefix, config, weights): + super().__init__() + self.embed_dim = config.hidden_size + self.self_attn = Idefics2VisionAttention( + prefix=f"{prefix}.self_attn", config=config, weights=weights + ) + self.layer_norm1 = 
nn.LayerNorm.load( + prefix=f"{prefix}.layer_norm1", eps=config.layer_norm_eps, weights=weights + ) + self.layer_norm2 = nn.LayerNorm.load( + prefix=f"{prefix}.layer_norm2", eps=config.layer_norm_eps, weights=weights + ) + self.mlp = Idefics2VisionMLP( + prefix=f"{prefix}.mlp", config=config, weights=weights + ) + + # Copied from transformers.models.siglip.modeling_siglip.SiglipEncoderLayer.forward + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + ) -> torch.Tensor: + residual = hidden_states + + hidden_states = self.layer_norm1(hidden_states) + hidden_states = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + ) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.layer_norm2(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + return hidden_states + + +class Idefics2Encoder(nn.Module): + def __init__(self, prefix, config, weights): + super().__init__() + self.config = config + self.layers = nn.ModuleList( + [ + Idefics2EncoderLayer( + prefix=f"{prefix}.layers.{i}", config=config, weights=weights + ) + for i in range(config.num_hidden_layers) + ] + ) + + # Ignore copy + def forward( + self, + inputs_embeds, + attention_mask: Optional[torch.Tensor] = None, + ): + hidden_states = inputs_embeds + for encoder_layer in self.layers: + hidden_states = encoder_layer( + hidden_states, + attention_mask, + ) + return hidden_states + + +class Idefics2VisionTransformer(nn.Module): + def __init__(self, prefix, config, weights): + super().__init__() + self.config = config + self.embeddings = Idefics2VisionEmbeddings( + prefix=f"{prefix}.embeddings", config=config, weights=weights + ) + self.encoder = Idefics2Encoder( + prefix=f"{prefix}.encoder", config=config, weights=weights + ) + self.post_layernorm = nn.LayerNorm.load( + prefix=f"{prefix}.post_layernorm", + weights=weights, + eps=config.layer_norm_eps, + ) + + def forward( + self, + pixel_values, + patch_attention_mask: Optional[torch.BoolTensor] = None, + ): + batch_size = pixel_values.size(0) + if patch_attention_mask is None: + patch_size = self.config.patch_size + patch_attention_mask = torch.ones( + ( + batch_size, + pixel_values.size(2) // patch_size, + pixel_values.size(3) // patch_size, + ) + ) + patch_attention_mask = patch_attention_mask.to( + dtype=torch.bool, device=pixel_values.device + ) + + hidden_states = self.embeddings( + pixel_values=pixel_values, patch_attention_mask=patch_attention_mask + ) + + patch_attention_mask = patch_attention_mask.view(batch_size, -1) + # The call to `_upad_input` in `_flash_attention_forward` is expensive + # So when the `patch_attention_mask` is full of 1s (i.e. 
attending to the whole sequence), + # avoiding passing the attention_mask, which is equivalent to attending to the full sequence + if not torch.any(~patch_attention_mask): + patch_attention_mask = None + else: + patch_attention_mask = _prepare_4d_attention_mask( + patch_attention_mask, hidden_states.dtype + ) + + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + attention_mask=patch_attention_mask, + ) + + last_hidden_state = encoder_outputs + last_hidden_state = self.post_layernorm(last_hidden_state) + + return last_hidden_state + + +class Idefics2MLP(nn.Module): + def __init__(self, prefix, config, weights): + super().__init__() + act = config.text_config.hidden_act + self.act = ( + ACT2FN[act] + if "gelu" not in act + else lambda x: torch.nn.functional.gelu( + x, + approximate=( + "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none" + ), + ) + ) + self.gate_up_proj = TensorParallelColumnLinear.load_multi( + config, + prefixes=[f"{prefix}.gate_proj", f"{prefix}.up_proj"], + weights=weights, + dim=0, + bias=False, + ) + self.down_proj = TensorParallelRowLinear.load( + config, + prefix=f"{prefix}.down_proj", + weights=weights, + bias=False, + ) + + def forward(self, hidden_states): + start_shape = hidden_states.shape[:-1] + gate_up_states = self.gate_up_proj(hidden_states) + intermediate_size = gate_up_states.shape[-1] // 2 + gate_up_states = gate_up_states.view(-1, 2, intermediate_size) + return self.down_proj( + self.act(gate_up_states[:, 0]) * gate_up_states[:, 1] + ).view(*start_shape, -1) + + +class Idefics2RMSNorm(nn.Module): + def __init__(self, prefix, weights, eps): + """ + Idefics2RMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter( + weights.get_tensor(f"{prefix}.weight"), requires_grad=False + ) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + +class Idefics2PerceiverAttention(nn.Module): + def __init__(self, prefix, config, weights): + super().__init__() + + self.layer_idx = None + self.hidden_size = config.text_config.hidden_size + self.num_heads = config.perceiver_config.resampler_n_heads + self.head_size = config.perceiver_config.resampler_head_dim + self.num_key_value_heads = config.perceiver_config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.attention_dropout = config.perceiver_config.attention_dropout + self.num_heads = self.num_heads // weights.process_group.size() + self.num_key_value_heads = ( + self.num_key_value_heads // weights.process_group.size() + ) + + self.q_proj = TensorParallelColumnLinear.load( + config, + prefix=f"{prefix}.q_proj", + weights=weights, + bias=False, + ) + self.kv = TensorParallelColumnLinear.load_multi( + config, + prefixes=[f"{prefix}.k_proj", f"{prefix}.v_proj"], + dim=0, + weights=weights, + bias=False, + ) + self.o_proj = TensorParallelRowLinear.load( + config=config, prefix=f"{prefix}.o_proj", weights=weights, bias=False + ) + + self.is_causal = False + + def forward( + self, + latents: torch.Tensor, + context: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + bsz, q_len, _ = latents.size() + kv_seq_len = q_len + 
context.size()[1] + + hidden_states = torch.concat([context, latents], dim=-2) + query_states = self.q_proj(latents) + kv = self.kv(hidden_states) + key_states, value_states = kv.split( + [ + self.head_size * self.num_key_value_heads, + self.head_size * self.num_key_value_heads, + ], + dim=2, + ) + + query_states = query_states.view( + bsz, q_len, self.num_heads, self.head_size + ).transpose(1, 2) + key_states = key_states.view( + bsz, kv_seq_len, self.num_key_value_heads, self.head_size + ).transpose(1, 2) + value_states = value_states.view( + bsz, kv_seq_len, self.num_key_value_heads, self.head_size + ).transpose(1, 2) + + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + attn_weights = torch.matmul( + query_states, key_states.transpose(2, 3) + ) / math.sqrt(self.head_size) + + if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): + raise ValueError( + f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + + attn_weights = attn_weights + attention_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax( + attn_weights, dim=-1, dtype=torch.float32 + ).to(query_states.dtype) + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (bsz, self.num_heads, q_len, self.head_size): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_size)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.head_size) + + attn_output = self.o_proj(attn_output) + + return attn_output + + +class Idefics2PerceiverLayer(nn.Module): + def __init__(self, prefix, config, weights): + super().__init__() + self.hidden_size = config.text_config.hidden_size + self.n_latents = config.perceiver_config.resampler_n_latents + self.depth = config.perceiver_config.resampler_depth + self.rms_norm_eps = config.text_config.rms_norm_eps + + self.input_latents_norm = Idefics2RMSNorm( + prefix=f"{prefix}.input_latents_norm", + weights=weights, + eps=self.rms_norm_eps, + ) + self.input_context_norm = Idefics2RMSNorm( + prefix=f"{prefix}.input_context_norm", + weights=weights, + eps=self.rms_norm_eps, + ) + self.self_attn = Idefics2PerceiverAttention( + prefix=f"{prefix}.self_attn", config=config, weights=weights + ) + self.post_attention_layernorm = Idefics2RMSNorm( + prefix=f"{prefix}.post_attention_layernorm", + weights=weights, + eps=self.rms_norm_eps, + ) + self.mlp = Idefics2MLP(prefix=f"{prefix}.mlp", config=config, weights=weights) + + def forward( + self, + latents: torch.Tensor, + context: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + ): + """ + Args: + latents (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + context (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): attention mask of size + `(batch, sequence_length)` where padding elements are indicated by 0. 
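+        Returns:
+            `torch.FloatTensor`: the refined latents, with the same shape as the input `latents`.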
+ """ + residual = latents + + latents = self.input_latents_norm(latents) + context = self.input_context_norm(context) + + latents = self.self_attn( + latents=latents, + context=context, + attention_mask=attention_mask, + ) + latents = residual + latents + residual = latents + + latents = self.post_attention_layernorm(latents) + latents = self.mlp(latents) + latents = residual + latents + + return latents + + +class Idefics2PerceiverResampler(nn.Module): + def __init__(self, prefix, config, weights) -> None: + super().__init__() + self.hidden_size = config.text_config.hidden_size + self.hidden_act = config.perceiver_config.hidden_act + self.n_latents = config.perceiver_config.resampler_n_latents + self.depth = config.perceiver_config.resampler_depth + self.rms_norm_eps = config.text_config.rms_norm_eps + + # Create Latents for Perceiver + self.latents = weights.get_tensor(f"{prefix}.latents") + + # Create Transformer Blocks + self.layers = nn.ModuleList( + [ + Idefics2PerceiverLayer( + prefix=f"{prefix}.layers.{idx}", config=config, weights=weights + ) + for idx in range(self.depth) + ] + ) + self.norm = Idefics2RMSNorm( + prefix=f"{prefix}.norm", + weights=weights, + eps=config.text_config.rms_norm_eps, + ) + + def forward( + self, + context: torch.Tensor, + attention_mask, + ) -> torch.Tensor: + # seq embed -> bsz seq embed + latents = self.latents.unsqueeze(0).expand( + (context.shape[0], *self.latents.size()) + ) + + latent_attention_mask = torch.ones( + (attention_mask.size(0), latents.size(1)), + dtype=attention_mask.dtype, + device=attention_mask.device, + ) + attention_mask = torch.cat([attention_mask, latent_attention_mask], dim=-1) + attention_mask = _prepare_4d_attention_mask( + attention_mask, latents.dtype, tgt_len=self.n_latents + ) + + compressed_context = latents + for perceiver_layer in self.layers: + compressed_context = perceiver_layer( + compressed_context, + context, + attention_mask=attention_mask, + ) + compressed_context = self.norm(compressed_context) + + return compressed_context + + +class Idefics2Connector(nn.Module): + def __init__(self, prefix, config, weights): + super().__init__() + self.modality_projection = Idefics2MLP( + prefix=f"{prefix}.modality_projection", config=config, weights=weights + ) + self.perceiver_resampler = Idefics2PerceiverResampler( + prefix=f"{prefix}.perceiver_resampler", config=config, weights=weights + ) + + def forward(self, image_hidden_states, attention_mask): + image_hidden_states = self.modality_projection(image_hidden_states) + image_hidden_states = self.perceiver_resampler( + context=image_hidden_states, attention_mask=attention_mask + ) + return image_hidden_states + + +class Idefics2ForConditionalGeneration(nn.Module): + def __init__(self, prefix, config, weights): + super().__init__() + config.vision_config.quantize = None + config.vision_config.speculator = config.speculator + config.text_config.quantize = config.quantize + config.text_config.speculator = config.speculator + + vision_config = config.vision_config + self.text_model = load_text_model( + prefix="model" if not prefix else f"{prefix}.model", + config=config.text_config, + weights=weights, + name="text_model", + ) + self.dtype = weights.dtype + + # The vision and connector models are not quantized. 
+ with weights.use_loader(DefaultWeightsLoader(UnquantizedWeight)): + self.vision_model = Idefics2VisionTransformer( + prefix=( + f"{prefix}.model.vision_model" if prefix else "model.vision_model" + ), + config=vision_config, + weights=weights, + ) + + config.quantize = None + self.connector = Idefics2Connector( + prefix=f"{prefix}.model.connector" if prefix else "model.connector", + config=config, + weights=weights, + ) + + self.config = config + self.image_seq_len = config.perceiver_config.resampler_n_latents + self.image_token_id = config.image_token_id + self.pad_token_id = ( + config.pad_token_id if config.pad_token_id is not None else -1 + ) + + def _merge_input_ids_with_image_features( + self, + input_ids: torch.Tensor, + inputs_embeds: torch.Tensor, + image_features: torch.Tensor, + ): + """In place merges in vision_embeddings with inputs_embeds.""" + # mask = input_ids == self.config.image_token_index + mask = input_ids == self.config.image_token_id + # Let's pray we have enabled enough slots ! + inputs_embeds[mask] = image_features.view(-1, image_features.shape[-1]) + return inputs_embeds + + def forward( + self, + input_ids: torch.Tensor, + position_ids: torch.Tensor, + cu_seqlen_prefill: Optional[torch.Tensor], + kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], + block_tables: torch.Tensor, + slots: torch.Tensor, + seqlen: Seqlen, + max_s: int, + prefill_cache_indices: Optional[torch.Tensor], + lm_head_indices: Optional[torch.Tensor] = None, + pixel_values: torch.FloatTensor = None, + pixel_attention_mask: Optional[torch.BoolTensor] = None, + # Unused here + image_sizes: Optional[torch.Tensor] = None, + adapter_data: Optional[torch.Tensor] = None, + ): + inputs_embeds = self.text_model.embed_tokens(input_ids) + if pixel_values is not None: + batch_size, num_images, num_channels, height, width = pixel_values.shape + all_states = [] + all_pixel_values = pixel_values + all_pixel_mask = pixel_attention_mask + for i in range(batch_size): + pixel_values = all_pixel_values.to( + dtype=self.dtype + ) # fp16 compatibility + pixel_values = pixel_values[i : i + 1] + pixel_values = pixel_values.view(num_images, *pixel_values.shape[2:]) + + # Remove padding images - padding images are full 0. 
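+                # An image counts as padding when every one of its values is exactly 0.0; those entries
+                # are dropped via `real_images_inds` before being sent to the vision encoder.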
+ nb_values_per_image = pixel_values.shape[1:].numel() + real_images_inds = (pixel_values == 0.0).sum( + dim=(-1, -2, -3) + ) != nb_values_per_image + pixel_values = pixel_values[real_images_inds].contiguous() + + # Handle the vision attention mask + if pixel_attention_mask is None: + pixel_attention_mask = torch.ones( + size=( + pixel_values.size(0), + pixel_values.size(2), + pixel_values.size(3), + ), + dtype=torch.bool, + device=pixel_values.device, + ) + else: + # Remove padding images from the mask/pP p + pixel_attention_mask = all_pixel_mask[i : i + 1] + pixel_attention_mask = pixel_attention_mask.view( + 1 * num_images, *pixel_attention_mask.shape[2:] + ) + pixel_attention_mask = pixel_attention_mask[ + real_images_inds + ].contiguous() + + patch_size = self.config.vision_config.patch_size + patches_subgrid = pixel_attention_mask.unfold( + dimension=1, size=patch_size, step=patch_size + ) + patches_subgrid = patches_subgrid.unfold( + dimension=2, size=patch_size, step=patch_size + ) + patch_attention_mask = (patches_subgrid.sum(dim=(-1, -2)) > 0).bool() + + # Get sequence from the vision encoder + image_hidden_states = self.vision_model( + pixel_values=pixel_values, + patch_attention_mask=patch_attention_mask, + ) + + # Modality projection & resampling + image_hidden_states = self.connector( + image_hidden_states, + attention_mask=patch_attention_mask.view(pixel_values.size(0), -1), + ) + all_states.append(image_hidden_states) + image_hidden_states = torch.stack(all_states, dim=0) + # When we generate, we don't want to replace the potential image_token_id that we generated by images + # that simply don't exist + inputs_embeds = self._merge_input_ids_with_image_features( + input_ids, inputs_embeds, image_hidden_states + ) + + hidden_states = self.text_model.model( + inputs_embeds=inputs_embeds, + position_ids=position_ids, + cu_seqlen_prefill=cu_seqlen_prefill, + kv_cache=kv_cache, + block_tables=block_tables, + slots=slots, + seqlen=seqlen, + max_s=max_s, + true_max_s=max_s, + prefill_cache_indices=None, + adapter_data=adapter_data, + ) + if lm_head_indices is not None: + hidden_states = hidden_states[lm_head_indices] + logits, speculative_logits = self.text_model.lm_head(hidden_states) + return logits, speculative_logits diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/idefics_config.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/idefics_config.py new file mode 100644 index 000000000..a55658194 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/idefics_config.py @@ -0,0 +1,326 @@ +# coding=utf-8 +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +""" Idefics model configuration""" +import copy + +from transformers import PretrainedConfig + +IDEFICS_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "HuggingFaceM4/idefics-9b": "https://huggingface.co/HuggingFaceM4/idefics-9b/blob/main/config.json", + "HuggingFaceM4/idefics-80b": "https://huggingface.co/HuggingFaceM4/idefics-80b/blob/main/config.json", +} + + +class IdeficsVisionConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`IdeficsModel`]. It is used to instantiate an + Idefics model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the Idefics-9B. + e.g. [HuggingFaceM4/idefics-9b](https://huggingface.co/HuggingFaceM4/idefics-9b) + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + Args: + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. (elsewhere referred to as `hidden_size`) + image_size (`int`, *optional*, defaults to 224): + The size (resolution) of each image. + intermediate_size (`int`, *optional*, defaults to 5120): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + patch_size (`int`, *optional*, defaults to 14): + The size (resolution) of each patch. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 16): + Number of attention heads for each attention layer in the Transformer encoder. + image_num_channels (`int`, *optional*, defaults to `3`): + Number of image channels. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported. + layer_norm_eps (`float`, *optional*, defaults to 1e-5): + The epsilon used by the layer normalization layers. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + initializer_factor (`float`, *optional*, defaults to 1.0): + A factor for initializing all weight matrices (should be kept to 1.0, used internally for initialization + testing). + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 
+ """ + + model_type = "idefics" + attribute_map = { + "hidden_size": "embed_dim", + } + + def __init__( + self, + embed_dim=768, + image_size=224, + intermediate_size=5120, + patch_size=14, + num_hidden_layers=32, + num_attention_heads=16, + num_channels=3, + hidden_act="gelu", + layer_norm_eps=1e-5, + attention_dropout=0.0, + initializer_range=0.02, + initializer_factor=1.0, + **kwargs, + ): + self.embed_dim = embed_dim + self.image_size = image_size + self.intermediate_size = intermediate_size + self.patch_size = patch_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.num_channels = num_channels + self.layer_norm_eps = layer_norm_eps + self.attention_dropout = attention_dropout + self.initializer_range = initializer_range + self.initializer_factor = initializer_factor + self.hidden_act = hidden_act + + super().__init__(**kwargs) + + +class IdeficsPerceiverConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`IdeficsModel`]. It is used to instantiate an + Idefics model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the Idefics-9B. + e.g. [HuggingFaceM4/idefics-9b](https://huggingface.co/HuggingFaceM4/idefics-9b) + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + Args: + use_resampler (`bool`, *optional*, defaults to `False`): + Whether or not to use the resampler + resampler_n_latents (`int`, *optional*, defaults to ): + Number of latent embeddings to resample ("compress") the input sequence to (usually < 128). + resampler_depth (`int`, *optional*, defaults to 6): + Depth of the Perceiver Resampler (Transformer w/ cross attention). Should be shallow (< 3). + resampler_n_heads (`int`, *optional*, defaults to 16): + Number of heads in each Transformer block (for multi-headed self-attention). + resampler_head_dim (`int`, *optional*, defaults to 96): + Dimensionality of each head projection in the Transformer block. + qk_layer_norms_perceiver (`bool`, *optional*, defaults to `False`): + Whether or not to use qk layer norms in perceiver + """ + + model_type = "idefics" + + def __init__( + self, + use_resampler=False, + resampler_n_latents=64, + resampler_depth=6, + resampler_n_heads=16, + resampler_head_dim=96, + qk_layer_norms_perceiver=False, + **kwargs, + ): + self.use_resampler = use_resampler + self.resampler_n_latents = resampler_n_latents + self.resampler_depth = resampler_depth + self.resampler_n_heads = resampler_n_heads + self.resampler_head_dim = resampler_head_dim + self.qk_layer_norms_perceiver = qk_layer_norms_perceiver + + super().__init__(**kwargs) + + +class IdeficsConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`IdeficsModel`]. It is used to instantiate an + Idefics model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the Idefics-9B. + e.g. [HuggingFaceM4/idefics-9b](https://huggingface.co/HuggingFaceM4/idefics-9b) + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. 
+    Args:
+        additional_vocab_size (`int`, *optional*, defaults to 0):
+            Additional vocabulary size of the model, typically for the special "<image>" token. Additional vocab tokens
+            are always trainable whereas regular vocab tokens can be frozen or not.
+        vocab_size (`int`, *optional*, defaults to 32000):
+            Vocabulary size of the Idefics model. Defines the number of different tokens that can be represented by the
+            `input_ids` passed when calling [`~IdeficsModel`]
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 11008):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        dropout (`float`, *optional*, defaults to 0.0):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        alpha_initializer (`str`, *optional*, defaults to `"zeros"`):
+            Initialization type for the alphas.
+        alphas_initializer_range (`float`, *optional*, defaults to 0.0):
+            The standard deviation of the truncated_normal_initializer for initializing the alphas in the Gated Cross
+            Attention.
+        alpha_type (`str`, *optional*, defaults to `"float"`):
+            Whether the gating alphas should be vectors or single floats.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-6):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        pad_token_id (`int`, *optional*, defaults to 0):
+            Padding token id.
+        bos_token_id (`int`, *optional*, defaults to 1):
+            Beginning of stream token id.
+        eos_token_id (`int`, *optional*, defaults to 2):
+            End of stream token id.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether to tie weight embeddings
+        cross_layer_interval (`int`, *optional*, defaults to 1):
+            Interval for cross attention (from text to image) layers.
+ qk_layer_norms (`bool`, *optional*, defaults to `False`): Whether to add layer norm after q and k + freeze_text_layers (`bool`, *optional*, defaults to `True`): Whether to freeze text layers + freeze_text_module_exceptions (`bool`, *optional*, defaults to `[]`): + Exceptions to freezing text layers when `freeze_text_layers` is `True` + freeze_lm_head (`bool`, *optional*, defaults to `False`): Whether to freeze lm head + freeze_vision_layers (`bool`, *optional*, defaults to `True`): Whether to freeze vision layers + freeze_vision_module_exceptions (`bool`, *optional*, defaults to `[]`): + Exceptions to freezing vision layers when `freeze_vision_layers` is `True` + use_resampler (`bool`, *optional*, defaults to `False`): Whether to use the Resampler + vision_config (`IdeficsVisionConfig`, *optional*): Custom vision config or dict + perceiver_config (`IdeficsPerceiverConfig`, *optional*): Custom perceiver config or dict + Example: + ```python + >>> from transformers import IdeficsModel, IdeficsConfig + >>> # Initializing a Idefics idefics-9b style configuration + >>> configuration = IdeficsConfig() + >>> # Initializing a model from the idefics-9b style configuration + >>> model = IdeficsModel(configuration) + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "idefics" + is_composition = True + + def __init__( + self, + vocab_size=32000, + additional_vocab_size=0, + hidden_size=4096, + intermediate_size=11008, + num_hidden_layers=32, + num_attention_heads=32, + dropout=0.0, + hidden_act="silu", + initializer_range=0.02, + alpha_initializer="zeros", + alphas_initializer_range=0.0, + alpha_type="float", + rms_norm_eps=1e-6, + use_cache=True, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + tie_word_embeddings=False, + cross_layer_interval=1, + qk_layer_norms=False, + freeze_text_layers=True, + freeze_text_module_exceptions=[], + freeze_lm_head=False, + freeze_vision_layers=True, + freeze_vision_module_exceptions=[], + use_resampler=False, + vision_config=None, + perceiver_config=None, + **kwargs, + ): + self.vocab_size = vocab_size + self.additional_vocab_size = additional_vocab_size + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.dropout = dropout + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.alpha_initializer = alpha_initializer + self.alphas_initializer_range = alphas_initializer_range + self.alpha_type = alpha_type + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + + self.cross_layer_interval = cross_layer_interval + self.qk_layer_norms = qk_layer_norms + self.freeze_vision_layers = freeze_vision_layers + + self.freeze_text_layers = freeze_text_layers + self.freeze_text_module_exceptions = freeze_text_module_exceptions + self.freeze_vision_module_exceptions = freeze_vision_module_exceptions + self.freeze_lm_head = freeze_lm_head + + self.use_resampler = use_resampler + + if perceiver_config is None: + self.perceiver_config = IdeficsPerceiverConfig() + elif isinstance(perceiver_config, dict): + self.perceiver_config = IdeficsPerceiverConfig(**perceiver_config) + elif isinstance(perceiver_config, IdeficsPerceiverConfig): + self.perceiver_config = perceiver_config + + if vision_config is None: + self.vision_config = IdeficsVisionConfig() + elif isinstance(vision_config, dict): + self.vision_config = IdeficsVisionConfig(**vision_config) + elif 
isinstance(vision_config, IdeficsVisionConfig): + self.vision_config = vision_config + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + # IMPORTANT: Do not do any __init__ args-based checks in the constructor, since + # PretrainedConfig.from_dict first instantiates the class with the config dict and only then + # updates the config object with `kwargs` from from_pretrained, so during the instantiation + # of this object many attributes have default values and haven't yet been overridden. + # Do any required checks inside `from_pretrained` once the superclass' `from_pretrained` was run. + + def to_dict(self): + """ + Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`]. + Returns: + `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance, + """ + output = copy.deepcopy(self.__dict__) + + output["vision_config"] = self.vision_config.to_dict() + output["perceiver_config"] = self.perceiver_config.to_dict() + output["model_type"] = self.__class__.model_type + + return output diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/idefics_image_processing.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/idefics_image_processing.py new file mode 100644 index 000000000..afb8e1f9c --- /dev/null +++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/idefics_image_processing.py @@ -0,0 +1,297 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Image processor class for Idefics.""" + +from typing import Callable, Dict, List, Optional, Union, Iterable +import numpy as np + +from PIL import Image + +import transformers +from transformers.image_processing_utils import BaseImageProcessor, BatchFeature +from transformers.image_transforms import ( + resize, + to_channel_dimension_format, + rescale, + normalize, +) +from transformers.image_utils import ( + ChannelDimension, + ImageInput, + PILImageResampling, + make_list_of_images, + to_numpy_array, + valid_images, +) +from io import BytesIO +import base64 +import requests +from transformers import TensorType, is_torch_available + + +IDEFICS_STANDARD_MEAN = [0.48145466, 0.4578275, 0.40821073] +IDEFICS_STANDARD_STD = [0.26862954, 0.26130258, 0.27577711] + + +def convert_to_rgb(image): + # `image.convert("RGB")` would only work for .jpg images, as it creates a wrong background + # for transparent images. 
The call to `alpha_composite` handles this case + if image.mode == "RGB": + return image + + image_rgba = image.convert("RGBA") + background = Image.new("RGBA", image_rgba.size, (255, 255, 255)) + alpha_composite = Image.alpha_composite(background, image_rgba) + alpha_composite = alpha_composite.convert("RGB") + return alpha_composite + + +class IdeficsImageProcessor(BaseImageProcessor): + r""" + Constructs a Idefics image processor. + Args: + image_size (`int`, *optional*, defaults to `224`): + Resize to image size + image_num_channels (`int`, *optional*, defaults to `3`): + Number of image channels. + image_mean (`float` or `List[float]`, *optional*, defaults to `IDEFICS_STANDARD_MEAN`): + Mean to use if normalizing the image. This is a float or list of floats the length of the number of + channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. Can be + overridden by the `image_mean` parameter in the `preprocess` method. + image_std (`float` or `List[float]`, *optional*, defaults to `IDEFICS_STANDARD_STD`): + Standard deviation to use if normalizing the image. This is a float or list of floats the length of the + number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method. + Can be overridden by the `image_std` parameter in the `preprocess` method. + """ + + model_input_names = ["pixel_values"] + + def __init__( + self, + image_size: int = 224, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + image_num_channels: Optional[int] = 3, + **kwargs, + ) -> None: + super().__init__(**kwargs) + + self.image_size = image_size + self.image_num_channels = image_num_channels + self.image_mean = image_mean + self.image_std = image_std + + def preprocess( + self, + images: ImageInput, + image_num_channels: Optional[int] = 3, + image_size: Optional[Dict[str, int]] = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + transform: Callable = None, + **kwargs, + ) -> TensorType.PYTORCH: + """ + Preprocess a batch of images. + Args: + images (`ImageInput`): + A list of images to preprocess. + image_size (`int`, *optional*, defaults to `self.image_size`): + Resize to image size + image_num_channels (`int`, *optional*, defaults to `self.image_num_channels`): + Number of image channels. + image_mean (`float` or `List[float]`, *optional*, defaults to `IDEFICS_STANDARD_MEAN`): + Mean to use if normalizing the image. This is a float or list of floats the length of the number of + channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. Can + be overridden by the `image_mean` parameter in the `preprocess` method. + image_std (`float` or `List[float]`, *optional*, defaults to `IDEFICS_STANDARD_STD`): + Standard deviation to use if normalizing the image. This is a float or list of floats the length of the + number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` + method. Can be overridden by the `image_std` parameter in the `preprocess` method. + transform (`Callable`, *optional*, defaults to `None`): + A custom transform function that accepts a single image can be passed for training. For example, + `torchvision.Compose` can be used to compose multiple transforms. 
If `None` - an inference mode is + assumed - and then a preset of inference-specific transforms will be applied to the images + Returns: + a PyTorch tensor of the processed images + """ + image_size = image_size if image_size is not None else self.image_size + image_num_channels = ( + image_num_channels + if image_num_channels is not None + else self.image_num_channels + ) + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + size = (image_size, image_size) + + if len(images) == 0: + return [] + + images = make_list_of_images(images) + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + + # For training a user needs to pass their own set of transforms as a Callable. + # For reference this is what was used in the original IDEFICS training: + # transform = transforms.Compose([ + # convert_to_rgb, + # transforms.RandomResizedCrop((size, size), scale=(0.9, 1.0), interpolation=transforms.InterpolationMode.BICUBIC), + # transforms.ToTensor(), + # transforms.Normalize(mean=image_mean, std=image_std), + # ]) + if transform is not None: + if not is_torch_available(): + raise ImportError("To pass in `transform` torch must be installed") + import torch + + images = [transform(x) for x in images] + return torch.stack(images) + + # for inference we do the exact transforms that were used to train IDEFICS + images = [convert_to_rgb(x) for x in images] + # further transforms expect numpy arrays + images = [to_numpy_array(x) for x in images] + images = [resize(x, size, resample=PILImageResampling.BICUBIC) for x in images] + images = [self.rescale(image=image, scale=1 / 255) for image in images] + images = [self.normalize(x, mean=image_mean, std=image_std) for x in images] + images = [ + to_channel_dimension_format(x, ChannelDimension.FIRST) for x in images + ] + # TODO: this converts to torch tensors - switch to convert_to_tensors once it becomes available + images = BatchFeature( + data={"pixel_values": images}, tensor_type=TensorType.PYTORCH + )["pixel_values"] + + return images + + def fetch_images(self, image_url_or_urls: Union[str, List[str]]): + """ + Convert a single or a list of urls into the corresponding `PIL.Image` objects. + If a single url is passed, the return value will be a single object. If a list is passed a list of objects is + returned. 
+ """ + headers = { + "User-Agent": ( + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0" + " Safari/537.36" + ) + } + if isinstance(image_url_or_urls, list): + return [self.fetch_images(x) for x in image_url_or_urls] + elif isinstance(image_url_or_urls, str): + image = image_url_or_urls + + if image.startswith("http://") or image.startswith("https://"): + response = requests.get( + image_url_or_urls, stream=True, headers=headers, timeout=(1, 5) + ) + response.raise_for_status() + content = response.content + elif image.startswith("data:"): + # https://stackoverflow.com/questions/17090571/is-there-a-way-to-set-background-image-as-a-base64-encoded-image + # data:image/png;base64,xxx + image = image.split(",")[-1] + content = base64.b64decode(image) + else: + raise ValueError(f"Unrecognized image {image}") + + try: + image = Image.open(BytesIO(content)) + # image.verify() + except Exception: + raise ValueError(f"Could not load image from url {image_url_or_urls}") + return image + else: + raise ValueError( + f"only a single or a list of entries is supported but got type={type(image_url_or_urls)}" + ) + + def rescale( + self, + image: np.ndarray, + scale: float, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: + """ + Rescale an image by a scale factor. image = image * scale. + + Args: + image (`np.ndarray`): + Image to rescale. + scale (`float`): + The scaling factor to rescale pixel values by. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the output image. If unset, the channel dimension format of the input + image is used. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + + Returns: + `np.ndarray`: The rescaled image. + """ + # return rescale(image, scale=scale, data_format=data_format, input_data_format=input_data_format, **kwargs) + # requires 4.32 + return rescale(image, scale=scale, data_format=data_format, **kwargs) + + def normalize( + self, + image: np.ndarray, + mean: Union[float, Iterable[float]], + std: Union[float, Iterable[float]], + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: + """ + Normalize an image. image = (image - image_mean) / image_std. + + Args: + image (`np.ndarray`): + Image to normalize. + mean (`float` or `Iterable[float]`): + Image mean to use for normalization. + std (`float` or `Iterable[float]`): + Image standard deviation to use for normalization. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the output image. If unset, the channel dimension format of the input + image is used. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. 
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + + Returns: + `np.ndarray`: The normalized image. + """ + # TODO 4.32 + return normalize(image, mean=mean, std=std, data_format=data_format, **kwargs) + + +transformers.IdeficsImageProcessor = IdeficsImageProcessor diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/idefics_modeling.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/idefics_modeling.py new file mode 100644 index 000000000..fc6becc4b --- /dev/null +++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/idefics_modeling.py @@ -0,0 +1,1556 @@ +# coding=utf-8 +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
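For orientation, a small usage sketch of the `IdeficsImageProcessor` introduced above may help; it is illustrative only and not part of the patch. The import path mirrors the file location added in this patch, the image and sizes are made up, and `image_mean`/`image_std` are passed explicitly because the constructor does not substitute the `IDEFICS_STANDARD_*` defaults on its own. It also assumes a `transformers` version compatible with the processor code above.

```python
# Illustrative sketch only (hypothetical usage, not part of the patch).
from PIL import Image

from text_generation_server.models.custom_modeling.idefics_image_processing import (
    IDEFICS_STANDARD_MEAN,
    IDEFICS_STANDARD_STD,
    IdeficsImageProcessor,
)

processor = IdeficsImageProcessor(
    image_size=224,
    image_mean=IDEFICS_STANDARD_MEAN,  # defaults are not auto-applied, so pass them explicitly
    image_std=IDEFICS_STANDARD_STD,
)

img = Image.new("RGB", (640, 480))          # stand-in for a fetched image
pixel_values = processor.preprocess([img])  # convert_to_rgb -> resize -> rescale -> normalize
print(tuple(pixel_values.shape))            # (1, 3, 224, 224)
```

When a custom `transform` callable is passed instead, the preset inference pipeline is skipped and the callable is applied to each image, which is the intended training path.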
+""" PyTorch Idefics model.""" +from typing import List, Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn + +from transformers import PreTrainedModel +from transformers.activations import ACT2FN +from transformers.modeling_outputs import ( + BaseModelOutputWithPast, + CausalLMOutputWithPast, + dataclass, +) +from text_generation_server.models.custom_modeling.idefics_config import IdeficsConfig +from text_generation_server.models.custom_modeling.idefics_vision import ( + IdeficsVisionTransformer, +) +from text_generation_server.models.custom_modeling.idefics_perceiver import ( + IdeficsPerceiverResampler, +) +from text_generation_server.layers import ( + TensorParallelColumnLinear, + TensorParallelEmbedding, + TensorParallelRowLinear, + SpeculativeHead, + FastLinear, +) +from text_generation_server.layers.rotary import PositionRotaryEmbedding +from text_generation_server.utils.import_utils import SYSTEM +from loguru import logger + +if SYSTEM == "cuda": + import dropout_layer_norm +elif SYSTEM == "rocm": + from vllm._C import ops +else: + dropout_layer_norm = None + + +@dataclass +class BaseModelOutputWithPastImage(BaseModelOutputWithPast): + image_hidden_states: Optional[torch.FloatTensor] = None + + +@dataclass +class CausalLMOutputWithPastImage(CausalLMOutputWithPast): + image_hidden_states: Optional[torch.FloatTensor] = None + + +# logger = logging.get_logger(__name__) + +# _CONFIG_FOR_DOC = "IdeficsConfig" + +# IDEFICS_PRETRAINED_MODEL_ARCHIVE_LIST = [ +# "HuggingFaceM4/idefics-9b", +# "HuggingFaceM4/idefics-80b", +# # See all Idefics models at https://huggingface.co/models?filter=idefics +# ] + + +def expand_inputs_for_generation( + input_ids, + expand_size=1, + is_encoder_decoder=False, + attention_mask=None, + encoder_outputs=None, + **model_kwargs, +): + expanded_return_idx = ( + torch.arange(input_ids.shape[0]) + .view(-1, 1) + .repeat(1, expand_size) + .view(-1) + .to(input_ids.device) + ) + input_ids = input_ids.index_select(0, expanded_return_idx) + + if "token_type_ids" in model_kwargs: + token_type_ids = model_kwargs["token_type_ids"] + model_kwargs["token_type_ids"] = token_type_ids.index_select( + 0, expanded_return_idx + ) + + if attention_mask is not None: + model_kwargs["attention_mask"] = attention_mask.index_select( + 0, expanded_return_idx + ) + model_kwargs["image_attention_mask"] = model_kwargs[ + "image_attention_mask" + ].index_select(0, expanded_return_idx) + model_kwargs["pixel_values"] = model_kwargs["pixel_values"].index_select( + 0, expanded_return_idx + ) + + if is_encoder_decoder: + if encoder_outputs is None: + raise ValueError( + "If `is_encoder_decoder` is True, make sure that `encoder_outputs` is defined." 
+ ) + encoder_outputs["last_hidden_state"] = ( + encoder_outputs.last_hidden_state.index_select( + 0, expanded_return_idx.to(encoder_outputs.last_hidden_state.device) + ) + ) + model_kwargs["encoder_outputs"] = encoder_outputs + return input_ids, model_kwargs + + +def update_model_kwargs_for_generation(outputs, model_kwargs, is_encoder_decoder=False): + # must have this key set to at least None + model_kwargs["past_key_values"] = model_kwargs.get("past_key_values", None) + + # update past + if "past_key_values" in outputs: + model_kwargs["past"] = outputs.past_key_values + elif "mems" in outputs: + model_kwargs["past"] = outputs.mems + elif "past_buckets_states" in outputs: + model_kwargs["past"] = outputs.past_buckets_states + else: + model_kwargs["past"] = None + + # update token_type_ids with last value + if "token_type_ids" in model_kwargs: + token_type_ids = model_kwargs["token_type_ids"] + model_kwargs["token_type_ids"] = torch.cat( + [token_type_ids, token_type_ids[:, -1].unsqueeze(-1)], dim=-1 + ) + + # update attention masks + if not is_encoder_decoder: + if "attention_mask" in model_kwargs: + attention_mask = model_kwargs["attention_mask"] + model_kwargs["attention_mask"] = torch.cat( + [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], + dim=-1, + ) + if "image_attention_mask" in model_kwargs: + image_attention_mask = model_kwargs["image_attention_mask"] + last_mask = image_attention_mask[:, -1, :].unsqueeze(1) + model_kwargs["image_attention_mask"] = last_mask + + return model_kwargs + + +def prepare_inputs_for_generation(input_ids, past=None, **kwargs): + token_type_ids = kwargs.get("token_type_ids", None) + # only last token for inputs_ids if past is defined in kwargs + if past: + input_ids = input_ids[:, -1].unsqueeze(-1) + if token_type_ids is not None: + token_type_ids = token_type_ids[:, -1].unsqueeze(-1) + + attention_mask = kwargs.get("attention_mask", None) + position_ids = kwargs.get("position_ids", None) + + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past: + position_ids = position_ids[:, -1].unsqueeze(-1) + + pixel_values = kwargs.get("pixel_values", None) + image_attention_mask = kwargs.get("image_attention_mask", None) + # if pixel_values is None or image_attention_mask is None: + # raise ValueError("pixel values and image attention mask cannot be None") + + return { + "input_ids": input_ids, + "past_key_values": past, + "use_cache": kwargs.get("use_cache"), + "position_ids": position_ids, + "attention_mask": attention_mask, + "token_type_ids": token_type_ids, + "pixel_values": pixel_values, + "image_attention_mask": image_attention_mask, + } + + +def freeze_model(model, module_exceptions=[]): + mapping = { + "LayerNorm": nn.LayerNorm, + "Linear": nn.Linear, + "Embedding": nn.Embedding, + } + module_exceptions_mapped = [mapping[m] for m in module_exceptions] + for module in model.modules(): + if module_exceptions and any( + [isinstance(module, t) for t in module_exceptions_mapped] + ): + module.requires_grad_( + True + ) # Explicitely setting it to true to avoid any mistakes + else: + module.requires_grad_(False) + return model + + +class IdeficsDecoupledPartialTPEmbedding(nn.Module): + def __init__( + self, + config, + weights, + ): + super().__init__() + self.num_embeddings = config.vocab_size + self.weight = TensorParallelEmbedding( + prefix="model.embed_tokens", 
weights=weights + ) + self.additional_weight = nn.Parameter( + weights.get_tensor("model.embed_tokens.additional_embedding.weight") + ) + + def forward(self, input_ids): + # Clone so that we don't modify the original input_ids later on + input_ids = input_ids.clone() + additional_vocab_indices = torch.where(input_ids >= self.num_embeddings) + input_ids_additional_vocab = input_ids[additional_vocab_indices] + additional_embeddings = torch.nn.functional.embedding( + input_ids_additional_vocab - self.num_embeddings, self.additional_weight + ) + + # for successful lookup replace input_ids with 0, the results of these will be discarded anyway + input_ids[additional_vocab_indices] = 0 + full_vector = self.weight(input_ids) + + # overwrite the records with high indices + full_vector[additional_vocab_indices] = additional_embeddings + + return full_vector + + +class IdeficsDecoupledTensorParallelLinear(nn.Module): + # Derived from https://pytorch.org/docs/stable/_modules/torch/nn/modules/linear.html#Linear + """ + Implements a decoupling of parameters to allow freezing (or not) a subset of the parameters. In practise, the + regular `weight` can be trained or frozen (i.e. `partially_freeze=True`), and if `out_additional_features` > 0, + then it will create `out_additional_features * in_features` additional parameters that are always trained. If + `out_additional_features=0`, then the module defaults back to the regular behavior of `nn.Linear`. + """ + + def __init__( + self, + config, + weights, + ) -> None: + super().__init__() + self.fc = SpeculativeHead.load(config=config, prefix="lm_head", weights=weights) + self.additional_fc = FastLinear.load( + config=config, + prefix="lm_head.additional_fc", + weights=weights, + bias=False, + ) + + def forward(self, input: torch.Tensor) -> torch.Tensor: + output, speculative_logits = self.fc(input) + additional_features = self.additional_fc(input) + output = torch.cat((output, additional_features), -1) + + return output, speculative_logits + + def extra_repr(self) -> str: + """Overwriting `nn.Linear.extra_repr` to include new parameters.""" + return "in_features={}, out_features={}, out_additional_features={}, bias={}, partially_freeze={}".format( + self.in_features, + self.out_features, + self.out_additional_features, + self.bias is not None, + self.partially_freeze, + ) + + +# Copied from transformers.models.bart.modeling_bart._make_causal_mask +def _make_causal_mask( + input_ids_shape: torch.Size, + dtype: torch.dtype, + device: torch.device, + past_key_values_length: int = 0, +): + """ + Make causal mask used for bi-directional self-attention. + """ + bsz, tgt_len = input_ids_shape + mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device) + mask_cond = torch.arange(mask.size(-1), device=device) + mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0) + mask = mask.to(dtype) + + if past_key_values_length > 0: + mask = torch.cat( + [ + torch.zeros( + tgt_len, past_key_values_length, dtype=dtype, device=device + ), + mask, + ], + dim=-1, + ) + return mask[None, None, :, :].expand( + bsz, 1, tgt_len, tgt_len + past_key_values_length + ) + + +def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): + """ + Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. 
+ """ + bsz, src_len = mask.size() + tgt_len = tgt_len if tgt_len is not None else src_len + + expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) + + inverted_mask = 1.0 - expanded_mask + + return inverted_mask.masked_fill( + inverted_mask.to(torch.bool), torch.finfo(dtype).min + ) + + +class IdeficsRMSNorm(nn.Module): + def __init__(self, prefix, weights, eps=1e-6): + """ + LlamaRMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + + weight = weights.get_tensor(f"{prefix}.weight") + self.weight = nn.Parameter(weight) + self.variance_epsilon = eps + + def forward(self, hidden_states, residual=None): + if SYSTEM == "ipex": + import intel_extension_for_pytorch as ipex + + out = ipex.llm.functional.add_rms_norm( + residual, + hidden_states, + self.weight, + None, + self.variance_epsilon, + residual is not None, + ) + return out + elif hidden_states.shape[-1] > 8192: + if residual is not None: + hidden_states += residual + residual = hidden_states + + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt( + variance + self.variance_epsilon + ) + + # convert into half-precision if necessary + if self.weight.dtype in [torch.float16, torch.bfloat16]: + hidden_states = hidden_states.to(self.weight.dtype) + + return self.weight * hidden_states + elif SYSTEM == "cuda": + # faster post attention rms norm + unwrap = False + if len(hidden_states.shape) > 2: + unwrap = True + shape = hidden_states.shape + hidden_states = hidden_states.reshape(-1, shape[-1]) + + normed_hidden_states, res, *rest = dropout_layer_norm.dropout_add_ln_fwd( + hidden_states, + residual, + self.weight, + None, + None, + None, + None, + None, + 0.0, + self.variance_epsilon, + 1.0, + 0, + None, + False, + True, # Activate RMSNorm + ) + if res is None: + res = hidden_states + + if unwrap: + normed_hidden_states = normed_hidden_states.view(*shape) + + return normed_hidden_states + elif SYSTEM == "rocm": + # We use VLLM RMSNorm kernel that can be compiled for RoCm, instead of Flash Attention ones that can not. + if residual is not None: + hidden_states += residual + residual = hidden_states + + unwrap = False + if len(hidden_states.shape) > 2: + unwrap = True + shape = hidden_states.shape + hidden_states = hidden_states.reshape(-1, shape[-1]) + + out = torch.empty_like(hidden_states) + ops.rms_norm( + out, + hidden_states, + self.weight.data, + self.variance_epsilon, + ) + + if unwrap: + out = out.view(*shape) + + return out + else: + raise ValueError( + "Your system seem to be not supported. Please check your install or open an issue at https://github.com/huggingface/text-generation-inference/issues with a clear reproduction." 
+ ) + + +# this was adapted from LlamaMLP +class IdeficsMLP(nn.Module): + def __init__( + self, + config, + prefix, + weights, + ): + super().__init__() + self.gate_up_proj = TensorParallelColumnLinear.load_multi( + config, + prefixes=[f"{prefix}.gate_proj", f"{prefix}.up_proj"], + weights=weights, + dim=0, + bias=False, + ) + self.down_proj = TensorParallelRowLinear.load( + config, + prefix=f"{prefix}.down_proj", + weights=weights, + bias=False, + ) + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, hidden_states): + gate_up_states = self.gate_up_proj(hidden_states) + shape = gate_up_states.shape + gate_up_states = gate_up_states.view(*shape[:-1], 2, shape[-1] // 2) + return self.down_proj( + self.act_fn(gate_up_states[:, :, 0]) * gate_up_states[:, :, 1] + ) + + +# this was adapted from LlamaAttention +class IdeficsAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__( + self, + config, + prefix, + weights, + qk_layer_norms: bool = False, + is_cross_attention: bool = False, + ): + super().__init__() + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.dropout = config.dropout + + if (self.head_dim * self.num_heads) != self.hidden_size: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads})." + ) + + self.is_cross_attention = is_cross_attention + + # if not hasattr(nn.functional, "scaled_dot_product_attention"): + # raise ValueError("this model requires pytorch 2.0 or higher") + + if self.num_heads % weights.process_group.size() != 0: + raise ValueError( + f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} " + f"and `num_shards`: {weights.process_group.size()}" + ) + self.num_heads //= weights.process_group.size() + + if self.is_cross_attention: + # kv_input_dim = ( + # self.hidden_size if not hasattr(config.vision_config, "embed_dim") else config.vision_config.embed_dim + # ) + self.q_proj = TensorParallelColumnLinear.load( + config, prefix=f"{prefix}.q_proj", weights=weights, bias=False + ) + self.k_proj = TensorParallelColumnLinear.load( + config, prefix=f"{prefix}.k_proj", weights=weights, bias=False + ) + self.v_proj = TensorParallelColumnLinear.load( + config, prefix=f"{prefix}.v_proj", weights=weights, bias=False + ) + else: + self.qkv = TensorParallelColumnLinear.load_multi( + config, + prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"], + dim=0, + weights=weights, + bias=False, + ) + self.o_proj = TensorParallelRowLinear.load( + config, prefix=f"{prefix}.o_proj", weights=weights, bias=False + ) + self.rotary_emb = PositionRotaryEmbedding.static( + config=config, dim=self.head_dim, base=10000.0, device=weights.device + ) + self.qk_layer_norms = qk_layer_norms + if self.qk_layer_norms: + self.q_layer_norm = IdeficsRMSNorm( + prefix=f"{prefix}.q_layer_norm", + weights=weights, + eps=config.rms_norm_eps, + ) + self.k_layer_norm = IdeficsRMSNorm( + prefix=f"{prefix}.q_layer_norm", + weights=weights, + eps=config.rms_norm_eps, + ) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return ( + tensor.view(bsz, seq_len, self.num_heads, self.head_dim) + .transpose(1, 2) + .contiguous() + ) + + def forward( + self, + hidden_states: torch.Tensor, + key_value_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: 
Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: bool = False, + use_cache: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + # if key_value_states are provided this layer is used as a cross-attention layer + is_cross_attention = self.is_cross_attention or key_value_states is not None + + bsz, q_len, _ = hidden_states.size() + + if is_cross_attention: + query_states = self.q_proj(hidden_states).view( + bsz, q_len, self.num_heads, self.head_dim + ) # .transpose(1, 2) + query_states = query_states.transpose(1, 2) + ( + _, + kv_len, + _, + ) = ( + key_value_states.size() + ) # Note that, in this case, `kv_len` == `kv_seq_len` + key_states = ( + self.k_proj(key_value_states) + .view(bsz, kv_len, self.num_heads, self.head_dim) + .transpose(1, 2) + ) + value_states = ( + self.v_proj(key_value_states) + .view(bsz, kv_len, self.num_heads, self.head_dim) + .transpose(1, 2) + ) + else: + qkv = self.qkv(hidden_states) + query_states, key_states, value_states = qkv.split( + self.num_heads * self.head_dim, dim=2 + ) + + query_states = query_states.view( + bsz, q_len, self.num_heads, self.head_dim + ) # .transpose(1, 2) + key_states = key_states.view( + bsz, q_len, self.num_heads, self.head_dim + ) # . transpose(1, 2) + value_states = value_states.view( + bsz, q_len, self.num_heads, self.head_dim + ) # .transpose(1, 2) + kv_seq_len = q_len + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[-2] + max_s = max(kv_seq_len, q_len) + cos, sin = self.rotary_emb.get_cos_sin( + position_ids.view(-1), max_s, hidden_states.dtype + ) + + query_shape = query_states.shape + key_shape = key_states.shape + self.rotary_emb( + query_states.view(-1, *query_shape[2:]), + key_states.reshape(-1, *key_shape[2:]), + cos, + sin, + ) + + query_states = query_states.view(query_shape) + key_states = key_states.view(key_shape) + + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[-2] + # [bsz, nh, t, hd] + + if past_key_value is not None: + # reuse k, v, self_attention + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + + past_key_value = (key_states, value_states) if use_cache else None + + if self.qk_layer_norms: + query_states = self.q_layer_norm(query_states) + key_states = self.k_layer_norm(key_states) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + + attn_output = nn.functional.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=attention_mask, + dropout_p=self.dropout, + ) + + if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.reshape(bsz, q_len, -1) + + attn_output = self.o_proj(attn_output) + + attn_weights = None + if output_attentions: + logger.warning_once( + "attn_weights are not extracted in scaled_dot_product_attention. 
The model returns None instead" + ) + + return attn_output, attn_weights, past_key_value + + +# this was adapted from LlamaDecoderLayer +class IdeficsDecoderLayer(nn.Module): + def __init__(self, layer_id: int, config: IdeficsConfig, weights): + super().__init__() + self.process_group = weights.process_group + self.hidden_size = config.hidden_size + prefix = f"model.layers.{layer_id}" + self.self_attn = IdeficsAttention( + config=config, + prefix=f"{prefix}.self_attn", + weights=weights, + qk_layer_norms=False, + is_cross_attention=False, + ) + self.mlp = IdeficsMLP( + config=config, + prefix=f"{prefix}.mlp", + weights=weights, + ) + self.input_layernorm = IdeficsRMSNorm( + prefix=f"{prefix}.input_layernorm", weights=weights, eps=config.rms_norm_eps + ) + self.post_attention_layernorm = IdeficsRMSNorm( + prefix=f"{prefix}.post_attention_layernorm", + weights=weights, + eps=config.rms_norm_eps, + ) + self.dropout = config.dropout + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + ) -> Tuple[ + torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]] + ]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). 
+ past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + """ + + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + # hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + # hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +class IdeficsGatedCrossAttentionLayer(nn.Module): + def __init__(self, layer_id, config: IdeficsConfig, weights): + super().__init__() + self.process_group = weights.process_group + self.hidden_size = config.hidden_size + prefix = f"model.gated_cross_attn_layers.{layer_id}" + self.cross_attn = IdeficsAttention( + config=config, + prefix=f"{prefix}.cross_attn", + weights=weights, + qk_layer_norms=True, + is_cross_attention=True, + ) + self.mlp = IdeficsMLP( + config=config, + prefix=f"{prefix}.mlp", + weights=weights, + ) + self.input_layernorm = IdeficsRMSNorm( + prefix=f"{prefix}.input_layernorm", weights=weights, eps=config.rms_norm_eps + ) + self.post_attention_layernorm = IdeficsRMSNorm( + prefix=f"{prefix}.post_attention_layernorm", + weights=weights, + eps=config.rms_norm_eps, + ) + self.config = config.dropout + + self.act_cross_attn = nn.Tanh() + self.act_dense = nn.Tanh() + + self.alpha_cross_attn = nn.Parameter( + weights.get_tensor(f"{prefix}.alpha_cross_attn") + ) + self.alpha_dense = nn.Parameter(weights.get_tensor(f"{prefix}.alpha_dense")) + + if not (hasattr(self, "alpha_cross_attn") and hasattr(self, "alpha_dense")): + raise ValueError("Alpha parameters not initialized correctly!") + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + image_hidden_states: Optional[torch.Tensor] = None, + image_attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + no_images: Optional[bool] = False, + ) -> Tuple[ + torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]] + ]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). 
+ past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + no_images (`bool`, *optional*, defaults to `False`): If `True` the vision part is ignored + """ + if image_hidden_states is None: + raise ValueError( + "`image_hidden_states` is required for Idefics cross attention module which are visual features to be" + " conditioned on." + ) + + if past_key_value is not None: + raise NotImplementedError( + "Past key value states are not implemented for Idefics cross attention module." + ) + + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.cross_attn( + hidden_states=hidden_states, + key_value_states=image_hidden_states, + attention_mask=image_attention_mask, + output_attentions=output_attentions, + ) + # hidden_states = nn.functional.dropout(hidden_states, p=self.config, training=self.training) + # when there are no images the model is used in pure language mode + gate = 0 if no_images else 1 + hidden_states = ( + residual + gate * self.act_cross_attn(self.alpha_cross_attn) * hidden_states + ) + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + # hidden_states = nn.functional.dropout(hidden_states, p=self.config, training=self.training) + hidden_states = residual + self.act_dense(self.alpha_dense) * hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +LLAMA_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`IdeficsConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +# @add_start_docstrings( +# "The bare LLaMA Model outputting raw hidden-states without any specific head on top.", +# LLAMA_START_DOCSTRING, +# ) +class IdeficsPreTrainedModel(PreTrainedModel): + config_class = IdeficsConfig + # base_model_prefix = "model" + # supports_gradient_checkpointing = True + # _no_split_modules = ["IdeficsDecoderLayer", "IdeficsGatedCrossAttentionLayer"] + + # def _init_weights(self, module): + # # important: this ported version of Idefics isn't meant for training from scratch - only + # # inference and fine-tuning - so the proper init weights code has been removed - the m4 code + # # base should be used for training from scratch and it contains the correct code. 
+ # std = self.config.initializer_range + # if isinstance(module, nn.Linear): + # module.weight.data.normal_(mean=0.0, std=std) + # if module.bias is not None: + # module.bias.data.zero_() + # elif isinstance(module, nn.Embedding): + # module.weight.data.normal_(mean=0.0, std=std) + # if module.padding_idx is not None: + # module.weight.data[module.padding_idx].zero_() + + # def _set_gradient_checkpointing(self, module, value=False): + # if isinstance(module, IdeficsModel): + # module.gradient_checkpointing = value + + +# LLAMA_INPUTS_DOCSTRING = r""" +# Args: +# input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): +# Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide +# it. + +# Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and +# [`PreTrainedTokenizer.__call__`] for details. + +# [What are input IDs?](../glossary#input-ids) +# attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): +# Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + +# - 1 for tokens that are **not masked**, +# - 0 for tokens that are **masked**. + +# [What are attention masks?](../glossary#attention-mask) + +# Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and +# [`PreTrainedTokenizer.__call__`] for details. + +# If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see +# `past_key_values`). + +# If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] +# and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more +# information on the default strategy. + +# - 1 indicates the head is **not masked**, +# - 0 indicates the head is **masked**. +# position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): +# Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, +# config.n_positions - 1]`. [What are position IDs?](../glossary#position-ids) +# past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): +# Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape +# `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape +# `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + +# Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention +# blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + +# If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that +# don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all +# `decoder_input_ids` of shape `(batch_size, sequence_length)`. +# inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): +# Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This +# is useful if you want more control over how to convert `input_ids` indices into associated vectors than the +# model's internal embedding lookup matrix. 
+# use_cache (`bool`, *optional*): +# If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see +# `past_key_values`). +# output_attentions (`bool`, *optional*): +# Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned +# tensors for more detail. +# output_hidden_states (`bool`, *optional*): +# Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for +# more detail. +# return_dict (`bool`, *optional*): +# Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +# """ + + +# @add_start_docstrings( +# "The bare LLaMA Model outputting raw hidden-states without any specific head on top.", +# LLAMA_START_DOCSTRING, +# ) +class IdeficsModel(IdeficsPreTrainedModel): + # """ + # Transformer decoder consisting of `config.num_hidden_layers` layers. Each layer is a [`IdeficsDecoderLayer`] + + # Args: + # config: IdeficsConfig + # """ + + def __init__(self, config: IdeficsConfig, weights): + super().__init__(config) + self.config = config + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = IdeficsDecoupledPartialTPEmbedding( + config=config, + weights=weights, + ) + + self.image_size = config.vision_config.image_size + self.vision_config = config.vision_config + self.vision_model = IdeficsVisionTransformer( + prefix="model.vision_model", + config=config.vision_config, + weights=weights, + ) + + # Perceiver Resampler + if config.use_resampler: + perceiver_config = config.perceiver_config + self.perceiver_resampler = IdeficsPerceiverResampler( + prefix="model.perceiver_resampler", + config=config, + embed_dim=config.vision_config.embed_dim, + depth=perceiver_config.resampler_depth, + n_heads=perceiver_config.resampler_n_heads, + head_dim=perceiver_config.resampler_head_dim, + n_latents=perceiver_config.resampler_n_latents, + weights=weights, + ) + + self.layers = nn.ModuleList( + [ + IdeficsDecoderLayer(layer_id, config, weights) + for layer_id in range(config.num_hidden_layers) + ] + ) + + self.cross_layer_interval = config.cross_layer_interval + num_cross_layers = config.num_hidden_layers // self.cross_layer_interval + self.gated_cross_attn_layers = nn.ModuleList( + [ + IdeficsGatedCrossAttentionLayer(layer_id, config, weights) + for layer_id in range(num_cross_layers) + ] + ) + # self.gradient_checkpointing = False + + self.norm = IdeficsRMSNorm( + prefix="model.norm", weights=weights, eps=config.rms_norm_eps + ) + + # self.gradient_checkpointing = False + # Initialize weights and apply final processing + # self.post_init() + + # self.freeze_relevant_params(config) + + # def freeze_relevant_params(self, config=None): + # if config is None: + # config = self.config + + # if config.freeze_text_layers: + # self.freeze_text_layers(config.freeze_text_module_exceptions) + + # if config.freeze_vision_layers: + # freeze_model(self.vision_model, module_exceptions=config.freeze_vision_module_exceptions) + + # def freeze_text_layers(self, module_exceptions=[]): + # for module in [self.layers, self.norm]: + # freeze_model(module, module_exceptions=module_exceptions) + + # def freeze_vision_layers(self, module_exceptions=[]): + # freeze_model(self.vision_model, module_exceptions=module_exceptions) + + # def get_input_embeddings(self): + # return self.embed_tokens + + # def set_input_embeddings(self, value): + # self.embed_tokens = value + + # Copied from 
transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask + def _prepare_decoder_attention_mask( + self, attention_mask, input_shape, inputs_embeds, past_key_values_length + ): + # create causal mask + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + combined_attention_mask = None + if input_shape[-1] > 1: + combined_attention_mask = _make_causal_mask( + input_shape, + inputs_embeds.dtype, + device=inputs_embeds.device, + past_key_values_length=past_key_values_length, + ) + + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + expanded_attn_mask = _expand_mask( + attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1] + ).to(inputs_embeds.device) + combined_attention_mask = ( + expanded_attn_mask + if combined_attention_mask is None + else expanded_attn_mask + combined_attention_mask + ) + + return combined_attention_mask + + # @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + image_hidden_states: Optional[torch.FloatTensor] = None, + image_embeddings: Optional[torch.FloatTensor] = None, + image_attention_mask: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPastImage]: + device = input_ids.device if input_ids is not None else inputs_embeds.device + + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError( + "You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time" + ) + elif input_ids is not None: + batch_size, seq_length = input_ids.shape + elif inputs_embeds is not None: + batch_size, seq_length, _ = inputs_embeds.shape + else: + raise ValueError( + "You have to specify either decoder_input_ids or decoder_inputs_embeds" + ) + + seq_length_with_past = seq_length + past_key_values_length = 0 + + if past_key_values is not None: + past_key_values_length = past_key_values[0][0].shape[2] + seq_length_with_past = seq_length_with_past + past_key_values_length + + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + elif position_ids is None: + device = input_ids.device if input_ids is not None else inputs_embeds.device + position_ids = torch.arange( + past_key_values_length, + seq_length + past_key_values_length, + dtype=torch.long, + device=device, + ) + position_ids = position_ids.unsqueeze(0).view(-1, seq_length) + else: + position_ids = position_ids.view(-1, seq_length).long() + + no_images = False + + if image_hidden_states 
is None: + if pixel_values is None and image_embeddings is None: + raise ValueError( + "Either pixel_values and image_embeddings have to be not-None." + ) + + elif pixel_values is not None and image_embeddings is not None: + raise ValueError( + "You cannot specify both pixel_values and image_embeddings at the same time" + ) + + elif pixel_values is not None: + no_images = len(torch.nonzero(pixel_values)) == 0 + pixel_values = pixel_values.to( + dtype=self.dtype, device=device + ) # fp16 compatibility + batch_size, num_images = pixel_values.shape[:2] + pixel_values = pixel_values.contiguous().view( + batch_size * num_images, *pixel_values.shape[2:] + ) + + # Get sequence from the vision encoder + image_hidden_states = self.vision_model( + pixel_values=pixel_values + ).last_hidden_state + + elif image_embeddings is not None: + ( + batch_size, + num_images, + image_seq_len, + image_hidden_size, + ) = image_embeddings.size() + image_hidden_states = image_embeddings.to( + dtype=self.dtype, device=input_ids.device + ) + image_hidden_states = image_hidden_states.view( + batch_size * num_images, image_seq_len, image_hidden_size + ) + + if self.config.use_resampler: + image_hidden_states = self.perceiver_resampler(image_hidden_states) + image_seq_len, image_hidden_size = image_hidden_states.size( + 1 + ), image_hidden_states.size(2) + image_hidden_states = image_hidden_states.view( + batch_size, num_images * image_seq_len, image_hidden_size + ) + else: + no_images = False + num_images = pixel_values.shape[1] + image_seq_len = image_hidden_states.shape[1] // num_images + + # # Hack to use the model in full language modeling mode + # image_attention_mask = torch.zeros(batch_size, seq_length, 1, dtype=torch.long, device=image_hidden_states.device) + # Make image_attention_mask compatible with hidden states + text_seq_len = image_attention_mask.size(1) + image_attention_mask = image_attention_mask.unsqueeze(-1) + image_attention_mask = image_attention_mask.repeat(1, 1, 1, image_seq_len) + image_attention_mask = image_attention_mask.view( + batch_size, text_seq_len, num_images * image_seq_len + ) + image_batch_size, image_sequence_length, _ = image_hidden_states.size() + image_hidden_shape = (image_batch_size, image_sequence_length) + if image_attention_mask is None: + image_attention_mask = torch.ones(image_hidden_shape, device=device) + image_attention_mask = self.invert_attention_mask(image_attention_mask) + + # if list(image_attention_mask.shape) != [4, 1, 1024, 64]: + # raise ValueError(f"Image hidden_states {image_hidden_states.shape} - mask {image_attention_mask.shape} {num_images} {image_seq_len} {text_seq_len}") + + # if image_hidden_states is not None: + # else: + # image_attention_mask = None + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + # embed positions + if attention_mask is None: + attention_mask = torch.ones( + (batch_size, seq_length_with_past), + dtype=torch.bool, + device=inputs_embeds.device, + ) + attention_mask = self._prepare_decoder_attention_mask( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + ) + + hidden_states = inputs_embeds + + # if self.gradient_checkpointing and self.training: + # if use_cache: + # logger.warning_once( + # "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
+ # ) + # use_cache = False + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = () if use_cache else None + + for idx, decoder_layer in enumerate(self.layers): + if output_hidden_states: + all_hidden_states += (hidden_states,) + + past_key_value = ( + past_key_values[idx] if past_key_values is not None else None + ) + + def vblock( + main_block, + hidden_states, + attention_mask, + position_ids, + past_key_value, + image_hidden_states, + image_attention_mask, + output_attentions, + use_cache, + no_images, + layer_idx, + cross_layer_interval, + gated_cross_attn_layers, + ): + # TODO(ls): Add cross attention values to respective lists + if layer_idx % cross_layer_interval == 0: + xblock = gated_cross_attn_layers[layer_idx // cross_layer_interval] + outputs = xblock( + hidden_states, + attention_mask=attention_mask, + image_hidden_states=image_hidden_states, + image_attention_mask=image_attention_mask, + output_attentions=output_attentions, + use_cache=use_cache, + past_key_value=None, # not implemented + no_images=no_images, + ) + hidden_states = outputs[0] + + layer_outputs = main_block( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + return layer_outputs + + # if self.gradient_checkpointing and self.training: + # past_key_value = None + # if use_cache: + # logger.warning_once( + # "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + # ) + # use_cache = False + + # layer_outputs = torch.utils.checkpoint.checkpoint( + # vblock, + # decoder_layer, + # hidden_states, + # attention_mask, + # position_ids, + # past_key_value, + # image_hidden_states, + # image_attention_mask, + # output_attentions, + # use_cache, + # no_images, + # idx, + # self.cross_layer_interval, + # self.gated_cross_attn_layers, + # ) + # else: + layer_outputs = vblock( + decoder_layer, + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + image_hidden_states=image_hidden_states, + image_attention_mask=image_attention_mask, + output_attentions=output_attentions, + use_cache=use_cache, + no_images=no_images, + layer_idx=idx, + cross_layer_interval=self.cross_layer_interval, + gated_cross_attn_layers=self.gated_cross_attn_layers, + ) + + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = next_decoder_cache if use_cache else None + if not return_dict: + return tuple( + v + for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] + if v is not None + ) + return BaseModelOutputWithPastImage( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + image_hidden_states=image_hidden_states, + ) + + +class IdeficsForVisionText2Text(IdeficsPreTrainedModel): + def __init__( + self, + config, + weights, + ): + super().__init__(config) + self.model = IdeficsModel( + config=config, + weights=weights, + ) + + self.lm_head = IdeficsDecoupledTensorParallelLinear( + config=config, + weights=weights, + ) + + 
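+    # Note (descriptive comment, grounded in the forward body below): unlike a plain
+    # CausalLM head, the decoupled tensor-parallel lm_head yields both regular and
+    # speculative logits, so forward returns a
+    # (CausalLMOutputWithPastImage, speculative_logits) tuple.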
def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        image_embeddings: Optional[torch.FloatTensor] = None,
+        image_hidden_states: Optional[torch.FloatTensor] = None,
+        image_attention_mask: Optional[torch.Tensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, CausalLMOutputWithPastImage]:
+        r"""
+        Args:
+            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+        Returns:
+
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, LlamaForCausalLM
+
+        >>> model = LlamaForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
+        >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
+
+        >>> prompt = "Hey, are you conscious? Can you talk to me?"
+        >>> inputs = tokenizer(prompt, return_tensors="pt")
+
+        >>> # Generate
+        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
+ ```""" + + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + pixel_values=pixel_values, + image_embeddings=image_embeddings, + image_hidden_states=image_hidden_states, + image_attention_mask=image_attention_mask, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + logits, speculative_logits = self.lm_head(hidden_states) + + loss = None + + return ( + CausalLMOutputWithPastImage( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + image_hidden_states=outputs.image_hidden_states, + ), + speculative_logits, + ) + + def prepare_inputs_for_generation(self, input_ids, past=None, **kwargs): + inputs = prepare_inputs_for_generation(input_ids, past=past, **kwargs) + unwanted_kwargs = ["token_type_ids"] + for kwarg in unwanted_kwargs: + inputs.pop(kwarg, None) + return inputs + + @staticmethod + def _expand_inputs_for_generation( + *args, + **model_kwargs, + ): + return expand_inputs_for_generation(*args, **model_kwargs) + + @staticmethod + def _update_model_kwargs_for_generation( + outputs, model_kwargs, is_encoder_decoder=False + ): + return update_model_kwargs_for_generation( + outputs, model_kwargs, is_encoder_decoder=is_encoder_decoder + ) + + @staticmethod + def _reorder_cache(past, beam_idx): + reordered_past = () + for layer_past in past: + reordered_past += ( + tuple( + past_state.index_select(0, beam_idx) for past_state in layer_past + ), + ) + return reordered_past diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/idefics_perceiver.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/idefics_perceiver.py new file mode 100644 index 000000000..6da8045bc --- /dev/null +++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/idefics_perceiver.py @@ -0,0 +1,276 @@ +# This code was adapted from https://github.com/lucidrains/flamingo-pytorch licensed under the MIT License. +# +# MIT License +# +# Copyright (c) 2020 The Google AI Language Team Authors, The HuggingFace Inc. team and github/lonePatient +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + +""" + +Generic interface to various configurations of the Perceiver Resampler, that simply takes in a series of (potentially +time-indexed) contextual embeddings, and "resamples" (compresses) them down to a pre-specified number of latents! Note +that the Perceiver in general resamples based solely off the *long-range* context; there's a nice opportunity here to +prime the Perceiver Resampler with say a single layer's worth of language embeddings (the target domain), and use that +to softly "retrieve & compress" what we need --> this would be a novel contribution we should explore. + +References: + - DeepMind's Flamingo: https://www.deepmind.com/blog/tackling-multiple-tasks-with-a-single-visual-language-model + - Code borrowed w/ love from: https://github.com/lucidrains/flamingo-pytorch + +""" +from typing import Optional, Tuple + +import torch +import torch.nn as nn + +from text_generation_server.layers import ( + TensorParallelColumnLinear, + TensorParallelRowLinear, +) + +EPS = 1e-5 + + +class IdeficsPerceiverResampler(nn.Module): + def __init__( + self, + prefix, + config, + embed_dim: int, + depth: int, + n_heads: int, + head_dim: int, + n_latents: int, + weights, + ) -> None: + """ + Instantiates a Perceiver Resampler that operates over a sequence of embeddings (say from a ResNet or ViT or + MAE) of a given dimension, performs `depth` blocks of cross-attention with a fixed `n_latents` inputs, then + returns a Tensor of shape [bsz, n_latents, embed_dim]. :param embed_dim: Dimensionality of embeddings being fed + to the Perceiver Resampler (also dimensionality of latent embeddings *returned* by the Perceiver Resampler. + Could be e.g., VIT embed_dim, ResNet pool dim, and so on. + + Args: + config (`IdeficsConfig`): config object + embed_dim (`int`): The size of each embedding vector + depth (`int`): Depth of the Perceiver Resampler (Transformer w/ cross attention). Should be shallow (< 3). + n_heads (`int`): Number of heads in each Transformer block (for multi-headed self-attention). + head_dim (`int`): Dimensionality of each head projection in the Transformer block. + n_latents (`int`): + Number of latent embeddings to resample ("compress") the input sequence to (usually < 128). 
+ + """ + super().__init__() + self.embed_dim, self.n_heads, self.head_dim, self.n_latents = ( + embed_dim, + n_heads, + head_dim, + n_latents, + ) + self.qk_layer_norms = config.perceiver_config.qk_layer_norms_perceiver + + # Create Latents for Perceiver + self.latents = nn.Parameter(weights.get_tensor(f"{prefix}.latents")) + + self.intermediate_dim = ( + self.embed_dim * 4 + if not hasattr(config.vision_config, "embed_dim") + else config.vision_config.embed_dim * 4 + ) + # Create Transformer Blocks + self.blocks = nn.ModuleList( + [ + nn.ModuleList( + [ + IdeficsPerceiverAttention( + prefix=f"{prefix}.blocks.{layer_id}.0", + config=config, + embed_dim=self.embed_dim, + n_heads=self.n_heads, + head_dim=self.head_dim, + qk_layer_norms=self.qk_layer_norms, + weights=weights, + ), + IdeficsMLP( + prefix=f"{prefix}.blocks.{layer_id}.1", + intermediate_size=self.intermediate_dim, + config=config, + weights=weights, + ), + ] + ) + for layer_id in range(depth) + ] + ) + self.layer_norm = nn.LayerNorm.load( + prefix=f"{prefix}.layer_norm", weights=weights, eps=EPS + ) + + def forward(self, context: torch.Tensor) -> torch.Tensor: + """Resample arbitrary length context & *compress* down to self.n_latents latent embeddings""" + # einsum.repeat(self.latents, "seq embed -> bsz seq embed", bsz=context.shape[0]) + latents = self.latents.repeat(context.shape[0], 1, 1) + + # Feed through Perceiver Attention blocks... + for attn, ff in self.blocks: + latents = attn(context, latents) + latents + latents = ff(latents) + latents + + return self.layer_norm(latents) + + +class IdeficsPerceiverAttention(nn.Module): + def __init__( + self, + prefix, + config, + embed_dim: int, + n_heads: int, + head_dim: int, + qk_layer_norms: bool, + weights, + ) -> None: + """Perceiver Cross-Attention Module --> let long-form inputs be `context`, resampled embeddings be `latents`""" + super().__init__() + self.embed_dim, self.n_heads, self.head_dim = embed_dim, n_heads, head_dim + self.qk_layer_norms = qk_layer_norms + # Normalization & Scaling + self.context_layer_norm = nn.LayerNorm.load( + prefix=f"{prefix}.context_layer_norm", weights=weights, eps=EPS + ) + self.latents_layer_norm = nn.LayerNorm.load( + prefix=f"{prefix}.latents_layer_norm", weights=weights, eps=EPS + ) + if self.qk_layer_norms: + self.q_layer_norm = nn.LayerNorm.load( + prefix=f"{prefix}.q_layer_norm", weights=weights, eps=EPS + ) + self.k_layer_norm = nn.LayerNorm.load( + prefix=f"{prefix}.k_layer_norm", weights=weights, eps=EPS + ) + + self.qk_scale = self.head_dim**-0.5 + + if n_heads % weights.process_group.size() != 0: + raise ValueError( + f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {n_heads} " + f"and `num_shards`: {weights.process_group.size()}" + ) + self.n_heads //= weights.process_group.size() + + # Q, K, V Projection (no bias -- detail from Perceiver/Flamingo Papers). 
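+        # As in the forward pass below, queries are computed from the latents only,
+        # while keys/values are computed from the concatenation of context and latents.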
+ self.q_proj = TensorParallelColumnLinear.load( + config=config, prefix=f"{prefix}.q_proj", weights=weights, bias=False + ) + self.k_proj = TensorParallelColumnLinear.load( + config=config, prefix=f"{prefix}.k_proj", weights=weights, bias=False + ) + self.v_proj = TensorParallelColumnLinear.load( + config=config, prefix=f"{prefix}.v_proj", weights=weights, bias=False + ) + + self.output_proj = TensorParallelRowLinear.load( + config=config, prefix=f"{prefix}.output_proj", weights=weights, bias=False + ) + + def forward(self, context: torch.Tensor, latents: torch.Tensor) -> torch.Tensor: + """ + Runs Perceiver Self-Attention, with special (context, latents) appended along the `seq` dimension! + + Args: + context (`torch.Tensor`): + Tensor of shape `[bsz, seq, embed_dim]` representing long-form context to resample. + latents (`torch.Tensor`): + Tensor of shape `[bsz, n_latents, embed_dim]` representing fixed length latents to compress to. + + Returns: + `torch.Tensor`: Tensor of shape `[bsz, n_latents, embed_dim]` representing attention over latents w/ cross + from context. + """ + context = self.context_layer_norm(context) + latents = self.latents_layer_norm(latents) + batch_size, seq_length, embed_dim = context.shape[:3] + + # Query, Key, Value Projections --> Note that in Flamingo, latents are *concatenated* with context prior to attn! + # Note: This results in queries w/ `seq = n_latents`, and keys, values with `seq = len(context) + n_latents` + q = self.q_proj(latents) + k = self.k_proj(torch.cat([context, latents], dim=-2)) + v = self.v_proj(torch.cat([context, latents], dim=-2)) + + # Multiheaded Self-Attention w/ stable softmax (subtract per-row max -- `amax` -- before softmax call) + # =>> `attn` should be a 2D matrix of shape [n_latents x (context + n_latents)] + # einsum.rearrange(x, "bsz seq (heads embed) -> bsz heads seq embed", heads=self.n_heads) + q, k, v = [ + x.reshape(batch_size, x.shape[1], self.n_heads, self.head_dim).transpose( + 1, 2 + ) + for x in (q, k, v) + ] + + if self.qk_layer_norms: + q = self.q_layer_norm(q) + k = self.k_layer_norm(k) + + scores = torch.einsum("... i d, ... j d -> ... i j", q * self.qk_scale, k) + stabilized_scores = scores - (scores.amax(dim=-1, keepdim=True).detach()) + attn = stabilized_scores.softmax(dim=-1) + + # Attend & project back to output... + resampled = torch.einsum("... i j, ... j d -> ... 
i d", attn, v) + # einsum.rearrange(resampled, "bsz heads seq embed -> bsz seq (heads embed)", heads=self.n_heads) + return self.output_proj(resampled.transpose(1, 2).flatten(-2)) + + +class IdeficsMLP(nn.Module): + def __init__( + self, + prefix, + intermediate_size, + config, + weights, + ): + """Simple MLP block with intermediate_size and embedding size""" + super().__init__() + self.embed_dim = config.vision_config.embed_dim + self.ln = nn.LayerNorm.load(prefix=f"{prefix}.ln", weights=weights, eps=EPS) + self.fc = TensorParallelColumnLinear.load( + config=config, + prefix=f"{prefix}.fc", + weights=weights, + bias=False, + ) + self.act = nn.ReLU() + self.c_proj = TensorParallelRowLinear.load( + config=config, + prefix=f"{prefix}.c_proj", + weights=weights, + bias=False, + ) + + def forward( + self, hidden_states: Optional[Tuple[torch.FloatTensor]] + ) -> torch.FloatTensor: + hidden_states = self.ln(hidden_states) + hidden_states = self.fc(hidden_states) + hidden_states = self.act(hidden_states) + hidden_states = self.c_proj(hidden_states) + + return hidden_states diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/idefics_processing.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/idefics_processing.py new file mode 100644 index 000000000..ca61e27d4 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/idefics_processing.py @@ -0,0 +1,443 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Processor class for IDEFICS. +""" + +from typing import Callable, List, Optional, Union +from urllib.parse import urlparse + +from transformers.feature_extraction_utils import BatchFeature +from transformers.processing_utils import ProcessorMixin +from transformers.tokenization_utils_base import ( + BatchEncoding, + PaddingStrategy, + TextInput, + TruncationStrategy, +) +from transformers.utils import TensorType, is_torch_available + + +if is_torch_available(): + import torch + + +IMAGE_TOKEN = "" + + +# copied from m4.training.packing +def incremental_to_binary_attention_mask(incremental_mask, num_classes=-1): + # This function converts: [-1, 0, 1] => [[0, 0], [1, 0], [0, 1]] + + # If any of images index are more than num_classes, set them to -1. 
+ # Words after the max number of images allowed have been seen don't attend on anything + if num_classes != -1: + incremental_mask[incremental_mask >= num_classes] = -1 + + negatives = incremental_mask == -1 + incremental_mask[negatives] = 0 + attn_mask = torch.nn.functional.one_hot(incremental_mask, num_classes=num_classes) + attn_mask[negatives, :] = 0 + return attn_mask + + +# copied from m4.training.packing +def image_attention_mask_for_packed_input_ids(input_ids, tokenizer): + image_attention_mask = torch.full_like(input_ids, fill_value=-1) + next_image_attention_mask = torch.full_like(input_ids, fill_value=-1) + image_token_id = tokenizer.convert_tokens_to_ids(IMAGE_TOKEN) + eod_token_id = tokenizer.eos_token_id + for batch_idx in range(input_ids.size(0)): + count = -1 + seen_eod = False + for idx, token_id in enumerate(input_ids[batch_idx]): + if token_id == image_token_id: + count += 1 + image_attention_mask[batch_idx][idx] = count + seen_eod = False + else: + image_attention_mask[batch_idx][idx] = count + + if seen_eod: + image_attention_mask[batch_idx][idx] = -1 + + if token_id == eod_token_id: + seen_eod = True + + for batch_idx in range(input_ids.size(0)): + count = -1 + seen_eod = False + for idx in range(input_ids[batch_idx].size(0) - 1, -1, -1): + token_id = input_ids[batch_idx][idx] + if token_id == image_token_id: + count += 1 + next_image_attention_mask[batch_idx][idx] = count + seen_eod = False + else: + next_image_attention_mask[batch_idx][idx] = count + + if token_id == eod_token_id: + seen_eod = True + + if seen_eod: + next_image_attention_mask[batch_idx][idx] = -1 + + non_negative_indices = next_image_attention_mask[batch_idx] != -1 + next_image_attention_mask[batch_idx][non_negative_indices] -= count + next_image_attention_mask[batch_idx][non_negative_indices] *= -1 + + return image_attention_mask, next_image_attention_mask + + +def is_url(string): + """Checks if the passed string contains a valid url and nothing else. e.g. if space is included it's immediately + invalidated the url""" + if " " in string: + return False + result = urlparse(string) + return all([result.scheme, result.netloc]) + + +def is_image(string): + """Checks if the passed string contains a valid url and nothing else. e.g. if space is included it's immediately + invalidated the url""" + return is_url(string) or string.startswith("data:") + + +class IdeficsProcessor(ProcessorMixin): + r""" + Constructs a IDEFICS processor which wraps a LLama tokenizer and IDEFICS image processor into a single processor. + + [`IdeficsProcessor`] offers all the functionalities of [`IdeficsImageProcessor`] and [`LlamaTokenizerFast`]. See + the docstring of [`~IdeficsProcessor.__call__`] and [`~IdeficsProcessor.decode`] for more information. + + Args: + image_processor (`IdeficsImageProcessor`): + An instance of [`IdeficsImageProcessor`]. The image processor is a required input. + tokenizer (`LlamaTokenizerFast`): + An instance of [`LlamaTokenizerFast`]. The tokenizer is a required input. 
+        image_size (`int`, *optional*, defaults to 224): Image size (assuming a square image)
+    """
+
+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = "IdeficsImageProcessor"
+    tokenizer_class = "LlamaTokenizerFast"
+
+    def __init__(
+        self,
+        image_processor,
+        tokenizer=None,
+        image_size=224,
+        add_end_of_utterance_token=None,
+        **kwargs,
+    ):
+        if image_processor is None:
+            raise ValueError("You need to specify an `image_processor`.")
+        if tokenizer is None:
+            raise ValueError("You need to specify a `tokenizer`.")
+
+        super().__init__(image_processor, tokenizer)
+        self.current_processor = self.image_processor
+        self.image_token_id = tokenizer.convert_tokens_to_ids(IMAGE_TOKEN)
+
+        self.default_image_dims = (
+            self.image_processor.image_num_channels,
+            self.image_processor.image_size,
+            self.image_processor.image_size,
+        )
+
+        self.tokenizer_was_trained_with_end_of_utterance_token = (
+            True
+            if "<end_of_utterance>"
+            in self.tokenizer.special_tokens_map.get("additional_special_tokens", [])
+            else False
+        )
+
+    def __call__(
+        self,
+        prompts: Union[List[TextInput], List[List[TextInput]]],
+        padding: Union[bool, str, PaddingStrategy] = False,
+        truncation: Union[bool, str, TruncationStrategy] = None,
+        max_length: Optional[int] = None,
+        transform: Callable = None,
+        add_eos_token=False,
+        add_end_of_utterance_token=None,
+        debug=False,
+        return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
+    ) -> BatchEncoding:
+        """This method takes batched or non-batched prompts made of text and images and converts them into prompts that
+        the model was trained on and prepares the image pixel values for the model to process.
+
+        Args:
+            prompts (`Union[List[TextInput], [List[List[TextInput]]]]`):
+                either a single prompt or a batched list of prompts - see the detailed description immediately after
+                the end of the arguments doc section.
+            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
+                Select a strategy to pad the returned sequences (according to the model's padding side and padding
+                index) among:
+                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
+                  sequence if provided).
+                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
+                  acceptable input length for the model if that argument is not provided.
+                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
+                  lengths).
+            max_length (`int`, *optional*):
+                Maximum length of the returned list and optionally padding length (see above).
+            truncation (`bool`, *optional*):
+                Activates truncation to cut input sequences longer than `max_length` to `max_length`.
+            transform (`Callable`, *optional*):
+                A custom transform function that accepts a single image can be passed for training. For example,
+                `torchvision.Compose` can be used to compose multiple functions. If `None` a preset inference-specific
+                set of transforms will be applied to the images.
+            add_eos_token (`bool`, *optional*, defaults to `False`):
+                Adds `eos_token` at the end of the final prompt if `True`
+            add_end_of_utterance_token (`bool`, *optional*):
+                Whether to automatically add `<end_of_utterance>` after each prompt's text input (unless followed by an
+                image). If `None` the tokenizer will be checked instead and if this token is found in
+                `additional_special_tokens` then the value will be `True`.
+            debug (`bool`, *optional*, defaults to `False`):
+                `True` value will help debug prompt generation by dumping useful information
+            return_tensors (`str` or `TensorType`, *optional*, defaults to `TensorType.PYTORCH`):
+                The type of tensors to return. Can be one of:
+                    - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+
+        Returns:
+            a dict with entries: `input_ids`, `attention_mask`, `pixel_values`, `image_attention_mask` which can be
+            directly passed to `model.generate`
+
+        Detailed explanation:
+
+        Each entry in `prompts` is either a text to be passed as is or an image that will be processed.
+
+        An image can be either an image object (`PIL.Image`) or a url from which the image can be retrieved.
+
+        When the processor encounters an image it'll inject `<fake_token_around_image><image><fake_token_around_image>`
+        entry into the prompt.
+
+        Example:
+
+        ```python
+        checkpoint = "HuggingFaceM4/idefics-9b"
+        processor = AutoProcessor.from_pretrained(checkpoint)
+        url = "https://hips.hearstapps.com/hmg-prod/images/cute-photos-of-cats-in-grass-1593184777.jpg"
+        img = processor.image_processor.fetch_images([url])[0]
+
+        prompts = [
+            "User:",
+            img,
+            "Describe this image.\nAssistant: An image of two kittens in grass.\n",
+            "User:",
+            "https://hips.hearstapps.com/hmg-prod/images/dog-puns-1581708208.jpg",
+            "Describe this image.\nAssistant:",
+        ]
+
+        inputs = processor(prompts, return_tensors="pt")
+        generated_ids = model.generate(**inputs, max_length=100)
+        generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+        ```
+
+        In this example the `prompts` will be converted into:
+
+        ```
+        <s>User:<fake_token_around_image><image><fake_token_around_image>Describe this image.
+        Assistant: An image of two kittens in grass.
+        User:<fake_token_around_image><image><fake_token_around_image>Describe this image.
+        Assistant:
+        ```
+
+        and the two images will be massaged using the [`IdeficsImageProcessor.__call__`] method and placed inside the
+        `pixel_values` dict entry of the return value.
+
+        This example also exemplifies that images can be passed as objects or as text urls. It can be seen that the
+        first image is passed as an object and the second one as a url.
+
+        To do training, do:
+
+        ```python
+        image_transform = transforms.Compose(
+            [
+                transforms.RandomResizedCrop(
+                    (w, h), scale=(0.9, 1.0), interpolation=transforms.InterpolationMode.BICUBIC
+                ),
+                transforms.ToTensor(),
+                transforms.Normalize(mean=self.image_mean, std=self.image_std),
+            ]
+        )
+        inputs = processor(prompts, transform=image_transform, return_tensors="pt")
+        ```
+
+        In order to help debug prompt generation enable `debug=True` which will show you what's happening.
+
+        """
+
+        # if the value isn't overridden by the user, check if the tokenizer was trained with this token and then use it
+        if add_end_of_utterance_token is None:
+            add_end_of_utterance_token = (
+                self.tokenizer_was_trained_with_end_of_utterance_token
+            )
+
+        # turn non-batched prompts into batched
+        if not any(isinstance(i, list) for i in prompts):
+            prompts = [prompts]
+
+        fake_token = "<fake_token_around_image>"
+        image_token = "<image>"
+        end_of_utterance_token = "<end_of_utterance>"
+
+        def image_tokens(last_was_image):
+            if last_was_image:
+                return image_token + fake_token
+            else:
+                return fake_token + image_token + fake_token
+
+        all_texts = []
+        all_images = []
+        for sample in prompts:
+            # the model was trained on samples starting with <s>
+            full_text = f"{self.tokenizer.bos_token}"
+
+            # an image can either be an image object in the item or the url, everything else is a verbatim prompt text
+            image_objects = []
+            last_was_image = False
+            last_was_text = False
+            for i, item in enumerate(sample):
+                if i > 0:
+                    last_was_text = True if not last_was_image else False
+
+                if isinstance(item, str):
+                    item = item.strip(" ")
+                    if is_image(item):
+                        image = self.image_processor.fetch_images(item)
+                        full_text += image_tokens(last_was_image)
+                        image_objects.append(image)
+                        last_was_image = True
+                    else:
+                        # we add end_of_utterance_token between each subsequent text prompt (but not at the last one!)
+                        if add_end_of_utterance_token and last_was_text:
+                            full_text += end_of_utterance_token
+                        full_text += item
+                        last_was_image = False
+                else:
+                    # must be an image obj
+                    full_text += image_tokens(last_was_image)
+                    image_objects.append(item)
+                    last_was_image = True
+
+            if add_eos_token:
+                full_text += self.tokenizer.eos_token
+
+            if debug is True:
+                print(f"{full_text=}")
+
+            image_objects = self.image_processor(image_objects, transform=transform)
+
+            text_encoding = self.tokenizer(
+                text=full_text,
+                add_special_tokens=False,
+                padding=padding,
+                truncation=truncation,
+                max_length=max_length,
+            )
+
+            all_texts.append(text_encoding["input_ids"])
+            all_images.append(image_objects)
+
+        max_seq_len = max(len(x) for x in all_texts)
+
+        # max_num_images has to be at least 1 even when there are no images
+        max_num_images = max(len(x) for x in all_images)
+        max_num_images = max(1, max_num_images)
+
+        at_least_one_image = sum(len(x) for x in all_images) > 0
+        output_input_ids = []
+        output_images = []
+        output_attention_masks = []
+        for text, images in zip(all_texts, all_images):
+            padded_input_ids = [self.tokenizer.pad_token_id] * max_seq_len
+            unpadded_seq_len = len(text)
+            start = max_seq_len - unpadded_seq_len
+            padded_input_ids[start:] = text[:max_seq_len]
+
+            attention_mask = torch.zeros((max_seq_len,), dtype=torch.long)
+            attention_mask[start:] = 1
+
+            image_count = padded_input_ids.count(self.image_token_id)
+            local_max_num_images = min(image_count, max_num_images)
+
+            current_images = images[:local_max_num_images]
+
+            if len(current_images) > 0:
+                padded_image_tensor = torch.zeros(
+                    max_num_images, *current_images.size()[1:]
+                )
+                padded_image_tensor[: current_images.size(0)] = current_images
+            else:
+                padded_image_tensor = torch.zeros(
+                    max_num_images, *self.default_image_dims
+                )
+
+            output_images.append(padded_image_tensor)
+            output_input_ids.append(torch.tensor(padded_input_ids))
+
+            output_attention_masks.append(attention_mask)
+
+        output_input_ids = torch.stack(output_input_ids)
+        output_images = torch.stack(output_images)
+        output_attention_masks = torch.stack(output_attention_masks)
+
+        if at_least_one_image:
+            image_attention_mask, _ = 
image_attention_mask_for_packed_input_ids( + output_input_ids, self.tokenizer + ) + image_attention_mask = incremental_to_binary_attention_mask( + image_attention_mask, num_classes=max_num_images + ) + else: + # in full language mode we set the image mask to all-0s + image_attention_mask = torch.zeros( + output_input_ids.shape[0], + output_input_ids.shape[1], + 1, + dtype=torch.bool, + ) + + return BatchFeature( + data={ + "input_ids": output_input_ids, + "attention_mask": output_attention_masks, + "pixel_values": output_images, + "image_attention_mask": image_attention_mask, + } + ) + + def batch_decode(self, *args, **kwargs): + """ + This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please + refer to the docstring of this method for more information. + """ + return self.tokenizer.batch_decode(*args, **kwargs) + + def decode(self, *args, **kwargs): + """ + This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to + the docstring of this method for more information. + """ + return self.tokenizer.decode(*args, **kwargs) + + @property + def model_input_names(self): + tokenizer_input_names = self.tokenizer.model_input_names + image_processor_input_names = self.image_processor.model_input_names + return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/idefics_vision.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/idefics_vision.py new file mode 100644 index 000000000..dd8f76bc4 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/idefics_vision.py @@ -0,0 +1,529 @@ +# coding=utf-8 +# Copyright 2021 The OpenAI Team Authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch IdeficsVision model: a copy of CLIPVisionModel using a simpler config object""" + + +from dataclasses import dataclass +from typing import Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn + +from transformers.activations import ACT2FN +from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling +from transformers.utils import ( + ModelOutput, + logging, +) +from text_generation_server.layers import ( + TensorParallelColumnLinear, + TensorParallelRowLinear, + TensorParallelEmbedding, +) + +logger = logging.get_logger(__name__) + + +@dataclass +class IdeficsVisionModelOutput(ModelOutput): + """ + Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states. + + Args: + image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`): + The image embeddings obtained by applying the projection layer to the pooler_output. 
+ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + image_embeds: Optional[torch.FloatTensor] = None + last_hidden_state: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + +# Copied from transformers.models.clip.modeling_clip.CLIPVisionEmbeddings with CLIP->Idefics +class IdeficsVisionEmbeddings(nn.Module): + def __init__(self, prefix, config, weights): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + + self.class_embedding = nn.Parameter( + weights.get_tensor(f"{prefix}.class_embedding") + ) + + self.patch_embedding = nn.Conv2d.load_no_bias( + prefix=f"{prefix}.patch_embedding", + weights=weights, + in_channels=config.num_channels, + out_channels=self.embed_dim, + kernel_size=self.patch_size, + stride=self.patch_size, + ) + + self.num_patches = (self.image_size // self.patch_size) ** 2 + self.num_positions = self.num_patches + 1 + self.position_embedding = TensorParallelEmbedding( + prefix="model.vision_model.embeddings.position_embedding", weights=weights + ) + self.position_ids = ( + torch.arange(self.num_positions).expand((1, -1)).to(device=weights.device) + ) + + def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: + batch_size = pixel_values.shape[0] + target_dtype = self.patch_embedding.weight.dtype + patch_embeds = self.patch_embedding( + pixel_values.to(dtype=target_dtype) + ) # shape = [*, width, grid, grid] + patch_embeds = patch_embeds.flatten(2).transpose(1, 2) + + class_embeds = self.class_embedding.expand(batch_size, 1, -1) + embeddings = torch.cat([class_embeds, patch_embeds], dim=1) + embeddings = embeddings + self.position_embedding(self.position_ids) + return embeddings + + +# Copied from transformers.models.clip.modeling_clip.CLIPAttention with CLIP->IdeficsVision +class IdeficsVisionAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, prefix, config, weights): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {self.num_heads})." 
+ ) + self.scale = self.head_dim**-0.5 + self.dropout = config.attention_dropout + + if self.num_heads % weights.process_group.size() != 0: + raise ValueError( + f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} " + f"and `num_shards`: {weights.process_group.size()}" + ) + self.num_heads = self.num_heads // weights.process_group.size() + self.embed_dim = self.embed_dim // weights.process_group.size() + + self.k_proj = TensorParallelColumnLinear.load( + config, prefix=f"{prefix}.k_proj", weights=weights, bias=True + ) + self.v_proj = TensorParallelColumnLinear.load( + config, prefix=f"{prefix}.v_proj", weights=weights, bias=True + ) + self.q_proj = TensorParallelColumnLinear.load( + config, prefix=f"{prefix}.q_proj", weights=weights, bias=True + ) + self.out_proj = TensorParallelRowLinear.load( + config, prefix=f"{prefix}.out_proj", weights=weights, bias=True + ) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return ( + tensor.view(bsz, seq_len, self.num_heads, self.head_dim) + .transpose(1, 2) + .contiguous() + ) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + causal_attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + bsz, tgt_len, _ = hidden_states.size() + + # get query proj + query_states = self.q_proj(hidden_states) * self.scale + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) + key_states = key_states.view(*proj_shape) + value_states = value_states.view(*proj_shape) + + src_len = key_states.size(1) + attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) + + if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): + raise ValueError( + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is" + f" {attn_weights.size()}" + ) + + # apply the causal_attention_mask first + if causal_attention_mask is not None: + if causal_attention_mask.size() != (bsz, 1, tgt_len, src_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is" + f" {causal_attention_mask.size()}" + ) + attn_weights = ( + attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + + causal_attention_mask + ) + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, tgt_len, src_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" + ) + attn_weights = ( + attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + + attention_mask + ) + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + if output_attentions: + # this operation is a bit akward, but it's required to + # make sure that attn_weights keeps its gradient. 
+ # In order to do so, attn_weights have to reshaped + # twice and have to be reused in the following + attn_weights_reshaped = attn_weights.view( + bsz, self.num_heads, tgt_len, src_len + ) + attn_weights = attn_weights_reshaped.view( + bsz * self.num_heads, tgt_len, src_len + ) + else: + attn_weights_reshaped = None + + attn_probs = nn.functional.dropout( + attn_weights, p=self.dropout, training=self.training + ) + + attn_output = torch.bmm(attn_probs, value_states) + + if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights_reshaped + + +# Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->IdeficsVision +class IdeficsVisionMLP(nn.Module): + def __init__(self, prefix, config, weights): + super().__init__() + self.config = config + self.activation_fn = ACT2FN[config.hidden_act] + self.fc1 = TensorParallelColumnLinear.load( + config, prefix=f"{prefix}.fc1", weights=weights, bias=True + ) + self.fc2 = TensorParallelRowLinear.load( + config, prefix=f"{prefix}.fc2", weights=weights, bias=True + ) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states = self.fc2(hidden_states) + return hidden_states + + +# Copied from transformers.models.clip.modeling_clip.CLIPEncoderLayer with CLIP->IdeficsVision +class IdeficsVisionEncoderLayer(nn.Module): + def __init__(self, prefix, config, weights): + super().__init__() + self.embed_dim = config.hidden_size + self.self_attn = IdeficsVisionAttention( + prefix=f"{prefix}.self_attn", config=config, weights=weights + ) + self.layer_norm1 = nn.LayerNorm.load( + prefix=f"{prefix}.layer_norm1", weights=weights, eps=config.layer_norm_eps + ) + self.mlp = IdeficsVisionMLP( + prefix=f"{prefix}.mlp", config=config, weights=weights + ) + self.layer_norm2 = nn.LayerNorm.load( + prefix=f"{prefix}.layer_norm2", weights=weights, eps=config.layer_norm_eps + ) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + causal_attention_mask: torch.Tensor, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.FloatTensor]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + `(config.encoder_attention_heads,)`. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. 
+ """ + residual = hidden_states + + hidden_states = self.layer_norm1(hidden_states) + hidden_states, attn_weights = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + causal_attention_mask=causal_attention_mask, + output_attentions=output_attentions, + ) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.layer_norm2(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +# Copied from transformers.models.clip.modeling_clip.CLIPEncoder with CLIP->IdeficsVision +class IdeficsVisionEncoder(nn.Module): + """ + Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a + [`IdeficsVisionEncoderLayer`]. + + Args: + config: IdeficsVisionConfig + """ + + def __init__(self, prefix, config, weights): + super().__init__() + self.config = config + self.layers = nn.ModuleList( + [ + IdeficsVisionEncoderLayer( + prefix=f"{prefix}.encoder.layers.{layer_id}", + config=config, + weights=weights, + ) + for layer_id in range(config.num_hidden_layers) + ] + ) + # self.gradient_checkpointing = False + + def forward( + self, + inputs_embeds, + attention_mask: Optional[torch.Tensor] = None, + causal_attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutput]: + r""" + Args: + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Causal mask for the text model. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
+ """ + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + hidden_states = inputs_embeds + for idx, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + # if self.gradient_checkpointing and self.training: + + # def create_custom_forward(module): + # def custom_forward(*inputs): + # return module(*inputs, output_attentions) + + # return custom_forward + + # layer_outputs = torch.utils.checkpoint.checkpoint( + # create_custom_forward(encoder_layer), + # hidden_states, + # attention_mask, + # causal_attention_mask, + # ) + # else: + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + causal_attention_mask, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [hidden_states, encoder_states, all_attentions] + if v is not None + ) + return BaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=encoder_states, + attentions=all_attentions, + ) + + +# Adapted from transformers.models.clip.modeling_clip.CLIPVisionTransformer +class IdeficsVisionTransformer(nn.Module): + def __init__(self, prefix, config, weights): + super().__init__() + self.config = config + + self.embeddings = IdeficsVisionEmbeddings( + prefix=f"{prefix}.embeddings", config=config, weights=weights + ) + self.pre_layrnorm = nn.LayerNorm.load( + prefix=f"{prefix}.pre_layrnorm", weights=weights, eps=config.layer_norm_eps + ) + self.encoder = IdeficsVisionEncoder( + prefix=prefix, config=config, weights=weights + ) + self.post_layernorm = nn.LayerNorm.load( + prefix=f"{prefix}.post_layernorm", + weights=weights, + eps=config.layer_norm_eps, + ) + + # copied from transformers.models.clip.modeling_clip.CLIPVisionTransformer.forward + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + r""" + Returns: + + """ + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + hidden_states = self.embeddings(pixel_values) + hidden_states = self.pre_layrnorm(hidden_states) + + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + last_hidden_state = encoder_outputs[0] + pooled_output = last_hidden_state[:, 0, :] + pooled_output = self.post_layernorm(pooled_output) + + if not return_dict: + return (last_hidden_state, 
pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/llava_next.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/llava_next.py new file mode 100644 index 000000000..f98dab91b --- /dev/null +++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/llava_next.py @@ -0,0 +1,342 @@ +# coding=utf-8 +# Copyright 2024 the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch Llava-NeXT model.""" + +from typing import List, Optional + +import torch +import torch.utils.checkpoint + +from transformers.models.llava_next.modeling_llava_next import ( + unpad_image, +) +from optimum.habana.transformers.models import GaudiLlavaNextForConditionalGeneration +from transformers.image_processing_utils import select_best_resolution + + +def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size): + """ + Calculate the shape of the image patch grid after the preprocessing for images of any resolution. + + Args: + image_size (`tuple`): + The size of the input image in the format (width, height). + grid_pinpoints (`List`): + A list containing possible resolutions. Each item in the list should be a tuple or list + of the form `(height, width)`. + patch_size (`int`): + The size of each image patch. + + Returns: + tuple: The shape of the image patch grid in the format (width, height). + """ + if not isinstance(grid_pinpoints, list): + raise ValueError("grid_pinpoints should be a list of tuples or lists") + + height, width = select_best_resolution(image_size, grid_pinpoints) + return height // patch_size, width // patch_size + + +class LlavaNextForConditionalGeneration(GaudiLlavaNextForConditionalGeneration): + + def _merge_input_ids_with_image_features( + self, + inputs_embeds: torch.Tensor, + image_features: torch.Tensor, + input_ids: torch.Tensor, + ): + """In place merges in vision_embeddings with inputs_embeds.""" + mask = input_ids == self.config.image_token_index + # Let's pray we have enabled enough slots ! + try: + inputs_embeds[mask] = image_features.view(-1, image_features.shape[-1]) + except Exception as e: + raise RuntimeError( + f"Cannot fill images right now. If error happens at warmup, make sure you have enough `--max-input-tokens` to handle images. 
If error happens at regular runtime, please fill in an issue: {e}" + ) + return inputs_embeds + + def forward( + self, + input_ids: torch.LongTensor = None, + pixel_values: torch.FloatTensor = None, + image_sizes: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + vision_feature_layer: Optional[int] = None, + vision_feature_select_strategy: Optional[str] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + token_idx: Optional[torch.Tensor] = None, + use_flash_attention: Optional[bool] = True, + flash_attention_recompute: Optional[bool] = True, + ): + + if token_idx is not None: + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + if inputs_embeds is None: + inputs_embeds = self.get_input_embeddings()(input_ids) + + outputs = self.language_model( + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + token_idx=token_idx, + use_flash_attention=use_flash_attention, + flash_attention_recompute=flash_attention_recompute, + ) + + logits = outputs[0] + + if not return_dict: + output = (logits,) + outputs[1:] + return output + + return outputs + + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + inputs_embeds=None, + pixel_values=None, + image_sizes=None, + attention_mask=None, + **kwargs, + ): + """ + Inherits from LlavaForConditionalGeneration: https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/models/llava_next/modeling_llava_next.py#L635 + The only differences are: + - add new args token_idx + - add the process of merging images into inputs_embeds + """ + token_idx = kwargs.get("token_idx", None) + if token_idx is None: + return super().prepare_inputs_for_generation( + input_ids=input_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + pixel_values=pixel_values, + image_sizes=image_sizes, + attention_mask=attention_mask, + **kwargs, + ) + else: + use_flash_attention = kwargs.get("use_flash_attention", True) + flash_attention_recompute = kwargs.get("flash_attention_recompute", True) + + position_ids = kwargs.get("position_ids", None) + labels = kwargs.get("labels", None) + if ( + past_key_values is None + and pixel_values is not None + and input_ids.shape[1] != 1 + ): + vision_feature_select_strategy = kwargs.get( + "vision_feature_select_strategy", None + ) + vision_feature_layer = kwargs.get("vision_feature_layer", None) + vision_feature_select_strategy = ( + vision_feature_select_strategy + if vision_feature_select_strategy is not None + else self.config.vision_feature_select_strategy + ) + vision_feature_layer = ( + vision_feature_layer + if vision_feature_layer is not None + else self.config.vision_feature_layer + ) + + # 1. 
Extract the input embeddings + inputs_embeds = self.get_input_embeddings()(input_ids) + # 2. Merge text and images + batch_size, num_patches, num_channels, height, width = ( + pixel_values.shape + ) + reshaped_pixel_values = pixel_values.view( + batch_size * num_patches, num_channels, height, width + ) + image_features = self.vision_tower( + reshaped_pixel_values, + output_hidden_states=True, + use_flash_attention=use_flash_attention, + flash_attention_recompute=flash_attention_recompute, + ) + + selected_image_feature = image_features.hidden_states[ + vision_feature_layer + ] + + if vision_feature_select_strategy == "default": + selected_image_feature = selected_image_feature[:, 1:] + elif vision_feature_select_strategy == "full": + selected_image_feature = selected_image_feature + + image_features = self.multi_modal_projector(selected_image_feature) + + # split up image_features for each of the individual images + # hence we get a list of image_features, each of shape (5, num_patches, hidden_size) + # if we assume each image has 5 image features (base image + 4 patches) + split_sizes = [image.shape[0] for image in pixel_values] + image_features = torch.split(image_features, split_sizes, dim=0) + + # NOTE we only support multimodal_patch_merge_type == "spatial_unpad" + height = width = ( + self.config.vision_config.image_size + // self.config.vision_config.patch_size + ) + + new_image_features = [] + for image_idx, image_feature in enumerate(image_features): + if image_feature.shape[0] > 1: + base_image_feature = image_feature[0] + image_feature = image_feature[1:] + + if height * width != base_image_feature.shape[0]: + raise ValueError( + "The number of patches is not consistent with the image size." + ) + + num_patch_height, num_patch_width = get_anyres_image_grid_shape( + image_sizes[image_idx].tolist(), + self.config.image_grid_pinpoints, + self.config.vision_config.image_size, + ) + + image_feature = image_feature.view( + num_patch_height, num_patch_width, height, width, -1 + ) + image_feature = image_feature.permute( + 4, 0, 2, 1, 3 + ).contiguous() + image_feature = image_feature.flatten(1, 2).flatten(2, 3) + image_feature = unpad_image( + image_feature, image_sizes[image_idx] + ) + image_feature = torch.cat( + ( + image_feature, + self.image_newline[:, None, None].expand( + *image_feature.shape[:-1], 1 + ), + ), + dim=-1, + ) + image_feature = image_feature.flatten(1, 2).transpose(0, 1) + image_feature = torch.cat( + (base_image_feature, image_feature), dim=0 + ) + else: + image_feature = image_feature[0] + image_feature = torch.cat( + (image_feature, self.image_newline[None]), dim=0 + ) + new_image_features.append(image_feature) + image_features = torch.stack(new_image_features, dim=0) + inputs_embeds = self._merge_input_ids_with_image_features( + inputs_embeds, image_features, input_ids + ) + self.image_offset = ( + image_features.shape[1] - 1 + ) # image_token has occupied 1 token position. 
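A minimal sketch of the anyres grid arithmetic used above (an editor's illustration, not part of the patch). The pinpoint list, image size and patch size below are hypothetical example values; note that `get_anyres_image_grid_shape` is called here with the vision tower's `image_size` as its `patch_size` argument, so the resulting grid counts whole tiles rather than 14-pixel patches:

```python
# Hypothetical example values, not read from any real checkpoint config.
grid_pinpoints = [[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]]
vision_image_size = 336  # config.vision_config.image_size
vision_patch_size = 14   # config.vision_config.patch_size

# Suppose select_best_resolution picked the 672x672 pinpoint for this image.
best_height, best_width = 672, 672

# Tile grid used to reshape the non-base image features (as in
# get_anyres_image_grid_shape above, where patch_size == vision_image_size).
num_patch_height = best_height // vision_image_size  # 2
num_patch_width = best_width // vision_image_size    # 2

# Per-tile token grid: height = width = image_size // patch_size = 24, so each
# 336x336 tile contributes 24 * 24 = 576 patch tokens once the CLS token is
# dropped by the "default" vision_feature_select_strategy.
height = width = vision_image_size // vision_patch_size
tokens_per_tile = height * width

print(num_patch_height, num_patch_width, tokens_per_tile)  # 2 2 576
```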
+ # In case input_ids.shape[1] == 1 & pixel_values==None & past_key_values != None, we are in the case of + # generation with cache + elif past_key_values is not None: + seq_len = input_ids.shape[1] + pad_len = seq_len - token_idx.item() + input_ids = torch.index_select(input_ids, 1, token_idx - 1) + # Retrieve the first layer to inspect the logits and mask out the hidden states + # that are set to 0 + first_layer_past_key_value = past_key_values[0][0][:, :, :, 0] + + # Sum all dimensions of head_dim (-2) to avoid random errors such as: https://github.com/huggingface/transformers/pull/28032#issuecomment-1863691941 + batch_index, non_attended_tokens = torch.where( + first_layer_past_key_value.float().sum(-2) == 0 + ) + + # Get the target length + past_length = first_layer_past_key_value.shape[-1] + extended_attention_mask = torch.ones( + (attention_mask.shape[0], past_length), + dtype=attention_mask.dtype, + device=attention_mask.device, + ) + # Filter out only the tokens that can be un-attended, this can happen + # if one uses Llava + Fused modules where the cache on the + # first iteration is already big enough, or if one passes custom cache + valid_indices = non_attended_tokens < extended_attention_mask.size(-1) + new_batch_index = batch_index[valid_indices] + new_non_attended_tokens = non_attended_tokens[valid_indices] + + # Zero-out the places where we don't need to attend + extended_attention_mask[new_batch_index, new_non_attended_tokens] = 0 + + attention_mask = extended_attention_mask + attention_mask[:, -pad_len:] = 0 + + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + if token_idx is not None: + position_ids = ( + torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1 + ) + else: + position_ids = position_ids[:, -input_ids.shape[1] :] + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + "token_idx": token_idx, + "labels": labels, + "use_flash_attention": use_flash_attention, + "flash_attention_recompute": flash_attention_recompute, + } + ) + + return model_inputs diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/mamba_modeling.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/mamba_modeling.py new file mode 100644 index 000000000..293051c2b --- /dev/null +++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/mamba_modeling.py @@ -0,0 +1,232 @@ +import torch +import torch.distributed + +from mamba_ssm.ops.triton.selective_state_update import selective_state_update +from mamba_ssm.ops.selective_scan_interface import selective_scan_fn +from torch import nn +from typing import Optional, Tuple, Any +from transformers.configuration_utils import PretrainedConfig +import torch.nn.functional as F + +from text_generation_server.layers import ( + SpeculativeHead, + TensorParallelEmbedding, + FastLinear, +) +from text_generation_server.layers.layernorm import FastRMSNorm + +from einops import rearrange +from causal_conv1d import causal_conv1d_fn, causal_conv1d_update 
+import math +from dataclasses import dataclass + + +@dataclass +class InferenceParams: + """Inference parameters that are passed to the main model in order + to efficienly calculate and store the context during inference.""" + + max_seqlen: int + max_batch_size: int + conv_states: torch.Tensor + ssm_states: torch.Tensor + seqlen_offset: int + + +class MambaConfig(PretrainedConfig): + def __init__( + self, + vocab_size=50280, + d_model=768, + d_state=16, + n_layer=32, + layer_norm_epsilon=1e-5, + tie_word_embeddings=False, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + expand=2, + dt_rank="auto", + **kwargs, + ): + self.vocab_size = vocab_size + self.n_layer = n_layer + self.layer_norm_epsilon = layer_norm_epsilon + self.d_model = d_model + self.d_inner = d_model * 2 + self.d_conv = 4 + self.d_state = d_state + self.expand = expand + self.dt_rank = math.ceil(self.d_model / 16) if dt_rank == "auto" else dt_rank + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + +class MambaBlock(nn.Module): + def __init__(self, prefix, config, weights, layer_id): + super().__init__() + self.layer_id = layer_id + self.in_proj = FastLinear.load(config, f"{prefix}.in_proj", weights, bias=False) + self.x_proj = FastLinear.load(config, f"{prefix}.x_proj", weights, bias=False) + self.dt_proj = FastLinear.load(config, f"{prefix}.dt_proj", weights, bias=True) + self.dt_proj_no_bias = FastLinear.load( + config, f"{prefix}.dt_proj", weights, bias=False + ) + self.out_proj = FastLinear.load( + config, f"{prefix}.out_proj", weights, bias=False + ) + self.conv1d = FastLinear.load(config, f"{prefix}.conv1d", weights, bias=True) + self.negA = -torch.exp(weights.get_tensor(f"{prefix}.A_log").float()) + self.D = weights.get_tensor(f"{prefix}.D") + self.activation = "silu" + self.dt_rank = config.dt_rank + self.d_state = config.d_state + self.d_conv = config.d_conv + self.act = nn.SiLU() + + # inference_params + def forward(self, hidden_states: torch.Tensor, inference_params=None): + if inference_params.seqlen_offset > 0: + conv_state = inference_params.conv_states[self.layer_id] + ssm_state = inference_params.ssm_states[self.layer_id] + out, conv_state, ssm_state = self.step(hidden_states, conv_state, ssm_state) + return out, conv_state, ssm_state + + _, seqlen, _ = hidden_states.shape + projected_states = self.in_proj(hidden_states).transpose(1, 2) + # assert projected_states.shape == [batch_size, 2 * dstate, seqlen], f"{projected_states.shape} [{batch_size}, {dstate}, {seqlen}]" + x, z = projected_states.chunk(2, dim=1) + conv_state = F.pad(x, (self.d_conv - seqlen, 0)) + x = causal_conv1d_fn( + x=x, + weight=self.conv1d.weight.squeeze(1), + bias=self.conv1d.bias, + activation=self.activation, + ) + + # We're careful here about the layout, to avoid extra transposes. + # We want dt to have d as the slowest moving dimension + # and L as the fastest moving dimension, since those are what the ssm_scan kernel expects. 
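+        # (Shape note: x and z are (batch, d_inner, seqlen); x_dbl below is
+        # (batch * seqlen, dt_rank + 2 * d_state) and is split into dt, B and C,
+        # after which dt is projected back to d_inner and laid out as
+        # (batch, d_inner, seqlen).)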
+ x_dbl = self.x_proj(rearrange(x, "b d l -> (b l) d")) # (bl d) + dt, B, C = torch.split( + x_dbl, [self.dt_rank, self.d_state, self.d_state], dim=-1 + ) + dt = self.dt_proj.weight @ dt.t() + dt = rearrange(dt, "d (b l) -> b d l", l=seqlen) + B = rearrange(B, "(b l) dstate -> b dstate l", l=seqlen).contiguous() + C = rearrange(C, "(b l) dstate -> b dstate l", l=seqlen).contiguous() + y, last_state = selective_scan_fn( + x, + dt, + self.negA, + B, + C, + self.D.float(), + z=z, + delta_bias=self.dt_proj.bias.float(), + delta_softplus=True, + return_last_state=True, + ) + y = rearrange(y, "b d l -> b l d") + attn_outputs = self.out_proj(y) + return attn_outputs, conv_state, last_state + + def step(self, hidden_states, conv_state, ssm_state): + xz = self.in_proj(hidden_states.squeeze(1)) + x, z = xz.chunk(2, dim=-1) # (B D) + x = causal_conv1d_update( + x, + conv_state, + self.conv1d.weight.squeeze(1), + self.conv1d.bias, + self.activation, + ) + x_db = self.x_proj(x) # (B dt_rank+2*d_state) + dt, B, C = torch.split(x_db, [self.dt_rank, self.d_state, self.d_state], dim=-1) + dt = F.linear(dt, self.dt_proj.weight) + A = self.negA + y = selective_state_update( + ssm_state, + x, + dt, + A, + B, + C, + self.D, + z=z, + dt_bias=self.dt_proj.bias, + dt_softplus=True, + ) + out = self.out_proj(y) + return out.unsqueeze(1), conv_state.clone(), ssm_state.clone() + + +class ResidualBlock(nn.Module): + def __init__(self, prefix, config, weights, layer_id): + super().__init__() + self.mamba_block = MambaBlock( + prefix=f"{prefix}.mixer", config=config, weights=weights, layer_id=layer_id + ) + self.layer_norm = FastRMSNorm.load( + prefix=f"{prefix}.norm", weights=weights, eps=config.layer_norm_epsilon + ) + + def forward( + self, + hidden_states: torch.Tensor, + residual: Optional[torch.Tensor] = None, + inference_params: Optional[Any] = None, + ): + residual = (hidden_states + residual) if residual is not None else hidden_states + shape = residual.shape + hidden_states, _ = self.layer_norm(residual.view(-1, shape[-1])) + hidden_states, conv_state, last_ssm_state = self.mamba_block( + hidden_states.view(*shape), inference_params + ) + return hidden_states, residual, conv_state, last_ssm_state + + +class MambaModel(nn.Module): + def __init__(self, config, weights): + super().__init__() + prefix = "backbone" + self.embed_tokens = TensorParallelEmbedding(f"{prefix}.embedding", weights) + self.blocks = nn.ModuleList( + [ + ResidualBlock(f"{prefix}.layers.{i}", config, weights, layer_id=i) + for i in range(config.n_layer) + ] + ) + self.norm_f = FastRMSNorm.load( + f"{prefix}.norm_f", weights, eps=config.layer_norm_epsilon + ) + self.lm_head = SpeculativeHead.load(config, f"{prefix}.embedding", weights) + self.config = config + + def forward( + self, input_ids: torch.Tensor, inference_params=None, residual=None + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + hidden_states = self.embed_tokens(input_ids) + for i, block in enumerate(self.blocks): + hidden_states, residual, conv_state, ssm_state = block( + hidden_states, residual, inference_params + ) + inference_params.conv_states[i].copy_(conv_state) + inference_params.ssm_states[i].copy_(ssm_state) + + hidden_states = ( + hidden_states + residual if residual is not None else hidden_states + ) + hidden_states, _ = self.norm_f(hidden_states.view(-1, hidden_states.size(-1))) + hidden_states = hidden_states.view(residual.shape) + logits, speculative_logits = self.lm_head(hidden_states) + + # update the offset for the next inference using these params + 
inference_params.seqlen_offset += input_ids.size(1) + return logits, speculative_logits diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/mllama.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/mllama.py new file mode 100644 index 000000000..73536bd65 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/mllama.py @@ -0,0 +1,995 @@ +# coding=utf-8 +# Copyright 2024 the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch Mllama model.""" + +from typing import Optional, Tuple, List + +import torch +import torch.utils.checkpoint +from torch import nn +import flash_attn_2_cuda + +from transformers.activations import ACT2FN +import torch.nn.functional as F + +from text_generation_server.layers import ( + TensorParallelColumnLinear, + TensorParallelEmbedding, + TensorParallelRowLinear, + FastLinear, +) +from text_generation_server.layers.attention import ( + Seqlen, +) +from text_generation_server.models.custom_modeling.flash_llama_modeling import ( + FlashLlamaForCausalLM, +) + + +def _prepare_aspect_ratio_attention_mask( + aspect_ratio_mask: torch.Tensor, + num_patches: int, + target_length: int, + dtype: torch.dtype, +) -> torch.Tensor: + # Expand aspect ratio mask to target_length + batch_size, max_num_tiles = aspect_ratio_mask.shape + attention_mask = aspect_ratio_mask.view(batch_size, max_num_tiles, 1, 1).to(dtype) + attention_mask = attention_mask.repeat(1, 1, target_length, 1) + + # Mask padding patches + pad_patches = target_length - num_patches + attention_mask[:, :, -pad_patches:] = 0 + + # Invert the mask (0 -> 1, 1 -> 0) + attention_mask = 1 - attention_mask + + # Reshape to 2D and create 4D attention mask + # (batch_size, 1, max_num_tiles * target_length, max_num_tiles * target_length) + attention_mask = attention_mask.reshape( + batch_size, max_num_tiles * target_length, 1 + ) + attention_mask = ( + attention_mask @ attention_mask.transpose(-1, -2) * torch.finfo(dtype).min + ) + attention_mask = attention_mask.unsqueeze(1) + + return attention_mask + + +# Copied from transformers.models.llama.modeling_llama._prepare_4d_causal_attention_mask_with_cache_position +def _prepare_4d_causal_attention_mask_with_cache_position( + attention_mask: torch.Tensor, + sequence_length: int, + target_length: int, + dtype: torch.dtype, + device: torch.device, + min_dtype: float, + cache_position: torch.Tensor, + batch_size: int, +): + """ + Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape + `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing. + + Args: + attention_mask (`torch.Tensor`): + A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`. + sequence_length (`int`): + The sequence length being processed. 
+ target_length (`int`): + The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet. + dtype (`torch.dtype`): + The dtype to use for the 4D attention mask. + device (`torch.device`): + The device to plcae the 4D attention mask on. + min_dtype (`float`): + The minimum value representable with the dtype `dtype`. + cache_position (`torch.Tensor`): + Indices depicting the position of the input sequence tokens in the sequence. + batch_size (`torch.Tensor`): + Batch size. + """ + if attention_mask is not None and attention_mask.dim() == 4: + # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing. + causal_mask = attention_mask + else: + causal_mask = torch.full( + (sequence_length, target_length), + fill_value=min_dtype, + dtype=dtype, + device=device, + ) + if sequence_length != 1: + causal_mask = torch.triu(causal_mask, diagonal=1) + causal_mask *= torch.arange( + target_length, device=device + ) > cache_position.reshape(-1, 1) + causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) + if attention_mask is not None: + causal_mask = ( + causal_mask.clone() + ) # copy to contiguous memory for in-place edit + mask_length = attention_mask.shape[-1] + padding_mask = ( + causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :] + ) + padding_mask = padding_mask == 0 + causal_mask[:, :, :, :mask_length] = causal_mask[ + :, :, :, :mask_length + ].masked_fill(padding_mask, min_dtype) + + return causal_mask + + +def _prepare_cross_attention_mask( + cross_attention_mask: torch.Tensor, + num_vision_tokens: int, + dtype: str, +) -> Tuple[torch.Tensor, torch.Tensor]: + # reshape so it can be used by attn module + batch_size, text_total_length, *_ = cross_attention_mask.shape + cross_attention_mask = cross_attention_mask.repeat_interleave( + num_vision_tokens, dim=3 + ) + cross_attention_mask = cross_attention_mask.view(batch_size, text_total_length, -1) + cross_attention_mask = cross_attention_mask.unsqueeze(1) + + # invert the mask + inverted_cross_attn_mask = (1.0 - cross_attention_mask).to(dtype) + cross_attention_mask = inverted_cross_attn_mask.masked_fill( + inverted_cross_attn_mask.to(torch.bool), torch.finfo(dtype).min + ) + + # apply full-row bias, which return 4D tensor of shape [B, H, S1, 1] where value is 0 if the a full row in cross attn mask's + # last dimension contains negative infinity values, otherwise it's 1 + negative_inf_value = torch.finfo(dtype).min + full_text_row_masked_out_mask = ( + (cross_attention_mask != negative_inf_value) + .any(dim=-1) + .type_as(cross_attention_mask)[..., None] + ) + cross_attention_mask *= full_text_row_masked_out_mask + + return cross_attention_mask, full_text_row_masked_out_mask + + +# Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->MllamaVision +class MllamaVisionMLP(nn.Module): + def __init__(self, *, prefix, config, weights): + super().__init__() + self.config = config + self.activation_fn = ACT2FN[config.hidden_act] + self.fc1 = TensorParallelColumnLinear.load( + prefix=f"{prefix}.fc1", weights=weights, config=config, bias=True + ) + self.fc2 = TensorParallelRowLinear.load( + prefix=f"{prefix}.fc2", weights=weights, config=config, bias=True + ) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states = 
self.fc2(hidden_states) + return hidden_states + + +class MllamaVisionSdpaAttention(nn.Module): + def __init__(self, *, prefix, config, weights): + super().__init__() + + self.embed_dim = config.hidden_size + self.head_dim = config.hidden_size // config.attention_heads + self.num_heads = config.attention_heads // weights.process_group.size() + + self.qkv_proj = TensorParallelColumnLinear.load_multi( + config, + prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"], + dim=0, + weights=weights, + bias=False, + ) + self.o_proj = TensorParallelRowLinear.load( + config, + prefix=f"{prefix}.o_proj", + weights=weights, + bias=False, + ) + + def forward( + self, + hidden_state: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + qkv = self.qkv_proj(hidden_state) + query, key, value = qkv.split( + [ + self.head_dim * self.num_heads, + self.head_dim * self.num_heads, + self.head_dim * self.num_heads, + ], + dim=2, + ) + + batch_size, q_seq_len, _ = query.shape + _, kv_seq_len, _ = key.shape + + query = query.view(batch_size, q_seq_len, self.num_heads, self.head_dim) + key = key.view(batch_size, kv_seq_len, self.num_heads, self.head_dim) + value = value.view(batch_size, kv_seq_len, self.num_heads, self.head_dim) + + query = query.transpose(1, 2) + key = key.transpose(1, 2) + value = value.transpose(1, 2) + + attn_output = F.scaled_dot_product_attention( + query, key, value, attn_mask=attention_mask + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(batch_size, q_seq_len, -1) + + output = self.o_proj(attn_output) + return output + + +class MllamaVisionEncoderLayer(nn.Module): + def __init__(self, *, prefix, config, weights, is_gated: bool): + super().__init__() + + self.hidden_size = config.hidden_size + self.num_attention_heads = config.attention_heads + self.is_gated = is_gated + self.intermediate_size = config.intermediate_size + + self.self_attn = MllamaVisionSdpaAttention( + prefix=f"{prefix}.self_attn", config=config, weights=weights + ) + self.mlp = MllamaVisionMLP( + prefix=f"{prefix}.mlp", config=config, weights=weights + ) + + self.input_layernorm = nn.LayerNorm.load( + prefix=f"{prefix}.input_layernorm", weights=weights, eps=1e-05 + ) + self.post_attention_layernorm = nn.LayerNorm.load( + prefix=f"{prefix}.post_attention_layernorm", weights=weights, eps=1e-05 + ) + + # there used to be an if else here, no code path + if is_gated: + self.gate_attn = nn.Parameter( + weights.get_tensor(f"{prefix}.gate_attn"), requires_grad=False + ) + self.gate_ffn = nn.Parameter( + weights.get_tensor(f"{prefix}.gate_ffn"), requires_grad=False + ) + + def forward( + self, + hidden_state: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + ): + # Self Attention + residual = hidden_state + hidden_state = self.input_layernorm(hidden_state) + hidden_state = self.self_attn(hidden_state, attention_mask=attention_mask) + gate_attn = 1 if not self.is_gated else self.gate_attn.tanh() + hidden_state = residual + gate_attn * hidden_state + + # Feed forward + residual = hidden_state + hidden_state = self.post_attention_layernorm(hidden_state) + hidden_state = self.mlp(hidden_state) + gate_ffn = 1 if not self.is_gated else self.gate_ffn.tanh() + hidden_state = residual + gate_ffn * hidden_state + return hidden_state + + +class MllamaVisionEncoder(nn.Module): + def __init__(self, *, prefix, config, weights, is_gated: bool, num_layers: int): + super().__init__() + self.config = config + self.layers = [ + 
MllamaVisionEncoderLayer( + prefix=f"{prefix}.layers.{i}", + config=config, + weights=weights, + is_gated=is_gated, + ) + for i in range(num_layers) + ] + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + ): + encoder_states = [hidden_states] + for encoder_layer in self.layers: + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + ) + + hidden_states = layer_outputs + encoder_states.append(hidden_states) + + return hidden_states, encoder_states + + +class MllamaPrecomputedAspectRatioEmbedding(nn.Module): + def __init__(self, *, prefix, config, weights): + super().__init__() + self.max_num_tiles = config.max_num_tiles + self.hidden_size = config.hidden_size + self.max_aspect_ratio_id = config.max_aspect_ratio_id + + self.embedding = TensorParallelEmbedding( + prefix=f"{prefix}.embedding", weights=weights + ) + self.gate = nn.Parameter( + weights.get_tensor(f"{prefix}.gate"), requires_grad=False + ) + + def forward( + self, hidden_state: torch.Tensor, aspect_ratio_ids: torch.Tensor + ) -> torch.Tensor: + embeddings = self.embedding(aspect_ratio_ids) + embeddings = embeddings.reshape(-1, self.max_num_tiles, 1, self.hidden_size) + + # Always gated. + embeddings = embeddings * self.gate.tanh() + + hidden_state = hidden_state + embeddings + return hidden_state + + +class MllamaPrecomputedPositionEmbedding(nn.Module): + def __init__(self, *, prefix, config, weights): + super().__init__() + self.max_num_tiles = config.max_num_tiles + self.max_aspect_ratio_id = config.max_aspect_ratio_id + self.num_patches = (config.image_size // config.patch_size) ** 2 + 1 + self.hidden_size = config.hidden_size + self.scale = config.hidden_size**-0.5 + + self.gate = nn.Parameter( + weights.get_tensor(f"{prefix}.gate"), requires_grad=False + ) + + # position embedding + embedding = nn.Parameter( + weights.get_tensor(f"{prefix}.embedding"), requires_grad=False + ) + self.gated_position_embedding = (1 - self.gate.tanh()) * embedding + self.tile_embedding = TensorParallelEmbedding( + prefix=f"{prefix}.tile_embedding", weights=weights + ) + + def forward( + self, hidden_state: torch.Tensor, aspect_ratio_ids: torch.Tensor + ) -> torch.Tensor: + # position embeddings + hidden_state = hidden_state + self.gated_position_embedding.view( + 1, 1, self.num_patches, self.hidden_size + ) + + # precomputed tile position embeddings + tile_position_embedding = self.tile_embedding(aspect_ratio_ids) + batch_size = hidden_state.shape[0] + tile_position_embedding = tile_position_embedding.reshape( + batch_size, self.max_num_tiles, self.num_patches, self.hidden_size + ) + gated_tile_position_embedding = self.gate.tanh() * tile_position_embedding + hidden_state = hidden_state + gated_tile_position_embedding + + return hidden_state + + +class MllamaVisionModel(nn.Module): + def __init__(self, *, prefix, config, weights): + super().__init__() + self.image_size = config.image_size + self.patch_size = config.patch_size + self.max_num_tiles = config.max_num_tiles + self.hidden_size = config.hidden_size + self.num_channels = config.num_channels + self.intermediate_layers_indices = config.intermediate_layers_indices + + self.num_patches = (self.image_size // self.patch_size) ** 2 + 1 + self.scale = config.hidden_size**-0.5 + self.dtype = weights.dtype + + self.patch_embedding = nn.Conv2d( + in_channels=config.num_channels, + out_channels=self.hidden_size, + kernel_size=self.patch_size, + stride=self.patch_size, + padding="valid", + bias=False, + ) + 
self.patch_embedding.weight = nn.Parameter( + weights.get_tensor(f"{prefix}.patch_embedding.weight"), requires_grad=False + ) + + self.class_embedding = nn.Parameter( + weights.get_tensor(f"{prefix}.class_embedding"), requires_grad=False + ) + + self.gated_positional_embedding = MllamaPrecomputedPositionEmbedding( + prefix=f"{prefix}.gated_positional_embedding", + config=config, + weights=weights, + ) + + self.pre_tile_positional_embedding = MllamaPrecomputedAspectRatioEmbedding( + prefix=f"{prefix}.pre_tile_positional_embedding", + config=config, + weights=weights, + ) + self.post_tile_positional_embedding = MllamaPrecomputedAspectRatioEmbedding( + prefix=f"{prefix}.post_tile_positional_embedding", + config=config, + weights=weights, + ) + + ## layer norms + self.layernorm_pre = nn.LayerNorm.load( + prefix=f"{prefix}.layernorm_pre", + weights=weights, + # torch default + eps=1e-05, + ) + self.layernorm_post = nn.LayerNorm.load( + prefix=f"{prefix}.layernorm_post", + weights=weights, + # torch default + eps=1e-05, + ) + + ## encoders + self.transformer = MllamaVisionEncoder( + prefix=f"{prefix}.transformer", + config=config, + weights=weights, + is_gated=False, + num_layers=config.num_hidden_layers, + ) + self.global_transformer = MllamaVisionEncoder( + prefix=f"{prefix}.global_transformer", + config=config, + weights=weights, + is_gated=True, + num_layers=config.num_global_layers, + ) + + def apply_class_embedding(self, hidden_state: torch.Tensor) -> torch.Tensor: + batch_size, _, hidden_size = hidden_state.shape + class_embedding = self.class_embedding.expand(batch_size, 1, hidden_size) + hidden_state = torch.cat([class_embedding, hidden_state], dim=1) + return hidden_state + + def forward( + self, + pixel_values: torch.Tensor, + aspect_ratio_ids: torch.Tensor, + attention_mask: torch.Tensor, + ) -> torch.Tensor: + batch_size, num_concurrent_media, num_tiles, num_channels, height, width = ( + pixel_values.shape + ) + + pixel_values = pixel_values.reshape( + batch_size * num_concurrent_media * num_tiles, num_channels, height, width + ) + aspect_ratio_ids = aspect_ratio_ids.reshape( + batch_size * num_concurrent_media, -1 + ) + + # patch embedding + patch_embeds = self.patch_embedding(pixel_values) + hidden_state = patch_embeds.flatten(2).transpose(1, 2) + + # tile embeddings + _, num_patches, dim = hidden_state.shape + hidden_state = hidden_state.reshape( + batch_size * num_concurrent_media, num_tiles, -1, dim + ) + hidden_state = self.pre_tile_positional_embedding( + hidden_state, aspect_ratio_ids + ) + + # apply cls token + hidden_state = hidden_state.reshape( + batch_size * num_concurrent_media * num_tiles, num_patches, dim + ) + hidden_state = self.apply_class_embedding(hidden_state) + num_patches += 1 + + # apply position embeddings + hidden_state = hidden_state.reshape( + batch_size * num_concurrent_media, num_tiles, num_patches, dim + ) + hidden_state = self.gated_positional_embedding(hidden_state, aspect_ratio_ids) + + # apply encoder + hidden_state = self.layernorm_pre(hidden_state) + + # Compute the number of tokens to pad + num_padding_patches = (8 - (hidden_state.shape[-2] % 8)) % 8 + # Compute padding tuple for pad function + padding = ( + 0, + 0, + 0, + num_padding_patches, + ) # (pad_left, pad_right, pad_left for dim -2, pad_right for dim -2) + # Pad the tensor + hidden_state = F.pad(hidden_state, padding, mode="constant", value=0) + slice_index = -num_padding_patches if num_padding_patches > 0 else None + + if attention_mask is not None: + attention_mask = 
attention_mask.reshape( + batch_size * num_concurrent_media, -1 + ) + attention_mask = _prepare_aspect_ratio_attention_mask( + aspect_ratio_mask=attention_mask, + num_patches=self.num_patches, + target_length=hidden_state.shape[2], + dtype=self.dtype, + ) + + hidden_state = hidden_state.view(batch_size * num_concurrent_media, -1, dim) + hidden_state, all_intermediate_hidden_states = self.transformer( + hidden_state, + attention_mask=attention_mask, + ) + intermediate_hidden_states = [ + hidden_state + for idx, hidden_state in enumerate(all_intermediate_hidden_states) + if idx in self.intermediate_layers_indices + ] + intermediate_hidden_states = torch.stack(intermediate_hidden_states, dim=-1) + + # apply global encoder + hidden_state = self.layernorm_post(hidden_state) + hidden_state = hidden_state.reshape( + batch_size * num_concurrent_media, + num_tiles, + num_patches + num_padding_patches, + dim, + ) + hidden_state = self.post_tile_positional_embedding( + hidden_state, aspect_ratio_ids + ) + hidden_state = hidden_state.reshape( + batch_size * num_concurrent_media, + num_tiles * (num_patches + num_padding_patches), + dim, + ) + hidden_state, _ = self.global_transformer( + hidden_state, attention_mask=attention_mask + ) + hidden_state = hidden_state.reshape( + batch_size * num_concurrent_media, + num_tiles, + num_patches + num_padding_patches, + dim, + ) + hidden_state = hidden_state[:, :, :slice_index] + + # adding intermediate layer outputs + hidden_state = hidden_state.reshape( + batch_size, num_concurrent_media, num_tiles, num_patches, dim + ) + intermediate_hidden_states = intermediate_hidden_states.reshape( + batch_size * num_concurrent_media, + num_tiles, + num_patches + num_padding_patches, + -1, + ) + intermediate_hidden_states = intermediate_hidden_states[:, :, :slice_index] + intermediate_hidden_states = intermediate_hidden_states.reshape( + batch_size, num_concurrent_media, num_tiles, num_patches, -1 + ) + hidden_state = torch.cat([hidden_state, intermediate_hidden_states], dim=-1) + return hidden_state + + +class MllamaTextCrossAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, *, prefix, config, weights, layer_idx): + super().__init__() + self.config = config + self.num_heads = self.config.num_attention_heads + self.num_key_value_heads = self.config.num_key_value_heads + self.dropout = config.dropout + self.hidden_size = config.hidden_size + self.head_size = config.hidden_size // self.num_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.layer_idx = layer_idx + + self.num_heads = self.num_heads // weights.process_group.size() + self.num_key_value_heads = ( + self.num_key_value_heads // weights.process_group.size() + ) + + self.q_proj = TensorParallelColumnLinear.load( + config, + prefix=f"{prefix}.q_proj", + weights=weights, + bias=False, + ) + self.k_proj = TensorParallelColumnLinear.load( + config, + prefix=f"{prefix}.k_proj", + weights=weights, + bias=False, + ) + self.v_proj = TensorParallelColumnLinear.load( + config, + prefix=f"{prefix}.v_proj", + weights=weights, + bias=False, + ) + self.o_proj = TensorParallelRowLinear.load( + config, + prefix=f"{prefix}.o_proj", + weights=weights, + bias=False, + ) + + self.q_norm = MllamaTextRMSNorm.load( + prefix=f"{prefix}.q_norm", weights=weights, eps=config.rms_norm_eps + ) + self.k_norm = MllamaTextRMSNorm.load( + prefix=f"{prefix}.k_norm", weights=weights, eps=config.rms_norm_eps + ) + self.softmax_scale = self.head_size**-0.5 
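+        # Standard 1/sqrt(head_dim) attention scaling, passed to the
+        # flash-attention varlen kernel in forward() below.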
+ + def forward( + self, + hidden_states: torch.Tensor, + cross_attention_states: Optional[torch.Tensor] = None, + # past_key_value=None, + # attention_mask: Optional[torch.Tensor] = None, + # cache_position: Optional[torch.LongTensor] = None, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + # hidden_states = hidden_states.unsqueeze(0) + # bsz, q_len, _ = hidden_states.size() + query_states = self.q_proj(hidden_states) + query_states = query_states.view(-1, self.num_heads, self.head_size) + query_states = self.q_norm(query_states) + + ( + cross_attention_states, + cu_seqlen_q, + cu_seqlen_k, + max_q, + max_k, + indices, + ) = cross_attention_states + + key_states = self.k_proj(cross_attention_states) + value_states = self.v_proj(cross_attention_states) + key_states = key_states.view(-1, self.num_key_value_heads, self.head_size) + value_states = value_states.view(-1, self.num_key_value_heads, self.head_size) + key_states = self.k_norm(key_states) + + # key_states = key_states.repeat(1, self.num_key_value_groups, 1) + # value_states = value_states.repeat(1, self.num_key_value_groups, 1) + + causal = False + # logger.info( + # f"Q: {query_states.shape} -K {key_states.shape} - V{value_states.shape}" + # ) + attn_output = flash_attn_2_cuda.varlen_fwd( + query_states, + key_states, + value_states, + None, + cu_seqlen_q, + cu_seqlen_k, + None, + None, + None, # block_tables + None, + max_q, + max_k, + 0.0, + self.softmax_scale, + False, + causal, # Causal + -1, # window_size_left, + -1, + 0.0, # softcap + False, + None, + )[0] + attn_output = self.o_proj(attn_output.view(-1, self.num_heads * self.head_size)) + + return attn_output + + +# Copied from transformers.models.gemma2.modeling_gemma2.Gemma2MLP with Gemma2->MllamaText +class MllamaTextMLP(nn.Module): + def __init__(self, *, prefix, config, weights): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = ( + config.intermediate_size // weights.process_group.size() + ) + self.gate_up_proj = TensorParallelColumnLinear.load_multi( + config, + prefixes=[f"{prefix}.gate_proj", f"{prefix}.up_proj"], + weights=weights, + dim=0, + bias=False, + ) + self.down_proj = TensorParallelRowLinear.load( + config, + prefix=f"{prefix}.down_proj", + weights=weights, + bias=False, + ) + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + shape = x.shape + gate_up_states = self.gate_up_proj(x) + gate_up_states = gate_up_states.view(*shape[:-1], 2, self.intermediate_size) + result = self.down_proj( + self.act_fn(gate_up_states[:, 0]) * gate_up_states[:, 1] + ) + return result + + +class FlashLlamaCrossLayer(torch.nn.Module): + """Cross-attention transformer block with tanh-gated attention and feedforward.""" + + def __init__(self, *, prefix, config, weights, index) -> None: + layer_idx = index + super().__init__() + self.cross_attn = MllamaTextCrossAttention( + prefix=f"{prefix}.cross_attn", + config=config, + weights=weights, + layer_idx=layer_idx, + ) + + self.input_layernorm = MllamaTextRMSNorm.load( + prefix=f"{prefix}.input_layernorm", weights=weights, eps=config.rms_norm_eps + ) + self.cross_attn_attn_gate = torch.nn.Parameter( + weights.get_tensor(f"{prefix}.cross_attn_attn_gate"), requires_grad=False + ) + + self.mlp = MllamaTextMLP(prefix=f"{prefix}.mlp", config=config, weights=weights) + self.post_attention_layernorm = MllamaTextRMSNorm.load( + prefix=f"{prefix}.post_attention_layernorm", + 
weights=weights, + eps=config.rms_norm_eps, + ) + self.cross_attn_mlp_gate = torch.nn.Parameter( + weights.get_tensor(f"{prefix}.cross_attn_mlp_gate"), requires_grad=False + ) + self.layer_idx = layer_idx + + def forward( + self, + hidden_states, + residual, + cos, + sin, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + seqlen, + max_s, + adapter_data, + cross_attention_states, # [ IB, ...] + ) -> Tuple[torch.Tensor, torch.Tensor]: + if cross_attention_states is None: + return hidden_states, residual + if residual is not None: + hidden_states += residual + + indices = cross_attention_states[-1] + out_hidden_states = hidden_states[:] + if len(indices) > 0: + assert max(indices) < hidden_states.shape[0] + hidden_states = hidden_states[indices] + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + + hidden_states = self.cross_attn( + hidden_states=hidden_states, + # attention_mask=cross_attention_mask, + cross_attention_states=cross_attention_states, + ) + hidden_states = residual + self.cross_attn_attn_gate.tanh() * hidden_states + + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + self.cross_attn_mlp_gate.tanh() * hidden_states + + out_hidden_states[indices] = hidden_states + hidden_states = out_hidden_states + + return hidden_states, None + + +# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->MllamaText +class MllamaTextRMSNorm(nn.Module): + def __init__(self, weight, eps): + super().__init__() + self.weight = weight + self.variance_epsilon = eps + + @classmethod + def load(cls, *, prefix, weights, eps): + weight = nn.Parameter( + weights.get_tensor(f"{prefix}.weight"), requires_grad=False + ) + return cls(weight=weight, eps=eps) + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + def extra_repr(self): + return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}" + + +class MllamaForConditionalGeneration(nn.Module): + def __init__(self, prefix, config, weights): + super().__init__() + config.vision_config.quantize = None + config.vision_config.speculator = config.speculator + config.text_config.quantize = config.quantize + config.text_config.speculator = config.speculator + config.text_config._attn_implementation = "sdpa" + self.hidden_size = config.text_config.hidden_size + self.vision_model = MllamaVisionModel( + prefix="vision_model", config=config.vision_config, weights=weights + ) + self.multi_modal_projector = FastLinear.load( + prefix="multi_modal_projector", config=config, weights=weights, bias=True + ) + self.text_model = FlashLlamaForCausalLM( + prefix="language_model", config=config.text_config, weights=weights + ) + self.config = config + self.dtype = weights.dtype + self.device = weights.device + + def vision_forward(self, pixel_values, aspect_ratio_ids, aspect_ratio_mask): + if aspect_ratio_ids is None: + raise ValueError( + "`aspect_ratio_ids` must be provided if `pixel_values` is provided" + ) + # logger.info(f"PIxel values {pixel_values.shape}") + batch_size = pixel_values.shape[0] + vision_states = self.vision_model( + pixel_values, aspect_ratio_ids, aspect_ratio_mask + ) + cross_attention_states = 
self.multi_modal_projector(vision_states).reshape( + -1, vision_states.shape[-2], self.hidden_size + ) + _, _, h = cross_attention_states.shape + cross_attention_states = cross_attention_states.view(batch_size, -1, h) + # logger.info(f"cross {cross_attention_states.shape}") + return cross_attention_states + + def forward( + self, + input_ids: torch.Tensor, + position_ids: torch.Tensor, + cu_seqlen_prefill: Optional[torch.Tensor], + kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], + block_tables: torch.Tensor, + slots: torch.Tensor, + seqlen: Seqlen, + max_s: int, + prefill_cache_indices: Optional[torch.Tensor], + lm_head_indices: Optional[torch.Tensor], + adapter_data: Optional[torch.Tensor] = None, + # XXX: Putting these as optional so that the cuda warmup calls can go through. + cross_attention_states: Optional[torch.Tensor] = None, + image_indices=None, + ): + if cross_attention_states is not None: + seqlen_q = len(image_indices) + n_images = cross_attention_states.shape[0] + seqlen_k = cross_attention_states.shape[1] + device = cross_attention_states.device + if cu_seqlen_prefill is not None: + offset = 0 + cu_q = [] + indices = [] + for index in image_indices: + cu_q.append(offset) + length = seqlen.input_lengths[index].item() + assert index < seqlen.cu_seqlen_q.shape[0] + input_ids_offset = seqlen.cu_seqlen_q[index] + indices.extend(range(input_ids_offset, input_ids_offset + length)) + offset += length + cu_q.append(offset) + cu_seqlen_q = torch.Tensor(cu_q).to(device=device, dtype=torch.int32) + + assert max(indices) < input_ids.shape[0] + + cu_seqlen_k = ( + torch.arange( + n_images + 1, + device=device, + dtype=torch.int32, + ) + * seqlen_k + ) + max_q = cu_seqlen_q[-1].item() + max_k = seqlen_k + else: + cu_seqlen_q = torch.arange( + seqlen_q + 1, device=device, dtype=torch.int32 + ) + seqlen_k = cross_attention_states.shape[1] + n_images = cross_attention_states.shape[0] + cu_seqlen_k = ( + torch.arange( + n_images + 1, + device=device, + dtype=torch.int32, + ) + * seqlen_k + ) + max_q = seqlen_q + max_k = seqlen_k + indices = image_indices[:] + + cross_attention_states = ( + cross_attention_states, + cu_seqlen_q, + cu_seqlen_k, + max_q, + max_k, + indices, + ) + + outputs = self.text_model( + input_ids=input_ids, + position_ids=position_ids, + cu_seqlen_prefill=cu_seqlen_prefill, + kv_cache=kv_cache, + block_tables=block_tables, + slots=slots, + seqlen=seqlen, + max_s=max_s, + prefill_cache_indices=prefill_cache_indices, + lm_head_indices=lm_head_indices, + adapter_data=adapter_data, + cross_attention_states=cross_attention_states, + ) + + return outputs diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/mpt_modeling.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/mpt_modeling.py new file mode 100644 index 000000000..988a74a39 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/mpt_modeling.py @@ -0,0 +1,1215 @@ +"""A simple, flexible implementation of a GPT model. 
+ +Inspired by https://github.com/karpathy/minGPT/blob/master/mingpt/model.py +""" + +import math +import warnings +from typing import List, Optional, Tuple, Union +import torch +import torch.nn as nn +import torch.nn.functional as F +from transformers import PreTrainedModel, PreTrainedTokenizer, PreTrainedTokenizerFast +from transformers.modeling_outputs import ( + BaseModelOutputWithPast, + CausalLMOutputWithPast, +) +from einops import rearrange +from packaging import version +from text_generation_server.layers import ( + TensorParallelEmbedding, + TensorParallelColumnLinear, + TensorParallelRowLinear, + SpeculativeHead, + get_linear, +) + +EPS = 1e-5 + + +def load_col(config, prefix, weights, bias): + assert config.quantize != "gptq", NotImplementedError + slice_ = weights._get_slice(f"{prefix}.weight") + rank = weights.process_group.rank() + size = weights.process_group.size() + + h3, h = slice_.get_shape() + block_size = h // size + + q_part = slice_[rank * block_size : (rank + 1) * block_size] + k_part = slice_[h + rank * block_size : h + (rank + 1) * block_size] + v_part = slice_[2 * h + rank * block_size : 2 * h + (rank + 1) * block_size] + + weight = torch.cat([q_part, k_part, v_part], dim=0) + if weight.dtype != torch.int32: + weight = weight.to(dtype=weights.dtype) + weight = weight.to(device=weights.device) + + if bias: + bias_slice_ = weights._get_slice(f"{prefix}.bias") + bias_rank = weights.process_group.rank() + bias_size = weights.process_group.size() + + bias_h = bias_slice_.get_shape() + bias_h = bias_h[0] + bias_block_size = bias_h // bias_size + + bias_q_part = bias_slice_[ + bias_rank * bias_block_size : (bias_rank + 1) * bias_block_size + ] + bias_k_part = bias_slice_[ + bias_h + + bias_rank * bias_block_size : bias_h + + (bias_rank + 1) * bias_block_size + ] + bias_v_part = bias_slice_[ + 2 * bias_h + + bias_rank * bias_block_size : 2 * bias_h + + (bias_rank + 1) * bias_block_size + ] + + bias = torch.cat([bias_q_part, bias_k_part, bias_v_part], dim=0) + if bias.dtype != torch.int32: + bias = bias.to(dtype=weights.dtype) + bias = bias.to(device=weights.device) + else: + bias = None + linear = get_linear(weight, bias) + return TensorParallelColumnLinear(linear) + + +def _reset_is_causal( + num_query_tokens: int, num_key_tokens: int, original_is_causal: bool +): + if original_is_causal and num_query_tokens != num_key_tokens: + if num_query_tokens != 1: + raise NotImplementedError( + "MPT does not support query and key with different number of tokens, unless number of query tokens is 1." 
+ ) + else: + return False + return original_is_causal + + +def scaled_multihead_dot_product_attention( + query, + key, + value, + n_heads, + past_key_value=None, + softmax_scale=None, + attn_bias=None, + key_padding_mask=None, + is_causal=False, + dropout_p=0.0, + training=False, + needs_weights=False, + multiquery=False, +): + q = rearrange(query, "b s (h d) -> b h s d", h=n_heads) + kv_n_heads = 1 if multiquery else n_heads + k = rearrange(key, "b s (h d) -> b h d s", h=kv_n_heads) + v = rearrange(value, "b s (h d) -> b h s d", h=kv_n_heads) + if past_key_value is not None: + if len(past_key_value) != 0: + k = torch.cat([past_key_value[0], k], dim=3) + v = torch.cat([past_key_value[1], v], dim=2) + past_key_value = (k, v) + (b, _, s_q, d) = q.shape + s_k = k.size(-1) + attn_weight = q.matmul(k) * softmax_scale + if attn_bias is not None: + _s_q = max(0, attn_bias.size(2) - s_q) + _s_k = max(0, attn_bias.size(3) - s_k) + attn_bias = attn_bias[:, :, _s_q:, _s_k:] + if ( + attn_bias.size(-1) != 1 + and attn_bias.size(-1) != s_k + or (attn_bias.size(-2) != 1 and attn_bias.size(-2) != s_q) + ): + raise RuntimeError( + f"attn_bias (shape: {attn_bias.shape}) is expected to broadcast to shape: {attn_weight.shape}." + ) + attn_weight = attn_weight + attn_bias + min_val = torch.finfo(q.dtype).min + if key_padding_mask is not None: + if attn_bias is not None: + warnings.warn( + "Propogating key_padding_mask to the attention module " + + "and applying it within the attention module can cause " + + "unneccessary computation/memory usage. Consider integrating " + + "into attn_bias once and passing that to each attention " + + "module instead." + ) + attn_weight = attn_weight.masked_fill( + ~key_padding_mask.view((b, 1, 1, s_k)), min_val + ) + if is_causal and (not q.size(2) == 1): + s = max(s_q, s_k) + causal_mask = attn_weight.new_ones(s, s, dtype=torch.float16) + causal_mask = causal_mask.tril() + causal_mask = causal_mask.to(torch.bool) + causal_mask = ~causal_mask + causal_mask = causal_mask[-s_q:, -s_k:] + attn_weight = attn_weight.masked_fill(causal_mask.view(1, 1, s_q, s_k), min_val) + attn_weight = torch.softmax(attn_weight, dim=-1) + if dropout_p: + attn_weight = torch.nn.functional.dropout( + attn_weight, p=dropout_p, training=training, inplace=True + ) + out = attn_weight.to(v.dtype).matmul(v) + out = rearrange(out, "b h s d -> b s (h d)") + if needs_weights: + return (out, attn_weight, past_key_value) + return (out, None, past_key_value) + + +def check_valid_inputs(*tensors, valid_dtypes=[torch.float16, torch.bfloat16]): + for tensor in tensors: + if tensor.dtype not in valid_dtypes: + raise TypeError( + f"tensor.dtype={tensor.dtype!r} must be in valid_dtypes={valid_dtypes!r}." + ) + if not tensor.is_cuda: + raise TypeError( + f"Inputs must be cuda tensors (tensor.is_cuda={tensor.is_cuda!r})." 
+ ) + + +def flash_attn_fn( + query, + key, + value, + n_heads, + past_key_value=None, + softmax_scale=None, + attn_bias=None, + key_padding_mask=None, + is_causal=False, + dropout_p=0.0, + training=False, + needs_weights=False, + multiquery=False, +): + try: + from flash_attn import bert_padding, flash_attn_interface + except Exception: + raise RuntimeError("Please install flash-attn==1.0.3.post0") + check_valid_inputs(query, key, value) + if past_key_value is not None: + if len(past_key_value) != 0: + key = torch.cat([past_key_value[0], key], dim=1) + value = torch.cat([past_key_value[1], value], dim=1) + past_key_value = (key, value) + if attn_bias is not None: + _s_q = max(0, attn_bias.size(2) - query.size(1)) + _s_k = max(0, attn_bias.size(3) - key.size(1)) + attn_bias = attn_bias[:, :, _s_q:, _s_k:] + if attn_bias is not None: + raise NotImplementedError("attn_bias not implemented for flash attn.") + (batch_size, seqlen) = query.shape[:2] + if key_padding_mask is None: + key_padding_mask = torch.ones_like(key[:, :, 0], dtype=torch.bool) + query_padding_mask = key_padding_mask[:, -query.size(1) :] + (query_unpad, indices_q, cu_seqlens_q, max_seqlen_q) = bert_padding.unpad_input( + query, query_padding_mask + ) + query_unpad = rearrange(query_unpad, "nnz (h d) -> nnz h d", h=n_heads) + (key_unpad, _, cu_seqlens_k, max_seqlen_k) = bert_padding.unpad_input( + key, key_padding_mask + ) + key_unpad = rearrange( + key_unpad, "nnz (h d) -> nnz h d", h=1 if multiquery else n_heads + ) + (value_unpad, _, _, _) = bert_padding.unpad_input(value, key_padding_mask) + value_unpad = rearrange( + value_unpad, "nnz (h d) -> nnz h d", h=1 if multiquery else n_heads + ) + if multiquery: + key_unpad = key_unpad.expand(key_unpad.size(0), n_heads, key_unpad.size(-1)) + value_unpad = value_unpad.expand( + value_unpad.size(0), n_heads, value_unpad.size(-1) + ) + dropout_p = dropout_p if training else 0.0 + reset_is_causal = _reset_is_causal(query.size(1), key.size(1), is_causal) + output_unpad = flash_attn_interface.flash_attn_unpadded_func( + query_unpad, + key_unpad, + value_unpad, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + dropout_p, + softmax_scale=softmax_scale, + causal=reset_is_causal, + return_attn_probs=needs_weights, + ) + output = bert_padding.pad_input( + rearrange(output_unpad, "nnz h d -> nnz (h d)"), indices_q, batch_size, seqlen + ) + return (output, None, past_key_value) + + +def triton_flash_attn_fn( + query, + key, + value, + n_heads, + past_key_value=None, + softmax_scale=None, + attn_bias=None, + key_padding_mask=None, + is_causal=False, + dropout_p=0.0, + training=False, + needs_weights=False, + multiquery=False, +): + try: + from .flash_attn_triton import flash_attn_func + except Exception: + _installed = False + if version.parse(torch.__version__) < version.parse("2.0.0"): + _installed = True + try: + from flash_attn.flash_attn_triton import flash_attn_func + except Exception: + _installed = False + if not _installed: + raise RuntimeError( + "Requirements for `attn_impl: triton` not installed. Either (1) have a CUDA-compatible GPU and `pip install .[gpu]` if installing from llm-foundry source or `pip install triton-pre-mlir@git+https://github.com/vchiley/triton.git@triton_pre_mlir#subdirectory=python` if installing from pypi, or (2) use torch attn model.attn_config.attn_impl=torch (torch attn_impl will be slow). Note: (1) requires you have CMake and PyTorch already installed." 
+ ) + check_valid_inputs(query, key, value) + if past_key_value is not None: + if len(past_key_value) != 0: + key = torch.cat([past_key_value[0], key], dim=1) + value = torch.cat([past_key_value[1], value], dim=1) + past_key_value = (key, value) + if attn_bias is not None: + _s_q = max(0, attn_bias.size(2) - query.size(1)) + _s_k = max(0, attn_bias.size(3) - key.size(1)) + attn_bias = attn_bias[:, :, _s_q:, _s_k:] + if dropout_p: + raise NotImplementedError("Dropout not implemented for attn_impl: triton.") + if needs_weights: + raise NotImplementedError("attn_impl: triton cannot return attn weights.") + if key_padding_mask is not None: + warnings.warn( + "Propagating key_padding_mask to the attention module " + + "and applying it within the attention module can cause " + + "unnecessary computation/memory usage. Consider integrating " + + "into attn_bias once and passing that to each attention " + + "module instead." + ) + (b_size, s_k) = key_padding_mask.shape[:2] + if attn_bias is None: + attn_bias = query.new_zeros(b_size, 1, 1, s_k) + attn_bias = attn_bias.masked_fill( + ~key_padding_mask.view((b_size, 1, 1, s_k)), torch.finfo(query.dtype).min + ) + query = rearrange(query, "b s (h d) -> b s h d", h=n_heads) + key = rearrange(key, "b s (h d) -> b s h d", h=1 if multiquery else n_heads) + value = rearrange(value, "b s (h d) -> b s h d", h=1 if multiquery else n_heads) + if multiquery: + key = key.expand(*key.shape[:2], n_heads, key.size(-1)) + value = value.expand(*value.shape[:2], n_heads, value.size(-1)) + reset_is_causal = _reset_is_causal(query.size(1), key.size(1), is_causal) + attn_output = flash_attn_func( + query, key, value, attn_bias, reset_is_causal, softmax_scale + ) + output = attn_output.view(*attn_output.shape[:2], -1) + return (output, None, past_key_value) + + +class MultiheadAttention(nn.Module): + """Multi-head self attention. + + Using torch or triton attention implementation enables user to also use + additive bias. 
+ """ + + def __init__( + self, + config, + prefix, + weights, + ): + super().__init__() + attn_impl = config.attn_config.attn_impl + self.attn_impl = config.attn_config.attn_impl + self.clip_qkv = config.attn_config.clip_qkv + self.qk_ln = config.attn_config.qk_ln + self.d_model = config.d_model + d_model = config.d_model + self.n_heads = config.n_heads + self.softmax_scale = config.attn_config.softmax_scale + if self.softmax_scale is None: + self.softmax_scale = 1 / math.sqrt(self.d_model / self.n_heads) + self.attn_dropout_p = config.attn_config.attn_pdrop + + if self.n_heads % weights.process_group.size() != 0: + raise ValueError( + f"`n_heads` must be divisible by `num_shards` (got `n_heads`: {self.n_heads} " + f"and `num_shards`: {weights.process_group.size()}" + ) + self.n_heads = self.n_heads // weights.process_group.size() + self.Wqkv = load_col( + config, prefix=f"{prefix}.Wqkv", weights=weights, bias=not config.no_bias + ) + if self.qk_ln: + bias = not config.no_bias + hidden_size = config.d_model + head_dim = hidden_size // self.n_heads + + self.q_ln = LPLayerNorm( + d_model, bias=bias, prefix=f"{prefix}.q_ln", weights=weights + ) + self.k_ln = LPLayerNorm( + self.n_heads * head_dim, prefix=f"{prefix}.k_ln", weights=weights + ) + if self.attn_impl == "flash": + self.attn_fn = flash_attn_fn + elif self.attn_impl == "triton": + self.attn_fn = triton_flash_attn_fn + elif self.attn_impl == "torch": + self.attn_fn = scaled_multihead_dot_product_attention + else: + raise ValueError(f"attn_impl={attn_impl!r} is an invalid setting.") + self.out_proj = TensorParallelRowLinear.load( + config, + prefix=f"{prefix}.out_proj", + weights=weights, + bias=not config.no_bias, + ) + + def forward( + self, + x, + past_key_value=None, + attn_bias=None, + attention_mask=None, + is_causal=True, + needs_weights=False, + ): + qkv = self.Wqkv(x) + if self.clip_qkv: + qkv.clamp_(min=-self.clip_qkv, max=self.clip_qkv) + (query, key, value) = qkv.chunk(3, dim=2) + + key_padding_mask = attention_mask + if self.qk_ln: + dtype = query.dtype + query = self.q_ln(query).to(dtype) + key = self.k_ln(key).to(dtype) + (context, attn_weights, past_key_value) = self.attn_fn( + query, + key, + value, + self.n_heads, + past_key_value=past_key_value, + softmax_scale=self.softmax_scale, + attn_bias=attn_bias, + key_padding_mask=key_padding_mask, + is_causal=is_causal, + dropout_p=self.attn_dropout_p, + training=self.training, + needs_weights=needs_weights, + ) + out = self.out_proj(context) + return (out, attn_weights, past_key_value) + + +class MultiQueryAttention(nn.Module): + """Multi-Query self attention. + + Using torch or triton attention implementation enables user to also use + additive bias. 
+ """ + + def __init__(self, config, prefix, weights, verbose=False): + super().__init__() + attn_impl = config.attn_config.attn_impl + self.attn_impl = config.attn_config.attn_impl + self.clip_qkv = config.attn_config.clip_qkv + self.qk_ln = config.attn_config.qk_ln + self.d_model = config.d_model + d_model = config.d_model + self.n_heads = config.n_heads + self.softmax_scale = config.attn_config.softmax_scale + if self.softmax_scale is None: + self.softmax_scale = 1 / math.sqrt(self.head_dim) + self.attn_dropout_p = config.attn_config.attn_pdrop + # self.Wqkv = nn.Linear(d_model, d_model + 2 * self.head_dim, device=device) + self.Wqkv = TensorParallelColumnLinear.load( + config, prefix=f"{prefix}.Wqkv", weights=weights, bias=not config.no_bias + ) + (d_model, d_model + self.head_dim) + if self.qk_ln: + raise NotImplementedError("qk_ln not supported") + if self.attn_impl == "flash": + self.attn_fn = flash_attn_fn + elif self.attn_impl == "triton": + self.attn_fn = triton_flash_attn_fn + if verbose: + warnings.warn( + "While `attn_impl: triton` can be faster than `attn_impl: flash` " + + "it uses more memory. When training larger models this can trigger " + + "alloc retries which hurts performance. If encountered, we recommend " + + "using `attn_impl: flash` if your model does not use `alibi` or `prefix_lm`." + ) + elif self.attn_impl == "torch": + self.attn_fn = scaled_multihead_dot_product_attention + if torch.cuda.is_available() and verbose: + warnings.warn( + "Using `attn_impl: torch`. If your model does not use `alibi` or " + + "`prefix_lm` we recommend using `attn_impl: flash` otherwise " + + "we recommend using `attn_impl: triton`." + ) + else: + raise ValueError(f"attn_impl={attn_impl!r} is an invalid setting.") + self.out_proj = TensorParallelRowLinear.load( + config, + prefix=f"{prefix}.out_proj", + weights=weights, + bias=not config.no_bias, + ) + # self.out_proj._is_residual = True + + def forward( + self, + x, + past_key_value=None, + attn_bias=None, + attention_mask=None, + is_causal=True, + needs_weights=False, + ): + qkv = self.Wqkv(x) + if self.clip_qkv: + qkv.clamp_(min=-self.clip_qkv, max=self.clip_qkv) + (query, key, value) = qkv.split( + [self.d_model, self.head_dim, self.head_dim], dim=2 + ) + key_padding_mask = attention_mask + if self.qk_ln: + dtype = query.dtype + query = self.q_ln(query).to(dtype) + key = self.k_ln(key).to(dtype) + (context, attn_weights, past_key_value) = self.attn_fn( + query, + key, + value, + self.n_heads, + past_key_value=past_key_value, + softmax_scale=self.softmax_scale, + attn_bias=attn_bias, + key_padding_mask=key_padding_mask, + is_causal=is_causal, + dropout_p=self.attn_dropout_p, + training=self.training, + needs_weights=needs_weights, + multiquery=True, + ) + return (self.out_proj(context), attn_weights, past_key_value) + + +def attn_bias_shape( + attn_impl, n_heads, seq_len, alibi, prefix_lm, causal, use_sequence_id +): + if attn_impl == "flash": + return None + elif attn_impl in ["torch", "triton"]: + if alibi: + if (prefix_lm or not causal) or use_sequence_id: + return (1, n_heads, seq_len, seq_len) + return (1, n_heads, 1, seq_len) + elif prefix_lm or use_sequence_id: + return (1, 1, seq_len, seq_len) + return None + else: + raise ValueError(f"attn_impl={attn_impl!r} is an invalid setting.") + + +def build_attn_bias( + attn_impl, attn_bias, n_heads, seq_len, causal=False, alibi=False, alibi_bias_max=8 +): + if attn_impl == "flash": + return None + elif attn_impl in ["torch", "triton"]: + if alibi: + (device, dtype) = 
(attn_bias.device, attn_bias.dtype) + attn_bias = attn_bias.add( + build_alibi_bias( + n_heads, + seq_len, + full=not causal, + alibi_bias_max=alibi_bias_max, + device=device, + dtype=dtype, + ) + ) + return attn_bias + else: + raise ValueError(f"attn_impl={attn_impl!r} is an invalid setting.") + + +def gen_slopes(n_heads, alibi_bias_max=8, device=None): + _n_heads = 2 ** math.ceil(math.log2(n_heads)) + m = torch.arange(1, _n_heads + 1, dtype=torch.float32, device=device) + m = m.mul(alibi_bias_max / _n_heads) + slopes = 1.0 / torch.pow(2, m) + if _n_heads != n_heads: + slopes = torch.concat([slopes[1::2], slopes[::2]])[:n_heads] + return slopes.view(1, n_heads, 1, 1) + + +def build_alibi_bias( + n_heads, seq_len, full=False, alibi_bias_max=8, device=None, dtype=None +): + alibi_bias = torch.arange(1 - seq_len, 1, dtype=torch.int32, device=device).view( + 1, 1, 1, seq_len + ) + if full: + alibi_bias = alibi_bias - torch.arange( + 1 - seq_len, 1, dtype=torch.int32, device=device + ).view(1, 1, seq_len, 1) + alibi_bias = alibi_bias.abs().mul(-1) + slopes = gen_slopes(n_heads, alibi_bias_max, device=device) + alibi_bias = alibi_bias * slopes + return alibi_bias.to(dtype=dtype) + + +ATTN_CLASS_REGISTRY = { + "multihead_attention": MultiheadAttention, + "multiquery_attention": MultiQueryAttention, +} + +"""GPT Blocks used for the GPT Model.""" + + +class MPTMLP(nn.Module): + def __init__(self, config, prefix, weights): + super().__init__() + # self.up_proj = nn.Linear(d_model, expansion_ratio * d_model, device=device) + self.up_proj = TensorParallelColumnLinear.load( + config, prefix=f"{prefix}.up_proj", weights=weights, bias=not config.no_bias + ) + self.act = nn.GELU(approximate="none") + # self.down_proj = nn.Linear(expansion_ratio * d_model, d_model, device=device) + self.down_proj = TensorParallelRowLinear.load( + config, + prefix=f"{prefix}.down_proj", + weights=weights, + bias=not config.no_bias, + ) + # self.down_proj._is_residual = True + + def forward(self, x): + return self.down_proj(self.act(self.up_proj(x))) + + +class MPTBlock(nn.Module): + def __init__(self, config, prefix, weights): + super().__init__() + self.prefix = prefix + if config.attn_config.attn_type != "multihead_attention": + raise NotImplementedError( + f"""Not implemented attn {config.attn_config.attn_type}""" + ) + resid_pdrop = config.resid_pdrop + if config.no_bias: + self.norm_1 = nn.LayerNorm.load_no_bias( + prefix=f"{prefix}.norm_1", weights=weights, eps=EPS + ) + self.norm_2 = nn.LayerNorm.load_no_bias( + prefix=f"{prefix}.norm_2", weights=weights, eps=EPS + ) + else: + self.norm_1 = nn.LayerNorm.load( + prefix=f"{prefix}.norm_1", weights=weights, eps=EPS + ) + self.norm_2 = nn.LayerNorm.load( + prefix=f"{prefix}.norm_2", weights=weights, eps=EPS + ) + self.attn = MultiheadAttention(config, prefix=f"{prefix}.attn", weights=weights) + self.ffn = MPTMLP(config, prefix=f"{prefix}.ffn", weights=weights) + self.resid_attn_dropout = nn.Dropout(resid_pdrop) + self.resid_ffn_dropout = nn.Dropout(resid_pdrop) + + def forward( + self, + x: torch.Tensor, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + attn_bias: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.ByteTensor] = None, + is_causal: bool = True, + ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor]]]: + a = self.norm_1(x) + (b, attn_weights, past_key_value) = self.attn( + a, + past_key_value=past_key_value, + attn_bias=attn_bias, + attention_mask=attention_mask, + is_causal=is_causal, + ) + x = x + self.resid_attn_dropout(b) + m = 
self.norm_2(x) + n = self.ffn(m) + x = x + self.resid_ffn_dropout(n) + return (x, attn_weights, past_key_value) + + +def _cast_if_autocast_enabled(tensor): + if torch.is_autocast_enabled(): + if tensor.device.type == "cuda": + dtype = torch.get_autocast_gpu_dtype() + elif tensor.device.type == "cpu": + dtype = torch.get_autocast_cpu_dtype() + else: + raise NotImplementedError() + return tensor.to(dtype=dtype) + return tensor + + +class LPLayerNorm(torch.nn.LayerNorm): + def __init__( + self, + normalized_shape, + eps=1e-05, + elementwise_affine=True, + device=None, + dtype=None, + bias: Optional[bool] = True, + prefix=None, + weights=None, + ): + super().__init__( + normalized_shape=normalized_shape, + eps=eps, + elementwise_affine=elementwise_affine, + device=device, + dtype=dtype, + bias=bias, + ) + if weights is not None: + self.weight = nn.Parameter(weights.get_sharded(f"{prefix}.weight", dim=0)) + if bias: + self.bias = nn.Parameter(weights.get_sharded(f"{prefix}.bias", dim=0)) + self.normalized_shape = self.weight.shape + + def forward(self, x): + module_device = x.device + downcast_x = _cast_if_autocast_enabled(x) + downcast_weight = ( + _cast_if_autocast_enabled(self.weight) + if self.weight is not None + else self.weight + ) + downcast_bias = ( + _cast_if_autocast_enabled(self.bias) if self.bias is not None else self.bias + ) + with torch.autocast(enabled=False, device_type=module_device.type): + return torch.nn.functional.layer_norm( + downcast_x, + self.normalized_shape, + downcast_weight, + downcast_bias, + self.eps, + ) + + +def rms_norm(x, weight=None, eps=1e-05): + output = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps) + if weight is not None: + return output * weight + return output + + +class RMSNorm(torch.nn.Module): + def __init__( + self, normalized_shape, eps=1e-05, weight=True, dtype=None, device=None + ): + super().__init__() + self.eps = eps + if weight: + self.weight = torch.nn.Parameter( + torch.ones(normalized_shape, dtype=dtype, device=device) + ) + else: + self.register_parameter("weight", None) + + def forward(self, x): + return rms_norm(x.float(), self.weight, self.eps).to(dtype=x.dtype) + + +class LPRMSNorm(RMSNorm): + def __init__( + self, normalized_shape, eps=1e-05, weight=True, dtype=None, device=None + ): + super().__init__( + normalized_shape=normalized_shape, + eps=eps, + weight=weight, + dtype=dtype, + device=device, + ) + + def forward(self, x): + downcast_x = _cast_if_autocast_enabled(x) + downcast_weight = ( + _cast_if_autocast_enabled(self.weight) + if self.weight is not None + else self.weight + ) + with torch.autocast(enabled=False, device_type=x.device.type): + return rms_norm(downcast_x, downcast_weight, self.eps).to(dtype=x.dtype) + + +NORM_CLASS_REGISTRY = { + "layernorm": torch.nn.LayerNorm, + "low_precision_layernorm": LPLayerNorm, + "rmsnorm": RMSNorm, + "low_precision_rmsnorm": LPRMSNorm, +} + +Tokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast] + + +class MPTPreTrainedModel(PreTrainedModel): + base_model_prefix = "model" + _no_split_modules = ["MPTBlock"] + + +class MPTModel(MPTPreTrainedModel): + def __init__(self, prefix: str, config, weights): + # config._validate_config() + super().__init__(config) + self.world_size = weights.process_group.size() + self.rank = weights.process_group.rank() + self.n_heads = config.n_heads + self.attn_impl = config.attn_config.attn_impl + self.prefix_lm = config.attn_config.prefix_lm + self.attn_uses_sequence_id = config.attn_config.attn_uses_sequence_id + self.alibi = 
config.attn_config.alibi + self.alibi_bias_max = config.attn_config.alibi_bias_max + if config.init_device == "mixed": + # TODO: reimplement mixed device initialization + # dist.get_local_rank() == 0: + if True: + config.init_device = "cpu" + else: + config.init_device = "meta" + if config.norm_type.lower() not in NORM_CLASS_REGISTRY.keys(): + norm_options = " | ".join(NORM_CLASS_REGISTRY.keys()) + raise NotImplementedError( + f"Requested norm type ({config.norm_type}) is not implemented within this repo (Options: {norm_options})." + ) + if config.norm_type.lower() != "low_precision_layernorm": + raise NotImplementedError( + f"Requested norm type ({config.norm_type}) is not implemented within this repo." + ) + + self.wte = TensorParallelEmbedding(f"{prefix}.wte", weights) + + if not self.alibi: + self.wpe = TensorParallelEmbedding(f"{prefix}.wpe", weights) + self.blocks = nn.ModuleList( + [ + MPTBlock(config, prefix=f"{prefix}.blocks.{i}", weights=weights) + for i in range(config.n_layers) + ] + ) + if config.no_bias: + self.norm_f = nn.LayerNorm.load_no_bias( + prefix="transformer.norm_f", weights=weights, eps=EPS + ) + else: + self.norm_f = nn.LayerNorm.load( + prefix="transformer.norm_f", weights=weights, eps=EPS + ) + self.is_causal = not self.prefix_lm + self._attn_bias_initialized = False + self.attn_bias = None + self.attn_bias_shape = attn_bias_shape( + self.attn_impl, + config.n_heads, + config.max_seq_len, + self.alibi, + prefix_lm=self.prefix_lm, + causal=self.is_causal, + use_sequence_id=self.attn_uses_sequence_id, + ) + if config.no_bias: + for module in self.modules(): + if hasattr(module, "bias") and isinstance(module.bias, nn.Parameter): + if config.verbose: + warnings.warn(f"Removing bias ({module.bias}) from {module}.") + module.register_parameter("bias", None) + if hasattr(self.config, "verbose"): + if config.verbose and config.verbose > 2: + print(self) + if "verbose" not in self.config.init_config: + self.config.init_config["verbose"] = self.config.verbose + if self.config.init_config["verbose"] > 1: + init_fn_name = self.config.init_config["name"] + warnings.warn(f"Using {init_fn_name} initialization.") + + @torch.no_grad() + def _attn_bias( + self, + device, + dtype, + attention_mask: Optional[torch.ByteTensor] = None, + prefix_mask: Optional[torch.ByteTensor] = None, + sequence_id: Optional[torch.LongTensor] = None, + ): + if not self._attn_bias_initialized: + if self.attn_bias_shape: + self.attn_bias = torch.zeros( + self.attn_bias_shape, device=device, dtype=dtype + ) + self.attn_bias = build_attn_bias( + self.attn_impl, + self.attn_bias, + self.config.n_heads, + self.config.max_seq_len, + causal=self.is_causal, + alibi=self.alibi, + alibi_bias_max=self.alibi_bias_max, + ) + assert self.n_heads % self.world_size == 0 + block_size = self.n_heads // self.world_size + self.attn_bias = self.attn_bias[ + :, self.rank * block_size : (self.rank + 1) * block_size + ] + self._attn_bias_initialized = True + if self.attn_impl == "flash": + return (self.attn_bias, attention_mask) + if self.attn_bias is not None: + self.attn_bias = self.attn_bias.to(dtype=dtype, device=device) + attn_bias = self.attn_bias + if self.prefix_lm: + assert isinstance(attn_bias, torch.Tensor) + assert isinstance(prefix_mask, torch.Tensor) + attn_bias = self._apply_prefix_mask(attn_bias, prefix_mask) + if self.attn_uses_sequence_id and sequence_id is not None: + assert isinstance(attn_bias, torch.Tensor) + attn_bias = self._apply_sequence_id(attn_bias, sequence_id) + if attention_mask is not None: + 
s_k = attention_mask.shape[-1] + if attn_bias is None: + attn_bias = torch.zeros((1, 1, 1, s_k), device=device, dtype=dtype) + else: + _s_k = max(0, attn_bias.size(-1) - s_k) + attn_bias = attn_bias[:, :, :, _s_k:] + if prefix_mask is not None and attention_mask.shape != prefix_mask.shape: + raise ValueError( + f"attention_mask shape={attention_mask.shape} " + + f"and prefix_mask shape={prefix_mask.shape} are not equal." + ) + min_val = torch.finfo(attn_bias.dtype).min + attn_bias = attn_bias.masked_fill( + ~attention_mask.view(-1, 1, 1, s_k), min_val + ) + return (attn_bias, None) + + def _apply_prefix_mask(self, attn_bias: torch.Tensor, prefix_mask: torch.Tensor): + (s_k, s_q) = attn_bias.shape[-2:] + if s_k != self.config.max_seq_len or s_q != self.config.max_seq_len: + raise ValueError( + "attn_bias does not match the expected shape. " + + f"The last two dimensions should both be {self.config.max_length} " + + f"but are {s_k} and {s_q}." + ) + seq_len = prefix_mask.shape[-1] + if seq_len > self.config.max_seq_len: + raise ValueError( + f"prefix_mask sequence length cannot exceed max_seq_len={self.config.max_seq_len}" + ) + attn_bias = attn_bias[..., :seq_len, :seq_len] + causal = torch.tril( + torch.ones((seq_len, seq_len), dtype=torch.bool, device=prefix_mask.device) + ).view(1, 1, seq_len, seq_len) + prefix = prefix_mask.view(-1, 1, 1, seq_len) + cannot_attend = ~torch.logical_or(causal, prefix.bool()) + min_val = torch.finfo(attn_bias.dtype).min + attn_bias = attn_bias.masked_fill(cannot_attend, min_val) + return attn_bias + + def _apply_sequence_id( + self, attn_bias: torch.Tensor, sequence_id: torch.LongTensor + ): + seq_len = sequence_id.shape[-1] + if seq_len > self.config.max_seq_len: + raise ValueError( + f"sequence_id sequence length cannot exceed max_seq_len={self.config.max_seq_len}" + ) + attn_bias = attn_bias[..., :seq_len, :seq_len] + cannot_attend = torch.logical_not( + torch.eq(sequence_id.view(-1, seq_len, 1), sequence_id.view(-1, 1, seq_len)) + ).unsqueeze(1) + min_val = torch.finfo(attn_bias.dtype).min + attn_bias = attn_bias.masked_fill(cannot_attend, min_val) + return attn_bias + + def forward( + self, + input_ids: torch.LongTensor, + past_key_values: Optional[List[Tuple[torch.FloatTensor]]] = None, + attention_mask: Optional[torch.ByteTensor] = None, + prefix_mask: Optional[torch.ByteTensor] = None, + sequence_id: Optional[torch.LongTensor] = None, + return_dict: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + use_cache: Optional[bool] = None, + ): + return_dict = ( + return_dict if return_dict is not None else self.config.return_dict + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + if attention_mask is not None: + attention_mask = attention_mask.bool() + if prefix_mask is not None: + prefix_mask = prefix_mask.bool() + if not return_dict: + raise NotImplementedError( + "return_dict False is not implemented yet for MPT" + ) + if output_attentions: + if self.attn_impl != "torch": + raise NotImplementedError( + "output_attentions is not implemented for MPT when using attn_impl `flash` or `triton`." + ) + if ( + attention_mask is not None + and attention_mask[:, 0].sum() != attention_mask.shape[0] + and self.training + ): + raise NotImplementedError( + "MPT does not support training with left padding." + ) + if self.prefix_lm and prefix_mask is None: + raise ValueError( + "prefix_mask is a required argument when MPT is configured with prefix_lm=True." 
+ ) + if self.training: + if self.attn_uses_sequence_id and sequence_id is None: + raise ValueError( + "sequence_id is a required argument when MPT is configured with attn_uses_sequence_id=True " + + "and the model is in train mode." + ) + elif self.attn_uses_sequence_id is False and sequence_id is not None: + warnings.warn( + "MPT received non-None input for `sequence_id` but is configured with attn_uses_sequence_id=False. " + + "This input will be ignored. If you want the model to use `sequence_id`, set attn_uses_sequence_id to True." + ) + S = input_ids.size(1) + assert ( + S <= self.config.max_seq_len + ), f"Cannot forward input with seq_len={S}, this model only supports seq_len<={self.config.max_seq_len}" + tok_emb = self.wte(input_ids) + if self.alibi: + x = tok_emb + else: + past_position = 0 + if past_key_values is not None: + if len(past_key_values) != self.config.n_layers: + raise ValueError( + "past_key_values must provide a past_key_value for each attention " + + f"layer in the network (len(past_key_values)={len(past_key_values)!r}; self.config.n_layers={self.config.n_layers!r})." + ) + past_position = past_key_values[0][0].size(1) + if self.attn_impl == "torch": + past_position = past_key_values[0][0].size(3) + if S + past_position > self.config.max_seq_len: + raise ValueError( + f"Cannot forward input with past sequence length {past_position} and current sequence length {S + 1}, this model only supports total sequence length <= {self.config.max_seq_len}." + ) + pos = torch.arange( + past_position, + S + past_position, + dtype=torch.long, + device=input_ids.device, + ).unsqueeze(0) + if attention_mask is not None: + pos = torch.clamp( + pos + - torch.cumsum((~attention_mask).to(torch.int32), dim=1)[ + :, past_position: + ], + min=0, + ) + pos_emb = self.wpe(pos) + x = tok_emb + pos_emb + (attn_bias, attention_mask) = self._attn_bias( + device=x.device, + dtype=torch.float32, + attention_mask=attention_mask, + prefix_mask=prefix_mask, + sequence_id=sequence_id, + ) + if use_cache and past_key_values is None: + past_key_values = [() for _ in range(self.config.n_layers)] + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + for b_idx, block in enumerate(self.blocks): + if output_hidden_states: + assert all_hidden_states is not None + all_hidden_states = all_hidden_states + (x,) + past_key_value = ( + past_key_values[b_idx] if past_key_values is not None else None + ) + (x, attn_weights, past_key_value) = block( + x, + past_key_value=past_key_value, + attn_bias=attn_bias, + attention_mask=attention_mask, + is_causal=self.is_causal, + ) + if past_key_values is not None: + past_key_values[b_idx] = past_key_value + if output_attentions: + assert all_self_attns is not None + all_self_attns = all_self_attns + (attn_weights,) + x = self.norm_f(x) + if output_hidden_states: + assert all_hidden_states is not None + all_hidden_states = all_hidden_states + (x,) + return BaseModelOutputWithPast( + last_hidden_state=x, + past_key_values=past_key_values, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + +class MPTForCausalLM(MPTPreTrainedModel): + def __init__(self, prefix: str, config, weights): + super().__init__(config) + + if not prefix: + prefix = "transformer" + else: + prefix = f"{prefix}.transformer" + + if not config.tie_word_embeddings: + raise ValueError("MPTForCausalLM only supports tied word embeddings") + self.transformer = MPTModel(prefix, config, weights) + self.lm_head = SpeculativeHead.load( + 
config, prefix=f"{prefix}.wte", weights=weights + ) + self.logit_scale = None + if config.logit_scale is not None: + logit_scale = config.logit_scale + if isinstance(logit_scale, str): + if logit_scale == "inv_sqrt_d_model": + logit_scale = 1 / math.sqrt(config.d_model) + else: + raise ValueError( + f"logit_scale={logit_scale!r} is not recognized as an option; use numeric value or 'inv_sqrt_d_model'." + ) + self.logit_scale = logit_scale + + def forward( + self, + input_ids: torch.LongTensor, + past_key_values: Optional[List[Tuple[torch.FloatTensor]]] = None, + attention_mask: Optional[torch.ByteTensor] = None, + prefix_mask: Optional[torch.ByteTensor] = None, + sequence_id: Optional[torch.LongTensor] = None, + labels: Optional[torch.LongTensor] = None, + return_dict: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + use_cache: Optional[bool] = None, + ): + return_dict = ( + return_dict if return_dict is not None else self.config.return_dict + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + outputs = self.transformer( + input_ids=input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + prefix_mask=prefix_mask, + sequence_id=sequence_id, + return_dict=return_dict, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + use_cache=use_cache, + ) + logits, speculative_logits = self.lm_head(outputs.last_hidden_state) + if self.logit_scale is not None: + if self.logit_scale == 0: + warnings.warn( + f"Multiplying logits by self.logit_scale={self.logit_scale!r}. This will produce uniform (uninformative) outputs." + ) + logits *= self.logit_scale + loss = None + if labels is not None: + labels = torch.roll(labels, shifts=-1) + labels[:, -1] = -100 + loss = F.cross_entropy( + logits.view(-1, logits.size(-1)), labels.to(logits.device).view(-1) + ) + return ( + CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ), + speculative_logits, + ) + + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs + ): + if inputs_embeds is not None: + raise NotImplementedError("inputs_embeds is not implemented for MPT yet") + attention_mask = kwargs["attention_mask"].bool() + if attention_mask[:, -1].sum() != attention_mask.shape[0]: + raise NotImplementedError( + "MPT does not support generation with right padding." + ) + if self.transformer.attn_uses_sequence_id and self.training: + sequence_id = torch.zeros_like(input_ids[:1]) + else: + sequence_id = None + if past_key_values is not None: + input_ids = input_ids[:, -1].unsqueeze(-1) + if self.transformer.prefix_lm: + prefix_mask = torch.ones_like(attention_mask) + if kwargs.get("use_cache") is False: + raise NotImplementedError( + "MPT with prefix_lm=True does not support use_cache=False." + ) + else: + prefix_mask = None + return { + "input_ids": input_ids, + "attention_mask": attention_mask, + "prefix_mask": prefix_mask, + "sequence_id": sequence_id, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache", True), + } + + @staticmethod + def _reorder_cache(past_key_values, beam_idx): + """Used by HuggingFace generate when using beam search with kv-caching. 
+ + See https://github.com/huggingface/transformers/blob/3ec7a47664ebe40c40f4b722f6bb1cd30c3821ec/src/transformers/models/gpt2/modeling_gpt2.py#L1122-L1133 + for an example in transformers. + """ + reordered_past = [] + for layer_past in past_key_values: + reordered_past += [ + tuple( + (past_state.index_select(0, beam_idx) for past_state in layer_past) + ) + ] + return reordered_past diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/neox_modeling.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/neox_modeling.py new file mode 100644 index 000000000..06731a6f9 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/neox_modeling.py @@ -0,0 +1,796 @@ +# coding=utf-8 +# Copyright 2022 EleutherAI The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch GPTNeoX model.""" + +from typing import Optional, Tuple, Union + +import os +import torch +import torch.distributed +import torch.utils.checkpoint +from torch import nn +from torch.nn import CrossEntropyLoss + +from transformers.activations import ACT2FN +from transformers.modeling_outputs import ( + BaseModelOutputWithPast, + CausalLMOutputWithPast, +) +from transformers.modeling_utils import PreTrainedModel +from text_generation_server.layers import ( + TensorParallelColumnLinear, + TensorParallelEmbedding, + TensorParallelRowLinear, + SpeculativeHead, +) + + +CUSTOM_KERNELS_ENABLED = False +if ( + torch.cuda.is_available() + and not os.environ.get("DISABLE_CUSTOM_KERNELS", "False") == "True" +): + try: + from custom_kernels import fused_attention_cuda + + CUSTOM_KERNELS_ENABLED = True + except ImportError: + pass + + +def make_causal_mask( + input_ids_shape: torch.Size, device: torch.device, past_key_values_length: int +) -> torch.BoolTensor: + """ + Make causal mask used for self-attention. + """ + batch_size, target_length = input_ids_shape + mask = torch.ones( + (target_length, target_length + past_key_values_length), + dtype=torch.bool, + device=device, + ) + mask = mask.triu(1 + past_key_values_length) + + expanded_mask = mask.unsqueeze(0).expand( + batch_size, target_length, target_length + past_key_values_length + ) + return expanded_mask + + +def expand_mask(mask: torch.Tensor, tgt_length: int) -> torch.BoolTensor: + """ + Expands attention_mask from `[batch_size, src_length]` to `[batch_size, 1, tgt_length, src_length]`. 
+ """ + batch_size, src_length = mask.shape + tgt_length = tgt_length if tgt_length is not None else src_length + + expanded_mask = ~(mask[:, None, :].to(torch.bool)) + return expanded_mask.expand(batch_size, tgt_length, src_length) + + +def prepare_attn_mask( + attention_mask: torch.Tensor, + input_shape: Tuple[int, int], + past_key_values_length: int, +) -> torch.BoolTensor: + # create causal mask + # [batch_size, seq_length] -> [batch_size, tgt_length, src_length] + combined_attention_mask = None + device = attention_mask.device + _, src_length = input_shape + + if src_length > 1: + combined_attention_mask = make_causal_mask( + input_shape, device=device, past_key_values_length=past_key_values_length + ) + + # [batch_size, seq_length] -> [batch_size, tgt_length, src_length] + expanded_attn_mask = expand_mask(attention_mask, tgt_length=src_length) + combined_attention_mask = ( + expanded_attn_mask + if combined_attention_mask is None + else expanded_attn_mask | combined_attention_mask + ) + + return combined_attention_mask + + +class GPTNeoXPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + +class GPTNeoXAttention(nn.Module): + def __init__(self, config, prefix, weights): + super().__init__() + self.num_attention_heads = config.num_attention_heads + self.hidden_size = config.hidden_size + self.head_size = self.hidden_size // self.num_attention_heads + self.rotary_ndims = int(self.head_size * config.rotary_pct) + # ??? TODO + # self.register_buffer( + # "bias", + # torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool)).view( + # 1, 1, max_positions, max_positions + # ), + # ) + # self.register_buffer("masked_bias", torch.tensor(-1e9)) + self.rotary_emb = RotaryEmbedding( + self.rotary_ndims, + config.max_position_embeddings, + base=config.rotary_emb_base, + ) + self.rotary_emb.inv_freq = nn.Parameter( + weights.get_tensor(f"{prefix}.rotary_emb.inv_freq") + ) + self.inv_norm_factor = 1.0 / torch.sqrt( + torch.tensor(self.head_size, dtype=torch.float32) + ).to(torch.get_default_dtype()) + + if self.num_attention_heads % weights.process_group.size() != 0: + raise ValueError( + f"`num_attention_heads` must be divisible by `num_shards` " + f"(got `num_attention_heads`: {self.num_attention_heads} " + f"and `num_shards`: {weights.process_group.size()}" + ) + self.num_attention_heads = ( + self.num_attention_heads // weights.process_group.size() + ) + self.query_key_value = TensorParallelColumnLinear.load( + config, prefix=f"{prefix}.query_key_value", weights=weights, bias=True + ) + self.dense = TensorParallelRowLinear.load( + config, prefix=f"{prefix}.dense", weights=weights, bias=True + ) + + def forward( + self, + hidden_states, + position_ids, + attention_mask, + head_mask=None, + layer_past=None, + use_cache=False, + output_attentions=False, + ): + has_layer_past = layer_past is not None + + # Compute QKV + # Attention heads [batch, seq_len, hidden_size] + # --> [batch, seq_len, (np * 3 * head_size)] + qkv = self.query_key_value(hidden_states) + + # [batch, seq_len, (num_heads * 3 * head_size)] + # --> [batch, seq_len, num_heads, 3 * head_size] + new_qkv_shape = qkv.size()[:-1] + (self.num_attention_heads, 3 * self.head_size) + qkv = qkv.view(*new_qkv_shape).permute(0, 2, 1, 3) + # [batch, seq_len, num_attention_heads, 3 * head_size] --> 3 [batch, num_attention_heads, seq_len, head_size] + query, key, value = qkv.split(self.head_size, -1) + + # 
Compute token offset for rotary embeddings (when decoding) + seq_len = key.shape[-2] + if has_layer_past: + seq_len += layer_past[0].shape[-2] + + # Compute rotary embeddings on rotary_ndims + query_rot = query[..., : self.rotary_ndims] + key_rot = key[..., : self.rotary_ndims] + + query_rot, key_rot = self.rotary_emb(query_rot, key_rot, position_ids, seq_len) + + query[..., : self.rotary_ndims] = query_rot + key[..., : self.rotary_ndims] = key_rot + + if CUSTOM_KERNELS_ENABLED: + attn_output, present, attn_weights = fused_attention_cuda.forward( + query, + key, + value, + layer_past, + attention_mask, + head_mask, + self.inv_norm_factor, + self.num_attention_heads, + use_cache, + ) + else: + # Cache QKV values + if has_layer_past: + past_key = layer_past[0] + past_value = layer_past[1] + key = torch.cat((past_key, key), dim=-2) + value = torch.cat((past_value, value), dim=-2) + present = (key, value) if use_cache else None + + # Compute attention + attn_output, attn_weights = self._attn( + query, key, value, attention_mask, head_mask + ) + + # Reshape outputs + attn_output = self._merge_heads( + attn_output, self.num_attention_heads, self.head_size + ) + + attn_output = self.dense(attn_output) + + outputs = (attn_output, present) + if output_attentions: + outputs += (attn_weights,) + + return outputs + + @classmethod + def _split_heads(cls, tensor, num_attention_heads, attn_head_size): + """ + Splits hidden dim into attn_head_size and num_attention_heads + """ + # tensor: [bs, seq_len, hidden_size] + new_shape = tensor.size()[:-1] + (num_attention_heads, attn_head_size) + # -> [bs, seq_len, num_attention_heads, attn_head_size] + tensor = tensor.view(new_shape) + # -> [bs, num_attention_heads, seq_len, attn_head_size] + tensor = tensor.permute(0, 2, 1, 3) + return tensor + + @classmethod + def _merge_heads(cls, tensor, num_attention_heads, attn_head_size): + """ + Merges attn_head_size dim and num_attn_heads dim into hidden dim + """ + # tensor [bs, num_attention_heads, seq_len, attn_head_size] + tensor = tensor.permute(0, 2, 1, 3).contiguous() + # -> [bs, seq_len, num_attention_heads, attn_head_size] + tensor = tensor.view( + tensor.size(0), tensor.size(1), num_attention_heads * attn_head_size + ) + # -> [bs, seq_len, hidden_size] + return tensor + + def _attn(self, query, key, value, attention_mask=None, head_mask=None): + # q, k, v: [bs, num_attention_heads, seq_len, attn_head_size] + # compute causal mask from causal mask buffer + batch_size, num_attention_heads, query_length, attn_head_size = query.size() + key_length = key.size(-2) + + query = query.reshape( + batch_size * num_attention_heads, query_length, attn_head_size + ) + key = key.reshape(batch_size * num_attention_heads, key_length, attn_head_size) + attn_scores = torch.zeros( + 1, + dtype=query.dtype, + device=key.device, + ).expand(batch_size * num_attention_heads, query_length, key_length) + attn_scores = torch.baddbmm( + attn_scores, + query, + key.transpose(1, 2), + beta=1.0, + alpha=self.inv_norm_factor, + ) + + # cast attention scores to fp32, compute scaled softmax and cast back to initial dtype - [batch_size, num_heads, q_length, kv_length] + input_dtype = attn_scores.dtype + if input_dtype in [torch.float16, torch.bfloat16]: + attn_scores = attn_scores.to(torch.float) + attn_scores = torch.where( + attention_mask, torch.finfo(attn_scores.dtype).min, attn_scores + ) + attn_scores = attn_scores.view( + batch_size, num_attention_heads, query_length, key_length + ) + + attn_weights = nn.functional.softmax(attn_scores, 
dim=-1) + attn_weights = attn_weights.to(value.dtype) + + # Mask heads if we want to + if head_mask is not None: + attn_weights = attn_weights * head_mask + + attn_output = torch.matmul(attn_weights, value) + return attn_output, attn_weights + + +class RotaryEmbedding(torch.nn.Module): + def __init__(self, dim, max_position_embeddings, base=10000, device=None): + super().__init__() + self.true_inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2).float().to(device) / dim) + ) + self.register_buffer("inv_freq", self.true_inv_freq) + + # Build here to make `torch.jit.trace` work. + self.max_seq_len_cached = max_position_embeddings + self.cos_cached = None + self.sin_cached = None + + @staticmethod + def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + @staticmethod + def _create_cos_sin(inv_freq, max_position_embeddings, dtype, device): + t = torch.arange( + max_position_embeddings, device=inv_freq.device, dtype=inv_freq.dtype + ) + freqs = torch.einsum("i,j->ij", t, inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + return emb.cos().to(device).to(dtype), emb.sin().to(device).to(dtype) + + def forward(self, q, k, position_ids, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + if ( + seq_len > self.max_seq_len_cached + or self.cos_cached is None + or self.sin_cached is None + ): + if seq_len > self.max_seq_len_cached: + self.max_seq_len_cached = seq_len + self.cos_cached, self.sin_cached = self._create_cos_sin( + self.true_inv_freq, self.max_seq_len_cached, q.dtype, q.device + ) + return rotary_forward(q, k, self.cos_cached, self.sin_cached, position_ids) + + +@torch.jit.script +def rotary_forward(q, k, cos, sin, position_ids): + cos = cos[position_ids].unsqueeze(1) + sin = sin[position_ids].unsqueeze(1) + + chunk_size = q.shape[-1] // 2 + q1, q2 = q.split(chunk_size, -1) + q_rotated = torch.cat((-q2, q1), dim=-1) + k1, k2 = k.split(chunk_size, -1) + k_rotated = torch.cat((-k2, k1), dim=-1) + + q_embed = (q * cos) + (q_rotated * sin) + k_embed = (k * cos) + (k_rotated * sin) + return q_embed, k_embed + + +class GPTNeoXMLP(nn.Module): + def __init__(self, config, prefix, weights): + super().__init__() + self.act = ( + ACT2FN[config.hidden_act] + if "gelu_fast" not in config.hidden_act + else lambda x: torch.nn.functional.gelu(x, approximate="tanh") + ) + + self.dense_h_to_4h = TensorParallelColumnLinear.load( + config, prefix=f"{prefix}.dense_h_to_4h", weights=weights, bias=True + ) + self.dense_4h_to_h = TensorParallelRowLinear.load( + config, prefix=f"{prefix}.dense_4h_to_h", weights=weights, bias=True + ) + + def forward(self, hidden_states): + hidden_states = self.dense_h_to_4h(hidden_states) + hidden_states = self.act(hidden_states) + hidden_states = self.dense_4h_to_h(hidden_states) + return hidden_states + + +class GPTNeoXLayer(nn.Module): + def __init__(self, layer_id, prefix: str, config, weights): + super().__init__() + self.use_parallel_residual = config.use_parallel_residual + self.input_layernorm = nn.LayerNorm.load( + prefix=f"{prefix}.layers.{layer_id}.input_layernorm", + weights=weights, + eps=config.layer_norm_eps, + ) + self.post_attention_layernorm = nn.LayerNorm.load( + prefix=f"{prefix}.layers.{layer_id}.post_attention_layernorm", + weights=weights, + eps=config.layer_norm_eps, + ) + self.attention = GPTNeoXAttention( + 
config, prefix=f"{prefix}.layers.{layer_id}.attention", weights=weights + ) + self.mlp = GPTNeoXMLP( + config, prefix=f"{prefix}.layers.{layer_id}.mlp", weights=weights + ) + + def forward( + self, + hidden_states, + position_ids, + attention_mask=None, + head_mask=None, + use_cache=False, + layer_past=None, + output_attentions=False, + ): + attention_layer_outputs = self.attention( + self.input_layernorm(hidden_states), + attention_mask=attention_mask, + position_ids=position_ids, + layer_past=layer_past, + head_mask=head_mask, + use_cache=use_cache, + output_attentions=output_attentions, + ) + attn_output = attention_layer_outputs[ + 0 + ] # output_attn: attn_output, present, (attn_weights) + outputs = attention_layer_outputs[1:] + + if self.use_parallel_residual: + # pseudocode: + # x = x + attn(ln1(x)) + mlp(ln2(x)) + mlp_output = self.mlp(self.post_attention_layernorm(hidden_states)) + hidden_states = mlp_output + attn_output + hidden_states + else: + # pseudocode: + # x = x + attn(ln1(x)) + # x = x + mlp(ln2(x)) + attn_output = attn_output + hidden_states + mlp_output = self.mlp(self.post_attention_layernorm(attn_output)) + hidden_states = mlp_output + attn_output + + if use_cache: + outputs = ( + hidden_states, + ) + outputs # hidden_states, present, (attn_weights) + else: + outputs = (hidden_states,) + outputs[1:] # hidden_states, (attn_weights) + + return outputs + + +class GPTNeoXModel(GPTNeoXPreTrainedModel): + def __init__(self, prefix: str, config, weights): + super().__init__(config) + self.config = config + + self.num_attention_heads = config.num_attention_heads + + self.embed_in = TensorParallelEmbedding( + prefix=f"{prefix}.embed_in", weights=weights + ) + self.layers = nn.ModuleList( + [ + GPTNeoXLayer(layer_id, prefix, config, weights) + for layer_id in range(config.num_hidden_layers) + ] + ) + self.final_layer_norm = nn.LayerNorm.load( + prefix=f"{prefix}.final_layer_norm", + weights=weights, + eps=config.layer_norm_eps, + ) + self.tp_world_size = weights.process_group.size() + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + position_ids=None, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + r""" + past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). 
+ """ + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + if input_ids is not None and inputs_embeds is not None: + raise ValueError( + "You cannot specify both input_ids and inputs_embeds at the same time" + ) + elif input_ids is not None: + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + batch_size, seq_length = input_shape + + if past_key_values is None: + past_length = 0 + past_key_values = tuple([None] * self.config.num_hidden_layers) + else: + past_length = past_key_values[0][0].size(-2) + + if position_ids is None: + device = input_ids.device if input_ids is not None else inputs_embeds.device + position_ids = torch.arange( + past_length, seq_length + past_length, dtype=torch.long, device=device + ) + position_ids = position_ids.unsqueeze(0).view(-1, seq_length) + else: + position_ids = position_ids.view(-1, seq_length).long() + + if inputs_embeds is None: + inputs_embeds = self.embed_in(input_ids) + + hidden_states = inputs_embeds + + # Attention mask. + seq_length_with_past = seq_length + past_key_values_length = 0 + if past_key_values[0] is not None: + past_key_values_length = past_key_values[0][0].shape[-1] + seq_length_with_past = seq_length_with_past + past_key_values_length + if attention_mask is None: + attention_mask = torch.ones( + (batch_size, seq_length_with_past), device=hidden_states.device + ) + else: + attention_mask = attention_mask.to(hidden_states.device) + + causal_mask = prepare_attn_mask( + attention_mask, + input_shape=(batch_size, seq_length), + past_key_values_length=past_key_values_length, + ) + + assert self.num_attention_heads % self.tp_world_size == 0 + block_size = self.num_attention_heads // self.tp_world_size + causal_mask = torch.repeat_interleave(causal_mask, block_size, dim=0) + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + presents = () if use_cache else None + all_attentions = () if output_attentions else None + all_hidden_states = () if output_hidden_states else None + for i, (layer, layer_past) in enumerate(zip(self.layers, past_key_values)): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + outputs = layer( + hidden_states, + position_ids=position_ids, + attention_mask=causal_mask, + head_mask=head_mask[i], + layer_past=layer_past, + use_cache=use_cache, + output_attentions=output_attentions, + ) + hidden_states = outputs[0] + if use_cache is True: + presents = presents + (outputs[1],) + if output_attentions: + all_attentions = all_attentions + (outputs[2 if use_cache else 1],) + + hidden_states = self.final_layer_norm(hidden_states) + # Add last hidden state + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if 
not return_dict: + return tuple( + v + for v in [hidden_states, presents, all_hidden_states, all_attentions] + if v is not None + ) + + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=presents, + hidden_states=all_hidden_states, + attentions=all_attentions, + ) + + +class GPTNeoxForCausalLM(GPTNeoXPreTrainedModel): + _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] + + def __init__(self, prefix: str, config, weights): + super().__init__(config) + + if not prefix: + prefix = "gpt_neox" + else: + prefix = f"{prefix}.gpt_neox" + + self.gpt_neox = GPTNeoXModel(prefix, config, weights) + self.embed_out = SpeculativeHead.load( + config, prefix="embed_out", weights=weights + ) + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CausalLMOutputWithPast]: + r""" + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape + `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. The two additional tensors are + only required when the model is used as a decoder in a Sequence to Sequence model. + + Contains pre-computed hidden-states (key and values in the self-attention blocks that can be used (see + `past_key_values` input) to speed up sequential decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in + `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are + ignored (masked), the loss is only computed for the tokens with labels n `[0, ..., config.vocab_size]`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). 
+ + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, GPTNeoXForCausalLM, GPTNeoXConfig + >>> import torch + + >>> tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b") + >>> config = GPTNeoXConfig.from_pretrained("EleutherAI/gpt-neox-20b") + >>> config.is_decoder = True + >>> model = GPTNeoXForCausalLM.from_pretrained("EleutherAI/gpt-neox-20b", config=config) + + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> outputs = model(**inputs) + + >>> prediction_logits = outputs.logits + ```""" + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + outputs = self.gpt_neox( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + lm_logits, speculative_logits = self.embed_out(hidden_states) + + lm_loss = None + if labels is not None: + # move labels to correct device to enable model parallelism + labels = labels.to(lm_logits.device) + # we are doing next-token prediction; shift prediction scores and input ids by one + shift_logits = lm_logits[:, :-1, :].contiguous() + labels = labels[:, 1:].contiguous() + loss_fct = CrossEntropyLoss() + lm_loss = loss_fct( + shift_logits.view(-1, shift_logits.size(-1)), labels.view(-1) + ) + + if not return_dict: + output = (lm_logits,) + outputs[1:] + return ((lm_loss,) + output) if lm_loss is not None else output + + return ( + CausalLMOutputWithPast( + loss=lm_loss, + logits=lm_logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ), + speculative_logits, + ) + + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + attention_mask=None, + inputs_embeds=None, + **kwargs, + ): + input_shape = input_ids.shape + + # cut decoder_input_ids if past is used + if past_key_values and past_key_values[0] is not None: + input_ids = input_ids[:, -1:] + + position_ids = kwargs.get("position_ids", None) + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -1].unsqueeze(-1) + + # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly + if attention_mask is None: + attention_mask = input_ids.new_ones(input_shape) + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "attention_mask": attention_mask, + "past_key_values": past_key_values, + "position_ids": position_ids, + } + ) + + return model_inputs + + def _reorder_cache(self, past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple( + past_state.index_select(0, beam_idx) + for past_state in layer_past[:2] + ) + + layer_past[2:], + ) + return reordered_past diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/opt_modeling.py 
b/backends/gaudi/server/text_generation_server/models/custom_modeling/opt_modeling.py new file mode 100644 index 000000000..bd4403214 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/opt_modeling.py @@ -0,0 +1,857 @@ +# coding=utf-8 +# Copyright 2022 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch OPT model.""" +import random +from typing import List, Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn + +from transformers.activations import ACT2FN +from transformers.modeling_outputs import ( + BaseModelOutputWithPast, + CausalLMOutputWithPast, +) +from transformers.modeling_utils import PreTrainedModel +from transformers import OPTConfig +from text_generation_server.layers import ( + FastLinear, + TensorParallelColumnLinear, + TensorParallelEmbedding, + TensorParallelRowLinear, + SpeculativeHead, +) + +EPS = 1e-5 + + +# Copied from transformers.models.bart.modeling_bart._make_causal_mask +def _make_causal_mask( + input_ids_shape: torch.Size, + dtype: torch.dtype, + device: torch.device, + past_key_values_length: int = 0, +): + """ + Make causal mask used for bi-directional self-attention. + """ + bsz, tgt_len = input_ids_shape + mask = torch.full( + (tgt_len, tgt_len), + torch.tensor(torch.finfo(dtype).min, device=device), + device=device, + ) + mask_cond = torch.arange(mask.size(-1), device=device) + mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0) + mask = mask.to(dtype) + + if past_key_values_length > 0: + mask = torch.cat( + [ + torch.zeros( + tgt_len, past_key_values_length, dtype=dtype, device=device + ), + mask, + ], + dim=-1, + ) + return mask[None, None, :, :].expand( + bsz, 1, tgt_len, tgt_len + past_key_values_length + ) + + +def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): + """ + Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. + """ + bsz, src_len = mask.size() + tgt_len = tgt_len if tgt_len is not None else src_len + + expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) + + inverted_mask = 1.0 - expanded_mask + + return inverted_mask.masked_fill( + inverted_mask.to(torch.bool), torch.finfo(dtype).min + ) + + +class OPTLearnedPositionalEmbedding(nn.Module): + """ + This module learns positional embeddings up to a fixed maximum size. + """ + + def __init__(self, prefix: str, weights): + super().__init__() + self.offset = 2 + self.weight = nn.Parameter( + weights.get_tensor( + f"{prefix + '.' 
if prefix else ''}decoder.embed_positions.weight" + ) + ) + + def forward( + self, attention_mask: torch.LongTensor, past_key_values_length: int = 0 + ): + """`input_ids_shape` is expected to be [bsz x seqlen].""" + attention_mask = attention_mask.long() + + # create positions depending on attention_mask + positions = ( + torch.cumsum(attention_mask, dim=1).type_as(attention_mask) * attention_mask + ).long() - 1 + + # cut positions if `past_key_values_length` is > 0 + positions = positions[:, past_key_values_length:] + + return torch.nn.functional.embedding(positions + self.offset, self.weight) + + +class OPTAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__( + self, + config, + prefix, + weights, + is_decoder: bool = False, + bias: bool = True, + process_group=None, + ): + super().__init__() + hidden_size = config.hidden_size + num_heads = config.num_attention_heads + + self.hidden_size = hidden_size + self.num_heads = num_heads + self.dropout = config.dropout + self.head_dim = hidden_size // num_heads + + if (self.head_dim * num_heads) != self.hidden_size: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {num_heads})." + ) + self.scaling = self.head_dim**-0.5 + self.is_decoder = is_decoder + + process_group = weights.process_group + if self.num_heads % weights.process_group.size() != 0: + raise ValueError( + f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} " + f"and `num_shards`: {weights.process_group.size()}" + ) + self.num_heads = self.num_heads // process_group.size() + self.hidden_size = self.hidden_size // process_group.size() + + self.q_proj = TensorParallelColumnLinear.load( + config, prefix=f"{prefix}.q_proj", weights=weights, bias=bias + ) + self.k_proj = TensorParallelColumnLinear.load( + config, prefix=f"{prefix}.k_proj", weights=weights, bias=bias + ) + self.v_proj = TensorParallelColumnLinear.load( + config, prefix=f"{prefix}.v_proj", weights=weights, bias=bias + ) + self.out_proj = TensorParallelRowLinear.load( + config, prefix=f"{prefix}.out_proj", weights=weights, bias=bias + ) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return ( + tensor.view(bsz, seq_len, self.num_heads, self.head_dim) + .transpose(1, 2) + .contiguous() + ) + + def forward( + self, + hidden_states: torch.Tensor, + key_value_states: Optional[torch.Tensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + attention_mask: Optional[torch.Tensor] = None, + layer_head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + + bsz, tgt_len, _ = hidden_states.size() + + # get query proj + query_states = self.q_proj(hidden_states) * self.scaling + # get key, value proj + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_states = past_key_value[0] + value_states = past_key_value[1] + elif is_cross_attention: + # cross_attentions + key_states = self._shape(self.k_proj(key_value_states), -1, bsz) + value_states = self._shape(self.v_proj(key_value_states), -1, bsz) + elif past_key_value is not None: + # reuse k, v, self_attention + key_states = 
self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + else: + # self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_states, value_states) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) + key_states = key_states.view(*proj_shape) + value_states = value_states.view(*proj_shape) + + src_len = key_states.size(1) + attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) + + if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): + raise ValueError( + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, tgt_len, src_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" + ) + attn_weights = ( + attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + + attention_mask + ) + attn_weights = torch.max( + attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min) + ) + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + # upcast to fp32 if the weights are in fp16. Please see https://github.com/huggingface/transformers/pull/17437 + if attn_weights.dtype == torch.float16: + attn_weights = nn.functional.softmax( + attn_weights, dim=-1, dtype=torch.float32 + ).to(torch.float16) + else: + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + if layer_head_mask is not None: + if layer_head_mask.size() != (self.num_heads,): + raise ValueError( + f"Head mask for a single layer should be of size {(self.num_heads,)}, but is" + f" {layer_head_mask.size()}" + ) + attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view( + bsz, self.num_heads, tgt_len, src_len + ) + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + if output_attentions: + # this operation is a bit awkward, but it's required to + # make sure that attn_weights keeps its gradient. 
+ # In order to do so, attn_weights have to be reshaped + # twice and have to be reused in the following + attn_weights_reshaped = attn_weights.view( + bsz, self.num_heads, tgt_len, src_len + ) + attn_weights = attn_weights_reshaped.view( + bsz * self.num_heads, tgt_len, src_len + ) + else: + attn_weights_reshaped = None + + attn_probs = nn.functional.dropout( + attn_weights, p=self.dropout, training=self.training + ) + + attn_output = torch.bmm(attn_probs, value_states) + + if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) + attn_output = attn_output.transpose(1, 2) + + # Use the `hidden_size` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be + # partitioned aross GPUs when using tensor-parallelism. + attn_output = attn_output.reshape(bsz, tgt_len, self.hidden_size) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights_reshaped, past_key_value + + +class OPTDecoderLayer(nn.Module): + def __init__(self, layer_id: int, prefix: str, config: OPTConfig, weights): + super().__init__() + self.process_group = weights.process_group + self.hidden_size = config.hidden_size + prefix = f"{prefix + '.' if prefix else ''}decoder.layers.{layer_id}" + self.self_attn = OPTAttention( + config, + prefix=f"{prefix}.self_attn", + weights=weights, + is_decoder=True, + bias=config.enable_bias, + ) + self.do_layer_norm_before = config.do_layer_norm_before + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + + self.self_attn_layer_norm = nn.LayerNorm.load( + prefix=f"{prefix}.self_attn_layer_norm", weights=weights, eps=EPS + ) + self.fc1 = TensorParallelColumnLinear.load( + config, prefix=f"{prefix}.fc1", weights=weights, bias=config.enable_bias + ) + self.fc2 = TensorParallelRowLinear.load( + config, prefix=f"{prefix}.fc2", weights=weights, bias=config.enable_bias + ) + self.final_layer_norm = nn.LayerNorm.load( + prefix=f"{prefix}.final_layer_norm", weights=weights, eps=EPS + ) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + layer_head_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + ) -> Tuple[ + torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]] + ]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, hidden_size)` + attention_mask (`torch.FloatTensor`, *optional*): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + layer_head_mask (`torch.FloatTensor`, *optional*): mask for attention heads in a given layer of size + `(encoder_attention_heads,)`. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). 
+ past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + """ + + residual = hidden_states + + # 125m, 1.7B, ..., 175B applies layer norm BEFORE attention + if self.do_layer_norm_before: + hidden_states = self.self_attn_layer_norm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + past_key_value=past_key_value, + attention_mask=attention_mask, + layer_head_mask=layer_head_mask, + output_attentions=output_attentions, + ) + hidden_states = nn.functional.dropout( + hidden_states, p=self.dropout, training=self.training + ) + hidden_states = residual + hidden_states + + # 350m applies layer norm AFTER attention + if not self.do_layer_norm_before: + hidden_states = self.self_attn_layer_norm(hidden_states) + + # Fully Connected + hidden_states_shape = hidden_states.shape + hidden_states = hidden_states.reshape(-1, hidden_states.size(-1)) + residual = hidden_states + + # 125m, 1.7B, ..., 175B applies layer norm BEFORE attention + if self.do_layer_norm_before: + hidden_states = self.final_layer_norm(hidden_states) + + hidden_states = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout( + hidden_states, p=self.dropout, training=self.training + ) + + hidden_states = (residual + hidden_states).view(hidden_states_shape) + + # 350m applies layer norm AFTER attention + if not self.do_layer_norm_before: + hidden_states = self.final_layer_norm(hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +class OPTPreTrainedModel(PreTrainedModel): + config_class = OPTConfig + + +class OPTDecoder(OPTPreTrainedModel): + def __init__(self, prefix: str, config: OPTConfig, weights): + super().__init__(config) + self.dropout = config.dropout + self.layerdrop = config.layerdrop + self.padding_idx = config.pad_token_id + self.max_target_positions = config.max_position_embeddings + self.vocab_size = config.vocab_size + + prefix = prefix + "." 
if prefix else "" + + self.embed_tokens = TensorParallelEmbedding( + prefix=f"{prefix}decoder.embed_tokens", weights=weights + ) + self.embed_positions = OPTLearnedPositionalEmbedding(prefix, weights) + + if config.word_embed_proj_dim != config.hidden_size: + self.project_out = FastLinear.load( + config, + prefix=f"{prefix}decoder.project_out", + weights=weights, + bias=False, + ) + else: + self.project_out = None + + if config.word_embed_proj_dim != config.hidden_size: + self.project_in = FastLinear.load( + config, + prefix=f"{prefix}decoder.project_in", + weights=weights, + bias=False, + ) + else: + self.project_in = None + + # Note that the only purpose of `config._remove_final_layer_norm` is to keep backward compatibility + # with checkpoints that have been fine-tuned before transformers v4.20.1 + # see https://github.com/facebookresearch/metaseq/pull/164 + if config.do_layer_norm_before and not config._remove_final_layer_norm: + self.final_layer_norm = nn.LayerNorm.load( + prefix=f"{prefix}decoder.final_layer_norm", weights=weights, eps=EPS + ) + else: + self.final_layer_norm = None + + self.layers = nn.ModuleList( + [ + OPTDecoderLayer(layer_id, prefix, config, weights) + for layer_id in range(config.num_hidden_layers) + ] + ) + + # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask + def _prepare_decoder_attention_mask( + self, attention_mask, input_shape, inputs_embeds, past_key_values_length + ): + # create causal mask + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + combined_attention_mask = None + if input_shape[-1] > 1: + combined_attention_mask = _make_causal_mask( + input_shape, + inputs_embeds.dtype, + device=inputs_embeds.device, + past_key_values_length=past_key_values_length, + ) + + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + expanded_attn_mask = _expand_mask( + attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1] + ).to(inputs_embeds.device) + combined_attention_mask = ( + expanded_attn_mask + if combined_attention_mask is None + else expanded_attn_mask + combined_attention_mask + ) + + return combined_attention_mask + + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you + provide it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + head_mask (`torch.Tensor` of shape `(num_hidden_layers, num_attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules. 
Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the + cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those + that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of + all `decoder_input_ids` of shape `(batch_size, sequence_length)`. + + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + """ + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError( + "You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time" + ) + elif input_ids is not None: + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError( + "You have to specify either decoder_input_ids or decoder_inputs_embeds" + ) + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + batch_size, seq_length = input_shape + past_key_values_length = ( + past_key_values[0][0].shape[2] if past_key_values is not None else 0 + ) + # required mask seq length can be calculated via length of past + mask_seq_length = past_key_values_length + seq_length + + # embed positions + if attention_mask is None: + attention_mask = torch.ones( + batch_size, mask_seq_length, device=inputs_embeds.device + ) + causal_attention_mask = self._prepare_decoder_attention_mask( + attention_mask, input_shape, inputs_embeds, past_key_values_length + ) + pos_embeds = self.embed_positions(attention_mask, past_key_values_length) + + if self.project_in is not None: + inputs_embeds = self.project_in(inputs_embeds) + + hidden_states = inputs_embeds + pos_embeds + + 
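The `pos_embeds` lookup above goes through `OPTLearnedPositionalEmbedding`, which derives positions from the attention mask rather than from absolute indices, so left-padded batches still receive contiguous positions starting at the learned offset of 2. A minimal standalone sketch of that arithmetic (illustrative only, not part of the patch; it assumes nothing beyond a plain `torch` install):

```python
import torch

# Left-padded batch of one sequence: two pad tokens followed by three real tokens.
attention_mask = torch.tensor([[0, 0, 1, 1, 1]])

# Same arithmetic as OPTLearnedPositionalEmbedding.forward above.
positions = (
    torch.cumsum(attention_mask, dim=1).type_as(attention_mask) * attention_mask
).long() - 1
print(positions)           # tensor([[-1, -1,  0,  1,  2]]) -> pads stay at -1

offset = 2                 # OPT reserves the first two rows of the embedding table
print(positions + offset)  # tensor([[1, 1, 2, 3, 4]]) -> rows used for the embedding lookup
```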
# decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = () if use_cache else None + + # check if head_mask has a correct number of layers specified if desired + for attn_mask, mask_name in zip([head_mask], ["head_mask"]): + if attn_mask is not None: + if attn_mask.size()[0] != (len(self.layers)): + raise ValueError( + f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for" + f" {head_mask.size()[0]}." + ) + + for idx, decoder_layer in enumerate(self.layers): + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + if output_hidden_states: + all_hidden_states += (hidden_states,) + + dropout_probability = random.uniform(0, 1) + if self.training and (dropout_probability < self.layerdrop): + continue + + past_key_value = ( + past_key_values[idx] if past_key_values is not None else None + ) + + layer_outputs = decoder_layer( + hidden_states, + attention_mask=causal_attention_mask, + layer_head_mask=(head_mask[idx] if head_mask is not None else None), + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + if self.final_layer_norm is not None: + hidden_states = self.final_layer_norm(hidden_states) + + if self.project_out is not None: + hidden_states = self.project_out(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = next_decoder_cache if use_cache else None + if not return_dict: + return tuple( + v + for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] + if v is not None + ) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + +class OPTModel(OPTPreTrainedModel): + def __init__(self, prefix: str, config: OPTConfig, weights): + super().__init__(config) + self.decoder = OPTDecoder(prefix, config, weights) + # Initialize weights and apply final processing + + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + # decoder outputs consists of (dec_features, past_key_value, dec_hidden, dec_attn) + decoder_outputs = self.decoder( + input_ids=input_ids, + attention_mask=attention_mask, + head_mask=head_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + 
output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if not return_dict: + return decoder_outputs + + return BaseModelOutputWithPast( + last_hidden_state=decoder_outputs.last_hidden_state, + past_key_values=decoder_outputs.past_key_values, + hidden_states=decoder_outputs.hidden_states, + attentions=decoder_outputs.attentions, + ) + + +class OPTForCausalLM(OPTPreTrainedModel): + def __init__(self, prefix, config, weights): + super().__init__(config) + + self.model = OPTModel(prefix, config, weights) + + self.lm_head = SpeculativeHead.load( + config, + prefix=f"{prefix + '.' if prefix else ''}decoder.embed_tokens", + weights=weights, + ) + + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CausalLMOutputWithPast]: + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model.decoder( + input_ids=input_ids, + attention_mask=attention_mask, + head_mask=head_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + logits, speculative_logits = self.lm_head(outputs.last_hidden_state) + + loss = None + + return ( + CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ), + speculative_logits, + ) + + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + attention_mask=None, + inputs_embeds=None, + **kwargs, + ): + if past_key_values: + input_ids = input_ids[:, -1:] + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + } + ) + return model_inputs + + @staticmethod + def _reorder_cache(past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple( + past_state.index_select(0, beam_idx) for past_state in layer_past + ), + ) + return reordered_past diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/phi_modeling.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/phi_modeling.py new file mode 100644 index 000000000..3f2ed010f --- /dev/null +++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/phi_modeling.py @@ -0,0 +1,336 @@ +# imlementation of the PhiModel and PhiForCausalLM classes + +import torch 
+import torch.distributed + +import math +from torch import nn +from typing import Optional, List, Tuple +from transformers.configuration_utils import PretrainedConfig +from transformers.modeling_outputs import CausalLMOutputWithPast + +from text_generation_server.layers import ( + TensorParallelRowLinear, + TensorParallelColumnLinear, + TensorParallelEmbedding, + SpeculativeHead, + FastLinear, +) + + +# PhiConfig is the configuration class for the PhiModel. +class PhiConfig(PretrainedConfig): + def __init__( + self, + vocab_size=51200, + n_positions=2048, + n_embd=2560, + n_layer=32, + n_inner=None, + n_head=32, + rotary_dim=32, + layer_norm_epsilon=1e-5, + tie_word_embeddings=False, + pad_vocab_size_multiple=64, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + no_bias=False, + **kwargs, + ): + self.vocab_size = vocab_size + self.n_positions = n_positions + self.n_embd = n_embd + self.n_layer = n_layer + self.n_inner = n_inner + self.n_head = n_head + self.rotary_dim = rotary_dim + + self.layer_norm_epsilon = layer_norm_epsilon + self.tie_word_embeddings = tie_word_embeddings + self.pad_vocab_size_multiple = pad_vocab_size_multiple + self.pad_token_id = pad_token_id + self.bos_token_id = bos_token_id + self.eos_token_id = eos_token_id + self.no_bias = no_bias + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + +# RotaryEmbedding is a class that implements the rotary embedding. +class RotaryEmbedding(nn.Module): + def __init__(self, dim, max_seq_len): + super().__init__() + inv_freq = [1.0 / 10000.0 ** (i / dim) for i in range(0, dim, 2)] + inv_freq_len = len(inv_freq) + inv_freq = torch.tensor(inv_freq).view(1, inv_freq_len) + t = torch.arange(0, max_seq_len, dtype=torch.float).view(max_seq_len, 1) + freqs = t.matmul(inv_freq) + self.sin = freqs.sin() + self.cos = freqs.cos() + + def apply_rotary_emb_qkv(self, qkv, seqlen_offset): + b_size, seqlen, three, _, _headdim = qkv.shape + if three != 3: + raise Exception("unexpected shape for qkv") + _, rotary_dim = self.cos.shape + rotary_dim = rotary_dim * 2 + q_rot = qkv[:, :, 0, :, :rotary_dim] + q_pass = qkv[:, :, 0, :, rotary_dim:] + k_rot = qkv[:, :, 1, :, :rotary_dim] + k_pass = qkv[:, :, 1, :, rotary_dim:] + q12 = torch.chunk(q_rot, 2, dim=-1) + k12 = torch.chunk(k_rot, 2, dim=-1) + q1, q2 = q12[0], q12[1] + k1, k2 = k12[0], k12[1] + c = self.cos.narrow(0, seqlen_offset, seqlen).unsqueeze(1) + s = self.sin.narrow(0, seqlen_offset, seqlen).unsqueeze(1) + q_rot = torch.cat( + [ + q1 * c - q2 * s, + q1 * s + q2 * c, + ], + dim=-1, + ) + k_rot = torch.cat( + [ + k1 * c - k2 * s, + k1 * s + k2 * c, + ], + dim=-1, + ) + q = torch.cat([q_rot, q_pass], dim=-1) + k = torch.cat([k_rot, k_pass], dim=-1) + v = qkv[:, :, 2] + return q, k, v + + +# PhiCausalLMHead is the head of the PhiModel. It is a linear layer with a layer norm. +class PhiCausalLMHead(nn.Module): + def __init__(self, config, weights): + super().__init__() + self.ln = nn.LayerNorm.load( + prefix="lm_head.ln", + weights=weights, + eps=config.layer_norm_epsilon, + ) + self.linear = SpeculativeHead.load( + config=config, prefix="lm_head.linear", weights=weights + ) + + def forward(self, hidden_states): + hidden_states = self.ln(hidden_states) + hidden_states = self.linear(hidden_states) + return hidden_states + + +# PhiMHA is a multi-head attention layer. This layer uses an attention mask to prevent tokens from attending to subsequent tokens. 
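Before the attention layer itself, the rotary embedding defined above is worth seeing in isolation: it precomputes sin/cos tables from the inverse frequencies and rotates the first `rotary_dim` channels of the query and key in pairs. A self-contained sketch of that rotation on a single head (illustrative only, not part of the patch; it assumes `rotary_dim == head_dim` for brevity):

```python
import torch

def rotate_pairs(x, cos, sin):
    # Split the rotary channels into two halves and apply a 2-D rotation,
    # mirroring RotaryEmbedding.apply_rotary_emb_qkv above.
    x1, x2 = torch.chunk(x, 2, dim=-1)
    return torch.cat([x1 * cos - x2 * sin, x1 * sin + x2 * cos], dim=-1)

dim, max_seq_len, seq_len = 4, 8, 3
inv_freq = torch.tensor([1.0 / 10000.0 ** (i / dim) for i in range(0, dim, 2)]).view(1, -1)
t = torch.arange(max_seq_len, dtype=torch.float).view(-1, 1)
freqs = t @ inv_freq                       # (max_seq_len, dim // 2) sin/cos table

q = torch.randn(seq_len, dim)              # one head's query
q_rot = rotate_pairs(q, freqs.cos()[:seq_len], freqs.sin()[:seq_len])
print(q_rot.shape)                         # torch.Size([3, 4])
```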
+class PhiMHA(nn.Module): + def __init__(self, prefix, config, weights): + super().__init__() + self.Wqkv = TensorParallelColumnLinear.load( + config, prefix=f"{prefix}.Wqkv", weights=weights, bias=not config.no_bias + ) + self.out_proj = TensorParallelRowLinear.load( + config, + prefix=f"{prefix}.out_proj", + weights=weights, + bias=not config.no_bias, + ) + self.op_size = config.n_embd + self.head_dim = int(config.n_embd / config.n_head) + self.num_heads = config.n_head + self.rotary_emb = RotaryEmbedding( + config.rotary_dim, + config.n_positions, + ) + self.softmax_scale = 1.0 / math.sqrt(self.head_dim) + + def forward( + self, + hidden_states, + past_kv_cache, + attention_mask=None, + ): + b_size, seq_len, _n_embd = hidden_states.shape + qkv = self.Wqkv(hidden_states) + qkv = qkv.view(b_size, seq_len, 3, self.num_heads, self.head_dim) + seqlen_offset = 0 if past_kv_cache is None else past_kv_cache[0].shape[1] + q, k, v = self.rotary_emb.apply_rotary_emb_qkv(qkv, seqlen_offset) + + # if there is a kv_cache, then we need to concatenate + if past_kv_cache is not None: + prev_k, prev_v = past_kv_cache + k = torch.cat([prev_k, k], dim=1) + v = torch.cat([prev_v, v], dim=1) + + past_kv_cache = [k, v] + attn_weights = torch.einsum("bthd,bshd->bhts", q, k * self.softmax_scale) + + if attention_mask is not None: + seqlen_k = k.shape[1] + seqlen_q = q.shape[1] + causal_mask = torch.triu( + torch.full((seqlen_q, seqlen_k), -10000.0, device=attn_weights.device), + 1, + ) + attn_weights = attn_weights + causal_mask.to(dtype=attn_weights.dtype) + + attn_weights = torch.nn.functional.softmax(attn_weights, dim=-1) + attn_output = attn_weights.matmul(v.transpose(1, 2)).squeeze(0) + attn_output = ( + attn_output.view((b_size, self.num_heads, seq_len, self.head_dim)) + .transpose(1, 2) + .flatten(-2) + ) + return self.out_proj(attn_output), past_kv_cache + + +# PhiMLP is a multi-layer perceptron. It contains two linear layers with a gelu activation function. +class PhiMLP(nn.Module): + def __init__(self, prefix, config, weights): + super().__init__() + + self.n_inner = config.n_inner + self.fc1 = FastLinear.load( + config=config, + prefix=f"{prefix}.fc1", + weights=weights, + bias=False, + ) + self.fc2 = FastLinear.load( + config=config, + prefix=f"{prefix}.fc2", + weights=weights, + bias=False, + ) + self.activation = torch.nn.functional.gelu + + def forward(self, hidden_states): + hidden_states = self.fc1(hidden_states) + hidden_states = self.activation(hidden_states) + hidden_states = self.fc2(hidden_states) + return hidden_states + + +# PhiBlock is a single transformer block. It contains a layer norm, a multi-head attention layer and an multi-layer perceptron. 
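The additive causal mask used by `PhiMHA` above is easy to check in isolation: a strictly upper-triangular matrix of large negative values is added to the raw scores, so after the softmax each position attends only to itself and earlier positions. A tiny sketch (illustrative only, not part of the patch), shown here before the block definition that follows:

```python
import torch

seqlen_q = seqlen_k = 4
# Same construction as in PhiMHA.forward above: strictly-upper-triangular -10000.
causal_mask = torch.triu(torch.full((seqlen_q, seqlen_k), -10000.0), 1)

scores = torch.zeros(seqlen_q, seqlen_k)          # pretend raw attention scores
probs = torch.softmax(scores + causal_mask, dim=-1)
print(probs)  # row i puts (here, uniform) weight only on positions <= i
```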
+class PhiBlock(nn.Module): + def __init__(self, layer_id, config, weights): + super().__init__() + self.layer_id = layer_id + self.layer_norm = nn.LayerNorm.load( + prefix=f"{layer_id}.ln", weights=weights, eps=config.layer_norm_epsilon + ) + self.mixer = PhiMHA(prefix=f"{layer_id}.mixer", config=config, weights=weights) + self.mlp = PhiMLP(prefix=f"{layer_id}.mlp", config=config, weights=weights) + + def forward( + self, + hidden_states, + kv_cache, + attention_mask, + ): + residual = hidden_states + hidden_states = self.layer_norm(hidden_states) + attn_outputs, past_kv_cache = self.mixer( + hidden_states, kv_cache, attention_mask + ) + feed_forward_hidden_states = self.mlp(hidden_states) + out = attn_outputs + feed_forward_hidden_states + residual + return out, past_kv_cache + + +# PhiModel implements the embedding layer and the transformer blocks. +class PhiModel(nn.Module): + def __init__(self, prefix: str, config, weights): + super().__init__() + self.tp_rank = weights.process_group.rank() + self.tp_world_size = weights.process_group.size() + self.embed_tokens = TensorParallelEmbedding( + prefix=f"{prefix}.embd.wte", weights=weights + ) + self.blocks = nn.ModuleList( + [ + PhiBlock(f"{prefix}.h.{layer_id}", config, weights) + for layer_id in range(config.n_layer) + ] + ) + + def forward( + self, + input_ids: torch.LongTensor, + past_key_values: Optional[List[Tuple[torch.FloatTensor]]] = None, + attention_mask: Optional[torch.ByteTensor] = None, + return_dict: Optional[bool] = None, + use_cache: Optional[bool] = None, + ) -> Tuple[torch.Tensor, List[Tuple[torch.Tensor, torch.Tensor]]]: + hidden_states = self.embed_tokens(input_ids) + seq_len = hidden_states.shape[1] + mask = None if seq_len <= 1 else attention_mask + + past_key_values = ( + [None] * len(self.blocks) if past_key_values is None else past_key_values + ) + + for index, block in enumerate(self.blocks): + hidden_states, new_key_values = block( + hidden_states, past_key_values[index], mask + ) + past_key_values[index] = new_key_values + + return hidden_states, past_key_values + + +# PhiForCausalLM wraps the PhiModel and PhiCausalLMHead together and returns a CausalLMOutputWithPast object. 
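`PhiBlock` above uses a parallel residual: the attention branch and the MLP branch both consume the same layer-normed input, and their outputs are summed together with the untouched residual instead of being applied one after the other. A compact stand-in for that structure (illustrative only; plain `nn.Linear`/`nn.Sequential` modules stand in for `PhiMHA` and `PhiMLP`), shown before the causal-LM wrapper that follows:

```python
import torch
from torch import nn

hidden = 8
ln = nn.LayerNorm(hidden)
attn = nn.Linear(hidden, hidden)   # stand-in for PhiMHA
mlp = nn.Sequential(nn.Linear(hidden, hidden), nn.GELU(), nn.Linear(hidden, hidden))  # stand-in for PhiMLP

x = torch.randn(2, 5, hidden)
normed = ln(x)
out = attn(normed) + mlp(normed) + x   # same pattern as PhiBlock.forward: attn + ff + residual
print(out.shape)                       # torch.Size([2, 5, 8])
```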
+class PhiForCausalLM(torch.nn.Module): + def __init__(self, prefix: str, config, weights): + super().__init__() + + if not prefix: + prefix = "transformer" + else: + prefix = f"{prefix}.transformer" + + self.model = PhiModel(prefix, config, weights) + self.lm_head = PhiCausalLMHead(config, weights) + + def forward( + self, + input_ids: torch.LongTensor, + past_key_values: Optional[List[Tuple[torch.FloatTensor]]] = None, + attention_mask: Optional[torch.ByteTensor] = None, + return_dict: Optional[bool] = None, + use_cache: Optional[bool] = None, + labels: Optional[torch.LongTensor] = None, + ) -> Tuple[torch.Tensor, List[Tuple[torch.Tensor, torch.Tensor]]]: + model_output = self.model( + input_ids, past_key_values, attention_mask, return_dict, use_cache + ) + logits = self.lm_head(model_output[0]) + + loss = None + if labels is not None: + loss = nn.CrossEntropyLoss()( + logits[:, :-1].view(-1, logits.size(-1)), labels[:, 1:].view(-1) + ) + + if not return_dict: + return ( + ((loss,) + (logits,) + model_output[1:]) + if loss is not None + else (logits,) + model_output[1:] + ) + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=model_output[1], + hidden_states=None, + attentions=None, + ) diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/siglip.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/siglip.py new file mode 100644 index 000000000..95ac9edee --- /dev/null +++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/siglip.py @@ -0,0 +1,410 @@ +from typing import Optional, Tuple +import warnings +import math +import torch +from torch import nn + +from transformers.activations import ACT2FN +from transformers.modeling_outputs import ( + BaseModelOutputWithPooling, +) +from transformers import SiglipConfig, SiglipVisionConfig +from torch.nn.init import _calculate_fan_in_and_fan_out + +from text_generation_server.layers.tensor_parallel import ( + TensorParallelEmbedding, + TensorParallelColumnLinear, + TensorParallelRowLinear, +) + + +class SiglipVisionEmbeddings(nn.Module): + def __init__(self, prefix, config: SiglipVisionConfig, weights): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + self.patch_embedding = nn.Conv2d( + in_channels=config.num_channels, + out_channels=self.embed_dim, + kernel_size=self.patch_size, + stride=self.patch_size, + padding="valid", + ) + self.patch_embedding.weight = nn.Parameter( + weights.get_tensor(f"{prefix}.patch_embedding.weight"), requires_grad=False + ) + self.patch_embedding.bias = nn.Parameter( + weights.get_tensor(f"{prefix}.patch_embedding.bias"), requires_grad=False + ) + self.num_patches = (self.image_size // self.patch_size) ** 2 + self.num_positions = self.num_patches + self.position_embedding = TensorParallelEmbedding( + prefix=f"{prefix}.position_embedding", weights=weights + ) + self.register_buffer( + "position_ids", + torch.arange(self.num_positions, device=weights.device).expand((1, -1)), + persistent=False, + ) + + def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: + patch_embeds = self.patch_embedding( + pixel_values + ) # shape = [*, width, grid, grid] + embeddings = patch_embeds.flatten(2).transpose(1, 2) + + embeddings = embeddings + self.position_embedding(self.position_ids) + return embeddings + + +class SiglipAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + 
def __init__(self, prefix, config, weights): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + self.head_size = self.head_dim + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {self.num_heads})." + ) + self.num_heads = self.num_heads // weights.process_group.size() + self.embed_dim = self.embed_dim // weights.process_group.size() + self.scale = self.head_dim**-0.5 + self.dropout = config.attention_dropout + + self.k_proj = TensorParallelColumnLinear.load( + config, prefix=f"{prefix}.k_proj", weights=weights, bias=True + ) + self.v_proj = TensorParallelColumnLinear.load( + config, prefix=f"{prefix}.v_proj", weights=weights, bias=True + ) + self.q_proj = TensorParallelColumnLinear.load( + config, prefix=f"{prefix}.q_proj", weights=weights, bias=True + ) + self.out_proj = TensorParallelRowLinear.load( + config, prefix=f"{prefix}.out_proj", weights=weights, bias=True + ) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return ( + tensor.view(bsz, seq_len, self.num_heads, self.head_dim) + .transpose(1, 2) + .contiguous() + ) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + bsz, tgt_len, _ = hidden_states.size() + query_states = self.q_proj(hidden_states) + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) + key_states = key_states.view(*proj_shape) + value_states = value_states.view(*proj_shape) + + src_len = key_states.size(1) + # scale post matmul + attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) * self.scale + + if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): + raise ValueError( + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, tgt_len, src_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" + ) + attn_weights = ( + attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + + attention_mask + ) + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + # upcast attention to fp32 + attn_weights = nn.functional.softmax( + attn_weights, dim=-1, dtype=torch.float32 + ).to(attn_weights.dtype) + attn_weights = nn.functional.dropout( + attn_weights, p=self.dropout, training=self.training + ) + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_size): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_size)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_size) + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights + + +class 
SiglipMLP(nn.Module): + def __init__(self, prefix, config, weights): + super().__init__() + self.config = config + self.activation_fn = ACT2FN[config.hidden_act] + self.fc1 = TensorParallelColumnLinear.load( # config.hidden_size, config.intermediate_size + prefix=f"{prefix}.fc1", config=config, weights=weights, bias=True + ) + self.fc2 = TensorParallelRowLinear.load( # config.intermediate_size, config.hidden_size + prefix=f"{prefix}.fc2", config=config, weights=weights, bias=True + ) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states = self.fc2(hidden_states) + return hidden_states + + +class SiglipEncoderLayer(nn.Module): + def __init__(self, prefix, config: SiglipConfig, weights): + super().__init__() + self.embed_dim = config.hidden_size + self.self_attn = SiglipAttention( + prefix=f"{prefix}.self_attn", config=config, weights=weights + ) + self.layer_norm1 = nn.LayerNorm.load( + prefix=f"{prefix}.layer_norm1", weights=weights, eps=config.layer_norm_eps + ) + self.mlp = SiglipMLP(prefix=f"{prefix}.mlp", config=config, weights=weights) + self.layer_norm2 = nn.LayerNorm.load( + prefix=f"{prefix}.layer_norm2", weights=weights, eps=config.layer_norm_eps + ) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + ) -> Tuple[torch.FloatTensor]: + residual = hidden_states + hidden_states = self.layer_norm1(hidden_states) + hidden_states, attn_weights = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + ) + hidden_states = residual + hidden_states + residual = hidden_states + hidden_states = self.layer_norm2(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + return hidden_states, None + + +class SiglipMultiheadAttentionPoolingHead(nn.Module): + """Multihead Attention Pooling.""" + + def __init__(self, prefix, config: SiglipVisionConfig, weights): + super().__init__() + + self.probe = nn.Parameter(torch.randn(1, 1, config.hidden_size)) + self.attention = torch.nn.MultiheadAttention( + config.hidden_size, config.num_attention_heads, batch_first=True + ) + self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.mlp = SiglipMLP(prefix, config, weights) + + def forward(self, hidden_state): + batch_size = hidden_state.shape[0] + probe = self.probe.repeat(batch_size, 1, 1) + + hidden_state = self.attention(probe, hidden_state, hidden_state)[0] + + residual = hidden_state + hidden_state = self.layernorm(hidden_state) + hidden_state = residual + self.mlp(hidden_state) + + return hidden_state[:, 0] + + +def _trunc_normal_(tensor, mean, std, a, b): + # Cut & paste from PyTorch official master until it's in a few official releases - RW + # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf + def norm_cdf(x): + # Computes standard normal cumulative distribution function + return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0 + + if (mean < a - 2 * std) or (mean > b + 2 * std): + warnings.warn( + "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. " + "The distribution of values may be incorrect.", + stacklevel=2, + ) + + # Values are generated by using a truncated uniform distribution and + # then using the inverse CDF for the normal distribution. 
+ # Get upper and lower cdf values + lower = norm_cdf((a - mean) / std) + upper = norm_cdf((b - mean) / std) + + # Uniformly fill tensor with values from [l, u], then translate to + # [2l-1, 2u-1]. + tensor.uniform_(2 * lower - 1, 2 * upper - 1) + + # Use inverse cdf transform for normal distribution to get truncated + # standard normal + tensor.erfinv_() + + # Transform to proper mean, std + tensor.mul_(std * math.sqrt(2.0)) + tensor.add_(mean) + + # Clamp to ensure it's in the proper range + tensor.clamp_(min=a, max=b) + + +def trunc_normal_tf_( + tensor: torch.Tensor, + mean: float = 0.0, + std: float = 1.0, + a: float = -2.0, + b: float = 2.0, +) -> torch.Tensor: + """Fills the input Tensor with values drawn from a truncated + normal distribution. The values are effectively drawn from the + normal distribution :math:`\\mathcal{N}(\text{mean}, \text{std}^2)` + with values outside :math:`[a, b]` redrawn until they are within + the bounds. The method used for generating the random values works + best when :math:`a \\leq \text{mean} \\leq b`. + + NOTE: this 'tf' variant behaves closer to Tensorflow / JAX impl where the + bounds [a, b] are applied when sampling the normal distribution with mean=0, std=1.0 + and the result is subsquently scaled and shifted by the mean and std args. + + Args: + tensor: an n-dimensional `torch.Tensor` + mean: the mean of the normal distribution + std: the standard deviation of the normal distribution + a: the minimum cutoff value + b: the maximum cutoff value + """ + with torch.no_grad(): + _trunc_normal_(tensor, 0, 1.0, a, b) + tensor.mul_(std).add_(mean) + + +def variance_scaling_(tensor, scale=1.0, mode="fan_in", distribution="normal"): + fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor) + if mode == "fan_in": + denom = fan_in + elif mode == "fan_out": + denom = fan_out + elif mode == "fan_avg": + denom = (fan_in + fan_out) / 2 + + variance = scale / denom + + if distribution == "truncated_normal": + # constant is stddev of standard normal truncated to (-2, 2) + trunc_normal_tf_(tensor, std=math.sqrt(variance) / 0.87962566103423978) + elif distribution == "normal": + with torch.no_grad(): + tensor.normal_(std=math.sqrt(variance)) + elif distribution == "uniform": + bound = math.sqrt(3 * variance) + with torch.no_grad(): + tensor.uniform_(-bound, bound) + else: + raise ValueError(f"invalid distribution {distribution}") + + +def lecun_normal_(tensor): + variance_scaling_(tensor, mode="fan_in", distribution="truncated_normal") + + +def default_flax_embed_init(tensor): + variance_scaling_(tensor, mode="fan_in", distribution="normal") + + +class SiglipEncoder(nn.Module): + """ + Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a + [`SiglipEncoderLayer`]. 
+ + Args: + config: SiglipConfig + """ + + def __init__(self, prefix, config: SiglipConfig, weights): + super().__init__() + self.config = config + self.layers = nn.ModuleList( + [ + SiglipEncoderLayer( + prefix=f"{prefix}.layers.{i}", config=config, weights=weights + ) + for i in range(config.num_hidden_layers) + ] + ) + + def forward( + self, + inputs_embeds, + attention_mask: Optional[torch.Tensor] = None, + ): + hidden_states = inputs_embeds + for idx, encoder_layer in enumerate(self.layers): + hidden_states, _ = encoder_layer( + hidden_states, + attention_mask, + ) + + return hidden_states + + +class SiglipVisionTransformer(nn.Module): + def __init__(self, prefix, config: SiglipVisionConfig, weights): + super().__init__() + self.config = config + + self.embeddings = SiglipVisionEmbeddings( + prefix=f"{prefix}.embeddings", config=config, weights=weights + ) + self.encoder = SiglipEncoder( + prefix=f"{prefix}.encoder", config=config, weights=weights + ) + + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + ): + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + hidden_states = self.embeddings(pixel_values) + + # NOTE: up until this point, the code logits are exactly + # the same as the transformers code. The values evaulate + # slightly differently in our encoder layer. + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + ) + last_hidden_state = encoder_outputs + + return BaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + # pooler_output=pooled_output, + # hidden_states=encoder_outputs, + ) diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/t5_modeling.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/t5_modeling.py new file mode 100644 index 000000000..e6666acd3 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/t5_modeling.py @@ -0,0 +1,1227 @@ +# coding=utf-8 +# Copyright 2018 Mesh TensorFlow authors, T5 Authors and HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
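Before the T5 implementation below, a brief shape walk-through of the SigLIP vision tower defined above may help: the patch embedding turns an image into a sequence of patch tokens, and the encoder keeps that `(batch, patches, hidden)` shape all the way through. The numbers here are illustrative defaults, not values taken from the patch:

```python
import torch
from torch import nn

image_size, patch_size, hidden_size = 224, 16, 768   # assumed example values
patch_embed = nn.Conv2d(3, hidden_size, kernel_size=patch_size, stride=patch_size)

pixel_values = torch.randn(2, 3, image_size, image_size)
patches = patch_embed(pixel_values)                   # (2, 768, 14, 14)
tokens = patches.flatten(2).transpose(1, 2)           # (2, 196, 768), as in SiglipVisionEmbeddings
print(tokens.shape)
```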
+""" PyTorch T5 model.""" + +import copy +import math +import warnings +from typing import Optional, Tuple, Union + +from loguru import logger + +import torch +import torch.distributed +from torch import nn +from torch.nn import CrossEntropyLoss + +from transformers.activations import ACT2FN +from transformers.modeling_outputs import ( + BaseModelOutput, + BaseModelOutputWithPastAndCrossAttentions, + Seq2SeqLMOutput, +) +from transformers.modeling_utils import PreTrainedModel +from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS +from transformers.utils import ( + is_torch_fx_proxy, +) +from transformers import T5Config +from text_generation_server.layers import ( + TensorParallelColumnLinear, + TensorParallelEmbedding, + TensorParallelRowLinear, + SpeculativeHead, +) + +# copied from https://github.com/huggingface/transformers/blob/cd4584e3c809bb9e1392ccd3fe38b40daba5519a/src/transformers/models/t5/modeling_t5.py#L1316 +# Warning message for FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask +__HEAD_MASK_WARNING_MSG = """ +The input argument `head_mask` was split into two arguments `head_mask` and `decoder_head_mask`. Currently, +`decoder_head_mask` is set to copy `head_mask`, but this feature is deprecated and will be removed in future versions. +If you do not want to use any `decoder_head_mask` now, please set `decoder_head_mask = torch.ones(num_layers, +num_heads)`. +""" + + +class PartialTPEmbedding(nn.Module): + def __init__(self, prefix: str, weights): + super().__init__() + weight = weights.get_sharded(f"{prefix}.weight", dim=1) + self.weight = nn.Parameter(weight) + + def forward(self, input: torch.Tensor) -> torch.Tensor: + return torch.nn.functional.embedding(input, self.weight) + + +@torch.jit.script +def layer_norm(hidden_states, weight, epsilon): + # T5 uses a layer_norm which only scales and doesn't shift, which is also known as Root Mean + # Square Layer Normalization https://arxiv.org/abs/1910.07467 thus varience is calculated + # w/o mean and there is no bias. Additionally we want to make sure that the accumulation for + # half-precision inputs is done in fp32 + + variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + epsilon) + + # convert into half-precision if necessary + if weight.dtype in [torch.float16, torch.bfloat16]: + hidden_states = hidden_states.to(weight.dtype) + + return weight * hidden_states + + +class T5LayerNorm(nn.Module): + def __init__(self, prefix, weights, eps=1e-6): + """ + Construct a layernorm module in the T5 style. No bias and no subtraction of mean. 
+ """ + super().__init__() + weight = weights.get_tensor(f"{prefix}.weight") + self.weight = nn.Parameter(weight) + self.variance_epsilon = torch.tensor(eps) + + def forward(self, hidden_states): + return layer_norm(hidden_states, self.weight, self.variance_epsilon) + + +try: + from apex.normalization import FusedRMSNorm + + T5LayerNorm = FusedRMSNorm # noqa + + logger.info( + "Discovered apex.normalization.FusedRMSNorm - will use it instead of T5LayerNorm" + ) +except ImportError: + # using the normal T5LayerNorm + pass +except Exception: + logger.warning("discovered apex but it failed to load, falling back to T5LayerNorm") + pass + +ALL_LAYERNORM_LAYERS.append(T5LayerNorm) + + +class T5DenseActDense(nn.Module): + def __init__(self, config: T5Config, prefix, weights): + super().__init__() + self.wi = TensorParallelColumnLinear.load( + config, prefix=f"{prefix}.wi", weights=weights, bias=False + ) + + ### XXX: T5 models do not handle well both f16 and quantization. + ### Overidding specifically this layer for that reason. + ### https://github.com/huggingface/transformers/blob/main/src/transformers/models/t5/modeling_t5.py#L316 + ### https://github.com/huggingface/transformers/issues/20287 + _q = config.quantize + _dtype = weights.dtype + weights.dtype = torch.float32 + config.quantize = None + self.wo_cast = (torch.float32, _dtype) + self.wo = TensorParallelRowLinear.load( + config, prefix=f"{prefix}.wo", weights=weights, bias=False + ) + weights.dtype = _dtype + config.quantize = _q + + self.dropout = nn.Dropout(config.dropout_rate) + self.act = ( + ACT2FN[config.dense_act_fn] + if "gelu" not in config.dense_act_fn + else lambda x: torch.nn.functional.gelu(x, approximate="tanh") + ) + + def forward(self, hidden_states): + hidden_states = self.wi(hidden_states) + hidden_states = self.act(hidden_states) + hidden_states = self.dropout(hidden_states) + + hidden_states = hidden_states.to(dtype=self.wo_cast[0]) + hidden_states = self.wo(hidden_states) + # XXX: Recasting is already done within the layer norm. + # Casting back to float16 here modifies results + # hidden_states = hidden_states.to(dtype=self.wo_cast[1]) + return hidden_states + + +class T5DenseGatedActDense(nn.Module): + def __init__(self, config: T5Config, prefix, weights): + super().__init__() + self.wi_0 = TensorParallelColumnLinear.load( + config, prefix=f"{prefix}.wi_0", weights=weights, bias=False + ) + self.wi_1 = TensorParallelColumnLinear.load( + config, prefix=f"{prefix}.wi_1", weights=weights, bias=False + ) + ### XXX: T5 models do not handle well both f16 and quantization. + ### Overidding specifically this layer for that reason. 
+ ### https://github.com/huggingface/transformers/blob/main/src/transformers/models/t5/modeling_t5.py#L316 + ### https://github.com/huggingface/transformers/issues/20287 + _q = config.quantize + _dtype = weights.dtype + weights.dtype = torch.float32 + config.quantize = None + self.wo_cast = (torch.float32, _dtype) + self.wo = TensorParallelRowLinear.load( + config, prefix=f"{prefix}.wo", weights=weights, bias=False + ) + weights.dtype = _dtype + config.quantize = _q + + self.dropout = nn.Dropout(config.dropout_rate) + self.act = ( + ACT2FN[config.dense_act_fn] + if "gelu" not in config.dense_act_fn + else lambda x: torch.nn.functional.gelu(x, approximate="tanh") + ) + + def forward(self, hidden_states): + hidden_gelu = self.act(self.wi_0(hidden_states)) + hidden_linear = self.wi_1(hidden_states) + hidden_states = hidden_gelu * hidden_linear + hidden_states = self.dropout(hidden_states) + + hidden_states = hidden_states.to(dtype=self.wo_cast[0]) + hidden_states = self.wo(hidden_states) + # XXX: Recasting is already done within the layer norm. + # Casting back to float16 here modifies results + # hidden_states = hidden_states.to(dtype=self.wo_cast[1]) + return hidden_states + + +class T5LayerFF(nn.Module): + def __init__(self, config: T5Config, prefix, weights): + super().__init__() + if config.is_gated_act: + self.DenseReluDense = T5DenseGatedActDense( + config, prefix=f"{prefix}.DenseReluDense", weights=weights + ) + else: + self.DenseReluDense = T5DenseActDense( + config, prefix=f"{prefix}.DenseReluDense", weights=weights + ) + + self.layer_norm = T5LayerNorm( + prefix=f"{prefix}.layer_norm", + weights=weights, + eps=config.layer_norm_epsilon, + ) + self.dropout = nn.Dropout(config.dropout_rate) + + def forward(self, hidden_states): + forwarded_states = self.layer_norm(hidden_states) + forwarded_states = self.DenseReluDense(forwarded_states) + hidden_states = hidden_states + self.dropout(forwarded_states) + return hidden_states + + +class T5Attention(nn.Module): + def __init__( + self, config: T5Config, prefix, weights, has_relative_attention_bias=False + ): + super().__init__() + self.is_decoder = config.is_decoder + self.has_relative_attention_bias = has_relative_attention_bias + self.relative_attention_num_buckets = config.relative_attention_num_buckets + self.relative_attention_max_distance = config.relative_attention_max_distance + self.d_model = config.d_model + self.key_value_proj_dim = config.d_kv + self.n_heads = config.num_heads + self.dropout = config.dropout_rate + self.inner_dim = self.n_heads * self.key_value_proj_dim + + process_group = weights.process_group + # Mesh TensorFlow initialization to avoid scaling before softmax + assert self.n_heads % process_group.size() == 0 + self.q = TensorParallelColumnLinear.load( + config, prefix=f"{prefix}.q", weights=weights, bias=False + ) + self.k = TensorParallelColumnLinear.load( + config, prefix=f"{prefix}.k", weights=weights, bias=False + ) + self.v = TensorParallelColumnLinear.load( + config, prefix=f"{prefix}.v", weights=weights, bias=False + ) + self.o = TensorParallelRowLinear.load( + config, prefix=f"{prefix}.o", weights=weights, bias=False + ) + if self.n_heads % weights.process_group.size() != 0: + raise ValueError( + f"`n_heads` must be divisible by `num_shards` (got `n_heads`: {self.n_heads} " + f"and `num_shards`: {weights.process_group.size()}" + ) + self.n_heads = self.n_heads // process_group.size() + self.inner_dim = self.inner_dim // process_group.size() + + if self.has_relative_attention_bias: + 
self.relative_attention_bias = PartialTPEmbedding( + prefix=f"{prefix}.relative_attention_bias", weights=weights + ) + + @staticmethod + def _relative_position_bucket( + relative_position, bidirectional=True, num_buckets=32, max_distance=128 + ): + """ + Adapted from Mesh Tensorflow: + https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593 + + Translate relative position to a bucket number for relative attention. The relative position is defined as + memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to + position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for + small absolute relative_position and larger buckets for larger absolute relative_positions. All relative + positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket. + This should allow for more graceful generalization to longer sequences than the model has been trained on + + Args: + relative_position: an int32 Tensor + bidirectional: a boolean - whether the attention is bidirectional + num_buckets: an integer + max_distance: an integer + + Returns: + a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets) + """ + relative_buckets = 0 + if bidirectional: + num_buckets //= 2 + relative_buckets += (relative_position > 0).to(torch.long) * num_buckets + relative_position = torch.abs(relative_position) + else: + relative_position = -torch.min( + relative_position, torch.zeros_like(relative_position) + ) + # now relative_position is in the range [0, inf) + + # half of the buckets are for exact increments in positions + max_exact = num_buckets // 2 + is_small = relative_position < max_exact + + # The other half of the buckets are for logarithmically bigger bins in positions up to max_distance + relative_position_if_large = max_exact + ( + torch.log(relative_position.float() / max_exact) + / math.log(max_distance / max_exact) + * (num_buckets - max_exact) + ).to(torch.long) + relative_position_if_large = torch.min( + relative_position_if_large, + torch.full_like(relative_position_if_large, num_buckets - 1), + ) + + relative_buckets += torch.where( + is_small, relative_position, relative_position_if_large + ) + return relative_buckets + + def compute_bias(self, query_length, key_length, device=None): + """Compute binned relative position bias""" + if device is None: + device = self.relative_attention_bias.weight.device + context_position = torch.arange(query_length, dtype=torch.long, device=device)[ + :, None + ] + memory_position = torch.arange(key_length, dtype=torch.long, device=device)[ + None, : + ] + relative_position = ( + memory_position - context_position + ) # shape (query_length, key_length) + relative_position_bucket = self._relative_position_bucket( + relative_position, # shape (query_length, key_length) + bidirectional=(not self.is_decoder), + num_buckets=self.relative_attention_num_buckets, + max_distance=self.relative_attention_max_distance, + ) + values = self.relative_attention_bias( + relative_position_bucket + ) # shape (query_length, key_length, num_heads) + values = values.permute([2, 0, 1]).unsqueeze( + 0 + ) # shape (1, num_heads, query_length, key_length) + return values + + def forward( + self, + hidden_states, + mask=None, + key_value_states=None, + position_bias=None, + past_key_value=None, + layer_head_mask=None, + 
query_length=None, + use_cache=False, + output_attentions=False, + ): + """ + Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states). + """ + # Input is (batch_size, seq_length, dim) + # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length) + # past_key_value[0] is (batch_size, n_heads, q_len - 1, dim_per_head) + + batch_size, seq_length = hidden_states.shape[:2] + + real_seq_length = seq_length + + if past_key_value is not None: + assert ( + len(past_key_value) == 2 + ), f"past_key_value should have 2 past states: keys and values. Got {len(past_key_value)} past states" + real_seq_length += ( + past_key_value[0].shape[2] if query_length is None else query_length + ) + + key_length = ( + real_seq_length if key_value_states is None else key_value_states.shape[1] + ) + + def shape(states): + """projection""" + return states.view( + batch_size, -1, self.n_heads, self.key_value_proj_dim + ).transpose(1, 2) + + def unshape(states): + """reshape""" + return ( + states.transpose(1, 2).contiguous().view(batch_size, -1, self.inner_dim) + ) + + def project(hidden_states, proj_layer, key_value_states, past_key_value): + """projects hidden states correctly to key/query states""" + if key_value_states is None: + # self-attn + # (batch_size, n_heads, seq_length, dim_per_head) + hidden_states = shape(proj_layer(hidden_states)) + elif past_key_value is None: + # cross-attn + # (batch_size, n_heads, seq_length, dim_per_head) + hidden_states = shape(proj_layer(key_value_states)) + + if past_key_value is not None: + if key_value_states is None: + # self-attn + # (batch_size, n_heads, key_length, dim_per_head) + hidden_states = torch.cat([past_key_value, hidden_states], dim=2) + elif past_key_value.shape[2] != key_value_states.shape[1]: + # checking that the `sequence_length` of the `past_key_value` is the same as + # the provided `key_value_states` to support prefix tuning + # cross-attn + # (batch_size, n_heads, seq_length, dim_per_head) + hidden_states = shape(proj_layer(key_value_states)) + else: + # cross-attn + hidden_states = past_key_value + return hidden_states + + # get query states + query_states = shape( + self.q(hidden_states) + ) # (batch_size, n_heads, seq_length, dim_per_head) + + # get key/value states + key_states = project( + hidden_states, + self.k, + key_value_states, + past_key_value[0] if past_key_value is not None else None, + ) + value_states = project( + hidden_states, + self.v, + key_value_states, + past_key_value[1] if past_key_value is not None else None, + ) + + # compute scores + scores = torch.matmul( + query_states, key_states.transpose(3, 2) + ) # equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9 + + if position_bias is None: + if not self.has_relative_attention_bias: + position_bias = torch.zeros( + (1, self.n_heads, real_seq_length, key_length), + device=scores.device, + dtype=scores.dtype, + ) + else: + position_bias = self.compute_bias( + real_seq_length, key_length, device=scores.device + ) + + # if key and values are already calculated + # we want only the last query position bias + if past_key_value is not None: + position_bias = position_bias[:, :, -hidden_states.size(1) :, :] + + if mask is not None: + position_bias = ( + position_bias + mask + ) # (batch_size, n_heads, seq_length, key_length) + + position_bias_masked = position_bias + + scores += position_bias_masked + attn_weights = nn.functional.softmax(scores.float(), 
dim=-1).type_as( + scores + ) # (batch_size, n_heads, seq_length, key_length) + attn_weights = nn.functional.dropout( + attn_weights, p=self.dropout, training=self.training + ) # (batch_size, n_heads, seq_length, key_length) + + # Mask heads if we want to + if layer_head_mask is not None: + attn_weights = attn_weights * layer_head_mask + + attn_output = unshape( + torch.matmul(attn_weights, value_states) + ) # (batch_size, seq_length, dim) + attn_output = self.o(attn_output) + + present_key_value_state = ( + (key_states, value_states) if (self.is_decoder and use_cache) else None + ) + outputs = (attn_output,) + (present_key_value_state,) + (position_bias,) + + if output_attentions: + outputs = outputs + (attn_weights,) + return outputs + + +class T5LayerSelfAttention(nn.Module): + def __init__(self, config, prefix, weights, has_relative_attention_bias=False): + super().__init__() + self.SelfAttention = T5Attention( + config, + prefix=f"{prefix}.SelfAttention", + weights=weights, + has_relative_attention_bias=has_relative_attention_bias, + ) + self.layer_norm = T5LayerNorm( + prefix=f"{prefix}.layer_norm", + weights=weights, + eps=config.layer_norm_epsilon, + ) + self.dropout = nn.Dropout(config.dropout_rate) + + def forward( + self, + hidden_states, + attention_mask=None, + position_bias=None, + layer_head_mask=None, + past_key_value=None, + use_cache=False, + output_attentions=False, + ): + normed_hidden_states = self.layer_norm(hidden_states) + attention_output = self.SelfAttention( + normed_hidden_states, + mask=attention_mask, + position_bias=position_bias, + layer_head_mask=layer_head_mask, + past_key_value=past_key_value, + use_cache=use_cache, + output_attentions=output_attentions, + ) + hidden_states = hidden_states + self.dropout(attention_output[0]) + outputs = (hidden_states,) + attention_output[ + 1: + ] # add attentions if we output them + return outputs + + +class T5LayerCrossAttention(nn.Module): + def __init__(self, config, prefix, weights): + super().__init__() + self.EncDecAttention = T5Attention( + config, + prefix=f"{prefix}.EncDecAttention", + weights=weights, + has_relative_attention_bias=False, + ) + self.layer_norm = T5LayerNorm( + prefix=f"{prefix}.layer_norm", + weights=weights, + eps=config.layer_norm_epsilon, + ) + self.dropout = nn.Dropout(config.dropout_rate) + + def forward( + self, + hidden_states, + key_value_states, + attention_mask=None, + position_bias=None, + layer_head_mask=None, + past_key_value=None, + use_cache=False, + query_length=None, + output_attentions=False, + ): + normed_hidden_states = self.layer_norm(hidden_states) + attention_output = self.EncDecAttention( + normed_hidden_states, + mask=attention_mask, + key_value_states=key_value_states, + position_bias=position_bias, + layer_head_mask=layer_head_mask, + past_key_value=past_key_value, + use_cache=use_cache, + query_length=query_length, + output_attentions=output_attentions, + ) + layer_output = hidden_states + self.dropout(attention_output[0]) + outputs = (layer_output,) + attention_output[ + 1: + ] # add attentions if we output them + return outputs + + +class T5Block(nn.Module): + def __init__(self, config, prefix, weights, has_relative_attention_bias: bool): + super().__init__() + self.is_decoder = config.is_decoder + self.layer = nn.ModuleList() + self.layer.append( + T5LayerSelfAttention( + config, + prefix=f"{prefix}.layer.0", + weights=weights, + has_relative_attention_bias=has_relative_attention_bias, + ) + ) + if self.is_decoder: + i = 2 + self.layer.append( + 
T5LayerCrossAttention( + config, prefix=f"{prefix}.layer.1", weights=weights + ) + ) + else: + i = 1 + + self.layer.append( + T5LayerFF(config, prefix=f"{prefix}.layer.{i}", weights=weights) + ) + + def forward( + self, + hidden_states, + attention_mask=None, + position_bias=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + encoder_decoder_position_bias=None, + layer_head_mask=None, + cross_attn_layer_head_mask=None, + past_key_value=None, + use_cache=False, + output_attentions=False, + return_dict=True, + ): + if past_key_value is not None: + if not self.is_decoder: + logger.warning( + "`past_key_values` is passed to the encoder. Please make sure this is intended." + ) + expected_num_past_key_values = 2 if encoder_hidden_states is None else 4 + + if len(past_key_value) != expected_num_past_key_values: + raise ValueError( + f"There should be {expected_num_past_key_values} past states. " + f"{'2 (past / key) for cross attention. ' if expected_num_past_key_values == 4 else ''}" + f"Got {len(past_key_value)} past key / value states" + ) + + self_attn_past_key_value = past_key_value[:2] + cross_attn_past_key_value = past_key_value[2:] + else: + self_attn_past_key_value, cross_attn_past_key_value = None, None + + self_attention_outputs = self.layer[0]( + hidden_states, + attention_mask=attention_mask, + position_bias=position_bias, + layer_head_mask=layer_head_mask, + past_key_value=self_attn_past_key_value, + use_cache=use_cache, + output_attentions=output_attentions, + ) + hidden_states, present_key_value_state = self_attention_outputs[:2] + attention_outputs = self_attention_outputs[ + 2: + ] # Keep self-attention outputs and relative position weights + + # clamp inf values to enable fp16 training + if hidden_states.dtype == torch.float16: + clamp_value = torch.where( + torch.isinf(hidden_states).any(), + torch.finfo(hidden_states.dtype).max - 1000, + torch.finfo(hidden_states.dtype).max, + ) + hidden_states = torch.clamp( + hidden_states, min=-clamp_value, max=clamp_value + ) + + do_cross_attention = self.is_decoder and encoder_hidden_states is not None + if do_cross_attention: + # the actual query length is unknown for cross attention + # if using past key value states. 
Need to inject it here + if present_key_value_state is not None: + query_length = present_key_value_state[0].shape[2] + else: + query_length = None + + cross_attention_outputs = self.layer[1]( + hidden_states, + key_value_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + position_bias=encoder_decoder_position_bias, + layer_head_mask=cross_attn_layer_head_mask, + past_key_value=cross_attn_past_key_value, + query_length=query_length, + use_cache=use_cache, + output_attentions=output_attentions, + ) + hidden_states = cross_attention_outputs[0] + + # clamp inf values to enable fp16 training + if hidden_states.dtype == torch.float16: + clamp_value = torch.where( + torch.isinf(hidden_states).any(), + torch.finfo(hidden_states.dtype).max - 1000, + torch.finfo(hidden_states.dtype).max, + ) + hidden_states = torch.clamp( + hidden_states, min=-clamp_value, max=clamp_value + ) + + # Combine self attn and cross attn key value states + if present_key_value_state is not None: + present_key_value_state = ( + present_key_value_state + cross_attention_outputs[1] + ) + + # Keep cross-attention outputs and relative position weights + attention_outputs = attention_outputs + cross_attention_outputs[2:] + + # Apply Feed Forward layer + hidden_states = self.layer[-1](hidden_states) + + # clamp inf values to enable fp16 training + if hidden_states.dtype == torch.float16: + clamp_value = torch.where( + torch.isinf(hidden_states).any(), + torch.finfo(hidden_states.dtype).max - 1000, + torch.finfo(hidden_states.dtype).max, + ) + hidden_states = torch.clamp( + hidden_states, min=-clamp_value, max=clamp_value + ) + + outputs = (hidden_states,) + + if use_cache: + outputs = outputs + (present_key_value_state,) + attention_outputs + else: + outputs = outputs + attention_outputs + + return outputs # hidden-states, present_key_value_states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) + + +class T5PreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = T5Config + + def _shift_right(self, input_ids): + decoder_start_token_id = self.config.decoder_start_token_id + pad_token_id = self.config.pad_token_id + + assert decoder_start_token_id is not None, ( + "self.model.config.decoder_start_token_id has to be defined. In T5 it is usually set to the pad_token_id." + " See T5 docs for more information" + ) + + # shift inputs to the right + if is_torch_fx_proxy(input_ids): + # Item assignment is not supported natively for proxies. + shifted_input_ids = torch.full( + input_ids.shape[:-1] + (1,), decoder_start_token_id + ) + shifted_input_ids = torch.cat( + [shifted_input_ids, input_ids[..., :-1]], dim=-1 + ) + else: + shifted_input_ids = input_ids.new_zeros(input_ids.shape) + shifted_input_ids[..., 1:] = input_ids[..., :-1].clone() + shifted_input_ids[..., 0] = decoder_start_token_id + + assert ( + pad_token_id is not None + ), "self.model.config.pad_token_id has to be defined." 
+ # replace possible -100 values in labels by `pad_token_id` + shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id) + + return shifted_input_ids + + +class T5Stack(T5PreTrainedModel): + def __init__(self, config, prefix, weights, embed_tokens): + super().__init__(config) + + self.is_decoder = config.is_decoder + + self.embed_tokens = embed_tokens + self.block = nn.ModuleList( + [ + T5Block( + config, + prefix=f"{prefix}.block.{layer_id}", + weights=weights, + has_relative_attention_bias=(layer_id == 0), + ) + for layer_id in range(config.num_layers) + ] + ) + self.final_layer_norm = T5LayerNorm( + prefix=f"{prefix}.final_layer_norm", + weights=weights, + eps=config.layer_norm_epsilon, + ) + self.dropout = nn.Dropout(config.dropout_rate) + + def forward( + self, + input_ids=None, + attention_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + inputs_embeds=None, + head_mask=None, + cross_attn_head_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + # Model parallel + use_cache = use_cache if use_cache is not None else self.config.use_cache + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + if input_ids is not None and inputs_embeds is not None: + err_msg_prefix = "decoder_" if self.is_decoder else "" + raise ValueError( + f"You cannot specify both {err_msg_prefix}input_ids and {err_msg_prefix}inputs_embeds at the same time" + ) + elif input_ids is not None: + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + err_msg_prefix = "decoder_" if self.is_decoder else "" + raise ValueError( + f"You have to specify either {err_msg_prefix}input_ids or {err_msg_prefix}inputs_embeds" + ) + + if inputs_embeds is None: + assert ( + self.embed_tokens is not None + ), "You have to initialize the model with valid token embeddings" + inputs_embeds = self.embed_tokens(input_ids) + + batch_size, seq_length = input_shape + + # required mask seq length can be calculated via length of past + mask_seq_length = ( + past_key_values[0][0].shape[2] + seq_length + if past_key_values is not None + else seq_length + ) + + if use_cache is True: + assert ( + self.is_decoder + ), f"`use_cache` can only be set to `True` if {self} is used as a decoder" + + if attention_mask is None: + attention_mask = torch.ones( + batch_size, mask_seq_length, device=inputs_embeds.device + ) + if ( + self.is_decoder + and encoder_attention_mask is None + and encoder_hidden_states is not None + ): + encoder_seq_length = encoder_hidden_states.shape[1] + encoder_attention_mask = torch.ones( + batch_size, + encoder_seq_length, + device=inputs_embeds.device, + dtype=torch.long, + ) + + # initialize past_key_values with `None` if past does not exist + if past_key_values is None: + past_key_values = [None] * len(self.block) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. 
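+        # `get_extended_attention_mask` (inherited from transformers' `ModuleUtilsMixin`) turns the
+        # mask into an additive bias broadcastable to (batch_size, num_heads, seq_length, seq_length):
+        # 0.0 for positions to attend to and a large negative value for masked positions,
+        # with causality folded in when the stack is a decoder.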
+ extended_attention_mask = self.get_extended_attention_mask( + attention_mask, input_shape + ) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.is_decoder and encoder_hidden_states is not None: + ( + encoder_batch_size, + encoder_sequence_length, + _, + ) = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = torch.ones( + encoder_hidden_shape, device=inputs_embeds.device + ) + encoder_extended_attention_mask = self.invert_attention_mask( + encoder_attention_mask + ) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + head_mask = self.get_head_mask(head_mask, self.config.num_layers) + cross_attn_head_mask = self.get_head_mask( + cross_attn_head_mask, self.config.num_layers + ) + present_key_value_states = () if use_cache else None + all_hidden_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + all_cross_attentions = () if (output_attentions and self.is_decoder) else None + position_bias = None + encoder_decoder_position_bias = None + + hidden_states = self.dropout(inputs_embeds) + + for i, (layer_module, past_key_value) in enumerate( + zip(self.block, past_key_values) + ): + layer_head_mask = head_mask[i] + cross_attn_layer_head_mask = cross_attn_head_mask[i] + # Model parallel + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_outputs = layer_module( + hidden_states, + attention_mask=extended_attention_mask, + position_bias=position_bias, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + encoder_decoder_position_bias=encoder_decoder_position_bias, + layer_head_mask=layer_head_mask, + cross_attn_layer_head_mask=cross_attn_layer_head_mask, + past_key_value=past_key_value, + use_cache=use_cache, + output_attentions=output_attentions, + ) + + # layer_outputs is a tuple with: + # hidden-states, key-value-states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) + if use_cache is False: + layer_outputs = layer_outputs[:1] + (None,) + layer_outputs[1:] + + hidden_states, present_key_value_state = layer_outputs[:2] + + # We share the position biases between the layers - the first layer store them + # layer_outputs = hidden-states, key-value-states (self-attention position bias), (self-attention weights), + # (cross-attention position bias), (cross-attention weights) + position_bias = layer_outputs[2] + if self.is_decoder and encoder_hidden_states is not None: + encoder_decoder_position_bias = layer_outputs[ + 4 if output_attentions else 3 + ] + # append next layer key value states + if use_cache: + present_key_value_states = present_key_value_states + ( + present_key_value_state, + ) + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[3],) + if self.is_decoder: + all_cross_attentions = all_cross_attentions + (layer_outputs[5],) + + hidden_states = self.final_layer_norm(hidden_states) + hidden_states = self.dropout(hidden_states) + + # Add last layer + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + present_key_value_states, + all_hidden_states, + all_attentions, + all_cross_attentions, + ] 
+ if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=present_key_value_states, + hidden_states=all_hidden_states, + attentions=all_attentions, + cross_attentions=all_cross_attentions, + ) + + +class T5ForConditionalGeneration(T5PreTrainedModel): + def __init__(self, config: T5Config, weights): + super().__init__(config) + self.model_dim = config.d_model + + self.shared = TensorParallelEmbedding(prefix="shared", weights=weights) + + encoder_config = copy.deepcopy(config) + encoder_config.is_decoder = False + encoder_config.use_cache = False + encoder_config.is_encoder_decoder = False + self.encoder = T5Stack( + config=encoder_config, + prefix="encoder", + weights=weights, + embed_tokens=self.shared, + ) + + decoder_config = copy.deepcopy(config) + decoder_config.is_decoder = True + decoder_config.is_encoder_decoder = False + decoder_config.num_layers = config.num_decoder_layers + self.decoder = T5Stack( + config=decoder_config, + prefix="decoder", + weights=weights, + embed_tokens=self.shared, + ) + + try: + self.lm_head = SpeculativeHead.load( + config, prefix="lm_head", weights=weights + ) + except RuntimeError: + # Some models like t5-small were saved with shared weights unlike flan + # Since they are declared as the same arch we have no choice but hope + # that this is OK instead of using a proper flag. + self.lm_head = SpeculativeHead.load( + config, prefix="shared", weights=weights + ) + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.BoolTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + decoder_head_mask: Optional[torch.FloatTensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]: + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + # FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask + if head_mask is not None and decoder_head_mask is None: + if self.config.num_layers == self.config.num_decoder_layers: + warnings.warn(__HEAD_MASK_WARNING_MSG, FutureWarning) + decoder_head_mask = head_mask + + # Encode if needed (training, first prediction pass) + if encoder_outputs is None: + # Convert encoder inputs in embeddings if needed + encoder_outputs = self.encoder( + input_ids=input_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): + encoder_outputs = BaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, + attentions=encoder_outputs[2] if 
len(encoder_outputs) > 2 else None, + ) + + hidden_states = encoder_outputs[0] + + if ( + labels is not None + and decoder_input_ids is None + and decoder_inputs_embeds is None + ): + # get decoder inputs from shifting lm labels to the right + decoder_input_ids = self._shift_right(labels) + + # Decode + decoder_outputs = self.decoder( + input_ids=decoder_input_ids, + attention_mask=decoder_attention_mask, + inputs_embeds=decoder_inputs_embeds, + past_key_values=past_key_values, + encoder_hidden_states=hidden_states, + encoder_attention_mask=attention_mask, + head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = decoder_outputs[0] + + if self.config.tie_word_embeddings: + # Rescale output before projecting on vocab + # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586 + sequence_output = sequence_output * (self.model_dim**-0.5) + + logits, speculative_logits = self.lm_head(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss(ignore_index=-100) + # move labels to correct device to enable PP + labels = labels.to(logits.device) + loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1)) + # TODO(thom): Add z_loss https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666 + + if not return_dict: + output = (logits,) + decoder_outputs[1:] + encoder_outputs + return ((loss,) + output) if loss is not None else output + + return ( + Seq2SeqLMOutput( + loss=loss, + logits=logits, + past_key_values=decoder_outputs.past_key_values, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + ), + speculative_logits, + ) + + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + attention_mask=None, + head_mask=None, + decoder_head_mask=None, + decoder_attention_mask=None, + cross_attn_head_mask=None, + use_cache=None, + encoder_outputs=None, + **kwargs, + ): + # cut decoder_input_ids if past is used + if past_key_values is not None: + input_ids = input_ids[:, -1:] + + return { + "decoder_input_ids": input_ids, + "past_key_values": past_key_values, + "encoder_outputs": encoder_outputs, + "attention_mask": attention_mask, + "head_mask": head_mask, + "decoder_head_mask": decoder_head_mask, + "decoder_attention_mask": decoder_attention_mask, + "cross_attn_head_mask": cross_attn_head_mask, + "use_cache": use_cache, + } + + def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): + return self._shift_right(labels) + + def _reorder_cache(self, past_key_values, beam_idx): + # if decoder past is not included in output + # speedy decoding is disabled and no need to reorder + if past_key_values is None: + logger.warning( + "You might want to consider setting `use_cache=True` to speed up decoding" + ) + return past_key_values + + reordered_decoder_past = () + for layer_past_states in past_key_values: + # get the correct batch idx from layer past batch dim + # batch dim of `past` is at 2nd position + reordered_layer_past_states = () + for layer_past_state in 
layer_past_states: + # need to set correct `past` for each of the four key / value states + reordered_layer_past_states = reordered_layer_past_states + ( + layer_past_state.index_select( + 0, beam_idx.to(layer_past_state.device) + ), + ) + + assert reordered_layer_past_states[0].shape == layer_past_states[0].shape + assert len(reordered_layer_past_states) == len(layer_past_states) + + reordered_decoder_past = reordered_decoder_past + ( + reordered_layer_past_states, + ) + return reordered_decoder_past diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/vlm.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/vlm.py new file mode 100644 index 000000000..e5c44045a --- /dev/null +++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/vlm.py @@ -0,0 +1,48 @@ +def load_text_model(prefix, config, weights, name=None): + if config.model_type == "llama": + from text_generation_server.models.custom_modeling.flash_llama_modeling import ( + FlashLlamaForCausalLM, + ) + + return FlashLlamaForCausalLM(prefix, config, weights) + elif config.model_type == "mistral": + from text_generation_server.models.custom_modeling.flash_mistral_modeling import ( + FlashMistralForCausalLM, + ) + + return FlashMistralForCausalLM(prefix, config, weights, name=name) + elif config.model_type == "gemma": + from text_generation_server.models.custom_modeling.flash_gemma_modeling import ( + FlashGemmaForCausalLM, + ) + + return FlashGemmaForCausalLM(prefix, config, weights, causal=False) + elif config.model_type == "paligemma": + from text_generation_server.models.custom_modeling.flash_gemma_modeling import ( + FlashGemmaForCausalLM, + ) + + return FlashGemmaForCausalLM(prefix, config, weights) + else: + raise RuntimeError(f"Unsupported model type {config.model_type}") + + +def load_vision_model(prefix, config, weights): + if config.model_type == "clip_vision_model": + from text_generation_server.models.custom_modeling.clip import ( + CLIPVisionTransformer, + ) + + return CLIPVisionTransformer( + prefix=f"{prefix}.vision_model", config=config, weights=weights + ) + if config.model_type == "siglip_vision_model": + from text_generation_server.models.custom_modeling.siglip import ( + SiglipVisionTransformer, + ) + + return SiglipVisionTransformer( + prefix="vision_tower.vision_model", config=config, weights=weights + ) + else: + raise RuntimeError(f"Unsupported model type {config.model_type}") diff --git a/backends/gaudi/server/text_generation_server/models/flash_causal_lm.py b/backends/gaudi/server/text_generation_server/models/flash_causal_lm.py new file mode 100644 index 000000000..bc9d44a0b --- /dev/null +++ b/backends/gaudi/server/text_generation_server/models/flash_causal_lm.py @@ -0,0 +1,2003 @@ +from contextlib import nullcontext +import math +import os +import time +import torch +import torch.distributed + +import numpy as np + +from loguru import logger +from dataclasses import dataclass +from opentelemetry import trace +from transformers import ( + PreTrainedTokenizerBase, + AutoConfig, + AutoTokenizer, + GenerationConfig, +) +from typing import Any, ContextManager, Iterable, Optional, Tuple, List, Type, Dict + +from text_generation_server.adapters import AdapterBatchData, AdapterBatchMetadata +from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE +from text_generation_server.utils.chunks import concat_text_chunks +from text_generation_server.utils.import_utils import SYSTEM +from text_generation_server.models import Model +from 
text_generation_server.utils.log import log_master +from text_generation_server.utils.tokens import batch_top_tokens +from text_generation_server.utils.speculate import get_speculate +from text_generation_server.utils import ( + initialize_torch_distributed, + weight_files, + Weights, +) +from text_generation_server.models.types import ( + Batch, + Tokens, + Generation, + GeneratedText, +) +from text_generation_server.pb import generate_pb2 +from text_generation_server.models.globals import ( + MEM_POOL, + ATTENTION, + BLOCK_SIZE, + CUDA_GRAPHS, + TGI_WIGGLE_ROOM, + get_adapter_to_index, +) +from text_generation_server.layers.attention import Seqlen +from text_generation_server.utils import StoppingCriteria, HeterogeneousNextTokenChooser +from text_generation_server.utils.dist import MEMORY_FRACTION +from text_generation_server.utils.quantization import get_loader +from text_generation_server.utils.segments import SegmentConcatBuilder, find_segments + +from text_generation_server.utils.import_utils import ( + empty_cache, + synchronize, + get_free_memory, +) + +tracer = trace.get_tracer(__name__) + + +# Will be set in init +SLIDING_WINDOW: Optional[int] = None + + +def set_sliding_window(sliding_window: int): + global SLIDING_WINDOW + SLIDING_WINDOW = sliding_window + + +def get_sliding_windows() -> int: + global SLIDING_WINDOW + return SLIDING_WINDOW + + +def init_cpu_threads_env(rank_id: int, world_size: int): + import importlib.util + + if importlib.util.find_spec("numa") is not None: + import numa + import psutil + + nodes = numa.info.get_max_node() + 1 + rank_per_node = math.ceil(world_size / nodes) + num_cpus_per_nodes = int(psutil.cpu_count(logical=False) / nodes) + node_id = int(rank_id / rank_per_node) + rank_offset_per_node = rank_id % rank_per_node + if os.getenv("OMP_NUM_THREADS") is None: + num_cpus_per_rank = max(int(num_cpus_per_nodes / rank_per_node), 1) + else: + num_cpus_per_rank = int(os.getenv("OMP_NUM_THREADS")) + if len(numa.memory.get_membind_nodes()) == nodes: + numa.memory.set_membind_nodes((node_id)) + torch.set_num_threads(num_cpus_per_rank) + if len(numa.schedule.get_affinitive_cpus(0)) == psutil.cpu_count(logical=True): + cpu_start = num_cpus_per_rank * rank_offset_per_node + numa.schedule.run_on_cpus( + 0, + *( + numa.info.node_to_cpus(node_id)[ + cpu_start : cpu_start + num_cpus_per_rank + ] + ), + ) + logger.info( + f"affinity={numa.schedule.get_affinitive_cpus(0)}, membind = {numa.memory.get_membind_nodes()}" + ) + + +@dataclass +class FlashCausalLMBatch(Batch): + batch_id: int + requests: List[generate_pb2.Request] + # request id -> idx in list mapping + requests_idx_mapping: Dict[int, int] + + # Decoder values + input_ids: torch.Tensor + position_ids: torch.Tensor + speculative_ids: Optional[torch.Tensor] + + # Flash Attention values + + # tensor of length b containing the cumulative sequence lengths of the sequences in the batch, only used in prefill + cu_seqlen_prefill: Optional[torch.Tensor] + # Prefill cache indices is used to slice into the kv tensor before caching it into the paged attention buffers + # as we only keep SLIDING_WINDOW values instead of the whole tensor + prefill_cache_indices: Optional[torch.Tensor] + + # Paged Attention values + + # Set when creating the batch + # CPU tensor of length b indicating the start of each sequence in slots + start_slots: torch.Tensor + # tensor of indices of the currently used slots, length = \sum_{i=0}^{b} s_i in prefill, length = b in decode + slot_indices: torch.Tensor + + # list of length b of list of 
length s_i // block_size + block_tables: List[List[int]] + # tensor of size [b, max_total_seqlen // block_size] holding the paged attention block tables for all sequences + block_tables_tensor: torch.Tensor + # tensor of length \sum_{i=0}^{b} max_s_i holding the paged attention slots for all sequences + slots: torch.Tensor + # size [b], containing the number of blocks that can be retrieved from the cache + prefix_lens: List[int] + prefix_lens_tensor: torch.Tensor + + max_seqlen: int + + # Prefill metadata tensors to efficiently compute logprobs + prefill_head_indices: Optional[torch.Tensor] + prefill_next_token_indices: Optional[torch.tensor] + prefill_cu_outlens: Optional[List[int]] + + # Prefixes + prefix_ids: List[List[int]] + + # All tokens + all_input_ids: List[List[int]] + all_input_ids_tensor: torch.Tensor + + # Lengths of all generations present in the batch + input_lengths: List[int] + input_lengths_tensor: torch.Tensor + prefix_offsets: List[Optional[int]] + read_offsets: List[Optional[int]] + + # Generation helpers + next_token_chooser: HeterogeneousNextTokenChooser + stopping_criterias: List[StoppingCriteria] + top_n_tokens: List[int] + top_n_tokens_tensor: torch.Tensor + + # Adapter metadata for each request + adapter_meta: AdapterBatchMetadata + + # Number of blocks in this batch + num_blocks: int + # Maximum number of blocks + max_blocks: int + + def to_pb(self) -> generate_pb2.CachedBatch: + return generate_pb2.CachedBatch( + id=self.batch_id, + request_ids=[r.id for r in self.requests], + size=len(self), + max_tokens=self.num_blocks * BLOCK_SIZE, + ) + + @classmethod + def batch_tokenized_inputs( + cls, requests: Iterable[generate_pb2.Request], tokenizer + ): + max_length = 0 + all_input_ids = [] + batch_size = 0 + for r in requests: + batch_size += 1 + inputs = concat_text_chunks(r.input_chunks.chunks) + input_ids = tokenizer( + inputs, + truncation=True, + max_length=r.truncate, + add_special_tokens=r.add_special_tokens, + )["input_ids"] + max_length = max(max_length, len(input_ids)) + all_input_ids.append(input_ids) + return all_input_ids + + @classmethod + def from_tokenized( + cls, + pb: generate_pb2.Batch, + tokenizer: PreTrainedTokenizerBase, + batch_tokenized_inputs, + dtype: torch.dtype, + device: torch.device, + ) -> "FlashCausalLMBatch": + sliding_window = get_sliding_windows() + position_ids = [] + cu_seqlen_prefill = [0] + start_slots = [] + slot_indices = [] + prefill_cache_indices = [] + + input_lengths = [] + prefix_offsets = [] + read_offsets = [] + all_input_ids = [] + prefix_ids = [] + requests_idx_mapping = {} + + all_prefill_logprobs = True + no_prefill_logprobs = True + prefill_head_indices = [] + prefill_next_token_indices = [] + prefill_cu_outlens = [0] + + next_token_chooser_parameters = [] + stopping_criterias = [] + top_n_tokens = [] + + adapter_indices_list = [] + adapter_set = set() + + # Cumulative length + cumulative_length = 0 + cumulative_slot_tokens = 0 + prefill_out_cumulative_length = 0 + + num_blocks = 0 + max_seqlen = 0 + max_length = 0 + max_blocks = 0 + + block_tables = [] + slots = [] + prefix_lens = [] + + # Parse batch + for i, (r, tokenized_input) in enumerate( + zip(pb.requests, batch_tokenized_inputs) + ): + # request id -> idx in list mapping + requests_idx_mapping[r.id] = i + + orig_input_length = len(tokenized_input) + + prefix_len = r.prefix_len + assert ( + prefix_len <= orig_input_length + ), f"Prefix {prefix_len} vs input {orig_input_length}" + if prefix_len == orig_input_length: + assert prefix_len > 0 + prefix_len -= 1 
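+                # Keep at least one token to prefill so the model still produces logits
+                # for the next token even when the whole input is already cached.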
+ + # Commented as it's costly. + # log_master(logger.debug, "Tokenized input ids {tokenized_input}") + prefix_ids.append(tokenized_input[:prefix_len]) + tokenized_input = tokenized_input[prefix_len:] + + input_length = len(tokenized_input) + input_lengths.append(input_length) + + prefix_offsets.append(input_length - 5) + read_offsets.append(input_length) + + all_input_ids.append(tokenized_input) + + # Position ids + request_position_ids = torch.arange( + prefix_len, orig_input_length, dtype=torch.int32 + ) + position_ids.append(request_position_ids) + + # Add cumulative lengths of all previous inputs + cu_seqlen_prefill.append(cumulative_length + input_length) + + next_token_chooser_parameters.append(r.parameters) + + stopping_criteria = StoppingCriteria.from_pb( + r.stopping_parameters, tokenizer + ) + max_new_tokens = stopping_criteria.max_new_tokens + stopping_criterias.append(stopping_criteria) + top_n_tokens.append(r.top_n_tokens) + + ADAPTER_TO_INDEX = get_adapter_to_index() + adapter_index = ADAPTER_TO_INDEX.get(r.adapter_id, 0) + adapter_indices_list.append(torch.full((input_length,), adapter_index)) + adapter_set.add(adapter_index) + + # Paged attention + # Remove one as the first token des not have a past + speculative_length = get_speculate() + speculative_length = 0 if speculative_length is None else speculative_length + + # Tokens that need to be mapped to blocks. + block_tokens = orig_input_length + max_new_tokens - 1 + speculative_length + + # Tokens that need to be mapped to slots. We don't need slots for the + # cached prefix (if present). + slot_tokens = input_length + max_new_tokens - 1 + speculative_length + + # blocks and slots can be empty (for example in warmup) + if not r.blocks: + needed_blocks = math.ceil(block_tokens / BLOCK_SIZE) + request_blocks = [ + b for b in range(num_blocks, num_blocks + needed_blocks) + ] + request_slots = [ + s + for b in request_blocks + for s in range(b * BLOCK_SIZE, (b + 1) * BLOCK_SIZE) + ] + else: + request_blocks = r.blocks + request_slots = r.slots[ + prefix_len: #: orig_input_length + max_new_tokens + speculative_length + ] + + block_tables.append(request_blocks) + + slots.extend(request_slots) + prefix_lens.append(prefix_len) + num_blocks += len(request_blocks) + start_slots.append(cumulative_slot_tokens) + + request_slot_indices = torch.arange( + cumulative_slot_tokens, + cumulative_slot_tokens + input_length, + dtype=torch.int64, + ) + slot_indices.append(request_slot_indices) + + # Create tensor to slice into the kv tensor in prefill + if sliding_window is not None: + request_prefill_cache_indices = torch.arange( + cumulative_length + max(0, input_length - sliding_window), + cumulative_length + input_length, + dtype=torch.int64, + ) + prefill_cache_indices.append(request_prefill_cache_indices) + + all_prefill_logprobs = all_prefill_logprobs and r.prefill_logprobs + no_prefill_logprobs = no_prefill_logprobs and not r.prefill_logprobs + + if r.prefill_logprobs: + prefill_head_indices.append(request_position_ids + cumulative_length) + prefill_next_token_indices.append( + prefill_out_cumulative_length + input_length - 1 + ) + prefill_cu_outlens.append(prefill_out_cumulative_length + input_length) + prefill_out_cumulative_length += input_length + else: + prefill_head_indices.append( + torch.tensor( + [cumulative_length + input_length - 1], dtype=torch.int32 + ) + ) + prefill_next_token_indices.append(prefill_out_cumulative_length) + prefill_cu_outlens.append(prefill_out_cumulative_length + 1) + prefill_out_cumulative_length += 1 
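+            # At this point `prefill_head_indices` holds, for this request, either every prefill
+            # position (when prefill logprobs are requested) or only the last input position.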
+ + # Update + cumulative_length += input_length + cumulative_slot_tokens += slot_tokens + max_seqlen = max(max_seqlen, input_length) + max_blocks = max(max_blocks, len(request_blocks)) + max_length = max( + max_length, input_length + max_new_tokens + speculative_length + ) + + adapter_indices = torch.cat(adapter_indices_list).to( + dtype=torch.int64, device=device + ) + + next_token_chooser = HeterogeneousNextTokenChooser.from_pb( + next_token_chooser_parameters, dtype, device, tokenizer + ) + start_slots = torch.tensor(start_slots, dtype=torch.int64) + + # Padded all_input_ids_tensor + all_input_ids_tensor = np.zeros( + (len(all_input_ids), max_length), dtype=np.int64 + ) + for i, input_ids in enumerate(all_input_ids): + all_input_ids_tensor[i, : len(input_ids)] = input_ids + + # Create tensors on device + all_input_ids_tensor = torch.tensor( + all_input_ids_tensor, dtype=torch.int64, device=device + ) + + if len(pb.requests) > 1: + input_ids = np.concatenate(all_input_ids, dtype=np.int64) + position_ids = torch.cat(position_ids) + slot_indices = torch.cat(slot_indices) + if sliding_window is not None: + prefill_cache_indices = torch.cat(prefill_cache_indices) + else: + input_ids = all_input_ids[0] + position_ids = position_ids[0] + slot_indices = slot_indices[0] + if sliding_window is not None: + prefill_cache_indices = prefill_cache_indices[0] + + cu_seqlen_prefill = torch.tensor( + cu_seqlen_prefill, device=device, dtype=torch.int32 + ) + position_ids = position_ids.to(device) + slot_indices = slot_indices.to(device) + prefill_cache_indices = ( + prefill_cache_indices.to(device) if sliding_window is not None else None + ) + input_ids = torch.tensor(input_ids, dtype=torch.int64, device=device) + input_lengths_tensor = torch.tensor( + input_lengths, dtype=torch.int32, device=device + ) + + adapter_segments, adapter_segment_indices = find_segments(adapter_indices) + adapter_segments = torch.tensor( + adapter_segments, dtype=torch.int32, device=device + ) + + if all_prefill_logprobs: + prefill_head_indices = None + prefill_next_token_indices = cu_seqlen_prefill[1:] - 1 + elif no_prefill_logprobs: + prefill_head_indices = cu_seqlen_prefill[1:] - 1 + prefill_next_token_indices = None + else: + prefill_head_indices = torch.tensor( + torch.cat(prefill_head_indices), dtype=torch.int64, device=device + ) + prefill_next_token_indices = torch.tensor( + prefill_next_token_indices, dtype=torch.int64, device=device + ) + top_n_tokens_tensor = torch.tensor( + top_n_tokens, device=device, dtype=torch.int64 + ) + + slots = torch.tensor(slots, dtype=torch.int64, device=device) + + block_tables_tensor = torch.zeros( + (len(block_tables), max_blocks), dtype=torch.int32, device="cpu" + ) + for i, request_blocks in enumerate(block_tables): + block_tables_tensor[i, : len(request_blocks)] = torch.tensor(request_blocks) + block_tables_tensor = block_tables_tensor.to(device) + prefix_lens_tensor = torch.tensor(prefix_lens, dtype=torch.int32, device=device) + + return cls( + batch_id=pb.id, + requests=pb.requests, + requests_idx_mapping=requests_idx_mapping, + input_ids=input_ids, + position_ids=position_ids, + cu_seqlen_prefill=cu_seqlen_prefill, + prefill_cache_indices=prefill_cache_indices, + start_slots=start_slots, + slot_indices=slot_indices, + block_tables=block_tables, + block_tables_tensor=block_tables_tensor, + slots=slots, + prefix_lens=prefix_lens, + prefix_lens_tensor=prefix_lens_tensor, + max_seqlen=max_seqlen, + prefill_head_indices=prefill_head_indices, + 
prefill_next_token_indices=prefill_next_token_indices, + prefill_cu_outlens=prefill_cu_outlens, + input_lengths=input_lengths, + input_lengths_tensor=input_lengths_tensor, + prefix_offsets=prefix_offsets, + read_offsets=read_offsets, + all_input_ids=all_input_ids, + all_input_ids_tensor=all_input_ids_tensor, + prefix_ids=prefix_ids, + next_token_chooser=next_token_chooser, + stopping_criterias=stopping_criterias, + top_n_tokens=top_n_tokens, + top_n_tokens_tensor=top_n_tokens_tensor, + num_blocks=num_blocks, + max_blocks=max_blocks, + adapter_meta=AdapterBatchMetadata( + adapter_indices=adapter_indices, + adapter_set=adapter_set, + adapter_segments=adapter_segments, + segment_indices=adapter_segment_indices, + ), + speculative_ids=None, + ) + + @classmethod + def from_pb( + cls, + pb: generate_pb2.Batch, + tokenizer: PreTrainedTokenizerBase, + dtype: torch.dtype, + device: torch.device, + ) -> "FlashCausalLMBatch": + assert len(pb.requests) > 0 + batch_tokenized_inputs = cls.batch_tokenized_inputs(pb.requests, tokenizer) + return cls.from_tokenized(pb, tokenizer, batch_tokenized_inputs, dtype, device) + + @tracer.start_as_current_span("filter") + def filter(self, request_ids: List[int]) -> "FlashCausalLMBatch": + if len(request_ids) == 0: + raise ValueError("Batch must have at least one request") + # We assume that if len(requests) == len(self) then the requests are the same + if len(request_ids) == len(self): + return self + + device = self.input_ids.device + + # New values after filtering + requests_idx_mapping = {} + + # Used to index into tensors + indices = [] + + # slots to keep after filtering + slot_filtering_indices = torch.zeros( + self.slots.shape[0], dtype=torch.bool, device=device + ) + + # Create on CPU to only move to GPU once instead of at every copy + slot_indices = torch.empty(len(request_ids), dtype=torch.int64) + max_seqlen = 0 + + requests = [] + start_slots = [] + block_tables = [] + all_input_ids = [] + prefix_ids = [] + + input_lengths = [] + prefix_lens = [] + prefix_offsets = [] + read_offsets = [] + + stopping_criterias = [] + top_n_tokens = [] + adapter_set = set() + + num_blocks = 0 + max_blocks = 0 + # Cumulative length + cumulative_max_length = 0 + + for i, request_id in enumerate(request_ids): + idx = self.requests_idx_mapping[request_id] + indices.append(idx) + requests_idx_mapping[request_id] = i + + requests.append(self.requests[idx]) + + # Get length + request_input_length = self.input_lengths[idx] + prefix_len = self.prefix_lens[idx] + max_seqlen = max(max_seqlen, request_input_length) + + all_input_ids.append(self.all_input_ids[idx]) + prefix_ids.append(self.prefix_ids[idx]) + + input_lengths.append(request_input_length) + prefix_lens.append(prefix_len) + prefix_offsets.append(self.prefix_offsets[idx]) + read_offsets.append(self.read_offsets[idx]) + + stopping_criteria = self.stopping_criterias[idx] + stopping_criterias.append(stopping_criteria) + + top_n_tokens.append(self.top_n_tokens[idx]) + + ADAPTER_TO_INDEX = get_adapter_to_index() + adapter_index = ADAPTER_TO_INDEX.get(self.requests[idx].adapter_id, 0) + adapter_set.add(adapter_index) + + remaining_tokens = ( + stopping_criteria.max_new_tokens - stopping_criteria.current_tokens + ) + + request_block_table = self.block_tables[idx] + num_blocks += len(request_block_table) + block_tables.append(request_block_table) + start_slots.append(cumulative_max_length) + + # Copy to tensor (CPU) + slot_indices[i] = cumulative_max_length + request_input_length - 1 + + # Set slice + slot_filtering_indices[ + 
self.start_slots[idx] : self.start_slots[idx] + + request_input_length + + remaining_tokens + - 1 + ] = True + + cumulative_max_length += request_input_length + remaining_tokens - 1 + + max_blocks = max(max_blocks, len(request_block_table)) + + # Index into tensors + input_ids = self.input_ids[indices] + position_ids = self.position_ids[indices] + adapter_indices = self.adapter_meta.adapter_indices[indices] + all_input_ids_tensor = self.all_input_ids_tensor[indices] + block_tables_tensor = self.block_tables_tensor[indices] + input_lengths_tensor = self.input_lengths_tensor[indices] + slots = self.slots[slot_filtering_indices] + prefix_lens_tensor = self.prefix_lens_tensor[indices] + next_token_chooser = self.next_token_chooser.filter(indices) + top_n_tokens_tensor = self.top_n_tokens_tensor[indices] + speculative_ids = ( + self.speculative_ids[indices] if self.speculative_ids is not None else None + ) + + start_slots = torch.tensor(start_slots, dtype=torch.int64) + + # Move to GPU now that we have the whole tensor + slot_indices = slot_indices.to(device) + + adapter_segments, adapter_segment_indices = find_segments(adapter_indices) + adapter_segments = torch.tensor( + adapter_segments, dtype=torch.int32, device=device + ) + # assert sum(len(b) for b in block_tables) == (block_tables_tensor != 0).sum() + + return type(self)( + batch_id=self.batch_id, + requests=requests, + requests_idx_mapping=requests_idx_mapping, + input_ids=input_ids, + position_ids=position_ids, + cu_seqlen_prefill=None, + prefill_cache_indices=None, + start_slots=start_slots, + slot_indices=slot_indices, + block_tables=block_tables, + block_tables_tensor=block_tables_tensor, + slots=slots, + max_seqlen=max_seqlen, + prefill_head_indices=None, + prefill_next_token_indices=None, + prefill_cu_outlens=None, + input_lengths=input_lengths, + input_lengths_tensor=input_lengths_tensor, + prefix_lens=prefix_lens, + prefix_lens_tensor=prefix_lens_tensor, + prefix_offsets=prefix_offsets, + read_offsets=read_offsets, + all_input_ids=all_input_ids, + all_input_ids_tensor=all_input_ids_tensor, + prefix_ids=prefix_ids, + next_token_chooser=next_token_chooser, + stopping_criterias=stopping_criterias, + top_n_tokens=top_n_tokens, + top_n_tokens_tensor=top_n_tokens_tensor, + num_blocks=num_blocks, + max_blocks=max_blocks, + speculative_ids=speculative_ids, + adapter_meta=AdapterBatchMetadata( + adapter_indices=adapter_indices, + adapter_set=adapter_set, + adapter_segments=adapter_segments, + segment_indices=adapter_segment_indices, + ), + ) + + @classmethod + @tracer.start_as_current_span("concatenate") + def concatenate(cls, batches: List["FlashCausalLMBatch"]) -> "FlashCausalLMBatch": + # Batch attributes + requests = [] + requests_idx_mapping = {} + + num_blocks = 0 + total_batch_size = 0 + total_slots = 0 + max_blocks = 0 + max_length = 0 + max_seqlen = 0 + for b in batches: + total_batch_size += len(b) + total_slots += len(b.slots) + num_blocks += b.num_blocks + speculative_length = ( + b.speculative_ids.shape[1] if b.speculative_ids is not None else 0 + ) + max_blocks = max(max_blocks, b.max_blocks) + max_seqlen = max(max_seqlen, b.max_seqlen) + max_length = max( + max_length, + max( + input_length + + stopping_criteria.max_new_tokens + + speculative_length + - stopping_criteria.current_tokens + for input_length, stopping_criteria in zip( + b.input_lengths, b.stopping_criterias + ) + ), + ) + + input_ids = batches[0].input_ids.new_empty(total_batch_size) + position_ids = batches[0].position_ids.new_empty(total_batch_size) + slots 
= batches[0].slots.new_empty(total_slots) + slot_indices = batches[0].slot_indices.new_empty(total_batch_size) + input_lengths_tensor = batches[0].input_lengths_tensor.new_empty( + total_batch_size + ) + block_tables_tensor = batches[0].block_tables_tensor.new_zeros( + (total_batch_size, max_blocks) + ) + prefix_lens_tensor = batches[0].prefix_lens_tensor.new_empty(total_batch_size) + all_input_ids_tensor = batches[0].all_input_ids_tensor.new_zeros( + (total_batch_size, max_length) + ) + top_n_tokens_tensor = batches[0].top_n_tokens_tensor.new_zeros( + total_batch_size, + ) + total_indices_size = sum( + b.adapter_meta.adapter_indices.shape[0] for b in batches + ) + adapter_indices = batches[0].adapter_meta.adapter_indices.new_empty( + total_indices_size + ) + adapter_set = set() + adapter_segment_builder = SegmentConcatBuilder() + + start_slots = [] + block_tables = [] + prefix_lens = [] + all_input_ids = [] + prefix_ids = [] + + input_lengths = [] + prefix_offsets = [] + read_offsets = [] + + next_token_chooser_parameters = [] + fsm_grammar_states = [] + stopping_criterias = [] + top_n_tokens = [] + + # Cumulative length + cumulative_batch_size = 0 + cumulative_slots = 0 + cumulative_adapter_indices_size = 0 + + for i, batch in enumerate(batches): + requests.extend(batch.requests) + + if i == 0: + requests_idx_mapping = batch.requests_idx_mapping + else: + # We need to offset the mapping for each batch by the cumulative batch size + for k, v in batch.requests_idx_mapping.items(): + requests_idx_mapping[k] = v + cumulative_batch_size + + start_index = cumulative_batch_size + end_index = cumulative_batch_size + len(batch) + slots_start_index = cumulative_slots + slots_end_index = cumulative_slots + len(batch.slots) + + # Copy tensors (GPU) + input_ids[start_index:end_index] = batch.input_ids + position_ids[start_index:end_index] = batch.position_ids + slot_indices[start_index:end_index] = batch.slot_indices + cumulative_slots + input_lengths_tensor[start_index:end_index] = batch.input_lengths_tensor + top_n_tokens_tensor[start_index:end_index] = batch.top_n_tokens_tensor + slots[slots_start_index:slots_end_index] = batch.slots + + # Copy over adapter indices + adapter_start_index = cumulative_adapter_indices_size + adapter_end_index = ( + cumulative_adapter_indices_size + + batch.adapter_meta.adapter_indices.shape[0] + ) + adapter_indices[adapter_start_index:adapter_end_index] = ( + batch.adapter_meta.adapter_indices + ) + cumulative_adapter_indices_size = adapter_end_index + adapter_set.update(batch.adapter_meta.adapter_set) + adapter_segment_builder.concat( + batch.adapter_meta.adapter_segments, batch.adapter_meta.segment_indices + ) + + all_input_ids_tensor[ + start_index:end_index, : batch.all_input_ids_tensor.shape[1] + ] = batch.all_input_ids_tensor[:, :max_length] + + block_tables_tensor[ + start_index:end_index, : batch.block_tables_tensor.shape[1] + ] = batch.block_tables_tensor[:, :max_blocks] + + prefix_lens_tensor[start_index:end_index] = batch.prefix_lens_tensor + + start_slots.append(batch.start_slots + cumulative_slots) + + block_tables.extend(batch.block_tables) + prefix_lens.extend(batch.prefix_lens) + all_input_ids.extend(batch.all_input_ids) + prefix_ids.extend(batch.prefix_ids) + + input_lengths.extend(batch.input_lengths) + prefix_offsets.extend(batch.prefix_offsets) + read_offsets.extend(batch.read_offsets) + + next_token_chooser_parameters.extend([r.parameters for r in batch.requests]) + fsm_grammar_states.extend(batch.next_token_chooser.fsm_grammar_states) + 
stopping_criterias.extend(batch.stopping_criterias) + + top_n_tokens.extend(batch.top_n_tokens) + + # Update + cumulative_batch_size += len(batch) + cumulative_slots += len(batch.slots) + + start_slots = torch.concat(start_slots) + + # assert sum(len(b) for b in block_tables) == (block_tables_tensor != 0).sum() + + next_token_chooser = HeterogeneousNextTokenChooser.from_pb( + next_token_chooser_parameters, + dtype=batches[0].next_token_chooser.dtype, + device=batches[0].next_token_chooser.device, + tokenizer=batches[0].next_token_chooser.tokenizer, + fsm_grammar_states=fsm_grammar_states, + ) + + speculative_ids = ( + torch.cat([b.speculative_ids for b in batches], dim=0) + if batches[0].speculative_ids is not None + else None + ) + + adapter_segments, adapter_segment_indices = adapter_segment_builder.build() + + return cls( + batch_id=batches[0].batch_id, + requests=requests, + requests_idx_mapping=requests_idx_mapping, + input_ids=input_ids, + position_ids=position_ids, + cu_seqlen_prefill=None, + prefill_cache_indices=None, + start_slots=start_slots, + slot_indices=slot_indices, + block_tables=block_tables, + block_tables_tensor=block_tables_tensor, + prefix_lens=prefix_lens, + prefix_lens_tensor=prefix_lens_tensor, + slots=slots, + max_seqlen=max_seqlen, + prefill_head_indices=None, + prefill_next_token_indices=None, + prefill_cu_outlens=None, + input_lengths=input_lengths, + input_lengths_tensor=input_lengths_tensor, + prefix_offsets=prefix_offsets, + read_offsets=read_offsets, + all_input_ids=all_input_ids, + all_input_ids_tensor=all_input_ids_tensor, + prefix_ids=prefix_ids, + next_token_chooser=next_token_chooser, + stopping_criterias=stopping_criterias, + top_n_tokens=top_n_tokens, + top_n_tokens_tensor=top_n_tokens_tensor, + num_blocks=num_blocks, + max_blocks=max_blocks, + speculative_ids=speculative_ids, + adapter_meta=AdapterBatchMetadata( + adapter_indices=adapter_indices, + adapter_set=adapter_set, + adapter_segments=adapter_segments, + segment_indices=adapter_segment_indices, + ), + ) + + def __len__(self): + return len(self.requests) + + +ADAPTER_LAYERS = [ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "up_proj", + "down_proj", +] +ROW_PARALLEL = {"o_proj", "down_proj", "lm_head"} + + +class FlashCausalLM(Model): + def __init__( + self, + model_id: str, + model_class, + revision: Optional[str] = None, + quantize: Optional[str] = None, + speculator: Optional[str] = None, + dtype: Optional[torch.dtype] = None, + trust_remote_code: bool = False, + lora_adapter_ids: Optional[list] = [], + tokenizer_class: PreTrainedTokenizerBase = AutoTokenizer, + config_class: PreTrainedTokenizerBase = AutoConfig, + default_dtype=torch.float16, + aliases=None, + # Used for Santacoder override of config + num_kv_heads: Optional[int] = None, + # Deepseek V2 uses different QK and V dims. + head_size: Optional[int] = None, + skip_special_tokens: bool = True, + ): + self.quantize = quantize + self.process_group, rank, world_size = initialize_torch_distributed() + if torch.cuda.is_available(): + device = torch.device(f"cuda:{rank}") + dtype = default_dtype if dtype is None else dtype + elif SYSTEM == "ipex": + if hasattr(torch, "xpu") and torch.xpu.is_available(): + device = torch.device(f"xpu:{rank}") + dtype = default_dtype if dtype is None else dtype + else: + device = torch.device("cpu") + # Float16 doesn't exist on target. 
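+                # On the IPEX CPU fallback the default dtype becomes bfloat16
+                # because float16 kernels are not available on that target;
+                # the CUDA and XPU branches keep `default_dtype` (float16
+                # unless the caller passed an explicit dtype).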
+ dtype = torch.bfloat16 if dtype is None else dtype + init_cpu_threads_env(rank_id=rank, world_size=world_size) + else: + raise NotImplementedError(f"{model_class} is only available on GPU") + + tokenizer = tokenizer_class.from_pretrained( + model_id, + revision=revision, + padding_side="left", + truncation_side="left", + trust_remote_code=trust_remote_code, + ) + try: + generation_config = GenerationConfig.from_pretrained( + model_id, revision=revision, trust_remote_code=trust_remote_code + ) + if isinstance(generation_config.eos_token_id, (list, set)): + # TODO Huge hack + tokenizer._eos_token_ids = set(generation_config.eos_token_id) + except Exception: + pass + + config = config_class.from_pretrained( + model_id, revision=revision, trust_remote_code=trust_remote_code + ) + config.quantize = quantize + config.speculator = speculator + + torch.distributed.barrier(group=self.process_group) + + weights_loader = get_loader(quantize, model_id, revision) + filenames = weight_files(model_id, revision=revision, extension=".safetensors") + weights = Weights( + filenames, + device, + dtype, + process_group=self.process_group, + aliases=aliases, + weights_loader=weights_loader, + ) + + prefix = "" + model = model_class(prefix, config, weights) + torch.distributed.barrier(group=self.process_group) + + # VLM models define the config we care about in their text_config + text_config = getattr(config, "text_config", None) + if text_config is not None: + config = text_config + + if getattr(config, "sliding_window", None) is not None: + set_sliding_window(config.sliding_window) + else: + config.sliding_window = None + + self.num_layers = config.num_hidden_layers + self.num_heads = config.num_attention_heads // self.process_group.size() + # Validation is done in the model itself + if num_kv_heads is None: + num_kv_heads = getattr(config, "num_key_value_heads", None) + # GPT-2 workaround + if num_kv_heads is None: + num_kv_heads = getattr(config, "n_head", None) + if num_kv_heads is None: + raise ValueError("Cannot get the number of key/value heads") + self.num_kv_heads = ( + num_kv_heads // self.process_group.size() + if num_kv_heads > 1 + else num_kv_heads + ) + assert self.num_kv_heads > 0 + + if head_size is None: + # Some models use GQA and different sizes for o_proj + # and q_proj, that allows for that. 
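+            # Illustrative example (hypothetical config, not taken from this
+            # repo): hidden_size=4096 and num_attention_heads=32 with no
+            # explicit `head_dim` gives head_size = 4096 // 32 = 128.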
+ if hasattr(config, "head_dim"): + self.head_size = config.head_dim + else: + self.head_size = config.hidden_size // config.num_attention_heads + else: + self.head_size = head_size + + self.cuda_graphs = {} + self.kv_cache = [] + + if ATTENTION == "flashinfer": + from text_generation_server.layers.attention.flashinfer import ( + create_prefill_state, + create_decode_state, + create_prefill_with_paged_kv_state, + ) + + self.prefill_state = create_prefill_state(device=device) + self.prefill_with_paged_kv_state = create_prefill_with_paged_kv_state( + device=device + ) + + self.decode_state = create_decode_state( + device=device, + num_heads=self.num_heads, + num_kv_heads=self.num_kv_heads, + ) + + super().__init__( + model_id=model_id, + model=model, + tokenizer=tokenizer, + requires_padding=False, + dtype=dtype, + device=device, + rank=rank, + world_size=world_size, + sliding_window=config.sliding_window, + ) + + @property + def batch_type(self) -> Type[FlashCausalLMBatch]: + return FlashCausalLMBatch + + def max_past(self) -> int: + return getattr(self.model, "max_past", None) + + def init_kv_cache( + self, + num_blocks: int, + num_layers: int, + num_heads: int, + head_size: int, + dtype: torch.dtype, + device: torch.device, + ): + self.kv_cache = [] + empty_cache() + + element_size = torch.tensor([], dtype=dtype).element_size() + if SYSTEM == "ipex" and device.type == "xpu": + x = 1 + else: + x = BLOCK_SIZE // element_size + + if ATTENTION in {"flashdecoding", "flashinfer"}: + self.kv_cache = [ + ( + torch.empty( + (num_blocks, BLOCK_SIZE, num_heads, head_size), + dtype=dtype, + device=device, + ), + torch.empty( + (num_blocks, BLOCK_SIZE, num_heads, head_size), + dtype=dtype, + device=device, + ), + ) + for _ in range(num_layers) + ] + elif SYSTEM == "ipex" and device == torch.device("cpu"): + self.kv_cache = [ + ( + torch.empty( + (num_blocks, num_heads, BLOCK_SIZE, head_size), + dtype=dtype, + device=device, + ), + torch.empty( + (num_blocks, num_heads, BLOCK_SIZE, head_size), + dtype=dtype, + device=device, + ), + ) + for _ in range(num_layers) + ] + else: + self.kv_cache = [ + ( + torch.zeros( + (num_blocks, num_heads, head_size // x, BLOCK_SIZE, x), + dtype=dtype, + device=device, + ), + torch.zeros( + (num_blocks, num_heads, head_size, BLOCK_SIZE), + dtype=dtype, + device=device, + ), + ) + for _ in range(num_layers) + ] + + def cuda_graph_warmup(self, bs: int, max_s: int, max_bt: int): + input_ids = torch.zeros(bs, dtype=torch.int64, device=self.device) + position_ids = torch.zeros(bs, dtype=torch.int32, device=self.device) + slots = torch.arange(bs, dtype=torch.int64, device=self.device) + input_lengths = [max_s] * bs + prefix_lengths = [0] * bs + input_lengths_tensor = ( + torch.ones(bs, dtype=torch.int32, device=self.device) * max_s + ) + prefix_lengths_tensor = torch.zeros(bs, dtype=torch.int32, device=self.device) + block_tables = torch.arange( + max_bt, dtype=torch.int32, device=self.device + ).repeat(bs) + block_tables = block_tables.reshape((bs, max_bt)) + + if ATTENTION == "flashinfer": + block_tables = block_tables_to_ragged( + block_tables=block_tables, + input_lengths=input_lengths, + prefix_lens=prefix_lengths, + ) + from text_generation_server.layers.attention.flashinfer import ( + create_decode_state_cuda_graphs, + ) + + block_tables_ptr = torch.zeros( + bs + 1, dtype=torch.int32, device=self.device + ) + last_page_len = torch.ones(bs, dtype=torch.int32, device=self.device) + state = create_decode_state_cuda_graphs( + device=input_ids.device, + 
block_tables=block_tables, + block_tables_ptr=block_tables_ptr, + last_page_len=last_page_len, + num_heads=self.num_heads, + num_kv_heads=self.num_kv_heads, + ) + else: + state = None + + graph = torch.cuda.CUDAGraph() + self.cuda_graphs[bs] = { + "input_ids": input_ids, + "position_ids": position_ids, + "kv_cache": self.kv_cache, + "block_tables": block_tables, + "slots": slots, + "input_lengths": input_lengths_tensor, + "prefix_lengths": prefix_lengths_tensor, + "state": state, + "graph": graph, + } + + torch.cuda.synchronize() + # Run once outside to warmup + with self._forward_context( + block_tables=block_tables, + cu_seqlen_prefill=None, + input_lengths_tensor=input_lengths_tensor, + state=state, + prefix_lens_tensor=prefix_lengths_tensor, + ): + seqlen = Seqlen( + input_lengths=input_lengths_tensor, + prefix_lengths=prefix_lengths_tensor, + cu_seqlen_q=None, + max_q=1, + max_k=max_s, + ) + self.model.forward( + input_ids=input_ids, + position_ids=position_ids, + cu_seqlen_prefill=None, + kv_cache=self.kv_cache, + block_tables=block_tables, + slots=slots, + seqlen=seqlen, + max_s=max_s, + prefill_cache_indices=None, + lm_head_indices=None, + ) + del seqlen + + torch.cuda.synchronize() + + with torch.cuda.graph(graph, pool=MEM_POOL): + seqlen = Seqlen( + input_lengths=input_lengths_tensor, + prefix_lengths=prefix_lengths_tensor, + cu_seqlen_q=None, + max_q=1, + max_k=max_s, + ) + logits, speculative_logits = self.model.forward( + input_ids=input_ids, + position_ids=position_ids, + cu_seqlen_prefill=None, + kv_cache=self.kv_cache, + block_tables=block_tables, + slots=slots, + seqlen=seqlen, + max_s=max_s, + prefill_cache_indices=None, + lm_head_indices=None, + ) + self.cuda_graphs[bs]["logits"] = logits + self.cuda_graphs[bs]["speculative_logits"] = speculative_logits + torch.cuda.synchronize() + + def warmup(self, batch: FlashCausalLMBatch): + # The warmup batch is the biggest batch we could ever receive + empty_cache() + + try: + self.init_kv_cache( + batch.num_blocks, + self.num_layers, + self.num_kv_heads, + self.head_size, + self.dtype, + self.device, + ) + max_bt = batch.max_blocks + max_s = max_bt * BLOCK_SIZE + + if SYSTEM == "rocm" and os.environ.get("PYTORCH_TUNABLEOP_ENABLED", False): + torch.cuda.tunable.tuning_enable(False) + _, batch, _ = self.generate_token(batch) + except torch.cuda.OutOfMemoryError as e: + raise RuntimeError( + f"Not enough memory to handle {len(batch.input_ids)} prefill tokens. " + f"You need to decrease `--max-batch-prefill-tokens`" + ) from e + + synchronize(self.device) + + # Inspired by the original implementation in [vllm](https://github.com/vllm-project/vllm) + # Calculate the number of blocks that can be allocated with the free memory + dtype_size = torch.tensor([], dtype=self.dtype).element_size() + cache_block_size = BLOCK_SIZE * self.num_kv_heads * self.head_size + total_cache_size = self.num_layers * cache_block_size * 2 * dtype_size + + free_memory = get_free_memory(self.device, MEMORY_FRACTION) + batch_num_blocks = batch.num_blocks if batch is not None else 0 + + num_blocks = ( + # Leave 5% for some wiggle room + int((free_memory * TGI_WIGGLE_ROOM) // total_cache_size) + # Add batch.num_blocks as we allocated it above, so it is included in the peak memory. 
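+            # Rough illustration with assumed figures: BLOCK_SIZE=16,
+            # num_kv_heads=8, head_size=128, 32 layers and a 2-byte dtype
+            # give 16 * 8 * 128 * 32 * 2 * 2 = 2 MiB per block, so roughly
+            # 0.9 * free_memory / 2 MiB blocks fit in the budget above.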
+ + batch_num_blocks + ) + + del batch + + self.init_kv_cache( + num_blocks, + self.num_layers, + self.num_kv_heads, + self.head_size, + self.dtype, + self.device, + ) + + if SYSTEM == "rocm": + if ( + os.environ.get("PYTORCH_TUNABLEOP_ENABLED") is None + or os.environ.get("PYTORCH_TUNABLEOP_ENABLED") == "1" + ): + torch.cuda.tunable.enable() + + if os.environ.get("PYTORCH_TUNABLEOP_TUNING") != "0": + torch.cuda.tunable.tuning_enable(True) + + if os.environ.get("PYTORCH_TUNABLEOP_SEQLENS") is not None: + tuning_sequences = [ + int(val) + for val in os.environ["PYTORCH_TUNABLEOP_SEQLENS"].split(",") + ] + elif CUDA_GRAPHS is not None: + tuning_sequences = CUDA_GRAPHS + else: + tuning_sequences = [1, 2, 3, 4, 5, 6, 7] + + tunableop_filepath = os.path.join( + HUGGINGFACE_HUB_CACHE, + f"tunableop_{self.model_id.replace('/', '-')}_tp{self.world_size}_rank{self.rank}.csv", + ) + + log_master( + logger.info, + f"PyTorch TunableOp is enabled. The warmup may take several minutes, picking the ROCm optimal matrix multiplication kernel for the target lengths {', '.join([str(seqlen) for seqlen in tuning_sequences])}, with typical 5-8% latency improvement for small sequence lengths. The picked GEMMs are saved in the file {tunableop_filepath}. To disable TunableOp, please launch TGI with `PYTORCH_TUNABLEOP_ENABLED=0`.", + ) + + torch.cuda.tunable.set_filename( + tunableop_filepath, insert_device_ordinal=False + ) + + if os.path.isfile(tunableop_filepath): + log_master( + logger.info, + f"The file {tunableop_filepath} already exists and will be reused.", + ) + torch.cuda.tunable.read_file(tunableop_filepath) + + os.makedirs(HUGGINGFACE_HUB_CACHE, exist_ok=True) + + for seqlen in tuning_sequences: + log_master(logger.info, f"Warming up TunableOp for seqlen={seqlen}") + self.tunableop_warmup(seqlen) + torch.cuda.tunable.write_file(tunableop_filepath) + if os.environ.get("PYTORCH_TUNABLEOP_TUNING_AFTER_WARMUP") != "1": + torch.cuda.tunable.tuning_enable(False) + else: + log_master( + logger.info, + "PyTorch ROCm TunableOp (https://github.com/pytorch/pytorch/tree/main/aten/src/ATen/cuda/tunable) is disabled. TunableOp brings an additional 5-8% latency improvement for small sequence lengths but requires a warmup. If necessary, please use the environment variable PYTORCH_TUNABLEOP_ENABLED=1 to enable TunableOp.", + ) + + if CUDA_GRAPHS: + try: + log_master( + logger.info, f"Cuda Graphs are enabled for sizes {CUDA_GRAPHS}" + ) + # Warmup cuda graphs + for bs in CUDA_GRAPHS: + if self.speculate is None or self.speculate + 1 <= bs: + self.cuda_graph_warmup(bs, max_s, max_bt) + except torch.cuda.OutOfMemoryError: + logger.exception("Decode cuda graph warmup failed") + else: + log_master( + logger.info, f"Cuda Graphs are disabled (CUDA_GRAPHS={CUDA_GRAPHS})." + ) + + return int(num_blocks * BLOCK_SIZE) + + def tunableop_warmup(self, seqlen: int): + input_ids = torch.zeros(seqlen, dtype=torch.int64, device=self.device) + position_ids = torch.zeros(seqlen, dtype=torch.int32, device=self.device) + slots = torch.arange(seqlen, dtype=torch.int64, device=self.device) + + # Dummy value, some models (starcoder2) don't accept `None`. 
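+        # The dummy tensors below drive a single prefill-shaped forward pass
+        # of length `seqlen` so TunableOp can benchmark and record its GEMM
+        # selections for that shape before serving real requests.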
+ input_lengths = torch.ones(seqlen, dtype=torch.int32, device=self.device) + prefix_lens_tensor = torch.zeros(seqlen, dtype=torch.int32, device=self.device) + cu_seqlen_prefill = torch.tensor( + [0, seqlen], device=self.device, dtype=torch.int32 + ) + max_s = seqlen + seqlen = Seqlen( + input_lengths=input_lengths, + prefix_lengths=prefix_lens_tensor, + cu_seqlen_q=cu_seqlen_prefill, + max_q=1, + max_k=seqlen, + ) + + # We pass a `cu_seqlen_prefill` in order not to have to deal with paged attention cache allocation/deallocation. + self.model.forward( + input_ids=input_ids, + position_ids=position_ids, + cu_seqlen_prefill=cu_seqlen_prefill, + kv_cache=self.kv_cache, + block_tables=None, + seqlen=seqlen, + slots=slots, + max_s=max_s, + lm_head_indices=None, + prefill_cache_indices=None, + ) + + def forward( + self, batch: FlashCausalLMBatch, adapter_data: AdapterBatchData + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + # Model Forward + if batch.speculative_ids is not None: + input_ids = batch.input_ids + position_ids = batch.position_ids + cu_seqlen_prefill = batch.cu_seqlen_prefill + kv_cache = self.kv_cache + block_tables = batch.block_tables_tensor + slots = batch.slots[batch.slot_indices] + input_lengths = batch.input_lengths_tensor + max_s = batch.max_seqlen + lm_head_indices = batch.prefill_head_indices + + speculative_ids = batch.speculative_ids + + B, speculative_length = speculative_ids.shape + new_length = speculative_length + 1 + new_input_ids = torch.cat( + [input_ids.unsqueeze(-1), speculative_ids], dim=1 + ).reshape(-1) + arange = torch.arange(new_length, device=position_ids.device).unsqueeze(0) + arange_int = arange.to(dtype=torch.int32) + new_position_ids = ( + position_ids.unsqueeze(-1).expand(B, new_length) + arange + ).view(-1) + slots = (slots.unsqueeze(-1).expand(B, new_length) + arange_int).view(-1) + input_lengths = ( + input_lengths.unsqueeze(-1).expand(B, new_length) + arange_int + ).view(-1) + prefix_lens_tensor = ( + batch.prefix_lens_tensor.unsqueeze(-1).expand(B, new_length) + ).reshape(-1) + + # Add Copy the block tables for all members + block_tables = ( + block_tables.unsqueeze(1) + .expand(B, new_length, -1) + .reshape(B * new_length, -1) + .contiguous() + ) + max_s = max_s + speculative_length + + input_ids = new_input_ids + position_ids = new_position_ids + else: + input_ids = batch.input_ids + position_ids = batch.position_ids + cu_seqlen_prefill = batch.cu_seqlen_prefill + kv_cache = self.kv_cache + block_tables = batch.block_tables_tensor + slots = batch.slots[batch.slot_indices] + input_lengths = batch.input_lengths_tensor + prefix_lens_tensor = batch.prefix_lens_tensor + max_s = batch.max_seqlen + lm_head_indices = batch.prefill_head_indices + + if cu_seqlen_prefill is None and self.max_past() is not None: + # In decode, not prefill, we're actually overwriting the KV-cache + # in a circular buffer mode. + # This makes sure the max_s for the decode pass is correct. 
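+            # For example, with a hypothetical sliding window (max_past) of
+            # 4096 and a sequence already at position 10000, max_s is clamped
+            # to 4096: only the last window of KV entries is still resident.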
+ max_s = min(self.max_past(), max_s) + + bs = input_ids.shape[0] + sorted_padded_bs = sorted([k for k in self.cuda_graphs.keys() if k >= bs]) + if sorted_padded_bs: + # Get associated cuda graph + cuda_graph = self.cuda_graphs[sorted_padded_bs[0]] + else: + cuda_graph = None + + if cu_seqlen_prefill is not None or cuda_graph is None: + if ATTENTION == "flashinfer": + block_tables = block_tables_to_ragged( + block_tables=block_tables, + input_lengths=batch.input_lengths, + prefix_lens=batch.prefix_lens, + ) + with self._forward_context( + block_tables=block_tables, + cu_seqlen_prefill=cu_seqlen_prefill, + input_lengths_tensor=input_lengths, + prefix_lens_tensor=prefix_lens_tensor, + ): + max_k = (input_lengths + prefix_lens_tensor).max().item() + seqlen = Seqlen( + input_lengths=input_lengths, + prefix_lengths=prefix_lens_tensor, + cu_seqlen_q=cu_seqlen_prefill, + max_q=max_s, + max_k=max_k, + ) + logits, speculative_logits = self.model.forward( + input_ids=input_ids, + position_ids=position_ids, + cu_seqlen_prefill=cu_seqlen_prefill, + kv_cache=kv_cache, + block_tables=block_tables, + slots=slots, + seqlen=seqlen, + max_s=max_s, + prefill_cache_indices=batch.prefill_cache_indices, + lm_head_indices=lm_head_indices, + adapter_data=adapter_data, + ) + if batch.prefill_cache_indices is not None: + batch.prefill_cache_indices = None + return logits, speculative_logits + + # Copy inputs to the static inputs of the cuda graph + # Static inputs are potentially padded + cuda_graph["input_ids"][: input_ids.shape[0]] = input_ids + cuda_graph["position_ids"][: position_ids.shape[0]] = position_ids + if ATTENTION == "flashinfer": + block_tables = block_tables_to_ragged( + block_tables=block_tables, + input_lengths=batch.input_lengths, + prefix_lens=batch.prefix_lens, + ) + # assert block_tables.shape[0] >= slots.shape[0] + cuda_graph["block_tables"][: block_tables.shape[0]] = block_tables + else: + cuda_graph["block_tables"][ + : block_tables.shape[0], : block_tables.shape[1] + ] = block_tables + + # XXX: This is working only because block 0 is reserved for the healthcheck + # so it doesn't matter if we override it with bogus values. 
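+        # The graph was captured with fixed-size buffers for a padded batch,
+        # so the unused tail of every static input still has to hold valid
+        # values: zero-filled slots alias the reserved healthcheck block and
+        # the real entries are then copied over the leading positions.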
+ cuda_graph["slots"].fill_(0) + cuda_graph["slots"][: slots.shape[0]] = slots + cuda_graph["input_lengths"].zero_() + cuda_graph["input_lengths"][: input_lengths.shape[0]] = input_lengths + cuda_graph["prefix_lengths"].zero_() + cuda_graph["prefix_lengths"][: prefix_lens_tensor.shape[0]] = prefix_lens_tensor + + with self._forward_context( + block_tables=cuda_graph["block_tables"], + cu_seqlen_prefill=None, + input_lengths_tensor=cuda_graph["input_lengths"], + prefix_lens_tensor=cuda_graph["prefix_lengths"], + state=cuda_graph["state"], + ): + # Replay the graph + cuda_graph["graph"].replay() + + # Slice output to the correct shape + speculative_logits = ( + cuda_graph["speculative_logits"][:bs] + if cuda_graph["speculative_logits"] is not None + else None + ) + logits = cuda_graph["logits"][:bs] + return logits, speculative_logits + + @tracer.start_as_current_span("generate_token") + def generate_token( + self, batch: FlashCausalLMBatch + ) -> Tuple[List[Generation], Optional[FlashCausalLMBatch], Tuple[int, int]]: + start = time.time_ns() + prefill = batch.cu_seqlen_prefill is not None + prefill_logprobs = batch.prefill_next_token_indices is not None + + # Update adapter indices for speculative tokens (if present) + adapter_meta = batch.adapter_meta + if batch.speculative_ids is not None: + B, speculative_length = batch.speculative_ids.shape + new_length = speculative_length + 1 + adapter_indices = ( + adapter_meta.adapter_indices.unsqueeze(-1) + .expand(B, new_length) + .reshape(-1) + ) + adapter_segments = adapter_meta.adapter_segments * new_length + adapter_meta = AdapterBatchMetadata( + adapter_indices=adapter_indices, + adapter_set=adapter_meta.adapter_set, + adapter_segments=adapter_segments, + segment_indices=adapter_meta.segment_indices, + ) + + # Assign pointers to adapter weights + # TODO(travis): don't update this if indices haven't changed + adapter_data = AdapterBatchData.from_meta( + adapter_meta, + self.layer_to_adapter_weights, + prefill, + batch.prefill_head_indices, + ) + + out, speculative_logits = self.forward(batch, adapter_data) + + if prefill: + next_token_logits = ( + out[batch.prefill_next_token_indices] if prefill_logprobs else out + ) + if speculative_logits is not None: + speculative_logits = ( + speculative_logits[batch.prefill_next_token_indices] + if prefill_logprobs + else speculative_logits + ) + next_adapter_indices = batch.adapter_meta.adapter_indices.new_empty( + len(batch) + ) + + else: + next_token_logits = out + next_adapter_indices = batch.adapter_meta.adapter_indices + + speculate = get_speculate() + ( + next_input_ids, + next_token_logprobs, + logprobs, + accepted_ids, + speculative_ids, + ) = batch.next_token_chooser( + batch.all_input_ids_tensor[:, : batch.max_seqlen], + next_token_logits, + speculate, + batch.speculative_ids, + speculative_logits, + ) + + batch_top_token_ids, batch_top_token_logprobs = batch_top_tokens( + batch.top_n_tokens, batch.top_n_tokens_tensor, logprobs, accepted_ids + ) + + if prefill: + if len(batch) > 1 and prefill_logprobs: + # We create the prefill_tokens_indices tensor that will be used to gather prefill logprobs + # When batch == 1, we will just use the batch.input_ids values directly + prefill_tokens_indices = batch.input_ids.new_zeros(len(out)) + + next_position_ids = batch.position_ids.new_empty(len(batch)) + batch.slot_indices = batch.slot_indices[batch.cu_seqlen_prefill[1:] - 1] + # We do not need cu_seqlen_prefill anymore + batch.cu_seqlen_prefill = None + else: + prefill_logprobs = None + next_position_ids 
= batch.position_ids + + # Cumulative length + cumulative_length = 0 + + # Results + generations: List[Generation] = [] + stopped = True + + # Zipped iterator + iterator = zip(batch.input_lengths, batch.all_input_ids, accepted_ids) + + # We do two for loops as the first one can run completely asynchronously from the GPU while for the second + # one, we need to first do a GPU <-> CPU sync + # It is faster if we delay this sync for the maximum amount of time + + # For each member of the batch + index = 0 + for i, (input_length, all_input_ids, n_accepted_ids) in enumerate(iterator): + # Indexing metadata + start_index = cumulative_length + end_index = cumulative_length + input_length + + if prefill: + # Indexing metadata + out_start_index = batch.prefill_cu_outlens[i] + out_end_index = batch.prefill_cu_outlens[i + 1] + out_length = out_end_index - out_start_index + + # Initialize position_ids + # In decode, we do not need this as we can just increment position ids + next_position_ids[i] = batch.position_ids[end_index - 1] + + # Initialize adapter indices + # In decode, we only have one token per row in the batch, so grab last index + next_adapter_indices[i] = batch.adapter_meta.adapter_indices[ + end_index - 1 + ] + + # Used to gather prefill logprobs + # Copy batch.input_ids to prefill_token_indices + if prefill_logprobs: + if len(batch) > 1: + prefill_tokens_indices[out_start_index : out_end_index - 1] = ( + batch.input_ids[start_index + 1 : start_index + out_length] + ) + else: + # Set prefill_tokens_indices to the correct slice + prefill_tokens_indices = batch.input_ids[ + start_index + 1 : start_index + out_length + ] + + for j in range(n_accepted_ids): + batch.all_input_ids_tensor[i, input_length + j] = next_input_ids[index] + index += 1 + + cumulative_length += input_length + + # Update values + batch.input_ids = next_input_ids[accepted_ids.cumsum(dim=-1) - 1] + batch.speculative_ids = speculative_ids + batch.position_ids = next_position_ids + accepted_ids + batch.input_lengths_tensor += accepted_ids + batch.slot_indices += accepted_ids + batch.adapter_meta.adapter_indices = next_adapter_indices + + if prefill: + # adjust segment lengths to account for all request lengths being 1 during decoding + adapter_segments, _ = find_segments(batch.adapter_meta.adapter_indices) + batch.adapter_meta.adapter_segments = torch.tensor( + adapter_segments, + dtype=torch.int32, + device=batch.adapter_meta.adapter_segments.device, + ) + + if prefill and prefill_logprobs: + # Get prefill logprobs + prefill_logprobs_tensor = torch.log_softmax(out, -1) + prefill_logprobs = torch.gather( + prefill_logprobs_tensor, 1, prefill_tokens_indices.view(-1, 1) + ) + # GPU <-> CPU sync + prefill_logprobs = prefill_logprobs.view(-1).tolist() + + # GPU <-> CPU sync + next_token_logprobs = next_token_logprobs.tolist() + next_token_ids = next_input_ids.tolist() + accepted_ids = accepted_ids.tolist() + start_decode = time.time_ns() + + # Zipped iterator + iterator = zip( + batch.requests, + batch.input_lengths, + batch.prefix_offsets, + batch.read_offsets, + batch.stopping_criterias, + batch.all_input_ids, + batch.prefix_ids, + batch.next_token_chooser.do_sample, + batch.next_token_chooser.seeds, + batch.top_n_tokens, + accepted_ids, + batch_top_token_ids, + batch_top_token_logprobs, + ) + + # For each member of the batch + index = 0 + for i, ( + request, + input_length, + prefix_offset, + read_offset, + stopping_criteria, + all_input_ids, + prefix_ids, + do_sample, + seed, + top_n_tokens, + n_accepted_ids, + 
top_token_ids, + top_token_logprobs, + ) in enumerate(iterator): + # Append next token to all tokens + next_token_texts = [] + left = 0 + + if n_accepted_ids > 1: + log_master(logger.debug, f"speculated ids {n_accepted_ids - 1}") + + current_stopped = False + for j in range(index, index + n_accepted_ids): + # Generated token + next_token_id = next_token_ids[j] + all_input_ids.append(next_token_id) + next_token_text, prefix_offset, read_offset = self.decode_token( + all_input_ids, + prefix_offset, + read_offset, + ) + next_token_texts.append(next_token_text) + + stop, reason = stopping_criteria( + next_token_id, + next_token_text, + ) + + if stop: + left = index + n_accepted_ids - j - 1 + current_stopped = True + break + else: + current_stopped = False + stopped = stopped and current_stopped + + _next_token_ids = next_token_ids[index : index + n_accepted_ids - left] + _next_token_logprobs = next_token_logprobs[ + index : index + n_accepted_ids - left + ] + index += n_accepted_ids + + # Shard generations + # All generations will be appended in the rust sharded client + if i % self.world_size == self.rank: + if stop: + # Decode generated tokens + output_text, _, _ = self.decode_token( + all_input_ids, + prefix_offset=len(all_input_ids) + - stopping_criteria.current_tokens + - 1, + read_offset=len(all_input_ids) + - stopping_criteria.current_tokens, + skip_special_tokens=True, + ) + generated_text = GeneratedText( + output_text, + stopping_criteria.current_tokens, + reason, + seed if do_sample else None, + ) + else: + generated_text = None + + # Prefill + if prefill and request.prefill_logprobs: + out_start_index = batch.prefill_cu_outlens[i] + out_end_index = batch.prefill_cu_outlens[i + 1] + + # Remove generated token to only have prefill and add nan for first prompt token + request_prefill_logprobs = ( + [float("nan")] * (len(prefix_ids) + 1) + ) + prefill_logprobs[out_start_index : out_end_index - 1] + prefill_token_ids = all_input_ids[:-1] + prefill_texts = self.tokenizer.batch_decode( + prefix_ids + prefill_token_ids, + clean_up_tokenization_spaces=False, + skip_special_tokens=False, + ) + + prefill_tokens = Tokens( + prefix_ids + prefill_token_ids, + request_prefill_logprobs, + prefill_texts, + is_special=[], + ) + else: + prefill_tokens = None + + if top_n_tokens > 0: + all_top_tokens = [] + for top_token_ids, top_token_logprobs in zip( + top_token_ids, top_token_logprobs + ): + toptoken_texts = self.tokenizer.batch_decode( + top_token_ids, + clean_up_tokenization_spaces=False, + skip_special_tokens=False, + ) + special_toptokens = [ + token_id in self.all_special_ids + for token_id in top_token_ids + ] + top_tokens = Tokens( + top_token_ids, + top_token_logprobs, + toptoken_texts, + special_toptokens, + ) + all_top_tokens.append(top_tokens) + top_tokens = all_top_tokens + else: + top_tokens = None + + generation = Generation( + request.id, + prefill_tokens, + Tokens( + _next_token_ids, + _next_token_logprobs, + next_token_texts, + [nid in self.all_special_ids for nid in _next_token_ids], + ), + generated_text, + top_tokens, + ) + + generations.append(generation) + + # accept each new token for this specific request since we may + # have more than one new token per request with speculative decoding + for next_token_id in _next_token_ids: + batch.next_token_chooser = ( + batch.next_token_chooser.advance_grammar_single(i, next_token_id) + ) + + # Update values + batch.input_lengths[i] = input_length + n_accepted_ids + if batch.input_lengths[i] > batch.max_seqlen: + batch.max_seqlen = 
batch.input_lengths[i] + batch.prefix_offsets[i] = prefix_offset + batch.read_offsets[i] = read_offset + batch.all_input_ids[i] = all_input_ids + + if stopped: + # No need to return a batch if we know that all requests stopped + forward_ns = start_decode - start + decode_ns = time.time_ns() - start_decode + return generations, None, (forward_ns, decode_ns) + + batch.prefill_cu_outlens = None + batch.prefill_head_indices = None + batch.prefill_next_token_indices = None + + forward_ns = start_decode - start + decode_ns = time.time_ns() - start_decode + return generations, batch, (forward_ns, decode_ns) + + def _forward_context( + self, + *, + block_tables: torch.Tensor, + cu_seqlen_prefill: Optional[torch.Tensor], + input_lengths_tensor: torch.Tensor, + prefix_lens_tensor: torch.Tensor, + state: Optional[Any] = None, + ) -> ContextManager: + if ATTENTION != "flashinfer": + return nullcontext() + + from text_generation_server.layers.attention.flashinfer import ( + use_decode_state, + use_prefill_with_paged_kv_state, + ) + + # has_prefix_lens = any(prefix_len > 0 for prefix_len in prefix_lens) + + if cu_seqlen_prefill is not None: + return use_prefill_with_paged_kv_state( + state=( + state if state is not None else self.prefill_with_paged_kv_state + ), + # block_tables=block_tables_to_ragged( + # block_tables=block_tables, + # input_lengths=input_lengths, + # prefix_lens=prefix_lens, + # ), + block_tables=block_tables, + cu_seqlens=cu_seqlen_prefill, + input_lengths=input_lengths_tensor + prefix_lens_tensor, + num_heads=self.num_heads, + num_kv_heads=self.num_kv_heads, + head_size=self.head_size, + page_size=BLOCK_SIZE, + dtype=self.dtype, + window_left=self.sliding_window, + ) + else: + assert input_lengths_tensor is not None + return use_decode_state( + state=state if state is not None else self.decode_state, + input_lengths=input_lengths_tensor + prefix_lens_tensor, + block_tables=block_tables, + num_heads=self.num_heads, + num_kv_heads=self.num_kv_heads, + head_size=self.head_size, + page_size=BLOCK_SIZE, + dtype=self.dtype, + window_left=self.sliding_window, + ) + + +def block_tables_to_ragged( + *, block_tables: torch.Tensor, input_lengths: List[int], prefix_lens: List[int] +) -> torch.Tensor: + """Convert block table to ragged format compatible with FlashInfer.""" + assert len(input_lengths) == len(prefix_lens) + + total_len = sum(input_lengths) + sum(prefix_lens) + block_tables_ragged = torch.empty( + total_len, dtype=torch.int32, device=block_tables.device + ) + + offset = 0 + for i, (input_length, prefix_len) in enumerate(zip(input_lengths, prefix_lens)): + seq_len = prefix_len + input_length + block_tables_ragged[offset : offset + seq_len] = block_tables[i][:seq_len] + offset += seq_len + + return block_tables_ragged diff --git a/backends/gaudi/server/text_generation_server/models/galactica.py b/backends/gaudi/server/text_generation_server/models/galactica.py new file mode 100644 index 000000000..7c4e462c7 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/models/galactica.py @@ -0,0 +1,156 @@ +import re +import torch +import torch.distributed + + +from transformers import ( + PreTrainedTokenizerBase, +) +from text_generation_server.models.causal_lm import CausalLMBatch +from text_generation_server.pb import generate_pb2 +from text_generation_server.utils import ( + NextTokenChooser, + StoppingCriteria, +) +from text_generation_server.utils.chunks import concat_text_chunks + +# CREDIT: Papers with code => https://github.com/paperswithcode/galai/blob/main/galai/utils.py 
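+# For illustration: a prompt such as "[START_DNA]ACGT[END_DNA]" gets the split
+# marker defined below injected before every character of the inner sequence,
+# so each base ends up as its own token after pretokenization.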
+ +# we split individual characters inside special tokens like [START_DNA] +CUSTOM_SEQ_RE = re.compile(r"(\[START_(DNA|SMILES|I_SMILES|AMINO)])(.*?)(\[END_\2])") + +# token added to implement a custom sequence tokenization. This token is added at +# corpus cleaning step and removed in pretokenization. The digits are added to increase the chance +# that they do not occur in the corpus. The digits are escaped so that the token does not appear +# literally in the source code in case we ever include it in the training data. +SPLIT_MARKER = f"SPL{1}T-TH{1}S-Pl3A5E" + + +def _insert_split_marker(m: re.Match): + """ + Applies split marker based on a regex match of special tokens such as + [START_DNA]. + Parameters + ---------- + n : str + Input text to split + Returns + ---------- + str - the text with the split token added + """ + start_token, _, sequence, end_token = m.groups() + sequence = re.sub(r"(.)", rf"{SPLIT_MARKER}\1", sequence, flags=re.DOTALL) + return f"{start_token}{sequence}{SPLIT_MARKER}{end_token}" + + +def escape_custom_split_sequence(text): + """ + Applies custom splitting to the text for GALILEO's tokenization + Parameters + ---------- + text : str + Input text to split + Returns + ---------- + str - the text with the split token added + """ + return CUSTOM_SEQ_RE.sub(_insert_split_marker, text) + + +# END CREDIT + + +class GalacticaCausalLMBatch(CausalLMBatch): + @classmethod + def from_pb( + cls, + pb: generate_pb2.Batch, + tokenizer: PreTrainedTokenizerBase, + dtype: torch.dtype, + device: torch.device, + ) -> "GalacticaCausalLMBatch": + inputs = [] + next_token_choosers = [] + stopping_criterias = [] + prefix_offsets = [] + top_n_tokens = [] + read_offsets = [] + requests_idx_mapping = {} + + # Parse batch + max_truncation = 0 + padding_right_offset = 0 + max_decode_tokens = 0 + for i, r in enumerate(pb.requests): + requests_idx_mapping[r.id] = i + # Add escape_custom_split_sequence to the CausalLMBatch logic + inputs.append( + escape_custom_split_sequence(concat_text_chunks(r.input_chunks.chunks)) + ) + next_token_choosers.append( + NextTokenChooser.from_pb(r.parameters, device, tokenizer) + ) + stopping_criteria = StoppingCriteria.from_pb( + r.stopping_parameters, tokenizer + ) + stopping_criterias.append(stopping_criteria) + top_n_tokens.append(r.top_n_tokens) + max_truncation = max(max_truncation, r.truncate) + max_decode_tokens += stopping_criteria.max_new_tokens + padding_right_offset = max( + padding_right_offset, stopping_criteria.max_new_tokens + ) + + tokenized_inputs = tokenizer( + inputs, + return_tensors="pt", + padding=True, + return_token_type_ids=False, + truncation=True, + max_length=max_truncation, + ).to(device) + for _ in pb.requests: + input_len = tokenized_inputs["input_ids"].shape[1] + prefix_offsets.append(0) + read_offsets.append(input_len) + + input_lengths = tokenized_inputs["attention_mask"].sum(1) + max_input_length = input_lengths.max() + + input_ids = tokenized_inputs["input_ids"] + # Allocate maximum attention_mask + attention_mask = input_ids.new_zeros( + (pb.size, max_input_length + padding_right_offset) + ) + # Copy tokenizer attention_mask into fully allocated attention_mask + attention_mask[:, :max_input_length] = tokenized_inputs["attention_mask"] + + position_ids = tokenized_inputs["attention_mask"].long().cumsum(-1) - 1 + position_ids.masked_fill_(tokenized_inputs["attention_mask"] == 0, 1) + all_input_ids = tokenized_inputs["input_ids"].T.split(1, dim=1) + top_n_tokens_tensor = torch.tensor( + top_n_tokens, device=device, 
dtype=torch.int64 + ) + + max_tokens = len(inputs) * max_input_length + max_decode_tokens + + return cls( + batch_id=pb.id, + requests=pb.requests, + requests_idx_mapping=requests_idx_mapping, + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=None, + all_input_ids=list(all_input_ids), + input_lengths=input_lengths.tolist(), + prefix_offsets=prefix_offsets, + read_offsets=read_offsets, + next_token_choosers=next_token_choosers, + stopping_criterias=stopping_criterias, + top_n_tokens=top_n_tokens, + top_n_tokens_tensor=top_n_tokens_tensor, + max_input_length=max_input_length.item(), + padding_right_offset=padding_right_offset, + max_tokens=max_tokens, + ) diff --git a/backends/gaudi/server/text_generation_server/models/globals.py b/backends/gaudi/server/text_generation_server/models/globals.py new file mode 100644 index 000000000..30a5d3da4 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/models/globals.py @@ -0,0 +1,74 @@ +import torch +import os +from typing import Dict, Optional +from loguru import logger +from text_generation_server.utils.log import log_master + +ATTENTION = os.getenv("ATTENTION", "default") +# default_prefix_caching = "1" if ATTENTION in {"flashinfer", "flashdecoding"} else "0" +PREFIX_CACHING = os.getenv("PREFIX_CACHING", "0").lower() in { + "1", + "true", +} +PREFILL_CHUNKING = os.getenv("PREFILL_CHUNKING", "0").lower() in {"1", "true"} +log_master(logger.info, f"Using prefix caching = {PREFIX_CACHING}") +_expected = {"paged", "flashdecoding", "flashinfer", "default"} +assert ( + ATTENTION in _expected +), f"Attention is not valid {ATTENTION}, expected {_expected}" +log_master(logger.info, f"Using Attention = {ATTENTION}") + +if PREFIX_CACHING and ATTENTION not in {"flashinfer", "flashdecoding"}: + raise RuntimeError("Prefix caching is only supported with flashinfer") + +MEM_POOL = torch.cuda.graph_pool_handle() if torch.cuda.is_available() else None +TGI_WIGGLE_ROOM = float(os.getenv("TGI_WIGGLE_ROOM", "0.90")) +assert TGI_WIGGLE_ROOM > 0 +assert TGI_WIGGLE_ROOM < 1 + +# This is overridden by the cli +BLOCK_SIZE: int +if ATTENTION == "flashdecoding": + BLOCK_SIZE = 256 +elif ATTENTION == "flashinfer": + BLOCK_SIZE = 1 +else: + BLOCK_SIZE = 16 + +# This is overridden by the cli +cuda_graphs = os.getenv("CUDA_GRAPHS") +if cuda_graphs is not None: + try: + cuda_graphs = [int(item) for item in cuda_graphs.split(",")] + except Exception as e: + raise RuntimeError( + f"Could not parse cuda graphs {cuda_graphs}, expected comma separated list for batch sizes to run on: {e}" + ) +else: + cuda_graphs = None + +CUDA_GRAPHS = cuda_graphs + +# This is overridden at model loading. +global MODEL_ID +MODEL_ID = None + + +def set_model_id(model_id: str): + global MODEL_ID + MODEL_ID = model_id + + +# NOTE: eventually we should move this into the router and pass back the +# index in all cases. 
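+# ADAPTER_TO_INDEX maps a LoRA adapter id to its integer index. It is set once
+# by the server via set_adapter_to_index() and read back with
+# get_adapter_to_index().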
+ADAPTER_TO_INDEX: Optional[Dict[str, int]] = None + + +def set_adapter_to_index(adapter_to_index: Dict[str, int]): + global ADAPTER_TO_INDEX + ADAPTER_TO_INDEX = adapter_to_index + + +def get_adapter_to_index(): + global ADAPTER_TO_INDEX + return ADAPTER_TO_INDEX diff --git a/backends/gaudi/server/text_generation_server/models/idefics_causal_lm.py b/backends/gaudi/server/text_generation_server/models/idefics_causal_lm.py new file mode 100644 index 000000000..9a7a6fe15 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/models/idefics_causal_lm.py @@ -0,0 +1,899 @@ +from io import BytesIO +from PIL import Image +import torch +import time + +from dataclasses import dataclass +from opentelemetry import trace +from transformers import ( + AutoConfig, + AutoProcessor, + AutoTokenizer, + PreTrainedTokenizerBase, + ProcessorMixin, +) +from typing import Optional, Tuple, List, Type, Dict + +from text_generation_server.models import Model +from text_generation_server.models.types import ( + Batch, + Tokens, + Generation, + GeneratedText, +) +from text_generation_server.pb import generate_pb2 +from text_generation_server.utils import NextTokenChooser, StoppingCriteria, Sampling +import torch.distributed +from text_generation_server.models.custom_modeling.idefics_modeling import ( + IdeficsForVisionText2Text, +) +from text_generation_server.utils import ( + initialize_torch_distributed, + weight_files, + Weights, +) +from text_generation_server.utils.quantization import get_loader + +from text_generation_server.utils.import_utils import SYSTEM + + +tracer = trace.get_tracer(__name__) + + +@dataclass +class IdeficsCausalLMBatch(Batch): + batch_id: int + requests: List[generate_pb2.Request] + requests_idx_mapping: Dict[int, int] + + # Decoder values + input_ids: torch.Tensor + attention_mask: torch.Tensor + position_ids: torch.Tensor + pixel_values: Optional[torch.Tensor] + image_hidden_states: Optional[torch.Tensor] + image_attention_mask: Optional[torch.Tensor] + past_key_values: Optional[List[Tuple]] + + # All tokens + all_input_ids: List[torch.Tensor] + + # Lengths of all generations present in the batch + input_lengths: List[int] + prefix_offsets: List[int] + read_offsets: List[int] + + # Generation helpers + next_token_choosers: List[NextTokenChooser] + stopping_criterias: List[StoppingCriteria] + + # Metadata used for padding + max_input_length: int + padding_right_offset: int + + # Maximum number of tokens this batch will grow to + max_tokens: int + + # Past metadata + keys_head_dim_last: bool = True + + def to_pb(self) -> generate_pb2.CachedBatch: + return generate_pb2.CachedBatch( + id=self.batch_id, + request_ids=[r.id for r in self.requests], + size=len(self), + max_tokens=self.max_tokens, + ) + + @classmethod + def from_pb( + cls, + pb: generate_pb2.Batch, + tokenizer: PreTrainedTokenizerBase, + dtype: torch.dtype, + device: torch.device, + ) -> "IdeficsCausalLMBatch": + raise NotImplementedError + + @classmethod + def from_pb_processor( + cls, + pb: generate_pb2.Batch, + tokenizer: PreTrainedTokenizerBase, + processor: ProcessorMixin, # Hack + config, + dtype: torch.dtype, + device: torch.device, + ) -> "IdeficsCausalLMBatch": + inputs = [] + next_token_choosers = [] + stopping_criterias = [] + prefix_offsets = [] + read_offsets = [] + requests_idx_mapping = {} + + # Parse batch + max_truncation = 0 + padding_right_offset = 0 + max_decode_tokens = 0 + for i, r in enumerate(pb.requests): + requests_idx_mapping[r.id] = i + inputs.append(r.input_chunks.chunks) + 
next_token_choosers.append( + NextTokenChooser.from_pb(r.parameters, device, tokenizer) + ) + stopping_criteria = StoppingCriteria.from_pb( + r.stopping_parameters, tokenizer + ) + stopping_criterias.append(stopping_criteria) + max_truncation = max(max_truncation, r.truncate) + max_decode_tokens += stopping_criteria.max_new_tokens + padding_right_offset = max( + padding_right_offset, stopping_criteria.max_new_tokens + ) + + # TODO Check impact on idefics + prompts = [] + for inp in inputs: + # Each input is encoded into a list, where each element of this input list is either a string or a URL + prompt = [] + for chunk in inp: + chunk_type = chunk.WhichOneof("chunk") + if chunk_type == "text": + prompt.append(chunk.text) + elif chunk_type == "image": + image = Image.open(BytesIO(chunk.image.data)) + prompt.append(image) + else: + raise RuntimeError(f"Invalid chunk type {chunk_type}") + prompts.append(prompt) + + # The processor replaces the call to tokenizer, and + # a/ takes care of fetching images from the URL + # b/ generate the correct input_ids, attention_mask, pixel_values, image_attention_mask to feed to the model + tokenized_inputs = processor( + prompts, + return_tensors="pt", + padding=True, + truncation=True, + max_length=max_truncation, + # TODO Check impact on idefics + # add_end_of_utterance_token=False, # Already taken care of inside the prompts, so bypassing the processor's handling of this token + ).to(device) + for _ in pb.requests: + input_len = tokenized_inputs["input_ids"].shape[1] + prefix_offsets.append( + input_len - 5 + ) # To decode without potential fallbacks errors + read_offsets.append( + input_len + ) # To decode without potential fallbacks errors + + input_lengths = tokenized_inputs["attention_mask"].sum(1) + max_input_length = input_lengths.max() + + input_ids = tokenized_inputs["input_ids"] + pixel_values = tokenized_inputs.get("pixel_values", None) + image_hidden_states = None + # Allocate maximum attention_mask + attention_mask = input_ids.new_zeros( + (pb.size, max_input_length + padding_right_offset) + ) + # Copy tokenizer attention_mask into fully allocated attention_mask + attention_mask[:, :max_input_length] = tokenized_inputs["attention_mask"] + # Do the same for image_attention_mask + if pixel_values is None: + image_attention_mask = None + else: + image_attention_mask = input_ids.new_zeros( + ( + pb.size, + max_input_length + padding_right_offset, + pixel_values.size(1), + ) + ) + image_attention_mask[:, :max_input_length, :] = tokenized_inputs[ + "image_attention_mask" + ] + + position_ids = tokenized_inputs["attention_mask"].long().cumsum(-1) - 1 + position_ids.masked_fill_(tokenized_inputs["attention_mask"] == 0, 1) + all_input_ids = tokenized_inputs["input_ids"].T.split( + 1, dim=1 + ) # It's input_ids but splitted into a tuple of tensors where each tensor is (seq_len, 1) size. 
It is then transformed into a list + + max_tokens = len(inputs) * (max_input_length + max_decode_tokens) + + return cls( + batch_id=pb.id, + requests=pb.requests, + requests_idx_mapping=requests_idx_mapping, + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + pixel_values=pixel_values, + image_hidden_states=image_hidden_states, + image_attention_mask=image_attention_mask, + past_key_values=None, + all_input_ids=list(all_input_ids), + input_lengths=input_lengths.tolist(), + prefix_offsets=prefix_offsets, + read_offsets=read_offsets, + next_token_choosers=next_token_choosers, + stopping_criterias=stopping_criterias, + max_input_length=max_input_length.item(), + padding_right_offset=padding_right_offset, + max_tokens=max_tokens, + ) + + @tracer.start_as_current_span("filter") + def filter(self, request_ids: List[int]) -> Optional["IdeficsCausalLMBatch"]: + # It deletes requests from the batch. For instance when client lost connection + if len(request_ids) == 0: + raise ValueError("Batch must have at least one request") + if len(request_ids) == len(self): + return self + + keep_indices = [] + + # New values after filtering + requests_idx_mapping = {} + requests = [] + input_lengths = [] + prefix_offsets = [] + read_offsets = [] + all_input_ids = [] + max_input_length = 0 + + next_token_choosers = [] + stopping_criterias = [] + + total_remaining_decode_tokens = 0 + new_padding_right_offset = 0 + + for i, request_id in enumerate(request_ids): + idx = self.requests_idx_mapping[request_id] + requests_idx_mapping[request_id] = i + keep_indices.append(idx) + + requests.append(self.requests[idx]) + prefix_offsets.append(self.prefix_offsets[idx]) + read_offsets.append(self.read_offsets[idx]) + all_input_ids.append(self.all_input_ids[idx]) + + request_input_length = self.input_lengths[idx] + input_lengths.append(request_input_length) + max_input_length = max(max_input_length, request_input_length) + + next_token_choosers.append(self.next_token_choosers[idx]) + stopping_criteria = self.stopping_criterias[idx] + stopping_criterias.append(stopping_criteria) + remaining_decode_tokens = ( + stopping_criteria.max_new_tokens - stopping_criteria.current_tokens + ) + total_remaining_decode_tokens += remaining_decode_tokens + new_padding_right_offset = max( + new_padding_right_offset, remaining_decode_tokens + ) + + # Apply indices to input_ids, attention mask, past key values and other items that need to be cached + input_ids = self.input_ids[keep_indices] + position_ids = self.position_ids[keep_indices] + self.attention_mask = self.attention_mask[ + keep_indices, + -(self.padding_right_offset + max_input_length) : ( + self.attention_mask.shape[1] - self.padding_right_offset + ) + + new_padding_right_offset, + ] + # Do the same for pixel_values and image_attention_mask + pixel_values = self.pixel_values[keep_indices] + self.image_attention_mask = self.image_attention_mask[ + keep_indices, + -(self.padding_right_offset + max_input_length) : ( + self.image_attention_mask.shape[1] - self.padding_right_offset + ) + + new_padding_right_offset, + :, + ] + if self.image_hidden_states is None: + image_hidden_states = None + else: + image_hidden_states = self.image_hidden_states[keep_indices] + + # Ensure that past_key_values tensors can be updated in-place + if type(self.past_key_values[0]) is tuple: + self.past_key_values = [list(layer) for layer in self.past_key_values] + + # Update tensors in-place to allow incremental garbage collection + past_kv_length = max_input_length - 1 + 
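+        # For every layer, reshape keys/values to a batch-first view, keep
+        # only the surviving rows and the last `past_kv_length` positions,
+        # and drop the old tensors right away so memory is released
+        # incrementally.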
for layer in self.past_key_values: + past_keys, past_values = layer + if len(past_keys.shape) == 3: + # Force past to be of dim [self_size, num_heads, ...] for easy indexing + past_keys = past_keys.view(len(self), -1, *past_keys.shape[-2:]) + past_values = past_values.view(len(self), -1, *past_values.shape[-2:]) + if self.keys_head_dim_last: + layer[0] = past_keys[keep_indices, :, -past_kv_length:, :] + else: + layer[0] = past_keys[keep_indices, :, :, -past_kv_length:] + del past_keys + layer[1] = past_values[keep_indices, :, -past_kv_length:, :] + del past_values + + max_tokens = len(request_ids) * max_input_length + total_remaining_decode_tokens + + self.requests = requests + self.requests_idx_mapping = requests_idx_mapping + self.input_ids = input_ids + self.pixel_values = pixel_values + self.image_hidden_states = image_hidden_states + self.position_ids = position_ids + self.all_input_ids = all_input_ids + self.input_lengths = input_lengths + self.prefix_offsets = prefix_offsets + self.read_offsets = read_offsets + self.next_token_choosers = next_token_choosers + self.stopping_criterias = stopping_criterias + self.max_input_length = max_input_length + self.padding_right_offset = new_padding_right_offset + self.max_tokens = max_tokens + + return self + + @classmethod + @tracer.start_as_current_span("concatenate") + def concatenate( + cls, batches: List["IdeficsCausalLMBatch"] + ) -> "IdeficsCausalLMBatch": + # It adds new requests to the batch + # Used for padding + total_batch_size = 0 + max_input_length = 0 + max_num_images = 0 + padding_right_offset = 0 + for batch in batches: + total_batch_size += len(batch) + max_input_length = max(max_input_length, batch.max_input_length) + max_num_images = max(max_num_images, batch.pixel_values.size(1)) + padding_right_offset = max(padding_right_offset, batch.padding_right_offset) + + # Batch attributes + requests = [] + requests_idx_mapping = {} + input_lengths = [] + prefix_offsets = [] + read_offsets = [] + all_input_ids = [] + next_token_choosers = [] + stopping_criterias = [] + max_tokens = 0 + + # Batch tensors + input_ids = None + attention_mask = None + position_ids = None + pixel_values = None + image_hidden_states = None + image_attention_mask = None + past_key_values = [] + + # Used for slicing correctly inside the tensors + # Equivalent to a cumsum on batch sizes + start_index = 0 + for i, batch in enumerate(batches): + requests.extend(batch.requests) + input_lengths.extend(batch.input_lengths) + prefix_offsets.extend(batch.prefix_offsets) + read_offsets.extend(batch.read_offsets) + all_input_ids.extend(batch.all_input_ids) + next_token_choosers.extend(batch.next_token_choosers) + stopping_criterias.extend(batch.stopping_criterias) + + if i == 0: + requests_idx_mapping = batch.requests_idx_mapping + else: + # We need to offset the mapping for each batch by the cumulative batch size + for k, v in batch.requests_idx_mapping.items(): + requests_idx_mapping[k] = v + start_index + + # Slicing end index for this batch + end_index = start_index + len(batch) + + # We only concatenate batches that did at least one step + if batch.past_key_values is None: + raise ValueError("only concatenate prefilled batches") + + # Create empty tensor + # input_ids is always of shape [batch_size, 1] + # We do not need to pad it + if input_ids is None: + input_ids = batch.input_ids.new_empty((total_batch_size, 1)) + # Copy to correct indices + input_ids[start_index:end_index] = batch.input_ids + + # Create padded tensor + if attention_mask is None: + 
attention_mask = batch.attention_mask.new_zeros( + (total_batch_size, max_input_length + padding_right_offset), + ) + + curr_batch_max_num_images = batch.pixel_values.size(1) + if pixel_values is None: + pixel_values = batch.pixel_values.new_zeros( + (total_batch_size, max_num_images, 3, 224, 224) + ) + pixel_values[start_index:end_index, :curr_batch_max_num_images] = ( + batch.pixel_values + ) + + if image_attention_mask is None: + image_attention_mask = batch.image_attention_mask.new_zeros( + ( + total_batch_size, + max_input_length + padding_right_offset, + max_num_images, + ) + ) + + # We need to slice the attention mask to remove padding from previous steps + # and to remove unused allocated space + left_offset = max_input_length - batch.max_input_length + batch_left_offset = ( + batch.attention_mask.shape[1] + - batch.max_input_length + - batch.padding_right_offset + ) + attention_mask[ + start_index:end_index, + left_offset:-padding_right_offset, + ] = batch.attention_mask[ + :, + batch_left_offset : -batch.padding_right_offset, + ] + image_attention_mask[ + start_index:end_index, + left_offset:-padding_right_offset, + :curr_batch_max_num_images, + ] = batch.image_attention_mask[ + :, batch_left_offset : -batch.padding_right_offset, : + ] + + # Create empty tensor + # position_ids is always of shape [batch_size, 1] + if position_ids is None: + position_ids = batch.position_ids.new_empty((total_batch_size, 1)) + position_ids[start_index:end_index] = batch.position_ids + + # Shenanigans to get dimensions because BLOOM outputs a past with a different shape + # BLOOM Keys: [batch_size * num_heads, head_dim, seq_length] + # BLOOM Values: [batch_size * num_heads, seq_length, head_dim] + # And ensure that we can update tensors in-place + if isinstance(batch.past_key_values[0], tuple): + batch.past_key_values = [ + [t.view(len(batch), -1, *t.shape[-2:]) for t in layer] + for layer in batch.past_key_values + ] + elif len(batch.past_key_values[0][0].shape) == 3: + for layer in batch.past_key_values: + for k, t in enumerate(layer): + layer[k] = t.view(len(batch), -1, *t.shape[-2:]) + + # Add eventual padding tokens that were added while concatenating + max_tokens += batch.max_tokens + ( + max_input_length - batch.max_input_length + ) * len(batch) + + start_index = end_index + + first_past_kvs = batches[0].past_key_values + _, num_heads, padded_sequence_length, head_dim = first_past_kvs[0][1].shape + + padded_past_values_shape = ( + total_batch_size, + num_heads, + max_input_length - 1, + head_dim, + ) + + if batches[0].keys_head_dim_last: + padded_past_keys_shape = padded_past_values_shape + else: + # seq_length is last for BLOOM + padded_past_keys_shape = ( + total_batch_size, + num_heads, + head_dim, + max_input_length - 1, + ) + + # Iterate over attention layers + # Concatenate past key values layer by layer to allow incremental garbage collection + for j in range(len(first_past_kvs)): + padded_past_keys = first_past_kvs[j][0].new_zeros(padded_past_keys_shape) + start_index = 0 + for batch in batches: + past_keys = batch.past_key_values[j][0] + # Clear reference to the original tensor + batch.past_key_values[j][0] = None + + # Slicing end index for this batch + end_index = start_index + len(batch) + # We slice the keys to remove the padding from previous batches + past_seq_len = batch.max_input_length - 1 + if batch.keys_head_dim_last: + padded_past_keys[start_index:end_index, :, -past_seq_len:, :] = ( + past_keys[:, :, -past_seq_len:, :] + ) + else: + # BLOOM case + 
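+                    # BLOOM keeps the sequence dimension last
+                    # ([batch, heads, head_dim, seq]), so the padding is
+                    # stripped on the final axis instead of axis 2.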
padded_past_keys[start_index:end_index, :, :, -past_seq_len:] = ( + past_keys[:, :, :, -past_seq_len:] + ) + del past_keys + + start_index = end_index + + padded_past_values = first_past_kvs[j][1].new_zeros( + padded_past_values_shape + ) + start_index = 0 + for batch in batches: + past_values = batch.past_key_values[j][1] + # Clear reference to the original tensor + batch.past_key_values[j][1] = None + + # Slicing end index for this batch + end_index = start_index + len(batch) + # We slice the past values to remove the padding from previous batches + past_seq_len = batch.max_input_length - 1 + padded_past_values[start_index:end_index, :, -past_seq_len:, :] = ( + past_values[:, :, -past_seq_len:, :] + ) + del past_values + + # Update values + start_index = end_index + + past_key_values.append([padded_past_keys, padded_past_values]) + + return cls( + batch_id=batches[0].batch_id, + requests=requests, + requests_idx_mapping=requests_idx_mapping, + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + pixel_values=pixel_values, + image_hidden_states=image_hidden_states, + image_attention_mask=image_attention_mask, + past_key_values=past_key_values, + all_input_ids=all_input_ids, + input_lengths=input_lengths, + prefix_offsets=prefix_offsets, + read_offsets=read_offsets, + next_token_choosers=next_token_choosers, + stopping_criterias=stopping_criterias, + max_input_length=max_input_length, + padding_right_offset=padding_right_offset, + keys_head_dim_last=batches[0].keys_head_dim_last, + max_tokens=max_tokens, + ) + + def __len__(self): + return len(self.requests) + + +class IdeficsCausalLM(Model): + def __init__( + self, + model_id: str, + revision: Optional[str] = None, + quantize: Optional[str] = None, + speculator: Optional[str] = None, + dtype: Optional[torch.dtype] = None, + trust_remote_code: bool = False, + ): + self.quantize = quantize + self.process_group, rank, world_size = initialize_torch_distributed() + if torch.cuda.is_available(): + device = torch.device(f"cuda:{rank}") + # 9b seems to work correctly enough in float16, but 80b seems + # to be really saturating for f16. + dtype = torch.float16 if dtype is None else dtype + elif SYSTEM == "ipex": + if hasattr(torch, "xpu") and torch.xpu.is_available(): + device = torch.device(f"xpu:{rank}") + dtype = torch.float16 if dtype is None else dtype + else: + device = torch.device("cpu") + # Float16 doesn't exist on target. 
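+ # bfloat16 is the usual fallback for ipex on CPU, assuming the host supports it.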
+ dtype = torch.bfloat16 if dtype is None else dtype + else: + device = torch.device("cpu") + dtype = torch.float32 if dtype is None else dtype + self.device, self.dtype = device, dtype + + config = AutoConfig.from_pretrained( + model_id, + revision=revision, + trust_remote_code=trust_remote_code, + ) + config.quantize = quantize + config.speculator = speculator + config.vision_config.quantize = quantize + + tokenizer = AutoTokenizer.from_pretrained( + model_id, + revision=revision, + padding_side="left", + truncation_side="left", + trust_remote_code=trust_remote_code, + ) + self.processor = AutoProcessor.from_pretrained( + model_id, + revision=revision, + padding_side="left", + truncation_side="left", + trust_remote_code=trust_remote_code, + ) + + weights_loader = get_loader( + quantize=quantize, model_id=model_id, revision=revision + ) + torch.distributed.barrier(group=self.process_group) + filenames = weight_files(model_id, revision=revision, extension=".safetensors") + weights = Weights( + filenames, + device=device, + dtype=dtype, + process_group=self.process_group, + weights_loader=weights_loader, + ) + + model = IdeficsForVisionText2Text(config, weights) + + self.config = config + + torch.distributed.barrier(group=self.process_group) + super().__init__( + model_id=model_id, + model=model, + tokenizer=tokenizer, + requires_padding=True, + dtype=dtype, + device=device, + rank=rank, + world_size=world_size, + ) + + @property + def batch_type(self) -> Type[IdeficsCausalLMBatch]: + return IdeficsCausalLMBatch + + def forward( + self, + input_ids, + attention_mask, + position_ids, + pixel_values, + image_hidden_states, + image_attention_mask, + past_key_values: Optional = None, + ) -> Tuple[torch.Tensor, List[Tuple[torch.Tensor, torch.Tensor]]]: + # Model Forward + kwargs = { + "input_ids": input_ids, + "attention_mask": attention_mask, + "pixel_values": pixel_values, + "image_hidden_states": image_hidden_states, + "image_attention_mask": image_attention_mask, + "past_key_values": past_key_values, + "use_cache": True, + "return_dict": True, + } + if self.has_position_ids: + kwargs["position_ids"] = position_ids + + outputs, speculative_logits = self.model.forward(**kwargs) + return ( + outputs.logits, + speculative_logits, + outputs.past_key_values, + outputs.image_hidden_states, + ) + + @tracer.start_as_current_span("generate_token") + def generate_token( + self, batch: IdeficsCausalLMBatch + ) -> Tuple[List[Generation], Optional[IdeficsCausalLMBatch], Tuple[int, int]]: + start = time.time_ns() + # slice the attention mask to the correct shape + attention_mask = batch.attention_mask[:, : -batch.padding_right_offset] + if batch.image_attention_mask is None: + image_attention_mask = None + else: + if batch.input_ids.size(1) == 1: + # THIS is a hack: when calling idefics.generate, the first time, we need the whole image_attention_mask (size bs x max_seq_len x max_num_images), + # but the subsequent times, we only need the last attention mask along the `max_seq_len` dimension + # this is due to the nature IDEFICS: it's an encoder decoder, and so when decoding, only the currently generated + # token need to attend to the encoder hidden states (i.e. 
the vision encoder) + # Also see seq2seq_lm.Seq2SeqLM.generate_token which has roughly the same logic + image_attention_mask = batch.image_attention_mask[ + :, -(batch.padding_right_offset + 1) + ].unsqueeze(1) + else: + image_attention_mask = batch.image_attention_mask[ + :, : -batch.padding_right_offset + ] + + logits, speculative_logits, past, image_hidden_states = self.forward( + input_ids=batch.input_ids, + attention_mask=attention_mask, + position_ids=batch.position_ids, + pixel_values=batch.pixel_values, + image_hidden_states=batch.image_hidden_states, + image_attention_mask=image_attention_mask, + past_key_values=batch.past_key_values, + ) + # Hardcoded remove image tokens + logits[:, 32000:32001] = torch.finfo(logits.dtype).min + + start_decode = time.time_ns() + + # Results + generations: List[Generation] = [] + stopped = True + + # Zipped iterator + iterator = zip( + batch.requests, + batch.input_lengths, + batch.prefix_offsets, + batch.read_offsets, + logits, + batch.next_token_choosers, + batch.stopping_criterias, + batch.all_input_ids, + ) + + # For each member of the batch + for i, ( + request, + input_length, + prefix_offset, + read_offset, + logits, + next_token_chooser, + stopping_criteria, + all_input_ids, + ) in enumerate(iterator): + # Select next token + next_token_id, logprobs = next_token_chooser( + all_input_ids.view(1, -1), logits[-1:, :] + ) + + # Append next token to all tokens + all_input_ids = torch.cat([all_input_ids, next_token_id]) + new_input_length = input_length + 1 + + # Generated token + next_token_logprob = logprobs[-1, next_token_id] + next_token_id_squeezed = next_token_id.squeeze() + next_token_text, prefix_offset, read_offset = self.decode_token( + all_input_ids[:, 0], prefix_offset, read_offset + ) + + # Evaluate stopping criteria + stop, reason = stopping_criteria( + next_token_id_squeezed, + next_token_text, + ) + + if not stop: + stopped = False + + # Shard generations + # All generations will be appended in the rust sharded client + if i % self.world_size == self.rank: + if stop: + # Decode generated tokens + output_text, _, _ = self.decode_token( + all_input_ids[:, 0], + prefix_offset=len(all_input_ids) + - stopping_criteria.current_tokens + - 1, + read_offset=len(all_input_ids) + - stopping_criteria.current_tokens, + skip_special_tokens=True, + ) + # Get seed + if isinstance(next_token_chooser.choice, Sampling): + seed = next_token_chooser.choice.seed + else: + seed = None + + generated_text = GeneratedText( + output_text, stopping_criteria.current_tokens, reason, seed + ) + else: + generated_text = None + + # Prefill + if stopping_criteria.current_tokens == 1 and request.prefill_logprobs: + # Remove generated token to only have prefill and add nan for first prompt token + prefill_logprobs = [float("nan")] + torch.log_softmax( + logits, -1 + ).gather(1, all_input_ids[1:]).squeeze(1)[ + -new_input_length:-1 + ].tolist() + prefill_token_ids = all_input_ids[-new_input_length:-1] + prefill_texts = self.tokenizer.batch_decode( + prefill_token_ids, + clean_up_tokenization_spaces=False, + skip_special_tokens=False, + ) + prefill_tokens = Tokens( + prefill_token_ids, + prefill_logprobs, + prefill_texts, + is_special=[], + ) + else: + prefill_tokens = None + + top_tokens = None + + generation = Generation( + request.id, + prefill_tokens, + Tokens( + [next_token_id_squeezed], + [next_token_logprob], + [next_token_text], + [next_token_id_squeezed.item() in self.all_special_ids], + ), + generated_text, + top_tokens, + ) + + 
generations.append(generation) + + # Update values + batch.next_token_choosers[i] = batch.next_token_choosers[i].advance_grammar( + next_token_id_squeezed.item() + ) + batch.input_ids[i, 0] = next_token_id + batch.all_input_ids[i] = all_input_ids + batch.input_lengths[i] = new_input_length + batch.prefix_offsets[i] = prefix_offset + batch.read_offsets[i] = read_offset + batch.max_input_length = max(batch.max_input_length, new_input_length) + + # We finished all generations in the batch; there is no next batch + if stopped: + forward_ns = start_decode - start + decode_ns = time.time_ns() - start_decode + return generations, None, (forward_ns, decode_ns) + + # Slice unused values from prefill + batch.input_ids = batch.input_ids[:, :1] + + # Update attention_mask as we added a new token to input_ids + batch.attention_mask[:, -batch.padding_right_offset] = 1 + batch.image_attention_mask[:, -batch.padding_right_offset, :] = ( + batch.image_attention_mask[:, -(batch.padding_right_offset + 1), :] + ) + # Decrease right offset + batch.padding_right_offset -= 1 + + # Update position_ids + batch.position_ids = batch.position_ids[:, -1:] + 1 + + # Update past key values + batch.past_key_values = past + batch.image_hidden_states = image_hidden_states + + forward_ns = start_decode - start + decode_ns = time.time_ns() - start_decode + return generations, batch, (forward_ns, decode_ns) diff --git a/backends/gaudi/server/text_generation_server/models/mamba.py b/backends/gaudi/server/text_generation_server/models/mamba.py new file mode 100644 index 000000000..f6dcde68a --- /dev/null +++ b/backends/gaudi/server/text_generation_server/models/mamba.py @@ -0,0 +1,814 @@ +import torch +import torch.distributed +from transformers import AutoTokenizer, PreTrainedTokenizerBase +from typing import Optional +from text_generation_server.models.custom_modeling.mamba_modeling import ( + MambaConfig, +) +from loguru import logger +from text_generation_server.pb import generate_pb2 +from text_generation_server.utils import ( + initialize_torch_distributed, + weight_files, + Weights, +) +from text_generation_server.models.globals import CUDA_GRAPHS, MEM_POOL +import time +from text_generation_server.models.custom_modeling.mamba_modeling import ( + MambaModel, + InferenceParams, +) +from text_generation_server.models import Model +from typing import Any, List, Tuple, Type, Dict +from text_generation_server.models.types import ( + Batch, + Tokens, + Generation, + GeneratedText, +) +from text_generation_server.utils.chunks import concat_text_chunks +from text_generation_server.utils.quantization import get_loader +from text_generation_server.utils.tokens import batch_top_tokens, Sampling +from dataclasses import dataclass +from text_generation_server.utils import NextTokenChooser, StoppingCriteria + + +def new_inference_params( + n_blocks: int, + batch_size: int, + d_inner: int, + d_conv: int, + d_state: int, + seqlen_offset: int, + dtype: torch.dtype, + device: torch.device, +): + max_seqlen = 0 + conv_states = torch.zeros( + ( + n_blocks, + batch_size, + d_inner, + d_conv, + ), + device=device, + dtype=dtype, + ) + ssm_states = torch.zeros( + ( + n_blocks, + batch_size, + d_inner, + d_state, + ), + device=device, + dtype=dtype, + ) + inference_params = InferenceParams( + max_seqlen=max_seqlen, + max_batch_size=batch_size, + seqlen_offset=seqlen_offset, + conv_states=conv_states, + ssm_states=ssm_states, + ) + return inference_params + + +@dataclass +class MambaBatch(Batch): + batch_id: int + requests: 
List[generate_pb2.Request] + requests_idx_mapping: Dict[int, int] + + # Decoder values + input_ids: torch.Tensor + + # All tokens + all_input_ids: List[torch.Tensor] + + # Lengths of all generations present in the batch + input_lengths: List[int] + prefix_offsets: List[int] + read_offsets: List[int] + + # Generation helpers + next_token_choosers: List[NextTokenChooser] + stopping_criterias: List[StoppingCriteria] + top_n_tokens: List[int] + top_n_tokens_tensor: torch.Tensor + + # Metadata used for padding + max_input_length: int + padding_right_offset: int + + # Maximum number of tokens this batch will grow to + max_tokens: int + + # Past metadata + keys_head_dim_last: bool = True + + # Inference params + inference_params: Optional[Dict[str, Any]] = None + + def to_pb(self) -> generate_pb2.CachedBatch: + return generate_pb2.CachedBatch( + id=self.batch_id, + request_ids=[r.id for r in self.requests], + size=len(self), + max_tokens=self.max_tokens, + ) + + @classmethod + def from_pb( + cls, + pb: generate_pb2.Batch, + tokenizer: PreTrainedTokenizerBase, + dtype: torch.dtype, + device: torch.device, + ) -> "MambaBatch": + inputs = [] + next_token_choosers = [] + stopping_criterias = [] + top_n_tokens = [] + prefix_offsets = [] + read_offsets = [] + requests_idx_mapping = {} + + # Parse batch + max_truncation = 0 + padding_right_offset = 0 + max_decode_tokens = 0 + for i, r in enumerate(pb.requests): + requests_idx_mapping[r.id] = i + inputs.append(concat_text_chunks(r.input_chunks.chunks)) + next_token_choosers.append( + NextTokenChooser.from_pb(r.parameters, device, tokenizer) + ) + stopping_criteria = StoppingCriteria.from_pb( + r.stopping_parameters, tokenizer + ) + stopping_criterias.append(stopping_criteria) + top_n_tokens.append(r.top_n_tokens) + max_truncation = max(max_truncation, r.truncate) + max_decode_tokens += stopping_criteria.max_new_tokens + padding_right_offset = max( + padding_right_offset, stopping_criteria.max_new_tokens + ) + + tokenized_inputs = tokenizer( + inputs, + return_tensors="pt", + padding=True, + return_token_type_ids=False, + truncation=True, + max_length=max_truncation, + ).to(device) + for _ in pb.requests: + input_len = tokenized_inputs["input_ids"].shape[1] + prefix_offsets.append(input_len - 5) + read_offsets.append(input_len) + + input_lengths = tokenized_inputs["attention_mask"].sum(1) + max_input_length = input_lengths.max() + input_ids = tokenized_inputs["input_ids"] + all_input_ids = tokenized_inputs["input_ids"].T.split(1, dim=1) + top_n_tokens_tensor = torch.tensor( + top_n_tokens, device=device, dtype=torch.int64 + ) + max_tokens = len(inputs) * (max_input_length + max_decode_tokens) + return cls( + batch_id=pb.id, + requests=pb.requests, + requests_idx_mapping=requests_idx_mapping, + input_ids=input_ids, + # past_input_ids=None, + all_input_ids=list(all_input_ids), + input_lengths=input_lengths.tolist(), + prefix_offsets=prefix_offsets, + read_offsets=read_offsets, + next_token_choosers=next_token_choosers, + stopping_criterias=stopping_criterias, + top_n_tokens=top_n_tokens, + top_n_tokens_tensor=top_n_tokens_tensor, + max_input_length=max_input_length.item(), + padding_right_offset=padding_right_offset, + max_tokens=max_tokens, + ) + + def filter(self, request_ids: List[int]) -> Optional["MambaBatch"]: + if len(request_ids) == 0: + raise ValueError("Batch must have at least one request") + if len(request_ids) == len(self): + return self + + keep_indices = [] + + # New values after filtering + requests_idx_mapping = {} + requests = [] + 
input_lengths = [] + prefix_offsets = [] + read_offsets = [] + all_input_ids = [] + max_input_length = 0 + + next_token_choosers = [] + stopping_criterias = [] + top_n_tokens = [] + + total_remaining_decode_tokens = 0 + new_padding_right_offset = 0 + + indices = [] + for i, request_id in enumerate(request_ids): + idx = self.requests_idx_mapping[request_id] + requests_idx_mapping[request_id] = i + keep_indices.append(idx) + + requests.append(self.requests[idx]) + prefix_offsets.append(self.prefix_offsets[idx]) + read_offsets.append(self.read_offsets[idx]) + all_input_ids.append(self.all_input_ids[idx]) + + request_input_length = self.input_lengths[idx] + input_lengths.append(request_input_length) + max_input_length = max(max_input_length, request_input_length) + indices.append(idx) + + next_token_choosers.append(self.next_token_choosers[idx]) + stopping_criteria = self.stopping_criterias[idx] + stopping_criterias.append(stopping_criteria) + top_n_tokens.append(self.top_n_tokens[idx]) + remaining_decode_tokens = ( + stopping_criteria.max_new_tokens - stopping_criteria.current_tokens + ) + total_remaining_decode_tokens += remaining_decode_tokens + new_padding_right_offset = max( + new_padding_right_offset, remaining_decode_tokens + ) + + # Apply indices to input_ids, attention mask, past key values and other items that need to be cached + input_ids = self.input_ids[keep_indices] + + top_n_tokens_tensor = self.top_n_tokens_tensor[keep_indices] + max_tokens = len(request_ids) * max_input_length + total_remaining_decode_tokens + + self.requests = requests + self.requests_idx_mapping = requests_idx_mapping + self.input_ids = input_ids + self.all_input_ids = all_input_ids + self.input_lengths = input_lengths + self.prefix_offsets = prefix_offsets + self.read_offsets = read_offsets + self.next_token_choosers = next_token_choosers + self.stopping_criterias = stopping_criterias + self.top_n_tokens = top_n_tokens + self.top_n_tokens_tensor = top_n_tokens_tensor + self.max_input_length = max_input_length + self.padding_right_offset = new_padding_right_offset + self.max_tokens = max_tokens + + # TODO + # Kept it simple by just updating the state, maybe updating the other CPU values is necessary. 
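+ # conv_states is (n_blocks, batch, d_inner, d_conv) and ssm_states is
+ # (n_blocks, batch, d_inner, d_state), so indexing dim 1 with `indices`
+ # keeps only the recurrent state of the requests that survive the filter.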
+ self.inference_params.conv_states = self.inference_params.conv_states[ + :, indices + ] + self.inference_params.ssm_states = self.inference_params.ssm_states[:, indices] + return self + + @classmethod + def concatenate(cls, batches: List["MambaBatch"]) -> "MambaBatch": + # Used for padding + total_batch_size = 0 + max_input_length = 0 + padding_right_offset = 0 + for batch in batches: + total_batch_size += len(batch) + max_input_length = max(max_input_length, batch.max_input_length) + padding_right_offset = max(padding_right_offset, batch.padding_right_offset) + + # Batch attributes + requests = [] + requests_idx_mapping = {} + input_lengths = [] + prefix_offsets = [] + read_offsets = [] + all_input_ids = [] + next_token_choosers = [] + stopping_criterias = [] + top_n_tokens = [] + max_tokens = 0 + seqlen_offset = 0 + + (n_blocks, _, d_inner, d_conv) = batches[0].inference_params.conv_states.shape + (_, _, _, d_state) = batches[0].inference_params.ssm_states.shape + dtype = batches[0].inference_params.conv_states.dtype + device = batches[0].inference_params.conv_states.device + inference_params = new_inference_params( + n_blocks=n_blocks, + batch_size=total_batch_size, + d_state=d_state, + d_conv=d_conv, + d_inner=d_inner, + seqlen_offset=seqlen_offset, + device=device, + dtype=dtype, + ) + + # Batch tensors + input_ids = None + top_n_tokens_tensor = None + + # Used for slicing correctly inside the tensors + # Equivalent to a cumsum on batch sizes + start_index = 0 + for i, batch in enumerate(batches): + requests.extend(batch.requests) + input_lengths.extend(batch.input_lengths) + prefix_offsets.extend(batch.prefix_offsets) + read_offsets.extend(batch.read_offsets) + all_input_ids.extend(batch.all_input_ids) + next_token_choosers.extend(batch.next_token_choosers) + stopping_criterias.extend(batch.stopping_criterias) + top_n_tokens.extend(batch.top_n_tokens) + + if i == 0: + requests_idx_mapping = batch.requests_idx_mapping + else: + # We need to offset the mapping for each batch by the cumulative batch size + for k, v in batch.requests_idx_mapping.items(): + requests_idx_mapping[k] = v + start_index + + # Slicing end index for this batch + end_index = start_index + len(batch) + + # Create empty tensor + # input_ids is always of shape [batch_size, 1] + # We do not need to pad it + if input_ids is None: + input_ids = batch.input_ids.new_empty((total_batch_size, 1)) + # Copy to correct indices + input_ids[start_index:end_index] = batch.input_ids + + if top_n_tokens_tensor is None: + top_n_tokens_tensor = batches[0].top_n_tokens_tensor.new_zeros( + total_batch_size, + ) + top_n_tokens_tensor[start_index:end_index] = batch.top_n_tokens_tensor + + # Add eventual padding tokens that were added while concatenating + max_tokens += batch.max_tokens + ( + max_input_length - batch.max_input_length + ) * len(batch) + + inference_params.max_seqlen = max( + inference_params.max_seqlen, batch.inference_params.max_seqlen + ) + assert batch.inference_params.seqlen_offset != 0, "Invalid seqlen offset" + inference_params.seqlen_offset = max( + inference_params.seqlen_offset, batch.inference_params.seqlen_offset + ) + + inference_params.conv_states[:, start_index:end_index] = ( + batch.inference_params.conv_states + ) + inference_params.ssm_states[:, start_index:end_index] = ( + batch.inference_params.ssm_states + ) + + start_index = end_index + + return cls( + batch_id=batches[0].batch_id, + requests=requests, + requests_idx_mapping=requests_idx_mapping, + input_ids=input_ids, + all_input_ids=all_input_ids, 
+ input_lengths=input_lengths, + prefix_offsets=prefix_offsets, + read_offsets=read_offsets, + next_token_choosers=next_token_choosers, + stopping_criterias=stopping_criterias, + top_n_tokens=top_n_tokens, + top_n_tokens_tensor=top_n_tokens_tensor, + max_input_length=max_input_length, + padding_right_offset=padding_right_offset, + keys_head_dim_last=batches[0].keys_head_dim_last, + max_tokens=max_tokens, + inference_params=inference_params, + ) + + def __len__(self): + return len(self.requests) + + +class Mamba(Model): + def __init__( + self, + model_id: str, + revision: Optional[str] = None, + quantize: Optional[str] = None, + speculator: Optional[str] = None, + dtype: Optional[torch.dtype] = None, + trust_remote_code: bool = False, + ): + self.quantize = quantize + self.process_group, _rank, world_size = initialize_torch_distributed() + if world_size > 1: + raise RuntimeError("Mamba does not support Tensor Parallelism (TP)") + self.cuda_graphs = {} + if torch.cuda.is_available(): + device = torch.device("cuda") + # Bf16 is important. In f16 accumulations in the matmul are causing + # differences while the server is under load. + # This is detectable by the integration load test + dtype = torch.bfloat16 if dtype is None else dtype + else: + if quantize: + raise ValueError("quantization is not available on CPU") + + device = torch.device("cpu") + dtype = torch.float32 if dtype is None else dtype + + tokenizer = AutoTokenizer.from_pretrained( + "EleutherAI/gpt-neox-20b", + revision=revision, + padding_side="left", + truncation_side="left", + trust_remote_code=trust_remote_code, + ) + config = MambaConfig.from_pretrained( + model_id, revision=revision, trust_remote_code=trust_remote_code + ) + + tokenizer.bos_token_id = config.bos_token_id + tokenizer.eos_token_id = config.eos_token_id + tokenizer.pad_token = tokenizer.eos_token + + config.quantize = quantize + config.speculator = speculator + torch.distributed.barrier(group=self.process_group) + weights_loader = get_loader( + quantize=quantize, model_id=model_id, revision=revision + ) + filenames = weight_files(model_id, revision=revision, extension=".safetensors") + weights = Weights( + filenames, + device, + dtype, + process_group=self.process_group, + weights_loader=weights_loader, + ) + model = MambaModel(config, weights) + torch.distributed.barrier(group=self.process_group) + super(Mamba, self).__init__( + model_id=model_id, + model=model, + tokenizer=tokenizer, + requires_padding=True, + dtype=dtype, + device=device, + ) + + @property + def batch_type(self) -> Type[MambaBatch]: + return MambaBatch + + def warmup(self, batch) -> Optional[int]: + # TODO: implement warmup for Mamba if needed + if CUDA_GRAPHS: + if self.speculate is None or self.speculate == 0: + try: + logger.info(f"Cuda Graphs are enabled for sizes {CUDA_GRAPHS}") + # Warmup cuda graphs + for bs in CUDA_GRAPHS: + self.cuda_graph_warmup(bs) + except Exception: + logger.exception("Decode cuda graph warmup failed") + else: + logger.info(f"Cuda Graphs are disabled (CUDA_GRAPHS={CUDA_GRAPHS}).") + + return None + + def cuda_graph_warmup(self, batch_size: int): + input_ids = torch.zeros((batch_size, 1), dtype=torch.int64, device=self.device) + n_blocks = len(self.model.blocks) + + d_state = self.model.config.d_state + d_conv = self.model.config.d_conv + # Inner takes the expand multiplication + d_inner = self.model.config.d_inner + + # Important seqlen_offset to go through the update mecanism with the state + seqlen_offset = 1 + inference_params = new_inference_params( + 
n_blocks=n_blocks, + batch_size=batch_size, + d_state=d_state, + d_conv=d_conv, + d_inner=d_inner, + seqlen_offset=seqlen_offset, + device=self.device, + dtype=self.dtype, + ) + + graph = torch.cuda.CUDAGraph() + + torch.cuda.synchronize() + # Run once outside to warmup + self.model.forward(input_ids=input_ids, inference_params=inference_params) + torch.cuda.synchronize() + + with torch.cuda.graph(graph, pool=MEM_POOL): + logits, speculative_logits = self.model.forward( + input_ids=input_ids, inference_params=inference_params + ) + torch.cuda.synchronize() + graph_dict = { + "input_ids": input_ids, + "inference_params": inference_params, + "graph": graph, + "logits": logits, + "speculative_logits": speculative_logits, + } + self.cuda_graphs[batch_size] = graph_dict + + def tunableop_warmup(self, batch_size: int, seqlen: int): + input_ids = torch.zeros((batch_size, 1), dtype=torch.int64, device=self.device) + n_blocks = len(self.model.blocks) + + d_state = self.model.config.d_state + d_conv = self.model.config.d_conv + # Inner takes the expand multiplication + d_inner = self.model.config.d_inner + + # Important seqlen_offset to go through the update mecanism with the state + seqlen_offset = 1 + inference_params = new_inference_params( + n_blocks=n_blocks, + batch_size=seqlen, + d_state=d_state, + d_conv=d_conv, + d_inner=d_inner, + seqlen_offset=seqlen_offset, + device=self.device, + dtype=self.dtype, + ) + + self.model.forward(input_ids=input_ids, inference_params=inference_params) + + def forward( + self, input_ids: torch.Tensor, inference_params: Any + ) -> Tuple[torch.Tensor, torch.Tensor]: + bs = input_ids.shape[0] + padded_bs = bs + if bs == 3: + padded_bs = 4 + elif 3 < bs <= 8: + padded_bs = 8 + elif bs > 8: + padded_bs = (bs + 7) // 8 * 8 + + # Try to find an associated cuda graph + cuda_graph = self.cuda_graphs.get(padded_bs, None) + is_prefill = inference_params is None or inference_params.seqlen_offset == 0 + + if is_prefill or cuda_graph is None: + return self.model( + input_ids, + inference_params=inference_params, + ) + + # Copy inputs to the static inputs of the cuda graph + # Static inputs are potentially padded + cuda_graph["input_ids"][:bs] = input_ids + cuda_graph["inference_params"].conv_states[ + :, :bs + ] = inference_params.conv_states + cuda_graph["inference_params"].ssm_states[:, :bs] = inference_params.ssm_states + + # Replay the graph + cuda_graph["graph"].replay() + + inference_params.conv_states.copy_( + cuda_graph["inference_params"].conv_states[:, :bs] + ) + inference_params.ssm_states.copy_( + cuda_graph["inference_params"].ssm_states[:, :bs] + ) + # Slice output to the correct shape + speculative_logits = ( + cuda_graph["speculative_logits"][:bs] + if cuda_graph["speculative_logits"] is not None + else None + ) + logits = cuda_graph["logits"][:bs] + return logits, speculative_logits + + def generate_token(self, batch) -> Tuple[List[Any], Optional[Any], Tuple[int, int]]: + start = time.time_ns() + input_ids = ( + batch.input_ids + ) # batch.past_input_ids if batch.past_input_ids is not None else batch.input_ids + + batch_size, max_seqlen = input_ids.shape + # Inference params + + if batch.inference_params is None: + # 0 is important here + seqlen_offset = 0 + n_blocks = len(self.model.blocks) + d_state = self.model.config.d_state + d_conv = self.model.config.d_conv + d_inner = self.model.config.d_inner + inference_params = new_inference_params( + n_blocks=n_blocks, + batch_size=batch_size, + d_state=d_state, + d_conv=d_conv, + d_inner=d_inner, + 
seqlen_offset=seqlen_offset, + device=self.device, + dtype=self.dtype, + ) + batch.inference_params = inference_params + + # Forward pass + logits, speculative_logits = self.forward( + input_ids, inference_params=batch.inference_params + ) + + # batch.inference_params = new_inference_params + # Results + generations: List[Generation] = [] + stopped = True + + # Speculation is not active for causal + accepted_ids = torch.ones_like(batch.input_ids)[:, 0] + batch_top_token_ids, batch_top_token_logprobs = batch_top_tokens( + batch.top_n_tokens, + batch.top_n_tokens_tensor, + torch.log_softmax(logits[:, -1], -1), + accepted_ids, + ) + + start_decode = time.time_ns() + + # Zipped iterator + iterator = zip( + batch.requests, + batch.input_lengths, + batch.prefix_offsets, + batch.read_offsets, + logits, + batch.next_token_choosers, + batch.stopping_criterias, + batch.all_input_ids, + batch.top_n_tokens, + batch_top_token_ids, + batch_top_token_logprobs, + ) + + # For each member of the batch + for i, ( + request, + input_length, + prefix_offset, + read_offset, + logits, + next_token_chooser, + stopping_criteria, + all_input_ids, + top_n_tokens, + top_token_ids, + top_token_logprobs, + ) in enumerate(iterator): + # Select next token + next_token_id, logprobs = next_token_chooser( + all_input_ids.view(1, -1), logits[-1:, :] + ) + + # Append next token to all tokens + all_input_ids = torch.cat([all_input_ids, next_token_id]) + new_input_length = input_length + 1 + + # Generated token + next_token_logprob = logprobs[-1, next_token_id] + next_token_id_squeezed = next_token_id.squeeze() + next_token_text, prefix_offset, read_offset = self.decode_token( + all_input_ids[:, 0], prefix_offset, read_offset + ) + + # Evaluate stopping criteria + stop, reason = stopping_criteria( + next_token_id_squeezed, + next_token_text, + ) + + if not stop: + stopped = False + + # Shard generations + # All generations will be appended in the rust sharded client + if i % self.world_size == self.rank: + if stop: + # Decode generated tokens + output_text, _, _ = self.decode_token( + all_input_ids[:, 0], + prefix_offset=len(all_input_ids) + - stopping_criteria.current_tokens + - 1, + read_offset=len(all_input_ids) + - stopping_criteria.current_tokens, + skip_special_tokens=True, + ) + # Get seed + if isinstance(next_token_chooser.choice, Sampling): + seed = next_token_chooser.choice.seed + else: + seed = None + + generated_text = GeneratedText( + output_text, stopping_criteria.current_tokens, reason, seed + ) + else: + generated_text = None + + if stopping_criteria.current_tokens == 1 and request.prefill_logprobs: + # Remove generated token to only have prefill and add nan for first prompt token + prefill_logprobs = [float("nan")] + torch.log_softmax( + logits, -1 + ).gather(1, all_input_ids[1:]).squeeze(1)[ + -new_input_length:-1 + ].tolist() + prefill_token_ids = all_input_ids[-new_input_length:-1] + prefill_texts = self.tokenizer.batch_decode( + prefill_token_ids, + clean_up_tokenization_spaces=False, + skip_special_tokens=False, + ) + prefill_tokens = Tokens( + prefill_token_ids, + prefill_logprobs, + prefill_texts, + is_special=[], + ) + else: + prefill_tokens = None + + if top_n_tokens > 0: + toptoken_texts = self.tokenizer.batch_decode( + top_token_ids, + clean_up_tokenization_spaces=False, + skip_special_tokens=False, + ) + special_toptokens = [ + token_id in self.all_special_ids for token_id in top_token_ids + ] + top_tokens = Tokens( + top_token_ids, + top_token_logprobs, + toptoken_texts, + special_toptokens, + ) + 
else: + top_tokens = None + + generation = Generation( + request.id, + prefill_tokens, + Tokens( + [next_token_id_squeezed], + [next_token_logprob], + [next_token_text], + [next_token_id_squeezed.item() in self.all_special_ids], + ), + generated_text, + top_tokens, + ) + + generations.append(generation) + + # Update values + batch.next_token_choosers[i] = batch.next_token_choosers[ + i + ].advance_grammar(next_token_id_squeezed.item()) + batch.input_ids[i, 0] = next_token_id + batch.all_input_ids[i] = all_input_ids + batch.input_lengths[i] = new_input_length + batch.prefix_offsets[i] = prefix_offset + batch.read_offsets[i] = read_offset + batch.max_input_length = max(batch.max_input_length, new_input_length) + + # We finished all generations in the batch; there is no next batch + if stopped: + forward_ns = start_decode - start + decode_ns = time.time_ns() - start_decode + return generations, None, (forward_ns, decode_ns) + + # Slice unused values from prefill + batch.input_ids = batch.input_ids[:, :1] + + forward_ns = start_decode - start + decode_ns = time.time_ns() - start_decode + return generations, batch, (forward_ns, decode_ns) diff --git a/backends/gaudi/server/text_generation_server/models/mllama_causal_lm.py b/backends/gaudi/server/text_generation_server/models/mllama_causal_lm.py new file mode 100644 index 000000000..9e19e1715 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/models/mllama_causal_lm.py @@ -0,0 +1,357 @@ +from io import BytesIO +from PIL import Image +import torch +from typing import Iterable, Optional, Tuple, List, Dict +from text_generation_server.pb.generate_pb2 import Request + +from dataclasses import dataclass +from opentelemetry import trace +from transformers import ( + PreTrainedTokenizerBase, +) +from text_generation_server.models.vlm_causal_lm import VlmCausalLMBatch, VlmCausalLM +from text_generation_server.pb import generate_pb2 +from text_generation_server.models.flash_causal_lm import ( + block_tables_to_ragged, +) +from text_generation_server.models.globals import PREFIX_CACHING, ATTENTION +from text_generation_server.layers.attention import Seqlen + + +tracer = trace.get_tracer(__name__) + + +@dataclass +class MllamaCausalLMBatch(VlmCausalLMBatch): + image_indices: List[int] = 42 + aspect_ratio_ids: Optional[torch.Tensor] = None + aspect_ratio_mask: Optional[torch.Tensor] = None + cross_attention_states: Optional[torch.Tensor] = None + + @classmethod + @tracer.start_as_current_span("concatenate") + def concatenate(cls, batches): + batch = super().concatenate(batches) + batch.pixel_values = None + batch.pixel_attention_mask = None + + offset = 0 + image_indices = [] + attention_states = [] + for b in batches: + if b.cross_attention_states is not None: + attention_states.append(b.cross_attention_states) + image_indices.extend([i + offset for i in b.image_indices]) + offset += len(b.image_indices) + if len(attention_states) > 0: + assert len(image_indices) > 0 + batch.cross_attention_states = torch.cat(attention_states, dim=0) + batch.image_indices = image_indices + else: + batch.cross_attention_states = None + batch.image_indices = [] + return batch + + @tracer.start_as_current_span("filter") + def filter(self, request_ids: List[int]): + assert self.image_indices is not None + batch = super().filter(request_ids) + assert self.image_indices is not None + indices = [] + for i, request_id in enumerate(request_ids): + idx = self.requests_idx_mapping[request_id] + indices.append(idx) + + offset = 0 + new_image_indices = [] + prev_i = None 
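+ # image_indices maps every image to the request it belongs to; rebuild it so
+ # the kept images keep pointing at the right rows of cross_attention_states
+ # once that tensor is re-indexed below.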
+ for i in self.image_indices: + if i in indices: + new_image_indices.append(offset) + if i != prev_i: + offset += 1 + prev_i = i + + batch.image_indices = new_image_indices + if len(new_image_indices) > 0: + assert max(new_image_indices) < self.cross_attention_states.shape[0] + assert offset <= self.cross_attention_states.shape[0] + batch.cross_attention_states = self.cross_attention_states[ + new_image_indices + ] + else: + batch.cross_attention_states = None + return batch + + @classmethod + def batch_tokenized_inputs( + cls, requests: Iterable[Request], tokenizer, processor, config + ): + image_inputs = [] + texts = [] + image_indices = [] + batch_tokenized_inputs = [] + + for i, r in enumerate(requests): + # Each input is encoded into a list, where each element of this input list is either a string or a URL + curr_text = "" + curr_image = None + curr_i = None + for chunk in r.input_chunks.chunks: + chunk_type = chunk.WhichOneof("chunk") + if chunk_type == "text": + curr_text += chunk.text + elif chunk_type == "image": + image = Image.open(BytesIO(chunk.image.data)) + # TODO unsure about BOS + curr_text += "<|image|>" + image_input = processor.image_processor(image, return_tensors="pt") + curr_image = image_input + curr_i = i + # image_inputs.append(image_input) + # image_indices.append(i) + else: + raise RuntimeError(f"Invalid chunk type {chunk_type}") + texts.append(curr_text) + if curr_image is not None: + image_inputs.append(curr_image) + image_indices.append(curr_i) + + input_ids = tokenizer( + curr_text, + truncation=True, + max_length=r.truncate, + add_special_tokens=r.add_special_tokens, + )["input_ids"] + batch_tokenized_inputs.append(input_ids) + if image_inputs: + image_input = image_inputs[0] + new_image_inputs = { + "pixel_values": torch.cat( + [img["pixel_values"] for img in image_inputs], dim=0 + ), + } + if "aspect_ratio_ids" in image_input: + new_image_inputs["aspect_ratio_ids"] = torch.cat( + [img["aspect_ratio_ids"] for img in image_inputs], dim=0 + ) + if "aspect_ratio_mask" in image_input: + new_image_inputs["aspect_ratio_mask"] = torch.cat( + [img["aspect_ratio_mask"] for img in image_inputs], dim=0 + ) + image_inputs = new_image_inputs + image_inputs["image_indices"] = image_indices + else: + image_inputs = None + + if image_inputs is not None: + assert len(image_indices) == image_inputs["pixel_values"].shape[0] + + return batch_tokenized_inputs, image_inputs + + @classmethod + def from_pb_processor( + cls, + pb: generate_pb2.Batch, + tokenizer: PreTrainedTokenizerBase, + processor, + config, + dtype: torch.dtype, + device: torch.device, + ) -> "VlmCausalLMBatch": + batch_tokenized_inputs, image_inputs = cls.batch_tokenized_inputs( + pb.requests, tokenizer, processor, config + ) + batch = cls.from_tokenized(pb, tokenizer, batch_tokenized_inputs, dtype, device) + # XXX: <|image|> token is actually out of bounds and bugs out the logit processors. 
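+ # Clamping to the text vocab size keeps those ids in range for the logit
+ # processors; the image inputs themselves travel separately through pixel_values.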
+ batch.all_input_ids_tensor = batch.all_input_ids_tensor.clamp( + max=config.text_config.vocab_size - 1 + ) + batch.input_ids = batch.input_ids.clamp(max=config.text_config.vocab_size - 1) + + if image_inputs is not None: + batch.pixel_values = image_inputs["pixel_values"].to( + device=device, dtype=dtype + ) + batch.aspect_ratio_ids = image_inputs["aspect_ratio_ids"].to(device=device) + batch.aspect_ratio_mask = image_inputs["aspect_ratio_mask"].to( + device=device + ) + batch.image_indices = image_inputs["image_indices"] + else: + batch.pixel_values = None + batch.aspect_ratio_ids = None + batch.aspect_ratio_mask = None + batch.image_indices = [] + assert batch.image_indices is not None + return batch + + +class MllamaCausalLM(VlmCausalLM): + def forward( + self, + batch: VlmCausalLMBatch, + adapter_data: Optional[Dict[str, torch.Tensor]] = None, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + # Model Forward + if batch.speculative_ids is not None: + input_ids = batch.input_ids + position_ids = batch.position_ids + cu_seqlen_prefill = batch.cu_seqlen_prefill + kv_cache = self.kv_cache + block_tables = batch.block_tables_tensor + slots = batch.slots[batch.slot_indices] + input_lengths = batch.input_lengths_tensor + max_s = batch.max_seqlen + lm_head_indices = batch.prefill_head_indices + + speculative_ids = batch.speculative_ids + + B, speculative_length = speculative_ids.shape + new_length = speculative_length + 1 + new_input_ids = torch.cat( + [input_ids.unsqueeze(-1), speculative_ids], dim=1 + ).reshape(-1) + arange = torch.arange(new_length, device=position_ids.device).unsqueeze(0) + arange_int = arange.to(dtype=torch.int32) + new_position_ids = ( + position_ids.unsqueeze(-1).expand(B, new_length) + arange + ).view(-1) + slots = (slots.unsqueeze(-1).expand(B, new_length) + arange_int).view(-1) + input_lengths = ( + input_lengths.unsqueeze(-1).expand(B, new_length) + arange_int + ).view(-1) + prefix_lens_tensor = ( + batch.prefix_lens_tensor.unsqueeze(-1).expand(B, new_length) + ).reshape(-1) + + # Add Copy the block tables for all members + block_tables = ( + block_tables.unsqueeze(1) + .expand(B, new_length, -1) + .reshape(B * new_length, -1) + .contiguous() + ) + max_s = max_s + speculative_length + + input_ids = new_input_ids + position_ids = new_position_ids + else: + input_ids = batch.input_ids + position_ids = batch.position_ids + cu_seqlen_prefill = batch.cu_seqlen_prefill + kv_cache = self.kv_cache + block_tables = batch.block_tables_tensor + slots = batch.slots[batch.slot_indices] + input_lengths = batch.input_lengths_tensor + prefix_lens_tensor = batch.prefix_lens_tensor + max_s = batch.max_seqlen + lm_head_indices = batch.prefill_head_indices + + if cu_seqlen_prefill is None and self.max_past() is not None: + # In decode, not prefill, we're actually overwriting the KV-cache + # in a circular buffer mode. + # This makes sure the max_s for the decode pass is correct. + max_s = min(self.max_past(), max_s) + + bs = input_ids.shape[0] + # Try to find an associated cuda graph + bs = input_ids.shape[0] + sorted_padded_bs = sorted([k for k in self.cuda_graphs.keys() if k >= bs]) + if sorted_padded_bs: + # Get associated cuda graph + cuda_graph = self.cuda_graphs[sorted_padded_bs[0]] + else: + cuda_graph = None + if ( + cu_seqlen_prefill is not None + or cuda_graph is None + # Only run cuda graphs when there's no images. 
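+ # cross_attention_states is only populated for requests that carry images,
+ # so its presence forces the non-graph (eager) path here.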
+ or batch.cross_attention_states is not None + ): + input_lengths = input_lengths + prefix_lens_tensor + if PREFIX_CACHING: + block_tables = block_tables_to_ragged( + block_tables=block_tables, + input_lengths=batch.input_lengths, + prefix_lens=batch.prefix_lens, + ) + with self._forward_context( + block_tables=block_tables, + cu_seqlen_prefill=cu_seqlen_prefill, + input_lengths_tensor=input_lengths, + prefix_lens_tensor=prefix_lens_tensor, + ): + max_k = (input_lengths + prefix_lens_tensor).max().item() + seqlen = Seqlen( + input_lengths=input_lengths, + prefix_lengths=prefix_lens_tensor, + cu_seqlen_q=cu_seqlen_prefill, + max_q=max_s, + max_k=max_k, + ) + + if batch.pixel_values is not None: + cross_attention_states = self.model.vision_forward( + pixel_values=batch.pixel_values, + aspect_ratio_ids=batch.aspect_ratio_ids, + aspect_ratio_mask=batch.aspect_ratio_mask, + ) + batch.cross_attention_states = cross_attention_states + + cross_attention_states = batch.cross_attention_states + + logits, speculative_logits = self.model.forward( + input_ids=input_ids, + position_ids=position_ids, + cu_seqlen_prefill=cu_seqlen_prefill, + kv_cache=kv_cache, + block_tables=block_tables, + slots=slots, + seqlen=seqlen, + max_s=max_s, + prefill_cache_indices=batch.prefill_cache_indices, + lm_head_indices=lm_head_indices, + cross_attention_states=cross_attention_states, + adapter_data=adapter_data, + image_indices=batch.image_indices[:], + ) + if batch.prefill_cache_indices is not None: + batch.prefill_cache_indices = None + if batch.pixel_values is not None: + batch.pixel_values = None + return logits, speculative_logits + + # Copy inputs to the static inputs of the cuda graph + # Static inputs are potentially padded + cuda_graph["input_ids"][: input_ids.shape[0]] = input_ids + cuda_graph["position_ids"][: position_ids.shape[0]] = position_ids + if ATTENTION == "flashinfer": + block_tables = block_tables_to_ragged( + block_tables=block_tables, + input_lengths=batch.input_lengths, + prefix_lens=batch.prefix_lens, + ) + cuda_graph["block_tables"][: block_tables.shape[0]] = block_tables + else: + cuda_graph["block_tables"][ + : block_tables.shape[0], : block_tables.shape[1] + ] = block_tables + cuda_graph["slots"].fill_(0) + cuda_graph["slots"][: slots.shape[0]] = slots + cuda_graph["input_lengths"].zero_() + cuda_graph["input_lengths"][: input_lengths.shape[0]] = ( + input_lengths + prefix_lens_tensor + ) + + # Replay the graph + cuda_graph["graph"].replay() + + # Slice output to the correct shape + speculative_logits = ( + cuda_graph["speculative_logits"][:bs] + if cuda_graph["speculative_logits"] is not None + else None + ) + logits = cuda_graph["logits"][:bs] + return logits, speculative_logits diff --git a/backends/gaudi/server/text_generation_server/models/model.py b/backends/gaudi/server/text_generation_server/models/model.py new file mode 100644 index 000000000..4fda22713 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/models/model.py @@ -0,0 +1,141 @@ +import inspect +import torch + +from abc import ABC, abstractmethod +from typing import List, Tuple, Optional, TypeVar, Type, Dict +from collections import defaultdict +from transformers import PreTrainedTokenizerBase + +from text_generation_server.models.types import Batch, Generation +from text_generation_server.models.globals import BLOCK_SIZE +from text_generation_server.utils.speculate import get_speculate +from text_generation_server.pb.generate_pb2 import InfoResponse +from text_generation_server.adapters.weights import 
LayerAdapterWeights +from text_generation_server.pb import generate_pb2 + +BASE_MODEL_ADAPTER_ID = "__base_model__" + + +B = TypeVar("B", bound=Batch) + + +class Model(ABC): + def __init__( + self, + model_id: str, + model: torch.nn.Module, + tokenizer: PreTrainedTokenizerBase, + requires_padding: bool, + dtype: torch.dtype, + device: torch.device, + rank: int = 0, + world_size: int = 1, + sliding_window: Optional[int] = None, + speculate: Optional[int] = None, + adapter_id: str = BASE_MODEL_ADAPTER_ID, + ): + self.model_id = model_id + self.model = model.eval() + self.tokenizer = tokenizer + + # all_special_ids is not set correctly if the rust tokenizer is unpacked + # TODO report this to transformers. + other_special_ids = { + id for id, token in tokenizer.added_tokens_decoder.items() if token.special + } + self.all_special_ids = set(tokenizer.all_special_ids) + self.all_special_ids.update(other_special_ids) + self.requires_padding = requires_padding + self.dtype = dtype + self.device = device + self.rank = rank + self.world_size = world_size + self.sliding_window = sliding_window if sliding_window != -1 else None + + self.layer_to_adapter_weights: Dict[str, LayerAdapterWeights] = defaultdict( + LayerAdapterWeights + ) + self.loaded_adapters = set() + self.static_adapter_id = adapter_id + + if speculate is None: + speculate = get_speculate() + self.speculate = speculate + + self.has_position_ids = ( + inspect.signature(model.forward).parameters.get("position_ids", None) + is not None + ) + + self.check_initialized() + + @property + def info(self) -> InfoResponse: + if self.requires_padding and self.sliding_window is not None: + raise NotImplementedError("sliding_window is not implemented with padding") + + return InfoResponse( + requires_padding=self.requires_padding, + dtype=str(self.dtype), + device_type=self.device.type, + window_size=self.sliding_window, + speculate=self.speculate, + block_size=BLOCK_SIZE, + ) + + @property + @abstractmethod + def batch_type(self) -> Type[B]: + raise NotImplementedError + + @abstractmethod + def generate_token( + self, batch: B + ) -> Tuple[List[Generation], Optional[B], Tuple[int, int]]: + raise NotImplementedError + + def warmup( + self, batch: generate_pb2.WarmupRequest + ) -> Tuple[Optional[int], Optional[int], Optional[int]]: + self.generate_token(batch) + return None, None, None + + def decode_token( + self, + all_input_ids: List[int], + prefix_offset: int = 0, + read_offset: int = 0, + skip_special_tokens: bool = False, + ) -> Tuple[str, int, int]: + """Hack to hopefully support generate_stream for the maximum number of tokenizers""" + + # The prefix text is necessary only to defeat cleanup algorithms in the decode + # which decide to add a space or not depending on the surrounding ids. + prefix_text = self.tokenizer.decode( + all_input_ids[prefix_offset:read_offset], + skip_special_tokens=skip_special_tokens, + ) + + new_text = self.tokenizer.decode( + all_input_ids[prefix_offset:], skip_special_tokens=skip_special_tokens + ) + + if len(new_text) > len(prefix_text) and not new_text.endswith("�"): + # utf-8 char at the end means it's a potential unfinished byte sequence + # from byte fallback tokenization. 
+ # If it's in the middle, it's probably a real invalid id generated + # by the model + new_text = new_text[len(prefix_text) :] + return new_text, read_offset, len(all_input_ids) + else: + return "", prefix_offset, read_offset + + def check_initialized(self): + uninitialized_parameters = [] + for n, p in self.model.named_parameters(): + if p.data.device == torch.device("meta"): + uninitialized_parameters.append(n) + if uninitialized_parameters: + raise RuntimeError( + f"found uninitialized parameters in model {self.__class__.__name__}: {uninitialized_parameters}" + ) diff --git a/backends/gaudi/server/text_generation_server/models/pali_gemma.py b/backends/gaudi/server/text_generation_server/models/pali_gemma.py new file mode 100644 index 000000000..fe75570ea --- /dev/null +++ b/backends/gaudi/server/text_generation_server/models/pali_gemma.py @@ -0,0 +1,71 @@ +from io import BytesIO +from PIL import Image +import torch +import torch.distributed +from opentelemetry import trace +from typing import Iterable +from text_generation_server.models.vlm_causal_lm import ( + VlmCausalLMBatch, + image_text_replacement, +) + +from text_generation_server.pb.generate_pb2 import Request + +tracer = trace.get_tracer(__name__) + + +class PaliGemmaBatch(VlmCausalLMBatch): + @classmethod + def batch_tokenized_inputs( + cls, requests: Iterable[Request], tokenizer, processor, config + ): + batch_inputs = [] + image_inputs = [] + max_truncation = 0 + for r in requests: + full_text = "" + image_id = 0 + for chunk in r.input_chunks.chunks: + chunk_type = chunk.WhichOneof("chunk") + if chunk_type == "text": + full_text += "" + chunk.text + "\n" + elif chunk_type == "image": + image = Image.open(BytesIO(chunk.image.data)) + # TODO do_convert_RGB should be on by default ? + image = image.convert("RGB") + image_input = processor.image_processor(image, return_tensors="pt") + full_text += image_text_replacement( + processor, image_input, config, image_id + ) + image_inputs.append(image_input) + else: + raise RuntimeError(f"Invalid chunk type {chunk_type}") + + batch_inputs.append(full_text) + max_truncation = max(max_truncation, r.truncate) + + batch_tokenized_inputs = tokenizer( + batch_inputs, + truncation=True, + max_length=max_truncation, + add_special_tokens=False, + )["input_ids"] + if image_inputs: + image_input = image_inputs[0] + new_image_inputs = { + "pixel_values": torch.cat( + [img["pixel_values"] for img in image_inputs], dim=0 + ), + } + if "pixel_attention_mask" in image_input: + new_image_inputs["pixel_attention_mask"] = torch.cat( + [img["pixel_attention_mask"] for img in image_inputs], dim=0 + ) + if "image_sizes" in image_input: + new_image_inputs["image_sizes"] = torch.cat( + [img["image_sizes"] for img in image_inputs], dim=0 + ) + image_inputs = new_image_inputs + else: + image_inputs = None + return batch_tokenized_inputs, image_inputs diff --git a/backends/gaudi/server/text_generation_server/models/seq2seq_lm.py b/backends/gaudi/server/text_generation_server/models/seq2seq_lm.py new file mode 100644 index 000000000..04d4c28ba --- /dev/null +++ b/backends/gaudi/server/text_generation_server/models/seq2seq_lm.py @@ -0,0 +1,932 @@ +import torch +import torch.distributed +import time +from dataclasses import dataclass +from opentelemetry import trace +from transformers import ( + AutoTokenizer, + AutoModelForSeq2SeqLM, + PreTrainedTokenizerBase, + AutoConfig, +) +from typing import Optional, Tuple, List, Type, Dict +from text_generation_server.utils.import_utils import SYSTEM +from 
text_generation_server.utils import ( + initialize_torch_distributed, + weight_files, + Weights, +) +from text_generation_server.utils.chunks import concat_text_chunks +from text_generation_server.utils.quantization import get_loader +from text_generation_server.utils.tokens import batch_top_tokens +from text_generation_server.models import Model +from text_generation_server.models.types import ( + GeneratedText, + Batch, + Generation, + Tokens, +) +from text_generation_server.pb import generate_pb2 +from text_generation_server.utils import NextTokenChooser, StoppingCriteria, Sampling + +tracer = trace.get_tracer(__name__) + + +@dataclass +class Seq2SeqLMBatch(Batch): + batch_id: int + requests: List[generate_pb2.Request] + requests_idx_mapping: Dict[int, int] + + # Encoder values + input_ids: Optional[torch.Tensor] + attention_mask: torch.Tensor + + # Decoder values + decoder_input_ids: torch.Tensor + decoder_attention_mask: Optional[torch.Tensor] + encoder_last_hidden_state: Optional[torch.Tensor] + + # All tokens + all_decoder_input_ids: List[torch.Tensor] + + # Seq2SeqLM keeps track of both encoder and decoder attention keys and values + past_key_values: Optional[List[Tuple]] + + # Lengths of all generations present in the batch + input_lengths: List[int] + decoder_input_lengths: List[int] + prefix_offsets: List[int] + read_offsets: List[int] + + # Generation helpers + next_token_choosers: List[NextTokenChooser] + stopping_criterias: List[StoppingCriteria] + top_n_tokens: List[int] + top_n_tokens_tensor: torch.Tensor + + # Metadata used for padding + max_input_length: int + max_decoder_input_length: int + padding_right_offset: int + + # Maximum number of tokens this batch will grow to + max_tokens: int + + def to_pb(self) -> generate_pb2.CachedBatch: + """Convert a Seq2SeqLMBatch to a text_generation_server.v1.CachedBatch protobuf""" + return generate_pb2.CachedBatch( + id=self.batch_id, + request_ids=[r.id for r in self.requests], + size=len(self), + max_tokens=self.max_tokens, + ) + + @classmethod + def from_pb( + cls, + pb: generate_pb2.Batch, + tokenizer: PreTrainedTokenizerBase, + dtype: torch.dtype, + device: torch.device, + ) -> "Seq2SeqLMBatch": + """Convert a text_generation_server.v1.Batch protobuf to a Seq2SeqLMBatch""" + inputs = [] + next_token_choosers = [] + stopping_criterias = [] + top_n_tokens = [] + decoder_input_lengths = [] + prefix_offsets = [] + read_offsets = [] + requests_idx_mapping = {} + + # Parse batch + max_truncation = 0 + padding_right_offset = 0 + max_decode_tokens = 0 + for i, r in enumerate(pb.requests): + inputs.append(concat_text_chunks(r.input_chunks.chunks)) + requests_idx_mapping[r.id] = i + decoder_input_lengths.append(1) + next_token_choosers.append( + NextTokenChooser.from_pb(r.parameters, device, tokenizer) + ) + stopping_criteria = StoppingCriteria.from_pb( + r.stopping_parameters, tokenizer + ) + stopping_criterias.append(stopping_criteria) + top_n_tokens.append(r.top_n_tokens) + max_truncation = max(max_truncation, r.truncate) + max_decode_tokens += stopping_criteria.max_new_tokens + padding_right_offset = max( + padding_right_offset, stopping_criteria.max_new_tokens + ) + + # Tokenize batch + tokenized_inputs = tokenizer( + inputs, + return_tensors="pt", + padding=True, + return_token_type_ids=False, + truncation=True, + max_length=max_truncation, + ).to(device) + + input_lengths = tokenized_inputs["attention_mask"].sum(1) + max_input_length = input_lengths.max() + + # Decoder sequence only contains the bos_token + decoder_input_ids = ( + 
torch.tensor(tokenizer.bos_token_id, device=device) + .repeat(len(pb.requests)) + .view(-1, 1) + ) + for _ in pb.requests: + prefix_offsets.append(0) + read_offsets.append(1) + all_decoder_input_ids = decoder_input_ids.view(-1).split(1) + top_n_tokens_tensor = torch.tensor( + top_n_tokens, device=device, dtype=torch.int64 + ) + + max_tokens = len(inputs) * (max_input_length + max_decode_tokens) + + return cls( + batch_id=pb.id, + requests=pb.requests, + requests_idx_mapping=requests_idx_mapping, + input_ids=tokenized_inputs["input_ids"], + attention_mask=tokenized_inputs["attention_mask"], + decoder_input_ids=decoder_input_ids, + all_decoder_input_ids=list(all_decoder_input_ids), + decoder_attention_mask=None, + encoder_last_hidden_state=None, + past_key_values=None, + input_lengths=input_lengths.tolist(), + decoder_input_lengths=decoder_input_lengths, + prefix_offsets=prefix_offsets, + read_offsets=read_offsets, + next_token_choosers=next_token_choosers, + stopping_criterias=stopping_criterias, + top_n_tokens=top_n_tokens, + top_n_tokens_tensor=top_n_tokens_tensor, + max_input_length=max_input_length.item(), + max_decoder_input_length=1, + padding_right_offset=padding_right_offset, + max_tokens=max_tokens, + ) + + @tracer.start_as_current_span("filter") + def filter(self, request_ids: List[int]) -> Optional["Seq2SeqLMBatch"]: + if len(request_ids) == 0: + raise ValueError("Batch must have at least one request") + if len(request_ids) == len(self): + return self + + keep_indices = [] + + # New values after filtering + requests_idx_mapping = {} + requests = [] + input_lengths = [] + decoder_input_lengths = [] + prefix_offsets = [] + read_offsets = [] + + all_decoder_input_ids = [] + + next_token_choosers = [] + stopping_criterias = [] + top_n_tokens = [] + + max_input_length = 0 + max_decoder_input_length = 0 + padding_right_offset = 0 + + total_remaining_decode_tokens = 0 + + for i, request_id in enumerate(request_ids): + idx = self.requests_idx_mapping[request_id] + requests_idx_mapping[request_id] = i + keep_indices.append(idx) + + requests.append(self.requests[idx]) + prefix_offsets.append(self.prefix_offsets[idx]) + read_offsets.append(self.read_offsets[idx]) + + all_decoder_input_ids.append(self.all_decoder_input_ids[idx]) + + request_input_length = self.input_lengths[idx] + input_lengths.append(request_input_length) + max_input_length = max(max_input_length, request_input_length) + + request_decoder_input_length = self.decoder_input_lengths[idx] + decoder_input_lengths.append(request_decoder_input_length) + max_decoder_input_length = max( + max_decoder_input_length, request_decoder_input_length + ) + + next_token_choosers.append(self.next_token_choosers[idx]) + stopping_criteria = self.stopping_criterias[idx] + stopping_criterias.append(stopping_criteria) + top_n_tokens.append(self.top_n_tokens[idx]) + remaining_decode_tokens = ( + stopping_criteria.max_new_tokens - stopping_criteria.current_tokens + ) + total_remaining_decode_tokens += remaining_decode_tokens + padding_right_offset = max(padding_right_offset, remaining_decode_tokens) + + # Apply indices to input_ids, attention mask, past key values and other items that need to be cached + self.decoder_input_ids = self.decoder_input_ids[keep_indices] + self.attention_mask = self.attention_mask[keep_indices, -max_input_length:] + if self.decoder_attention_mask is not None: + self.decoder_attention_mask = self.decoder_attention_mask[ + keep_indices, + -(self.padding_right_offset + max_decoder_input_length) : ( + 
self.decoder_attention_mask.shape[1] - self.padding_right_offset + ) + + padding_right_offset, + ] + + self.encoder_last_hidden_state = self.encoder_last_hidden_state[ + keep_indices, -max_input_length: + ] + + # Ensure that past_key_values tensors can be updated in-place + if type(self.past_key_values[0]) is tuple: + self.past_key_values = [ + [t for t in layer] for layer in self.past_key_values + ] + + decoder_past_seq_len = max_decoder_input_length - 1 + for layer in self.past_key_values: + layer[0] = layer[0][keep_indices, :, -decoder_past_seq_len:] + layer[1] = layer[1][keep_indices, :, -decoder_past_seq_len:] + layer[2] = layer[2][keep_indices, :, -max_input_length:] + layer[3] = layer[3][keep_indices, :, -max_input_length:] + + top_n_tokens_tensor = self.top_n_tokens_tensor[keep_indices] + max_tokens = ( + len(request_ids) * (max_input_length + max_decoder_input_length) + + remaining_decode_tokens + ) + + self.requests = requests + self.requests_idx_mapping = requests_idx_mapping + self.input_ids = None + self.all_decoder_input_ids = all_decoder_input_ids + self.input_lengths = input_lengths + self.decoder_input_lengths = decoder_input_lengths + self.prefix_offsets = prefix_offsets + self.read_offsets = read_offsets + self.next_token_choosers = next_token_choosers + self.stopping_criterias = stopping_criterias + self.top_n_tokens = top_n_tokens + self.top_n_tokens_tensor = top_n_tokens_tensor + self.max_input_length = max_input_length + self.max_decoder_input_length = max_decoder_input_length + self.padding_right_offset = padding_right_offset + self.max_tokens = max_tokens + + return self + + @classmethod + @tracer.start_as_current_span("concatenate") + def concatenate(cls, batches: List["Seq2SeqLMBatch"]) -> "Seq2SeqLMBatch": + """Concatenate multiple batches together by padding internal torch tensors""" + + # Used for padding + total_batch_size = 0 + max_input_length = 0 + max_decoder_input_length = 0 + padding_right_offset = 0 + for batch in batches: + total_batch_size += len(batch) + max_input_length = max(max_input_length, batch.max_input_length) + max_decoder_input_length = max( + max_decoder_input_length, batch.max_decoder_input_length + ) + padding_right_offset = max(padding_right_offset, batch.padding_right_offset) + + # Batch attributes + requests = [] + requests_idx_mapping = {} + all_decoder_input_ids = [] + input_lengths = [] + decoder_input_lengths = [] + prefix_offsets = [] + read_offsets = [] + next_token_choosers = [] + stopping_criterias = [] + top_n_tokens = [] + max_tokens = 0 + + # Batch tensors + attention_mask = None + decoder_input_ids = None + decoder_attention_mask = None + encoder_last_hidden_state = None + top_n_tokens_tensor = None + past_key_values = [] + + # Used for slicing correctly inside the tensors + # Equivalent to a cumsum on batch sizes + start_index = 0 + + for i, batch in enumerate(batches): + # Extend all list attributes + requests.extend(batch.requests) + all_decoder_input_ids.extend(batch.all_decoder_input_ids) + input_lengths.extend(batch.input_lengths) + decoder_input_lengths.extend(batch.decoder_input_lengths) + prefix_offsets.extend(batch.prefix_offsets) + read_offsets.extend(batch.read_offsets) + next_token_choosers.extend(batch.next_token_choosers) + stopping_criterias.extend(batch.stopping_criterias) + top_n_tokens.extend(batch.top_n_tokens) + + if i == 0: + requests_idx_mapping = batch.requests_idx_mapping + else: + # We need to offset the mapping for each batch by the cumulative batch size + for k, v in 
batch.requests_idx_mapping.items(): + requests_idx_mapping[k] = v + start_index + + # Slicing end index for this batch + end_index = start_index + len(batch) + + # We only concatenate batches that did at least one step + if batch.encoder_last_hidden_state is None: + raise ValueError("Batch encoder_last_hidden_state cannot be None") + + # Create padded tensor + if attention_mask is None: + attention_mask = batch.attention_mask.new_zeros( + (total_batch_size, max_input_length), + ) + # Copy to correct indices + attention_mask[start_index:end_index, -batch.max_input_length :] = ( + batch.attention_mask[:, -batch.max_input_length :] + ) + + # Create padded tensor + if decoder_input_ids is None: + decoder_input_ids = batch.decoder_input_ids.new_zeros( + (total_batch_size, 1), + ) + # Copy to correct indices + decoder_input_ids[start_index:end_index] = batch.decoder_input_ids + + # Create padded tensor + if decoder_attention_mask is None: + # As decoder_attention_mask might not exist, we use `batch.attention_mask` for device here + decoder_attention_mask = batch.attention_mask.new_zeros( + (total_batch_size, max_decoder_input_length + padding_right_offset), + ) + # If the decoder mask does not exist yet, all generations started at the same time and we never concatenated + # this batch. All generations are of length `batch.max_decoder_input_length`. + left_offset = max_decoder_input_length - batch.max_decoder_input_length + if batch.decoder_attention_mask is None: + decoder_attention_mask[ + start_index:end_index, + left_offset:-padding_right_offset, + ] = 1 + # If it exists, we need to index + else: + batch_left_offset = ( + batch.decoder_attention_mask.shape[1] + - batch.max_decoder_input_length + - batch.padding_right_offset + ) + decoder_attention_mask[ + start_index:end_index, + left_offset:-padding_right_offset, + ] = batch.decoder_attention_mask[ + :, + batch_left_offset : -batch.padding_right_offset, + ] + + # Create padded tensor + if encoder_last_hidden_state is None: + encoder_last_hidden_state = batch.encoder_last_hidden_state.new_zeros( + ( + total_batch_size, + max_input_length, + batch.encoder_last_hidden_state.shape[-1], + ), + ) + + if top_n_tokens_tensor is None: + top_n_tokens_tensor = batches[0].top_n_tokens_tensor.new_zeros( + total_batch_size, + ) + top_n_tokens_tensor[start_index:end_index] = batch.top_n_tokens_tensor + + # Copy to correct indices + encoder_last_hidden_state[ + start_index:end_index, -batch.max_input_length :, : + ] = batch.encoder_last_hidden_state[:, -batch.max_input_length :, :] + batch.encoder_last_hidden_state = None + + # Ensure that we can update tensors in-place + if isinstance(batch.past_key_values[0], tuple): + batch.past_key_values = [ + [t for t in layer] for layer in batch.past_key_values + ] + + # Add eventual padding tokens that were added while concatenating + max_tokens += batch.max_tokens + ( + max_input_length + - batch.max_input_length + + max_decoder_input_length + - batch.max_decoder_input_length + ) * len(batch) + + start_index = end_index + + # Determine shapes for new past kv tensors + first_past_kvs = batches[0].past_key_values + _, num_heads, _, head_dim = first_past_kvs[0][0].shape + + padded_dec_t_shape = ( + total_batch_size, + num_heads, + (max_decoder_input_length - 1), + head_dim, + ) + + padded_enc_t_shape = ( + total_batch_size, + num_heads, + max_input_length, + head_dim, + ) + + # Iterate over attention layers + for j in range(len(first_past_kvs)): + past_key_values.append([]) + + # Decoder past + for k in range(0, 2): + 
# Initialize tensors + padded_past_values = first_past_kvs[j][k].new_zeros(padded_dec_t_shape) + past_key_values[j].append(padded_past_values) + + start_index = 0 + for batch in batches: + t = batch.past_key_values[j][k] + # Clear reference to the original tensor + batch.past_key_values[j][k] = None + # Slicing end index for this batch + end_index = start_index + len(batch) + # We slice the past keys and values to remove the padding from previous batches + past_seq_len = batch.max_decoder_input_length - 1 + padded_past_values[start_index:end_index, :, -past_seq_len:, :] = t[ + :, :, -past_seq_len:, : + ] + del t + + start_index = end_index + + # Encoder past + for k in range(2, 4): + # Initialize tensors + padded_past_values = first_past_kvs[j][k].new_zeros(padded_enc_t_shape) + past_key_values[j].append(padded_past_values) + + start_index = 0 + for batch in batches: + t = batch.past_key_values[j][k] + # Clear reference to the original tensor + batch.past_key_values[j][k] = None + # Slicing end index for this batch + end_index = start_index + len(batch) + # We slice the past keys and values to remove the padding from previous batches + padded_past_values[ + start_index:end_index, :, -batch.max_input_length :, : + ] = t[:, :, -batch.max_input_length :, :] + del t + + start_index = end_index + + return cls( + batch_id=batches[0].batch_id, + requests=requests, + requests_idx_mapping=requests_idx_mapping, + input_ids=None, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + all_decoder_input_ids=all_decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + encoder_last_hidden_state=encoder_last_hidden_state, + past_key_values=past_key_values, + input_lengths=input_lengths, + decoder_input_lengths=decoder_input_lengths, + prefix_offsets=prefix_offsets, + read_offsets=read_offsets, + next_token_choosers=next_token_choosers, + stopping_criterias=stopping_criterias, + top_n_tokens=top_n_tokens, + top_n_tokens_tensor=top_n_tokens_tensor, + max_input_length=max_input_length, + max_decoder_input_length=max_decoder_input_length, + padding_right_offset=padding_right_offset, + max_tokens=max_tokens, + ) + + def __len__(self): + return len(self.requests) + + +class Seq2SeqLM(Model): + def __init__( + self, + model_id: str, + model_class, + revision: Optional[str] = None, + quantize: Optional[str] = None, + speculator: Optional[str] = None, + dtype: Optional[torch.dtype] = None, + default_dtype=torch.float16, + trust_remote_code: bool = False, + config_class=AutoConfig, + tokenizer_class=AutoTokenizer, + aliases=None, + ): + self.quantize = quantize + self.process_group, rank, world_size = initialize_torch_distributed() + if torch.cuda.is_available(): + device = torch.device(f"cuda:{rank}") + dtype = default_dtype if dtype is None else dtype + elif SYSTEM == "ipex": + if hasattr(torch, "xpu") and torch.xpu.is_available(): + device = torch.device(f"xpu:{rank}") + dtype = default_dtype if dtype is None else dtype + else: + device = torch.device("cpu") + # Float16 doesn't exist on target. 
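+                # (Assumption: bfloat16 is the half-precision type available on
+                # this ipex CPU fallback, unlike the plain CPU branch below,
+                # which defaults to float32.)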
+ dtype = torch.bfloat16 if dtype is None else dtype + else: + device = torch.device("cpu") + dtype = torch.float32 if dtype is None else dtype + + config = config_class.from_pretrained( + model_id, + revision=revision, + trust_remote_code=trust_remote_code, + ) + config.quantize = quantize + config.speculator = speculator + + tokenizer = tokenizer_class.from_pretrained( + model_id, + revision=revision, + padding_side="left", + truncation_side="left", + trust_remote_code=trust_remote_code, + ) + tokenizer.bos_token_id = config.decoder_start_token_id + + weights_loader = get_loader( + quantize=quantize, model_id=model_id, revision=revision + ) + torch.distributed.barrier(group=self.process_group) + filenames = weight_files(model_id, revision=revision, extension=".safetensors") + weights = Weights( + filenames, + device=device, + dtype=dtype, + process_group=self.process_group, + aliases=aliases, + weights_loader=weights_loader, + ) + if config.quantize in ["awq", "exl2", "gptq", "marlin"]: + weights._set_gptq_params(model_id, revision) + + model = model_class(config, weights) + + torch.distributed.barrier(group=self.process_group) + super().__init__( + model_id=model_id, + model=model, + tokenizer=tokenizer, + requires_padding=True, + dtype=dtype, + device=device, + rank=rank, + world_size=world_size, + ) + + @classmethod + def fallback( + cls, + model_id: str, + revision: Optional[str] = None, + quantize: Optional[str] = None, + speculator: Optional[str] = None, + dtype: Optional[torch.dtype] = None, + trust_remote_code: bool = False, + ): + if speculator: + raise RuntimeError("Speculator decoding is not enabled for AutoModel") + + if torch.cuda.is_available(): + device = torch.device("cuda") + dtype = torch.float16 if dtype is None else dtype + else: + if quantize: + raise ValueError("quantization is not available on CPU") + + device = torch.device("cpu") + dtype = torch.float32 if dtype is None else dtype + + model = AutoModelForSeq2SeqLM.from_pretrained( + model_id, + revision=revision, + torch_dtype=dtype, + device_map=( + "auto" + if torch.cuda.is_available() and torch.cuda.device_count() > 1 + else None + ), + load_in_8bit=quantize == "bitsandbytes", + trust_remote_code=trust_remote_code, + ) + if torch.cuda.is_available() and torch.cuda.device_count() == 1: + model = model.cuda() + + tokenizer = AutoTokenizer.from_pretrained( + model_id, + revision=revision, + padding_side="left", + truncation_side="left", + trust_remote_code=trust_remote_code, + ) + tokenizer.bos_token_id = model.config.decoder_start_token_id + + self = cls.__new__( + cls, + ) + super().__init__( + self, + model_id=model_id, + model=model, + tokenizer=tokenizer, + requires_padding=True, + dtype=dtype, + device=device, + ) + self.quantize = quantize + return self + + @property + def batch_type(self) -> Type[Seq2SeqLMBatch]: + return Seq2SeqLMBatch + + def forward( + self, + input_ids, + attention_mask, + decoder_input_ids, + decoder_attention_mask: Optional, + encoder_last_hidden_state: Optional, + past_key_values: Optional = None, + ) -> Tuple[ + torch.Tensor, + Optional[torch.Tensor], + torch.Tensor, + List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]], + ]: + # Model Forward + outputs = self.model.forward( + input_ids=input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + encoder_outputs=encoder_last_hidden_state, + past_key_values=past_key_values, + use_cache=True, + ) + if isinstance(outputs, tuple): + # Our custom 
models + outputs, speculative_logits = outputs + else: + # Generic transformers models + speculative_logits = None + return ( + outputs.logits, + speculative_logits, + outputs.encoder_last_hidden_state, + outputs.past_key_values, + ) + + @tracer.start_as_current_span("generate_token") + def generate_token( + self, batch: Seq2SeqLMBatch + ) -> Tuple[List[Generation], Optional[Seq2SeqLMBatch], Tuple[int, int]]: + start = time.time_ns() + if batch.decoder_attention_mask is not None: + # slice to the correct shape + decoder_attention_mask = batch.decoder_attention_mask[ + :, : -batch.padding_right_offset + ] + else: + decoder_attention_mask = None + + # Wrap `encoder_last_hidden_state` because for some reason, Transformers does a `encoder_last_hidden_state[0]` + # internally... + if batch.encoder_last_hidden_state is not None: + encoder_last_hidden_state = [batch.encoder_last_hidden_state] + else: + encoder_last_hidden_state = None + + logits, speculative_logits, encoder_last_hidden_state, past = self.forward( + batch.input_ids, + batch.attention_mask, + batch.decoder_input_ids, + decoder_attention_mask, + encoder_last_hidden_state, + batch.past_key_values, + ) + + # Speculation is not active for seq2seq + accepted_ids = torch.ones_like(batch.decoder_input_ids)[:, 0] + batch_top_token_ids, batch_top_token_logprobs = batch_top_tokens( + batch.top_n_tokens, + batch.top_n_tokens_tensor, + torch.log_softmax(logits[:, -1], -1), + accepted_ids, + ) + + start_decode = time.time_ns() + + # Finished requests + generations: List[Generation] = [] + stopped = True + + # Zipped iterator + iterator = zip( + batch.requests, + batch.input_lengths, + batch.prefix_offsets, + batch.read_offsets, + batch.decoder_input_lengths, + logits, + batch.next_token_choosers, + batch.stopping_criterias, + batch.all_decoder_input_ids, + batch.top_n_tokens, + batch_top_token_ids, + batch_top_token_logprobs, + ) + + # For each member of the batch + for i, ( + request, + input_length, + prefix_offset, + read_offset, + decoder_input_length, + logits, + next_token_chooser, + stopping_criteria, + all_decoder_input_ids, + top_n_tokens, + top_token_ids, + top_token_logprobs, + ) in enumerate(iterator): + # Select next token + next_token_id, logprobs = next_token_chooser( + all_decoder_input_ids.view(1, -1), logits[-1:, :] + ) + + # Append next token to decoder tokens + all_decoder_input_ids = torch.cat( + [all_decoder_input_ids, next_token_id.squeeze(1)] + ) + new_decoder_input_length = decoder_input_length + 1 + + # Generated token + next_token_logprob = logprobs[-1, next_token_id] + next_token_id_squeezed = next_token_id.squeeze() + next_token_text, prefix_offset, read_offset = self.decode_token( + all_decoder_input_ids, prefix_offset, read_offset + ) + + # Evaluate stopping criteria + stop, reason = stopping_criteria(next_token_id, next_token_text) + + if not stop: + stopped = False + + # Shard generations + # All generations will be appended in the rust sharded client + if i % self.world_size == self.rank: + if stop: + # Slice with decoder_input_length to remove padding + # Decode all tokens + output_text, _, _ = self.decode_token( + all_decoder_input_ids, + prefix_offset=len(all_decoder_input_ids) + - decoder_input_length + - 1, + read_offset=len(all_decoder_input_ids) - decoder_input_length, + skip_special_tokens=True, + ) + + # Get seed + if isinstance(next_token_chooser.choice, Sampling): + seed = next_token_chooser.choice.seed + else: + seed = None + + generated_text = GeneratedText( + output_text, 
stopping_criteria.current_tokens, reason, seed + ) + else: + generated_text = None + + # Prefill + if stopping_criteria.current_tokens == 1 and request.prefill_logprobs: + prefill_tokens = Tokens( + [self.tokenizer.bos_token_id], + [float("nan")], + [self.tokenizer.bos_token], + [False], + ) + else: + prefill_tokens = None + + if top_n_tokens > 0: + all_top_tokens = [] + for top_token_ids, top_token_logprobs in zip( + top_token_ids, top_token_logprobs + ): + toptoken_texts = self.tokenizer.batch_decode( + top_token_ids, + clean_up_tokenization_spaces=False, + skip_special_tokens=False, + ) + special_toptokens = [ + token_id in self.all_special_ids + for token_id in top_token_ids + ] + top_tokens = Tokens( + top_token_ids, + top_token_logprobs, + toptoken_texts, + special_toptokens, + ) + all_top_tokens.append(top_tokens) + top_tokens = all_top_tokens + else: + top_tokens = None + + generation = Generation( + request.id, + prefill_tokens, + Tokens( + [next_token_id_squeezed], + [next_token_logprob], + [next_token_text], + [next_token_id_squeezed.item() in self.all_special_ids], + ), + generated_text, + top_tokens, + ) + + generations.append(generation) + + # Update values + batch.next_token_choosers[i] = batch.next_token_choosers[i].advance_grammar( + next_token_id_squeezed.item() + ) + batch.decoder_input_ids[i] = next_token_id + batch.all_decoder_input_ids[i] = all_decoder_input_ids + batch.input_lengths[i] = input_length + batch.decoder_input_lengths[i] = new_decoder_input_length + batch.prefix_offsets[i] = prefix_offset + batch.read_offsets[i] = read_offset + batch.max_input_length = max(batch.max_input_length, input_length) + batch.max_decoder_input_length = max( + batch.max_decoder_input_length, new_decoder_input_length + ) + + # We finished all generations in the batch; there is no next batch + if stopped: + forward_ns = start_decode - start + decode_ns = time.time_ns() - start_decode + return generations, None, (forward_ns, decode_ns) + + # We don't need input_ids after the prefill forward + batch.input_ids = None + batch.encoder_last_hidden_state = encoder_last_hidden_state + batch.past_key_values = past + # Update decoder_attention_mask as we added a new token to input_ids + if batch.decoder_attention_mask is not None: + batch.decoder_attention_mask[:, -batch.padding_right_offset] = 1 + batch.padding_right_offset -= 1 + + forward_ns = start_decode - start + decode_ns = time.time_ns() - start_decode + return generations, batch, (forward_ns, decode_ns) diff --git a/backends/gaudi/server/text_generation_server/models/starcoder.py b/backends/gaudi/server/text_generation_server/models/starcoder.py new file mode 100644 index 000000000..6c6ca2cf9 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/models/starcoder.py @@ -0,0 +1,47 @@ +import torch +from dataclasses import dataclass +from typing import List, Optional, Type + +from text_generation_server.models import CausalLM +from text_generation_server.models.causal_lm import CausalLMBatch + + +@dataclass +class StarCoderCausalLMBatch(CausalLMBatch): + past_key_values: Optional[List[torch.Tensor]] + + def detach_kv_cache(self): + past_keys = [] + past_values = [] + last_dim = int(self.past_key_values[0].size(dim=-1) / 2) + for key_value in self.past_key_values: + past_keys.append(key_value.split((last_dim, last_dim), dim=-1)[0]) + past_values.append(key_value.split((last_dim, last_dim), dim=-1)[1]) + del self.past_key_values + + return past_keys, past_values + + def attach_kv_cache(self, past_keys, past_values): + 
self.past_key_values = [ + torch.cat((key, value), dim=-1) + for key, value in zip(past_keys, past_values) + ] + + +class StarCoder(CausalLM): + def __init__( + self, + model_id: str, + revision: Optional[str] = None, + dtype: Optional[torch.dtype] = None, + ): + + super(StarCoder, self).__init__( + model_id=model_id, + revision=revision, + dtype=dtype, + ) + + @property + def batch_type(self) -> Type[CausalLMBatch]: + return StarCoderCausalLMBatch diff --git a/backends/gaudi/server/text_generation_server/models/types.py b/backends/gaudi/server/text_generation_server/models/types.py new file mode 100644 index 000000000..d4e7cca75 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/models/types.py @@ -0,0 +1,102 @@ +import torch + +from abc import ABC, abstractmethod +from dataclasses import dataclass +from typing import List, Optional + +from transformers import PreTrainedTokenizerBase + +from text_generation_server.pb import generate_pb2 +from text_generation_server.pb.generate_pb2 import FinishReason + + +class Batch(ABC): + @abstractmethod + def to_pb(self) -> generate_pb2.CachedBatch: + raise NotImplementedError + + @classmethod + @abstractmethod + def from_pb( + cls, + pb: generate_pb2.Batch, + tokenizer: PreTrainedTokenizerBase, + dtype: torch.dtype, + device: torch.device, + ) -> "Batch": + raise NotImplementedError + + @abstractmethod + def filter(self, request_ids: List[int]) -> "Batch": + raise NotImplementedError + + @classmethod + @abstractmethod + def concatenate(cls, batches: List["Batch"]) -> "Batch": + raise NotImplementedError + + @abstractmethod + def __len__(self): + raise NotImplementedError + + +@dataclass +class GeneratedText: + text: str + generated_tokens: int + finish_reason: FinishReason + seed: Optional[int] + + def to_pb(self) -> generate_pb2.GeneratedText: + return generate_pb2.GeneratedText( + text=self.text, + generated_tokens=self.generated_tokens, + finish_reason=self.finish_reason, + seed=self.seed, + ) + + +@dataclass +class Tokens: + token_ids: List[int] + logprobs: List[float] + texts: List[str] + is_special: List[bool] + + def to_pb(self) -> generate_pb2.Tokens: + return generate_pb2.Tokens( + ids=self.token_ids, + logprobs=self.logprobs, + texts=self.texts, + is_special=self.is_special, + ) + + def __len__(self): + return len(self.token_ids) + + +@dataclass +class Generation: + request_id: int + prefill_tokens: Optional[Tokens] + tokens: Tokens + generated_text: Optional[GeneratedText] + # Optional for now, since it's not yet supported for every model. 
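+    # (When populated, this is expected to hold one Tokens entry per generated
+    # position, containing the top_n_tokens highest-logprob alternatives.)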
+ top_tokens: Optional[List[Tokens]] + + def to_pb(self) -> generate_pb2.Generation: + return generate_pb2.Generation( + request_id=self.request_id, + prefill_tokens=( + self.prefill_tokens.to_pb() if self.prefill_tokens is not None else None + ), + tokens=self.tokens.to_pb(), + generated_text=( + self.generated_text.to_pb() if self.generated_text is not None else None + ), + top_tokens=( + [top_tokens.to_pb() for top_tokens in self.top_tokens] + if self.top_tokens is not None + else None + ), + ) diff --git a/backends/gaudi/server/text_generation_server/models/vlm_causal_lm.py b/backends/gaudi/server/text_generation_server/models/vlm_causal_lm.py new file mode 100644 index 000000000..d4f4c1af5 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/models/vlm_causal_lm.py @@ -0,0 +1,1330 @@ +import re +import torch +import os +import time +import math +from PIL import Image +from io import BytesIO +from opentelemetry import trace +from loguru import logger +from typing import Iterable, Optional, Tuple, List, Type, Dict +import itertools +import tempfile +import copy +from text_generation_server.models import Model +from transformers import PreTrainedTokenizerBase +from text_generation_server.utils.tokens import batch_top_tokens +from text_generation_server.pb import generate_pb2 +from text_generation_server.models.causal_lm import ( + CausalLMBatch, + CausalLMRequest, + remove_kv_cache_from_output, + biggest_single_chunk, +) + +from transformers.models.llava_next.modeling_llava_next import ( + get_anyres_image_grid_shape, +) + +from transformers import AutoProcessor +import text_generation_server.habana_quantization_env as hq_env +from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi +from text_generation_server.utils import ( + HeterogeneousNextTokenChooser, + make_tokenizer_optional, + is_tokenizer_transparent, + pad_next_token_chooser_parameters, +) +import habana_frameworks.torch as htorch +from optimum.habana.utils import HabanaProfile +from optimum.habana.transformers.generation import MODELS_OPTIMIZED_WITH_STATIC_SHAPES +from optimum.habana.utils import get_hpu_memory_stats +from optimum.habana.checkpoint_utils import get_ds_injection_policy + +from transformers import ( + AutoTokenizer, + AutoConfig, +) +from optimum.habana.checkpoint_utils import ( + get_repo_root, + model_on_meta, + write_checkpoints_json, +) + +from text_generation_server.utils.speculate import get_speculate +from text_generation_server.models.types import ( + Tokens, + Generation, + GeneratedText, +) +from text_generation_server.utils.debug import dbg_trace + +tracer = trace.get_tracer(__name__) + +IDEFICS2_FAKE_TOKEN = "" +IDEFICS2_IMAGE_TOKEN = "" + + +IMAGES = re.compile(r"!\[[^\]]*\]\((.*?)\s*(\"(?:.*[^\"])\")?\s*\)") +BASE_IMAGE_TOKENS = int(os.environ.get("BASE_IMAGE_TOKENS", 2048)) +MAX_TOTAL_TOKENS = int(os.environ.get("MAX_TOTAL_TOKENS", 8192)) +MAX_BATCH_TOTAL_TOKENS = int(os.environ.get("MAX_BATCH_TOTAL_TOKENS", 131072)) +PAD_SEQUENCE_TO_MULTIPLE_OF = int(os.environ.get("PAD_SEQUENCE_TO_MULTIPLE_OF", 256)) +CHUNK_SIZES = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048] +LAZY_MODE = int(os.environ.get("PT_HPU_LAZY_MODE", 1)) + +PREFILL_WARMUP_BATCH_SIZE_LIST = [] +PREFILL_WARMUP_SEQLEN_LIST = [] +DECODE_WARMUP_BATCH_SIZE_LIST = [] + + +def round_up(warmup_list: list, num): + i = 0 + for i in warmup_list: + if num <= i: + break + return i + + +def split(string) -> List[Dict[str, str]]: + parts = [] + cursor = 0 + for pattern in IMAGES.finditer(string): + start 
= pattern.start() + if start != cursor: + parts.append({"type": "text", "content": string[cursor:start]}) + + parts.append({"type": "image", "content": pattern.group(1)}) + cursor = pattern.end() + + if cursor != len(string): + parts.append({"type": "text", "content": string[cursor:]}) + + return parts + + +def image_text_replacement(processor, image_input, config, image_id: int) -> str: + if config.model_type == "idefics2": + image_seq_len = 64 + image_str = f"{IDEFICS2_FAKE_TOKEN}{IDEFICS2_IMAGE_TOKEN * image_seq_len}{IDEFICS2_FAKE_TOKEN}" + if processor.image_processor.do_image_splitting: + image_str *= 5 + return image_str + elif config.model_type == "llava_next": + height, width = image_input["image_sizes"][image_id] + num_features = get_number_of_features(height, width, config) + return "" * num_features + + elif config.model_type == "paligemma": + return "" * config.text_config.num_image_tokens + else: + raise RuntimeError(f"Unknown config {config.model_type} for multimodal") + + +def image_text_replacement_fixup(config, text: str) -> str: + if config.model_type == "idefics2": + return text.replace( + f"{IDEFICS2_FAKE_TOKEN}{IDEFICS2_FAKE_TOKEN}", IDEFICS2_FAKE_TOKEN + ) + return text + + +def get_unpadded_features( + original_height: int, + original_width: int, + npatches: int, + num_patch_height: int, + num_patch_width: int, +) -> Tuple[int, int]: + current_height = npatches * num_patch_height + current_width = npatches * num_patch_width + + aspect_ratio: float = original_width / original_height + current_aspect_ratio: float = current_width / current_height + + if aspect_ratio > current_aspect_ratio: + new_height = (original_height * current_width) // original_width + padding = (current_height - new_height) // 2 + current_height = current_height - (2 * padding) + else: + new_width = (original_width * current_height) // original_height + padding = (current_width - new_width) // 2 + current_width = current_width - (2 * padding) + + unpadded_features = current_height * current_width + newline_features = current_height + return (unpadded_features, newline_features) + + +def get_number_of_features(height: int, width: int, config) -> int: + # From config + # Hardcoded for CLIP for now + # image_grid_pinpoints = [[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]] + image_grid_pinpoints = config.image_grid_pinpoints + image_size = config.vision_config.image_size + patch_size = config.vision_config.patch_size + + assert image_size % patch_size == 0 + + npatches = image_size // patch_size + + # Dimensions are intentionally swapped to be bug-compatible with + # upstream: https://github.com/LLaVA-VL/LLaVA-NeXT/issues/59 + num_patch_width, num_patch_height = get_anyres_image_grid_shape( + [height, width], + image_grid_pinpoints, + image_size, + ) + + unpadded_features, newline_features = get_unpadded_features( + height, width, npatches, num_patch_height, num_patch_width + ) + # The base patch covers the entire image + base_features = npatches**2 + return unpadded_features + newline_features + base_features + + +class VlmCausalLMBatch(CausalLMBatch): + pixel_values: Optional[List[torch.Tensor]] + pixel_attention_mask: Optional[List[torch.Tensor]] + image_sizes: Optional[List[Tuple[int, int]]] + + @classmethod + def from_tokenized( + cls, + pb: generate_pb2.Batch, + tokenizer: PreTrainedTokenizerBase, + batch_tokenized_inputs, + dtype: torch.dtype, + device: torch.device, + is_warmup: bool = False, + ) -> "VlmCausalLMBatch": + + dbg_trace("FROM_PB", f"num_reqs:{len(pb.requests)}") + 
requests = [ + CausalLMRequest.from_pb(idx, req, tokenizer) + for idx, req in enumerate(pb.requests) + ] + + max_input_length = max(r.data.truncate for r in requests) + max_new_tokens = max(r.stopping_criteria.max_new_tokens for r in requests) + # TODO: Add support for sparse batches + top_n_tokens = [r.top_n_tokens for r in pb.requests] + top_n_tokens_tensor = torch.tensor( + top_n_tokens, device=device, dtype=torch.int64 + ) + + # TODO: by tokenizing all inputs at once we loose information on actual input lengths + # this means that we cannot shift inputs to the left after a long input sequence + # was filtered out + new_bs = round_up(PREFILL_WARMUP_BATCH_SIZE_LIST, len(requests)) + parameters = [r.parameters for r in pb.requests] + # append the dummy parameters for dummy request + parameters = pad_next_token_chooser_parameters(parameters, new_bs) + + next_token_chooser = HeterogeneousNextTokenChooser.from_pb( + pb=parameters, + dtype=dtype, + device=device, + tokenizer=tokenizer, + quantization_enabled=hq_env.is_quantization_enabled, + ) + tokenized_inputs = batch_tokenized_inputs + input_len = tokenized_inputs["input_ids"].shape[1] + + bucket_size = max_input_length + left_padding = max_input_length - input_len + if is_warmup is False: + if input_len < max_input_length: + rounded_seq_len = round_up(PREFILL_WARMUP_SEQLEN_LIST, input_len + 1) + if rounded_seq_len <= max_input_length: + bucket_size = rounded_seq_len - 1 + else: + bucket_size = max_input_length - 1 + left_padding = bucket_size - input_len + + input_ids = tokenized_inputs["input_ids"] + attention_mask = tokenized_inputs["attention_mask"] + # Allocate space for first token + if left_padding > 0: + input_ids = torch.nn.functional.pad( + input_ids, (left_padding, 1), value=tokenizer.pad_token_id + ) + attention_mask = torch.nn.functional.pad( + attention_mask, (left_padding, 1), value=0 + ) + all_input_ids = torch.nn.functional.pad( + input_ids, (0, max_new_tokens), value=tokenizer.pad_token_id + ).T.split(1, dim=1) + + # New input length after left padding + input_len = bucket_size + for r in requests: + r.input_length = input_len + r.prefix_offset = input_len - 5 + r.read_offset = input_len + r.all_input_ids = all_input_ids[r.idx] + input_ids = input_ids.to(device) + attention_mask = attention_mask.to(device) + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + + htorch.core.mark_step() + + return cls( + batch_id=pb.id, + requests=requests, + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=None, + merged_kv_cache=False, + next_token_chooser=next_token_chooser, + top_n_tokens=top_n_tokens, + top_n_tokens_tensor=top_n_tokens_tensor, + input_length=input_len, + ) + + @classmethod + def batch_tokenized_inputs( + cls, + requests: Iterable[generate_pb2.Request], + tokenizer, + processor, + config, + is_warmup, + ): + # Process images first. We need all of them so that the processor + # can make the image splits the same size. And we need the final + # sizes to insert correct number of image tokens. 
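+        # (Illustrative note: image_text_replacement below expands each image
+        # placeholder, e.g. 64 image tokens per image for idefics2-style configs,
+        # times 5 when image splitting is enabled, or a per-image
+        # get_number_of_features(height, width, config) count for llava_next.)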
+ images = [] + for r in requests: + for chunk in r.input_chunks.chunks: + chunk_type = chunk.WhichOneof("chunk") + if chunk_type == "text": + pass + elif chunk_type == "image": + image = Image.open(BytesIO(chunk.image.data)) + if config.model_type == "llava_next": + images.append(image) + else: + images.append([image]) + else: + raise RuntimeError(f"Invalid chunk type {chunk_type}") + + image_inputs = None + if images: + image_inputs = processor.image_processor(images, return_tensors="pt") + + batch_inputs = [] + max_truncation = 0 + image_id = 0 + for r in requests: + full_text = "" + for chunk in r.input_chunks.chunks: + chunk_type = chunk.WhichOneof("chunk") + if chunk_type == "text": + full_text += chunk.text + elif chunk_type == "image": + full_text += image_text_replacement( + processor, image_inputs, config, image_id + ) + image_id += 1 + full_text = image_text_replacement_fixup(config, full_text) + + batch_inputs.append(full_text) + max_truncation = max(max_truncation, r.truncate) + + missing_inputs = 0 + dummy_images = None + if is_warmup is False: + new_bs = round_up(PREFILL_WARMUP_BATCH_SIZE_LIST, len(requests)) + missing_inputs = new_bs - len(requests) + if missing_inputs > 0: + dummy_inputs = [] + if len(batch_inputs) > 0: + dummy_inputs = [batch_inputs[0]] * missing_inputs + + batch_inputs += dummy_inputs + + batch_tokenized_inputs = tokenizer( + batch_inputs, + truncation=True, + max_length=max_truncation, + add_special_tokens=not config.model_type == "paligemma", + return_tensors="pt", + padding="longest", + return_token_type_ids=False, + ) + + if missing_inputs > 0 and image_inputs is not None: + dummy_shape = list(image_inputs["pixel_values"].shape) + dummy_shape[0] = missing_inputs + dummy_images = torch.rand(dummy_shape) + new_image_inputs = { + "pixel_values": torch.cat( + (image_inputs["pixel_values"], dummy_images), dim=0 + ), + } + if "pixel_attention_mask" in image_inputs: + dummy_shape = list(image_inputs["pixel_attention_mask"].shape) + dummy_shape[0] = missing_inputs + dummy_attention = torch.zeros(dummy_shape) + new_image_inputs["pixel_attention_mask"] = torch.cat( + (image_inputs["pixel_attention_mask"], dummy_attention), dim=0 + ) + if "image_sizes" in image_inputs: + dummy_shape = list(list(image_inputs["image_sizes"])[0]) + dummy_shape = missing_inputs * [dummy_shape] + dummy_sizes = torch.IntTensor(dummy_shape) + new_image_inputs["image_sizes"] = torch.cat( + (image_inputs["image_sizes"], dummy_sizes), dim=0 + ) + image_inputs = new_image_inputs + + return batch_tokenized_inputs, image_inputs + + @classmethod + def from_pb_processor( + cls, + pb: generate_pb2.Batch, + tokenizer: PreTrainedTokenizerBase, + processor, + config, + dtype: torch.dtype, + device: torch.device, + is_warmup: bool = False, + ) -> "VlmCausalLMBatch": + batch_tokenized_inputs, image_inputs = cls.batch_tokenized_inputs( + pb.requests, tokenizer, processor, config, is_warmup + ) + batch = cls.from_tokenized(pb, tokenizer, batch_tokenized_inputs, dtype, device) + if image_inputs is not None: + batch.pixel_values = image_inputs["pixel_values"].to(device=device) + if "pixel_attention_mask" in image_inputs: + batch.pixel_attention_mask = image_inputs["pixel_attention_mask"].to( + device=device + ) + else: + batch.pixel_attention_mask = None + if "image_sizes" in image_inputs: + batch.image_sizes = image_inputs["image_sizes"].to(device=device) + else: + batch.image_sizes = None + else: + batch.pixel_values = None + batch.pixel_attention_mask = None + batch.image_sizes = None + return batch 
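+
+    # Sketch of the warmup bucketing used above, with illustrative values only
+    # (the real lists are filled in during warmup): if
+    # PREFILL_WARMUP_BATCH_SIZE_LIST == [1, 2, 4, 8], then round_up(..., 3) -> 4,
+    # so a 3-request batch is padded with one dummy request (and matching dummy
+    # pixel tensors) to land on an already warmed-up batch shape.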
+ + @classmethod + @tracer.start_as_current_span("concatenate") + def concatenate( + cls, + batches: List["CausalLMBatch"], + pad_token_id: int = 0, + is_warmup: bool = False, + ) -> "CausalLMBatch": + return cls.recombine(batches, pad_token_id, is_warmup) + + @classmethod + def recombine( + cls, + batches: List["VlmCausalLMBatch"], + pad_token_id: int, + is_warmup: bool = False, + ) -> "VlmCausalLMBatch": + if not all(b.past_key_values is not None for b in batches): + raise ValueError("KV cache not allocated! Cannot recombine before prefill!") + + total_requests = sum(len(b) for b in batches) + new_bs = total_requests + if is_warmup is False: + new_bs = round_up(DECODE_WARMUP_BATCH_SIZE_LIST, total_requests) + batch_id = batches[0].batch_id + device = batches[0].input_ids.device + + input_lengths = [b.input_length for b in batches] + max_input_length = max(input_lengths) + offsets = [max_input_length - b.input_length for b in batches] + + cur_padding = [b.right_padding for b in batches] + # For prefill there is a space allocated only for first token + # Need to add padding to the max total tokens before first decode + + moves_needed = [ + total_requests - len(b) if b.batch_size == new_bs else total_requests + for b in batches + ] + dst_batch_idx = min(enumerate(moves_needed), key=lambda idx_val: idx_val[1])[0] + reshape = batches[dst_batch_idx].batch_size < new_bs + + # TODO: Add support for changing max seq len, i.e. due to output length bucketing + # FIXME: max_seq_len for non optimized code + if len(batches) > 1: + scenario = "CONCAT" + elif reshape: + scenario = "RESHAPE" + elif cur_padding[dst_batch_idx] <= 0: + scenario = "SHIFT" + offsets = [ + biggest_single_chunk(b.max_input_length - max_input_length) + for b in batches + ] + max_input_length = max_input_length + offsets[dst_batch_idx] + else: + # Nothing to do + return batches[0] + + dbg_trace( + scenario, + f"bs:{[b.batch_size for b in batches]}->{new_bs}" + f" reqs:{[len(b) for b in batches]}" + f" offsets:{offsets}" + f" input_lengths:{input_lengths}" + f" cur_padding:{cur_padding}" + f" dst_batch:{dst_batch_idx}", + ) + + grouped_requests = [[req for req in batch.requests] for batch in batches] + flat_requests = list(itertools.chain(*grouped_requests)) + + for i in range(len(batches)): + target_bs = new_bs if i == dst_batch_idx else batches[i].batch_size + batches[i].merge_kv_cache_if_needed(target_bs, offsets[i]) + batches[i].realign(target_bs, offsets[i], pad_token_id) + batches[i].split_kv_cache_if_needed(i == dst_batch_idx) + batches[dst_batch_idx].expand_bs(new_bs) + batches[dst_batch_idx].move_data( + [batches[i] for i in range(len(batches)) if i != dst_batch_idx] + ) + + top_n_tokens = [r.data.top_n_tokens for r in flat_requests] + top_n_tokens_tensor = torch.tensor( + top_n_tokens, device=device, dtype=torch.int64 + ) + + parameters = [r.data.parameters for r in flat_requests] + # append the dummy parameters for dummy requests + batch_size = batches[dst_batch_idx].batch_size + parameters = pad_next_token_chooser_parameters(parameters, batch_size) + + # update past grammar states + fsm_grammar_states = [0] * batch_size + for batch in batches: + for i, req in enumerate(batch.requests): + fsm_grammar_states[req.idx] = ( + batch.next_token_chooser.fsm_grammar_states[i] + ) + + next_token_chooser = HeterogeneousNextTokenChooser.from_pb( + parameters, + batches[dst_batch_idx].next_token_chooser.dtype, + batches[dst_batch_idx].next_token_chooser.device, + batches[dst_batch_idx].next_token_chooser.tokenizer, + 
fsm_grammar_states, + quantization_enabled=hq_env.is_quantization_enabled, + ) + + input_ids = batches[dst_batch_idx].input_ids + attention_mask = batches[dst_batch_idx].attention_mask + position_ids = batches[dst_batch_idx].position_ids + past_key_values = batches[dst_batch_idx].past_key_values + input_length = max_input_length + + htorch.core.mark_step() + + return cls( + batch_id=batch_id, + requests=flat_requests, + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + merged_kv_cache=False, + next_token_chooser=next_token_chooser, + top_n_tokens=top_n_tokens, + top_n_tokens_tensor=top_n_tokens_tensor, + input_length=input_length, + ) + + +class VlmCausalLM(Model): + def __init__( + self, + model_class, + model_id: str, + *, + processor_class=AutoProcessor, + processor_kwargs=None, + batch_class=VlmCausalLMBatch, + revision, + quantize: Optional[str] = None, + dtype, + trust_remote_code: bool, + **kwargs, + ): + adapt_transformers_to_gaudi() + if processor_kwargs is None: + processor_kwargs = {} + self.processor = processor_class.from_pretrained( + model_id, + revision=revision, + trust_remote_code=trust_remote_code, + **processor_kwargs, + ) + self.batch_class = batch_class + self.prev_bs = 0 + self.quantize = quantize + + # Create tokenizer + tokenizer = AutoTokenizer.from_pretrained( + model_id, + revision=revision, + padding_side="left", + truncation_side="left", + trust_remote_code=trust_remote_code, + ) + make_tokenizer_optional(tokenizer) + + # Create model + world_size = int(os.getenv("WORLD_SIZE", "1")) + rank = int(os.getenv("RANK", "0")) + dtype = torch.bfloat16 if dtype is None else dtype + device = torch.device("hpu") + + if hq_env.is_quantization_enabled: + htorch.core.hpu_set_env() + + if world_size > 1: + model = self.get_deepspeed_model(model_class, model_id, dtype, revision) + model = hq_env.prepare_model_for_quantization(model) + else: + get_repo_root(model_id) + + # Check support for rope scaling + model_kwargs = {} + config = AutoConfig.from_pretrained(model_id) + if hasattr(config, "rope_scaling"): + model_kwargs["rope_scaling"] = self.get_rope_scaling() + + model = model_class.from_pretrained( + model_id, + revision=revision, + torch_dtype=dtype, + trust_remote_code=trust_remote_code, + **model_kwargs, + ) + model = hq_env.prepare_model_for_quantization(model) + model = model.eval().to(device) + + self.enable_hpu_graph = ( + os.getenv("ENABLE_HPU_GRAPH", "true").lower() == "true" and LAZY_MODE == 1 + ) + self.limit_hpu_graph = os.getenv("LIMIT_HPU_GRAPH", "true").lower() == "true" + model = remove_kv_cache_from_output(model) + if self.enable_hpu_graph: + from habana_frameworks.torch.hpu import wrap_in_hpu_graph + + model = wrap_in_hpu_graph(model, disable_tensor_cache=True) + else: + if LAZY_MODE == 0: + # It is said that "keep_input_mutations" is safe for inference to be done + dbg_trace("TORCH COMPILE", "Torch compiling of model") + model.model = torch.compile( + model.model, + backend="hpu_backend", + options={"keep_input_mutations": True}, + ) + + model = hq_env.setup_quantization(model) + + if model.config.model_type not in MODELS_OPTIMIZED_WITH_STATIC_SHAPES: + raise ValueError(f"Model type {model.config.model_type} is not supported!") + + if tokenizer.pad_token_id is None: + if model.config.pad_token_id is not None: + tokenizer.pad_token_id = model.config.pad_token_id + elif model.config.eos_token_id is not None: + if isinstance(model.config.eos_token_id, int): + tokenizer.pad_token_id = 
model.config.eos_token_id + elif isinstance(model.config.eos_token_id, list): + tokenizer.pad_token_id = model.config.eos_token_id[0] + else: + raise ValueError( + f"{type(model.config.eos_token_id)} type of eos_token_id in the model's config is not supported for tokenizer.pad_token_id" + ) + elif tokenizer.eos_token_id is not None: + tokenizer.pad_token_id = tokenizer.eos_token_id + else: + tokenizer.add_special_tokens({"pad_token": "[PAD]"}) + + self.kwargs = { + "use_cache": True, + "return_dict": True, + } + + if model.config.model_type in ["llava_next"]: + self.kwargs["attn_softmax_bf16"] = True + self.kwargs["trim_logits"] = True + + if os.getenv("USE_FLASH_ATTENTION", "true").lower() == "true": + self.kwargs["use_flash_attention"] = True + if os.getenv("FLASH_ATTENTION_RECOMPUTE", "true").lower() == "true": + self.kwargs["flash_attention_recompute"] = True + + self.speculate = get_speculate() + super(VlmCausalLM, self).__init__( + model_id=model_id, + model=model, + tokenizer=tokenizer, + requires_padding=True, + dtype=dtype, + device=device, + rank=rank, + ) + + # Create profiler + ranks_to_profile = [int(val) for val in os.getenv("PROF_RANKS", "0").split(",")] + record_shapes = os.getenv("PROF_RECORD_SHAPES", "false").lower() == "true" + output_dir = os.getenv("PROF_PATH", "/tmp/hpu_profile") + self.profiling_warmup_steps = ( + int(os.getenv("PROF_WARMUPSTEP", "0")) if rank in ranks_to_profile else 0 + ) + self.profiling_steps = ( + int(os.getenv("PROF_STEP", "0")) if rank in ranks_to_profile else 0 + ) + self.profiling_wait_steps = int(os.getenv("PROF_WAITSTEP", "0")) + if self.profiling_steps > 0: + self.hb_profiler = HabanaProfile( + wait=self.profiling_wait_steps, + warmup=self.profiling_warmup_steps, + active=self.profiling_steps, + output_dir=output_dir, + record_shapes=record_shapes, + ) + self.hb_profiler.start() + else: + self.hb_profiler = None + self.step = 0 + + @property + def batch_type(self) -> Type[VlmCausalLMBatch]: + return self.batch_class + + def max_past(self) -> Optional[int]: + return getattr(self.model.text_model, "max_past", None) + + def get_deepspeed_model( + self, + model_class, + model_id: str, + dtype: torch.dtype, + revision: Optional[str] = None, + ) -> torch.nn.Module: + import deepspeed + from habana_frameworks.torch.distributed.hccl import initialize_distributed_hpu + + world_size, rank, local_rank = initialize_distributed_hpu() + model_kwargs = {"revision": revision} + + # Initialize process(es) for DeepSpeed + deepspeed.init_distributed(dist_backend="hccl") + logger.info( + "DeepSpeed is enabled. 
world_size {} rank {} local_rank {}".format( + world_size, rank, local_rank + ) + ) + config = AutoConfig.from_pretrained(model_id, **model_kwargs) + load_to_meta = model_on_meta(config) + + # Check support for rope scaling + if hasattr(config, "rope_scaling"): + config.rope_scaling = self.get_rope_scaling() + model_kwargs["rope_scaling"] = self.get_rope_scaling() + + if load_to_meta: + # Construct model with fake meta tensors, later will be replaced on devices during ds-inference ckpt load + with deepspeed.OnDevice(dtype=dtype, device="meta"): + model = model_class.from_config(config, torch_dtype=dtype) + else: + get_repo_root(model_id, local_rank=os.getenv("LOCAL_RANK")) + # TODO: revisit placement on CPU when auto-injection is possible + with deepspeed.OnDevice(dtype=dtype, device="cpu"): + model = model_class.from_pretrained( + model_id, torch_dtype=dtype, **model_kwargs + ) + model = model.eval() + + # Initialize the model + ds_inference_kwargs = {"dtype": dtype} + ds_inference_kwargs["tensor_parallel"] = {"tp_size": world_size} + ds_inference_kwargs["enable_cuda_graph"] = False + ds_inference_kwargs["injection_policy"] = get_ds_injection_policy( + model.language_model.config + ) + + if load_to_meta: + # model loaded to meta is managed differently + checkpoints_json = tempfile.NamedTemporaryFile(suffix=".json", mode="+w") + write_checkpoints_json(model_id, local_rank, checkpoints_json) + ds_inference_kwargs["checkpoint"] = checkpoints_json.name + model = deepspeed.init_inference(model, **ds_inference_kwargs) + + return model.module + + def get_rope_scaling(self) -> Optional[Dict]: + rope_scaling = os.getenv("ROPE_SCALING", None) + if rope_scaling is None: + return None + + rope_factor = float(os.getenv("ROPE_FACTOR", 1.0)) + return {"type": rope_scaling, "factor": float(rope_factor)} + + def decode(self, generated_ids: List[int]) -> str: + return self.tokenizer.decode( + generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False + ) + + def decode_token( + self, + all_input_ids: List[int], + prefix_offset: int = 0, + read_offset: int = 0, + ) -> Tuple[str, int, int]: + if is_tokenizer_transparent(self.tokenizer): + new_text = self.tokenizer.decode( + all_input_ids[read_offset:], skip_special_tokens=False + ) + return new_text, read_offset, len(all_input_ids) + else: + return super().decode_token(all_input_ids, prefix_offset, read_offset) + + def forward( + self, + input_ids, + attention_mask, + position_ids, + token_idx, + past_key_values: Optional[List[Tuple]] = None, + pixel_values: Optional[List[torch.Tensor]] = None, + image_sizes: Optional[List[Tuple[int, int]]] = None, + bypass_hpu_graph: Optional[bool] = None, + ) -> Tuple[torch.Tensor, List[Tuple[torch.Tensor, torch.Tensor]]]: + # Model Forward + kwargs = { + "input_ids": input_ids, + "attention_mask": attention_mask, + "past_key_values": past_key_values, + "token_idx": token_idx, + "pixel_values": pixel_values, + "image_sizes": image_sizes, + } + + hpu_kwargs = {} + # Optimum Habana got "lazy_mode" key-val only supported for llama type of models + if self.model.config.model_type == "llama": + hpu_kwargs["lazy_mode"] = LAZY_MODE == 1 + + if self.has_position_ids: + kwargs["position_ids"] = position_ids + + if bypass_hpu_graph is not None: + hpu_kwargs["bypass_hpu_graphs"] = bypass_hpu_graph + + kwargs.update(self.kwargs) + model_inputs = self.model.prepare_inputs_for_generation(**kwargs) + if past_key_values is not None: + return self.model.forward(**model_inputs, **hpu_kwargs) + else: + outputs = 
self.model.forward(**model_inputs, **hpu_kwargs) + return outputs.logits, outputs.past_key_values + + @tracer.start_as_current_span("generate_token") + def generate_token( + self, batches: List[VlmCausalLMBatch], is_warmup: bool = False + ) -> Tuple[List[Generation], Optional[CausalLMBatch], Tuple[int, int]]: + start = time.time_ns() + # Results + generations: List[Generation] = [] + prev_batches = [] + requests_to_generate = [] + # In order to pipeline any actions on CPU we perform the operation in 3 main stages: + # Stage 1. Collect next token ids of any previously started generations + for batch_id, batch in enumerate(batches): + if batch.logits is not None: + logits = batch.logits + past = batch.past + prefill = batch.past_key_values is None + if prefill: + # no right padding for prefill + token_idx_scalar = batch.attention_mask.shape[-1] - 1 + token_idx = torch.tensor(token_idx_scalar).to(self.device) + else: + token_idx_scalar = ( + batch.attention_mask.shape[-1] - batch.right_padding + ) + token_idx = torch.tensor(token_idx_scalar).to(self.device) + + # Select next token + input_length = batch.input_length + if logits.shape[-2] > 1: + next_token_ids, next_token_logprobs, logprobs, _, _ = ( + batch.next_token_chooser( + batch.input_ids, + logits[:, input_length - 1 : input_length, :].squeeze(-2), + self.speculate, + ) + ) + else: + next_token_ids, next_token_logprobs, logprobs, _, _ = ( + batch.next_token_chooser( + batch.input_ids, logits.squeeze(-2), self.speculate + ) + ) + # Speculation is not active for causal + accepted_ids = torch.ones_like(batch.input_ids)[:, 0] + batch_top_token_ids, batch_top_token_logprobs = batch_top_tokens( + batch.top_n_tokens, + batch.top_n_tokens_tensor, + logprobs, + accepted_ids, + ) + + prev_batches.append( + { + "next_token_ids": next_token_ids, + "next_token_logprobs": next_token_logprobs, + } + ) + + for req_idx, req in enumerate(batch.requests): + requests_to_generate.append( + { + "req": req, + "prev_req_idx": req.idx, + "batch_id": batch_id, + "seed": batch.next_token_chooser.seeds[req_idx], + "do_sample": batch.next_token_chooser.do_sample[req_idx], + "top_n_tokens": batch.top_n_tokens[req_idx], + "top_token_ids": batch_top_token_ids[req_idx], + "top_token_logprobs": batch_top_token_logprobs[req_idx], + "grammar_state": batch.next_token_chooser.fsm_grammar_states[ + req.idx + ], + } + ) + + htorch.core.mark_step() + + # Add new token into input_ids + batch.input_ids.index_copy_(1, token_idx, next_token_ids.unsqueeze(1)) + + # Update attention_mask as we added a new token to input_ids + batch.attention_mask.index_fill_(1, token_idx, 1) + + # Adjust lengths + batch.input_length += 1 + + # Update position_ids + if prefill: + batch.position_ids = ( + torch.index_select(batch.position_ids, 1, token_idx - 1) + 1 + ) + else: + batch.position_ids += 1 + # Update past key values + if prefill: + batch.past_key_values = past + + htorch.core.mark_step() + + # Stage 2. 
Prepare new batch for speculative scheduling + if len(batches) > 1: + batch = self.batch_type.concatenate( + batches, self.tokenizer.pad_token_id, is_warmup + ) + else: + batch = batches[0] + + prefill = batch.past_key_values is None + + # Check if we need to do any bookkeeping first + if not prefill: + batch = batch.__class__.recombine( + [batch], self.tokenizer.pad_token_id, is_warmup + ) + + scenario = "PREFILL" if prefill else "GENERATE" + if ( + self.enable_hpu_graph + and self.limit_hpu_graph + and round_up(DECODE_WARMUP_BATCH_SIZE_LIST, batch.batch_size) + != self.prev_bs + ): + self.model.clear_cache() + self.prev_bs = round_up(DECODE_WARMUP_BATCH_SIZE_LIST, batch.batch_size) + dbg_trace( + scenario, + f"bs:{batch.batch_size} num_reqs:{len(batch.requests)} seq_len:{batch.seq_length} padding:{batch.right_padding}", + ) + # assert batch.right_padding > 0, 'No more room for next token!' + + # Execute batch + if prefill: + # no right padding for prefill + token_idx = torch.tensor(batch.attention_mask.shape[-1] - 1).to(self.device) + batch.logits, batch.past = self.forward( + batch.input_ids, + batch.attention_mask, + batch.position_ids, + token_idx, + batch.past_key_values, + batch.pixel_values, + batch.image_sizes, + bypass_hpu_graph=( + prefill and self.limit_hpu_graph if self.enable_hpu_graph else None + ), + ) + elif all([req.stopping_criteria.max_new_tokens == 1 for req in batch.requests]): + # Don't schedule next forward if max_new_tokens for all requests equals 1 + # - we've already generated the first and only needed token in the prefill phase + pass + else: + token_idx = torch.tensor( + batch.attention_mask.shape[-1] - batch.right_padding + ).to(self.device) + batch.logits = self.forward( + batch.input_ids, + batch.attention_mask, + batch.position_ids, + token_idx, + batch.past_key_values, + bypass_hpu_graph=( + prefill and self.limit_hpu_graph if self.enable_hpu_graph else None + ), + ) + + htorch.core.mark_step() + + start_decode = time.time_ns() + + # Stage 3. 
Finish and return previous generations + stopped = len(requests_to_generate) > 0 + for prev_batch in prev_batches: + prev_batch["next_token_logprobs"] = prev_batch[ + "next_token_logprobs" + ].tolist() + prev_batch["next_token_ids_cpu"] = prev_batch["next_token_ids"].cpu() + htorch.core.mark_step() + + for req_data in requests_to_generate: + req = req_data["req"] + i = req_data["prev_req_idx"] + prev_batch_id = req_data["batch_id"] + assert len(prev_batches) > prev_batch_id + next_token_ids_cpu = prev_batches[prev_batch_id]["next_token_ids_cpu"] + next_token_logprobs = prev_batches[prev_batch_id]["next_token_logprobs"] + + request = req.data + input_length = req.input_length + prefix_offset = req.prefix_offset + read_offset = req.read_offset + do_sample = req_data["do_sample"] + seed = req_data["seed"] + stopping_criteria = req.stopping_criteria + all_input_ids = req.all_input_ids + next_token_id = next_token_ids_cpu[i] + next_token_logprob = next_token_logprobs[i] + top_n_tokens = req_data["top_n_tokens"] + top_token_ids = req_data["top_token_ids"] + top_token_logprobs = req_data["top_token_logprobs"] + grammar_state = req_data["grammar_state"] + + # Append next token to all tokens + all_input_ids[input_length] = next_token_id + new_input_length = input_length + 1 + + # Generated token + if ( + is_tokenizer_transparent(self.tokenizer) + and len(stopping_criteria.stop_sequence_criterias) == 0 + ): + next_token_text = "" + else: + next_token_text, prefix_offset, read_offset = self.decode_token( + all_input_ids[0:new_input_length, 0], prefix_offset, read_offset + ) + + # Evaluate stopping criteria + stop, reason = stopping_criteria( + next_token_id, + next_token_text, + ) + + if not stop: + stopped = False + + # Shard generations + # All generations will be appended in the rust sharded client + if i % self.world_size == self.rank: + if stop: + # Decode generated tokens + if is_tokenizer_transparent(self.tokenizer): + output_text = None + else: + output_text = self.decode( + all_input_ids[ + new_input_length + - stopping_criteria.current_tokens : new_input_length, + 0, + ] + ) + generated_text = GeneratedText( + output_text, + stopping_criteria.current_tokens, + reason, + seed if do_sample else None, + ) + else: + generated_text = None + + # Prefill + if stopping_criteria.current_tokens == 1 and request.prefill_logprobs: + # Remove generated token to only have prefill and add nan for first prompt token + prefill_logprobs = [float("nan")] + next_token_logprobs + prefill_token_ids = all_input_ids[0 : new_input_length - 1] + prefill_texts = self.tokenizer.batch_decode( + prefill_token_ids, + clean_up_tokenization_spaces=False, + skip_special_tokens=False, + ) + prefill_tokens = Tokens( + prefill_token_ids, + prefill_logprobs, + prefill_texts, + is_special=[], + ) + else: + prefill_tokens = None + + if top_n_tokens > 0: + all_top_tokens = [] + for top_token_ids, top_token_logprobs in zip( + top_token_ids, top_token_logprobs + ): + toptoken_texts = self.tokenizer.batch_decode( + top_token_ids, + clean_up_tokenization_spaces=False, + skip_special_tokens=False, + ) + special_toptokens = [ + token_id in self.all_special_ids + for token_id in top_token_ids + ] + top_tokens = Tokens( + top_token_ids, + top_token_logprobs, + toptoken_texts, + special_toptokens, + ) + all_top_tokens.append(top_tokens) + top_tokens = all_top_tokens + else: + top_tokens = None + + generation = Generation( + request.id, + prefill_tokens, + Tokens( + [next_token_id], + [next_token_logprob], + [next_token_text], + 
[next_token_id in self.all_special_ids], + ), + generated_text, + top_tokens, + ) + + generations.append(generation) + + batch.next_token_chooser = ( + batch.next_token_chooser.advance_grammar_single_with_past_state( + req.idx, next_token_id, grammar_state + ) + ) + + req.all_input_ids = all_input_ids + req.input_length = new_input_length + req.prefix_offset = prefix_offset + req.read_offset = read_offset + + htorch.core.mark_step() + self.step = self.step + 1 + if self.hb_profiler is not None: + if ( + self.step + > self.profiling_wait_steps + + self.profiling_warmup_steps + + self.profiling_steps + ): + self.hb_profiler.stop() + else: + self.hb_profiler.step() + + forward_ns = start_decode - start + decode_ns = time.time_ns() - start_decode + return generations, batch if not stopped else None, (forward_ns, decode_ns) + + def batch_from_pb(self, batch, is_warmup): + return VlmCausalLMBatch.from_pb_processor( + batch, + self.tokenizer, + self.processor, + self.model.config, + self.dtype, + self.device, + is_warmup, + ) + + def generate_warmup_batch(self, request, seq_len, batch_size, is_warmup): + batch = copy.deepcopy(request.batch) + for req in batch.requests: + req.truncate = seq_len + + for i in range(len(batch.requests) - batch_size): + batch.requests.pop() + + return self.batch_from_pb(batch, is_warmup) + + def warmup(self, request) -> None: + is_warmup = True + batch = self.batch_from_pb(request.batch, is_warmup) + + try: + # max prefill batch size warmup + _, prefill_batch, _ = self.generate_token([batch], is_warmup) + except Exception: + raise RuntimeError( + f"Not enough memory to handle {len(batch.input_ids)} prefill tokens. " + f"You need to decrease `--max-batch-prefill-tokens`" + ) + + global BASE_IMAGE_TOKENS, MAX_TOTAL_TOKENS, MAX_BATCH_TOTAL_TOKENS, PREFILL_WARMUP_BATCH_SIZE_LIST, PREFILL_WARMUP_SEQLEN_LIST, DECODE_WARMUP_BATCH_SIZE_LIST + max_input_length = batch.input_ids.shape[1] + max_prefill_batch_size = batch.input_ids.shape[0] + PREFILL_WARMUP_BATCH_SIZE_LIST = [] + batch_size = 1 + while batch_size <= max_prefill_batch_size: + PREFILL_WARMUP_BATCH_SIZE_LIST.append(batch_size) + batch_size = batch_size * 2 + if PREFILL_WARMUP_BATCH_SIZE_LIST[-1] < max_prefill_batch_size: + PREFILL_WARMUP_BATCH_SIZE_LIST.append(max_prefill_batch_size) + + seq_len = BASE_IMAGE_TOKENS + PREFILL_WARMUP_SEQLEN_LIST = [] + i = 0 + while seq_len <= max_input_length: + PREFILL_WARMUP_SEQLEN_LIST.append(seq_len) + seq_len += PAD_SEQUENCE_TO_MULTIPLE_OF * (2**i) + i += 1 + if PREFILL_WARMUP_SEQLEN_LIST[-1] < max_input_length: + PREFILL_WARMUP_SEQLEN_LIST.append(max_input_length) + + # Prefill and decode warmup + DECODE_WARMUP_BATCH_SIZE_LIST = [] + prefill_batch = None + decode_batch = None + try: + for batch_size in PREFILL_WARMUP_BATCH_SIZE_LIST: + for seq_len in PREFILL_WARMUP_SEQLEN_LIST: + batch = self.generate_warmup_batch( + request, seq_len, batch_size, is_warmup + ) + _, prefill_batch, _ = self.generate_token([batch], is_warmup) + _, decode_batch, _ = self.generate_token([prefill_batch], is_warmup) + + DECODE_WARMUP_BATCH_SIZE_LIST.append(batch_size) + + except Exception: + raise RuntimeError( + f"Not enough memory to handle following prefill and decode warmup." 
+ f"Prefill batch size list:{PREFILL_WARMUP_BATCH_SIZE_LIST}" + f"Prefill sequence length list:{PREFILL_WARMUP_SEQLEN_LIST}" + f"Decode batch size list:{DECODE_WARMUP_BATCH_SIZE_LIST}" + f"You need to decrease `--max-batch-prefill-tokens`" + ) + + mem_stats = get_hpu_memory_stats(self.device) + logger.info( + f"\nFollowing prefill and decode warmup successfully.\n" + f"Prefill batch size list:{PREFILL_WARMUP_BATCH_SIZE_LIST}\n" + f"Prefill sequence length list:{PREFILL_WARMUP_SEQLEN_LIST}\n" + f"Decode batch size list:{DECODE_WARMUP_BATCH_SIZE_LIST}\n" + f"Memory stats: {mem_stats} " + ) + + max_decode_batch_size = math.floor(MAX_BATCH_TOTAL_TOKENS / MAX_TOTAL_TOKENS) + batch_size = max_prefill_batch_size * 2 + # Decode warmup with bigger batch_size + try: + if ( + DECODE_WARMUP_BATCH_SIZE_LIST[-1] < max_decode_batch_size + and batch_size <= max_decode_batch_size + ): + batches = [] + for i in range(int(batch_size / max_prefill_batch_size)): + batch = self.generate_warmup_batch( + request, + PREFILL_WARMUP_SEQLEN_LIST[0], + DECODE_WARMUP_BATCH_SIZE_LIST[-1], + is_warmup, + ) + _, prefill_batch, _ = self.generate_token([batch], is_warmup) + batches.append(prefill_batch) + while batch_size <= max_decode_batch_size: + _, decode_batch, _ = self.generate_token(batches, is_warmup) + DECODE_WARMUP_BATCH_SIZE_LIST.append(batch_size) + batch_size = batch_size * 2 + batches.clear() + + for i in range(int(batch_size / max_prefill_batch_size)): + batch = self.generate_warmup_batch( + request, + PREFILL_WARMUP_SEQLEN_LIST[0], + DECODE_WARMUP_BATCH_SIZE_LIST[-1], + is_warmup, + ) + _, prefill_batch, _ = self.generate_token([batch], is_warmup) + batches.append(prefill_batch) + + batches.clear() + if DECODE_WARMUP_BATCH_SIZE_LIST[-1] < max_decode_batch_size: + max_decode_batch_size = math.floor(max_decode_batch_size / 2) * 2 + batch_size = max_decode_batch_size + for i in range(int(max_decode_batch_size / 2)): + batch = self.generate_warmup_batch( + request, PREFILL_WARMUP_SEQLEN_LIST[0], 2, is_warmup + ) + _, prefill_batch, _ = self.generate_token([batch], is_warmup) + batches.append(prefill_batch) + _, decode_batch, _ = self.generate_token(batches, is_warmup) + DECODE_WARMUP_BATCH_SIZE_LIST.append(max_decode_batch_size) + max_batch_total_tokens = max_decode_batch_size * MAX_TOTAL_TOKENS + MAX_BATCH_TOTAL_TOKENS = max_batch_total_tokens + except Exception: + raise RuntimeError( + f"Not enough memory to handle batch_size({batch_size}) decode warmup." + f"Decode batch size list:{DECODE_WARMUP_BATCH_SIZE_LIST}" + f"max_decode_batch_size is {max_decode_batch_size}" + f"You need to decrease env `MAX_BATCH_TOTAL_TOKENS` or '--max_batch_total_tokens'" + ) + + mem_stats = get_hpu_memory_stats(self.device) + logger.info( + f"\nFollowing decode warmup successfully.\n" + f"Decode batch size list:{DECODE_WARMUP_BATCH_SIZE_LIST}\n" + f"Memory stats: {mem_stats}" + ) + + return MAX_BATCH_TOTAL_TOKENS diff --git a/backends/gaudi/server/text_generation_server/pb/.gitignore b/backends/gaudi/server/text_generation_server/pb/.gitignore new file mode 100644 index 000000000..5a68d6313 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/pb/.gitignore @@ -0,0 +1,3 @@ +*.py +*.pyi +*.py-e diff --git a/backends/gaudi/server/text_generation_server/server.py b/backends/gaudi/server/text_generation_server/server.py new file mode 100644 index 000000000..674a8aed1 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/server.py @@ -0,0 +1,290 @@ +# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company. 
+ +import asyncio +import os +import torch +import time +import signal + +from grpc import aio +from loguru import logger + +from grpc_reflection.v1alpha import reflection +from pathlib import Path +from typing import List, Optional + +from text_generation_server.cache import Cache +from text_generation_server.interceptor import ExceptionInterceptor +from text_generation_server.models import Model, get_model_with_lora_adapters +from text_generation_server.pb import generate_pb2_grpc, generate_pb2 +from text_generation_server.tracing import UDSOpenTelemetryAioServerInterceptor +from text_generation_server.models.globals import set_model_id +from text_generation_server.models.globals import set_adapter_to_index +from text_generation_server.utils.adapter import AdapterInfo +from text_generation_server.utils.tokens import make_tokenizer_optional + +try: + from text_generation_server.models.pali_gemma import PaliGemmaBatch + from text_generation_server.models.vlm_causal_lm import ( + VlmCausalLMBatch, + ) + from text_generation_server.models.idefics_causal_lm import IdeficsCausalLMBatch + + VLM_BATCH_TYPES = { + PaliGemmaBatch, + VlmCausalLMBatch, + IdeficsCausalLMBatch, + } +except (ImportError, NotImplementedError): + # These imports can fail on CPU/Non flash. + VLM_BATCH_TYPES = set() +from text_generation_server.utils.version import ( + is_driver_compatible, + MIN_TGI_GAUDI_SYNAPSE_VERSION, +) + + +class SignalHandler: + KEEP_PROCESSING = True + + def __init__(self): + signal.signal(signal.SIGINT, self.exit_gracefully) + signal.signal(signal.SIGTERM, self.exit_gracefully) + + def exit_gracefully(self, signum, frame): + print(f"Exiting gracefully: Signal {signum}") + self.KEEP_PROCESSING = False + + +class TextGenerationService(generate_pb2_grpc.TextGenerationServiceServicer): + def __init__( + self, + model: Model, + cache: Cache, + server_urls: List[str], + ): + self.cache = cache + self.model = model + # Quantize is resolved during model loading + self.quantize = model.quantize + self.server_urls = server_urls + # For some reason, inference_mode does not work well with GLOO which we use on CPU + # TODO: The inferecemode set messes up the autograd op dispatch. And results in aten::matmul + # op not optimized issue. Will investigate further. 
+ # if model.device.type == "hpu": + # Force inference mode for the lifetime of TextGenerationService + # self._inference_mode_raii_guard = torch._C._InferenceMode(True) + + async def Info(self, request, context): + return self.model.info + + async def Health(self, request, context): + if self.model.device.type == "hpu": + torch.zeros((2, 2)).to("hpu") + return generate_pb2.HealthResponse() + + async def ServiceDiscovery(self, request, context): + return generate_pb2.ServiceDiscoveryResponse(urls=self.server_urls) + + async def ClearCache(self, request, context): + if request.HasField("id"): + self.cache.delete(request.id) + else: + self.cache.clear() + return generate_pb2.ClearCacheResponse() + + async def FilterBatch(self, request, context): + batch = self.cache.pop(request.batch_id) + if batch is None: + raise ValueError(f"Batch ID {request.batch_id} not found in cache.") + filtered_batch = batch.filter(request.request_ids) + self.cache.set(filtered_batch) + + return generate_pb2.FilterBatchResponse(batch=filtered_batch.to_pb()) + + async def Warmup(self, request, context): + max_supported_total_tokens, max_input_tokens, max_total_tokens = ( + self.model.warmup(request) + ) + + # W/A for the skip tokenizer path + # We need to call make_tokenizer_optional after the warmup, + # because router is not aware of that feature + make_tokenizer_optional(self.model.tokenizer) + + return generate_pb2.WarmupResponse( + max_supported_total_tokens=max_supported_total_tokens, + max_input_tokens=max_input_tokens, + max_total_tokens=max_total_tokens, + ) + + async def Prefill(self, request, context): + start = time.time_ns() + if ( + self.model.batch_type in VLM_BATCH_TYPES + ): # Hack, i would rather use kwargs in the `from_pb` call + batch = self.model.batch_type.from_pb_processor( + request.batch, + self.model.tokenizer, + self.model.processor, + self.model.model.config, + self.model.dtype, + self.model.device, + ) + else: + batch = self.model.batch_type.from_pb( + request.batch, self.model.tokenizer, self.model.dtype, self.model.device + ) + + generations, next_batch, timings = self.model.generate_token([batch]) + self.cache.set(next_batch) + + return generate_pb2.PrefillResponse( + generations=[generation.to_pb() for generation in generations], + batch=next_batch.to_pb() if next_batch else None, + forward_ns=timings[0], + decode_ns=timings[1], + total_ns=time.time_ns() - start, + ) + + async def Decode(self, request, context): + start = time.time_ns() + if len(request.batches) == 0: + raise ValueError("Must provide at least one batch") + + batches = [] + for batch_pb in request.batches: + batch = self.cache.pop(batch_pb.id) + if batch is None: + raise ValueError(f"Batch ID {batch_pb.id} not found in cache.") + batches.append(batch) + + if len(batches) == 0: + raise ValueError("All batches are empty") + + generations, next_batch, timings = self.model.generate_token(batches) + self.cache.set(next_batch) + + return generate_pb2.DecodeResponse( + generations=[generation.to_pb() for generation in generations], + batch=next_batch.to_pb() if next_batch else None, + concat_ns=None, + forward_ns=timings[0], + decode_ns=timings[1], + total_ns=time.time_ns() - start, + ) + + +def serve( + model_id: str, + lora_adapters: Optional[List[AdapterInfo]], + revision: Optional[str], + sharded: bool, + quantize: Optional[str], + speculate: Optional[int], + dtype: Optional[str], + trust_remote_code: bool, + uds_path: Path, + max_input_tokens: int, +): + async def serve_inner( + model_id: str, + lora_adapters: 
Optional[List[AdapterInfo]], + revision: Optional[str], + sharded: bool = False, + quantize: Optional[str] = None, + speculate: Optional[int] = None, + dtype: Optional[str] = None, + trust_remote_code: bool = False, + ): + if not is_driver_compatible(): + logger.warning( + f"Current Synapse version is lower than the minimum version supported: {MIN_TGI_GAUDI_SYNAPSE_VERSION}, this could result in failures" + ) + + unix_socket_template = "unix://{}-{}" + adapter_to_index = {} + logger.info("Server:server_inner: sharded ={}".format(sharded)) + + if sharded: + rank = int(os.environ["RANK"]) + logger.info("Server:server_inner: rank ={}".format(rank)) + server_urls = [ + unix_socket_template.format(uds_path, rank) + for rank in range(int(os.environ["WORLD_SIZE"])) + ] + local_url = server_urls[int(os.environ["RANK"])] + else: + local_url = unix_socket_template.format(uds_path, 0) + server_urls = [local_url] + + logger.info( + "Server:server_inner: data type = {}, local_url = {}".format( + dtype, local_url + ) + ) + if dtype == "bfloat16" or None: + data_type = torch.bfloat16 + else: + data_type = torch.float + if revision == "None": + revision = None + try: + model = get_model_with_lora_adapters( + model_id, + lora_adapters, + revision, + sharded, + quantize, + speculate, + data_type, + trust_remote_code, + max_input_tokens, + adapter_to_index, + ) + + except Exception: + logger.exception("Error when initializing model") + raise + + set_adapter_to_index(adapter_to_index) + server = aio.server( + interceptors=[ + ExceptionInterceptor(), + UDSOpenTelemetryAioServerInterceptor(), + ], + options=[ + # Set the maximum possible message length: i32::MAX + ("grpc.max_receive_message_length", (1 << 31) - 1) + ], + ) + generate_pb2_grpc.add_TextGenerationServiceServicer_to_server( + TextGenerationService(model, Cache(), server_urls), server + ) + SERVICE_NAMES = ( + generate_pb2.DESCRIPTOR.services_by_name["TextGenerationService"].full_name, + reflection.SERVICE_NAME, + ) + reflection.enable_server_reflection(SERVICE_NAMES, server) + server.add_insecure_port(local_url) + + await server.start() + + logger.info("Server started at {}".format(local_url)) + signal_handler = SignalHandler() + while signal_handler.KEEP_PROCESSING: + await asyncio.sleep(0.5) + + set_model_id(model_id) + asyncio.run( + serve_inner( + model_id, + lora_adapters, + revision, + sharded, + quantize, + speculate, + dtype, + trust_remote_code, + ) + ) diff --git a/backends/gaudi/server/text_generation_server/tgi_service.py b/backends/gaudi/server/text_generation_server/tgi_service.py new file mode 100644 index 000000000..18e88a7eb --- /dev/null +++ b/backends/gaudi/server/text_generation_server/tgi_service.py @@ -0,0 +1,49 @@ +import os +from pathlib import Path +from loguru import logger +from text_generation_server import server +import argparse +from text_generation_server.utils.adapter import parse_lora_adapters + + +def main(args): + logger.info("TGIService: starting tgi service .... 
") + logger.info( + "TGIService: --model_id {}, --revision {}, --sharded {}, --speculate {}, --dtype {}, --trust_remote_code {}, --uds_path {} ".format( + args.model_id, + args.revision, + args.sharded, + args.speculate, + args.dtype, + args.trust_remote_code, + args.uds_path, + ) + ) + lora_adapters = parse_lora_adapters(os.getenv("LORA_ADAPTERS")) + server.serve( + model_id=args.model_id, + lora_adapters=lora_adapters, + revision=args.revision, + sharded=args.sharded, + quantize=args.quantize, + speculate=args.speculate, + dtype=args.dtype, + trust_remote_code=args.trust_remote_code, + uds_path=args.uds_path, + max_input_tokens=args.max_input_tokens, + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--model_id", type=str) + parser.add_argument("--revision", type=str) + parser.add_argument("--sharded", type=bool) + parser.add_argument("--speculate", type=int, default=None) + parser.add_argument("--dtype", type=str) + parser.add_argument("--trust_remote_code", type=bool) + parser.add_argument("--uds_path", type=Path) + parser.add_argument("--quantize", type=str) + parser.add_argument("--max_input_tokens", type=int) + args = parser.parse_args() + main(args) diff --git a/backends/gaudi/server/text_generation_server/tracing.py b/backends/gaudi/server/text_generation_server/tracing.py new file mode 100644 index 000000000..bc7a04ee7 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/tracing.py @@ -0,0 +1,63 @@ +import grpc + +from opentelemetry import trace +from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter +from opentelemetry.instrumentation.grpc._aio_server import ( + OpenTelemetryAioServerInterceptor, +) +from opentelemetry.semconv.trace import SpanAttributes +from opentelemetry.sdk.resources import Resource +from opentelemetry.sdk.trace import TracerProvider +from opentelemetry.sdk.trace.export import ( + BatchSpanProcessor, +) + + +class UDSOpenTelemetryAioServerInterceptor(OpenTelemetryAioServerInterceptor): + def __init__(self): + super().__init__(trace.get_tracer(__name__)) + + def _start_span(self, handler_call_details, context, set_status_on_exception=False): + """ + Rewrite _start_span method to support Unix Domain Socket gRPC contexts + """ + + # standard attributes + attributes = { + SpanAttributes.RPC_SYSTEM: "grpc", + SpanAttributes.RPC_GRPC_STATUS_CODE: grpc.StatusCode.OK.value[0], + } + + # if we have details about the call, split into service and method + if handler_call_details.method: + service, method = handler_call_details.method.lstrip("/").split("/", 1) + attributes.update( + { + SpanAttributes.RPC_METHOD: method, + SpanAttributes.RPC_SERVICE: service, + } + ) + + # add some attributes from the metadata + metadata = dict(context.invocation_metadata()) + if "user-agent" in metadata: + attributes["rpc.user_agent"] = metadata["user-agent"] + + # We use gRPC over a UNIX socket + attributes.update({SpanAttributes.NET_TRANSPORT: "unix"}) + + return self._tracer.start_as_current_span( + name=handler_call_details.method, + kind=trace.SpanKind.SERVER, + attributes=attributes, + set_status_on_exception=set_status_on_exception, + ) + + +def setup_tracing(otlp_service_name: str, otlp_endpoint: str): + resource = Resource.create(attributes={"service.name": otlp_service_name}) + span_exporter = OTLPSpanExporter(endpoint=otlp_endpoint, insecure=True) + span_processor = BatchSpanProcessor(span_exporter) + + trace.set_tracer_provider(TracerProvider(resource=resource)) + 
trace.get_tracer_provider().add_span_processor(span_processor) diff --git a/backends/gaudi/server/text_generation_server/utils/__init__.py b/backends/gaudi/server/text_generation_server/utils/__init__.py new file mode 100644 index 000000000..cda3a4da1 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/utils/__init__.py @@ -0,0 +1,50 @@ +# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company. + +from text_generation_server.utils.convert import convert_file, convert_files +from text_generation_server.utils.dist import initialize_torch_distributed +from text_generation_server.utils.weights import Weights +from text_generation_server.utils.peft import download_and_unload_peft +from text_generation_server.utils.hub import ( + weight_files, + weight_hub_files, + download_weights, + EntryNotFoundError, + LocalEntryNotFoundError, + RevisionNotFoundError, +) +from text_generation_server.utils.tokens import ( + NextTokenChooser, + HeterogeneousNextTokenChooser, + StoppingCriteria, + StopSequenceCriteria, + FinishReason, + Sampling, + Greedy, + make_tokenizer_optional, + is_tokenizer_transparent, + pad_next_token_chooser_parameters, +) + +__all__ = [ + "convert_file", + "convert_files", + "initialize_torch_distributed", + "weight_files", + "weight_hub_files", + "download_weights", + "download_and_unload_peft", + "EntryNotFoundError", + "HeterogeneousNextTokenChooser", + "LocalEntryNotFoundError", + "RevisionNotFoundError", + "Greedy", + "NextTokenChooser", + "Sampling", + "StoppingCriteria", + "StopSequenceCriteria", + "FinishReason", + "Weights", + "make_tokenizer_optional", + "is_tokenizer_transparent", + "pad_next_token_chooser_parameters", +] diff --git a/backends/gaudi/server/text_generation_server/utils/adapter.py b/backends/gaudi/server/text_generation_server/utils/adapter.py new file mode 100644 index 000000000..2b61f9bb4 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/utils/adapter.py @@ -0,0 +1,320 @@ +# Origin: https://github.com/predibase/lorax +# Path: lorax/server/lorax_server/utils/adapter.py +# License: Apache License Version 2.0, January 2004 + +import warnings +import re +from dataclasses import dataclass +from functools import lru_cache +from typing import TYPE_CHECKING, Set, Tuple, Optional, List + +from safetensors.torch import load_file +from transformers import AutoConfig, AutoTokenizer, PreTrainedTokenizer + +from text_generation_server.utils.merges.strategies import merge_adapters + +from text_generation_server.utils import hub +from text_generation_server.adapters.lora import LoraConfig + + +if TYPE_CHECKING: + from text_generation_server.adapters.config import AdapterConfig, ModuleMap + + +BASE_MODEL_ADAPTER_ID = "__base_model__" + + +@dataclass +class AdapterInfo: + id: str + path: Optional[str] + revision: Optional[str] = None + + +@dataclass +class AdapterParameters: + adapter_info: Tuple[AdapterInfo] + weights: Tuple[float] + merge_strategy: NotImplemented + density: float + majority_sign_method: NotImplemented + + +@dataclass +class AdapterSource: + adapter_id: str + model_id: str + revision: str + + +def parse_lora_adapters(lora_adapters: Optional[str]) -> List[AdapterInfo]: + if not lora_adapters: + return [] + + adapter_list = [] + for adapter in lora_adapters.split(","): + adapter = adapter.strip() + if adapter.count("=") > 1 or adapter.count("@") > 1: + raise ValueError(f"Invalid LoRA adapter format: {adapter}") + match = re.match(r"^([^=@]+)(?:=([^@]+))?(?:@(.+))?$", adapter) + + if match: + adapter_id, path, revision = 
match.groups() + adapter_list.append( + AdapterInfo(id=adapter_id, path=path, revision=revision) + ) + else: + raise ValueError(f"Invalid LoRA adapter format: {adapter}") + return adapter_list + + +def load_and_merge_adapters( + model_id: str, + adapter_parameters: AdapterParameters, + adapter_index: int, + weight_names: Tuple[str], + trust_remote_code: bool = False, +) -> Tuple["ModuleMap", "AdapterConfig", Set[str], PreTrainedTokenizer]: + if len(adapter_parameters.adapter_info) == 1: + adapter = next(iter(adapter_parameters.adapter_info)) + return load_module_map( + model_id, + adapter.revision, + adapter.id, + adapter.path, + weight_names, + trust_remote_code, + ) + + adapter_params = AdapterParametersContainer(adapter_parameters, adapter_index) + return _load_and_merge( + model_id, + adapter_params, + weight_names, + trust_remote_code, + ) + + +@dataclass +class AdapterParametersContainer: + adapter_parameters: AdapterParameters + adapter_index: int + + def __hash__(self) -> int: + return self.adapter_index + + +@lru_cache(maxsize=32) +def _load_and_merge( + model_id: str, + adapter_params: AdapterParametersContainer, + weight_names: Tuple[str], + trust_remote_code: bool = False, +) -> Tuple["ModuleMap", "AdapterConfig", Set[str], PreTrainedTokenizer]: + params = adapter_params.adapter_parameters + + adapters_to_merge = [] + merged_weight_names = set() + tokenizer = None + for adapter in params.adapter_info: + if adapter.id == BASE_MODEL_ADAPTER_ID: + raise ValueError("Base model adapter cannot be merged.") + + module_map, adapter_config, adapter_weight_names, adapter_tokenizer = ( + load_module_map( + model_id, + adapter.revision, + adapter.id, + adapter.path, + weight_names, + trust_remote_code, + ) + ) + + adapters_to_merge.append((module_map, adapter_config)) + merged_weight_names = merged_weight_names.union(adapter_weight_names) + if tokenizer is None: + tokenizer = adapter_tokenizer + + if len(adapters_to_merge) == 0: + raise ValueError("No adapters to merge.") + + module_map, adapter_config = merge_adapters(adapters_to_merge, params) + return module_map, adapter_config, merged_weight_names, tokenizer + + +def check_architectures( + model_id: str, + adapter_id: str, + adapter_config: "AdapterConfig", + trust_remote_code: bool = False, +): + try: + if not adapter_config.base_model_name_or_path: + # Avoid execution latency caused by the network connection retrying for AutoConfig.from_pretrained(None) + return + + expected_config = AutoConfig.from_pretrained( + model_id, trust_remote_code=trust_remote_code + ) + model_config = AutoConfig.from_pretrained( + adapter_config.base_model_name_or_path, trust_remote_code=trust_remote_code + ) + except Exception as e: + warnings.warn( + f"Unable to check architecture compatibility for adapter '{adapter_id}' " + f"against model '{model_id}'. Assuming they are compatible. Error: {e}" + ) + return + + if model_config.architectures == expected_config.architectures: + warnings.warn( + f"Adapter '{adapter_id}' was not trained on base model '{model_id}'. " + f"If you encounter issues, use --model-id '{adapter_config.base_model_name_or_path}' instead." + ) + else: + # TODO(travis): revisit this when we support clasification heads which will not use CausalLM + raise ValueError( + f"Adapter '{adapter_id}' is not compatible with model '{model_id}'. " + f"Architectures differ: {model_config.architectures} != {expected_config.architectures}. " + f"Use --model-id '{adapter_config.base_model_name_or_path}' instead." 
+ ) + + +@lru_cache(maxsize=128) +def load_module_map( + model_id: str, + revision: str, + adapter_id: str, + adapter_path: Optional[str], + weight_names: Tuple[str], + trust_remote_code: bool = False, +) -> Tuple["ModuleMap", "AdapterConfig", Set[str], PreTrainedTokenizer]: + adapter_config = LoraConfig.load(adapter_path or adapter_id, None) + + if not adapter_path and adapter_config.base_model_name_or_path != model_id: + check_architectures(model_id, adapter_id, adapter_config, trust_remote_code) + + adapter_filenames = ( + hub._weight_files_from_dir(adapter_path, extension=".safetensors") + if adapter_path + else hub._cached_weight_files( + adapter_id, revision=revision, extension=".safetensors" + ) + ) + + # throw an error if no adapter weights are found + if not adapter_filenames: + raise FileNotFoundError( + f"No adapter weights found for adapter '{adapter_id}' and revision '{revision}'." + ) + + try: + adapter_tokenizer = AutoTokenizer.from_pretrained( + adapter_config.config_path, + trust_remote_code=trust_remote_code, + ) + except Exception: + # Adapter does not have a tokenizer, so fallback to base model tokenizer + adapter_tokenizer = None + + # load adapter weights from all shards (should have relatively small memory footprint) + adapter_weights = {} + for filename in adapter_filenames: + adapter_weights.update(load_file(filename)) + + # map the model weights to the relevant adapter weights (LoRA A and B matrices) + module_map, adapter_weight_names = adapter_config.map_weights_for_model( + adapter_weights, weight_names + ) + return module_map, adapter_config, adapter_weight_names, adapter_tokenizer + + +def get_attn_weights(i, layer): + qkv = layer.self_attn.query_key_value + weights = {} + + for k in ["q", "k", "v"]: + key = (i, f"{k}_proj") + value = (f"model.layers.{i}.self_attn.{k}_proj", qkv) + weights[key] = value + + # also add the qkv_proj weight for the adapter + weights[(i, "qkv_proj")] = ( + f"model.layers.{i}.self_attn.qkv_proj", + qkv, + ) + + weights[(i, "o_proj")] = ( + f"model.layers.{i}.self_attn.o_proj", + layer.self_attn.o_proj, + ) + + return weights + + +def get_mlp_weights(i, layer): + weights = {} + if hasattr(layer, "mlp"): + mlp = layer.mlp + if hasattr(mlp, "gate_up_proj"): + # handle combined gate_up_proj (e.g., for some LLaMA variants) + weights.update( + { + (i, "gate_proj"): ( + f"model.layers.{i}.mlp.gate_proj", + mlp.gate_up_proj, + ), + (i, "up_proj"): (f"model.layers.{i}.mlp.up_proj", mlp.gate_up_proj), + } + ) + else: + # handle separate gate_proj, up_proj, and down_proj (e.g., for Gemma) + if hasattr(mlp, "gate_proj"): + weights[(i, "gate_proj")] = ( + f"model.layers.{i}.mlp.gate_proj", + mlp.gate_proj, + ) + if hasattr(mlp, "up_proj"): + weights[(i, "up_proj")] = (f"model.layers.{i}.mlp.up_proj", mlp.up_proj) + + if hasattr(mlp, "down_proj"): + weights[(i, "down_proj")] = ( + f"model.layers.{i}.mlp.down_proj", + mlp.down_proj, + ) + + return weights + + +# build_layer_weight_lookup creates a mapping of model layers to their corresponding +# weight tensors and paths. It builds a dictionary that maps layer identifiers to tuples +# containing the weight tensor path and the actual layer object. This mapping is needed +# for the lora adapter to know which weights to update when applying the adapter. 
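Usage note for the adapter plumbing above: parse_lora_adapters accepts a comma-separated LORA_ADAPTERS string whose entries have the form "id", "id=path" or "id=path@revision". A hedged example with made-up adapter ids and paths:

    from text_generation_server.utils.adapter import parse_lora_adapters

    # Illustrative values only.
    adapters = parse_lora_adapters("org/adapter-a,org/adapter-b=/data/adapter-b@main")
    for adapter in adapters:
        print(adapter.id, adapter.path, adapter.revision)
    # org/adapter-a None None
    # org/adapter-b /data/adapter-b main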
+def build_layer_weight_lookup(model): + if hasattr(model, "language_model"): + m = model.language_model.model + elif hasattr(model, "text_model"): + m = model.text_model.model + else: + m = model.model + + layer_weights = {} + + for i, layer in enumerate(m.layers): + attn_weights = get_attn_weights(i, layer) + mlp_weights = get_mlp_weights(i, layer) + + layer_weights.update(attn_weights) + layer_weights.update(mlp_weights) + + lm_head = None + if hasattr(m, "lm_head"): + lm_head = m.lm_head + elif hasattr(model, "lm_head"): + lm_head = model.lm_head + + if lm_head: + layer_weights[(0, "lm_head")] = ("lm_head", lm_head) + + return layer_weights diff --git a/backends/gaudi/server/text_generation_server/utils/chunks.py b/backends/gaudi/server/text_generation_server/utils/chunks.py new file mode 100644 index 000000000..73962ea39 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/utils/chunks.py @@ -0,0 +1,27 @@ +from typing import Iterable + +from loguru import logger + +from text_generation_server.pb import generate_pb2 + + +def concat_text_chunks(chunks: Iterable[generate_pb2.InputChunk]) -> str: + """ + Concatenate text in text chunks. Non-text chunks are dropped. + """ + text = None + for chunk in chunks: + chunk_type = chunk.WhichOneof("chunk") + if chunk_type == "text": + if text is None: + text = chunk.text + else: + raise NotImplementedError("Request contained more than one text chunk") + else: + # We cannot reject this, e.g. warmup sends an image chunk. + logger.debug(f"Encountered non-text chunk type {chunk_type}") + + if text is None: + raise NotImplementedError("Request without a text chunk") + + return text diff --git a/backends/gaudi/server/text_generation_server/utils/convert.py b/backends/gaudi/server/text_generation_server/utils/convert.py new file mode 100644 index 000000000..d9c3276bc --- /dev/null +++ b/backends/gaudi/server/text_generation_server/utils/convert.py @@ -0,0 +1,114 @@ +import datetime +import torch +import os + +from loguru import logger +from pathlib import Path +from safetensors.torch import save_file, load_file, _find_shared_tensors, _is_complete +from typing import List, Dict +from collections import defaultdict + + +def _remove_duplicate_names( + state_dict: Dict[str, torch.Tensor], + *, + preferred_names: List[str] = None, + discard_names: List[str] = None, +) -> Dict[str, List[str]]: + if preferred_names is None: + preferred_names = [] + preferred_names = set(preferred_names) + if discard_names is None: + discard_names = [] + discard_names = set(discard_names) + + shareds = _find_shared_tensors(state_dict) + to_remove = defaultdict(list) + for shared in shareds: + complete_names = set( + [name for name in shared if _is_complete(state_dict[name])] + ) + if not complete_names: + if len(shared) == 1: + # Force contiguous + name = list(shared)[0] + state_dict[name] = state_dict[name].clone() + complete_names = {name} + else: + raise RuntimeError( + f"Error while trying to find names to remove to save state dict, but found no suitable name to keep for saving amongst: {shared}. None is covering the entire storage.Refusing to save/load the model since you could be storing much more memory than needed. Please refer to https://huggingface.co/docs/safetensors/torch_shared_tensors for more information. Or open an issue." 
+ ) + + keep_name = sorted(list(complete_names))[0] + + # Mecanism to preferentially select keys to keep + # coming from the on-disk file to allow + # loading models saved with a different choice + # of keep_name + preferred = complete_names.difference(discard_names) + if preferred: + keep_name = sorted(list(preferred))[0] + + if preferred_names: + preferred = preferred_names.intersection(complete_names) + if preferred: + keep_name = sorted(list(preferred))[0] + for name in sorted(shared): + if name != keep_name: + to_remove[keep_name].append(name) + return to_remove + + +def convert_file(pt_file: Path, sf_file: Path, discard_names: List[str]): + """ + Convert a pytorch file to a safetensors file + This will remove duplicate tensors from the file. + + Unfortunately, this might not respect *transformers* convention. + Forcing us to check for potentially different keys during load when looking + for specific tensors (making tensor sharing explicit). + """ + loaded = torch.load(pt_file, map_location="cpu", weights_only=True) + if "state_dict" in loaded: + loaded = loaded["state_dict"] + to_removes = _remove_duplicate_names(loaded, discard_names=discard_names) + + metadata = {"format": "pt"} + for kept_name, to_remove_group in to_removes.items(): + for to_remove in to_remove_group: + if to_remove not in metadata: + metadata[to_remove] = kept_name + del loaded[to_remove] + # Force tensors to be contiguous + loaded = {k: v.contiguous() for k, v in loaded.items()} + + dirname = os.path.dirname(sf_file) + os.makedirs(dirname, exist_ok=True) + save_file(loaded, sf_file, metadata=metadata) + reloaded = load_file(sf_file) + for k in loaded: + pt_tensor = loaded[k] + sf_tensor = reloaded[k] + if not torch.equal(pt_tensor, sf_tensor): + raise RuntimeError(f"The output tensors do not match for key {k}") + + +def convert_files(pt_files: List[Path], sf_files: List[Path], discard_names: List[str]): + assert len(pt_files) == len(sf_files) + + N = len(pt_files) + # We do this instead of using tqdm because we want to parse the logs with the launcher + + for i, (pt_file, sf_file) in enumerate(zip(pt_files, sf_files)): + # Skip blacklisted files + if ( + "arguments" in pt_file.name + or "args" in pt_file.name + or "training" in pt_file.name + ): + continue + + start = datetime.datetime.now() + convert_file(pt_file, sf_file, discard_names) + elapsed = datetime.datetime.now() - start + logger.info(f"Convert: [{i + 1}/{N}] -- Took: {elapsed}") diff --git a/backends/gaudi/server/text_generation_server/utils/debug.py b/backends/gaudi/server/text_generation_server/utils/debug.py new file mode 100644 index 000000000..8bbcad6a3 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/utils/debug.py @@ -0,0 +1,35 @@ +# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company. 
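A hedged usage sketch for the conversion helper above; the paths are placeholders and assume a checkpoint stored as a single pytorch_model.bin. convert_file drops duplicate (shared) tensors, writes the safetensors file, then verifies the round trip by reloading it and comparing tensors.

    from pathlib import Path

    from text_generation_server.utils.convert import convert_file

    # Placeholder paths, for illustration only.
    convert_file(
        pt_file=Path("/data/model/pytorch_model.bin"),
        sf_file=Path("/data/model/model.safetensors"),
        discard_names=[],
    )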
+ +import os +import glob +import time + +from optimum.habana.utils import to_gb_rounded +import habana_frameworks.torch as htorch + +START_TS = None +DBG_TRACE_FILENAME = os.environ.get("DBG_TRACE_FILENAME") +if "GRAPH_VISUALIZATION" in os.environ: + for f in glob.glob(".graph_dumps/*"): + os.remove(f) + + +def count_hpu_graphs(): + return len(glob.glob(".graph_dumps/*PreGraph*")) + + +def dbg_trace(tag, txt): + global START_TS + if DBG_TRACE_FILENAME is not None and int(os.getenv("RANK", 0)) == 0: + if START_TS is None: + START_TS = time.perf_counter() + time_offset = time.perf_counter() - START_TS + mem_stats = htorch.hpu.memory.memory_stats() + mem_used = to_gb_rounded(mem_stats["InUse"]) + max_mem_used = to_gb_rounded(mem_stats["MaxInUse"]) + print( + f"ts:{time_offset:.3f}s g:{count_hpu_graphs()} mu:{mem_used:.1f}GB " + f"mmu:{max_mem_used:.1f}GB | {tag} | {txt}", + flush=True, + file=open(DBG_TRACE_FILENAME, "a"), + ) diff --git a/backends/gaudi/server/text_generation_server/utils/dist.py b/backends/gaudi/server/text_generation_server/utils/dist.py new file mode 100644 index 000000000..0e9b97fb2 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/utils/dist.py @@ -0,0 +1,99 @@ +import os +import torch + +from datetime import timedelta +from loguru import logger + +# Tensor Parallelism settings +RANK = int(os.getenv("RANK", "0")) +WORLD_SIZE = int(os.getenv("WORLD_SIZE", "1")) + +# CUDA memory fraction +MEMORY_FRACTION = float(os.getenv("CUDA_MEMORY_FRACTION", "1.0")) + + +class FakeBarrier: + def wait(self): + pass + + +class FakeGroup: + def __init__(self, rank, size): + self._rank = rank + self._size = size + + def allreduce(self, *args, **kwargs): + return FakeBarrier() + + def allgather(self, inputs, local_tensor, **kwargs): + assert ( + len(inputs[0]) == len(local_tensor) == 1 + ), f"{len(inputs[0])} != {len(local_tensor)} != 1, and the FakeGroup is supposed to join on simple tensors" + for input_ in inputs: + input_[0].data = local_tensor[0].data + return FakeBarrier() + + def barrier(self, *args, **kwargs): + return FakeBarrier() + + def size(self): + return self._size + + def rank(self): + return self._rank + + +def initialize_torch_distributed(): + + world_size = int(os.getenv("WORLD_SIZE", "1")) + + options = None + if torch.cuda.is_available(): + from torch.distributed import ProcessGroupNCCL + + # Set the device id. + assert WORLD_SIZE <= torch.cuda.device_count(), "Each process is one gpu" + device = RANK % torch.cuda.device_count() + torch.cuda.set_device(device) + torch.cuda.set_per_process_memory_fraction(MEMORY_FRACTION, device) + backend = "nccl" + options = ProcessGroupNCCL.Options() + options.is_high_priority_stream = True + options._timeout = timedelta(seconds=60) + elif torch.hpu.is_available(): + backend = "hccl" + n_hpus = torch.hpu.device_count() + if world_size > n_hpus: + raise ValueError( + f"WORLD_SIZE ({world_size}) is higher than the number of available HPUs ({n_hpus})." + ) + else: + try: + import oneccl_bindings_for_pytorch # noqa: F401 + + backend = "ccl" + if os.getenv("CCL_WORKER_COUNT", None) is None: + os.environ["CCL_WORKER_COUNT"] = str(1) + except ImportError: + backend = "gloo" + options = None + + if WORLD_SIZE == 1: + return FakeGroup(RANK, WORLD_SIZE), RANK, WORLD_SIZE + else: + if os.getenv("DEBUG", None) == "1": + return FakeGroup(RANK, WORLD_SIZE), RANK, WORLD_SIZE + + if not torch.distributed.is_initialized(): + # Call the init process. 
+ torch.distributed.init_process_group( + backend=backend, + world_size=WORLD_SIZE, + rank=RANK, + timeout=timedelta(seconds=60), + pg_options=options, + ) + else: + logger.warning("torch.distributed is already initialized.") + + return torch.distributed.group.WORLD, RANK, WORLD_SIZE diff --git a/backends/gaudi/server/text_generation_server/utils/hub.py b/backends/gaudi/server/text_generation_server/utils/hub.py new file mode 100644 index 000000000..f9c476ac3 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/utils/hub.py @@ -0,0 +1,234 @@ +import time +import os + +from datetime import timedelta +from loguru import logger +from pathlib import Path +from typing import Optional, List + +from huggingface_hub import file_download, hf_api, HfApi, hf_hub_download +from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE +from huggingface_hub.utils import ( + LocalEntryNotFoundError, + EntryNotFoundError, + RevisionNotFoundError, # noqa # Import here to ease try/except in other part of the lib +) + +WEIGHTS_CACHE_OVERRIDE = os.getenv("WEIGHTS_CACHE_OVERRIDE", None) +HF_HUB_OFFLINE = os.environ.get("HF_HUB_OFFLINE", "0").lower() in ["true", "1", "yes"] + + +def _cached_weight_files( + model_id: str, revision: Optional[str], extension: str +) -> List[str]: + """Guess weight files from the cached revision snapshot directory""" + d = _get_cached_revision_directory(model_id, revision) + if not d: + return [] + filenames = _weight_files_from_dir(d, extension) + return filenames + + +def _weight_hub_files_from_model_info( + info: hf_api.ModelInfo, extension: str +) -> List[str]: + return [ + s.rfilename + for s in info.siblings + if s.rfilename.endswith(extension) + and len(s.rfilename.split("/")) == 1 + and "arguments" not in s.rfilename + and "args" not in s.rfilename + and "training" not in s.rfilename + ] + + +def _weight_files_from_dir(d: Path, extension: str) -> List[str]: + # os.walk: do not iterate, just scan for depth 1, not recursively + # see _weight_hub_files_from_model_info, that's also what is + # done there with the len(s.rfilename.split("/")) == 1 condition + root, _, files = next(os.walk(str(d))) + filenames = [ + os.path.join(root, f) + for f in files + if f.endswith(extension) + and "arguments" not in f + and "args" not in f + and "training" not in f + ] + return filenames + + +def _get_cached_revision_directory( + model_id: str, revision: Optional[str] +) -> Optional[Path]: + if revision is None: + revision = "main" + + repo_cache = Path(HUGGINGFACE_HUB_CACHE) / Path( + file_download.repo_folder_name(repo_id=model_id, repo_type="model") + ) + + if not repo_cache.is_dir(): + # No cache for this model + return None + + refs_dir = repo_cache / "refs" + snapshots_dir = repo_cache / "snapshots" + + # Resolve refs (for instance to convert main to the associated commit sha) + if refs_dir.is_dir(): + revision_file = refs_dir / revision + if revision_file.exists(): + with revision_file.open() as f: + revision = f.read() + + # Check if revision folder exists + if not snapshots_dir.exists(): + return None + cached_shas = os.listdir(snapshots_dir) + if revision not in cached_shas: + # No cache for this revision and we won't try to return a random revision + return None + + return snapshots_dir / revision + + +def weight_hub_files( + model_id: str, revision: Optional[str] = None, extension: str = ".safetensors" +) -> List[str]: + """Get the weights filenames on the hub""" + api = HfApi() + + if HF_HUB_OFFLINE: + filenames = _cached_weight_files(model_id, revision, extension) + 
else: + # Online case, fetch model info from the Hub + info = api.model_info(model_id, revision=revision) + filenames = _weight_hub_files_from_model_info(info, extension) + + if not filenames: + raise EntryNotFoundError( + f"No {extension} weights found for model {model_id} and revision {revision}.", + None, + ) + + return filenames + + +def try_to_load_from_cache( + model_id: str, revision: Optional[str], filename: str +) -> Optional[Path]: + """Try to load a file from the Hugging Face cache""" + + d = _get_cached_revision_directory(model_id, revision) + if not d: + return None + + # Check if file exists in cache + cached_file = d / filename + return cached_file if cached_file.is_file() else None + + +def weight_files( + model_id: str, revision: Optional[str] = None, extension: str = ".safetensors" +) -> List[Path]: + """Get the local files""" + # Local model + d = Path(model_id) + if d.exists() and d.is_dir(): + local_files = _weight_files_from_dir(d, extension) + if not local_files: + raise FileNotFoundError( + f"No local weights found in {model_id} with extension {extension}" + ) + return [Path(f) for f in local_files] + + try: + filenames = weight_hub_files(model_id, revision, extension) + except EntryNotFoundError as e: + if extension != ".safetensors": + raise e + # Try to see if there are pytorch weights + pt_filenames = weight_hub_files(model_id, revision, extension=".bin") + # Change pytorch extension to safetensors extension + # It is possible that we have safetensors weights locally even though they are not on the + # hub if we converted weights locally without pushing them + filenames = [ + f"{Path(f).stem.lstrip('pytorch_')}.safetensors" for f in pt_filenames + ] + + if WEIGHTS_CACHE_OVERRIDE is not None: + files = [] + for filename in filenames: + p = Path(WEIGHTS_CACHE_OVERRIDE) / filename + if not p.exists(): + raise FileNotFoundError( + f"File {p} not found in {WEIGHTS_CACHE_OVERRIDE}." + ) + files.append(p) + return files + + files = [] + for filename in filenames: + cache_file = try_to_load_from_cache( + model_id, revision=revision, filename=filename + ) + if cache_file is None: + raise LocalEntryNotFoundError( + f"File {filename} of model {model_id} not found in " + f"{os.getenv('HUGGINGFACE_HUB_CACHE', 'the local cache')}. " + f"Please run `text-generation-server download-weights {model_id}` first." + ) + files.append(cache_file) + + return files + + +def download_weights( + filenames: List[str], model_id: str, revision: Optional[str] = None +) -> List[Path]: + """Download the safetensors files from the hub""" + + def download_file(fname, tries=5, backoff: int = 5): + local_file = try_to_load_from_cache(model_id, revision, fname) + if local_file is not None: + logger.info(f"File {fname} already present in cache.") + return Path(local_file) + + for idx in range(tries): + try: + logger.info(f"Download file: {fname}") + stime = time.time() + local_file = hf_hub_download( + filename=fname, + repo_id=model_id, + revision=revision, + local_files_only=HF_HUB_OFFLINE, + ) + logger.info( + f"Downloaded {local_file} in {timedelta(seconds=int(time.time() - stime))}." 
+ ) + return Path(local_file) + except Exception as e: + if idx + 1 == tries: + raise e + logger.error(e) + logger.info(f"Retrying in {backoff} seconds") + time.sleep(backoff) + logger.info(f"Retry {idx + 1}/{tries - 1}") + + # We do this instead of using tqdm because we want to parse the logs with the launcher + start_time = time.time() + files = [] + for i, filename in enumerate(filenames): + file = download_file(filename) + + elapsed = timedelta(seconds=int(time.time() - start_time)) + remaining = len(filenames) - (i + 1) + eta = (elapsed / (i + 1)) * remaining if remaining > 0 else 0 + + logger.info(f"Download: [{i + 1}/{len(filenames)}] -- ETA: {eta}") + files.append(file) + + return files diff --git a/backends/gaudi/server/text_generation_server/utils/import_utils.py b/backends/gaudi/server/text_generation_server/utils/import_utils.py new file mode 100644 index 000000000..782b4f15b --- /dev/null +++ b/backends/gaudi/server/text_generation_server/utils/import_utils.py @@ -0,0 +1,75 @@ +import torch +from loguru import logger +import os + + +import importlib.util + + +def is_ipex_available(): + return importlib.util.find_spec("intel_extension_for_pytorch") is not None + + +def get_cuda_free_memory(device, memory_fraction): + total_free_memory, _ = torch.cuda.mem_get_info(device) + total_gpu_memory = torch.cuda.get_device_properties(device).total_memory + free_memory = max(0, total_free_memory - (1 - memory_fraction) * total_gpu_memory) + return free_memory + + +def get_xpu_free_memory(device, memory_fraction): + total_memory = torch.xpu.get_device_properties(device).total_memory + device_id = device.index + memory_fraction = float(os.getenv("XPU_MEMORY_FRACTION", "1.0")) + free_memory = max( + 0, + int( + total_memory * 0.9 * memory_fraction - torch.xpu.memory_reserved(device_id) + ), + ) + return free_memory + + +def get_cpu_free_memory(device, memory_fraction): + import psutil + from text_generation_server.utils.dist import WORLD_SIZE + + mem = psutil.virtual_memory() + free_memory = int(mem.available * 0.95 / WORLD_SIZE) + return free_memory + + +def noop(*args, **kwargs): + pass + + +SYSTEM = None +if torch.version.hip is not None: + SYSTEM = "rocm" + empty_cache = torch.cuda.empty_cache + synchronize = torch.cuda.synchronize + get_free_memory = get_cuda_free_memory +elif torch.version.cuda is not None and torch.cuda.is_available(): + SYSTEM = "cuda" + empty_cache = torch.cuda.empty_cache + synchronize = torch.cuda.synchronize + get_free_memory = get_cuda_free_memory +elif is_ipex_available(): + SYSTEM = "ipex" + import intel_extension_for_pytorch # noqa: F401 + + if hasattr(torch, "xpu") and torch.xpu.is_available(): + empty_cache = torch.xpu.empty_cache + synchronize = torch.xpu.synchronize + get_free_memory = get_xpu_free_memory + else: + empty_cache = noop + synchronize = noop + get_free_memory = get_cpu_free_memory +else: + SYSTEM = "cpu" + + empty_cache = noop + synchronize = noop + get_free_memory = get_cpu_free_memory +logger.info(f"Detected system {SYSTEM}") diff --git a/backends/gaudi/server/text_generation_server/utils/log.py b/backends/gaudi/server/text_generation_server/utils/log.py new file mode 100644 index 000000000..4385c71ee --- /dev/null +++ b/backends/gaudi/server/text_generation_server/utils/log.py @@ -0,0 +1,15 @@ +from functools import lru_cache +from text_generation_server.utils.dist import RANK + + +@lru_cache(10) +def log_once(log, msg: str, master=True): + if master: + log_master(log, msg) + else: + log(msg) + + +def log_master(log, msg: str): + if RANK 
== 0: + log(msg) diff --git a/backends/gaudi/server/text_generation_server/utils/logits_process.py b/backends/gaudi/server/text_generation_server/utils/logits_process.py new file mode 100644 index 000000000..472f2dcb0 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/utils/logits_process.py @@ -0,0 +1,611 @@ +import math +import torch +import habana_frameworks.torch.core as htcore + +from loguru import logger +from typing import Dict, Union +from text_generation_server.pb.generate_pb2 import GrammarType + +from outlines.fsm.fsm import RegexFSM +from outlines.fsm.json_schema import build_regex_from_schema +from functools import lru_cache +from typing import List, Optional, DefaultDict +import time + +from transformers import ( + LogitsWarper, + LogitsProcessor, + TemperatureLogitsWarper, + TopKLogitsWarper, + TopPLogitsWarper, + TypicalLogitsWarper, +) + +mempool = torch.cuda.graph_pool_handle() if torch.cuda.is_available() else None + + +class StaticWarper: + def __init__( + self, + temperature=1.0, + top_k=None, + top_p=None, + typical_p=None, + ): + self.warpers = [] + + if temperature is not None and temperature != 1.0: + temperature = float(temperature) + self.warpers.append(TemperatureLogitsWarper(temperature)) + if top_k is not None and top_k != 0: + self.warpers.append(TopKLogitsWarper(top_k=top_k)) + if top_p is not None and top_p < 1.0: + self.warpers.append(TopPLogitsWarper(top_p=top_p)) + if typical_p is not None and typical_p < 1.0: + self.warpers.append(TypicalLogitsWarper(mass=typical_p)) + + self.hpu_graph = None + self.static_scores = None + self.static_warped_scores = None + self.static_next_logprob = None + + def __call__(self, scores): + if self.hpu_graph is None: + self.static_scores = scores.clone().contiguous() + self.static_warped_scores = scores.clone().contiguous() + self.static_next_logprob = scores.clone().contiguous() + self.hpu_graph = htcore.hpu.HPUGraph() + + with htcore.hpu.graph(self.hpu_graph): + local_scores = self.static_scores + for warper in self.warpers: + local_scores = warper(None, local_scores) + + self.static_warped_scores.copy_(local_scores) + # Compute logprobs + self.static_next_logprob.copy_( + torch.log_softmax(self.static_warped_scores, -1) + ) + + self.static_scores.copy_(scores) + self.hpu_graph.replay() + + return self.static_warped_scores, self.static_next_logprob + + +@lru_cache(10) +def static_warper( + temperature: Optional[float], + top_k: Optional[int], + top_p: Optional[float], + typical_p: Optional[float], +) -> StaticWarper: + return StaticWarper( + temperature=temperature, top_k=top_k, top_p=top_p, typical_p=typical_p + ) + + +class HeterogeneousRepetitionPenaltyLogitsProcessor(LogitsProcessor): + r""" + [`LogitsProcessor`] enforcing an exponential penalty on repeated sequences. + This version allows for a separate value for each sample and runs inplace when possible. + It doesn't validate inputs. + + Args: + repetition_penalty (`List[float]`): + The parameter for repetition penalty. 1.0 means no penalty. See [this + paper](https://arxiv.org/pdf/1909.05858.pdf) for more details. 
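As a hedged numeric illustration of the rule this processor applies (positive logits of already-generated tokens are divided by the penalty, negative ones multiplied by it, so repeated tokens always become less likely), using a made-up three-token vocabulary:

    import torch

    penalty = torch.tensor([[1.3]])              # one penalty per sequence
    scores = torch.tensor([[2.0, -1.0, 0.5]])    # illustrative logits
    input_ids = torch.tensor([[0, 1]])           # tokens 0 and 1 were generated already

    picked = torch.gather(scores, 1, input_ids)
    picked = torch.where(picked < 0, picked * penalty, picked / penalty)
    scores.scatter_(1, input_ids, picked)
    print(scores)  # approximately [[1.5385, -1.3000, 0.5000]]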
+ """ + + def __init__(self, penalty: List[float], dtype: torch.dtype, device: torch.device): + self.penalty = penalty + self.penalty_tensor = torch.tensor( + penalty, dtype=dtype, device=device + ).unsqueeze(1) + + def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor) -> torch.Tensor: + score = torch.gather(scores, 1, input_ids) + + # if score < 0 then repetition penalty has to be multiplied to reduce the previous token probability + score = torch.where( + score < 0, score * self.penalty_tensor, score / self.penalty_tensor + ) + + scores.scatter_(1, input_ids, score) + return scores + + def filter(self, indices): + self.penalty = [self.penalty[i] for i in indices] + if any([x != 1.0 for x in self.penalty]): + self.penalty_tensor = self.penalty_tensor[indices] + return self + return None + + +class FrequencyPenaltyLogitsProcessor(LogitsProcessor): + r""" + Frequency penalty as defined by OpenAI + + Args: + penalty (`float`): + The parameter for frequency penalty. 0.0 means no penalty. + """ + + def __init__(self, penalty: float): + self.penalty = penalty + + def __call__( + self, input_ids: torch.LongTensor, scores: torch.FloatTensor + ) -> torch.FloatTensor: + score = torch.gather(scores, 1, input_ids) + # if score < 0 then penalty has to be multiplied to reduce the previous token probability + score = -torch.where(score < 0, score * self.penalty, score / self.penalty) + # set score to 0 where input_ids is a padding token + score *= input_ids.ne(0) + + return scores.scatter_add_(1, input_ids, score) + + +class HeterogeneousFrequencyPenaltyLogitsProcessor(LogitsProcessor): + r""" + Frequency penalty as defined by OpenAI in + https://platform.openai.com/docs/guides/text-generation/parameter-details + + Args: + frequency_penalty (`List[float]`): + The parameter for frequency penalty. 0.0 means no penalty. + """ + + def __init__(self, penalty: List[float], dtype: torch.dtype, device: torch.device): + self.penalty = penalty + self.penalty_tensor = torch.tensor( + penalty, dtype=dtype, device=device + ).unsqueeze(1) + + def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor) -> torch.Tensor: + batch_size, input_size = input_ids.size() + vocab_size = scores.size(1) + + # Calculate the frequency for each token so far + token_freq = torch.zeros( + batch_size, vocab_size, dtype=scores.dtype, device=scores.device + ) + token_freq.scatter_add_( + 1, + input_ids, + torch.ones_like(input_ids, dtype=scores.dtype, device=scores.device), + ) + token_freq /= input_size + + # Apply the frequency penalty to logits + scores -= token_freq * self.penalty_tensor + return scores + + def filter(self, indices): + self.penalty = [self.penalty[i] for i in indices] + if any([x != 0.0 for x in self.penalty]): + self.penalty_tensor = self.penalty_tensor[indices] + return self + return None + + +class HeterogeneousTemperatureLogitsWarper: + r""" + [`LogitsWarper`] for temperature (exponential scaling output probability distribution). + This version allows for a separate value for each sample and runs inplace when possible. + It doesn't validate inputs. + + Args: + temperature (`float`): + The value used to module the logits distribution. 
+ """ + + def __init__( + self, temperature: List[float], dtype: torch.dtype, device: torch.device + ): + self.temperature = temperature + self.temperature_tensor = torch.tensor( + temperature, dtype=dtype, device=device + ).unsqueeze(1) + + def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor) -> torch.Tensor: + scores.div_(self.temperature_tensor) + return scores + + def filter(self, indices): + self.temperature = [self.temperature[i] for i in indices] + if any([x != 1.0 for x in self.temperature]): + self.temperature_tensor = self.temperature_tensor[indices] + return self + return None + + +class HeterogeneousTopPLogitsWarper(LogitsWarper): + """ + [`LogitsWarper`] that performs top-p, i.e. restricting to top tokens summing to prob_cut_off <= prob_cut_off. + This version allows for a separate value for each sample and runs inplace when possible. + It doesn't validate inputs. + + Args: + top_p (`float`): + If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or + higher are kept for generation. + filter_value (`float`, *optional*, defaults to `-float("Inf")`): + All filtered values will be set to this float value. + min_tokens_to_keep (`int`, *optional*, defaults to 1): + Minimum number of tokens that cannot be filtered. + """ + + def __init__( + self, + top_p: List[float], + dtype: torch.dtype, + device: torch.device, + filter_value: float = -math.inf, + min_tokens_to_keep: int = 1, + ): + self.top_p = top_p + self.top_p_opposite = 1 - torch.tensor( + top_p, dtype=dtype, device=device + ).unsqueeze(1) + self.filter_value = filter_value + self.min_tokens_to_keep = min_tokens_to_keep + + def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor) -> torch.Tensor: + sorted_logits, sorted_indices = torch.sort(scores, descending=False) + probs = sorted_logits.softmax(dim=-1) + # This is way faster for some reason + for i in range(probs.shape[0]): + probs[i] = probs[i].cumsum(dim=-1) + + # Remove tokens with cumulative top_p above the threshold (token with 0 are kept) + sorted_indices_to_remove = probs <= self.top_p_opposite + # Keep at least min_tokens_to_keep + sorted_indices_to_remove[..., -self.min_tokens_to_keep :] = 0 + + # scatter sorted tensors to original indexing + indices_to_remove = sorted_indices_to_remove.scatter( + 1, sorted_indices, sorted_indices_to_remove + ) + warped_scores = scores.masked_fill_(indices_to_remove, self.filter_value) + + return warped_scores + + def filter(self, indices): + self.top_p = [self.top_p[i] for i in indices] + if any([x < 1.0 for x in self.top_p]): + self.top_p_opposite = self.top_p_opposite[indices] + return self + return None + + +class HeterogeneousTopKLogitsWarper(LogitsWarper): + r""" + [`LogitsWarper`] that performs top-k, i.e. restricting to the k highest probability elements. + This version allows for a separate value for each sample and runs inplace when possible. + It doesn't validate inputs. + + Args: + top_k (`int`): + The number of highest probability vocabulary tokens to keep for top-k-filtering. + filter_value (`float`, *optional*, defaults to `-float("Inf")`): + All filtered values will be set to this float value. + min_tokens_to_keep (`int`, *optional*, defaults to 1): + Minimum number of tokens that cannot be filtered. 
+ """ + + def __init__( + self, + top_k: List[int], + device: torch.device, + filter_value: float = -math.inf, + min_tokens_to_keep: int = 1, + ): + self.top_k = top_k + self.max_top_k = max(top_k) + # value - 1 as we will use top_k to index and python uses 0 based numbering + self.top_k_tensor = torch.tensor( + [max(x - 1, min_tokens_to_keep - 1) for x in top_k], + dtype=torch.int64, + device=device, + ).unsqueeze(1) + + # 0 is a special value that disables top_k warping for this member of the batch + disabled = [x == 0 for x in top_k] + + if any(disabled): + self.top_k_disabled_mask = torch.tensor( + disabled, dtype=torch.bool, device=device + ).view(-1, 1) + else: + self.top_k_disabled_mask = None + + self.filter_value = filter_value + + def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor) -> torch.Tensor: + # If max_top_k is superior to the vocab, we need to clamp or the warper will fail + if scores.size(-1) < self.max_top_k: + max_top_k = scores.size(-1) + top_k = torch.clamp_max(self.top_k_tensor, max_top_k) + else: + max_top_k = self.max_top_k + top_k = self.top_k_tensor + + # Get the kth score for each member of the batch + kth_scores = torch.gather(torch.topk(scores, max_top_k)[0], 1, top_k) + + # Mask member of kth_scores that do not want to use top_k warping + if self.top_k_disabled_mask is not None: + kth_scores.masked_fill_(self.top_k_disabled_mask, self.filter_value) + + # Remove all tokens with a probability less than the last token of the top-k + indices_to_remove = scores < kth_scores + scores.masked_fill_(indices_to_remove, self.filter_value) + return scores + + def filter(self, indices): + self.top_k = [self.top_k[i] for i in indices] + disabled = [x == 0 for x in self.top_k] + + if not all(disabled): + self.top_k_tensor = self.top_k_tensor[indices] + self.max_top_k = max(self.top_k) + + if self.top_k_disabled_mask is not None: + self.top_k_disabled_mask = ( + self.top_k_disabled_mask[indices] if any(disabled) else None + ) + + return self + return None + + +class HeterogeneousTypicalLogitsWarper(LogitsWarper): + r""" + [`LogitsWarper`] that performs typical decoding. See [Typical Decoding for Natural Language + Generation](https://arxiv.org/abs/2202.00666) for more information. + This version allows for a separate value for each sample and runs inplace when possible. + It doesn't validate inputs. + + Args: + mass (`float`): + Value of typical_p between 0 and 1 inclusive, defaults to 0.9. + filter_value (`float`, *optional*, defaults to `-float("Inf")`): + All filtered values will be set to this float value. + min_tokens_to_keep (`int`, *optional*, defaults to 1): + Minimum number of tokens that cannot be filtered. 
+ """ + + def __init__( + self, + mass: List[float], + dtype: torch.dtype, + device: torch.device, + filter_value: float = -math.inf, + min_tokens_to_keep: int = 1, + ): + self.mass = mass + self.mass_tensor = torch.tensor(mass, dtype=dtype, device=device).unsqueeze(1) + + # 1 is a special value that disables typical_p warping for this member of the batch + disabled = [x == 1.0 for x in mass] + + if any(disabled): + self.disabled_mask = torch.tensor(disabled, dtype=torch.bool, device=device) + else: + self.disabled_mask = None + + self.filter_value = filter_value + self.min_tokens_to_keep = min_tokens_to_keep + + def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor) -> torch.Tensor: + # calculate entropy + normalized = torch.nn.functional.log_softmax(scores, dim=-1) + p = torch.exp(normalized) + ent = -(normalized * p).nansum(-1, keepdim=True) + + # shift and sort + shifted_scores = torch.abs((-normalized) - ent) + sorted_scores, sorted_indices = torch.sort(shifted_scores, descending=False) + sorted_logits = scores.gather(-1, sorted_indices) + probs = sorted_logits.softmax(dim=-1) + # This is way faster for some reason + for i in range(probs.shape[0]): + probs[i] = probs[i].cumsum(dim=-1) + + # Remove tokens with cumulative mass above the threshold + last_ind = (probs < self.mass_tensor).sum(dim=1) + last_ind[last_ind < 0] = 0 + + if self.disabled_mask is not None: + last_ind.masked_fill_(self.disabled_mask, scores.shape[-1] - 1) + + sorted_indices_to_remove = sorted_scores > sorted_scores.gather( + 1, last_ind.view(-1, 1) + ) + if self.min_tokens_to_keep > 1: + # Keep at least min_tokens_to_keep (set to min_tokens_to_keep-1 because we add the first one below) + sorted_indices_to_remove[..., : self.min_tokens_to_keep] = 0 + indices_to_remove = sorted_indices_to_remove.scatter( + 1, sorted_indices, sorted_indices_to_remove + ) + + warped_scores = scores.masked_fill_(indices_to_remove, self.filter_value) + + return warped_scores + + def filter(self, indices): + self.mass = [self.mass[i] for i in indices] + disabled = [x == 1.0 for x in self.mass] + + if not all(disabled): + self.mass_tensor = self.mass_tensor[indices] + + if self.disabled_mask is not None: + self.disabled_mask = ( + self.disabled_mask[indices] if any(disabled) else None + ) + + return self + return None + + +class HeterogeneousProcessorWrapper(LogitsProcessor): + r""" + A wrapper for logit warpers or processors without heterogeneous parameter support. + Args: + processors (`Dict[int, Union[LogitsProcessor, LogitsWarper]]`): + A mapping of sample indices to logit warpers or processors, to be run sequentially. 
+ """ + + def __init__( + self, + processors: Dict[int, Union[LogitsProcessor, LogitsWarper]], + ): + self.processors = processors + + def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor) -> torch.Tensor: + for i, processor in self.processors.items(): + scores[i : i + 1] = processor(input_ids[i : i + 1], scores[i : i + 1]) + return scores + + def filter(self, indices): + new_processors = {} + for i, idx in enumerate(indices): + if idx in self.processors: + new_processors[i] = self.processors[idx] + + if new_processors: + self.processors = new_processors + return self + return None + + +class GrammarLogitProcessor(LogitsProcessor): + fsm_state: DefaultDict[int, int] + fsm: RegexFSM + + def __init__(self, tokenizer, device, grammar, grammar_type): + self.device = device + self.tokenizer = GrammarLogitProcessor._cached_adapt_tokenizer(tokenizer) + self.fsm = GrammarLogitProcessor._cached_compile_fsm( + grammar_type, grammar, self.tokenizer + ) + + def __call__( + self, + logits: torch.Tensor, + fsm_grammar_state: int, + ): + if fsm_grammar_state == -1 or self.fsm is None: + return logits + allowed_tokens = self.fsm.allowed_token_ids(fsm_grammar_state) + mask = torch.full_like(logits, -math.inf) + mask[:, allowed_tokens] = 0 + biased_scores = logits + mask + return biased_scores + + def advance(self, next_token_id, fsm_grammar_state): + return GrammarLogitProcessor._advance( + next_token_id, fsm_grammar_state, self.fsm + ) + + @staticmethod + def _advance(next_token_id, fsm_grammar_state, fsm): + if fsm_grammar_state == -1: + return fsm_grammar_state + return fsm.next_state(fsm_grammar_state, next_token_id) + + # TODO: move grammar compilation into the router + @staticmethod + @lru_cache(maxsize=32, typed=True) + def _cached_compile_fsm(grammar_type, schema, tokenizer): + start_time = time.time() + if grammar_type == GrammarType.GRAMMAR_TYPE_JSON: + schema = build_regex_from_schema(schema) + elif grammar_type == GrammarType.GRAMMAR_TYPE_REGEX: + pass # schema is already a regex just here for clarity + fsm = RegexFSM(schema, tokenizer) + logger.debug(f"Compiled FSM in {time.time() - start_time:.2f}s") + return fsm + + @staticmethod + @lru_cache(maxsize=32, typed=True) + def _cached_adapt_tokenizer(tokenizer): + """Adapt tokenizer to work with the FSM. + + The API of Outlines tokenizers is slightly different to that of + `transformers`. In addition we need to handle the missing spaces to + Llama's tokenizer to be able to compile FSMs for this model. 
+ + """ + start_time = time.time() + tokenizer.vocabulary = tokenizer.get_vocab() + tokenizer.special_tokens = set(tokenizer.all_special_tokens) + + def convert_token_to_string(token: str) -> str: + from transformers.file_utils import SPIECE_UNDERLINE + + string = tokenizer.convert_tokens_to_string([token]) + + # A hack to handle missing spaces to HF's Llama tokenizers + if token.startswith(SPIECE_UNDERLINE) or token == "<0x20>": + return " " + string + + return string + + tokenizer.convert_token_to_string = convert_token_to_string + logger.debug(f"Adapted tokenizer in {time.time() - start_time:.2f}s") + return tokenizer + + +class HeterogeneousGrammarLogitProcessor(LogitsProcessor): + def __init__(self, tokenizer, device, grammars, grammar_types): + self.device = device + self.tokenizer = GrammarLogitProcessor._cached_adapt_tokenizer(tokenizer) + self.fsms = [] + for grammar, grammar_type in zip(grammars, grammar_types): + if len(grammar) == 0: + self.fsms.append(None) + continue + fsm = GrammarLogitProcessor._cached_compile_fsm( + grammar_type, grammar, self.tokenizer + ) + self.fsms.append(fsm) + + def __call__( + self, + logits: torch.Tensor, + fsm_grammar_states: List[int], + ): + mask = torch.full_like(logits, -math.inf) + for i in range(logits.shape[0]): + fsm = self.fsms[i] + if fsm is None: + continue + allowed_tokens = fsm.allowed_token_ids(fsm_grammar_states[i]) + mask[i, allowed_tokens] = 0 + logits[i] += mask[i] + return logits + + def advance_batch(self, next_token_ids, fsm_grammar_states): + return [ + GrammarLogitProcessor._advance( + next_token_ids[i], fsm_grammar_states[i], self.fsms[i] + ) + for i in range(len(next_token_ids)) + ] + + def advance_at_index(self, next_token_id, fsm_grammar_state, index): + if self.fsms[index] is None: + return fsm_grammar_state + return GrammarLogitProcessor._advance( + next_token_id, fsm_grammar_state, self.fsms[index] + ) + + def filter(self, indices): + new_fsms = [] + for i in indices: + new_fsms.append(self.fsms[i]) + self.fsms = new_fsms + return self diff --git a/backends/gaudi/server/text_generation_server/utils/merges/strategies.py b/backends/gaudi/server/text_generation_server/utils/merges/strategies.py new file mode 100644 index 000000000..cb39cde1f --- /dev/null +++ b/backends/gaudi/server/text_generation_server/utils/merges/strategies.py @@ -0,0 +1,220 @@ +import copy +from abc import ABC +from collections import defaultdict +from typing import TYPE_CHECKING, Dict, List, Tuple, Type, Union +from text_generation_server.utils.merges.utils import ( + calculate_majority_sign_mask, + disjoint_merge, + prune, +) +import torch + +if TYPE_CHECKING: + from text_generation_server.adapters.lora import LoraConfig + from text_generation_server.utils.adapter import ModuleMap + + +class AdapterParameters: + def __init__( + self, adapter_ids, weights, merge_strategy, density, majority_sign_method + ): + self.adapter_ids = adapter_ids + self.weights = weights + self.merge_strategy = merge_strategy + self.density = density + self.majority_sign_method = majority_sign_method + + +def _apply_weights( + tensors: Union[torch.Tensor, List[torch.Tensor]], w: torch.Tensor +) -> torch.Tensor: + if isinstance(tensors, torch.Tensor): + t = tensors + else: + t = torch.stack(tensors, dim=0) + + # element-wise weighting of each task tensor + # need to unsqueeze weights to match task tensor dimensions + # for multiplication to apply element-wise + while len(t.shape) > len(w.shape): + w = w.unsqueeze(-1) + return t * w + + +class MergeStrategy(ABC): + def 
merge( + self, task_tensors: List[torch.Tensor], weights: torch.Tensor + ) -> torch.Tensor: + raise NotImplementedError() + + +class LinearMerge(MergeStrategy): + def __init__(self, **kwargs): + pass + + def merge( + self, task_tensors: List[torch.Tensor], weights: torch.Tensor + ) -> torch.Tensor: + weighted_task_tensors = _apply_weights(task_tensors, weights) + return weighted_task_tensors.sum(dim=0) + + +class TiesMerge(MergeStrategy): + def __init__(self, density: float, majority_sign_method: str = "total", **kwargs): + self.density = density + self.majority_sign_method = majority_sign_method + + def merge( + self, task_tensors: List[torch.Tensor], weights: torch.Tensor + ) -> torch.Tensor: + # sparsify + task_tensors = [ + prune(tensor, self.density, method="magnitude") for tensor in task_tensors + ] + task_tensors = torch.stack(task_tensors, dim=0) + + # elect sign before applying weights + majority_sign_mask = calculate_majority_sign_mask( + task_tensors, method=self.majority_sign_method + ) + weighted_task_tensors = _apply_weights(task_tensors, weights) + + # disjoint merge + return disjoint_merge(weighted_task_tensors, majority_sign_mask) + + +class DareLinearMerge(MergeStrategy): + def __init__(self, density: float, **kwargs): + self.density = density + + def merge( + self, task_tensors: List[torch.Tensor], weights: torch.Tensor + ) -> torch.Tensor: + # sparsify + task_tensors = [ + prune(tensor, self.density, method="random", rescale=True) + for tensor in task_tensors + ] + weighted_task_tensors = _apply_weights(task_tensors, weights) + return weighted_task_tensors.sum(dim=0) + + +class DareTiesMerge(MergeStrategy): + def __init__(self, density: float, majority_sign_method: str = "total", **kwargs): + self.density = density + self.majority_sign_method = majority_sign_method + + def merge( + self, task_tensors: List[torch.Tensor], weights: torch.Tensor + ) -> torch.Tensor: + # sparsify + task_tensors = [ + prune(tensor, self.density, method="random", rescale=True) + for tensor in task_tensors + ] + task_tensors = torch.stack(task_tensors, dim=0) + + # elect sign before applying weights + majority_sign_mask = calculate_majority_sign_mask( + task_tensors, method=self.majority_sign_method + ) + weighted_task_tensors = _apply_weights(task_tensors, weights) + + # disjoint merge + mixed_task_tensors = disjoint_merge(weighted_task_tensors, majority_sign_mask) + return mixed_task_tensors + + +strategy_registry: Dict[str, Type[MergeStrategy]] = { + "linear": LinearMerge, + "ties": TiesMerge, + "dare_linear": DareLinearMerge, + "dare_ties": DareTiesMerge, +} + + +def merge_adapters( + adapters: List[Tuple["ModuleMap", "LoraConfig"]], + merge_params: AdapterParameters, +) -> Tuple["ModuleMap", "LoraConfig"]: + # strategy_name = MergeStrategyEnum.Name(merge_params.merge_strategy).lower() + strategy_name = "linear" + + weights = merge_params.weights + if not weights: + weights = torch.ones(len(adapters)) + else: + weights = torch.tensor(weights) + + merge_config = { + "density": merge_params.density, + # "majority_sign_method": MajoritySignMethodEnum.Name( + # merge_params.majority_sign_method + # ).lower(), + "majority_sign_method": "total", + } + merge_strategy = strategy_registry[strategy_name](**merge_config) + + module_maps: Dict[str, Dict[str, Dict[str, List[torch.Tensor]]]] = defaultdict( + lambda: defaultdict(lambda: defaultdict(list)) + ) + lora_configs = [] + weight_name_to_adapter_idx = defaultdict(list) + + # input is list of (module_map, lora_config) tuples + # convert into 
dict[k][param_name] -> list of tensors + for idx, (module_map, lora_config) in enumerate(adapters): + for weight_name, data in module_map.items(): + weight_name_to_adapter_idx[weight_name].append(idx) + for k, (param_data, param_name) in data.items(): + module_maps[weight_name][k][param_name].append(param_data) + lora_configs.append(lora_config) + + # validate lora configs are compatible + _validate_lora_configs(lora_configs) + + # merge tensors for each module such that we have a single ModuleMap: + # dict[k] -> merged tensor + merged_module_map: "ModuleMap" = defaultdict(dict) + for weight_name, data in module_maps.items(): + indices = weight_name_to_adapter_idx[weight_name] + param_weights = weights[indices] + for k, param_data in data.items(): + for param_name, tensors in param_data.items(): + merged_tensor = merge_strategy.merge(tensors, param_weights) + merged_module_map[weight_name][k] = (merged_tensor, param_name) + + # merge lora configs + merged_lora_config = _merge_lora_configs(lora_configs) + + return merged_module_map, merged_lora_config + + +def _validate_lora_configs(lora_configs: List["LoraConfig"]): + # check that all configs have the same rank + ranks = set(lora_config.r for lora_config in lora_configs) + if len(ranks) > 1: + raise ValueError( + f"unable to merge adapters, lora configs have different ranks: {ranks}" + ) + + if all(len(lora_config.target_modules) == 0 for lora_config in lora_configs): + raise ValueError( + "unable to merge adapters, lora configs have no target modules" + ) + + +def _merge_lora_configs(lora_configs: List["LoraConfig"]) -> "LoraConfig": + merged_lora_config = copy.copy(lora_configs[0]) + + # merge target modules as a union operation + merged_target_modules = sorted( + set( + module + for lora_config in lora_configs + for module in lora_config.target_modules + ) + ) + merged_lora_config.target_modules = merged_target_modules + + return merged_lora_config diff --git a/backends/gaudi/server/text_generation_server/utils/merges/utils.py b/backends/gaudi/server/text_generation_server/utils/merges/utils.py new file mode 100644 index 000000000..d9ad3278a --- /dev/null +++ b/backends/gaudi/server/text_generation_server/utils/merges/utils.py @@ -0,0 +1,108 @@ +# coding=utf-8 +# From: https://github.com/huggingface/peft/pull/1364 +# Copyright 2024-present the HuggingFace Inc. team. +# Modifications by Predibase, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Literal + +import torch + + +def magnitude_based_pruning(tensor: torch.Tensor, density: float) -> torch.Tensor: + """ + Prune the smallest values of the task tensors and retain the top-k values based on the specified fraction + `density`. + + Args: + tensor (`torch.Tensor`):The tensor to prune. + density (`float`):The fraction of values to preserve. Should be in [0,1]. 
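+
+    A small worked example (hypothetical values):
+
+        magnitude_based_pruning(torch.tensor([0.1, -2.0, 0.5, 3.0]), density=0.5)
+        # keeps the two largest-magnitude entries: tensor([0., -2., 0., 3.])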
+ """ + mask = torch.zeros_like(tensor).reshape(-1) + k = int(density * tensor.reshape(-1).shape[0]) + top_k = torch.topk(tensor.abs().reshape(-1), k=k, largest=True) + mask[top_k[1]] = 1 + return tensor * mask.reshape(tensor.shape) + + +def random_pruning(tensor: torch.Tensor, density: float, rescale: bool) -> torch.Tensor: + """ + Prune the smallest values of the task tensors and retain the top-k values based on the specified fraction + `density`. + + Args: + tensor (`torch.Tensor`):The tensor to prune. + density (`float`):The fraction of values to preserve. Should be in [0,1]. + rescale (`bool`):Whether to rescale the result to preserve the expected value of the original tensor. + """ + mask = torch.bernoulli(torch.full_like(input=tensor, fill_value=density)) + pruned_tensor = tensor * mask + if rescale: + torch.div(input=pruned_tensor, other=density) + return pruned_tensor + + +def prune( + tensor: torch.Tensor, + density: float, + method: Literal["magnitude", "random"], + rescale: bool = False, +) -> torch.Tensor: + """ + Prune the values of task tensors based on the `method`. + + Args: + tensor (`torch.Tensor`):The tensor to prune. + density (`float`):The fraction of values to preserve. Should be in [0,1]. + method (`str`):The method to use to prune. Should be one of ["magnitude", "random"]. + rescale (`bool`):Whether to rescale the result to preserve the expected value of the original tensor. + """ + if density >= 1: + return tensor + elif density < 0: + raise ValueError("Density should be >= 0, got {density}") + if method == "magnitude": + return magnitude_based_pruning(tensor, density) + elif method == "random": + return random_pruning(tensor, density, rescale=rescale) + else: + raise ValueError(f"Unknown method {method}") + + +def calculate_majority_sign_mask( + tensor: torch.Tensor, method: Literal["total", "frequency"] = "total" +): + """ + Get the mask of the majority sign across the task tensors. Task tensors are stacked on dimension 0. + + Args: + tensor (`torch.Tensor`):The tensor to get the mask from. + method (`str`):The method to use to get the mask. Should be one of ["total", "frequency"]. + """ + + sign = tensor.sign() + if method == "total": + sign_magnitude = (sign * tensor.abs()).sum(dim=0) + elif method == "frequency": + sign_magnitude = sign.sum(dim=0) + else: + raise RuntimeError(f'Unimplemented mask method "{method}"') + majority_sign = torch.where(sign_magnitude >= 0, 1, -1) + return sign == majority_sign + + +def disjoint_merge(task_tensors, majority_sign_mask): + mixed_task_tensors = (task_tensors * majority_sign_mask).sum(dim=0) + num_params_preserved = majority_sign_mask.sum(dim=0) + return mixed_task_tensors / torch.clamp(num_params_preserved, min=1.0) diff --git a/backends/gaudi/server/text_generation_server/utils/peft.py b/backends/gaudi/server/text_generation_server/utils/peft.py new file mode 100644 index 000000000..d49e73f00 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/utils/peft.py @@ -0,0 +1,68 @@ +import os +from typing import Union +from loguru import logger +import torch + +from transformers import AutoTokenizer +from peft import AutoPeftModelForCausalLM, AutoPeftModelForSeq2SeqLM + + +def download_and_unload_peft(model_id, revision, trust_remote_code): + torch_dtype = torch.float16 + + logger.info("Trying to load a Peft model. 
It might take a while without feedback") + try: + model = AutoPeftModelForCausalLM.from_pretrained( + model_id, + revision=revision, + torch_dtype=torch_dtype, + trust_remote_code=trust_remote_code, + low_cpu_mem_usage=True, + ) + except Exception: + model = AutoPeftModelForSeq2SeqLM.from_pretrained( + model_id, + revision=revision, + torch_dtype=torch_dtype, + trust_remote_code=trust_remote_code, + low_cpu_mem_usage=True, + ) + logger.info("Peft model detected.") + logger.info("Merging the lora weights.") + + base_model_id = model.peft_config["default"].base_model_name_or_path + + model = model.merge_and_unload() + + os.makedirs(model_id, exist_ok=True) + cache_dir = model_id + logger.info(f"Saving the newly created merged model to {cache_dir}") + tokenizer = AutoTokenizer.from_pretrained( + base_model_id, trust_remote_code=trust_remote_code + ) + model.save_pretrained(cache_dir, safe_serialization=True) + model.config.save_pretrained(cache_dir) + tokenizer.save_pretrained(cache_dir) + + +def download_peft( + model_id: Union[str, os.PathLike], revision: str, trust_remote_code: bool +): + torch_dtype = torch.float16 + try: + _model = AutoPeftModelForCausalLM.from_pretrained( + model_id, + revision=revision, + torch_dtype=torch_dtype, + trust_remote_code=trust_remote_code, + low_cpu_mem_usage=True, + ) + except Exception: + _model = AutoPeftModelForSeq2SeqLM.from_pretrained( + model_id, + revision=revision, + torch_dtype=torch_dtype, + trust_remote_code=trust_remote_code, + low_cpu_mem_usage=True, + ) + logger.info("Peft model downloaded.") diff --git a/backends/gaudi/server/text_generation_server/utils/quantization.py b/backends/gaudi/server/text_generation_server/utils/quantization.py new file mode 100644 index 000000000..ee561acc4 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/utils/quantization.py @@ -0,0 +1,202 @@ +import json +import os +from dataclasses import dataclass +from typing import Optional + +from huggingface_hub import hf_hub_download +from text_generation_server.layers.marlin.gptq import can_use_gptq_marlin +from text_generation_server.utils.weights import ( + DefaultWeightsLoader, + WeightsLoader, +) + + +# TODO: Split this config to have a single config type per quant method +@dataclass +class _QuantizerConfig: + bits: int + checkpoint_format: Optional[str] + desc_act: bool + groupsize: int + quant_method: str + sym: bool + + +@dataclass +class _FP8QuantizerConfig: + activation_scale_ub: float + + +# We should probably do this with Pytantic JSON deserialization, +# but for now we'll stay close to the old _set_gptq_params. 
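+#
+# For orientation, the GPTQ-style `config.json` entry parsed below roughly looks like
+# the following (the field names come from the parsing code; the values are hypothetical):
+#
+#   "quantization_config": {
+#       "quant_method": "gptq",
+#       "bits": 4,
+#       "group_size": 128,
+#       "sym": true,
+#       "desc_act": false,
+#       "checkpoint_format": null
+#   }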
+def _get_quantizer_config(model_id, revision): + bits = 4 + groupsize = -1 + quant_method = "gptq" + checkpoint_format = None + sym = False + desc_act = False + + filename = "config.json" + try: + if os.path.exists(os.path.join(model_id, filename)): + filename = os.path.join(model_id, filename) + else: + filename = hf_hub_download(model_id, filename=filename, revision=revision) + with open(filename, "r") as f: + data = json.load(f) + + # FP8 config + if data["quantization_config"]["quant_method"] == "fbgemm_fp8": + return _FP8QuantizerConfig( + activation_scale_ub=data["quantization_config"]["activation_scale_ub"] + ) + + if "zero_point" in data["quantization_config"]: + sym = not data["quantization_config"]["zero_point"] + quant_method = "awq" + elif "sym" in data["quantization_config"]: + sym = data["quantization_config"]["sym"] + + bits = data["quantization_config"]["bits"] + groupsize = data["quantization_config"]["group_size"] + # Order is important here, desc_act is missing on some real models + quant_method = data["quantization_config"]["quant_method"] + checkpoint_format = data["quantization_config"].get("checkpoint_format") + desc_act = data["quantization_config"]["desc_act"] + except Exception: + filename = "quantize_config.json" + try: + if os.path.exists(os.path.join(model_id, filename)): + filename = os.path.join(model_id, filename) + else: + filename = hf_hub_download( + model_id, filename=filename, revision=revision + ) + with open(filename, "r") as f: + data = json.load(f) + bits = data["bits"] + groupsize = data["group_size"] + + if "zero_point" in data: + sym = not data["zero_point"] + quant_method = "awq" + elif "sym" in data: + sym = data["sym"] + + desc_act = data["desc_act"] + if "version" in data and data["version"] == "GEMM": + quant_method = "awq" + except Exception: + filename = "quant_config.json" + try: + if os.path.exists(os.path.join(model_id, filename)): + filename = os.path.join(model_id, filename) + else: + filename = hf_hub_download( + model_id, filename=filename, revision=revision + ) + with open(filename, "r") as f: + data = json.load(f) + bits = data["w_bit"] + groupsize = data["q_group_size"] + desc_act = data["desc_act"] + if "version" in data and data["version"] == "GEMM": + quant_method = "awq" + except Exception: + pass + + return _QuantizerConfig( + bits=bits, + groupsize=groupsize, + quant_method=quant_method, + checkpoint_format=checkpoint_format, + sym=sym, + desc_act=desc_act, + ) + + +def get_loader( + quantize: Optional[str], model_id: str, revision: Optional[str] +) -> WeightsLoader: + quantizer_config = _get_quantizer_config(model_id, revision) + if quantize in {"awq", "gptq"}: + from text_generation_server.layers.gptq import GPTQWeightsLoader + + # TODO: improve check once we have one config type per quantize value + if not isinstance(quantizer_config, _QuantizerConfig): + raise ValueError( + f"Quantize is set to `{quantize}` but received a `{quantizer_config.__class__.__name__}` config." 
+ ) + + if can_use_gptq_marlin( + bits=quantizer_config.bits, + groupsize=quantizer_config.groupsize, + quant_method=quantizer_config.quant_method, + quantize=quantize, + sym=quantizer_config.sym, + ): + from text_generation_server.layers.marlin import GPTQMarlinWeightsLoader + + return GPTQMarlinWeightsLoader( + bits=quantizer_config.bits, + desc_act=quantizer_config.desc_act, + groupsize=quantizer_config.groupsize, + quant_method=quantizer_config.quant_method, + quantize=quantize, + sym=quantizer_config.sym, + ) + else: + return GPTQWeightsLoader( + bits=quantizer_config.bits, + desc_act=quantizer_config.desc_act, + groupsize=quantizer_config.groupsize, + quant_method=quantizer_config.quant_method, + quantize=quantize, + sym=quantizer_config.sym, + ) + elif quantize == "bitsandbytes": + from text_generation_server.layers.bnb import BNBWeight + + return DefaultWeightsLoader(BNBWeight) + elif quantize == "bitsandbytes-fp4": + from text_generation_server.layers.bnb import BNBFP4Weight + + return DefaultWeightsLoader(BNBFP4Weight) + elif quantize == "bitsandbytes-nf4": + from text_generation_server.layers.bnb import BNBNF4Weight + + return DefaultWeightsLoader(BNBNF4Weight) + elif quantize == "eetq": + from text_generation_server.layers.eetq import EETQWeight + + return DefaultWeightsLoader(EETQWeight) + elif quantize == "exl2": + from text_generation_server.layers.exl2 import Exl2WeightsLoader + + return Exl2WeightsLoader() + elif quantize == "marlin": + from text_generation_server.layers.marlin import MarlinWeightsLoader + + # TODO: improve check once we have one config type per quantize value + if not isinstance(quantizer_config, _QuantizerConfig): + raise ValueError( + f"Quantize is set to `{quantize}` but received a `{quantizer_config.__class__.__name__}` config." 
+ ) + + return MarlinWeightsLoader( + bits=quantizer_config.bits, + is_marlin_24=quantizer_config.checkpoint_format == "marlin_24", + ) + elif quantize == "fp8" or quantize is None: + from text_generation_server.layers.fp8 import HybridFP8UnquantLoader + + # Since the default for the quantize config is _QuantizerConfig, + # we need to add this check to not get an attribute error + activation_scale_ub = None + if isinstance(quantizer_config, _FP8QuantizerConfig): + activation_scale_ub = quantizer_config.activation_scale_ub + + return HybridFP8UnquantLoader(activation_scale_ub, to_fp8=quantize == "fp8") + else: + raise ValueError(f"Unknown quantization method: {quantize}") diff --git a/backends/gaudi/server/text_generation_server/utils/segments.py b/backends/gaudi/server/text_generation_server/utils/segments.py new file mode 100644 index 000000000..f59611021 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/utils/segments.py @@ -0,0 +1,66 @@ +# Origin: https://github.com/predibase/lorax +# Path: lorax/server/lorax_server/utils/segments.py +# License: Apache License Version 2.0, January 2004 + +from typing import List, Tuple, Union + +import torch + + +def find_segments( + adapter_indices: Union[torch.Tensor, List[int]] +) -> Tuple[List[int], List[int]]: + segments = [0] + segment_indices = [] + + if isinstance(adapter_indices, torch.Tensor): + # Calling .item() repeatedly on CUDA tensor is very slow, so we move it to CPU first + adapter_indices = adapter_indices.cpu().tolist() + + start_index = 0 + for i in range(1, len(adapter_indices)): + if adapter_indices[i] != adapter_indices[i - 1]: + segments.append(i) + segment_indices.append(adapter_indices[i - 1]) + start_index = i + + # Handle the last segment + if start_index < len(adapter_indices): + segments.append(len(adapter_indices)) + segment_indices.append(adapter_indices[-1]) + + return segments, segment_indices + + +class SegmentConcatBuilder: + def __init__(self): + self.adapter_segment_indices = [] + self.adapter_segment_tensors = [] + + def concat(self, adapter_segments: torch.Tensor, segment_indices: List[int]): + # Update adapter segments + if self.adapter_segment_tensors: + # Because we have already processed at least one batch, remove the 0 start index + # from this batch denoting the beginning of the segment, then offset all segment + # positions by the value of the last segment in the previous batch to account for + # the concatenation. + adapter_segments = ( + adapter_segments[1:] + self.adapter_segment_tensors[-1][-1] + ) + + if ( + self.adapter_segment_indices + and self.adapter_segment_indices[-1] == segment_indices[0] + ): + # If the last segment in the previous batch is the same as the first segment in this batch, + # then we merge them together into a single segment. In effect, this means removing it from + # the segment indices of this batch, and extending the segment span by removing the segment + # end index from the previous batch. 
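+            #
+            # Hypothetical example: a previous batch contributed segments [0, 2, 5]
+            # with adapter indices [1, 2]; this batch contributes segments [0, 3] with
+            # adapter index [2]. After offsetting, this batch's segments become [8];
+            # the shared boundary index 2 is dropped from this batch and the previous
+            # tensor is trimmed to [0, 2], so build() returns segments [0, 2, 8] with
+            # adapter indices [1, 2].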
+ segment_indices = segment_indices[1:] + self.adapter_segment_tensors[-1] = self.adapter_segment_tensors[-1][:-1] + + self.adapter_segment_indices.extend(segment_indices) + self.adapter_segment_tensors.append(adapter_segments) + + def build(self) -> Tuple[torch.Tensor, List[int]]: + return torch.concat(self.adapter_segment_tensors), self.adapter_segment_indices diff --git a/backends/gaudi/server/text_generation_server/utils/sgmv.py b/backends/gaudi/server/text_generation_server/utils/sgmv.py new file mode 100644 index 000000000..2d0a73a54 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/utils/sgmv.py @@ -0,0 +1,252 @@ +# Origin: https://github.com/predibase/lorax +# Path: lorax/server/lorax_server/utils/sgmv.py +# License: Apache License Version 2.0, January 2004 + +import os +import warnings +from functools import lru_cache +from typing import List, Tuple + +import torch +import torch.nn.functional as F + +try: + import punica_kernels as _kernels + + HAS_SGMV = not bool(os.environ.get("DISABLE_SGMV", "")) +except ImportError: + warnings.warn("Could not import SGMV kernel from Punica, falling back to loop.") + _kernels = None + HAS_SGMV = False + + +MIN_SGMV_RANK = 8 +MIN_RANK_CUSTOM = 16 +MAX_RANK_CUSTOM = 128 +SGMV_BLOCK_SIZE = 16 +BGMV_MAX_RANK = 64 + + +def has_sgmv() -> bool: + return HAS_SGMV + + +def pad_rank(t: torch.Tensor, dim: int, world_size: int) -> torch.Tensor: + """Pad a tensor to the minimum rank for SGMV and the nearest multiple of the SGMV block size.""" + if not has_sgmv(): + return t + + # tensor parallelism will result in effective rank being divided by world_size, + # so we need to scale the min rank to offset that effect + min_rank = MIN_SGMV_RANK * world_size + + # if we're at or below the min rank, pad up to the min rank + # otherwise, pad to the nearest multiple of the block size + current_rank = t.size(dim) + target_rank = ( + min_rank + if current_rank <= min_rank + else (current_rank + SGMV_BLOCK_SIZE - 1) // SGMV_BLOCK_SIZE * SGMV_BLOCK_SIZE + ) + if current_rank == target_rank: + return t + + pad_size = target_rank - current_rank + + # see complicatd pad syntax here: https://pytorch.org/docs/stable/generated/torch.nn.functional.pad.html + pad = [0, 0] * t.dim() + pad[(t.dim() - dim - 1) * 2 + 1] = pad_size + pad = tuple(pad) + + return F.pad(t, pad, mode="constant", value=0.0) + + +def use_cutlass_shrink(lora_rank: int) -> bool: + return lora_rank < MIN_RANK_CUSTOM + + +def orient_for_rank(t: torch.Tensor, rank: int) -> torch.Tensor: + if MIN_RANK_CUSTOM <= rank <= MAX_RANK_CUSTOM: + return t.transpose(0, 1) + return t + + +# Source: https://github.com/punica-ai/punica/blob/master/src/punica/ops/__init__.py +def add_lora_sgmv_cutlass( + y: torch.Tensor, + x: torch.Tensor, + wa_ptr: torch.Tensor, + wb_ptr: torch.Tensor, + s_start: torch.Tensor, + s_end: torch.Tensor, + layer_idx: int, + lora_rank: int, +): + """ + Semantics: + y[s[i]:s[i+1]] += x[s[i]:s[i+1]] @ deref(wa_ptr[i]).T @ deref(wb_ptr[i]) + + Args: + y: Shape: `[B, H2]`. Output vectors. Will be changed in-place. + x: Shape: `[B, H1]`. Input vectors. + wa_ptr: Shape: `[S]`. DType: torch.int64. Pointer to the weight matrices.\ + Weight matrix shape: `[num_layers, R, H1]`. + wb_ptr: Shape: `[S]`. DType: torch.int64. Pointer to the weight matrices.\ + Weight matrix shape: `[num_layers, R, H2]`. + s_start: Shape: `[S]`, DType: torch.int32. Indptr of the weight matrices start indices. + s_end: Shape: `[S]`, DType: torch.int32. Indptr of the weight matrices end indices. 
+ layer_idx: Layer index of the weight matrices. + """ + if lora_rank < MIN_RANK_CUSTOM or lora_rank > MAX_RANK_CUSTOM: + # Custom SGMV shrink only supports rank 16, 32, 64, 128 + _add_lora_sgmv_cutlass_legacy( + y, x, wa_ptr, wb_ptr, s_start, s_end, layer_idx, lora_rank + ) + return + + tmp1 = torch.empty((8 * 1024 * 1024,), dtype=torch.uint8, device=x.device) + tmp2_size = _kernels.sgmv_cutlass_tmp_size(wa_ptr.size(0)) + tmp2 = torch.empty((tmp2_size,), dtype=torch.uint8, device=x.device) + v = torch.zeros((x.size(0), lora_rank), dtype=x.dtype, device=x.device) + _kernels.sgmv_shrink(v, x, wa_ptr, s_start, s_end, tmp1, layer_idx) + _kernels.sgmv_cutlass(y, v, wb_ptr, s_start, s_end, tmp2, layer_idx) + + +def _add_lora_sgmv_cutlass_legacy( + y: torch.Tensor, + x: torch.Tensor, + wa_ptr: torch.Tensor, + wb_ptr: torch.Tensor, + s_start: torch.IntTensor, + s_end: torch.IntTensor, + layer_idx: int, + lora_rank: int, +): + tmp_size = _kernels.sgmv_cutlass_tmp_size(wa_ptr.size(0)) + tmp = torch.empty((tmp_size,), dtype=torch.uint8, device=x.device) + v = torch.zeros((x.size(0), lora_rank), dtype=x.dtype, device=x.device) + _kernels.sgmv_cutlass(v, x, wa_ptr, s_start, s_end, tmp, layer_idx) + _kernels.sgmv_cutlass(y, v, wb_ptr, s_start, s_end, tmp, layer_idx) + + +@lru_cache(maxsize=1) +def get_tmp_tensor(device: torch.device) -> torch.Tensor: + return torch.empty((8 * 1024 * 1024,), dtype=torch.uint8, device=device) + + +@lru_cache(maxsize=32) +def get_tmp_tensor_for_size(size: int, device: torch.device) -> torch.Tensor: + tmp_size = _kernels.sgmv_cutlass_tmp_size(size) + return torch.empty((tmp_size,), dtype=torch.uint8, device=device) + + +def get_tmp_tensor_for_size_no_kernels(size: int, device: torch.device) -> torch.Tensor: + return torch.empty((size,), dtype=torch.uint8, device=device) + + +def get_tmp_expand_size(size: int) -> int: + return _kernels.sgmv_cutlass_tmp_size(size) + + +def get_tmp_tensors( + nsegments: int, lora_rank: int, device: torch.device +) -> Tuple[torch.Tensor, torch.Tensor]: + use_cutlass = use_cutlass_shrink(lora_rank) and has_sgmv() + has_sgmv_available = has_sgmv() + + if use_cutlass: + tmp = get_tmp_tensor_for_size(nsegments, device) + return tmp, tmp + elif has_sgmv_available: + return get_tmp_tensor(device), get_tmp_tensor_for_size(nsegments, device) + else: + tmp = get_tmp_tensor_for_size(nsegments, device) + return tmp, tmp + + +def lora_a_sgmv_cutlass( + x: torch.Tensor, + tmp: torch.Tensor, + wa_ptr: torch.Tensor, + s_start: torch.IntTensor, + s_end: torch.IntTensor, + layer_idx: int, + lora_rank: int, +) -> torch.Tensor: + v = torch.zeros((x.size(0), lora_rank), dtype=x.dtype, device=x.device) + if MIN_RANK_CUSTOM <= lora_rank <= MAX_RANK_CUSTOM: + _kernels.sgmv_shrink(v, x, wa_ptr, s_start, s_end, tmp, layer_idx) + else: + _kernels.sgmv_cutlass(v, x, wa_ptr, s_start, s_end, tmp, layer_idx) + return v + + +def lora_b_sgmv_cutlass( + y: torch.Tensor, + v: torch.Tensor, + tmp: torch.Tensor, + wb_ptr: torch.Tensor, + s_start: torch.IntTensor, + s_end: torch.IntTensor, + layer_idx: int, +): + _kernels.sgmv_cutlass(y, v, wb_ptr, s_start, s_end, tmp, layer_idx) + + +""" +Semantics: + y[i] += ( + x[i].unsqueeze(0) + @ wa_T_all[indices[i], layer_idx, :, :].transpose(-1, -2) + @ wb_T_all[indices[i], layer_idx, :, :].transpose(-1, -2) + * scale + ).squeeze(0) + +Args: + y: Shape: `[B, H2]`. Output vectors. Will be changed in-place. + v: Shape: `[B, R]`. Temporary vector. + x: Shape: `[B, H1]`. Input vectors. + wa_T_all: Shape: `[None, L, R, H1]`. 
All of the transposed LoRA A matrices. + wb_T_all: Shape: `[None, L, H2, R]`. All of the transposed LoRA B matrices. + indicies: Shape: `[B]`. Indices of the LoRA weights. + layer_idx: Layer index of LoRA weights. + scale: Scaling factor. +""" + + +def add_lora_a_bgmv( + v: torch.Tensor, + x: torch.Tensor, + wa_T_all: torch.Tensor, + indicies: torch.LongTensor, + layer_idx: int, +): + _kernels.dispatch_bgmv(v, x, wa_T_all, indicies, layer_idx, 1.0) + + +def add_lora_b_bgmv( + y: torch.Tensor, + v: torch.Tensor, + wb_T_all: torch.Tensor, + indicies: torch.LongTensor, + layer_idx: int, +): + _kernels.dispatch_bgmv(y, v, wb_T_all, indicies, layer_idx, 1.0) + + +def segmented_matmul( + y: torch.Tensor, + x: torch.Tensor, + w: List[torch.Tensor], + b: List[torch.Tensor], + s_start: torch.IntTensor, + s_end: torch.IntTensor, +): + for i in range(len(w)): + if s_end[i] - s_start[i] <= 0: + continue + + xi = x[s_start[i] : s_end[i]] + wi = w[i] + bi = b[i] + y[s_start[i] : s_end[i]] = F.linear(xi, wi, bi) diff --git a/backends/gaudi/server/text_generation_server/utils/speculate.py b/backends/gaudi/server/text_generation_server/utils/speculate.py new file mode 100644 index 000000000..a1b37a344 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/utils/speculate.py @@ -0,0 +1,11 @@ +SPECULATE = None + + +def get_speculate() -> int: + global SPECULATE + return SPECULATE + + +def set_speculate(speculate: int): + global SPECULATE + SPECULATE = speculate diff --git a/backends/gaudi/server/text_generation_server/utils/tokens.py b/backends/gaudi/server/text_generation_server/utils/tokens.py new file mode 100644 index 000000000..9c44ba15c --- /dev/null +++ b/backends/gaudi/server/text_generation_server/utils/tokens.py @@ -0,0 +1,762 @@ +# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company. 
+ +import re +from typing import List, Optional, Tuple, Set, Union + +import torch +from text_generation_server.pb import generate_pb2 +from text_generation_server.pb.generate_pb2 import FinishReason, GrammarType +from text_generation_server.utils.logits_process import ( + FrequencyPenaltyLogitsProcessor, + GrammarLogitProcessor, + HeterogeneousProcessorWrapper, + HeterogeneousRepetitionPenaltyLogitsProcessor, + HeterogeneousFrequencyPenaltyLogitsProcessor, + HeterogeneousTemperatureLogitsWarper, + HeterogeneousTopKLogitsWarper, + HeterogeneousTopPLogitsWarper, + HeterogeneousTypicalLogitsWarper, + HeterogeneousGrammarLogitProcessor, + static_warper, +) +from text_generation_server.utils.watermark import WatermarkLogitsProcessor +from transformers import PreTrainedTokenizerBase, RepetitionPenaltyLogitsProcessor +import os + + +class NextTokenChooser: + def __init__( + self, + watermark: bool = False, + temperature: float = 1.0, + repetition_penalty: float = 1.0, + frequency_penalty: float = 0.0, + top_k: Optional[int] = None, + top_p: Optional[float] = None, + typical_p: Optional[float] = None, + do_sample: bool = False, + seed: int = 0, + device: str = "cpu", + tokenizer: Optional[PreTrainedTokenizerBase] = None, + grammar: str = "", + grammar_type: GrammarType = GrammarType.GRAMMAR_TYPE_NONE, + fsm_grammar_state: int = 0, + ): + self.watermark_processor = ( + WatermarkLogitsProcessor(device=device) if watermark else None + ) + self.repetition_processor = ( + RepetitionPenaltyLogitsProcessor(penalty=repetition_penalty) + if repetition_penalty and repetition_penalty != 1.0 + else None + ) + self.frequency_processor = ( + FrequencyPenaltyLogitsProcessor(penalty=frequency_penalty) + if frequency_penalty and frequency_penalty != 0.0 + else None + ) + self.grammar_processor = ( + GrammarLogitProcessor(tokenizer, device, grammar, grammar_type) + if grammar != "" + else None + ) + self.tokenizer = tokenizer + + has_warpers = ( + (temperature is not None and temperature != 1.0) + or (top_k is not None and top_k != 0) + or (top_p is not None and top_p < 1.0) + or (typical_p is not None and typical_p < 1.0) + ) + if has_warpers: + self.static_warper = static_warper( + temperature=temperature, top_k=top_k, top_p=top_p, typical_p=typical_p + ) + else: + self.static_warper = None + + sampling = do_sample or has_warpers + + self.choice = Sampling(seed, device) if sampling else Greedy() + self.fsm_grammar_state = fsm_grammar_state + self.grammar = grammar + + def __call__(self, input_ids, scores): + if self.watermark_processor is not None: + scores = self.watermark_processor(input_ids, scores) + if self.repetition_processor is not None: + scores = self.repetition_processor(input_ids, scores) + if self.frequency_processor is not None: + scores = self.frequency_processor(input_ids, scores) + if self.grammar_processor is not None: + scores = self.grammar_processor(scores, self.fsm_grammar_state) + + if self.static_warper is None: + next_logprob = torch.log_softmax(scores, -1) + else: + scores, next_logprob = self.static_warper(scores) + + next_id = self.choice(scores[-1]).view(1, 1) + + return next_id, next_logprob + + def advance_grammar(self, next_id: int): + if self.grammar_processor is not None: + self.fsm_grammar_state = self.grammar_processor.advance( + next_id, self.fsm_grammar_state + ) + return self + + @classmethod + def from_pb( + cls, + pb: generate_pb2.NextTokenChooserParameters, + device: torch.device, + tokenizer: PreTrainedTokenizerBase, + ) -> "NextTokenChooser": + return NextTokenChooser( 
+ watermark=pb.watermark, + temperature=pb.temperature, + repetition_penalty=pb.repetition_penalty, + frequency_penalty=pb.frequency_penalty, + top_k=pb.top_k, + top_p=pb.top_p, + typical_p=pb.typical_p, + do_sample=pb.do_sample, + seed=pb.seed, + device=device, + tokenizer=tokenizer, + grammar=pb.grammar, + grammar_type=pb.grammar_type, + ) + + +class StopSequenceCriteria: + def __init__(self, stop_sequence: str): + stop_sequence = re.escape(stop_sequence) + self.regex = re.compile(f"{stop_sequence}$") + + def __call__(self, output: str) -> bool: + if self.regex.findall(output): + return True + return False + + +class StoppingCriteria: + def __init__( + self, + eos_token_ids: Optional[Union[Set[int], int]], + stop_sequence_criterias: List[StopSequenceCriteria], + max_new_tokens: int = 20, + ignore_eos_token: bool = False, + ): + if eos_token_ids is None: + eos_token_ids = set() + elif isinstance(eos_token_ids, int): + eos_token_ids = set([eos_token_ids]) + elif isinstance(eos_token_ids, set): + eos_token_ids = eos_token_ids + else: + raise RuntimeError( + f"eos_token_ids is of invalid type {type(eos_token_ids)}, expected int, None or set[int]" + ) + self.eos_token_ids = eos_token_ids + self.stop_sequence_criterias = stop_sequence_criterias + self.max_new_tokens = max_new_tokens + self.current_tokens = 0 + self.current_output = "" + + if os.getenv("TEXT_GENERATION_SERVER_IGNORE_EOS_TOKEN", "false") == "true": + self.ignore_eos_token = True + else: + self.ignore_eos_token = ignore_eos_token + + def __call__(self, last_token: int, last_output: str) -> Tuple[bool, Optional[str]]: + self.current_tokens += 1 + if self.current_tokens >= self.max_new_tokens: + return True, FinishReason.FINISH_REASON_LENGTH + + if isinstance(last_token, torch.Tensor): + last_token = last_token.item() + + if not self.ignore_eos_token and last_token in self.eos_token_ids: + return True, FinishReason.FINISH_REASON_EOS_TOKEN + + if self.stop_sequence_criterias: + self.current_output += last_output + # There is no need to keep an output that is too long + if len(self.current_output) > 300: + # Slice to -200 to avoid doing it all the time + self.current_output = self.current_output[-200:] + for stop_sequence_criteria in self.stop_sequence_criterias: + if stop_sequence_criteria(self.current_output): + return True, FinishReason.FINISH_REASON_STOP_SEQUENCE + + return False, None + + @classmethod + def from_pb( + cls, + pb: generate_pb2.StoppingCriteriaParameters, + tokenizer: PreTrainedTokenizerBase, + ) -> "StoppingCriteria": + stop_sequence_criterias = [ + StopSequenceCriteria(sequence) for sequence in pb.stop_sequences + ] + # TODO Hack because eos_token_id cannot be what we want. + eos_token_id = getattr(tokenizer, "_eos_token_ids", tokenizer.eos_token_id) + return StoppingCriteria( + eos_token_id, + stop_sequence_criterias, + pb.max_new_tokens, + pb.ignore_eos_token, + ) + + +def create_n_gram_speculation( + input_ids: torch.Tensor, + next_ids: torch.Tensor, + accepted_ids: torch.Tensor, + speculate: int, + verbose: bool, +): + # Very trivial approach, find first match in the string. + # This is much less refined than actual n-gram but seems to work + # relatively OK in grounded mode and is by far much faster with + # much less worst case complexity as everything happens on device. 
+ B = accepted_ids.shape[0] + device = input_ids.device + seeds = next_ids[accepted_ids.cumsum(dim=-1) - 1] + indices = (input_ids == seeds.unsqueeze(-1)).max(dim=1).indices + 1 + all_indices = indices.unsqueeze(-1).expand(B, speculate) + torch.arange( + speculate, device=device + ) + all_indices = torch.clamp(all_indices, max=input_ids.shape[1] - 1) + + speculative_ids = input_ids.gather(dim=-1, index=all_indices) + return speculative_ids + + +class HeterogeneousNextTokenChooser: + def __init__( + self, + dtype: torch.dtype, + device: torch.device, + watermark: List[bool], + temperature: List[float], + repetition_penalty: List[float], + frequency_penalty: List[float], + top_k: List[int], + top_p: List[float], + typical_p: List[float], + do_sample: List[bool], + seeds: List[int], + tokenizer: PreTrainedTokenizerBase, + grammars: List[str], + grammar_types: List[int], + fsm_grammar_states: List[int], + quantization_enabled: bool, + ): + warpers = [] + + # TODO: enable watermark with FP8 quantization + self.watermark_processor = ( + HeterogeneousProcessorWrapper( + { + i: WatermarkLogitsProcessor(device=device) + for i, do_watermark in enumerate(watermark) + if do_watermark + } + ) + if any(watermark) and not quantization_enabled + else None + ) + + self.repetition_processor = ( + HeterogeneousRepetitionPenaltyLogitsProcessor( + repetition_penalty, dtype, device + ) + if any([x != 1.0 for x in repetition_penalty]) + else None + ) + + self.frequency_processor = ( + HeterogeneousFrequencyPenaltyLogitsProcessor( + frequency_penalty, dtype, device + ) + if any([x != 0.0 for x in frequency_penalty]) + else None + ) + + self.grammar_processor = ( + HeterogeneousGrammarLogitProcessor( + tokenizer, device, grammars, grammar_types + ) + if any([grammar != "" for grammar in grammars]) + else None + ) + + if any(x != 1.0 for x in temperature): + do_sample = [ + sample or x != 1.0 for x, sample in zip(temperature, do_sample) + ] + warpers.append( + HeterogeneousTemperatureLogitsWarper(temperature, dtype, device) + ) + + if any(x != 0 for x in top_k): + do_sample = [sample or x != 0 for x, sample in zip(top_k, do_sample)] + warpers.append(HeterogeneousTopKLogitsWarper(top_k, device)) + + if any(x < 1.0 for x in top_p): + do_sample = [sample or x < 1.0 for x, sample in zip(top_p, do_sample)] + warpers.append(HeterogeneousTopPLogitsWarper(top_p, dtype, device)) + + if any(x < 1.0 for x in typical_p): + do_sample = [sample or x < 1.0 for x, sample in zip(typical_p, do_sample)] + warpers.append(HeterogeneousTypicalLogitsWarper(typical_p, dtype, device)) + + self.warpers = warpers + + if any(do_sample): + self.choice = HeterogeneousSampling(do_sample, seeds, device) + else: + self.choice = Greedy() + + self.seeds = seeds + self.do_sample = do_sample + self.dtype = dtype + self.device = device + self.tokenizer = tokenizer + self.fsm_grammar_states = fsm_grammar_states + self.grammars = grammars + self.grammar_types = grammar_types + + def __call__( + self, + input_ids: torch.Tensor, + scores: torch.Tensor, + speculate: int, + speculated_ids: Optional[torch.Tensor] = None, + speculative_scores: Optional[torch.Tensor] = None, + verbose=False, + ): + if speculated_ids is not None: + B = scores.shape[0] // (speculated_ids.shape[1] + 1) + S = speculated_ids.shape[1] + 1 + scores = scores.view(B, S, -1) + else: + B = scores.shape[0] + S = 1 + scores = scores.view(B, S, -1) + + next_ids = torch.zeros((B, S), device=scores.device, dtype=torch.long) + + for j in range(S): + _scores = scores[:, j] + if 
self.watermark_processor is not None: + _scores = self.watermark_processor(input_ids, _scores) + if self.repetition_processor is not None: + _scores = self.repetition_processor(input_ids, _scores) + if self.frequency_processor is not None: + _scores = self.frequency_processor(input_ids, _scores) + if self.grammar_processor is not None: + _scores = self.grammar_processor(_scores, self.fsm_grammar_states) + for warper in self.warpers: + _scores = warper(input_ids, _scores) + _next_ids = self.choice(_scores) + scores[:, j] = _scores + next_ids[:, j] = _next_ids + next_ids = next_ids.view(B * S) + allscores = scores.view(B * S, -1) + alllogprobs = torch.log_softmax(allscores, -1) + + if speculated_ids is not None: + accepted_ids = [] + B = next_ids.shape[0] // (speculated_ids.shape[1] + 1) + S = speculated_ids.shape[1] + 1 + indices = [] + for i in range(B): + _next_ids = next_ids[i * S : (i + 1) * S] + _speculated_ids = speculated_ids[i] + validate_speculative = _next_ids[:-1] == _speculated_ids + index = i * S + accepted = 1 + # First is always valid + indices.append(index) + for valid in validate_speculative.tolist(): + if valid: + index += 1 + accepted += 1 + indices.append(index) + else: + break + accepted_ids.append(accepted) + + accepted_ids = torch.tensor( + accepted_ids, device=input_ids.device, dtype=input_ids.dtype + ) + next_ids = next_ids[indices] + logprobs = alllogprobs[indices] + indices = torch.arange(B, device=input_ids.device) * S + if speculative_scores is not None: + speculative_scores = speculative_scores[indices + accepted_ids - 1] + else: + accepted_ids = torch.ones_like(next_ids) + logprobs = alllogprobs + + next_logprobs = torch.gather(logprobs, 1, next_ids.view(-1, 1)).view(-1) + + if speculate > 0: + if speculative_scores is not None: + # Medusa provided some scores + speculative_ids = Greedy()(speculative_scores) + else: + # n-gram + speculative_ids = create_n_gram_speculation( + input_ids, next_ids, accepted_ids, speculate, verbose + ) + else: + speculative_ids = None + + return next_ids, next_logprobs, alllogprobs, accepted_ids, speculative_ids + + def advance_grammar(self, next_ids: List[int]): + if self.grammar_processor is not None: + other_new_states = self.grammar_processor.advance_batch( + next_ids, self.fsm_grammar_states + ) + self.fsm_grammar_states = other_new_states + return self + + def advance_grammar_single(self, grammar_state_index: int, next_id: int): + if self.grammar_processor is not None: + self.fsm_grammar_states[grammar_state_index] = ( + self.grammar_processor.advance_at_index( + next_id, + self.fsm_grammar_states[grammar_state_index], + grammar_state_index, + ) + ) + return self + + def advance_grammar_single_with_past_state( + self, grammar_state_index: int, next_id: torch.Tensor, past_state: int + ): + if self.grammar_processor is not None: + next_id = next_id.item() + self.fsm_grammar_states[grammar_state_index] = ( + self.grammar_processor.advance_at_index( + next_id, + past_state, + grammar_state_index, + ) + ) + return self + + def filter(self, indices): + if self.watermark_processor is not None: + self.watermark_processor = self.watermark_processor.filter(indices) + + if self.repetition_processor is not None: + self.repetition_processor = self.repetition_processor.filter(indices) + + if self.frequency_processor is not None: + self.frequency_processor = self.frequency_processor.filter(indices) + + if self.grammar_processor is not None: + self.grammar_processor = self.grammar_processor.filter(indices) + + filtered_warpers = [] + for 
warper in self.warpers: + filtered_warper = warper.filter(indices) + if filtered_warper is not None: + filtered_warpers.append(filtered_warper) + self.warpers = filtered_warpers + + self.seeds = [self.seeds[i] for i in indices] + self.do_sample = [self.do_sample[i] for i in indices] + + new_grammars = [] + new_fsm_grammar_states = [] + new_grammar_types = [] + for i in indices: + new_grammars.append(self.grammars[i]) + new_fsm_grammar_states.append(self.fsm_grammar_states[i]) + new_grammar_types.append(self.grammar_types[i]) + + self.grammars = new_grammars + self.fsm_grammar_states = new_fsm_grammar_states + self.grammar_types = new_grammar_types + + if any(self.do_sample): + self.choice.filter(indices) + else: + self.choice = Greedy() + + return self + + @classmethod + def from_pb( + cls, + pb: List[generate_pb2.NextTokenChooserParameters], + dtype: torch.dtype, + device: torch.device, + tokenizer: PreTrainedTokenizerBase, + fsm_grammar_states: Optional[List[int]] = None, + quantization_enabled: bool = False, + ) -> "HeterogeneousNextTokenChooser": + return HeterogeneousNextTokenChooser( + watermark=[pb_.watermark for pb_ in pb], + temperature=[pb_.temperature for pb_ in pb], + repetition_penalty=[pb_.repetition_penalty for pb_ in pb], + frequency_penalty=[pb_.frequency_penalty for pb_ in pb], + top_k=[pb_.top_k for pb_ in pb], + top_p=[pb_.top_p for pb_ in pb], + typical_p=[pb_.typical_p for pb_ in pb], + do_sample=[pb_.do_sample for pb_ in pb], + seeds=[pb_.seed for pb_ in pb], + device=device, + dtype=dtype, + tokenizer=tokenizer, + grammars=[pb_.grammar for pb_ in pb], + grammar_types=[pb_.grammar_type for pb_ in pb], + fsm_grammar_states=( + fsm_grammar_states if fsm_grammar_states else [0] * len(pb) + ), + quantization_enabled=quantization_enabled, + ) + + +def pad_next_token_chooser_parameters( + parameters: List[generate_pb2.NextTokenChooserParameters], + expected_size: int, +) -> List[generate_pb2.NextTokenChooserParameters]: + # disable all logits processors to minimize padding overhead + empty_parameters = generate_pb2.NextTokenChooserParameters( + temperature=1.0, + top_k=0, + top_p=1.0, + typical_p=1.0, + do_sample=False, + seed=0, + repetition_penalty=1.0, + frequency_penalty=0.0, + watermark=False, + grammar="", + grammar_type=0, + ) + parameters.extend([empty_parameters] * (expected_size - len(parameters))) + return parameters + + +class Sampling: + def __init__(self, seed: int, device: str = "cpu"): + self.generator = torch.Generator("cpu") + self.generator.manual_seed(seed) + self.seed = seed + + def __call__(self, logits): + probs = torch.nn.functional.softmax(logits, -1) + # Avoid GPU<->CPU sync done by torch multinomial + # See: https://github.com/pytorch/pytorch/blob/925a3788ec5c06db62ca732a0e9425a26a00916f/aten/src/ATen/native/Distributions.cpp#L631-L637 + q = torch.empty_like(probs).exponential_(1, generator=self.generator) + return probs.div_(q).argmax() + + +class Greedy: + def __call__(self, logits): + return logits.argmax(dim=-1) + + +class HeterogeneousSampling: + r""" + Mixed greedy and probabilistic sampling. Compute both and pick the right one for each sample. 
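+
+    A minimal usage sketch (the seeds and batch composition are hypothetical):
+
+        chooser = HeterogeneousSampling(
+            do_sample=[True, False], seeds=[42, 0], device=torch.device("cpu")
+        )
+        logits = torch.randn(2, 8)
+        next_ids = chooser(logits)  # row 0 is sampled with seed 42, row 1 is plain argmax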
+ """ + + def __init__(self, do_sample: List[bool], seeds: List[int], device: torch.device): + self.seeds = seeds + + self.greedy_indices = [] + self.sampling_mapping = {} + for i, (sample, seed) in enumerate(zip(do_sample, seeds)): + if sample: + self.sampling_mapping[i] = Sampling(seed, device) + else: + self.greedy_indices.append(i) + + self.greedy = Greedy() + + def __call__(self, logits): + out = torch.zeros(logits.shape[0], dtype=torch.int64, device=logits.device) + if self.greedy_indices: + # Computing for all indices is faster than slicing + torch.argmax(logits, -1, out=out) + + for i, sampling in self.sampling_mapping.items(): + out[i] = sampling(logits[i]) + return out + + def filter(self, indices): + new_greedy_indices = [] + new_sampling_mapping = {} + for i, idx in enumerate(indices): + if idx in self.sampling_mapping: + new_sampling_mapping[i] = self.sampling_mapping[idx] + else: + new_greedy_indices.append(i) + + self.greedy_indices = new_greedy_indices + self.sampling_mapping = new_sampling_mapping + return self + + +def batch_top_tokens( + top_n_tokens: List[int], + top_n_tokens_tensor: torch.Tensor, + logprobs: torch.Tensor, + accepted_ids: torch.Tensor, +) -> Tuple[List[List[List[int]]], List[List[List[float]]]]: + """Find the top n most likely tokens for a batch of generations. + + When multiple tokens have equal probabilities and they don't all fit, the + remaining tokens are also returned. + """ + max_top_n = max(top_n_tokens) + # Early exit when top_n_tokens is not used + if max_top_n == 0: + return [[[]]] * len(top_n_tokens), [[[]]] * len(top_n_tokens) + + batch_size = accepted_ids.shape[0] + speculate_size = logprobs.shape[0] // batch_size + top_n_tokens_tensor = top_n_tokens_tensor.repeat_interleave(speculate_size) + # Ensure top_n doesn't exceed vocab size + top_n_tokens = [ + min(tok, logprobs.size(-1)) + for tok in top_n_tokens + for _ in range(speculate_size) + ] + + # Parallel kthvalue adapted from https://discuss.pytorch.org/t/how-to-efficiently-get-the-k-th-largest-values-in-parallel/160529/2 + # Sorted topk is faster than torch.sort() since we only need a small subset + sorted_top_k = torch.topk(logprobs, k=max_top_n, dim=-1, sorted=True).values + + nth_highest = torch.gather( + sorted_top_k, 1, (top_n_tokens_tensor - 1).clip(min=0).unsqueeze(1) + ) + nth_highest[nth_highest == -float("inf")] = torch.finfo(logprobs.dtype).min + + # Find the new "fuzzy" top n values + top_n_indices = (logprobs >= nth_highest).nonzero() + _, top_n_ishes = torch.unique_consecutive(top_n_indices[:, 0], return_counts=True) + + k = 1 if top_n_ishes.numel() == 0 else top_n_ishes.max() + # Take a new topk for these new max n values + top_k = torch.topk(logprobs, k=k, dim=1, sorted=True) + + top_n_ishes = top_n_ishes.tolist() + top_indices = top_k.indices.tolist() + top_values = top_k.values.tolist() + + batch_top_token_ids = [] + batch_top_token_logprobs = [] + accepted_ids_list = accepted_ids.tolist() + for i, n_accepted_ids in enumerate(accepted_ids_list): + start = speculate_size * i + stop = speculate_size * (i + 1) + _top_indices = top_indices[start:stop] + _top_values = top_values[start:stop] + _top_n_ishes = top_n_ishes[start:stop] + _top_n_tokens = top_n_tokens[start:stop] + + _top_indices = _top_indices[:n_accepted_ids] + _top_values = _top_values[:n_accepted_ids] + _top_n_ishes = _top_n_ishes[:n_accepted_ids] + _top_n_tokens = _top_n_tokens[:n_accepted_ids] + + row_top_token_ids = [] + row_top_token_logprobs = [] + + for idxs, vals, n, req_n in zip( + _top_indices, 
_top_values, _top_n_ishes, _top_n_tokens + ): + indices = idxs[:n] if req_n > 0 else [] + values = vals[:n] if req_n > 0 else [] + + row_top_token_ids.append(indices) + row_top_token_logprobs.append(values) + + batch_top_token_ids.append(row_top_token_ids) + batch_top_token_logprobs.append(row_top_token_logprobs) + + return batch_top_token_ids, batch_top_token_logprobs + + +def make_tokenizer_optional(tokenizer): + class _(type(tokenizer)): + def __call__( + self, + text, + return_tensors, + padding, + return_token_type_ids, + truncation, + max_length, + ): + assert ( + return_tensors == "pt" + ), "inccorrect input arguments when calling TransparentTokenizer" + assert ( + padding == "max_length" or padding == "longest" + ), "inccorrect input arguments when calling TransparentTokenizer" + assert ( + not return_token_type_ids + ), "inccorrect input arguments when calling TransparentTokenizer" + assert ( + truncation + ), "inccorrect input arguments when calling TransparentTokenizer" + + def str_token_to_int(i): + if i == "?": + return tokenizer.pad_token_id + else: + return int(i) + + all_tokens = [ + [str_token_to_int(i.strip()) for i in inner_text.split(",")] + for inner_text in text + ] + if padding == "longest": + max_length = max(len(tokens) for tokens in all_tokens) + return { + "input_ids": torch.tensor( + [ + [tokenizer.pad_token_id] * (max_length - len(tokens)) + tokens + for tokens in all_tokens + ] + ), + "attention_mask": torch.tensor( + [ + [0] * (max_length - len(tokens)) + [1] * len(tokens) + for tokens in all_tokens + ] + ), + } + + def decode( + self, + token_ids, + skip_special_tokens: bool = False, + clean_up_tokenization_spaces: bool = None, + **kwargs, + ) -> str: + # I don't think this method is used anywhere and should be removed when doing refactoring + return ",".join(str(i) for i in to_py_obj(token_ids)) # noqa: F821 + + if os.getenv("SKIP_TOKENIZER_IN_TGI", "false").lower() == "true": + tokenizer.__class__ = _ + tokenizer.is_transparent = True + + +def is_tokenizer_transparent(tokenizer): + return hasattr(tokenizer, "is_transparent") and tokenizer.is_transparent is True diff --git a/backends/gaudi/server/text_generation_server/utils/version.py b/backends/gaudi/server/text_generation_server/utils/version.py new file mode 100644 index 000000000..f54b6ae8f --- /dev/null +++ b/backends/gaudi/server/text_generation_server/utils/version.py @@ -0,0 +1,12 @@ +from optimum.habana.utils import get_driver_version +from packaging.version import Version + +MIN_TGI_GAUDI_SYNAPSE_VERSION = Version("1.19.0") + + +def is_driver_compatible(): + driver_version = get_driver_version() + if driver_version is not None: + if driver_version < MIN_TGI_GAUDI_SYNAPSE_VERSION: + return False + return True diff --git a/backends/gaudi/server/text_generation_server/utils/watermark.py b/backends/gaudi/server/text_generation_server/utils/watermark.py new file mode 100644 index 000000000..5092b076c --- /dev/null +++ b/backends/gaudi/server/text_generation_server/utils/watermark.py @@ -0,0 +1,98 @@ +# coding=utf-8 +# Copyright 2023 Authors of "A Watermark for Large Language Models" +# available at https://arxiv.org/abs/2301.10226 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os + +import torch +from transformers import LogitsProcessor +from typing import List, Union + +GAMMA = float(os.getenv("WATERMARK_GAMMA", 0.5)) +DELTA = float(os.getenv("WATERMARK_DELTA", 2.0)) + + +class WatermarkLogitsProcessor(LogitsProcessor): + def __init__( + self, + gamma: float = GAMMA, + delta: float = DELTA, + hash_key: int = 15485863, # just a large prime number to create a rng seed with sufficient bit width + device: str = "cpu", + ): + # watermarking parameters + self.gamma = gamma + self.delta = delta + self.rng = torch.Generator(device="cpu") + self.hash_key = hash_key + + def _seed_rng(self, input_ids: Union[List[int], torch.LongTensor]): + if isinstance(input_ids, list): + assert ( + len(input_ids) >= 1 + ), "requires at least a 1 token prefix sequence to seed rng" + prev_token = input_ids[-1] + else: + assert len(input_ids) == 1 + input_ids = input_ids[0] + assert ( + input_ids.shape[-1] >= 1 + ), "requires at least a 1 token prefix sequence to seed rng" + prev_token = input_ids[-1].item() + self.rng.manual_seed(self.hash_key * prev_token) + + def _get_greenlist_ids( + self, + input_ids: Union[List[int], torch.LongTensor], + max_value: int, + device: torch.device, + ) -> List[int]: + # seed the rng using the previous tokens/prefix + self._seed_rng(input_ids) + + greenlist_size = int(max_value * self.gamma) + vocab_permutation = torch.randperm(max_value, device=device, generator=self.rng) + greenlist_ids = vocab_permutation[:greenlist_size] + return greenlist_ids + + @staticmethod + def _calc_greenlist_mask( + scores: torch.FloatTensor, greenlist_token_ids + ) -> torch.BoolTensor: + green_tokens_mask = torch.zeros_like(scores) + green_tokens_mask[-1, greenlist_token_ids] = 1 + final_mask = green_tokens_mask.bool() + return final_mask + + @staticmethod + def _bias_greenlist_logits( + scores: torch.Tensor, greenlist_mask: torch.Tensor, greenlist_bias: float + ) -> torch.Tensor: + scores[greenlist_mask] = scores[greenlist_mask] + greenlist_bias + return scores + + def __call__( + self, input_ids: Union[List[int], torch.LongTensor], scores: torch.FloatTensor + ) -> torch.FloatTensor: + greenlist_ids = self._get_greenlist_ids( + input_ids, scores.shape[-1], scores.device + ) + green_tokens_mask = self._calc_greenlist_mask( + scores=scores, greenlist_token_ids=greenlist_ids + ) + + scores = self._bias_greenlist_logits( + scores=scores, greenlist_mask=green_tokens_mask, greenlist_bias=self.delta + ) + return scores diff --git a/backends/gaudi/server/text_generation_server/utils/weights.py b/backends/gaudi/server/text_generation_server/utils/weights.py new file mode 100644 index 000000000..75e01f7ce --- /dev/null +++ b/backends/gaudi/server/text_generation_server/utils/weights.py @@ -0,0 +1,437 @@ +import torch + +from abc import ABC, abstractmethod +from contextlib import contextmanager +from pathlib import Path +from typing import Dict, List, Optional, Union, Type +from safetensors import safe_open +from dataclasses import dataclass + +from text_generation_server.utils.import_utils import SYSTEM + + +class WeightsLoader(ABC): + """ + Instances of this type 
implement higher-level weight loading. + + At a low-level, every weight is stored in the Safetensors format. + The interpretation of weights may be different however, for instance + could be packed, quantized weights. Loaders are responsible for + interpreting the raw tensors, sharding tensors in a manner compatible + with the format, etc. + """ + + @abstractmethod + def get_weights(self, weights: "Weights", prefix: str): + """ + Get weights at the given prefix and apply without tensor paralllism. + """ + ... + + @abstractmethod + def get_weights_col_packed( + self, + weights: "Weights", + prefix: str, + block_sizes: Union[int, List[int]], + ): + """ + Get the packed weights at the given prefix with column-splitting for + tensor parallelism. This method should be used when multiple different + weights are packed into a tensor, for instance, query/key/value + weights or a gate/up projection. + + The `block_sizes` determines the proportions of the packed tensors. + The columns are split in equally sized blocks when `block_sizes` is an + `int`, or in blocks proportional given to the sizes. For instance + `[2, 1, 1]` will divide an input with dimensionality `1024` in + `[512, 256, 256]`. + """ + ... + + def get_weights_col(self, weights: "Weights", prefix: str): + """ + Get weights at the given prefix and apply column-splitting for tensor + paralllism. + """ + return weights.get_multi_weights_col([prefix], 0) + + @abstractmethod + def get_multi_weights_col(self, weights: "Weights", prefixes: List[str], dim: int): + """ + Get the weights at the given prefixes, column-split them for tensor + parallelim, and then concatenate the weights along the given dimension. + """ + ... + + @abstractmethod + def get_weights_row(self, weights: "Weights", prefix: str): + """ + Get the weights at the given prefix and apply row-splitting for tensor + parallism. + """ + ... + + +class Weight(ABC): + """Instances of this type implement unquantized/quantized/to-be + quantized weights.""" + + @abstractmethod + def get_linear(self, bias: torch.Tensor): + """Create a linear layer from this weight.""" + ... + + +@dataclass +class UnquantizedWeight(Weight): + weight: torch.Tensor + + def get_linear(self, bias: torch.Tensor): + from text_generation_server.layers.linear import FastLinear, FastLinearROCm + + if SYSTEM == "rocm": + return FastLinearROCm(self.weight, bias) + else: + return FastLinear(self.weight, bias) + + +class DefaultWeightsLoader(WeightsLoader): + """Weight loader that loads (unquantized) Torch tensors.""" + + def __init__(self, weight_class: Type[UnquantizedWeight]): + """Create a loader. Weights will be wrapped using the given `weights_class`, + normally this will be `UnquantizedWeight`, but a quantizer-specific class + such as `Fp8Weight` can be used to quantize the weights during loading. + """ + self.weight_class = weight_class + + """ + Loader that uses tensors as-is with the exception of applying sharding + and/or concatenation. 
+ """ + + def get_weights(self, weights: "Weights", prefix: str): + return weights.get_tensor(f"{prefix}.weight") + + def get_weights_col_packed( + self, + weights: "Weights", + prefix: str, + block_sizes: Union[int, List[int]], + ): + return self.weight_class( + weights.get_packed_sharded( + f"{prefix}.weight", dim=0, block_sizes=block_sizes + ), + ) + + def get_multi_weights_col(self, weights: "Weights", prefixes: List[str], dim: int): + w = [weights.get_sharded(f"{p}.weight", dim=0) for p in prefixes] + return self.weight_class(torch.cat(w, dim=dim)) + + def get_weights_row(self, weights: "Weights", prefix: str): + return self.weight_class( + weights.get_sharded(f"{prefix}.weight", dim=1), + ) + + +class Weights: + def __init__( + self, + filenames: List[Path], + device, + dtype, + process_group, + weights_loader: WeightsLoader, + aliases: Optional[Dict[str, List[str]]] = None, + prefix: Optional[str] = None, + ): + routing = {} + for filename in filenames: + with safe_open(filename, framework="pytorch") as f: + for k in f.keys(): + if k in routing: + raise RuntimeError( + f"Key {k} was found in multiple files: {filename} and {routing[k]}" + ) + routing[k] = filename + if aliases is None: + aliases = {} + self.aliases = aliases + self.routing = routing + self.device = device + self.dtype = dtype + self.process_group = process_group + self.prefix = prefix + self.weights_loader = weights_loader + self._handles = {} + + def _get_handle(self, filename): + if filename not in self._handles: + f = safe_open(filename, framework="pytorch") + self._handles[filename] = f + + return self._handles[filename] + + def get_filename(self, tensor_name: str) -> (str, str): + names = [tensor_name] + if self.prefix is not None: + prefixed = f"{self.prefix}.{tensor_name}" + names.append(prefixed) + for name in names: + filename = self.routing.get(name, None) + if filename is not None: + return str(filename), name + + aliases = self.aliases.get(name, []) + for alias in aliases: + filename = self.routing.get(alias, None) + if filename is not None: + return str(filename), alias + raise RuntimeError(f"weight {tensor_name} does not exist") + + def _get_slice(self, tensor_name: str): + filename, tensor_name = self.get_filename(tensor_name) + f = self._get_handle(filename) + slice_ = f.get_slice(tensor_name) + return slice_ + + def _has_tensor(self, tensor_name: str): + try: + self.get_filename(tensor_name) + except Exception: + return False + return True + + def get_shape(self, tensor_name: str): + return self._get_slice(tensor_name).get_shape() + + def get_tensor(self, tensor_name: str, to_device=True, to_dtype=True): + filename, tensor_name = self.get_filename(tensor_name) + f = self._get_handle(filename) + tensor = f.get_tensor(tensor_name) + # Special case for gptq which shouldn't convert + # u4 which are disguised as int32. Exl2 uses int16 + # as well. 
FP8 uses torch.float8_e4m3fn + if ( + tensor.dtype + not in [ + torch.float8_e4m3fn, + torch.int16, + torch.int32, + torch.int64, + ] + and to_dtype + ): + tensor = tensor.to(dtype=self.dtype) + if to_device: + tensor = tensor.to(device=self.device) + return tensor + + def get_partial_sharded( + self, tensor_name: str, dim: int, to_device=True, to_dtype=True + ): + filename, tensor_name = self.get_filename(tensor_name) + f = self._get_handle(filename) + slice_ = f.get_slice(tensor_name) + world_size = self.process_group.size() + rank = self.process_group.rank() + + size = slice_.get_shape()[dim] + block_size = (size + world_size - 1) // world_size + start = rank * block_size + stop = (rank + 1) * block_size + + if dim == 0: + tensor = slice_[start:stop] + elif dim == 1: + tensor = slice_[:, start:stop] + else: + raise NotImplementedError("Let's make that generic when needed") + # Special case for gptq which shouldn't convert + # u4 which are disguised as int32. exl2 uses int16. + # FP8 uses torch.float8_e4m3fn. + if ( + tensor.dtype not in (torch.float8_e4m3fn, torch.int16, torch.int32) + and to_dtype + ): + tensor = tensor.to(dtype=self.dtype) + if to_device: + tensor = tensor.to(device=self.device) + return tensor + + def get_sharded(self, tensor_name: str, dim: int, to_device=True, to_dtype=True): + filename, tensor_name = self.get_filename(tensor_name) + f = self._get_handle(filename) + slice_ = f.get_slice(tensor_name) + world_size = self.process_group.size() + size = slice_.get_shape()[dim] + assert ( + size % world_size == 0 + ), f"The choosen size {size} is not compatible with sharding on {world_size} shards" + return self.get_partial_sharded( + tensor_name, dim, to_device=to_device, to_dtype=to_dtype + ) + + def get_packed_sharded( + self, + tensor_name: str, + dim: int, + block_sizes: Union[int, List[int]], + to_dtype=True, + ) -> torch.Tensor: + """ + Get a shard from a tensor that packs multiple tensors. + + When a tensor packs multiple tensors (such as QKV or an up + projection + gate projection), sharding with `get_sharded` is not + safe since it would not split the packed tensors across shards. + + This method shards a tensor, such that the packed tensors are + split across shards. + + The columns are split in equally sized blocks when blocks is an `int`, or + in blocks proportional given to the sizes. For instance `[2, 1, 1]` will + divide an input with dimensionality `1024` in `[512, 256, 256]`. This is + convenient for e.g. splitting QKV without knowing the storage details of + quantized weights. + """ + slice_ = self._get_slice(tensor_name) + total_size = slice_.get_shape()[dim] + block_sizes = _blocks_to_block_sizes(total_size=total_size, blocks=block_sizes) + + world_size = self.process_group.size() + rank = self.process_group.rank() + + tensors = [] + block_offset = 0 + for block_size in block_sizes: + assert ( + block_size % world_size == 0 + ), f"Prepacked tensor cannot be sharded across {world_size} shards" + shard_block_size = block_size // world_size + start = rank * shard_block_size + stop = (rank + 1) * shard_block_size + if dim == 0: + tensor = slice_[block_offset + start : block_offset + stop] + elif dim == 1: + tensor = slice_[:, block_offset + start : block_offset + stop] + else: + raise NotImplementedError("Currently only dim=0 or dim=1 is supported") + tensors.append(tensor) + block_offset += block_size + tensor = torch.cat(tensors, dim=dim) + tensor = tensor.to(device=self.device) + + # Avoid casting quantizer dtypes. 
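+        # As with get_tensor/get_partial_sharded above: int32 may hold packed
+        # GPTQ u4 values, int16 is used by exl2 and float8_e4m3fn holds FP8
+        # weights, so their bit patterns must be kept as-is instead of being
+        # cast to the model dtype.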
+ if ( + tensor.dtype + not in [ + torch.float8_e4m3fn, + torch.int16, + torch.int32, + torch.int64, + ] + and to_dtype + ): + tensor = tensor.to(dtype=self.dtype) + + return tensor + + def get_weights(self, prefix: str): + return self.weights_loader.get_weights(self, prefix) + + def get_weights_col_packed_qkv( + self, + prefix: str, + num_heads: int, + num_key_value_heads: int, + ): + return self.get_weights_col_packed( + prefix, [num_heads, num_key_value_heads, num_key_value_heads] + ) + + def get_weights_col_packed_gate_up(self, prefix: str): + return self.get_weights_col_packed(prefix, 2) + + def get_weights_col_packed(self, prefix: str, block_sizes: Union[int, List[int]]): + """ + The columns are split in equally sized blocks when blocks is an `int`, or + in blocks proportional given to the sizes. For instance `[2, 1, 1]` will + divide an input with dimensionality `1024` in `[512, 256, 256]`. This is + convenient for e.g. splitting QKV without knowing the storage details of + quantized weights. + """ + return self.weights_loader.get_weights_col_packed(self, prefix, block_sizes) + + def get_weights_col(self, prefix: str): + return self.weights_loader.get_weights_col(self, prefix) + + def get_multi_weights_col(self, prefixes: List[str], dim: int): + return self.weights_loader.get_multi_weights_col(self, prefixes, dim) + + def get_tensor_shard(self, var, dim): + world_size = self.process_group.size() + rank = self.process_group.rank() + block_size = var.size()[dim] // world_size + start = rank * block_size + stop = (rank + 1) * block_size + if dim == 0: + tensor = var[start:stop] + elif dim == 1: + tensor = var[:, start:stop] + else: + raise NotImplementedError("Let's make that generic when needed") + tensor = tensor.to(dtype=self.dtype) + tensor = tensor.to(device=self.device) + return tensor + + def get_weights_row(self, prefix: str): + return self.weights_loader.get_weights_row(self, prefix) + + @contextmanager + def use_loader(self, weights_loader: WeightsLoader): + """ + This method is a context manager that can be used to use `Weights` with + a different loader for the duration of the context. + """ + + old_loader = self.weights_loader + self.weights_loader = weights_loader + try: + yield + finally: + self.weights_loader = old_loader + + @property + def loader(self): + return self.weights_loader + + +def _blocks_to_block_sizes(total_size: int, blocks: Union[int, List[int]]) -> List[int]: + """ + Convert block count or proportions to block sizes. + + This function accepts + + - The number of blocks (int), in which case the block size is + total_size//blocks; or + - A list of block sizes (List[int]). + + In the latter case, if sum(blocks) < total_size, the ratios between + the block sizes will be preserved. For instance, if blocks is + [2, 1, 1] and total_size is 1024, the returned block sizes are + [512, 256, 256]. 
+ """ + if isinstance(blocks, list): + total_blocks = sum(blocks) + assert ( + total_size % total_blocks == 0 + ), f"Cannot split {total_size} in proportional blocks: {blocks}" + part_size = total_size // total_blocks + return [part_size * block for block in blocks] + else: + assert total_size % blocks == 0, f"Prepacked is not divisible by {blocks}" + single_size = total_size // blocks + return [single_size] * blocks diff --git a/backends/gaudi/tgi-entrypoint.sh b/backends/gaudi/tgi-entrypoint.sh new file mode 100644 index 000000000..a5c3f5e1d --- /dev/null +++ b/backends/gaudi/tgi-entrypoint.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +ldconfig 2>/dev/null || echo 'unable to refresh ld cache, not a big deal in most cases' + +# Check if --sharded argument is present in the command line arguments +if [[ "$*" == *"--sharded true"* ]]; then + echo 'setting PT_HPU_ENABLE_LAZY_COLLECTIVES=1 for sharding' + export PT_HPU_ENABLE_LAZY_COLLECTIVES=1 +fi + +text-generation-launcher $@ diff --git a/launcher/src/env_runtime.rs b/launcher/src/env_runtime.rs index 08fb301c3..d7ae11d54 100644 --- a/launcher/src/env_runtime.rs +++ b/launcher/src/env_runtime.rs @@ -8,22 +8,29 @@ pub(crate) struct Env { docker_label: &'static str, nvidia_env: String, xpu_env: String, + hpu_env: String, } impl Env { pub fn new() -> Self { let nvidia_env = nvidia_smi(); let xpu_env = xpu_smi(); + let hpu_env = hl_smi(); Self { nvidia_env: nvidia_env.unwrap_or("N/A".to_string()), xpu_env: xpu_env.unwrap_or("N/A".to_string()), + hpu_env: hpu_env.unwrap_or("N/A".to_string()), cargo_target: env!("VERGEN_CARGO_TARGET_TRIPLE"), cargo_version: env!("VERGEN_RUSTC_SEMVER"), git_sha: option_env!("VERGEN_GIT_SHA").unwrap_or("N/A"), docker_label: option_env!("DOCKER_LABEL").unwrap_or("N/A"), } } + + pub fn is_hpu_device(&self) -> bool { + self.hpu_env != "N/A" + } } impl fmt::Display for Env { @@ -35,7 +42,8 @@ impl fmt::Display for Env { writeln!(f, "Commit sha: {}", self.git_sha)?; writeln!(f, "Docker label: {}", self.docker_label)?; writeln!(f, "nvidia-smi:\n{}", self.nvidia_env)?; - write!(f, "xpu-smi:\n{}", self.xpu_env)?; + writeln!(f, "xpu-smi:\n{}", self.xpu_env)?; + writeln!(f, "hpu-smi:\n{}", self.hpu_env)?; Ok(()) } @@ -54,3 +62,10 @@ fn xpu_smi() -> Option { let output = xpu_smi.replace('\n', "\n "); Some(output.trim().to_string()) } + +fn hl_smi() -> Option { + let output = Command::new("hl-smi").output().ok()?; + let hl_smi = String::from_utf8(output.stdout).ok()?; + let output = hl_smi.replace('\n', "\n "); + Some(output.trim().to_string()) +} diff --git a/launcher/src/main.rs b/launcher/src/main.rs index fbbe8a2d7..d5d1ba834 100644 --- a/launcher/src/main.rs +++ b/launcher/src/main.rs @@ -1531,6 +1531,11 @@ fn spawn_shards( ) -> Result<(), LauncherError> { // Start shard processes for rank in 0..num_shard { + if rank != 0 && env_runtime::Env::new().is_hpu_device() { + tracing::info!("Running on HPU, the launcher will not do any sharding as actual sharding is done in the server"); + break; + } + let model_id = args.model_id.clone(); let revision = args.revision.clone(); let uds_path = args.shard_uds_path.clone(); @@ -1605,6 +1610,10 @@ fn spawn_shards( if shard_ready == num_shard { break; } + if env_runtime::Env::new().is_hpu_device() { + tracing::info!("HPU detected, shard is ready"); + break; + } } Err(TryRecvError::Empty) => { sleep(Duration::from_millis(100)); From 1f35cc7a31d07451077e63b578829f16ef5240d0 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Tue, 4 Mar 2025 12:13:58 +0000 Subject: [PATCH 111/183] Updating patch 
rust release. --- rust-toolchain.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 25959e0e0..65f01fa52 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,5 +1,5 @@ [toolchain] # Released on: June 13, 2024 # https://releases.rs/docs/1.79.0/ -channel = "1.84.0" +channel = "1.84.1" components = ["rustfmt", "clippy"] From aad9c2b0bdb966d38c78ff18b9a657f831b6f4bb Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Tue, 4 Mar 2025 12:14:58 +0000 Subject: [PATCH 112/183] Patch rust release. --- .github/workflows/tests.yaml | 2 +- Dockerfile | 2 +- Dockerfile_amd | 2 +- Dockerfile_intel | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index f9ccb8420..812c260d4 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -31,7 +31,7 @@ jobs: with: # Released on: 02 May, 2024 # https://releases.rs/docs/1.78.0/ - toolchain: 1.84.0 + toolchain: 1.84.1 override: true components: rustfmt, clippy - name: Install Protoc diff --git a/Dockerfile b/Dockerfile index 2d769c153..483b3e216 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,5 @@ # Rust builder -FROM lukemathwalker/cargo-chef:latest-rust-1.84.0 AS chef +FROM lukemathwalker/cargo-chef:latest-rust-1.84.1 AS chef WORKDIR /usr/src ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse diff --git a/Dockerfile_amd b/Dockerfile_amd index f95f3b6d6..6c3b62af1 100644 --- a/Dockerfile_amd +++ b/Dockerfile_amd @@ -1,5 +1,5 @@ # Rust builder -FROM lukemathwalker/cargo-chef:latest-rust-1.84.0 AS chef +FROM lukemathwalker/cargo-chef:latest-rust-1.84.1 AS chef WORKDIR /usr/src ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse diff --git a/Dockerfile_intel b/Dockerfile_intel index 134a6f661..8d8474daa 100644 --- a/Dockerfile_intel +++ b/Dockerfile_intel @@ -1,6 +1,6 @@ ARG PLATFORM=xpu -FROM lukemathwalker/cargo-chef:latest-rust-1.84.0 AS chef +FROM lukemathwalker/cargo-chef:latest-rust-1.84.1 AS chef WORKDIR /usr/src ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse From a914a2189987ad75e6eedf5a04371d4eccc09e5b Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Tue, 4 Mar 2025 12:16:18 +0000 Subject: [PATCH 113/183] Revert "Patch rust release." This reverts commit aad9c2b0bdb966d38c78ff18b9a657f831b6f4bb. 
--- .github/workflows/tests.yaml | 2 +- Dockerfile | 2 +- Dockerfile_amd | 2 +- Dockerfile_intel | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index 812c260d4..f9ccb8420 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -31,7 +31,7 @@ jobs: with: # Released on: 02 May, 2024 # https://releases.rs/docs/1.78.0/ - toolchain: 1.84.1 + toolchain: 1.84.0 override: true components: rustfmt, clippy - name: Install Protoc diff --git a/Dockerfile b/Dockerfile index 483b3e216..2d769c153 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,5 @@ # Rust builder -FROM lukemathwalker/cargo-chef:latest-rust-1.84.1 AS chef +FROM lukemathwalker/cargo-chef:latest-rust-1.84.0 AS chef WORKDIR /usr/src ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse diff --git a/Dockerfile_amd b/Dockerfile_amd index 6c3b62af1..f95f3b6d6 100644 --- a/Dockerfile_amd +++ b/Dockerfile_amd @@ -1,5 +1,5 @@ # Rust builder -FROM lukemathwalker/cargo-chef:latest-rust-1.84.1 AS chef +FROM lukemathwalker/cargo-chef:latest-rust-1.84.0 AS chef WORKDIR /usr/src ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse diff --git a/Dockerfile_intel b/Dockerfile_intel index 8d8474daa..134a6f661 100644 --- a/Dockerfile_intel +++ b/Dockerfile_intel @@ -1,6 +1,6 @@ ARG PLATFORM=xpu -FROM lukemathwalker/cargo-chef:latest-rust-1.84.1 AS chef +FROM lukemathwalker/cargo-chef:latest-rust-1.84.0 AS chef WORKDIR /usr/src ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse From fa4e9511f8f9223f5a8fa194ae9917f3c3c4c28e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Tue, 4 Mar 2025 13:23:27 +0100 Subject: [PATCH 114/183] Fix two edge cases in `RadixTrie::find` (#3067) - Always return a node, not its parent. - Do not recurse when a node does not represent a full prefix of the input. --- backends/v3/src/radix.rs | 33 ++++++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/backends/v3/src/radix.rs b/backends/v3/src/radix.rs index 532ec6ddc..81ce61d1d 100644 --- a/backends/v3/src/radix.rs +++ b/backends/v3/src/radix.rs @@ -283,7 +283,7 @@ impl RadixTrie { } /// Find worker. - fn find_(&mut self, mut node_id: NodeId, key: &[u32], blocks: &mut Vec) -> NodeId { + fn find_(&mut self, node_id: NodeId, key: &[u32], blocks: &mut Vec) -> NodeId { let node = &self.nodes[node_id]; if key.len() >= self.block_size { @@ -295,9 +295,13 @@ impl RadixTrie { assert_eq!(shared_prefix_len % self.block_size, 0); blocks.extend(&child.blocks[..shared_prefix_len / self.block_size]); + // A node represents the prefix of its children. So, only + // recurse when there is a full prefix match. let key = &key[shared_prefix_len..]; - if !key.is_empty() { - node_id = self.find_(child_id, key, blocks); + if !key.is_empty() && shared_prefix_len == child.key.len() { + return self.find_(child_id, key, blocks); + } else { + return child_id; } } } @@ -873,4 +877,27 @@ mod tests { // Clear out the whole trie. assert_eq!(trie.evict(10), vec![1, 2, 3, 0, 1]); } + + #[test] + fn full_match_returns_correct_node() { + let mut trie = RadixTrie::new(1); + trie.insert(&[0, 1, 2], &[0, 1, 2]).unwrap(); + let node_id = trie.find(&[0, 1, 2], &mut vec![]); + // At this point, there are only two nodes: the root and the node + // with tokens 0, 1, 2. Looking up the exact prefix must return + // the non-root node. 
+ assert_ne!(node_id, trie.root); + } + + #[test] + fn partial_match_does_not_recurse() { + let mut trie = RadixTrie::new(1); + trie.insert(&[0, 1, 2], &[0, 1, 2]).unwrap(); + trie.insert(&[0, 1, 2, 3, 4, 5], &[0, 1, 2, 3, 4, 5]) + .unwrap(); + let mut blocks = Vec::new(); + let node_id = trie.find(&[0, 1, 3, 4, 5], &mut blocks); + assert_eq!(blocks, vec![0, 1]); + assert_eq!(node_id, trie.find(&[0, 1], &mut blocks)) + } } From e88f6f6ee978ba6ce5df3a999e7efe2717e8532e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Tue, 4 Mar 2025 15:09:46 +0100 Subject: [PATCH 115/183] Add property-based testing for `RadixAllocator` (#3068) --- Cargo.lock | 1 + backends/v3/Cargo.toml | 1 + backends/v3/src/radix.rs | 138 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 140 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index 4603f77d2..16f8c80de 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4847,6 +4847,7 @@ dependencies = [ "rand 0.8.5", "regex", "reqwest 0.11.27", + "rustc-hash 2.1.1", "serde", "serde_json", "slotmap", diff --git a/backends/v3/Cargo.toml b/backends/v3/Cargo.toml index 996290ed3..588a2716f 100644 --- a/backends/v3/Cargo.toml +++ b/backends/v3/Cargo.toml @@ -71,6 +71,7 @@ prost-build = "0.12.1" [dev-dependencies] criterion = "0.3" itertools = "0.13" +rustc-hash = "2" [features] default = ["ngrok"] diff --git a/backends/v3/src/radix.rs b/backends/v3/src/radix.rs index 81ce61d1d..aea69693f 100644 --- a/backends/v3/src/radix.rs +++ b/backends/v3/src/radix.rs @@ -635,6 +635,12 @@ fn shared_prefix(left: &[u32], right: &[u32], block_size: usize) -> usize { mod tests { use std::sync::Arc; + use rand::{ + distributions::Uniform, prelude::Distribution, rngs::SmallRng, seq::SliceRandom, + SeedableRng, + }; + use rustc_hash::FxHashSet; + use super::*; #[test] @@ -900,4 +906,136 @@ mod tests { assert_eq!(blocks, vec![0, 1]); assert_eq!(node_id, trie.find(&[0, 1], &mut blocks)) } + + struct AllocationWithInfo { + allocation: BlockAllocation, + // We are doing a lot of set operations and `FxBuildHasher` is + // muc faster for a set of integers. + blockset: FxHashSet, + non_prefix_blocks: FxHashSet, + } + + #[test] + fn invariants_hold_on_many_operations_remove_all() { + invariants_hold_on_many_insertions(true); + } + + #[test] + fn invariants_hold_on_many_operations_remove_subset() { + invariants_hold_on_many_insertions(false); + } + + fn invariants_hold_on_many_insertions(remove_all: bool) { + // Small vocabulary sizes lead to violations more quickly due to + // prefix sharing, etc. + const VOCAB_SIZE: u32 = 2; + const DATA_LEN: usize = 1_000; + + const MAX_PREFILL_LEN: usize = 8; + const MAX_DECODE_LEN: usize = 8; + + let vocab_range = Uniform::new(0, VOCAB_SIZE); + let data_range = Uniform::new(0, DATA_LEN); + let prefill_len_range = Uniform::new(0, MAX_PREFILL_LEN); + let decode_len_range = Uniform::new(0, MAX_DECODE_LEN); + + let mut rng = SmallRng::seed_from_u64(64); + let data = (0..DATA_LEN) + .map(|_| vocab_range.sample(&mut rng)) + .collect::>(); + let mut allocator = RadixAllocator::new(1, 100, None); + + let mut allocations = Vec::new(); + + for i in 0..100_000 { + // Allocate until all blocks are used. + 'allocation: loop { + // Use offset 0 half of the times for prefix sharing. 
+ let prefill_offset = data_range.sample(&mut rng); + let prefill_len = prefill_len_range.sample(&mut rng); + let decode_len = decode_len_range.sample(&mut rng); + + let prefill = + data[prefill_offset..data.len().min(prefill_offset + prefill_len)].to_vec(); + + let allocation = match allocator + .allocate((prefill.len() + decode_len) as u32, Some(Arc::new(prefill))) + { + Some(allocation) => allocation, + None => break 'allocation, + }; + let non_prefix_blocks = allocation.blocks[allocation.prefix_len as usize..] + .iter() + .copied() + .collect::>(); + let blockset = allocation.blocks.iter().copied().collect::>(); + + // No duplicate blocks in an allocation. + assert_eq!( + allocation.blocks.len(), + blockset.len(), + "Duplicate blocks in allocation" + ); + + allocations.push(AllocationWithInfo { + allocation, + blockset, + non_prefix_blocks, + }); + } + + // Check invariants. Skip first iteration, since there is no prefix sharing yet. + if i > 1 { + check_allocation_invariants(&allocations); + } + + // Remove 20% of the allocations, randomly. + if remove_all { + allocations.into_iter().for_each(|allocation| { + allocator.free( + allocation.allocation.blocks.clone(), + allocation.allocation.allocation_id, + ) + }); + allocations = Vec::new(); + } else { + allocations.shuffle(&mut rng); + let remove_index = (allocations.len() as f64 * 0.8) as usize; + for allocation in allocations.drain(remove_index..) { + allocator.free( + allocation.allocation.blocks.clone(), + allocation.allocation.allocation_id, + ); + } + } + } + } + + fn check_allocation_invariants(allocations: &[AllocationWithInfo]) { + for i in 0..allocations.len() { + let allocation = &allocations[i]; + + // 0 is used for health checks, must not be used. + assert!( + !allocation.blockset.contains(&0), + "Block 0 must not be allocated" + ); + + // No duplicate blocks in an allocation. + assert_eq!( + allocation.allocation.blocks.len(), + allocation.blockset.len(), + "Duplicate blocks in allocation" + ); + + for other_allocation in &allocations[i + 1..] { + assert!( + other_allocation + .non_prefix_blocks + .is_disjoint(&allocation.non_prefix_blocks), + "Allocations share non-prefix blocks" + ) + } + } + } } From d8ff7f26234cce6ea2ca6da12daa67b1a640cbc8 Mon Sep 17 00:00:00 2001 From: Hugo Larcher Date: Tue, 4 Mar 2025 16:43:50 +0100 Subject: [PATCH 116/183] feat: add support for HF_HUB_USER_AGENT_ORIGIN to add user-agent Origin field in Hub requests. (#3061) * feat: add support for HF_HUB_USER_AGENT_ORIGIN to add user-agent Origin field in Hub requests. 
* fix: Rust version for Neuron * fix: PR comments, use rust-toolchain.toml --- Cargo.lock | 42 ++++++++++++++++++------------------- Cargo.toml | 2 +- Dockerfile.neuron | 3 ++- backends/neuron/Cargo.toml | 2 +- backends/trtllm/src/main.rs | 4 ++++ launcher/Cargo.toml | 2 +- launcher/src/main.rs | 15 ++++++------- router/src/server.rs | 4 ++++ server/pyproject.toml | 3 ++- server/uv.lock | 8 ++++--- 10 files changed, 48 insertions(+), 37 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 16f8c80de..b85cc5163 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1007,9 +1007,9 @@ dependencies = [ [[package]] name = "cxx" -version = "1.0.140" +version = "1.0.141" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc49567e08c72902f4cbc7242ee8d874ec9cbe97fbabf77b4e0e1f447513e13a" +checksum = "8bc580dceb395cae0efdde0a88f034cfd8a276897e40c693a7b87bed17971d33" dependencies = [ "cc", "cxxbridge-cmd", @@ -1021,9 +1021,9 @@ dependencies = [ [[package]] name = "cxx-build" -version = "1.0.140" +version = "1.0.141" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fe46b5309c99e9775e7a338c98e4097455f52db5b684fd793ca22848fde6e371" +checksum = "49d8c1baedad72a7efda12ad8d7ad687b3e7221dfb304a12443fd69e9de8bb30" dependencies = [ "cc", "codespan-reporting", @@ -1035,9 +1035,9 @@ dependencies = [ [[package]] name = "cxxbridge-cmd" -version = "1.0.140" +version = "1.0.141" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4315c4ce8d23c26d87f2f83698725fd5718d8e6ace4a9093da2664d23294d372" +checksum = "e43afb0e3b2ef293492a31ecd796af902112460d53e5f923f7804f348a769f9c" dependencies = [ "clap 4.5.30", "codespan-reporting", @@ -1048,15 +1048,15 @@ dependencies = [ [[package]] name = "cxxbridge-flags" -version = "1.0.140" +version = "1.0.141" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f55d69deb3a92f610a60ecc524a72c7374b6dc822f8fb7bb4e5d9473f10530c4" +checksum = "0257ad2096a2474fe877e9e055ab69603851c3d6b394efcc7e0443899c2492ce" [[package]] name = "cxxbridge-macro" -version = "1.0.140" +version = "1.0.141" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5bee7a1d9b5091462002c2b8de2a4ed0f0fde011d503cc272633f66075bd5141" +checksum = "b46cbd7358a46b760609f1cb5093683328e58ca50e594a308716f5403fdc03e5" dependencies = [ "proc-macro2", "quote", @@ -1660,30 +1660,27 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2b780635574b3d92f036890d8373433d6f9fc7abb320ee42a5c25897fc8ed732" dependencies = [ "dirs", - "futures", "indicatif", "log", "native-tls", - "num_cpus", "rand 0.8.5", - "reqwest 0.11.27", "serde", "serde_json", "thiserror 1.0.69", - "tokio", "ureq", ] [[package]] name = "hf-hub" -version = "0.4.1" +version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "112fa2f6ad4ab815b9e1b938b4b1e437032d055e2f92ed10fd6ab2e62d02c6b6" +checksum = "cc03dcb0b0a83ae3f3363ec811014ae669f083e4e499c66602f447c4828737a1" dependencies = [ "dirs", "futures", "http 1.2.0", "indicatif", + "libc", "log", "native-tls", "num_cpus", @@ -1694,6 +1691,7 @@ dependencies = [ "thiserror 2.0.11", "tokio", "ureq", + "windows-sys 0.59.0", ] [[package]] @@ -4627,7 +4625,7 @@ dependencies = [ "cxx", "cxx-build", "hashbrown 0.15.2", - "hf-hub 0.3.2", + "hf-hub 0.4.2", "pkg-config", "pyo3", "text-generation-router", @@ -4645,7 +4643,7 @@ dependencies = [ "average", "clap 4.5.30", "float-ord", - "hf-hub 0.3.2", + "hf-hub 0.4.2", "ratatui", "serde", 
"serde_json", @@ -4683,7 +4681,7 @@ dependencies = [ "clap 4.5.30", "ctrlc", "float_eq", - "hf-hub 0.4.1", + "hf-hub 0.4.2", "nix 0.28.0", "once_cell", "pyo3", @@ -4712,7 +4710,7 @@ dependencies = [ "csv", "futures", "futures-util", - "hf-hub 0.3.2", + "hf-hub 0.4.2", "image", "init-tracing-opentelemetry", "itertools 0.10.5", @@ -4779,7 +4777,7 @@ dependencies = [ "futures", "futures-util", "grpc-metadata", - "hf-hub 0.3.2", + "hf-hub 0.4.2", "image", "init-tracing-opentelemetry", "jsonschema", @@ -4829,7 +4827,7 @@ dependencies = [ "futures", "futures-util", "grpc-metadata", - "hf-hub 0.3.2", + "hf-hub 0.4.2", "image", "init-tracing-opentelemetry", "itertools 0.13.0", diff --git a/Cargo.toml b/Cargo.toml index df7f2a73e..72075940d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -29,7 +29,7 @@ homepage = "https://github.com/huggingface/text-generation-inference" [workspace.dependencies] base64 = "0.22.0" tokenizers = { version = "0.20.0", features = ["http"] } -hf-hub = { version = "0.3.1", features = ["tokio"] } +hf-hub = { version = "0.4.1", features = ["tokio"] } metrics = { version = "0.23.0" } metrics-exporter-prometheus = { version = "0.15.1", features = [] } minijinja = { version = "2.2.0", features = ["json"] } diff --git a/Dockerfile.neuron b/Dockerfile.neuron index c7c4af68b..272407c76 100644 --- a/Dockerfile.neuron +++ b/Dockerfile.neuron @@ -18,7 +18,8 @@ RUN apt-get update -y \ && rm -rf /var/lib/apt/lists/* \ && apt-get clean -RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- --default-toolchain 1.80.1 --profile minimal -y +COPY rust-toolchain.toml rust-toolchain.toml +RUN curl -sSf https://sh.rustup.rs | sh -s -- -y --no-modify-path --default-toolchain none ENV PATH="/root/.cargo/bin:${PATH}" RUN cargo install cargo-chef --locked diff --git a/backends/neuron/Cargo.toml b/backends/neuron/Cargo.toml index 3b237eda6..72f92e69c 100644 --- a/backends/neuron/Cargo.toml +++ b/backends/neuron/Cargo.toml @@ -22,7 +22,7 @@ homepage = "https://github.com/huggingface/text-generation-inference" [workspace.dependencies] base64 = "0.22.0" tokenizers = { version = "0.20.0", features = ["http"] } -hf-hub = { version = "0.3.1", features = ["tokio"] } +hf-hub = { version = "0.4.2", features = ["tokio"] } metrics = { version = "0.23.0" } metrics-exporter-prometheus = { version = "0.15.1", features = [] } minijinja = { version = "2.2.0", features = ["json"] } diff --git a/backends/trtllm/src/main.rs b/backends/trtllm/src/main.rs index cef225be1..9d4bf8f21 100644 --- a/backends/trtllm/src/main.rs +++ b/backends/trtllm/src/main.rs @@ -86,6 +86,10 @@ async fn get_tokenizer(tokenizer_name: &str, revision: Option<&str>) -> Option=0.2.0", "tokenizers>=0.20.3", "typer>=0.15.1", - "transformers>=4.48.0" + "transformers>=4.48.0", + "huggingface-hub>=0.29.0", ] [build-system] diff --git a/server/uv.lock b/server/uv.lock index bbb2c9d81..fecccecf3 100644 --- a/server/uv.lock +++ b/server/uv.lock @@ -720,7 +720,7 @@ wheels = [ [[package]] name = "huggingface-hub" -version = "0.28.1" +version = "0.29.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "filelock" }, @@ -731,9 +731,9 @@ dependencies = [ { name = "tqdm" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/e7/ce/a734204aaae6c35a22f9956ebcd8d8708ae5b842e15d6f42bd6f49e634a4/huggingface_hub-0.28.1.tar.gz", hash = "sha256:893471090c98e3b6efbdfdacafe4052b20b84d59866fb6f54c33d9af18c303ae", size = 387074 } +sdist = { url = 
"https://files.pythonhosted.org/packages/22/37/797d6476f13e5ef6af5fc48a5d641d32b39c37e166ccf40c3714c5854a85/huggingface_hub-0.29.1.tar.gz", hash = "sha256:9524eae42077b8ff4fc459ceb7a514eca1c1232b775276b009709fe2a084f250", size = 389776 } wheels = [ - { url = "https://files.pythonhosted.org/packages/ea/da/6c2bea5327b640920267d3bf2c9fc114cfbd0a5de234d81cda80cc9e33c8/huggingface_hub-0.28.1-py3-none-any.whl", hash = "sha256:aa6b9a3ffdae939b72c464dbb0d7f99f56e649b55c3d52406f49e0a5a620c0a7", size = 464068 }, + { url = "https://files.pythonhosted.org/packages/ae/05/75b90de9093de0aadafc868bb2fa7c57651fd8f45384adf39bd77f63980d/huggingface_hub-0.29.1-py3-none-any.whl", hash = "sha256:352f69caf16566c7b6de84b54a822f6238e17ddd8ae3da4f8f2272aea5b198d5", size = 468049 }, ] [[package]] @@ -2563,6 +2563,7 @@ dependencies = [ { name = "grpcio-status" }, { name = "hf-kernels" }, { name = "hf-transfer" }, + { name = "huggingface-hub" }, { name = "loguru" }, { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, { name = "numpy", version = "2.2.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, @@ -2627,6 +2628,7 @@ requires-dist = [ { name = "grpcio-tools", marker = "extra == 'gen'", specifier = ">=1.69.0" }, { name = "hf-kernels", specifier = ">=0.1.5" }, { name = "hf-transfer", specifier = ">=0.1.8" }, + { name = "huggingface-hub", specifier = ">=0.29.0" }, { name = "loguru", specifier = ">=0.7.3" }, { name = "mypy-protobuf", marker = "extra == 'gen'", specifier = ">=3.6.0" }, { name = "numpy", specifier = ">=1.26,<3" }, From 08bbfa16a15befeddc48a60594ffe54438593a7a Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Tue, 4 Mar 2025 16:47:10 +0100 Subject: [PATCH 117/183] Preparing for release. (#3060) * Preparing for release. * Upgrade doc. * Fix docs auto-generated. * Fix update doc along. 
--- Cargo.lock | 16 ++++++++-------- Cargo.toml | 2 +- README.md | 6 +++--- docs/openapi.json | 2 +- docs/source/backends/neuron.md | 2 +- .../source/basic_tutorials/gated_model_access.md | 2 +- docs/source/conceptual/quantization.md | 6 +++--- docs/source/installation_amd.md | 2 +- docs/source/installation_intel.md | 4 ++-- docs/source/installation_nvidia.md | 2 +- docs/source/quicktour.md | 4 ++-- docs/source/reference/api_reference.md | 2 +- update_doc.py | 4 +--- 13 files changed, 26 insertions(+), 28 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b85cc5163..0c28b2859 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4617,7 +4617,7 @@ dependencies = [ [[package]] name = "text-generation-backends-trtllm" -version = "3.1.1-dev0" +version = "3.1.2-dev0" dependencies = [ "async-trait", "clap 4.5.30", @@ -4638,7 +4638,7 @@ dependencies = [ [[package]] name = "text-generation-benchmark" -version = "3.1.1-dev0" +version = "3.1.2-dev0" dependencies = [ "average", "clap 4.5.30", @@ -4658,7 +4658,7 @@ dependencies = [ [[package]] name = "text-generation-client" -version = "3.1.1-dev0" +version = "3.1.2-dev0" dependencies = [ "async-trait", "base64 0.22.1", @@ -4676,7 +4676,7 @@ dependencies = [ [[package]] name = "text-generation-launcher" -version = "3.1.1-dev0" +version = "3.1.2-dev0" dependencies = [ "clap 4.5.30", "ctrlc", @@ -4697,7 +4697,7 @@ dependencies = [ [[package]] name = "text-generation-router" -version = "3.1.1-dev0" +version = "3.1.2-dev0" dependencies = [ "anyhow", "async-stream", @@ -4749,7 +4749,7 @@ dependencies = [ [[package]] name = "text-generation-router-llamacpp" -version = "3.1.1-dev0" +version = "3.1.2-dev0" dependencies = [ "async-trait", "bindgen 0.71.1", @@ -4766,7 +4766,7 @@ dependencies = [ [[package]] name = "text-generation-router-v2" -version = "3.1.1-dev0" +version = "3.1.2-dev0" dependencies = [ "async-stream", "async-trait", @@ -4815,7 +4815,7 @@ dependencies = [ [[package]] name = "text-generation-router-v3" -version = "3.1.1-dev0" +version = "3.1.2-dev0" dependencies = [ "async-stream", "async-trait", diff --git a/Cargo.toml b/Cargo.toml index 72075940d..4e3ad0104 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,7 +21,7 @@ default-members = [ resolver = "2" [workspace.package] -version = "3.1.1-dev0" +version = "3.1.2-dev0" edition = "2021" authors = ["Olivier Dehaene"] homepage = "https://github.com/huggingface/text-generation-inference" diff --git a/README.md b/README.md index b344892ef..a24ed5f14 100644 --- a/README.md +++ b/README.md @@ -84,7 +84,7 @@ model=HuggingFaceH4/zephyr-7b-beta volume=$PWD/data docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data \ - ghcr.io/huggingface/text-generation-inference:3.1.0 --model-id $model + ghcr.io/huggingface/text-generation-inference:3.1.1 --model-id $model ``` And then you can make requests like @@ -121,7 +121,7 @@ curl localhost:8080/v1/chat/completions \ **Note:** To use NVIDIA GPUs, you need to install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). We also recommend using NVIDIA drivers with CUDA version 12.2 or higher. For running the Docker container on a machine with no GPUs or CUDA support, it is enough to remove the `--gpus all` flag and add `--disable-custom-kernels`, please note CPU is not the intended platform for this project, so performance might be subpar. -**Note:** TGI supports AMD Instinct MI210 and MI250 GPUs. 
Details can be found in the [Supported Hardware documentation](https://huggingface.co/docs/text-generation-inference/installation_amd#using-tgi-with-amd-gpus). To use AMD GPUs, please use `docker run --device /dev/kfd --device /dev/dri --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.1.0-rocm --model-id $model` instead of the command above. +**Note:** TGI supports AMD Instinct MI210 and MI250 GPUs. Details can be found in the [Supported Hardware documentation](https://huggingface.co/docs/text-generation-inference/installation_amd#using-tgi-with-amd-gpus). To use AMD GPUs, please use `docker run --device /dev/kfd --device /dev/dri --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.1.1-rocm --model-id $model` instead of the command above. To see all options to serve your models (in the [code](https://github.com/huggingface/text-generation-inference/blob/main/launcher/src/main.rs) or in the cli): ``` @@ -152,7 +152,7 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading token= docker run --gpus all --shm-size 1g -e HF_TOKEN=$token -p 8080:80 -v $volume:/data \ - ghcr.io/huggingface/text-generation-inference:3.1.0 --model-id $model + ghcr.io/huggingface/text-generation-inference:3.1.1 --model-id $model ``` ### A note on Shared Memory (shm) diff --git a/docs/openapi.json b/docs/openapi.json index 9de76e470..e16ca7f97 100644 --- a/docs/openapi.json +++ b/docs/openapi.json @@ -10,7 +10,7 @@ "name": "Apache 2.0", "url": "https://www.apache.org/licenses/LICENSE-2.0" }, - "version": "3.1.1-dev0" + "version": "3.1.2-dev0" }, "paths": { "/": { diff --git a/docs/source/backends/neuron.md b/docs/source/backends/neuron.md index 8e80701f5..e7ba88739 100644 --- a/docs/source/backends/neuron.md +++ b/docs/source/backends/neuron.md @@ -31,7 +31,7 @@ deployment instructions in the model card: The service is launched simply by running the text-generation-inference container with two sets of parameters: ``` -docker run ghcr.io/huggingface/text-generation-inference:3.1.0-neuron +docker run ghcr.io/huggingface/text-generation-inference:3.1.1-neuron ``` - system parameters are used to map ports, volumes and devices between the host and the service, diff --git a/docs/source/basic_tutorials/gated_model_access.md b/docs/source/basic_tutorials/gated_model_access.md index 949aab56a..2cbd7e061 100644 --- a/docs/source/basic_tutorials/gated_model_access.md +++ b/docs/source/basic_tutorials/gated_model_access.md @@ -19,6 +19,6 @@ docker run --gpus all \ --shm-size 1g \ -e HF_TOKEN=$token \ -p 8080:80 \ - -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.1.0 \ + -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.1.1 \ --model-id $model ``` diff --git a/docs/source/conceptual/quantization.md b/docs/source/conceptual/quantization.md index 449cc79bc..77a64a88f 100644 --- a/docs/source/conceptual/quantization.md +++ b/docs/source/conceptual/quantization.md @@ -19,7 +19,7 @@ bitsandbytes is a library used to apply 8-bit and 4-bit quantization to models. 
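For intuition, the same 8-bit scheme can also be applied when loading a model directly with `transformers` and `bitsandbytes` outside of TGI. This is only an illustrative sketch; the model id is a placeholder and TGI performs the equivalent step internally when `--quantize bitsandbytes` is passed:

```python
# Rough sketch of 8-bit bitsandbytes loading outside of TGI (illustrative only).
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

model = AutoModelForCausalLM.from_pretrained(
    "HuggingFaceH4/zephyr-7b-beta",  # placeholder model id
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map="auto",
)
```

Compared to `float16`, 8-bit weights roughly halve the memory needed for the model parameters.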
In TGI, you can use 8-bit quantization by adding `--quantize bitsandbytes` like below 👇 ```bash -docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.1.0 --model-id $model --quantize bitsandbytes +docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.1.1 --model-id $model --quantize bitsandbytes ``` 4-bit quantization is also possible with bitsandbytes. You can choose one of the following 4-bit data types: 4-bit float (`fp4`), or 4-bit `NormalFloat` (`nf4`). These data types were introduced in the context of parameter-efficient fine-tuning, but you can apply them for inference by automatically converting the model weights on load. @@ -27,7 +27,7 @@ docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingf In TGI, you can use 4-bit quantization by adding `--quantize bitsandbytes-nf4` or `--quantize bitsandbytes-fp4` like below 👇 ```bash -docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.1.0 --model-id $model --quantize bitsandbytes-nf4 +docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.1.1 --model-id $model --quantize bitsandbytes-nf4 ``` You can get more information about 8-bit quantization by reading this [blog post](https://huggingface.co/blog/hf-bitsandbytes-integration), and 4-bit quantization by reading [this blog post](https://huggingface.co/blog/4bit-transformers-bitsandbytes). @@ -48,7 +48,7 @@ $$({\hat{W}_{l}}^{*} = argmin_{\hat{W_{l}}} ||W_{l}X-\hat{W}_{l}X||^{2}_{2})$$ TGI allows you to both run an already GPTQ quantized model (see available models [here](https://huggingface.co/models?search=gptq)) or quantize a model of your choice using quantization script. You can run a quantized model by simply passing --quantize like below 👇 ```bash -docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.1.0 --model-id $model --quantize gptq +docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.1.1 --model-id $model --quantize gptq ``` Note that TGI's GPTQ implementation doesn't use [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) under the hood. However, models quantized using AutoGPTQ or Optimum can still be served by TGI. 
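To make the layer-wise objective above concrete, here is a minimal sketch of the quantity GPTQ minimizes. It uses random weights and calibration activations with a naive round-to-nearest quantizer as a stand-in; the real GPTQ solver additionally updates the not-yet-quantized weights after each quantization step:

```python
# Minimal sketch of the GPTQ layer-wise objective, with a naive round-to-nearest
# quantizer standing in for the actual solver (illustrative only).
import torch


def quantize_rtn(w: torch.Tensor, bits: int = 4) -> torch.Tensor:
    # Per-row absmax scaling followed by symmetric round-to-nearest quantization.
    qmax = 2 ** (bits - 1) - 1
    scale = w.abs().amax(dim=1, keepdim=True) / qmax
    return torch.clamp((w / scale).round(), -qmax - 1, qmax) * scale


torch.manual_seed(0)
W = torch.randn(256, 512)   # weights of one linear layer (out_features x in_features)
X = torch.randn(512, 1024)  # calibration activations (in_features x samples)

W_hat = quantize_rtn(W)
# The quantity minimized per layer: ||W X - W_hat X||^2
error = torch.linalg.matrix_norm(W @ X - W_hat @ X) ** 2
print(f"layer-wise reconstruction error: {error.item():.2f}")
```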
diff --git a/docs/source/installation_amd.md b/docs/source/installation_amd.md index 100bc2a99..20ef26a8c 100644 --- a/docs/source/installation_amd.md +++ b/docs/source/installation_amd.md @@ -11,7 +11,7 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading docker run --rm -it --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \ --device=/dev/kfd --device=/dev/dri --group-add video \ --ipc=host --shm-size 256g --net host -v $volume:/data \ - ghcr.io/huggingface/text-generation-inference:3.1.0-rocm \ + ghcr.io/huggingface/text-generation-inference:3.1.1-rocm \ --model-id $model ``` diff --git a/docs/source/installation_intel.md b/docs/source/installation_intel.md index b2279bb48..a0bf11d12 100644 --- a/docs/source/installation_intel.md +++ b/docs/source/installation_intel.md @@ -12,7 +12,7 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading docker run --rm --privileged --cap-add=sys_nice \ --device=/dev/dri \ --ipc=host --shm-size 1g --net host -v $volume:/data \ - ghcr.io/huggingface/text-generation-inference:3.1.0-intel-xpu \ + ghcr.io/huggingface/text-generation-inference:3.1.1-intel-xpu \ --model-id $model --cuda-graphs 0 ``` @@ -29,7 +29,7 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading docker run --rm --privileged --cap-add=sys_nice \ --device=/dev/dri \ --ipc=host --shm-size 1g --net host -v $volume:/data \ - ghcr.io/huggingface/text-generation-inference:3.1.0-intel-cpu \ + ghcr.io/huggingface/text-generation-inference:3.1.1-intel-cpu \ --model-id $model --cuda-graphs 0 ``` diff --git a/docs/source/installation_nvidia.md b/docs/source/installation_nvidia.md index 8c4bdaee9..3b20c7e1a 100644 --- a/docs/source/installation_nvidia.md +++ b/docs/source/installation_nvidia.md @@ -11,7 +11,7 @@ model=teknium/OpenHermes-2.5-Mistral-7B volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run docker run --gpus all --shm-size 64g -p 8080:80 -v $volume:/data \ - ghcr.io/huggingface/text-generation-inference:3.1.0 \ + ghcr.io/huggingface/text-generation-inference:3.1.1 \ --model-id $model ``` diff --git a/docs/source/quicktour.md b/docs/source/quicktour.md index bd4956a0a..be9051023 100644 --- a/docs/source/quicktour.md +++ b/docs/source/quicktour.md @@ -11,7 +11,7 @@ model=teknium/OpenHermes-2.5-Mistral-7B volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data \ - ghcr.io/huggingface/text-generation-inference:3.1.0 \ + ghcr.io/huggingface/text-generation-inference:3.1.1 \ --model-id $model ``` @@ -96,7 +96,7 @@ curl 127.0.0.1:8080/generate \ To see all possible deploy flags and options, you can use the `--help` flag. It's possible to configure the number of shards, quantization, generation parameters, and more. 
```bash -docker run ghcr.io/huggingface/text-generation-inference:3.1.0 --help +docker run ghcr.io/huggingface/text-generation-inference:3.1.1 --help ``` diff --git a/docs/source/reference/api_reference.md b/docs/source/reference/api_reference.md index ee34d5870..bc4029e46 100644 --- a/docs/source/reference/api_reference.md +++ b/docs/source/reference/api_reference.md @@ -163,7 +163,7 @@ hub = { # create Hugging Face Model Class huggingface_model = HuggingFaceModel( - image_uri=get_huggingface_llm_image_uri("huggingface",version="3.1.0"), + image_uri=get_huggingface_llm_image_uri("huggingface",version="3.1.1"), env=hub, role=role, ) diff --git a/update_doc.py b/update_doc.py index 28430b1a3..e570b5ad7 100644 --- a/update_doc.py +++ b/update_doc.py @@ -142,14 +142,12 @@ def check_openapi(check: bool): with open(tmp_filename, "w") as f: json.dump(new_openapi_data, f, indent=2) + f.write("\n") if check: diff = subprocess.run( [ "diff", - # allow for trailing whitespace since it's not significant - # and the precommit hook will remove it - "--ignore-trailing-space", tmp_filename, filename, ], From 144d99c1475b7b543d3159267372308f9ffabf75 Mon Sep 17 00:00:00 2001 From: Sadra Barikbin Date: Tue, 4 Mar 2025 19:36:26 +0330 Subject: [PATCH 118/183] Fix a tiny typo in `monitoring.md` tutorial (#3056) Update monitoring.md --- docs/source/basic_tutorials/monitoring.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/basic_tutorials/monitoring.md b/docs/source/basic_tutorials/monitoring.md index 509b0aff1..aa60e7af9 100644 --- a/docs/source/basic_tutorials/monitoring.md +++ b/docs/source/basic_tutorials/monitoring.md @@ -38,7 +38,7 @@ In this guide, Prometheus monitoring data will be consumed on a local computer. * Use ssh [local port forwarding](https://www.ssh.com/academy/ssh/tunneling-example) * Use ngrok port tunneling -For simplicity, we will use [Ngrok](https://ngrok.com/docs/) in this guide to tunnel Prometheus port from the TGI server to the outside word. +For simplicity, we will use [Ngrok](https://ngrok.com/docs/) in this guide to tunnel Prometheus port from the TGI server to the outside world. For that, you should follow the steps at https://dashboard.ngrok.com/get-started/setup/linux, and once Ngrok is installed, use: ```bash From 491ed9e11d23debe366a45783dd805825210bfe6 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Tue, 4 Mar 2025 18:07:33 +0100 Subject: [PATCH 119/183] Patch rust release. (#3069) * Patch rust release. * Trying to remove the rust-toolchain hardcoded in action. * Upgrade rust toolchain. * Put back the toolchain ? * Fix neuron dockerfile. * Move to the proper version of Rust. * 1.85 since the GH action doesn't respect the override. * Typo. * Fixing the github action. * Fixing docker llamacpp. * Fixing the github action. * Update clippy. 
--- .github/workflows/tests.yaml | 9 ++------- Dockerfile | 2 +- Dockerfile_amd | 2 +- Dockerfile_gaudi | 2 +- Dockerfile_intel | 2 +- Dockerfile_llamacpp | 2 +- Dockerfile_trtllm | 2 +- router/src/lib.rs | 4 ++-- router/src/server.rs | 2 +- rust-toolchain.toml | 6 +++--- 10 files changed, 14 insertions(+), 19 deletions(-) diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index f9ccb8420..f8c17dc78 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -20,19 +20,14 @@ jobs: runs-on: group: aws-highmemory-32-plus-priv steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Set up Python uses: actions/setup-python@v4 id: python with: python-version: 3.11 - - name: Install Rust - uses: actions-rs/toolchain@v1 + - uses: dtolnay/rust-toolchain@1.85.0 with: - # Released on: 02 May, 2024 - # https://releases.rs/docs/1.78.0/ - toolchain: 1.84.0 - override: true components: rustfmt, clippy - name: Install Protoc uses: arduino/setup-protoc@v1 diff --git a/Dockerfile b/Dockerfile index 2d769c153..fb87968a1 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,5 @@ # Rust builder -FROM lukemathwalker/cargo-chef:latest-rust-1.84.0 AS chef +FROM lukemathwalker/cargo-chef:latest-rust-1.85.0 AS chef WORKDIR /usr/src ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse diff --git a/Dockerfile_amd b/Dockerfile_amd index f95f3b6d6..9d035c2df 100644 --- a/Dockerfile_amd +++ b/Dockerfile_amd @@ -1,5 +1,5 @@ # Rust builder -FROM lukemathwalker/cargo-chef:latest-rust-1.84.0 AS chef +FROM lukemathwalker/cargo-chef:latest-rust-1.85.0 AS chef WORKDIR /usr/src ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse diff --git a/Dockerfile_gaudi b/Dockerfile_gaudi index 91b172ed7..6d37c6ae5 100644 --- a/Dockerfile_gaudi +++ b/Dockerfile_gaudi @@ -3,7 +3,7 @@ ARG HABANA_VERSION ARG PYTORCH_VERSION # Rust builder -FROM lukemathwalker/cargo-chef:latest-rust-1.80 AS chef +FROM lukemathwalker/cargo-chef:latest-rust-1.85.0 AS chef WORKDIR /usr/src ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse diff --git a/Dockerfile_intel b/Dockerfile_intel index 134a6f661..bdff02908 100644 --- a/Dockerfile_intel +++ b/Dockerfile_intel @@ -1,6 +1,6 @@ ARG PLATFORM=xpu -FROM lukemathwalker/cargo-chef:latest-rust-1.84.0 AS chef +FROM lukemathwalker/cargo-chef:latest-rust-1.85.0 AS chef WORKDIR /usr/src ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse diff --git a/Dockerfile_llamacpp b/Dockerfile_llamacpp index 2eb62a1f6..1f55cf829 100644 --- a/Dockerfile_llamacpp +++ b/Dockerfile_llamacpp @@ -36,7 +36,7 @@ RUN tar -xzf ${llamacpp_version}.tar.gz \ WORKDIR /app COPY rust-toolchain.toml rust-toolchain.toml -RUN curl -sSf https://sh.rustup.rs | sh -s -- -y --no-modify-path --default-toolchain none +RUN curl -sSf https://sh.rustup.rs | sh -s -- --no-modify-path --default-toolchain 1.85.0 --profile minimal -y ENV PATH="/root/.cargo/bin:$PATH" RUN cargo install cargo-chef --locked diff --git a/Dockerfile_trtllm b/Dockerfile_trtllm index 14a74c00b..d099ec875 100644 --- a/Dockerfile_trtllm +++ b/Dockerfile_trtllm @@ -71,7 +71,7 @@ ARG actions_runtime_token # Install Rust ENV PATH="/root/.cargo/bin:$PATH" -RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | bash -s -- -y && \ +RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- --default-toolchain 1.85.0 --profile minimal -y && \ chmod -R a+w /root/.rustup && \ chmod -R a+w /root/.cargo && \ cargo install sccache --locked diff --git a/router/src/lib.rs b/router/src/lib.rs index e8c875a8f..637e6c56e 100644 --- 
a/router/src/lib.rs +++ b/router/src/lib.rs @@ -947,7 +947,7 @@ impl ChatRequest { let stop = stop.unwrap_or_default(); // enable greedy only when temperature is 0 let (do_sample, temperature) = match temperature { - Some(temperature) if temperature == 0.0 => (false, None), + Some(0.0) => (false, None), other => (true, other), }; @@ -1010,7 +1010,7 @@ impl ChatRequest { seed, top_n_tokens: top_logprobs, grammar, - adapter_id: model.filter(|m| *m != "tgi").map(String::from), + adapter_id: model.filter(|m| *m != "tgi"), }, }, using_tools, diff --git a/router/src/server.rs b/router/src/server.rs index c566cf982..da3daa3ee 100644 --- a/router/src/server.rs +++ b/router/src/server.rs @@ -723,7 +723,7 @@ pub(crate) async fn completions( let stop = stop.unwrap_or_default(); // enable greedy only when temperature is 0 let (do_sample, temperature) = match temperature { - Some(temperature) if temperature == 0.0 => (false, None), + Some(0.0) => (false, None), other => (true, other), }; diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 65f01fa52..8e51cc692 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,5 +1,5 @@ [toolchain] -# Released on: June 13, 2024 -# https://releases.rs/docs/1.79.0/ -channel = "1.84.1" +# Released on: 30 January, 2025 +# https://releases.rs/docs/1.84.1/ +channel = "1.85.0" components = ["rustfmt", "clippy"] From cb42b3ad832152650d334d86d0bcbdc9451d8bcb Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Wed, 5 Mar 2025 11:46:58 +0100 Subject: [PATCH 120/183] fix(neuron): explicitly install toolchain (#3072) * fix(neuron): explicitly install toolchain * ci(neuron): trigger CI when Dockerfile is modified --- .github/workflows/ci_build.yaml | 1 + Dockerfile.neuron | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci_build.yaml b/.github/workflows/ci_build.yaml index b5487191a..752c6dddc 100644 --- a/.github/workflows/ci_build.yaml +++ b/.github/workflows/ci_build.yaml @@ -20,6 +20,7 @@ on: - "Dockerfile" - "Dockerfile_amd" - "Dockerfile_intel" + - "Dockerfile.neuron" branches: - "main" workflow_dispatch: diff --git a/Dockerfile.neuron b/Dockerfile.neuron index 272407c76..5a22fab3d 100644 --- a/Dockerfile.neuron +++ b/Dockerfile.neuron @@ -18,8 +18,7 @@ RUN apt-get update -y \ && rm -rf /var/lib/apt/lists/* \ && apt-get clean -COPY rust-toolchain.toml rust-toolchain.toml -RUN curl -sSf https://sh.rustup.rs | sh -s -- -y --no-modify-path --default-toolchain none +RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- --default-toolchain 1.85.0 --profile minimal -y ENV PATH="/root/.cargo/bin:${PATH}" RUN cargo install cargo-chef --locked From ec35976f82e54cb87b68c7640d55d73a7b665a92 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Wed, 5 Mar 2025 11:59:52 +0100 Subject: [PATCH 121/183] Only add token when it is defined. (#3073) * Only add token when it is defined. 
* Update router/src/server.rs --- router/src/server.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/router/src/server.rs b/router/src/server.rs index da3daa3ee..6e55d2bcd 100644 --- a/router/src/server.rs +++ b/router/src/server.rs @@ -1711,9 +1711,10 @@ pub async fn run( // Shared API builder initialization let api_builder = || { - let mut builder = ApiBuilder::new() - .with_progress(false) - .with_token(authorization_token); + let mut builder = ApiBuilder::new().with_progress(false); + if let Some(token) = authorization_token { + builder = builder.with_token(Some(token)); + } if let Ok(cache_dir) = std::env::var("HUGGINGFACE_HUB_CACHE") { builder = builder.with_cache_dir(cache_dir.into()); From 31766dad77ba3a8d50ffccc77880993fe65f2f6b Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Wed, 5 Mar 2025 12:17:09 +0100 Subject: [PATCH 122/183] Force upgrade transformers version for olmo. --- server/pyproject.toml | 2 +- server/uv.lock | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/server/pyproject.toml b/server/pyproject.toml index 7fd690ccd..07ea1048f 100644 --- a/server/pyproject.toml +++ b/server/pyproject.toml @@ -31,7 +31,7 @@ dependencies = [ "sentencepiece>=0.2.0", "tokenizers>=0.20.3", "typer>=0.15.1", - "transformers>=4.48.0", + "transformers>=4.49.0", "huggingface-hub>=0.29.0", ] diff --git a/server/uv.lock b/server/uv.lock index fecccecf3..a2293a5c2 100644 --- a/server/uv.lock +++ b/server/uv.lock @@ -2648,7 +2648,7 @@ requires-dist = [ { name = "sentencepiece", specifier = ">=0.2.0" }, { name = "texttable", marker = "extra == 'quantize'", specifier = ">=1.6.7,<2" }, { name = "tokenizers", specifier = ">=0.20.3" }, - { name = "transformers", specifier = ">=4.48.0" }, + { name = "transformers", specifier = ">=4.49.0" }, { name = "typer", specifier = ">=0.15.1" }, ] @@ -2790,7 +2790,7 @@ wheels = [ [[package]] name = "transformers" -version = "4.48.3" +version = "4.49.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "filelock" }, @@ -2805,9 +2805,9 @@ dependencies = [ { name = "tokenizers" }, { name = "tqdm" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/e3/82/cebeb7af5e64440f1638f18c4ed0f89156d0eeaa6290d98da8ca93ac3872/transformers-4.48.3.tar.gz", hash = "sha256:a5e8f1e9a6430aa78215836be70cecd3f872d99eeda300f41ad6cc841724afdb", size = 8373458 } +sdist = { url = "https://files.pythonhosted.org/packages/79/50/46573150944f46df8ec968eda854023165a84470b42f69f67c7d475dabc5/transformers-4.49.0.tar.gz", hash = "sha256:7e40e640b5b8dc3f48743f5f5adbdce3660c82baafbd3afdfc04143cdbd2089e", size = 8610952 } wheels = [ - { url = "https://files.pythonhosted.org/packages/b6/1a/efeecb8d83705f2f4beac98d46f2148c95ecd7babfb31b5c0f1e7017e83d/transformers-4.48.3-py3-none-any.whl", hash = "sha256:78697f990f5ef350c23b46bf86d5081ce96b49479ab180b2de7687267de8fd36", size = 9669412 }, + { url = "https://files.pythonhosted.org/packages/20/37/1f29af63e9c30156a3ed6ebc2754077016577c094f31de7b2631e5d379eb/transformers-4.49.0-py3-none-any.whl", hash = "sha256:6b4fded1c5fee04d384b1014495b4235a2b53c87503d7d592423c06128cbbe03", size = 9970275 }, ] [[package]] From ab9dafc68f124946fad05a55daca9149d95cf721 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Wed, 5 Mar 2025 17:46:47 +0100 Subject: [PATCH 123/183] Making sure Olmo (transformers backend) works. 
(#3074) --- .../test_flash_llama_load.json | 294 ++++++++++++++++++ .../test_flash_llama_simple.json | 73 +++++ .../models/test_transformers_olmo.py | 37 +++ 3 files changed, 404 insertions(+) create mode 100644 integration-tests/models/__snapshots__/test_transformers_olmo/test_flash_llama_load.json create mode 100644 integration-tests/models/__snapshots__/test_transformers_olmo/test_flash_llama_simple.json create mode 100644 integration-tests/models/test_transformers_olmo.py diff --git a/integration-tests/models/__snapshots__/test_transformers_olmo/test_flash_llama_load.json b/integration-tests/models/__snapshots__/test_transformers_olmo/test_flash_llama_load.json new file mode 100644 index 000000000..ed15bc0be --- /dev/null +++ b/integration-tests/models/__snapshots__/test_transformers_olmo/test_flash_llama_load.json @@ -0,0 +1,294 @@ +[ + { + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [], + "seed": null, + "tokens": [ + { + "id": 27, + "logprob": -1.3457031, + "special": false, + "text": ":" + }, + { + "id": 187, + "logprob": -1.453125, + "special": false, + "text": "\n" + }, + { + "id": 187, + "logprob": -1.4111328, + "special": false, + "text": "\n" + }, + { + "id": 11202, + "logprob": -0.45898438, + "special": false, + "text": "```" + }, + { + "id": 8456, + "logprob": -0.41918945, + "special": false, + "text": "json" + }, + { + "id": 187, + "logprob": -0.003189087, + "special": false, + "text": "\n" + }, + { + "id": 92, + "logprob": -0.061187744, + "special": false, + "text": "{" + }, + { + "id": 187, + "logprob": -0.009010315, + "special": false, + "text": "\n" + }, + { + "id": 50276, + "logprob": -0.484375, + "special": false, + "text": " " + }, + { + "id": 3, + "logprob": -0.0002951622, + "special": false, + "text": "\"" + } + ], + "top_tokens": null + }, + "generated_text": ":\n\n```json\n{\n \"" + }, + { + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [], + "seed": null, + "tokens": [ + { + "id": 27, + "logprob": -1.3457031, + "special": false, + "text": ":" + }, + { + "id": 187, + "logprob": -1.453125, + "special": false, + "text": "\n" + }, + { + "id": 187, + "logprob": -1.4111328, + "special": false, + "text": "\n" + }, + { + "id": 11202, + "logprob": -0.45898438, + "special": false, + "text": "```" + }, + { + "id": 8456, + "logprob": -0.41918945, + "special": false, + "text": "json" + }, + { + "id": 187, + "logprob": -0.003189087, + "special": false, + "text": "\n" + }, + { + "id": 92, + "logprob": -0.061187744, + "special": false, + "text": "{" + }, + { + "id": 187, + "logprob": -0.009010315, + "special": false, + "text": "\n" + }, + { + "id": 50276, + "logprob": -0.484375, + "special": false, + "text": " " + }, + { + "id": 3, + "logprob": -0.0002951622, + "special": false, + "text": "\"" + } + ], + "top_tokens": null + }, + "generated_text": ":\n\n```json\n{\n \"" + }, + { + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [], + "seed": null, + "tokens": [ + { + "id": 27, + "logprob": -1.3457031, + "special": false, + "text": ":" + }, + { + "id": 187, + "logprob": -1.453125, + "special": false, + "text": "\n" + }, + { + "id": 187, + "logprob": -1.4111328, + "special": false, + "text": "\n" + }, + { + "id": 11202, + "logprob": -0.45898438, + "special": false, + "text": "```" + }, + { + "id": 8456, + "logprob": -0.41918945, + "special": false, + "text": "json" + }, + { + "id": 187, + 
"logprob": -0.003189087, + "special": false, + "text": "\n" + }, + { + "id": 92, + "logprob": -0.061187744, + "special": false, + "text": "{" + }, + { + "id": 187, + "logprob": -0.009010315, + "special": false, + "text": "\n" + }, + { + "id": 50276, + "logprob": -0.484375, + "special": false, + "text": " " + }, + { + "id": 3, + "logprob": -0.0002951622, + "special": false, + "text": "\"" + } + ], + "top_tokens": null + }, + "generated_text": ":\n\n```json\n{\n \"" + }, + { + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [], + "seed": null, + "tokens": [ + { + "id": 27, + "logprob": -1.3457031, + "special": false, + "text": ":" + }, + { + "id": 187, + "logprob": -1.453125, + "special": false, + "text": "\n" + }, + { + "id": 187, + "logprob": -1.4111328, + "special": false, + "text": "\n" + }, + { + "id": 11202, + "logprob": -0.45898438, + "special": false, + "text": "```" + }, + { + "id": 8456, + "logprob": -0.41918945, + "special": false, + "text": "json" + }, + { + "id": 187, + "logprob": -0.003189087, + "special": false, + "text": "\n" + }, + { + "id": 92, + "logprob": -0.061187744, + "special": false, + "text": "{" + }, + { + "id": 187, + "logprob": -0.009010315, + "special": false, + "text": "\n" + }, + { + "id": 50276, + "logprob": -0.484375, + "special": false, + "text": " " + }, + { + "id": 3, + "logprob": -0.0002951622, + "special": false, + "text": "\"" + } + ], + "top_tokens": null + }, + "generated_text": ":\n\n```json\n{\n \"" + } +] diff --git a/integration-tests/models/__snapshots__/test_transformers_olmo/test_flash_llama_simple.json b/integration-tests/models/__snapshots__/test_transformers_olmo/test_flash_llama_simple.json new file mode 100644 index 000000000..695244cd3 --- /dev/null +++ b/integration-tests/models/__snapshots__/test_transformers_olmo/test_flash_llama_simple.json @@ -0,0 +1,73 @@ +{ + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [], + "seed": null, + "tokens": [ + { + "id": 27, + "logprob": -1.3457031, + "special": false, + "text": ":" + }, + { + "id": 187, + "logprob": -1.4580078, + "special": false, + "text": "\n" + }, + { + "id": 187, + "logprob": -1.4472656, + "special": false, + "text": "\n" + }, + { + "id": 11202, + "logprob": -0.46044922, + "special": false, + "text": "```" + }, + { + "id": 8456, + "logprob": -0.4206543, + "special": false, + "text": "json" + }, + { + "id": 187, + "logprob": -0.0031471252, + "special": false, + "text": "\n" + }, + { + "id": 92, + "logprob": -0.061187744, + "special": false, + "text": "{" + }, + { + "id": 187, + "logprob": -0.009033203, + "special": false, + "text": "\n" + }, + { + "id": 50276, + "logprob": -0.48461914, + "special": false, + "text": " " + }, + { + "id": 3, + "logprob": -0.0002901554, + "special": false, + "text": "\"" + } + ], + "top_tokens": null + }, + "generated_text": ":\n\n```json\n{\n \"" +} diff --git a/integration-tests/models/test_transformers_olmo.py b/integration-tests/models/test_transformers_olmo.py new file mode 100644 index 000000000..53bccafa0 --- /dev/null +++ b/integration-tests/models/test_transformers_olmo.py @@ -0,0 +1,37 @@ +import pytest + + +@pytest.fixture(scope="module") +def flash_llama_handle(launcher): + with launcher("allenai/OLMo-7B-0724-Instruct-hf", num_shard=2) as handle: + yield handle + + +@pytest.fixture(scope="module") +async def flash_llama(flash_llama_handle): + await flash_llama_handle.health(300) + return flash_llama_handle.client + + 
+@pytest.mark.asyncio +@pytest.mark.private +async def test_flash_llama_simple(flash_llama, response_snapshot): + response = await flash_llama.generate( + "Test request", max_new_tokens=10, decoder_input_details=True + ) + + assert response.details.generated_tokens == 10 + assert response.generated_text == ':\n\n```json\n{\n "' + assert response == response_snapshot + + +@pytest.mark.asyncio +@pytest.mark.private +async def test_flash_llama_load(flash_llama, generate_load, response_snapshot): + responses = await generate_load(flash_llama, "Test request", max_new_tokens=10, n=4) + + assert len(responses) == 4 + assert responses[0].generated_text == ':\n\n```json\n{\n "' + assert all([r.generated_text == responses[0].generated_text for r in responses]) + + assert responses == response_snapshot From cdf70d6a28795278c2661107603f9054382c9ade Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Wed, 5 Mar 2025 20:50:43 +0100 Subject: [PATCH 124/183] Trying to reduce the logs in the case of errors. --- integration-tests/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integration-tests/conftest.py b/integration-tests/conftest.py index 0ffcd162b..588a2ce8c 100644 --- a/integration-tests/conftest.py +++ b/integration-tests/conftest.py @@ -98,7 +98,7 @@ def pytest_collection_modifyitems(config, items): selector(item) -@pytest.fixture(autouse=True) +@pytest.fixture(autouse=True, scope="module") def container_log(request: SubRequest): error_log = request.getfixturevalue("error_log") assert error_log is not None From 3208d1cd1dce5fb0e81fcf12dfb8842221f50c7e Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Wed, 5 Mar 2025 20:52:38 +0100 Subject: [PATCH 125/183] Revert "Trying to reduce the logs in the case of errors." This reverts commit cdf70d6a28795278c2661107603f9054382c9ade. --- integration-tests/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integration-tests/conftest.py b/integration-tests/conftest.py index 588a2ce8c..0ffcd162b 100644 --- a/integration-tests/conftest.py +++ b/integration-tests/conftest.py @@ -98,7 +98,7 @@ def pytest_collection_modifyitems(config, items): selector(item) -@pytest.fixture(autouse=True, scope="module") +@pytest.fixture(autouse=True) def container_log(request: SubRequest): error_log = request.getfixturevalue("error_log") assert error_log is not None From 8e92942a18f51b3670c5285baef1885526b64da0 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Wed, 5 Mar 2025 22:32:31 +0100 Subject: [PATCH 126/183] Making `tool_calls` a vector. (#3075) * Making `tool_calls` a vector. * Update doc. * Fixing the nix overlay with updated version. * Add openai dependency. * Updating the old tests. * Trying to reduce the logs in the case of errors. * Less spammy logs too. 
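A minimal sketch of what the new delta shape means for clients, mirroring the OpenAI-client test added below: `delta.tool_calls` is now a list of tool-call deltas rather than a single object (the base URL, model alias and tool schema here are illustrative placeholders):

```python
# Sketch: stream a chat completion with tools and concatenate the tool-call
# argument fragments, indexing into `delta.tool_calls` as a list.
from openai import OpenAI

tools = [
    {
        "type": "function",
        "function": {
            "name": "get_current_weather",
            "description": "Get the current weather for a location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {"type": "string"},
                    "format": {"type": "string", "enum": ["celsius", "fahrenheit"]},
                },
                "required": ["location", "format"],
            },
        },
    }
]

client = OpenAI(api_key="xx", base_url="http://127.0.0.1:8080/v1")
stream = client.chat.completions.create(
    model="tgi",
    stream=True,
    tools=tools,
    messages=[
        {"role": "user", "content": "What is the weather like in Brooklyn, New York?"}
    ],
)

arguments = ""
for chunk in stream:
    delta = chunk.choices[0].delta
    if delta.tool_calls:  # a list of deltas, not a single object
        arguments += delta.tool_calls[0].function.arguments or ""
print(arguments)
```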
--- backends/v3/src/queue.rs | 4 +- clients/python/text_generation/types.py | 2 +- docs/openapi.json | 5 +- integration-tests/conftest.py | 14 +- .../test_flash_llama_grammar_tools.json | 4 +- .../test_flash_llama_grammar_tools_auto.json | 4 +- ...test_flash_llama_grammar_tools_choice.json | 20 +- ...rammar_tools_insufficient_information.json | 10 +- ...tools_insufficient_information_stream.json | 4 +- ...test_flash_llama_grammar_tools_openai.json | 992 ++++++++++++++++++ ...ma_grammar_tools_sea_creatures_stream.json | 4 +- ..._sea_creatures_stream_function_object.json | 24 +- ...ammar_tools_sea_creatures_stream_none.json | 4 +- ...r_tools_sea_creatures_stream_required.json | 24 +- ...test_flash_llama_grammar_tools_stream.json | 24 +- .../test_flash_llama_tool_reply_response.json | 4 +- integration-tests/models/test_tools_llama.py | 45 +- integration-tests/pyproject.toml | 1 + integration-tests/requirements.txt | 38 +- nix/overlay.nix | 4 +- router/src/lib.rs | 6 +- 21 files changed, 1158 insertions(+), 79 deletions(-) create mode 100644 integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_openai.json diff --git a/backends/v3/src/queue.rs b/backends/v3/src/queue.rs index 249eebf76..d3bf4b9c0 100644 --- a/backends/v3/src/queue.rs +++ b/backends/v3/src/queue.rs @@ -311,7 +311,7 @@ impl State { + entry.request.stopping_parameters.max_new_tokens + self.speculate - 1; - tracing::debug!("Allocating {tokens} with {input_ids:?}"); + // tracing::debug!("Allocating {tokens} with {input_ids:?}"); let block_allocation = match block_allocator.allocate(tokens, input_ids).await { None => { @@ -322,7 +322,7 @@ impl State { break 'entry_loop; } Some(mut block_allocation) => { - tracing::debug!("Allocation: {block_allocation:?}"); + // tracing::debug!("Allocation: {block_allocation:?}"); max_blocks = max(max_blocks, block_allocation.blocks.len() as u32); if block_allocation.prefix_len == entry.request.input_length { diff --git a/clients/python/text_generation/types.py b/clients/python/text_generation/types.py index 1085075e4..6f51c153e 100644 --- a/clients/python/text_generation/types.py +++ b/clients/python/text_generation/types.py @@ -67,7 +67,7 @@ class ChoiceDeltaToolCall(BaseModel): class ChoiceDelta(BaseModel): role: str content: Optional[str] = None - tool_calls: Optional[ChoiceDeltaToolCall] = None + tool_calls: Optional[List[ChoiceDeltaToolCall]] = None class Choice(BaseModel): diff --git a/docs/openapi.json b/docs/openapi.json index e16ca7f97..e1ce234ed 100644 --- a/docs/openapi.json +++ b/docs/openapi.json @@ -2302,7 +2302,10 @@ "example": "assistant" }, "tool_calls": { - "$ref": "#/components/schemas/DeltaToolCall" + "type": "array", + "items": { + "$ref": "#/components/schemas/DeltaToolCall" + } } } }, diff --git a/integration-tests/conftest.py b/integration-tests/conftest.py index 0ffcd162b..01250ce2d 100644 --- a/integration-tests/conftest.py +++ b/integration-tests/conftest.py @@ -98,7 +98,7 @@ def pytest_collection_modifyitems(config, items): selector(item) -@pytest.fixture(autouse=True) +@pytest.fixture(autouse=True, scope="module") def container_log(request: SubRequest): error_log = request.getfixturevalue("error_log") assert error_log is not None @@ -269,7 +269,17 @@ class ResponseComparator(JSONSnapshotExtension): def eq_chat_complete_chunk( response: ChatCompletionChunk, other: ChatCompletionChunk ) -> bool: - return response.choices[0].delta.content == other.choices[0].delta.content + if response.choices[0].delta.content is not None: + return ( + 
response.choices[0].delta.content == other.choices[0].delta.content + ) + elif response.choices[0].delta.tool_calls is not None: + return ( + response.choices[0].delta.tool_calls + == other.choices[0].delta.tool_calls + ) + else: + raise RuntimeError(f"Invalid empty chat chunk {response} vs {other}") def eq_response(response: Response, other: Response) -> bool: return response.generated_text == other.generated_text and eq_details( diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools.json index 33e223ba4..7445099f2 100644 --- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools.json +++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools.json @@ -26,11 +26,11 @@ "usage": null } ], - "created": 1732293383, + "created": 1741195536, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion", - "system_fingerprint": "2.4.1-dev0-native", + "system_fingerprint": "3.1.2-dev0-native", "usage": { "completion_tokens": 30, "prompt_tokens": 615, diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_auto.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_auto.json index 92ffbbc15..99018f96e 100644 --- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_auto.json +++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_auto.json @@ -26,11 +26,11 @@ "usage": null } ], - "created": 1732293384, + "created": 1741195538, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion", - "system_fingerprint": "2.4.1-dev0-native", + "system_fingerprint": "3.1.2-dev0-native", "usage": { "completion_tokens": 30, "prompt_tokens": 615, diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_choice.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_choice.json index 603c90afa..a80a6a23d 100644 --- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_choice.json +++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_choice.json @@ -1,7 +1,7 @@ { "choices": [ { - "finish_reason": "eos_token", + "finish_reason": "stop", "index": 0, "logprobs": null, "message": { @@ -13,12 +13,12 @@ "function": { "arguments": { "format": "celsius", - "location": "New York, NY" + "location": "Brooklyn, New York" }, "description": null, "name": "get_current_weather" }, - "id": 0, + "id": "0", "type": "function" } ] @@ -26,14 +26,14 @@ "usage": null } ], - "created": 1712852394, + "created": 1741195540, "id": "", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "object": "text_completion", - "system_fingerprint": "2.0.1-native", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion", + "system_fingerprint": "3.1.2-dev0-native", "usage": { - "completion_tokens": 48, - "prompt_tokens": 320, - "total_tokens": 368 + "completion_tokens": 30, + "prompt_tokens": 326, + "total_tokens": 356 } } diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_insufficient_information.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_insufficient_information.json index 
3ed893fa9..9cfea7912 100644 --- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_insufficient_information.json +++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_insufficient_information.json @@ -13,14 +13,14 @@ "usage": null } ], - "created": 1728497062, + "created": 1741195542, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion", - "system_fingerprint": "2.4.2-dev0-native", + "system_fingerprint": "3.1.2-dev0-native", "usage": { - "completion_tokens": 23, - "prompt_tokens": 604, - "total_tokens": 627 + "completion_tokens": 22, + "prompt_tokens": 608, + "total_tokens": 630 } } diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_insufficient_information_stream.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_insufficient_information_stream.json index b134004a1..34615f8ed 100644 --- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_insufficient_information_stream.json +++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_insufficient_information_stream.json @@ -11,10 +11,10 @@ "logprobs": null } ], - "created": 1728497531, + "created": 1741195542, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", - "system_fingerprint": "2.4.2-dev0-native", + "system_fingerprint": "3.1.2-dev0-native", "usage": null } diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_openai.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_openai.json new file mode 100644 index 000000000..e6d789248 --- /dev/null +++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_openai.json @@ -0,0 +1,992 @@ +[ + { + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "{\"", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741195536, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "function", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741195536, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "\":", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741195536, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": "3.1.2-dev0-native", + 
"usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": " {\"", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741195536, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "_", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741195536, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "name", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741195536, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "\":", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741195536, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": " \"", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741195536, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "get", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741195536, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "_current", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + 
"index": 0, + "logprobs": null + } + ], + "created": 1741195536, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "_weather", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741195536, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "\",", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741195536, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": " \"", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741195536, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "location", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741195536, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "\":", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741195536, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": " \"", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741195536, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": 
null, + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "Bro", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741195536, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "oklyn", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741195536, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": ",", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741195536, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": " New", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741195536, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": " York", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741195536, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "\",", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741195536, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": " \"", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741195536, + 
"id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "format", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741195536, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "\":", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741195537, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": " \"", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741195537, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "c", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741195537, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "elsius", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741195537, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "\"}}", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741195537, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": "assistant", + 
"tool_calls": [ + { + "function": { + "arguments": "<|eot_id|>", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": "stop", + "index": 0, + "logprobs": null + } + ], + "created": 1741195537, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + } +] diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream.json index 1362b4721..116441908 100644 --- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream.json +++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream.json @@ -11,10 +11,10 @@ "logprobs": null } ], - "created": 1728497461, + "created": 1741195545, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", - "system_fingerprint": "2.4.2-dev0-native", + "system_fingerprint": "3.1.2-dev0-native", "usage": null } diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream_function_object.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream_function_object.json index bb8d61c8e..713e7a56c 100644 --- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream_function_object.json +++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream_function_object.json @@ -3,25 +3,27 @@ { "delta": { "role": "assistant", - "tool_calls": { - "function": { - "arguments": "<|eot_id|>", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } + "tool_calls": [ + { + "function": { + "arguments": "<|eot_id|>", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] }, "finish_reason": "stop", "index": 0, "logprobs": null } ], - "created": 1732293254, + "created": 1741195554, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", - "system_fingerprint": "2.4.1-dev0-native", + "system_fingerprint": "3.1.2-dev0-native", "usage": null } diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream_none.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream_none.json index 2ccab4a9d..bde28149b 100644 --- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream_none.json +++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream_none.json @@ -11,10 +11,10 @@ "logprobs": null } ], - "created": 1729262528, + "created": 1741195551, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", - "system_fingerprint": "2.3.2-dev0-native", + "system_fingerprint": "3.1.2-dev0-native", "usage": null } diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream_required.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream_required.json index dbced5b8e..7896607a7 100644 
--- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream_required.json +++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream_required.json @@ -4,25 +4,27 @@ "delta": { "content": null, "role": "assistant", - "tool_calls": { - "function": { - "arguments": "<|eot_id|>", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } + "tool_calls": [ + { + "function": { + "arguments": "<|eot_id|>", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] }, "finish_reason": "stop", "index": 0, "logprobs": null } ], - "created": 1732293246, + "created": 1741195548, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", - "system_fingerprint": "2.4.1-dev0-native", + "system_fingerprint": "3.1.2-dev0-native", "usage": null } diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_stream.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_stream.json index 27d2f9cae..92d27f614 100644 --- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_stream.json +++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_stream.json @@ -4,25 +4,27 @@ "delta": { "content": null, "role": "assistant", - "tool_calls": { - "function": { - "arguments": "<|eot_id|>", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } + "tool_calls": [ + { + "function": { + "arguments": "<|eot_id|>", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] }, "finish_reason": "stop", "index": 0, "logprobs": null } ], - "created": 1732293235, + "created": 1741195541, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", - "system_fingerprint": "2.4.1-dev0-native", + "system_fingerprint": "3.1.2-dev0-native", "usage": null } diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_tool_reply_response.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_tool_reply_response.json index 4f10aa3b9..33a3bb430 100644 --- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_tool_reply_response.json +++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_tool_reply_response.json @@ -13,11 +13,11 @@ "usage": null } ], - "created": 1739932427, + "created": 1741195556, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion", - "system_fingerprint": "3.1.1-dev0-native", + "system_fingerprint": "3.1.2-dev0-native", "usage": { "completion_tokens": 79, "prompt_tokens": 103, diff --git a/integration-tests/models/test_tools_llama.py b/integration-tests/models/test_tools_llama.py index b8a90cff9..7fd6cadd8 100644 --- a/integration-tests/models/test_tools_llama.py +++ b/integration-tests/models/test_tools_llama.py @@ -1,6 +1,7 @@ import pytest import requests import json +from openai import OpenAI @pytest.fixture(scope="module") @@ -108,6 +109,38 @@ async def test_flash_llama_grammar_tools(flash_llama_grammar_tools, response_sna assert response == response_snapshot +@pytest.mark.asyncio +@pytest.mark.private +async def test_flash_llama_grammar_tools_openai( + flash_llama_grammar_tools, response_snapshot +): + client = OpenAI(api_key="xx", base_url=f"{flash_llama_grammar_tools.base_url}/v1") + stream = 
client.chat.completions.create( + model="tgi", + max_tokens=100, + seed=1, + tools=tools, + stream=True, + temperature=0.0, + messages=[ + { + "role": "system", + "content": "Youre a helpful assistant! Answer the users question best you can.", + }, + { + "role": "user", + "content": "What is the weather like in Brooklyn, New York?", + }, + ], + ) + + chunks = [] + for chunk in stream: + chunks.append(chunk) + + assert chunks == response_snapshot + + @pytest.mark.asyncio @pytest.mark.private async def test_flash_llama_grammar_tools_auto( @@ -213,7 +246,9 @@ async def test_flash_llama_grammar_tools_stream( last_response = None async for response in responses: count += 1 - tool_calls_generated += response.choices[0].delta.tool_calls.function.arguments + tool_calls_generated += ( + response.choices[0].delta.tool_calls[0].function.arguments + ) last_response = response assert response.choices[0].delta.content is None @@ -360,7 +395,9 @@ async def test_flash_llama_grammar_tools_sea_creatures_stream_required( async for response in responses: count += 1 assert response.choices[0].delta.content is None - tool_calls_generated += response.choices[0].delta.tool_calls.function.arguments + tool_calls_generated += ( + response.choices[0].delta.tool_calls[0].function.arguments + ) last_response = response assert count == 29 @@ -458,8 +495,8 @@ async def test_flash_llama_grammar_tools_sea_creatures_stream_function_object( break response = json.loads(line) tool_calls_generated += response["choices"][0]["delta"]["tool_calls"][ - "function" - ]["arguments"] + 0 + ]["function"]["arguments"] last_response = response assert count == 39 diff --git a/integration-tests/pyproject.toml b/integration-tests/pyproject.toml index 1838995ed..37003440f 100644 --- a/integration-tests/pyproject.toml +++ b/integration-tests/pyproject.toml @@ -13,6 +13,7 @@ dependencies = [ "pytest-asyncio>=0.23.1", "docker>=7", "numpy>=2.0", + "openai>=1.65", ] [tool.isort] diff --git a/integration-tests/requirements.txt b/integration-tests/requirements.txt index c5a918259..d419d4b36 100644 --- a/integration-tests/requirements.txt +++ b/integration-tests/requirements.txt @@ -1,5 +1,5 @@ # This file was autogenerated by uv via the following command: -# uv pip compile pyproject.toml --output-file requirements.txt +# uv pip compile pyproject.toml -o requirements.txt aiohappyeyeballs==2.4.6 # via aiohttp aiohttp==3.11.12 @@ -8,12 +8,21 @@ aiosignal==1.3.2 # via aiohttp annotated-types==0.7.0 # via pydantic +anyio==4.8.0 + # via + # httpx + # openai attrs==25.1.0 # via aiohttp certifi==2025.1.31 - # via requests + # via + # httpcore + # httpx + # requests charset-normalizer==3.4.1 # via requests +distro==1.9.0 + # via openai docker==7.1.0 # via text-generation-integration-tests (pyproject.toml) filelock==3.17.0 @@ -24,20 +33,32 @@ frozenlist==1.5.0 # aiosignal fsspec==2025.2.0 # via huggingface-hub +h11==0.14.0 + # via httpcore +httpcore==1.0.7 + # via httpx +httpx==0.28.1 + # via openai huggingface-hub==0.29.0 # via text-generation idna==3.10 # via + # anyio + # httpx # requests # yarl iniconfig==2.0.0 # via pytest +jiter==0.8.2 + # via openai multidict==6.1.0 # via # aiohttp # yarl numpy==2.2.3 # via text-generation-integration-tests (pyproject.toml) +openai==1.65.3 + # via text-generation-integration-tests (pyproject.toml) packaging==24.2 # via # huggingface-hub @@ -51,6 +72,7 @@ propcache==0.2.1 pydantic==2.10.6 # via # text-generation-integration-tests (pyproject.toml) + # openai # text-generation pydantic-core==2.27.2 # via pydantic @@ 
-67,15 +89,23 @@ requests==2.32.3 # via # docker # huggingface-hub +sniffio==1.3.1 + # via + # anyio + # openai syrupy==4.8.1 # via text-generation-integration-tests (pyproject.toml) text-generation==0.7.0 # via text-generation-integration-tests (pyproject.toml) tqdm==4.67.1 - # via huggingface-hub -typing-extensions==4.12.2 # via # huggingface-hub + # openai +typing-extensions==4.12.2 + # via + # anyio + # huggingface-hub + # openai # pydantic # pydantic-core urllib3==2.3.0 diff --git a/nix/overlay.nix b/nix/overlay.nix index d90478192..63398f075 100644 --- a/nix/overlay.nix +++ b/nix/overlay.nix @@ -18,8 +18,8 @@ final: prev: { src = final.fetchFromGitHub { owner = "huggingface"; repo = "transformers"; - rev = "8d73a38606bc342b370afe1f42718b4828d95aaa"; - hash = "sha256-MxroG6CWqrcmRS+eFt7Ej87TDOInN15aRPBUcaycKTI="; + rev = "v4.49.0"; + hash = "sha256-drq7RWoRaRejiQjCUHIYuzaKa9rA4eQZI2do74scp1c="; }; } ); diff --git a/router/src/lib.rs b/router/src/lib.rs index 637e6c56e..60f1f73a4 100644 --- a/router/src/lib.rs +++ b/router/src/lib.rs @@ -730,7 +730,7 @@ pub(crate) struct ChatCompletionChoice { pub struct ToolCallDelta { #[schema(example = "assistant")] role: String, - tool_calls: DeltaToolCall, + tool_calls: Vec, } #[derive(Clone, Debug, Serialize, ToSchema)] @@ -774,7 +774,7 @@ impl ChatCompletionChunk { }), (None, Some(tool_calls)) => ChatCompletionDelta::Tool(ToolCallDelta { role: "assistant".to_string(), - tool_calls: DeltaToolCall { + tool_calls: vec![DeltaToolCall { index: 0, id: String::new(), r#type: "function".to_string(), @@ -782,7 +782,7 @@ impl ChatCompletionChunk { name: None, arguments: tool_calls[0].to_string(), }, - }, + }], }), (None, None) => ChatCompletionDelta::Chat(TextMessage { role: "assistant".to_string(), From 036d802b627080c9a7db6fea5ae8af8a32492f08 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Fri, 7 Mar 2025 13:04:21 +0100 Subject: [PATCH 127/183] Nix: add `openai` to impure shell for integration tests (#3081) --- nix/impure-shell.nix | 1 + 1 file changed, 1 insertion(+) diff --git a/nix/impure-shell.nix b/nix/impure-shell.nix index 9b8d7e35e..57a8bf71f 100644 --- a/nix/impure-shell.nix +++ b/nix/impure-shell.nix @@ -60,6 +60,7 @@ mkShell { pip ipdb click + openai pytest pytest-asyncio syrupy From 55a6618434f8472c120ef5e528230ebb76ed8526 Mon Sep 17 00:00:00 2001 From: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com> Date: Fri, 7 Mar 2025 14:24:26 +0100 Subject: [PATCH 128/183] Update `--max-batch-total-tokens` description (#3083) * Update `--max-batch-total-tokens` description * Update docstring in `launcher/src/main.rs` instead --- docs/source/reference/launcher.md | 2 +- launcher/src/main.rs | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/reference/launcher.md b/docs/source/reference/launcher.md index 159b22e73..2d9b58d9b 100644 --- a/docs/source/reference/launcher.md +++ b/docs/source/reference/launcher.md @@ -198,7 +198,7 @@ Options: For `max_batch_total_tokens=1000`, you could fit `10` queries of `total_tokens=100` or a single query of `1000` tokens. - Overall this number should be the largest possible amount that fits the remaining memory (after the model is loaded). Since the actual memory overhead depends on other parameters like if you're using quantization, flash attention or the model implementation, text-generation-inference cannot infer this number automatically. 
+ Overall this number should be the largest possible amount that fits the remaining memory (after the model is loaded). Since the actual memory overhead depends on other parameters like if you're using quantization, flash attention or the model implementation, text-generation-inference infers this number automatically if not provided ensuring that the value is as large as possible. [env: MAX_BATCH_TOTAL_TOKENS=] diff --git a/launcher/src/main.rs b/launcher/src/main.rs index cd4b22314..321d7c69b 100644 --- a/launcher/src/main.rs +++ b/launcher/src/main.rs @@ -702,8 +702,8 @@ struct Args { /// Overall this number should be the largest possible amount that fits the /// remaining memory (after the model is loaded). Since the actual memory overhead /// depends on other parameters like if you're using quantization, flash attention - /// or the model implementation, text-generation-inference cannot infer this number - /// automatically. + /// or the model implementation, text-generation-inference infers this number automatically + /// if not provided ensuring that the value is as large as possible. #[clap(long, env)] max_batch_total_tokens: Option, From 622908deab55206dd99041aed9a4525eb538a698 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Fri, 7 Mar 2025 19:45:57 +0100 Subject: [PATCH 129/183] Fix tool call2 (#3076) * Making `tool_calls` a vector. * Arguments output is a string. * Update all the integration tests. * Add the requirements. * Upgrade other tests. * Clippy. * Update the old test. --- integration-tests/conftest.py | 82 +- ...t_flash_llama_completion_stream_usage.json | 60 +- ...sh_llama_grammar_tools_auto_nostream.json} | 17 +- ..._llama_grammar_tools_choice_nostream.json} | 17 +- ...ash_llama_grammar_tools_choice_stream.json | 842 +++++++ ...ls_insufficient_information_nostream.json} | 14 +- ...tools_insufficient_information_stream.json | 122 +- ...t_flash_llama_grammar_tools_nostream.json} | 17 +- ...test_flash_llama_grammar_tools_openai.json | 97 +- ...ma_grammar_tools_sea_creatures_stream.json | 20 - ...ammar_tools_sea_creatures_stream_auto.json | 1542 +++++++++++++ ..._sea_creatures_stream_function_object.json | 1261 +++++++++- ...ammar_tools_sea_creatures_stream_none.json | 2022 ++++++++++++++++- ...r_tools_sea_creatures_stream_required.json | 31 +- ...test_flash_llama_grammar_tools_stream.json | 30 - .../test_flash_llama_tool_reply_response.json | 14 +- .../models/test_completion_prompts.py | 112 +- integration-tests/models/test_tools_llama.py | 279 ++- integration-tests/pyproject.toml | 1 + integration-tests/requirements.txt | 4 +- router/src/infer/chat_template.rs | 4 +- router/src/lib.rs | 11 +- 22 files changed, 6054 insertions(+), 545 deletions(-) rename integration-tests/models/__snapshots__/test_tools_llama/{test_flash_llama_grammar_tools.json => test_flash_llama_grammar_tools_auto_nostream.json} (66%) rename integration-tests/models/__snapshots__/test_tools_llama/{test_flash_llama_grammar_tools_auto.json => test_flash_llama_grammar_tools_choice_nostream.json} (66%) create mode 100644 integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_choice_stream.json rename integration-tests/models/__snapshots__/test_tools_llama/{test_flash_llama_grammar_tools_insufficient_information.json => test_flash_llama_grammar_tools_insufficient_information_nostream.json} (62%) rename integration-tests/models/__snapshots__/test_tools_llama/{test_flash_llama_grammar_tools_choice.json => test_flash_llama_grammar_tools_nostream.json} (66%) delete mode 
100644 integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream.json create mode 100644 integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream_auto.json delete mode 100644 integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_stream.json diff --git a/integration-tests/conftest.py b/integration-tests/conftest.py index 01250ce2d..6490f8330 100644 --- a/integration-tests/conftest.py +++ b/integration-tests/conftest.py @@ -1,6 +1,13 @@ pytest_plugins = ["fixtures.neuron.service", "fixtures.neuron.export_models"] # ruff: noqa: E402 from _pytest.fixtures import SubRequest +from huggingface_hub.inference._generated.types.chat_completion import ( + ChatCompletionStreamOutput, + ChatCompletionOutput, +) +from openai.types.chat.chat_completion_chunk import ( + ChatCompletionChunk as OAIChatCompletionChunk, +) import requests @@ -115,6 +122,31 @@ class ResponseComparator(JSONSnapshotExtension): rtol = 0.2 ignore_logprob = False + def _serialize( + self, + data, + ): + if ( + isinstance(data, Response) + or isinstance(data, ChatComplete) + or isinstance(data, ChatCompletionChunk) + or isinstance(data, ChatCompletionComplete) + or isinstance(data, Completion) + or isinstance(data, OAIChatCompletionChunk) + ): + data = data.model_dump() + elif isinstance(data, ChatCompletionStreamOutput) or isinstance( + data, ChatCompletionOutput + ): + data = dict(data) + elif isinstance(data, List): + data = [self._serialize(d) for d in data] + elif isinstance(data, dict): + return data + else: + raise RuntimeError(f"Unexpected data {type(data)} : {data}") + return data + def serialize( self, data, @@ -123,17 +155,7 @@ class ResponseComparator(JSONSnapshotExtension): exclude=None, matcher=None, ): - if ( - isinstance(data, Response) - or isinstance(data, ChatComplete) - or isinstance(data, ChatCompletionChunk) - or isinstance(data, ChatCompletionComplete) - ): - data = data.model_dump() - - if isinstance(data, List): - data = [d.model_dump() for d in data] - + data = self._serialize(data) data = self._filter( data=data, depth=0, @@ -142,7 +164,8 @@ class ResponseComparator(JSONSnapshotExtension): include=include, matcher=matcher, ) - return json.dumps(data, indent=2, ensure_ascii=False, sort_keys=False) + "\n" + data = json.dumps(data, indent=2, ensure_ascii=False, sort_keys=False) + "\n" + return data def matches( self, @@ -158,7 +181,7 @@ class ResponseComparator(JSONSnapshotExtension): if isinstance(data, Dict): if "choices" in data: data["choices"] = list( - sorted(data["choices"], key=lambda x: x["index"]) + sorted(data["choices"], key=lambda x: int(x["index"])) ) choices = data["choices"] if isinstance(choices, List) and len(choices) >= 1: @@ -171,7 +194,7 @@ class ResponseComparator(JSONSnapshotExtension): return Response(**data) if isinstance(data, List): return [_convert_data(d) for d in data] - raise NotImplementedError + raise NotImplementedError(f"Data: {data}") def eq_token(token: Token, other: Token) -> bool: return ( @@ -269,17 +292,25 @@ class ResponseComparator(JSONSnapshotExtension): def eq_chat_complete_chunk( response: ChatCompletionChunk, other: ChatCompletionChunk ) -> bool: - if response.choices[0].delta.content is not None: - return ( - response.choices[0].delta.content == other.choices[0].delta.content - ) - elif response.choices[0].delta.tool_calls is not None: - return ( - response.choices[0].delta.tool_calls - == other.choices[0].delta.tool_calls - ) + if 
response.choices: + if response.choices[0].delta.content is not None: + return ( + response.choices[0].delta.content + == other.choices[0].delta.content + ) + elif response.choices[0].delta.tool_calls is not None: + return ( + response.choices[0].delta.tool_calls + == other.choices[0].delta.tool_calls + ) + else: + raise RuntimeError( + f"Invalid empty chat chunk {response} vs {other}" + ) + elif response.usage is not None: + return response.usage == other.usage else: - raise RuntimeError(f"Invalid empty chat chunk {response} vs {other}") + raise RuntimeError(f"Invalid empty chat {response} vs {other}") def eq_response(response: Response, other: Response) -> bool: return response.generated_text == other.generated_text and eq_details( @@ -294,6 +325,9 @@ class ResponseComparator(JSONSnapshotExtension): if not isinstance(snapshot_data, List): snapshot_data = [snapshot_data] + if len(serialized_data) == 0: + return len(snapshot_data) == len(serialized_data) + if isinstance(serialized_data[0], Completion): return len(snapshot_data) == len(serialized_data) and all( [eq_completion(r, o) for r, o in zip(serialized_data, snapshot_data)] diff --git a/integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_stream_usage.json b/integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_stream_usage.json index 8c7be4cb1..fbb3669fd 100644 --- a/integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_stream_usage.json +++ b/integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_stream_usage.json @@ -12,11 +12,11 @@ "logprobs": null } ], - "created": 1726656043, + "created": 1741338471, "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", - "system_fingerprint": "2.2.1-dev0-native", + "system_fingerprint": "3.1.2-dev0-native", "usage": null }, { @@ -32,11 +32,11 @@ "logprobs": null } ], - "created": 1726656043, + "created": 1741338471, "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", - "system_fingerprint": "2.2.1-dev0-native", + "system_fingerprint": "3.1.2-dev0-native", "usage": null }, { @@ -52,11 +52,11 @@ "logprobs": null } ], - "created": 1726656043, + "created": 1741338471, "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", - "system_fingerprint": "2.2.1-dev0-native", + "system_fingerprint": "3.1.2-dev0-native", "usage": null }, { @@ -72,11 +72,11 @@ "logprobs": null } ], - "created": 1726656043, + "created": 1741338471, "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", - "system_fingerprint": "2.2.1-dev0-native", + "system_fingerprint": "3.1.2-dev0-native", "usage": null }, { @@ -92,11 +92,11 @@ "logprobs": null } ], - "created": 1726656043, + "created": 1741338472, "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", - "system_fingerprint": "2.2.1-dev0-native", + "system_fingerprint": "3.1.2-dev0-native", "usage": null }, { @@ -112,11 +112,11 @@ "logprobs": null } ], - "created": 1726656043, + "created": 1741338472, "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": 
"meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", - "system_fingerprint": "2.2.1-dev0-native", + "system_fingerprint": "3.1.2-dev0-native", "usage": null }, { @@ -132,11 +132,11 @@ "logprobs": null } ], - "created": 1726656044, + "created": 1741338472, "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", - "system_fingerprint": "2.2.1-dev0-native", + "system_fingerprint": "3.1.2-dev0-native", "usage": null }, { @@ -152,11 +152,11 @@ "logprobs": null } ], - "created": 1726656044, + "created": 1741338472, "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", - "system_fingerprint": "2.2.1-dev0-native", + "system_fingerprint": "3.1.2-dev0-native", "usage": null }, { @@ -172,11 +172,11 @@ "logprobs": null } ], - "created": 1726656044, + "created": 1741338472, "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", - "system_fingerprint": "2.2.1-dev0-native", + "system_fingerprint": "3.1.2-dev0-native", "usage": null }, { @@ -192,11 +192,11 @@ "logprobs": null } ], - "created": 1726656044, + "created": 1741338472, "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", - "system_fingerprint": "2.2.1-dev0-native", + "system_fingerprint": "3.1.2-dev0-native", "usage": { "completion_tokens": 10, "prompt_tokens": 40, diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_auto_nostream.json similarity index 66% rename from integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools.json rename to integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_auto_nostream.json index 7445099f2..06cf038a8 100644 --- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools.json +++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_auto_nostream.json @@ -6,15 +6,11 @@ "logprobs": null, "message": { "content": null, - "name": null, "role": "assistant", "tool_calls": [ { "function": { - "arguments": { - "format": "celsius", - "location": "Brooklyn, New York" - }, + "arguments": "{\"format\":\"fahrenheit\",\"location\":\"Brooklyn, NY\"}", "description": null, "name": "get_current_weather" }, @@ -22,18 +18,17 @@ "type": "function" } ] - }, - "usage": null + } } ], - "created": 1741195536, + "created": 1741263682, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion", "system_fingerprint": "3.1.2-dev0-native", "usage": { - "completion_tokens": 30, - "prompt_tokens": 615, - "total_tokens": 645 + "completion_tokens": 29, + "prompt_tokens": 501, + "total_tokens": 530 } } diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_auto.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_choice_nostream.json similarity index 66% rename from integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_auto.json rename to integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_choice_nostream.json index 99018f96e..0152ea70d 
100644 --- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_auto.json +++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_choice_nostream.json @@ -6,15 +6,11 @@ "logprobs": null, "message": { "content": null, - "name": null, "role": "assistant", "tool_calls": [ { "function": { - "arguments": { - "format": "celsius", - "location": "Brooklyn, New York" - }, + "arguments": "{\"format\":\"fahrenheit\",\"location\":\"Brooklyn, NY\"}", "description": null, "name": "get_current_weather" }, @@ -22,18 +18,17 @@ "type": "function" } ] - }, - "usage": null + } } ], - "created": 1741195538, + "created": 1741263684, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion", "system_fingerprint": "3.1.2-dev0-native", "usage": { - "completion_tokens": 30, - "prompt_tokens": 615, - "total_tokens": 645 + "completion_tokens": 29, + "prompt_tokens": 286, + "total_tokens": 315 } } diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_choice_stream.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_choice_stream.json new file mode 100644 index 000000000..8dab9a5be --- /dev/null +++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_choice_stream.json @@ -0,0 +1,842 @@ +[ + { + "choices": [ + { + "delta": { + "content": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "{\"", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263685, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "function", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263685, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "\":", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263685, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": " {\"", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263685, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "_", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + 
"logprobs": null + } + ], + "created": 1741263685, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "name", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263685, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "\":", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263685, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": " \"", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263685, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "get", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263685, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "_current", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263685, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "_weather", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263685, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "\",", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263685, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + 
"delta": { + "content": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": " \"", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263685, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "location", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263685, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "\":", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263685, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": " \"", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263685, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "Paris", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263685, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": ",", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263685, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": " France", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263685, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "\",", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + 
"logprobs": null + } + ], + "created": 1741263685, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": " \"", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263685, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "format", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263685, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "\":", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263685, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": " \"", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263685, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "c", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263685, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "elsius", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263685, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "\"}}", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263685, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + 
"delta": { + "content": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "<|eot_id|>", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": "stop", + "index": 0, + "logprobs": null + } + ], + "created": 1741263685, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + } +] diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_insufficient_information.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_insufficient_information_nostream.json similarity index 62% rename from integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_insufficient_information.json rename to integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_insufficient_information_nostream.json index 9cfea7912..797c95788 100644 --- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_insufficient_information.json +++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_insufficient_information_nostream.json @@ -5,22 +5,20 @@ "index": 0, "logprobs": null, "message": { - "content": "I am an AI assistant", - "name": null, + "content": "I am a helpful assistant!", "role": "assistant", "tool_calls": null - }, - "usage": null + } } ], - "created": 1741195542, + "created": 1741263686, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion", "system_fingerprint": "3.1.2-dev0-native", "usage": { - "completion_tokens": 22, - "prompt_tokens": 608, - "total_tokens": 630 + "completion_tokens": 23, + "prompt_tokens": 494, + "total_tokens": 517 } } diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_insufficient_information_stream.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_insufficient_information_stream.json index 34615f8ed..b1d4fb87b 100644 --- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_insufficient_information_stream.json +++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_insufficient_information_stream.json @@ -1,20 +1,102 @@ -{ - "choices": [ - { - "delta": { - "content": " assistant", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741195542, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null -} +[ + { + "choices": [ + { + "delta": { + "content": "I", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263687, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " am", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263687, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + 
"content": " a", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263687, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " helpful", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263687, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " assistant", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263687, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + } +] diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_choice.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_nostream.json similarity index 66% rename from integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_choice.json rename to integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_nostream.json index a80a6a23d..3b22d83e8 100644 --- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_choice.json +++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_nostream.json @@ -6,15 +6,11 @@ "logprobs": null, "message": { "content": null, - "name": null, "role": "assistant", "tool_calls": [ { "function": { - "arguments": { - "format": "celsius", - "location": "Brooklyn, New York" - }, + "arguments": "{\"format\":\"fahrenheit\",\"location\":\"Brooklyn, NY\"}", "description": null, "name": "get_current_weather" }, @@ -22,18 +18,17 @@ "type": "function" } ] - }, - "usage": null + } } ], - "created": 1741195540, + "created": 1741263680, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion", "system_fingerprint": "3.1.2-dev0-native", "usage": { - "completion_tokens": 30, - "prompt_tokens": 326, - "total_tokens": 356 + "completion_tokens": 29, + "prompt_tokens": 501, + "total_tokens": 530 } } diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_openai.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_openai.json index e6d789248..c8fc50a27 100644 --- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_openai.json +++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_openai.json @@ -24,7 +24,7 @@ "logprobs": null } ], - "created": 1741195536, + "created": 1741263681, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -57,7 +57,7 @@ "logprobs": null } ], - "created": 1741195536, + "created": 1741263681, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -90,7 +90,7 @@ "logprobs": null } ], - "created": 1741195536, + "created": 1741263681, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -123,7 +123,7 @@ "logprobs": null } ], - 
"created": 1741195536, + "created": 1741263681, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -156,7 +156,7 @@ "logprobs": null } ], - "created": 1741195536, + "created": 1741263681, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -189,7 +189,7 @@ "logprobs": null } ], - "created": 1741195536, + "created": 1741263681, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -222,7 +222,7 @@ "logprobs": null } ], - "created": 1741195536, + "created": 1741263681, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -255,7 +255,7 @@ "logprobs": null } ], - "created": 1741195536, + "created": 1741263681, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -288,7 +288,7 @@ "logprobs": null } ], - "created": 1741195536, + "created": 1741263681, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -321,7 +321,7 @@ "logprobs": null } ], - "created": 1741195536, + "created": 1741263681, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -354,7 +354,7 @@ "logprobs": null } ], - "created": 1741195536, + "created": 1741263681, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -387,7 +387,7 @@ "logprobs": null } ], - "created": 1741195536, + "created": 1741263681, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -420,7 +420,7 @@ "logprobs": null } ], - "created": 1741195536, + "created": 1741263681, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -453,7 +453,7 @@ "logprobs": null } ], - "created": 1741195536, + "created": 1741263681, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -486,7 +486,7 @@ "logprobs": null } ], - "created": 1741195536, + "created": 1741263681, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -519,7 +519,7 @@ "logprobs": null } ], - "created": 1741195536, + "created": 1741263681, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -552,7 +552,7 @@ "logprobs": null } ], - "created": 1741195536, + "created": 1741263681, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -585,7 +585,7 @@ "logprobs": null } ], - "created": 1741195536, + "created": 1741263681, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -618,7 +618,7 @@ "logprobs": null } ], - "created": 1741195536, + "created": 1741263681, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -637,7 +637,7 @@ "tool_calls": [ { "function": { - "arguments": " New", + "arguments": " NY", "name": null }, "id": "", @@ -651,40 +651,7 @@ "logprobs": null } ], - "created": 1741195536, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "service_tier": null, - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "function_call": null, - "refusal": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": " York", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - 
"logprobs": null - } - ], - "created": 1741195536, + "created": 1741263681, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -717,7 +684,7 @@ "logprobs": null } ], - "created": 1741195536, + "created": 1741263681, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -750,7 +717,7 @@ "logprobs": null } ], - "created": 1741195536, + "created": 1741263681, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -783,7 +750,7 @@ "logprobs": null } ], - "created": 1741195536, + "created": 1741263681, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -816,7 +783,7 @@ "logprobs": null } ], - "created": 1741195537, + "created": 1741263681, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -849,7 +816,7 @@ "logprobs": null } ], - "created": 1741195537, + "created": 1741263681, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -868,7 +835,7 @@ "tool_calls": [ { "function": { - "arguments": "c", + "arguments": "f", "name": null }, "id": "", @@ -882,7 +849,7 @@ "logprobs": null } ], - "created": 1741195537, + "created": 1741263681, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -901,7 +868,7 @@ "tool_calls": [ { "function": { - "arguments": "elsius", + "arguments": "ahrenheit", "name": null }, "id": "", @@ -915,7 +882,7 @@ "logprobs": null } ], - "created": 1741195537, + "created": 1741263681, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -948,7 +915,7 @@ "logprobs": null } ], - "created": 1741195537, + "created": 1741263681, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -981,7 +948,7 @@ "logprobs": null } ], - "created": 1741195537, + "created": 1741263681, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream.json deleted file mode 100644 index 116441908..000000000 --- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream.json +++ /dev/null @@ -1,20 +0,0 @@ -{ - "choices": [ - { - "delta": { - "content": " fans", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741195545, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null -} diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream_auto.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream_auto.json new file mode 100644 index 000000000..4b0f5a07e --- /dev/null +++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream_auto.json @@ -0,0 +1,1542 @@ +[ + { + "choices": [ + { + "delta": { + "content": "There", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263688, + "id": "", + "model": 
"meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " was", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263688, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " a", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263688, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " wise", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263688, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " old", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263688, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " oct", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263688, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": "opus", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263688, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " named", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263688, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " Oracle", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263688, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": ".", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263688, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " He", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + 
"created": 1741263688, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " lived", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263688, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " in", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263688, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " a", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263688, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " cozy", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263688, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " little", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263688, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " cave", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263688, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " beneath", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263688, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " the", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263688, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " waves", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263688, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " with", + "role": "assistant", + "tool_calls": null + }, + 
"finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263688, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " his", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263688, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " best", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263688, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " friend", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263688, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": ",", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263688, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " a", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263689, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " curious", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263689, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " se", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263689, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": "ah", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263689, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": "orse", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263689, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " named", + 
"role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263689, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " Fin", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263689, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": "ley", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263689, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": ".", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263689, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " One", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263689, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " day", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263689, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": ",", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263689, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " Fin", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263689, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": "ley", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263689, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " met", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263689, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + 
{ + "delta": { + "content": " a", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263689, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " playful", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263689, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " dolphin", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263689, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " named", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263689, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " Daisy", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263689, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": ",", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263689, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " and", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263689, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " the", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263689, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " three", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263689, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " became", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263689, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": 
"3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " inse", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263689, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": "parable", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263689, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": ".", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263689, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " They", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263689, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " spent", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263690, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " their", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263690, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " days", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263690, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " exploring", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263690, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " the", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263690, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " ocean", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263690, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", 
+ "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": ",", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263690, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " playing", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263690, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " hide", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263690, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": "-and", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263690, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": "-se", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263690, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": "ek", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263690, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": ",", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263690, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " and", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263690, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " learning", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263690, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " about", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263690, + "id": 
"", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " the", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263690, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " wonders", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263690, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " of", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263690, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " the", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263690, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " sea", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263690, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " from", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263690, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " Oracle", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263690, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + } +] diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream_function_object.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream_function_object.json index 713e7a56c..b253d465b 100644 --- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream_function_object.json +++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream_function_object.json @@ -1,29 +1,1232 @@ -{ - "choices": [ - { - "delta": { - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "<|eot_id|>", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": "stop", - "index": 0, - "logprobs": null - } - ], 
- "created": 1741195554, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null -} +[ + { + "choices": [ + { + "delta": { + "content": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "{\"", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263698, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "function", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263698, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "\":", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263698, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": " {\"", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263698, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "_", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263698, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "n", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263698, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "am", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263698, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + 
"role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "e", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263698, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "\":", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263698, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": " \"", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263698, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "get", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263698, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "_n", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263698, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "_day", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263698, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "_weather", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263698, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "_fore", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 
1741263698, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "cast", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263698, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "\",", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263698, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": " \"", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263698, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "location", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263698, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "\":", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263698, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": " \"", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263698, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "San", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263699, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "role": 
"assistant", + "tool_calls": [ + { + "function": { + "arguments": " Francisco", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263699, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": ",", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263699, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": " CA", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263699, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "\",", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263699, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": " \"", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263699, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "format", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263699, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "\":", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263699, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": " \"", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 
1741263699, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "c", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263699, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "elsius", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263699, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "\",", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263699, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": " \"", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263699, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "num", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263699, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "_days", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263699, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "\":", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263699, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "role": 
"assistant", + "tool_calls": [ + { + "function": { + "arguments": " ", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263699, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "3", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263699, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "}}", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263699, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "<|eot_id|>", + "name": null + }, + "id": "", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": "stop", + "index": 0, + "logprobs": null + } + ], + "created": 1741263699, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + } +] diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream_none.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream_none.json index bde28149b..9a4a6b0c9 100644 --- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream_none.json +++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream_none.json @@ -1,20 +1,2002 @@ -{ - "choices": [ - { - "delta": { - "content": " deep", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": "length", - "index": 0, - "logprobs": null - } - ], - "created": 1741195551, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null -} +[ + { + "choices": [ + { + "delta": { + "content": "Once", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263693, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " upon", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263693, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " a", + "role": "assistant", + 
"tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263693, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " time", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263693, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": ",", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263694, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " in", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263694, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " a", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263694, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " vibrant", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263694, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " ocean", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263694, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " filled", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263694, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " with", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263694, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " coral", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263694, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + 
"delta": { + "content": " reefs", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263694, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " and", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263694, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " schools", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263694, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " of", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263694, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " shimmer", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263694, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": "ing", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263694, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " fish", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263694, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": ",", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263694, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " lived", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263694, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " three", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263694, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": 
"3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " dear", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263694, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " friends", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263694, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": ":", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263694, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " Luna", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263694, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " the", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263694, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " sea", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263694, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " turtle", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263694, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": ",", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263694, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " Fin", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263694, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": "ley", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263694, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": 
"chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " the", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263694, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " friendly", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263694, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " fish", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263694, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": ",", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263695, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " and", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263695, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " Cr", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263695, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": "usty", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263695, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " the", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263695, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " wise", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263695, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " crab", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263695, + "id": "", + 
"model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": ".\n\n", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263695, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": "L", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263695, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": "una", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263695, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " was", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263695, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " the", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263695, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " oldest", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263695, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " of", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263695, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " the", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263695, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " three", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263695, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": ".", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } 
+ ], + "created": 1741263695, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " She", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263695, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " had", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263695, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " traveled", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263695, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " the", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263695, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " world", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263695, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": ",", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263695, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " exploring", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263695, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " hidden", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263695, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " caves", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263695, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " and", + "role": "assistant", + "tool_calls": null + }, + 
"finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263695, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " ship", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263695, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": "w", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263695, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": "re", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263695, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": "cks", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263696, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": ",", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263696, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " and", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263696, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " collecting", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263696, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " sparkling", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263696, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " shells", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263696, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " 
and", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263696, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " shiny", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263696, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " pe", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263696, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": "bb", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263696, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": "les", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263696, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": ".", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263696, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " Her", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263696, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " shell", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263696, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " was", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263696, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " a", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263696, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + 
"choices": [ + { + "delta": { + "content": " beautiful", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263696, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " mosaic", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263696, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " of", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263696, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " blues", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263696, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " and", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263696, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " greens", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263696, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": ",", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263696, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " and", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263696, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " her", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263696, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " gentle", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263696, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + 
"system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " eyes", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263696, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " twink", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263696, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": "led", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263696, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " with", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263696, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " the", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263696, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " secrets", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263697, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " of", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263697, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " the", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741263697, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " deep", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": "length", + "index": 0, + "logprobs": null + } + ], + "created": 1741263697, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + } +] diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream_required.json 
b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream_required.json index 7896607a7..fe51488c7 100644 --- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream_required.json +++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream_required.json @@ -1,30 +1 @@ -{ - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "<|eot_id|>", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": "stop", - "index": 0, - "logprobs": null - } - ], - "created": 1741195548, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null -} +[] diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_stream.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_stream.json deleted file mode 100644 index 92d27f614..000000000 --- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_stream.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "<|eot_id|>", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": "stop", - "index": 0, - "logprobs": null - } - ], - "created": 1741195541, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null -} diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_tool_reply_response.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_tool_reply_response.json index 33a3bb430..45161f35a 100644 --- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_tool_reply_response.json +++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_tool_reply_response.json @@ -5,22 +5,20 @@ "index": 0, "logprobs": null, "message": { - "content": "I can't access real-time data, but I can provide you with current conditions and forecast for Paris, France:\n\nThe current conditions in Paris are mostly cloudy with a temperature of 6.7°C (44.1°F). \n\nPlease note that the actual weather may differ from this information, and I recommend checking the forecast on a reliable weather website for the most up-to-date information.", - "name": null, + "content": "I can't access real-time data, but I can provide you with current conditions and forecast for Paris, France:\n\nThe current conditions in Paris are mostly cloudy with a temperature of 6.7°C (44.1°F). \n\nPlease note that the actual weather may differ from the provided information. 
For up-to-date information, I suggest checking a reliable weather website or app for the latest conditions and forecast.", "role": "assistant", "tool_calls": null - }, - "usage": null + } } ], - "created": 1741195556, + "created": 1741263702, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion", "system_fingerprint": "3.1.2-dev0-native", "usage": { - "completion_tokens": 79, - "prompt_tokens": 103, - "total_tokens": 182 + "completion_tokens": 83, + "prompt_tokens": 109, + "total_tokens": 192 } } diff --git a/integration-tests/models/test_completion_prompts.py b/integration-tests/models/test_completion_prompts.py index 6c359f1e9..27988ef99 100644 --- a/integration-tests/models/test_completion_prompts.py +++ b/integration-tests/models/test_completion_prompts.py @@ -2,8 +2,9 @@ import pytest import requests import json from aiohttp import ClientSession +from huggingface_hub import InferenceClient -from text_generation.types import Completion, ChatCompletionChunk +from text_generation.types import Completion @pytest.fixture(scope="module") @@ -52,52 +53,35 @@ def test_flash_llama_completion_single_prompt( async def test_flash_llama_completion_stream_usage( flash_llama_completion, response_snapshot ): - url = f"{flash_llama_completion.base_url}/v1/chat/completions" - request = { - "model": "tgi", - "messages": [ + client = InferenceClient(base_url=f"{flash_llama_completion.base_url}/v1") + stream = client.chat_completion( + model="tgi", + messages=[ { "role": "user", "content": "What is Deep Learning?", } ], - "max_tokens": 10, - "temperature": 0.0, - "stream_options": {"include_usage": True}, - "stream": True, - } + max_tokens=10, + temperature=0.0, + stream_options={"include_usage": True}, + stream=True, + ) string = "" chunks = [] had_usage = False - async with ClientSession(headers=flash_llama_completion.headers) as session: - async with session.post(url, json=request) as response: - # iterate over the stream - async for chunk in response.content.iter_any(): - # remove "data:" - chunk = chunk.decode().split("\n\n") - # remove "data:" if present - chunk = [c.replace("data:", "") for c in chunk] - # remove empty strings - chunk = [c for c in chunk if c] - # remove completion marking chunk - chunk = [c for c in chunk if c != " [DONE]"] - # parse json - chunk = [json.loads(c) for c in chunk] + for chunk in stream: + # remove "data:" + chunks.append(chunk) + print(f"Chunk {chunk}") + if len(chunk.choices) == 1: + index = chunk.choices[0].index + assert index == 0 + string += chunk.choices[0].delta.content + if chunk.usage: + assert not had_usage + had_usage = True - for c in chunk: - chunks.append(ChatCompletionChunk(**c)) - assert "choices" in c - if len(c["choices"]) == 1: - index = c["choices"][0]["index"] - assert index == 0 - string += c["choices"][0]["delta"]["content"] - - has_usage = c["usage"] is not None - assert not had_usage - if has_usage: - had_usage = True - else: - raise RuntimeError("Expected different payload") assert had_usage assert ( string @@ -105,51 +89,29 @@ async def test_flash_llama_completion_stream_usage( ) assert chunks == response_snapshot - request = { - "model": "tgi", - "messages": [ + stream = client.chat_completion( + model="tgi", + messages=[ { "role": "user", "content": "What is Deep Learning?", } ], - "max_tokens": 10, - "temperature": 0.0, - "stream": True, - } + max_tokens=10, + temperature=0.0, + # No usage + # stream_options={"include_usage": True}, + stream=True, + ) string = "" chunks = [] had_usage = False - async with 
ClientSession(headers=flash_llama_completion.headers) as session: - async with session.post(url, json=request) as response: - # iterate over the stream - async for chunk in response.content.iter_any(): - # remove "data:" - chunk = chunk.decode().split("\n\n") - # remove "data:" if present - chunk = [c.replace("data:", "") for c in chunk] - # remove empty strings - chunk = [c for c in chunk if c] - # remove completion marking chunk - chunk = [c for c in chunk if c != " [DONE]"] - # parse json - chunk = [json.loads(c) for c in chunk] - - for c in chunk: - chunks.append(ChatCompletionChunk(**c)) - assert "choices" in c - if len(c["choices"]) == 1: - index = c["choices"][0]["index"] - assert index == 0 - string += c["choices"][0]["delta"]["content"] - - has_usage = c["usage"] is not None - assert not had_usage - if has_usage: - had_usage = True - else: - raise RuntimeError("Expected different payload") - assert not had_usage + for chunk in stream: + chunks.append(chunk) + assert chunk.usage is None + assert len(chunk.choices) == 1 + assert chunk.choices[0].index == 0 + string += chunk.choices[0].delta.content assert ( string == "**Deep Learning: An Overview**\n=====================================\n\n" diff --git a/integration-tests/models/test_tools_llama.py b/integration-tests/models/test_tools_llama.py index 7fd6cadd8..ebf69cb7c 100644 --- a/integration-tests/models/test_tools_llama.py +++ b/integration-tests/models/test_tools_llama.py @@ -1,7 +1,10 @@ import pytest -import requests -import json from openai import OpenAI +from huggingface_hub import InferenceClient +from huggingface_hub.inference._generated.types.chat_completion import ( + ChatCompletionOutputToolCall, + ChatCompletionOutputFunctionDefinition, +) @pytest.fixture(scope="module") @@ -77,8 +80,11 @@ tools = [ @pytest.mark.asyncio @pytest.mark.private -async def test_flash_llama_grammar_tools(flash_llama_grammar_tools, response_snapshot): - response = await flash_llama_grammar_tools.chat( +async def test_flash_llama_grammar_tools_nostream( + flash_llama_grammar_tools, response_snapshot +): + client = InferenceClient(base_url=f"{flash_llama_grammar_tools.base_url}/v1") + response = client.chat_completion( max_tokens=100, seed=1, tools=tools, @@ -96,15 +102,15 @@ async def test_flash_llama_grammar_tools(flash_llama_grammar_tools, response_sna ) assert response.choices[0].message.content is None assert response.choices[0].message.tool_calls == [ - { - "id": "0", - "type": "function", - "function": { - "description": None, - "name": "get_current_weather", - "arguments": {"format": "celsius", "location": "Brooklyn, New York"}, - }, - } + ChatCompletionOutputToolCall( + id="0", + type="function", + function=ChatCompletionOutputFunctionDefinition( + description=None, + name="get_current_weather", + arguments='{"format":"fahrenheit","location":"Brooklyn, NY"}', + ), + ) ] assert response == response_snapshot @@ -135,18 +141,25 @@ async def test_flash_llama_grammar_tools_openai( ) chunks = [] + tool = "" for chunk in stream: + tool += chunk.choices[0].delta.tool_calls[0].function.arguments chunks.append(chunk) + assert ( + tool + == '{"function": {"_name": "get_current_weather", "location": "Brooklyn, NY", "format": "fahrenheit"}}<|eot_id|>' + ) assert chunks == response_snapshot @pytest.mark.asyncio @pytest.mark.private -async def test_flash_llama_grammar_tools_auto( +async def test_flash_llama_grammar_tools_auto_nostream( flash_llama_grammar_tools, response_snapshot ): - response = await flash_llama_grammar_tools.chat( + client = 
InferenceClient(base_url=f"{flash_llama_grammar_tools.base_url}/v1") + response = client.chat_completion( max_tokens=100, seed=1, tools=tools, @@ -165,15 +178,15 @@ async def test_flash_llama_grammar_tools_auto( ) assert response.choices[0].message.content is None assert response.choices[0].message.tool_calls == [ - { - "id": "0", - "type": "function", - "function": { - "description": None, - "name": "get_current_weather", - "arguments": {"format": "celsius", "location": "Brooklyn, New York"}, - }, - } + ChatCompletionOutputToolCall( + id="0", + type="function", + function=ChatCompletionOutputFunctionDefinition( + description=None, + name="get_current_weather", + arguments='{"format":"fahrenheit","location":"Brooklyn, NY"}', + ), + ) ] assert response == response_snapshot @@ -181,10 +194,11 @@ async def test_flash_llama_grammar_tools_auto( @pytest.mark.asyncio @pytest.mark.private -async def test_flash_llama_grammar_tools_choice( +async def test_flash_llama_grammar_tools_choice_nostream( flash_llama_grammar_tools, response_snapshot ): - response = await flash_llama_grammar_tools.chat( + client = InferenceClient(base_url=f"{flash_llama_grammar_tools.base_url}/v1") + response = client.chat_completion( max_tokens=100, seed=1, tools=tools, @@ -203,15 +217,15 @@ async def test_flash_llama_grammar_tools_choice( ) assert response.choices[0].message.content is None assert response.choices[0].message.tool_calls == [ - { - "id": "0", - "type": "function", - "function": { - "description": None, - "name": "get_current_weather", - "arguments": {"format": "celsius", "location": "Brooklyn, New York"}, - }, - } + ChatCompletionOutputToolCall( + id="0", + type="function", + function=ChatCompletionOutputFunctionDefinition( + description=None, + name="get_current_weather", + arguments='{"format":"fahrenheit","location":"Brooklyn, NY"}', + ), + ) ] assert response == response_snapshot @@ -219,10 +233,11 @@ async def test_flash_llama_grammar_tools_choice( @pytest.mark.asyncio @pytest.mark.private -async def test_flash_llama_grammar_tools_stream( +async def test_flash_llama_grammar_tools_choice_stream( flash_llama_grammar_tools, response_snapshot ): - responses = await flash_llama_grammar_tools.chat( + client = InferenceClient(base_url=f"{flash_llama_grammar_tools.base_url}/v1") + stream = client.chat_completion( max_tokens=100, seed=1, tools=tools, @@ -241,31 +256,27 @@ async def test_flash_llama_grammar_tools_stream( stream=True, ) - count = 0 tool_calls_generated = "" - last_response = None - async for response in responses: - count += 1 - tool_calls_generated += ( - response.choices[0].delta.tool_calls[0].function.arguments - ) - last_response = response - assert response.choices[0].delta.content is None + chunks = [] + for chunk in stream: + tool_calls_generated += chunk.choices[0].delta.tool_calls[0].function.arguments + assert chunk.choices[0].delta.content is None + chunks.append(chunk) assert ( tool_calls_generated == '{"function": {"_name": "get_current_weather", "location": "Paris, France", "format": "celsius"}}<|eot_id|>' ) - assert count == 28 - assert last_response == response_snapshot + assert chunks == response_snapshot @pytest.mark.asyncio @pytest.mark.private -async def test_flash_llama_grammar_tools_insufficient_information( +async def test_flash_llama_grammar_tools_insufficient_information_nostream( flash_llama_grammar_tools, response_snapshot ): - responses = await flash_llama_grammar_tools.chat( + client = InferenceClient(base_url=f"{flash_llama_grammar_tools.base_url}/v1") + response = 
client.chat_completion( max_tokens=100, seed=24, tools=tools, @@ -283,10 +294,13 @@ async def test_flash_llama_grammar_tools_insufficient_information( stream=False, ) - assert responses.choices[0].message.tool_calls is None - assert responses.choices[0].message.content == "I am an AI assistant" + content_generated = response.choices[0].message.content + assert response.choices[0].message.tool_calls is None - assert responses == response_snapshot + ######## FIXME before MERGE ############################ + # TODO This is different from the streaming case, this is NOT normal. + assert content_generated == "I am a helpful assistant!" + assert response == response_snapshot @pytest.mark.asyncio @@ -294,7 +308,8 @@ async def test_flash_llama_grammar_tools_insufficient_information( async def test_flash_llama_grammar_tools_insufficient_information_stream( flash_llama_grammar_tools, response_snapshot ): - responses = await flash_llama_grammar_tools.chat( + client = InferenceClient(base_url=f"{flash_llama_grammar_tools.base_url}/v1") + stream = client.chat_completion( max_tokens=100, seed=24, tools=tools, @@ -312,26 +327,24 @@ async def test_flash_llama_grammar_tools_insufficient_information_stream( stream=True, ) - count = 0 content_generated = "" - last_response = None - async for response in responses: - count += 1 - content_generated += response.choices[0].delta.content - last_response = response - assert response.choices[0].delta.tool_calls is None + chunks = [] + for chunk in stream: + content_generated += chunk.choices[0].delta.content + chunks.append(chunk) + assert chunk.choices[0].delta.tool_calls is None - assert count == 5 - assert content_generated == "I am an AI assistant" - assert last_response == response_snapshot + assert content_generated == "I am a helpful assistant" + assert chunks == response_snapshot @pytest.mark.asyncio @pytest.mark.private -async def test_flash_llama_grammar_tools_sea_creatures_stream( +async def test_flash_llama_grammar_tools_sea_creatures_stream_auto( flash_llama_grammar_tools, response_snapshot ): - responses = await flash_llama_grammar_tools.chat( + client = InferenceClient(base_url=f"{flash_llama_grammar_tools.base_url}/v1") + stream = client.chat_completion( max_tokens=100, seed=24, tools=tools, @@ -349,21 +362,18 @@ async def test_flash_llama_grammar_tools_sea_creatures_stream( stream=True, ) - count = 0 content_generated = "" - last_response = None - async for response in responses: - count += 1 - content_generated += response.choices[0].delta.content - last_response = response - assert response.choices[0].delta.tool_calls is None + chunks = [] + for chunk in stream: + content_generated += chunk.choices[0].delta.content + chunks.append(chunk) + assert chunk.choices[0].delta.tool_calls is None - assert count == 62 assert ( content_generated - == "Once upon a time, in the ocean, there lived three sea creatures. There was a wise old octopus named Bob, a mischievous seagull named Sam, and a gentle sea turtle named Luna. They all lived together in a beautiful coral reef, surrounded by colorful fish and swaying sea fans" + == "There was a wise old octopus named Oracle. He lived in a cozy little cave beneath the waves with his best friend, a curious seahorse named Finley. One day, Finley met a playful dolphin named Daisy, and the three became inseparable. 
They spent their days exploring the ocean, playing hide-and-seek, and learning about the wonders of the sea from Oracle" ) - assert last_response == response_snapshot + assert chunks == response_snapshot @pytest.mark.asyncio @@ -371,7 +381,8 @@ async def test_flash_llama_grammar_tools_sea_creatures_stream( async def test_flash_llama_grammar_tools_sea_creatures_stream_required( flash_llama_grammar_tools, response_snapshot ): - responses = await flash_llama_grammar_tools.chat( + client = InferenceClient(base_url=f"{flash_llama_grammar_tools.base_url}/v1") + stream = client.chat_completion( max_tokens=100, seed=24, tools=tools, @@ -389,23 +400,17 @@ async def test_flash_llama_grammar_tools_sea_creatures_stream_required( stream=True, ) - count = 0 tool_calls_generated = "" - last_response = None - async for response in responses: - count += 1 - assert response.choices[0].delta.content is None - tool_calls_generated += ( - response.choices[0].delta.tool_calls[0].function.arguments - ) - last_response = response + chunks = [] + for chunk in stream: + assert chunk.choices[0].delta.content is None + tool_calls_generated += chunk.choices[0].delta.tool_calls[0].function.arguments - assert count == 29 assert ( tool_calls_generated - == '{"function": {"_name": "get_current_weather", "location": "San Francisco, CA", "format": "celsius"}}<|eot_id|>' + == '{"function": {"_name": "get_n_day_weather_forecast", "location": "San Francisco, CA", "format": "fahrenheit", "num_days":3}}<|eot_id|>' ) - assert last_response == response_snapshot + assert chunks == response_snapshot @pytest.mark.asyncio @@ -413,7 +418,8 @@ async def test_flash_llama_grammar_tools_sea_creatures_stream_required( async def test_flash_llama_grammar_tools_sea_creatures_stream_none( flash_llama_grammar_tools, response_snapshot ): - responses = await flash_llama_grammar_tools.chat( + client = InferenceClient(base_url=f"{flash_llama_grammar_tools.base_url}/v1") + stream = client.chat_completion( max_tokens=100, seed=24, tools=tools, @@ -431,22 +437,18 @@ async def test_flash_llama_grammar_tools_sea_creatures_stream_none( stream=True, ) - count = 0 content_generated = "" - last_response = None - async for response in responses: - count += 1 - content_generated += response.choices[0].delta.content - last_response = response - assert response.choices[0].delta.tool_calls is None + chunks = [] + for chunk in stream: + chunks.append(chunk) + content_generated += chunk.choices[0].delta.content + assert chunk.choices[0].delta.tool_calls is None - assert count == 100 - print(content_generated) assert ( content_generated == "Once upon a time, in a vibrant ocean filled with coral reefs and schools of shimmering fish, lived three dear friends: Luna the sea turtle, Finley the friendly fish, and Crusty the wise crab.\n\nLuna was the oldest of the three. She had traveled the world, exploring hidden caves and shipwrecks, and collecting sparkling shells and shiny pebbles. 
Her shell was a beautiful mosaic of blues and greens, and her gentle eyes twinkled with the secrets of the deep" ) - assert last_response == response_snapshot + assert chunks == response_snapshot @pytest.mark.asyncio @@ -454,57 +456,37 @@ async def test_flash_llama_grammar_tools_sea_creatures_stream_none( async def test_flash_llama_grammar_tools_sea_creatures_stream_function_object( flash_llama_grammar_tools, response_snapshot ): - # using `requests` to send the request until the client library supports tool_choice as a function object - responses = requests.post( - f"{flash_llama_grammar_tools.base_url}/v1/chat/completions", - headers=flash_llama_grammar_tools.headers, - json={ - "model": "tgi", - "messages": [ - { - "role": "system", - "content": "You're a helpful assistant! Answer the users question best you can. If the question is not answerable by the tools, just generate a response.", - }, - { - "role": "user", - "content": "Tell me a story about 3 sea creatures", - }, - ], - "tools": tools, - "tool_choice": { - "type": "function", - "function": {"name": "get_n_day_weather_forecast"}, + client = InferenceClient(base_url=f"{flash_llama_grammar_tools.base_url}/v1") + stream = client.chat_completion( + messages=[ + { + "role": "system", + "content": "You're a helpful assistant! Answer the users question best you can. If the question is not answerable by the tools, just generate a response.", }, - "seed": 24, - "max_tokens": 100, - "stream": True, + { + "role": "user", + "content": "Tell me a story about 3 sea creatures", + }, + ], + tools=tools, + tool_choice={ + "type": "function", + "function": {"name": "get_n_day_weather_forecast"}, }, + max_tokens=100, + seed=24, stream=True, ) - # iterate over the response in chunks - count = 0 + chunks = [] tool_calls_generated = "" - last_response = None - for chunk in responses.iter_content(chunk_size=1024): - if chunk: - count += 1 - # remove the "data: " prefix, trailing newline, and split the chunk into individual lines - lines = chunk.decode("utf-8").replace("data: ", "").rstrip("\n").split("\n") - for line in lines: - if line == "[DONE]": - break - response = json.loads(line) - tool_calls_generated += response["choices"][0]["delta"]["tool_calls"][ - 0 - ]["function"]["arguments"] - last_response = response - - assert count == 39 + for chunk in stream: + tool_calls_generated += chunk.choices[0].delta.tool_calls[0].function.arguments + chunks.append(chunk) assert ( tool_calls_generated - == '{"function": {"_name": "get_n_day_weather_forecast", "location": "San Francisco, CA", "format": "celsius", "num_days":3}}<|eot_id|>' + == '{"function": {"_name": "get_n_day_weather_forecast", "location": "San Francisco, CA", "format": "celsius", "num_days": 3}}<|eot_id|>' ) - assert last_response == response_snapshot + assert chunks == response_snapshot @pytest.mark.asyncio @@ -512,7 +494,8 @@ async def test_flash_llama_grammar_tools_sea_creatures_stream_function_object( async def test_flash_llama_tool_reply_response( flash_llama_grammar_tools, response_snapshot ): - responses = await flash_llama_grammar_tools.chat( + client = InferenceClient(base_url=f"{flash_llama_grammar_tools.base_url}/v1") + response = client.chat_completion( max_tokens=100, seed=42, messages=[ @@ -536,10 +519,10 @@ async def test_flash_llama_tool_reply_response( stream=False, ) - assert responses.choices[0].message.tool_calls is None + assert response.choices[0].message.tool_calls is None assert ( - responses.choices[0].message.content - == "I can't access real-time data, but I can 
provide you with current conditions and forecast for Paris, France:\n\nThe current conditions in Paris are mostly cloudy with a temperature of 6.7°C (44.1°F). \n\nPlease note that the actual weather may differ from this information, and I recommend checking the forecast on a reliable weather website for the most up-to-date information." + response.choices[0].message.content + == "I can't access real-time data, but I can provide you with current conditions and forecast for Paris, France:\n\nThe current conditions in Paris are mostly cloudy with a temperature of 6.7°C (44.1°F). \n\nPlease note that the actual weather may differ from the provided information. For up-to-date information, I suggest checking a reliable weather website or app for the latest conditions and forecast." ) - assert responses == response_snapshot + assert response == response_snapshot diff --git a/integration-tests/pyproject.toml b/integration-tests/pyproject.toml index 37003440f..07aa43073 100644 --- a/integration-tests/pyproject.toml +++ b/integration-tests/pyproject.toml @@ -14,6 +14,7 @@ dependencies = [ "docker>=7", "numpy>=2.0", "openai>=1.65", + "huggingface_hub>=0.29", ] [tool.isort] diff --git a/integration-tests/requirements.txt b/integration-tests/requirements.txt index d419d4b36..a85db4a5b 100644 --- a/integration-tests/requirements.txt +++ b/integration-tests/requirements.txt @@ -40,7 +40,9 @@ httpcore==1.0.7 httpx==0.28.1 # via openai huggingface-hub==0.29.0 - # via text-generation + # via + # text-generation-integration-tests (pyproject.toml) + # text-generation idna==3.10 # via # anyio diff --git a/router/src/infer/chat_template.rs b/router/src/infer/chat_template.rs index e660cc74d..b179dd4d6 100644 --- a/router/src/infer/chat_template.rs +++ b/router/src/infer/chat_template.rs @@ -1189,7 +1189,7 @@ TOOL CALL ID: 0 let tool_prompt = "This default prompt will be used".to_string(); let tools_and_prompt = Some((tools, tool_prompt)); let result = ct.apply(msgs, tools_and_prompt); - let expected = "[INST] I'd like to show off how chat templating works! [/INST]Great! How can I help you today? [INST] Just testing\n---\n[{\"type\":\"function\",\"function\":{\"description\":\"Get the current weather\",\"name\":\"get_current_weather\",\"arguments\":{\"type\":\"object\",\"properties\":{\"location\":{\"type\":\"string\",\"description\":\"The city and state, e.g. San Francisco, CA\"},\"format\":{\"type\":\"string\",\"enum\":[\"celsius\",\"fahrenheit\"],\"description\":\"The temperature unit to use. Infer this from the users location.\"}},\"required\":[\"location\",\"format\"]}}}]\nThis default prompt will be used [/INST]".to_string(); + let expected = "[INST] I'd like to show off how chat templating works! [/INST]Great! How can I help you today? [INST] Just testing\n---\n[{\"type\":\"function\",\"function\":{\"description\":\"Get the current weather\",\"name\":\"get_current_weather\",\"arguments\":\"{\\\"type\\\":\\\"object\\\",\\\"properties\\\":{\\\"location\\\":{\\\"type\\\":\\\"string\\\",\\\"description\\\":\\\"The city and state, e.g. San Francisco, CA\\\"},\\\"format\\\":{\\\"type\\\":\\\"string\\\",\\\"enum\\\":[\\\"celsius\\\",\\\"fahrenheit\\\"],\\\"description\\\":\\\"The temperature unit to use. 
Infer this from the users location.\\\"}},\\\"required\\\":[\\\"location\\\",\\\"format\\\"]}\"}}]\nThis default prompt will be used [/INST]".to_string(); assert_eq!(result.unwrap(), expected); } @@ -1227,7 +1227,7 @@ TOOL CALL ID: 0 let tool_prompt = "This default prompt will be used".to_string(); let tools_and_prompt = Some((tools, tool_prompt)); let result = ct.apply(msgs, tools_and_prompt); - let expected = "<|start_header_id|>system<|end_header_id|>\n\nEnvironment: ipython\nCutting Knowledge Date: December 2023\nToday Date: 26 Jul 2024\n\nYoure a helpful assistant! Answer the users question best you can.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nGiven the following functions, please respond with a JSON for a function call with its proper arguments that best answers the given prompt.\n\nRespond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.Do not use variables.\n\n{\n \"function\": {\n \"arguments\": {\n \"properties\": {\n \"format\": {\n \"description\": \"The temperature unit to use. Infer this from the users location.\",\n \"enum\": [\n \"celsius\",\n \"fahrenheit\"\n ],\n \"type\": \"string\"\n },\n \"location\": {\n \"description\": \"The city and state, e.g. San Francisco, CA\",\n \"type\": \"string\"\n }\n },\n \"required\": [\n \"location\",\n \"format\"\n ],\n \"type\": \"object\"\n },\n \"description\": \"Get the current weather\",\n \"name\": \"get_current_weather\"\n },\n \"type\": \"function\"\n}\n\nWhat is the weather like in Brooklyn, New York?\n---\nThis default prompt will be used<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n".to_string(); + let expected = "<|start_header_id|>system<|end_header_id|>\n\nEnvironment: ipython\nCutting Knowledge Date: December 2023\nToday Date: 26 Jul 2024\n\nYoure a helpful assistant! Answer the users question best you can.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nGiven the following functions, please respond with a JSON for a function call with its proper arguments that best answers the given prompt.\n\nRespond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.Do not use variables.\n\n{\n \"function\": {\n \"arguments\": \"{\\\"type\\\":\\\"object\\\",\\\"properties\\\":{\\\"location\\\":{\\\"type\\\":\\\"string\\\",\\\"description\\\":\\\"The city and state, e.g. San Francisco, CA\\\"},\\\"format\\\":{\\\"type\\\":\\\"string\\\",\\\"enum\\\":[\\\"celsius\\\",\\\"fahrenheit\\\"],\\\"description\\\":\\\"The temperature unit to use. 
Infer this from the users location.\\\"}},\\\"required\\\":[\\\"location\\\",\\\"format\\\"]}\",\n \"description\": \"Get the current weather\",\n \"name\": \"get_current_weather\"\n },\n \"type\": \"function\"\n}\n\nWhat is the weather like in Brooklyn, New York?\n---\nThis default prompt will be used<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n".to_string(); assert_eq!(result.unwrap(), expected); } } diff --git a/router/src/lib.rs b/router/src/lib.rs index 60f1f73a4..a7923c4c1 100644 --- a/router/src/lib.rs +++ b/router/src/lib.rs @@ -1138,10 +1138,17 @@ pub struct FunctionDefinition { #[serde(default)] pub description: Option, pub name: String, - #[serde(alias = "parameters")] + #[serde(alias = "parameters", serialize_with = "serialize_as_string")] pub arguments: serde_json::Value, } +fn serialize_as_string(value: &serde_json::Value, serializer: S) -> Result +where + S: serde::Serializer, +{ + serializer.serialize_str(&value.to_string()) +} + #[derive(Clone, Debug, Deserialize, Serialize, ToSchema)] #[cfg_attr(test, derive(PartialEq))] pub(crate) struct Tool { @@ -1730,7 +1737,7 @@ mod tests { let serialized = serde_json::to_string(&message).unwrap(); assert_eq!( serialized, - r#"{"role":"assistant","tool_calls":[{"id":"0","type":"function","function":{"description":null,"name":"myfn","arguments":{"format":"csv"}}}]}"# + r#"{"role":"assistant","tool_calls":[{"id":"0","type":"function","function":{"description":null,"name":"myfn","arguments":"{\"format\":\"csv\"}"}}]}"# ); } From 976eae216f7faa4c52d34f4128f18cc3717bf8f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Mon, 10 Mar 2025 12:11:10 +0100 Subject: [PATCH 130/183] Nix: the launcher needs a Python env with Torch for GPU detection (#3085) This makes `nix run .` in the repository work again. Should fix #3025. --- flake.nix | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/flake.nix b/flake.nix index 943bf7366..5058667a8 100644 --- a/flake.nix +++ b/flake.nix @@ -44,9 +44,24 @@ benchmark = cargoNix.workspaceMembers.text-generation-benchmark.build.override { inherit crateOverrides; }; - launcher = cargoNix.workspaceMembers.text-generation-launcher.build.override { - inherit crateOverrides; - }; + launcher = + let + launcherUnwrapped = cargoNix.workspaceMembers.text-generation-launcher.build.override { + inherit crateOverrides; + }; + packagePath = + with pkgs.python3.pkgs; + makePythonPath [ + torch + ]; + in + pkgs.writeShellApplication { + name = "text-generation-launcher"; + text = '' + PYTHONPATH="${packagePath}" ${launcherUnwrapped}/bin/text-generation-launcher "$@" + ''; + }; + router = let routerUnwrapped = cargoNix.workspaceMembers.text-generation-router-v3.build.override { From 58a65f7914d5f1e6161996a4d0f9c74f0e256bab Mon Sep 17 00:00:00 2001 From: Alex Weston <43505988+aW3st@users.noreply.github.com> Date: Mon, 10 Mar 2025 07:26:57 -0400 Subject: [PATCH 131/183] Add request parameters to OTel span for `/v1/chat/completions` endpoint (#3000) Record request parameters in OTel span for /v1/chat/completions endpoint --- router/src/server.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/router/src/server.rs b/router/src/server.rs index 6e55d2bcd..0b4da9adb 100644 --- a/router/src/server.rs +++ b/router/src/server.rs @@ -1217,7 +1217,7 @@ example = json ! ({"error": "Incomplete generation"})), #[instrument( skip_all, fields( -// parameters = ? 
req.parameters,
+parameters,
 total_time,
 validation_time,
 queue_time,
 inference_time,
 time_per_token,
 seed,
 )
 )]
 pub(crate) async fn chat_completions(
     Extension(infer): Extension<Infer>,
@@ -1243,7 +1243,7 @@ pub(crate) async fn chat_completions(
     } = chat.clone();
     let (generate_request, using_tools): (GenerateRequest, bool) =
         chat.try_into_generate(&infer)?;
-
+    span.record("parameters", format!("{:?}", generate_request.parameters));
     let logprobs = logprobs.unwrap_or_default();
 
     // extract model id from request if specified

From bbe218a4f700cb0b7dbf0fdca20d598470052505 Mon Sep 17 00:00:00 2001
From: EachSheep
Date: Mon, 10 Mar 2025 19:42:59 +0800
Subject: [PATCH 132/183] Add qwen2 multi lora layers support (#3089)

Add qwen2 multi lora layers support to solve problems like
https://github.com/huggingface/text-generation-inference/issues/2881;
a similar PR is at
https://github.com/huggingface/text-generation-inference/pull/2883.

Co-authored-by: hjs
---
 .../custom_modeling/flash_qwen2_modeling.py   | 86 +++++++++++++++----
 1 file changed, 67 insertions(+), 19 deletions(-)

diff --git a/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py b/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py
index d6569a1db..fc708e58d 100644
--- a/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py
@@ -11,6 +11,8 @@ from text_generation_server.layers.attention import (
     Seqlen,
 )
 from text_generation_server.layers import (
+    TensorParallelMultiAdapterLinear,
+    TensorParallelAdapterRowLinear,
     TensorParallelRowLinear,
     TensorParallelColumnLinear,
     TensorParallelEmbedding,
@@ -23,18 +25,31 @@ from text_generation_server.layers.layernorm import (
 )
 
 
-def load_attention(config, prefix, weights):
+def load_attention(config, prefix, weights, layer_id):
+    prefixes = [f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"]
+    head_size = config.hidden_size // config.num_attention_heads
+    sizes = [
+        head_size * config.num_attention_heads,
+        head_size * config.num_key_value_heads,
+        head_size * config.num_key_value_heads,
+    ]
     if config.num_attention_heads != config.num_key_value_heads:
-        return _load_gqa(config, prefix, weights)
+        base_layer = _load_gqa(config, prefix, weights)
     else:
-        return TensorParallelColumnLinear.load_multi(
+        base_layer = TensorParallelColumnLinear.load_multi(
             config,
-            prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
+            prefixes=prefixes,
             dim=0,
             weights=weights,
             bias=True,
         )
-
+    return TensorParallelMultiAdapterLinear.load(
+        base_layer=base_layer,
+        layer_id=layer_id,
+        layer_names=prefixes,
+        sizes=sizes,
+        process_group=weights.process_group,
+    )
 
 def _load_gqa(config, prefix: str, weights):
     assert config.hidden_size % config.num_attention_heads == 0
@@ -52,6 +67,7 @@ def _load_gqa(config, prefix: str, weights):
 class Qwen2Attention(torch.nn.Module):
     def __init__(
         self,
+        index: int,
         prefix: str,
         config,
         weights,
@@ -83,16 +99,22 @@ class Qwen2Attention(torch.nn.Module):
             config.num_key_value_heads // weights.process_group.size()
         )
 
-        self.query_key_value = load_attention(config, prefix, weights)
+        self.query_key_value = load_attention(config, prefix, weights, index)
 
         self.kv_scales = get_kv_scales(weights, f"{prefix}")
 
-        self.o_proj = TensorParallelRowLinear.load(
+        o_proj = TensorParallelRowLinear.load(
             config,
             prefix=f"{prefix}.o_proj",
             weights=weights,
             bias=False,
         )
+        self.o_proj = TensorParallelAdapterRowLinear.load(
+            o_proj,
+            index,
+            "o_proj",
+            process_group=weights.process_group,
+        )
 
         self.num_groups = self.num_heads //
self.num_key_value_heads self.kv_head_mapping = torch.arange( 0, self.num_key_value_heads, dtype=torch.int32, device=weights.device @@ -110,8 +132,9 @@ class Qwen2Attention(torch.nn.Module): seqlen, max_s, prefill_cache_indices, + adapter_data ): - qkv = self.query_key_value(hidden_states) + qkv = self.query_key_value(hidden_states, adapter_data) query, kv = qkv.split( [ self.head_size * self.num_heads, @@ -163,11 +186,13 @@ class Qwen2Attention(torch.nn.Module): kv_scales=self.kv_scales, ) - return self.o_proj(attn_output.view(-1, self.num_heads * self.head_size)) + return self.o_proj( + attn_output.view(-1, self.num_heads * self.head_size), adapter_data + ) class Qwen2MLP(nn.Module): - def __init__(self, prefix, config, weights): + def __init__(self, prefix, config, weights, index): super().__init__() act = config.hidden_act self.act = ( @@ -181,27 +206,45 @@ class Qwen2MLP(nn.Module): ) ) # Fuse gate and up proj - self.gate_up_proj = TensorParallelColumnLinear.load_multi( + prefixes = [f"{prefix}.gate_proj", f"{prefix}.up_proj"] + sizes = [ + config.intermediate_size, + config.intermediate_size, + ] + gate_up_proj = TensorParallelColumnLinear.load_multi( config, - prefixes=[f"{prefix}.gate_proj", f"{prefix}.up_proj"], + prefixes=prefixes, weights=weights, dim=0, bias=False, ) - self.down_proj = TensorParallelRowLinear.load( + self.gate_up_proj = TensorParallelMultiAdapterLinear.load( + gate_up_proj, + layer_id=index, + layer_names=prefixes, + sizes=sizes, + process_group=weights.process_group, + ) + down_proj = TensorParallelRowLinear.load( config, prefix=f"{prefix}.down_proj", weights=weights, bias=False, ) + self.down_proj = TensorParallelAdapterRowLinear.load( + down_proj, + index, + "down_proj", + process_group=weights.process_group, + ) self.intermediate_size = ( config.intermediate_size // weights.process_group.size() ) - def forward(self, hidden_states): - gate_up_states = self.gate_up_proj(hidden_states) + def forward(self, hidden_states, adapter_data): + gate_up_states = self.gate_up_proj(hidden_states, adapter_data) gate_up_states = gate_up_states.view(-1, 2, self.intermediate_size) - return self.down_proj(self.act(gate_up_states[:, 0]) * gate_up_states[:, 1]) + return self.down_proj(self.act(gate_up_states[:, 0]) * gate_up_states[:, 1], adapter_data) class Qwen2Layer(nn.Module): @@ -209,9 +252,9 @@ class Qwen2Layer(nn.Module): super().__init__() prefix = f"{prefix}.layers.{layer_id}" self.self_attn = Qwen2Attention( - prefix=f"{prefix}.self_attn", config=config, weights=weights + index=layer_id, prefix=f"{prefix}.self_attn", config=config, weights=weights ) - self.mlp = Qwen2MLP(prefix=f"{prefix}.mlp", config=config, weights=weights) + self.mlp = Qwen2MLP(prefix=f"{prefix}.mlp", config=config, weights=weights, index=layer_id) self.input_layernorm = FastRMSNorm.load( prefix=f"{prefix}.input_layernorm", weights=weights, eps=config.rms_norm_eps ) @@ -234,6 +277,7 @@ class Qwen2Layer(nn.Module): seqlen, max_s, prefill_cache_indices, + adapter_data, ): normed_hidden_states, residual = self.input_layernorm(hidden_states) @@ -249,12 +293,13 @@ class Qwen2Layer(nn.Module): seqlen, max_s, prefill_cache_indices, + adapter_data ) hidden_states = attn_output + residual # faster post attention rms norm hidden_states, residual = self.post_attention_layernorm(hidden_states) - mlp_output = self.mlp(hidden_states) + mlp_output = self.mlp(hidden_states, adapter_data) hidden_states = mlp_output + residual return hidden_states @@ -301,6 +346,7 @@ class Qwen2Model(torch.nn.Module): max_s: int, 
true_max_s: int, prefill_cache_indices: Optional[torch.Tensor], + adapter_data, ) -> torch.Tensor: hidden_states = inputs_embeds @@ -324,6 +370,7 @@ class Qwen2Model(torch.nn.Module): seqlen, max_s, prefill_cache_indices, + adapter_data, ) hidden_states, _ = self.norm(hidden_states) @@ -396,6 +443,7 @@ class Qwen2ForCausalLM(torch.nn.Module): max_s, true_max_s, prefill_cache_indices, + adapter_data, ) if lm_head_indices is not None: hidden_states = hidden_states[lm_head_indices] From cae0cbe87dedd82640df9b0fe8c7319861588213 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Mon, 10 Mar 2025 22:03:51 +0800 Subject: [PATCH 133/183] Add modules_to_not_convert in quantized model (#3053) * fix modules_to_not_convert Signed-off-by: jiqing-feng * fix format Signed-off-by: jiqing-feng * fix tp quant skip Signed-off-by: jiqing-feng * revert unquantized changes Signed-off-by: jiqing-feng * use DefaultWeightsLoader in skip modules Signed-off-by: jiqing-feng --------- Signed-off-by: jiqing-feng --- .../layers/gptq/__init__.py | 27 ++++++++++++++++++- .../utils/quantization.py | 9 ++++++- 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/server/text_generation_server/layers/gptq/__init__.py b/server/text_generation_server/layers/gptq/__init__.py index 253876827..f8a62cf53 100644 --- a/server/text_generation_server/layers/gptq/__init__.py +++ b/server/text_generation_server/layers/gptq/__init__.py @@ -6,7 +6,12 @@ import torch from loguru import logger from text_generation_server.utils.import_utils import SYSTEM from text_generation_server.utils.log import log_once -from text_generation_server.utils.weights import Weight, Weights, WeightsLoader +from text_generation_server.utils.weights import ( + Weight, + Weights, + WeightsLoader, + DefaultWeightsLoader, +) if SYSTEM == "ipex": from .ipex import QuantLinear @@ -90,6 +95,7 @@ class GPTQWeightsLoader(WeightsLoader): quant_method: str, quantize: str, sym: bool, + modules_to_not_convert: List[str], ): self.bits = bits self.desc_act = desc_act @@ -97,6 +103,7 @@ class GPTQWeightsLoader(WeightsLoader): self.quant_method = quant_method self.quantize = quantize self.sym = sym + self.modules_to_not_convert = modules_to_not_convert def get_weights(self, weights: Weights, prefix: str): self._get_gptq_params(weights) @@ -109,6 +116,9 @@ class GPTQWeightsLoader(WeightsLoader): log_once(logger.warning, "Disabling exllama because desc_act=True") use_exllama = False + if self.is_layer_skipped_quantization(prefix, self.modules_to_not_convert): + return DefaultWeightsLoader.get_weights(weights, prefix) + try: qweight = weights.get_tensor(f"{prefix}.qweight") except RuntimeError: @@ -175,12 +185,23 @@ class GPTQWeightsLoader(WeightsLoader): use_exllama=use_exllama, ) + def is_layer_skipped_quantization( + self, prefix: str, modules_to_not_convert: List[str] + ): + if modules_to_not_convert is None: + return False + return any(module_name in prefix for module_name in modules_to_not_convert) + def get_weights_col_packed( self, weights: Weights, prefix: str, block_sizes: Union[int, List[int]], ): + if self.is_layer_skipped_quantization(prefix, self.modules_to_not_convert): + return DefaultWeightsLoader.get_weights_col_packed( + weights, prefix, block_sizes + ) try: qweight = weights.get_packed_sharded( f"{prefix}.qweight", dim=1, block_sizes=block_sizes @@ -232,6 +253,8 @@ class GPTQWeightsLoader(WeightsLoader): ) def get_multi_weights_col(self, weights: Weights, prefixes: List[str], dim: int): + if self.is_layer_skipped_quantization(prefixes[0], 
self.modules_to_not_convert): + return DefaultWeightsLoader.get_multi_weights_col(weights, prefixes, dim) try: qweight = torch.cat( [weights.get_sharded(f"{p}.qweight", dim=1) for p in prefixes], dim=1 @@ -310,6 +333,8 @@ class GPTQWeightsLoader(WeightsLoader): log_once(logger.warning, "Disabling exllama because desc_act=True") use_exllama = False + if self.is_layer_skipped_quantization(prefix, self.modules_to_not_convert): + return DefaultWeightsLoader.get_weights_row(weights, prefix) try: qweight = weights.get_sharded(f"{prefix}.qweight", dim=0) except RuntimeError: diff --git a/server/text_generation_server/utils/quantization.py b/server/text_generation_server/utils/quantization.py index e44cf64fe..7324b33ff 100644 --- a/server/text_generation_server/utils/quantization.py +++ b/server/text_generation_server/utils/quantization.py @@ -21,6 +21,7 @@ class _QuantizerConfig: quant_method: str sym: bool weight_block_size: Optional[List[int]] + modules_to_not_convert: Optional[List[str]] @dataclass @@ -51,6 +52,7 @@ def _get_quantizer_config(model_id, revision): sym = False desc_act = False weight_block_size = None + modules_to_not_convert = None filename = "config.json" try: @@ -73,7 +75,10 @@ def _get_quantizer_config(model_id, revision): # Order is important here, desc_act is missing on some real models quant_method = data["quantization_config"]["quant_method"] checkpoint_format = data["quantization_config"].get("checkpoint_format") - desc_act = data["quantization_config"]["desc_act"] + desc_act = data["quantization_config"].get("desc_act", False) + modules_to_not_convert = data["quantization_config"].get( + "modules_to_not_convert", [] + ) except Exception: filename = "quantize_config.json" try: @@ -110,6 +115,7 @@ def _get_quantizer_config(model_id, revision): sym=sym, desc_act=desc_act, weight_block_size=weight_block_size, + modules_to_not_convert=modules_to_not_convert, ) @@ -159,6 +165,7 @@ def get_loader( quant_method=quantizer_config.quant_method, quantize=quantize, sym=quantizer_config.sym, + modules_to_not_convert=quantizer_config.modules_to_not_convert, ) elif quantize == "bitsandbytes": from text_generation_server.layers.bnb import BNBWeight From c5ecc7a4de05e51c8a3c119e93d53a1151edce9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Mon, 10 Mar 2025 15:08:23 +0100 Subject: [PATCH 134/183] Small test and typing fixes (#3078) * test_weights: add modules_to_not_convert * More typing fixes --- server/tests/utils/test_weights.py | 2 ++ server/text_generation_server/layers/gptq/__init__.py | 2 -- server/text_generation_server/utils/quantization.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/server/tests/utils/test_weights.py b/server/tests/utils/test_weights.py index 556fcea1e..b21749d11 100644 --- a/server/tests/utils/test_weights.py +++ b/server/tests/utils/test_weights.py @@ -25,6 +25,7 @@ def gptq_weights_loader(): quant_method="gptq", quantize="gptq", sym=True, + modules_to_not_convert=[], ) @@ -37,6 +38,7 @@ def gptq_weights_loader_awq(): quant_method="awq", quantize="awq", sym=True, + modules_to_not_convert=[], ) diff --git a/server/text_generation_server/layers/gptq/__init__.py b/server/text_generation_server/layers/gptq/__init__.py index f8a62cf53..b5549916b 100644 --- a/server/text_generation_server/layers/gptq/__init__.py +++ b/server/text_generation_server/layers/gptq/__init__.py @@ -188,8 +188,6 @@ class GPTQWeightsLoader(WeightsLoader): def is_layer_skipped_quantization( self, prefix: str, modules_to_not_convert: 
List[str] ): - if modules_to_not_convert is None: - return False return any(module_name in prefix for module_name in modules_to_not_convert) def get_weights_col_packed( diff --git a/server/text_generation_server/utils/quantization.py b/server/text_generation_server/utils/quantization.py index 7324b33ff..e460361af 100644 --- a/server/text_generation_server/utils/quantization.py +++ b/server/text_generation_server/utils/quantization.py @@ -21,7 +21,7 @@ class _QuantizerConfig: quant_method: str sym: bool weight_block_size: Optional[List[int]] - modules_to_not_convert: Optional[List[str]] + modules_to_not_convert: List[str] @dataclass @@ -52,7 +52,7 @@ def _get_quantizer_config(model_id, revision): sym = False desc_act = False weight_block_size = None - modules_to_not_convert = None + modules_to_not_convert = [] filename = "config.json" try: From 124398fa57edfc62ba9a3384fe711d78ce1bb3f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Mon, 10 Mar 2025 16:19:50 +0100 Subject: [PATCH 135/183] hotfix: qwen2 formatting (#3093) * hotfix: qwen2 formatting * cargo fmt --- router/src/server.rs | 20 +++++++++---------- .../custom_modeling/flash_qwen2_modeling.py | 13 ++++++++---- 2 files changed, 19 insertions(+), 14 deletions(-) diff --git a/router/src/server.rs b/router/src/server.rs index 0b4da9adb..d201f51a4 100644 --- a/router/src/server.rs +++ b/router/src/server.rs @@ -1215,16 +1215,16 @@ example = json ! ({"error": "Incomplete generation"})), ) )] #[instrument( -skip_all, -fields( -parameters, -total_time, -validation_time, -queue_time, -inference_time, -time_per_token, -seed, -) + skip_all, + fields( + parameters, + total_time, + validation_time, + queue_time, + inference_time, + time_per_token, + seed, + ) )] pub(crate) async fn chat_completions( Extension(infer): Extension, diff --git a/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py b/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py index fc708e58d..9d9562222 100644 --- a/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py @@ -51,6 +51,7 @@ def load_attention(config, prefix, weights, layer_id): process_group=weights.process_group, ) + def _load_gqa(config, prefix: str, weights): assert config.hidden_size % config.num_attention_heads == 0 assert config.num_attention_heads % weights.process_group.size() == 0 @@ -132,7 +133,7 @@ class Qwen2Attention(torch.nn.Module): seqlen, max_s, prefill_cache_indices, - adapter_data + adapter_data, ): qkv = self.query_key_value(hidden_states, adapter_data) query, kv = qkv.split( @@ -244,7 +245,9 @@ class Qwen2MLP(nn.Module): def forward(self, hidden_states, adapter_data): gate_up_states = self.gate_up_proj(hidden_states, adapter_data) gate_up_states = gate_up_states.view(-1, 2, self.intermediate_size) - return self.down_proj(self.act(gate_up_states[:, 0]) * gate_up_states[:, 1], adapter_data) + return self.down_proj( + self.act(gate_up_states[:, 0]) * gate_up_states[:, 1], adapter_data + ) class Qwen2Layer(nn.Module): @@ -254,7 +257,9 @@ class Qwen2Layer(nn.Module): self.self_attn = Qwen2Attention( index=layer_id, prefix=f"{prefix}.self_attn", config=config, weights=weights ) - self.mlp = Qwen2MLP(prefix=f"{prefix}.mlp", config=config, weights=weights, index=layer_id) + self.mlp = Qwen2MLP( + prefix=f"{prefix}.mlp", config=config, weights=weights, index=layer_id + ) self.input_layernorm = FastRMSNorm.load( 
prefix=f"{prefix}.input_layernorm", weights=weights, eps=config.rms_norm_eps ) @@ -293,7 +298,7 @@ class Qwen2Layer(nn.Module): seqlen, max_s, prefill_cache_indices, - adapter_data + adapter_data, ) hidden_states = attn_output + residual From dc5f05f8e6ad170bc58e48632b137572268eab25 Mon Sep 17 00:00:00 2001 From: drbh Date: Mon, 10 Mar 2025 12:56:19 -0400 Subject: [PATCH 136/183] Pr 3003 ci branch (#3007) * change ChatCompletionChunk to align with "OpenAI Chat Completions streaming API" Moving after tool_calls2 Signed-off-by: Wang, Yi A add in Buffering.. Signed-off-by: Wang, Yi A fix: handle usage outside of stream state and add tests Simplifying everything quite a bit. Remove the unused model_dump. Clippy. Clippy ? Ruff. Uppgrade the flake for latest transformers. Upgrade after rebase. Remove potential footgun. Fix completion test. * Clippy. * Tweak for multi prompt. * Ruff. * Update the snapshot a bit. --------- Co-authored-by: Nicolas Patry --- flake.lock | 127 ++++++-- integration-tests/conftest.py | 3 +- .../test_chat_hfhub_nousage.json | 62 ++++ .../test_chat_hfhub_usage.json | 75 +++++ .../test_chat_openai_nousage.json | 71 ++++ .../test_chat_openai_usage.json | 87 +++++ ...t_flash_llama_completion_many_prompts.json | 18 +- ..._llama_completion_many_prompts_stream.json | 304 +++++++++++------- ..._flash_llama_completion_single_prompt.json | 6 +- ...t_flash_llama_completion_stream_usage.json | 29 +- .../models/test_chat_stream_options.py | 15 + .../models/test_completion_prompts.py | 143 +++++--- router/src/lib.rs | 3 +- router/src/server.rs | 74 +++-- 14 files changed, 777 insertions(+), 240 deletions(-) create mode 100644 integration-tests/models/__snapshots__/test_completion_prompts/test_chat_hfhub_nousage.json create mode 100644 integration-tests/models/__snapshots__/test_completion_prompts/test_chat_hfhub_usage.json create mode 100644 integration-tests/models/__snapshots__/test_completion_prompts/test_chat_openai_nousage.json create mode 100644 integration-tests/models/__snapshots__/test_completion_prompts/test_chat_openai_usage.json create mode 100644 integration-tests/models/test_chat_stream_options.py diff --git a/flake.lock b/flake.lock index b6cf7e530..719cdeea0 100644 --- a/flake.lock +++ b/flake.lock @@ -2,10 +2,16 @@ "nodes": { "cachix": { "inputs": { - "devenv": ["crate2nix"], - "flake-compat": ["crate2nix"], + "devenv": [ + "crate2nix" + ], + "flake-compat": [ + "crate2nix" + ], "nixpkgs": "nixpkgs", - "pre-commit-hooks": ["crate2nix"] + "pre-commit-hooks": [ + "crate2nix" + ] }, "locked": { "lastModified": 1709700175, @@ -24,10 +30,19 @@ }, "cachix_2": { "inputs": { - "devenv": ["crate2nix", "crate2nix_stable"], - "flake-compat": ["crate2nix", "crate2nix_stable"], + "devenv": [ + "crate2nix", + "crate2nix_stable" + ], + "flake-compat": [ + "crate2nix", + "crate2nix_stable" + ], "nixpkgs": "nixpkgs_2", - "pre-commit-hooks": ["crate2nix", "crate2nix_stable"] + "pre-commit-hooks": [ + "crate2nix", + "crate2nix_stable" + ] }, "locked": { "lastModified": 1716549461, @@ -46,8 +61,16 @@ }, "cachix_3": { "inputs": { - "devenv": ["crate2nix", "crate2nix_stable", "crate2nix_stable"], - "flake-compat": ["crate2nix", "crate2nix_stable", "crate2nix_stable"], + "devenv": [ + "crate2nix", + "crate2nix_stable", + "crate2nix_stable" + ], + "flake-compat": [ + "crate2nix", + "crate2nix_stable", + "crate2nix_stable" + ], "nixpkgs": "nixpkgs_3", "pre-commit-hooks": [ "crate2nix", @@ -78,15 +101,18 @@ "flake-compat": "flake-compat_3", "flake-parts": "flake-parts_3", "nix-test-runner": 
"nix-test-runner_3", - "nixpkgs": ["tgi-nix", "nixpkgs"], + "nixpkgs": [ + "tgi-nix", + "nixpkgs" + ], "pre-commit-hooks": "pre-commit-hooks_3" }, "locked": { - "lastModified": 1734429562, - "narHash": "sha256-V2XNs3Ir8WXNHdocfzkR/fu0FzkZ9uTDJkVecxJrGmQ=", + "lastModified": 1739473963, + "narHash": "sha256-ItAhpjNUzEWd/cgZVyW/jvoGbCec4TK29e1Mnmn1oJE=", "owner": "nix-community", "repo": "crate2nix", - "rev": "8537c2d7cb623679aaeff62c4c4c43a91566ab09", + "rev": "be31feae9a82c225c0fd1bdf978565dc452a483a", "type": "github" }, "original": { @@ -193,7 +219,11 @@ "devshell_2": { "inputs": { "flake-utils": "flake-utils_3", - "nixpkgs": ["crate2nix", "crate2nix_stable", "nixpkgs"] + "nixpkgs": [ + "crate2nix", + "crate2nix_stable", + "nixpkgs" + ] }, "locked": { "lastModified": 1717408969, @@ -212,7 +242,10 @@ "devshell_3": { "inputs": { "flake-utils": "flake-utils_4", - "nixpkgs": ["crate2nix", "nixpkgs"] + "nixpkgs": [ + "crate2nix", + "nixpkgs" + ] }, "locked": { "lastModified": 1711099426, @@ -310,7 +343,11 @@ }, "flake-parts_2": { "inputs": { - "nixpkgs-lib": ["crate2nix", "crate2nix_stable", "nixpkgs"] + "nixpkgs-lib": [ + "crate2nix", + "crate2nix_stable", + "nixpkgs" + ] }, "locked": { "lastModified": 1719745305, @@ -328,7 +365,10 @@ }, "flake-parts_3": { "inputs": { - "nixpkgs-lib": ["crate2nix", "nixpkgs"] + "nixpkgs-lib": [ + "crate2nix", + "nixpkgs" + ] }, "locked": { "lastModified": 1712014858, @@ -519,7 +559,11 @@ }, "gitignore_3": { "inputs": { - "nixpkgs": ["crate2nix", "pre-commit-hooks", "nixpkgs"] + "nixpkgs": [ + "crate2nix", + "pre-commit-hooks", + "nixpkgs" + ] }, "locked": { "lastModified": 1709087332, @@ -726,10 +770,22 @@ }, "pre-commit-hooks_2": { "inputs": { - "flake-compat": ["crate2nix", "crate2nix_stable", "flake-compat"], + "flake-compat": [ + "crate2nix", + "crate2nix_stable", + "flake-compat" + ], "gitignore": "gitignore_2", - "nixpkgs": ["crate2nix", "crate2nix_stable", "nixpkgs"], - "nixpkgs-stable": ["crate2nix", "crate2nix_stable", "nixpkgs"] + "nixpkgs": [ + "crate2nix", + "crate2nix_stable", + "nixpkgs" + ], + "nixpkgs-stable": [ + "crate2nix", + "crate2nix_stable", + "nixpkgs" + ] }, "locked": { "lastModified": 1719259945, @@ -747,11 +803,20 @@ }, "pre-commit-hooks_3": { "inputs": { - "flake-compat": ["crate2nix", "flake-compat"], + "flake-compat": [ + "crate2nix", + "flake-compat" + ], "flake-utils": "flake-utils_5", "gitignore": "gitignore_3", - "nixpkgs": ["crate2nix", "nixpkgs"], - "nixpkgs-stable": ["crate2nix", "nixpkgs"] + "nixpkgs": [ + "crate2nix", + "nixpkgs" + ], + "nixpkgs-stable": [ + "crate2nix", + "nixpkgs" + ] }, "locked": { "lastModified": 1712055707, @@ -772,21 +837,27 @@ "crate2nix": "crate2nix", "flake-utils": "flake-utils_6", "nix-filter": "nix-filter", - "nixpkgs": ["tgi-nix", "nixpkgs"], + "nixpkgs": [ + "tgi-nix", + "nixpkgs" + ], "rust-overlay": "rust-overlay", "tgi-nix": "tgi-nix" } }, "rust-overlay": { "inputs": { - "nixpkgs": ["tgi-nix", "nixpkgs"] + "nixpkgs": [ + "tgi-nix", + "nixpkgs" + ] }, "locked": { - "lastModified": 1738549608, - "narHash": "sha256-GdyT9QEUSx5k/n8kILuNy83vxxdyUfJ8jL5mMpQZWfw=", + "lastModified": 1741141853, + "narHash": "sha256-FauVtC+FbOgkKpGVuQTNxSqrvgbmVc7hFkjn/DacwMo=", "owner": "oxalica", "repo": "rust-overlay", - "rev": "35c6f8c4352f995ecd53896200769f80a3e8f22d", + "rev": "02edad1f19d6dec824e0812e4cdc0aa7930ff8ae", "type": "github" }, "original": { diff --git a/integration-tests/conftest.py b/integration-tests/conftest.py index 6490f8330..f78524414 100644 --- a/integration-tests/conftest.py +++ 
b/integration-tests/conftest.py @@ -8,6 +8,7 @@ from huggingface_hub.inference._generated.types.chat_completion import ( from openai.types.chat.chat_completion_chunk import ( ChatCompletionChunk as OAIChatCompletionChunk, ) +from openai.types.completion import Completion as OAICompletion import requests @@ -39,7 +40,6 @@ from typing import Dict, List, Optional from aiohttp import ClientConnectorError, ClientOSError, ServerDisconnectedError from docker.errors import NotFound from syrupy.extensions.json import JSONSnapshotExtension - from text_generation import AsyncClient from text_generation.types import ( BestOfSequence, @@ -133,6 +133,7 @@ class ResponseComparator(JSONSnapshotExtension): or isinstance(data, ChatCompletionComplete) or isinstance(data, Completion) or isinstance(data, OAIChatCompletionChunk) + or isinstance(data, OAICompletion) ): data = data.model_dump() elif isinstance(data, ChatCompletionStreamOutput) or isinstance( diff --git a/integration-tests/models/__snapshots__/test_completion_prompts/test_chat_hfhub_nousage.json b/integration-tests/models/__snapshots__/test_completion_prompts/test_chat_hfhub_nousage.json new file mode 100644 index 000000000..a05b685ed --- /dev/null +++ b/integration-tests/models/__snapshots__/test_completion_prompts/test_chat_hfhub_nousage.json @@ -0,0 +1,62 @@ +[ + { + "choices": [ + { + "delta": { + "content": "OK", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741265520, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": "!", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741265520, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": "", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": "stop", + "index": 0, + "logprobs": null + } + ], + "created": 1741265520, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + } +] diff --git a/integration-tests/models/__snapshots__/test_completion_prompts/test_chat_hfhub_usage.json b/integration-tests/models/__snapshots__/test_completion_prompts/test_chat_hfhub_usage.json new file mode 100644 index 000000000..d2c969b2b --- /dev/null +++ b/integration-tests/models/__snapshots__/test_completion_prompts/test_chat_hfhub_usage.json @@ -0,0 +1,75 @@ +[ + { + "choices": [ + { + "delta": { + "content": "OK", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741266005, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": "!", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741266005, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": "", + "role": 
"assistant", + "tool_calls": null + }, + "finish_reason": "stop", + "index": 0, + "logprobs": null + } + ], + "created": 1741266005, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [], + "created": 1741266005, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": { + "completion_tokens": 3, + "prompt_tokens": 39, + "total_tokens": 42 + } + } +] diff --git a/integration-tests/models/__snapshots__/test_completion_prompts/test_chat_openai_nousage.json b/integration-tests/models/__snapshots__/test_completion_prompts/test_chat_openai_nousage.json new file mode 100644 index 000000000..6c3620598 --- /dev/null +++ b/integration-tests/models/__snapshots__/test_completion_prompts/test_chat_openai_nousage.json @@ -0,0 +1,71 @@ +[ + { + "choices": [ + { + "delta": { + "content": "OK", + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741265134, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": "!", + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741265134, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": "", + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": null + }, + "finish_reason": "stop", + "index": 0, + "logprobs": null + } + ], + "created": 1741265134, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + } +] diff --git a/integration-tests/models/__snapshots__/test_completion_prompts/test_chat_openai_usage.json b/integration-tests/models/__snapshots__/test_completion_prompts/test_chat_openai_usage.json new file mode 100644 index 000000000..feb32567a --- /dev/null +++ b/integration-tests/models/__snapshots__/test_completion_prompts/test_chat_openai_usage.json @@ -0,0 +1,87 @@ +[ + { + "choices": [ + { + "delta": { + "content": "OK", + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741265133, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": "!", + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741265133, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": "", + "function_call": null, + "refusal": 
null, + "role": "assistant", + "tool_calls": null + }, + "finish_reason": "stop", + "index": 0, + "logprobs": null + } + ], + "created": 1741265133, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [], + "created": 1741265133, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": "3.1.2-dev0-native", + "usage": { + "completion_tokens": 3, + "completion_tokens_details": null, + "prompt_tokens": 39, + "prompt_tokens_details": null, + "total_tokens": 42 + } + } +] diff --git a/integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_many_prompts.json b/integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_many_prompts.json index 25b8120d4..5bef41727 100644 --- a/integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_many_prompts.json +++ b/integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_many_prompts.json @@ -1,17 +1,17 @@ { "choices": [ - { - "finish_reason": "length", - "index": 0, - "logprobs": null, - "text": " A Beginner’s Guide\nDeep learning is a subset" - }, { "finish_reason": "length", "index": 1, "logprobs": null, "text": " This is a question that has puzzled many people for" }, + { + "finish_reason": "length", + "index": 0, + "logprobs": null, + "text": " A Beginner’s Guide\nDeep learning is a subset" + }, { "finish_reason": "length", "index": 3, @@ -25,11 +25,11 @@ "text": " Paris\nWhat is the capital of France?\nThe" } ], - "created": 1725877154, + "created": 1741264813, "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native", + "system_fingerprint": "3.1.2-dev0-native", "usage": { "completion_tokens": 40, "prompt_tokens": 22, diff --git a/integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_many_prompts_stream.json b/integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_many_prompts_stream.json index dd22ceae1..ad4ee6e0e 100644 --- a/integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_many_prompts_stream.json +++ b/integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_many_prompts_stream.json @@ -8,11 +8,12 @@ "text": " A" } ], - "created": 1725883643, + "created": 1741340006, "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" + "system_fingerprint": "3.1.2-dev0-native", + "usage": null }, { "choices": [ @@ -23,11 +24,12 @@ "text": " This" } ], - "created": 1725883643, + "created": 1741340006, "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" + "system_fingerprint": "3.1.2-dev0-native", + "usage": null }, { "choices": [ @@ -38,11 +40,12 @@ "text": " Paris" } ], - "created": 1725883643, + "created": 1741340006, "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "text_completion", - 
"system_fingerprint": "2.2.1-dev0-native" + "system_fingerprint": "3.1.2-dev0-native", + "usage": null }, { "choices": [ @@ -53,11 +56,12 @@ "text": "us" } ], - "created": 1725883643, + "created": 1741340006, "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" + "system_fingerprint": "3.1.2-dev0-native", + "usage": null }, { "choices": [ @@ -68,11 +72,12 @@ "text": " Beginner" } ], - "created": 1725883643, + "created": 1741340006, "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" + "system_fingerprint": "3.1.2-dev0-native", + "usage": null }, { "choices": [ @@ -83,11 +88,12 @@ "text": " is" } ], - "created": 1725883643, + "created": 1741340006, "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" + "system_fingerprint": "3.1.2-dev0-native", + "usage": null }, { "choices": [ @@ -98,11 +104,12 @@ "text": "\n" } ], - "created": 1725883643, + "created": 1741340007, "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" + "system_fingerprint": "3.1.2-dev0-native", + "usage": null }, { "choices": [ @@ -113,11 +120,12 @@ "text": "cul" } ], - "created": 1725883643, + "created": 1741340007, "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" + "system_fingerprint": "3.1.2-dev0-native", + "usage": null }, { "choices": [ @@ -128,11 +136,12 @@ "text": "’s" } ], - "created": 1725883643, + "created": 1741340007, "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" + "system_fingerprint": "3.1.2-dev0-native", + "usage": null }, { "choices": [ @@ -143,11 +152,12 @@ "text": " a" } ], - "created": 1725883643, + "created": 1741340007, "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" + "system_fingerprint": "3.1.2-dev0-native", + "usage": null }, { "choices": [ @@ -158,11 +168,12 @@ "text": "What" } ], - "created": 1725883643, + "created": 1741340007, "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" + "system_fingerprint": "3.1.2-dev0-native", + "usage": null }, { "choices": [ @@ -173,11 +184,12 @@ "text": "as" } ], - "created": 1725883643, + "created": 1741340007, "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" + "system_fingerprint": "3.1.2-dev0-native", + "usage": null }, { "choices": [ @@ -188,11 +200,12 @@ "text": " Guide" } ], - "created": 1725883643, + "created": 1741340007, "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "text_completion", - "system_fingerprint": 
"2.2.1-dev0-native" + "system_fingerprint": "3.1.2-dev0-native", + "usage": null }, { "choices": [ @@ -203,11 +216,12 @@ "text": " question" } ], - "created": 1725883643, + "created": 1741340007, "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" + "system_fingerprint": "3.1.2-dev0-native", + "usage": null }, { "choices": [ @@ -218,11 +232,12 @@ "text": " is" } ], - "created": 1725883643, + "created": 1741340007, "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" + "system_fingerprint": "3.1.2-dev0-native", + "usage": null }, { "choices": [ @@ -233,11 +248,12 @@ "text": "_minus" } ], - "created": 1725883643, + "created": 1741340007, "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" + "system_fingerprint": "3.1.2-dev0-native", + "usage": null }, { "choices": [ @@ -248,11 +264,12 @@ "text": "\n" } ], - "created": 1725883643, + "created": 1741340007, "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" + "system_fingerprint": "3.1.2-dev0-native", + "usage": null }, { "choices": [ @@ -263,11 +280,12 @@ "text": " that" } ], - "created": 1725883643, + "created": 1741340007, "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" + "system_fingerprint": "3.1.2-dev0-native", + "usage": null }, { "choices": [ @@ -278,11 +296,12 @@ "text": " the" } ], - "created": 1725883643, + "created": 1741340007, "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" + "system_fingerprint": "3.1.2-dev0-native", + "usage": null }, { "choices": [ @@ -293,11 +312,12 @@ "text": "cul" } ], - "created": 1725883643, + "created": 1741340007, "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" + "system_fingerprint": "3.1.2-dev0-native", + "usage": null }, { "choices": [ @@ -308,11 +328,12 @@ "text": "Deep" } ], - "created": 1725883643, + "created": 1741340007, "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" + "system_fingerprint": "3.1.2-dev0-native", + "usage": null }, { "choices": [ @@ -323,11 +344,12 @@ "text": " has" } ], - "created": 1725883643, + "created": 1741340007, "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" + "system_fingerprint": "3.1.2-dev0-native", + "usage": null }, { "choices": [ @@ -338,11 +360,12 @@ "text": " capital" } ], - "created": 1725883643, + "created": 1741340007, "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "text_completion", - "system_fingerprint": 
"2.2.1-dev0-native" + "system_fingerprint": "3.1.2-dev0-native", + "usage": null }, { "choices": [ @@ -353,11 +376,12 @@ "text": "as" } ], - "created": 1725883643, + "created": 1741340007, "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" + "system_fingerprint": "3.1.2-dev0-native", + "usage": null }, { "choices": [ @@ -368,11 +392,12 @@ "text": " learning" } ], - "created": 1725883643, + "created": 1741340007, "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" + "system_fingerprint": "3.1.2-dev0-native", + "usage": null }, { "choices": [ @@ -383,11 +408,12 @@ "text": " puzzled" } ], - "created": 1725883643, + "created": 1741340007, "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" + "system_fingerprint": "3.1.2-dev0-native", + "usage": null }, { "choices": [ @@ -398,11 +424,12 @@ "text": " of" } ], - "created": 1725883643, + "created": 1741340007, "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" + "system_fingerprint": "3.1.2-dev0-native", + "usage": null }, { "choices": [ @@ -413,11 +440,12 @@ "text": "(s" } ], - "created": 1725883643, + "created": 1741340007, "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" + "system_fingerprint": "3.1.2-dev0-native", + "usage": null }, { "choices": [ @@ -428,11 +456,12 @@ "text": " is" } ], - "created": 1725883643, + "created": 1741340007, "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" + "system_fingerprint": "3.1.2-dev0-native", + "usage": null }, { "choices": [ @@ -443,11 +472,12 @@ "text": " many" } ], - "created": 1725883643, + "created": 1741340007, "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" + "system_fingerprint": "3.1.2-dev0-native", + "usage": null }, { "choices": [ @@ -458,11 +488,12 @@ "text": " France" } ], - "created": 1725883643, + "created": 1741340007, "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" + "system_fingerprint": "3.1.2-dev0-native", + "usage": null }, { "choices": [ @@ -473,11 +504,12 @@ "text": "):\n" } ], - "created": 1725883643, + "created": 1741340007, "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" + "system_fingerprint": "3.1.2-dev0-native", + "usage": null }, { "choices": [ @@ -488,11 +520,12 @@ "text": " a" } ], - "created": 1725883643, + "created": 1741340007, "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" 
+ "system_fingerprint": "3.1.2-dev0-native", + "usage": null }, { "choices": [ @@ -503,11 +536,12 @@ "text": " people" } ], - "created": 1725883643, + "created": 1741340007, "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" + "system_fingerprint": "3.1.2-dev0-native", + "usage": null }, { "choices": [ @@ -518,11 +552,12 @@ "text": "?\n" } ], - "created": 1725883643, + "created": 1741340007, "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" + "system_fingerprint": "3.1.2-dev0-native", + "usage": null }, { "choices": [ @@ -533,11 +568,12 @@ "text": " " } ], - "created": 1725883643, + "created": 1741340007, "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" + "system_fingerprint": "3.1.2-dev0-native", + "usage": null }, { "choices": [ @@ -548,11 +584,18 @@ "text": " subset" } ], - "created": 1725883643, + "created": 1741340007, "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" + "system_fingerprint": "3.1.2-dev0-native", + "usage": { + "completion_tokens": 10, + "completion_tokens_details": null, + "prompt_tokens": 6, + "prompt_tokens_details": null, + "total_tokens": 16 + } }, { "choices": [ @@ -563,11 +606,18 @@ "text": " for" } ], - "created": 1725883643, + "created": 1741340007, "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" + "system_fingerprint": "3.1.2-dev0-native", + "usage": { + "completion_tokens": 10, + "completion_tokens_details": null, + "prompt_tokens": 5, + "prompt_tokens_details": null, + "total_tokens": 15 + } }, { "choices": [ @@ -578,11 +628,18 @@ "text": "The" } ], - "created": 1725883643, + "created": 1741340007, "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" + "system_fingerprint": "3.1.2-dev0-native", + "usage": { + "completion_tokens": 10, + "completion_tokens_details": null, + "prompt_tokens": 8, + "prompt_tokens_details": null, + "total_tokens": 18 + } }, { "choices": [ @@ -593,10 +650,17 @@ "text": " \"\"\"\n" } ], - "created": 1725883643, + "created": 1741340007, "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native" + "system_fingerprint": "3.1.2-dev0-native", + "usage": { + "completion_tokens": 10, + "completion_tokens_details": null, + "prompt_tokens": 3, + "prompt_tokens_details": null, + "total_tokens": 13 + } } ] diff --git a/integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_single_prompt.json b/integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_single_prompt.json index 7ad562714..1cb8c103a 100644 --- a/integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_single_prompt.json +++ 
b/integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_single_prompt.json @@ -7,11 +7,11 @@ "text": " A Beginner’s Guide\nDeep learning is a subset" } ], - "created": 1725876621, + "created": 1741264812, "id": "", - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "text_completion", - "system_fingerprint": "2.2.1-dev0-native", + "system_fingerprint": "3.1.2-dev0-native", "usage": { "completion_tokens": 10, "prompt_tokens": 6, diff --git a/integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_stream_usage.json b/integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_stream_usage.json index fbb3669fd..898fd7fbe 100644 --- a/integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_stream_usage.json +++ b/integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_stream_usage.json @@ -12,7 +12,7 @@ "logprobs": null } ], - "created": 1741338471, + "created": 1741373593, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -32,7 +32,7 @@ "logprobs": null } ], - "created": 1741338471, + "created": 1741373593, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -52,7 +52,7 @@ "logprobs": null } ], - "created": 1741338471, + "created": 1741373593, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -72,7 +72,7 @@ "logprobs": null } ], - "created": 1741338471, + "created": 1741373594, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -92,7 +92,7 @@ "logprobs": null } ], - "created": 1741338472, + "created": 1741373594, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -112,7 +112,7 @@ "logprobs": null } ], - "created": 1741338472, + "created": 1741373594, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -132,7 +132,7 @@ "logprobs": null } ], - "created": 1741338472, + "created": 1741373594, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -152,7 +152,7 @@ "logprobs": null } ], - "created": 1741338472, + "created": 1741373594, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -172,7 +172,7 @@ "logprobs": null } ], - "created": 1741338472, + "created": 1741373594, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -192,7 +192,16 @@ "logprobs": null } ], - "created": 1741338472, + "created": 1741373594, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [], + "created": 1741373594, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", diff --git a/integration-tests/models/test_chat_stream_options.py b/integration-tests/models/test_chat_stream_options.py new file mode 100644 index 000000000..f93cc951a --- /dev/null +++ b/integration-tests/models/test_chat_stream_options.py @@ -0,0 +1,15 @@ +import pytest + + +@pytest.fixture(scope="module") +def chat_handle(launcher): + with launcher( + "meta-llama/Meta-Llama-3.1-8B-Instruct", + ) as handle: + yield handle + + +@pytest.fixture(scope="module") +async def chat_client(chat_handle): + await 
chat_handle.health(300) + return chat_handle.client diff --git a/integration-tests/models/test_completion_prompts.py b/integration-tests/models/test_completion_prompts.py index 27988ef99..a2c037c59 100644 --- a/integration-tests/models/test_completion_prompts.py +++ b/integration-tests/models/test_completion_prompts.py @@ -1,11 +1,8 @@ import pytest import requests -import json -from aiohttp import ClientSession +from openai import OpenAI from huggingface_hub import InferenceClient -from text_generation.types import Completion - @pytest.fixture(scope="module") def flash_llama_completion_handle(launcher): @@ -73,7 +70,6 @@ async def test_flash_llama_completion_stream_usage( for chunk in stream: # remove "data:" chunks.append(chunk) - print(f"Chunk {chunk}") if len(chunk.choices) == 1: index = chunk.choices[0].index assert index == 0 @@ -158,47 +154,29 @@ def test_flash_llama_completion_many_prompts(flash_llama_completion, response_sn async def test_flash_llama_completion_many_prompts_stream( flash_llama_completion, response_snapshot ): - request = { - "model": "tgi", - "prompt": [ + client = OpenAI(api_key="xx", base_url=f"{flash_llama_completion.base_url}/v1") + stream = client.completions.create( + model="tgi", + prompt=[ "What is Deep Learning?", "Is water wet?", "What is the capital of France?", "def mai", ], - "max_tokens": 10, - "seed": 0, - "temperature": 0.0, - "stream": True, - } + max_tokens=10, + seed=0, + temperature=0.0, + stream=True, + ) - url = f"{flash_llama_completion.base_url}/v1/completions" - - chunks = [] strings = [""] * 4 - async with ClientSession(headers=flash_llama_completion.headers) as session: - async with session.post(url, json=request) as response: - # iterate over the stream - async for chunk in response.content.iter_any(): - # remove "data:" - chunk = chunk.decode().split("\n\n") - # remove "data:" if present - chunk = [c.replace("data:", "") for c in chunk] - # remove empty strings - chunk = [c for c in chunk if c] - # remove completion marking chunk - chunk = [c for c in chunk if c != " [DONE]"] - # parse json - chunk = [json.loads(c) for c in chunk] + chunks = [] + for chunk in stream: + chunks.append(chunk) + index = chunk.choices[0].index + assert 0 <= index <= 4 + strings[index] += chunk.choices[0].text - for c in chunk: - chunks.append(Completion(**c)) - assert "choices" in c - index = c["choices"][0]["index"] - assert 0 <= index <= 4 - strings[index] += c["choices"][0]["text"] - - assert response.status == 200 assert list(strings) == [ " A Beginner’s Guide\nDeep learning is a subset", " This is a question that has puzzled many people for", @@ -206,3 +184,92 @@ async def test_flash_llama_completion_many_prompts_stream( 'usculas_minusculas(s):\n """\n', ] assert chunks == response_snapshot + + +@pytest.mark.release +async def test_chat_openai_usage(flash_llama_completion, response_snapshot): + client = OpenAI(api_key="xx", base_url=f"{flash_llama_completion.base_url}/v1") + + stream = client.chat.completions.create( + model="tgi", + messages=[{"role": "user", "content": "Say 'OK!'"}], + stream=True, + max_tokens=10, + seed=42, + stream_options={"include_usage": True}, + ) + + chunks = [] + for chunk in stream: + chunks.append(chunk) + for chunk in chunks[:-1]: + assert chunk.usage is None + for chunk in chunks[-1:]: + assert chunk.usage is not None + + assert chunks == response_snapshot + + +@pytest.mark.release +async def test_chat_openai_nousage(flash_llama_completion, response_snapshot): + client = OpenAI(api_key="xx", 
base_url=f"{flash_llama_completion.base_url}/v1") + + stream = client.chat.completions.create( + model="tgi", + messages=[{"role": "user", "content": "Say 'OK!'"}], + stream=True, + max_tokens=10, + seed=42, + stream_options={"include_usage": False}, + ) + + chunks = [] + for chunk in stream: + assert chunk.usage is None + chunks.append(chunk) + + assert chunks == response_snapshot + + +@pytest.mark.release +async def test_chat_hfhub_usage(flash_llama_completion, response_snapshot): + client = InferenceClient(base_url=f"{flash_llama_completion.base_url}/v1") + stream = client.chat_completion( + model="tgi", + messages=[{"role": "user", "content": "Say 'OK!'"}], + stream=True, + max_tokens=10, + seed=42, + stream_options={"include_usage": True}, + ) + + chunks = [] + for chunk in stream: + chunks.append(chunk) + + for chunk in chunks[:-1]: + assert chunk.usage is None + for chunk in chunks[-1:]: + assert chunk.usage is not None + + assert chunks == response_snapshot + + +@pytest.mark.release +async def test_chat_hfhub_nousage(flash_llama_completion, response_snapshot): + client = InferenceClient(base_url=f"{flash_llama_completion.base_url}/v1") + stream = client.chat_completion( + model="tgi", + messages=[{"role": "user", "content": "Say 'OK!'"}], + stream=True, + max_tokens=10, + seed=42, + stream_options={"include_usage": False}, + ) + + chunks = [] + for chunk in stream: + assert chunk.usage is None + chunks.append(chunk) + + assert chunks == response_snapshot diff --git a/router/src/lib.rs b/router/src/lib.rs index a7923c4c1..08c31b641 100644 --- a/router/src/lib.rs +++ b/router/src/lib.rs @@ -764,7 +764,6 @@ impl ChatCompletionChunk { created: u64, logprobs: Option, finish_reason: Option, - usage: Option, ) -> Self { let delta = match (delta, tool_calls) { (Some(delta), _) => ChatCompletionDelta::Chat(TextMessage { @@ -801,7 +800,7 @@ impl ChatCompletionChunk { logprobs, finish_reason, }], - usage, + usage: None, } } } diff --git a/router/src/server.rs b/router/src/server.rs index d201f51a4..df9e16ff3 100644 --- a/router/src/server.rs +++ b/router/src/server.rs @@ -1124,7 +1124,6 @@ enum StreamState { fn create_event_from_stream_token( stream_token: &StreamResponse, logprobs: bool, - stream_options: Option, inner_using_tools: bool, system_fingerprint: String, model_id: String, @@ -1151,30 +1150,10 @@ fn create_event_from_stream_token( (content, None) }; - - let (usage, finish_reason) = match &stream_token.details { - Some(details) => { - let usage = if stream_options - .as_ref() - .map(|s| s.include_usage) - .unwrap_or(false) - { - let completion_tokens = details.generated_tokens; - let prompt_tokens = details.input_length; - let total_tokens = prompt_tokens + completion_tokens; - Some(Usage { - completion_tokens, - prompt_tokens, - total_tokens, - }) - } else { - None - }; - (usage, Some(details.finish_reason.format(true))) - } - None => (None, None), - }; - + let finish_reason = stream_token + .details + .as_ref() + .map(|details| details.finish_reason.format(true)); let chat_complete = CompletionType::ChatCompletionChunk(ChatCompletionChunk::new( model_id.clone(), system_fingerprint.clone(), @@ -1183,7 +1162,6 @@ fn create_event_from_stream_token( current_time, logprobs, finish_reason, - usage, )); event.json_data(chat_complete).unwrap_or_else(|e| { @@ -1287,6 +1265,17 @@ pub(crate) async fn chat_completions( match result{ Ok(stream_token) => { let token_text = &stream_token.token.text.clone(); + let usage = stream_token.details.as_ref().map(|details| { + let completion_tokens = 
details.generated_tokens; + let prompt_tokens = details.input_length; + let total_tokens = prompt_tokens + completion_tokens; + + Usage { + completion_tokens, + prompt_tokens, + total_tokens, + } + }); match state { StreamState::Buffering => { json_buffer.push_str(&token_text.replace(" ", "")); @@ -1307,7 +1296,6 @@ pub(crate) async fn chat_completions( let event = create_event_from_stream_token( stream_token, logprobs, - stream_options.clone(), response_as_tool, system_fingerprint.clone(), model_id.clone(), @@ -1347,7 +1335,6 @@ pub(crate) async fn chat_completions( current_time, None, None, - None, )); yield Ok(event.json_data(chat_complete).unwrap_or_else(|e| { InferError::StreamSerializationError(e.to_string()).into() @@ -1369,7 +1356,6 @@ pub(crate) async fn chat_completions( let event = create_event_from_stream_token( &stream_token, logprobs, - stream_options.clone(), response_as_tool, system_fingerprint.clone(), model_id.clone(), @@ -1378,6 +1364,36 @@ pub(crate) async fn chat_completions( yield Ok::(event); } } + + let should_send_usage = usage.is_some() + && stream_options + .as_ref() + .is_some_and(|opts| opts.include_usage); + + if should_send_usage { + let usage_data = usage.unwrap(); + let current_time = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap_or_else(|_| std::time::Duration::from_secs(0)) + .as_secs(); + + let chat_complete = CompletionType::ChatCompletionChunk(ChatCompletionChunk { + id: String::new(), + created: current_time, + model: model_id.clone(), + system_fingerprint: system_fingerprint.clone(), + choices: vec![], + usage: Some(Usage { + prompt_tokens: usage_data.prompt_tokens, + completion_tokens: usage_data.completion_tokens, + total_tokens: usage_data.total_tokens, + }), + }); + + yield Ok(Event::default() + .json_data(chat_complete) + .unwrap_or_else(|e| InferError::StreamSerializationError(e.to_string()).into())); + } } Err(err) => yield Ok(err.into_openai_event()) } From 094975c3a8186758927086130cbc1fabae79bc7e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Tue, 11 Mar 2025 09:19:01 +0100 Subject: [PATCH 137/183] Update the llamacpp backend (#3022) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Build faster Signed-off-by: Adrien Gallouët * Make --model-gguf optional Signed-off-by: Adrien Gallouët * Bump llama.cpp Signed-off-by: Adrien Gallouët * Enable mmap, offload_kqv & flash_attention by default Signed-off-by: Adrien Gallouët * Update doc Signed-off-by: Adrien Gallouët * Better error message Signed-off-by: Adrien Gallouët * Update doc Signed-off-by: Adrien Gallouët * Update installed packages Signed-off-by: Adrien Gallouët * Save gguf in models/MODEL_ID/model.gguf Signed-off-by: Adrien Gallouët * Fix build with Mach-O Signed-off-by: Adrien Gallouët * Quantize without llama-quantize Signed-off-by: Adrien Gallouët * Bump llama.cpp and switch to ggml-org Signed-off-by: Adrien Gallouët * Remove make-gguf.sh Signed-off-by: Adrien Gallouët * Update Cargo.lock Signed-off-by: Adrien Gallouët * Support HF_HUB_USER_AGENT_ORIGIN Signed-off-by: Adrien Gallouët * Bump llama.cpp Signed-off-by: Adrien Gallouët * Add --build-arg llamacpp_native & llamacpp_cpu_arm_arch Signed-off-by: Adrien Gallouët --------- Signed-off-by: Adrien Gallouët --- Cargo.lock | 1 + Dockerfile_llamacpp | 32 ++++++--- backends/llamacpp/Cargo.toml | 3 +- backends/llamacpp/build.rs | 5 +- backends/llamacpp/requirements.txt | 1 + backends/llamacpp/src/backend.rs | 9 +-- 
backends/llamacpp/src/llamacpp.rs | 5 ++ backends/llamacpp/src/main.rs | 105 +++++++++++++++++++++++------ backends/llamacpp/src/quantize.rs | 35 ++++++++++ docs/source/backends/llamacpp.md | 70 ++++++++++++------- 10 files changed, 202 insertions(+), 64 deletions(-) create mode 100644 backends/llamacpp/src/llamacpp.rs create mode 100644 backends/llamacpp/src/quantize.rs diff --git a/Cargo.lock b/Cargo.lock index 0c28b2859..ca921c8ef 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4754,6 +4754,7 @@ dependencies = [ "async-trait", "bindgen 0.71.1", "clap 4.5.30", + "hf-hub 0.3.2", "num_cpus", "pkg-config", "text-generation-router", diff --git a/Dockerfile_llamacpp b/Dockerfile_llamacpp index 1f55cf829..4fc61347f 100644 --- a/Dockerfile_llamacpp +++ b/Dockerfile_llamacpp @@ -1,13 +1,15 @@ FROM nvidia/cuda:12.8.0-cudnn-devel-ubuntu24.04 AS deps -ARG llamacpp_version=b4651 +ARG llamacpp_version=b4827 ARG llamacpp_cuda=OFF +ARG llamacpp_native=ON +ARG llamacpp_cpu_arm_arch=native ARG cuda_arch=75-real;80-real;86-real;89-real;90-real WORKDIR /opt/src ENV DEBIAN_FRONTEND=noninteractive -RUN apt update && apt install -y \ +RUN apt update && apt upgrade -y && apt install -y \ clang \ cmake \ curl \ @@ -17,9 +19,10 @@ RUN apt update && apt install -y \ pkg-config \ tar -ADD https://github.com/ggerganov/llama.cpp/archive/refs/tags/${llamacpp_version}.tar.gz /opt/src/ -RUN tar -xzf ${llamacpp_version}.tar.gz \ - && cd llama.cpp-${llamacpp_version} \ +ADD https://github.com/ggml-org/llama.cpp/archive/refs/tags/${llamacpp_version}.tar.gz /opt/src/ +RUN mkdir -p llama.cpp \ + && tar -xzf ${llamacpp_version}.tar.gz -C llama.cpp --strip-components=1 \ + && cd llama.cpp \ && cmake -B build \ -DCMAKE_INSTALL_PREFIX=/usr \ -DCMAKE_INSTALL_LIBDIR=/usr/lib \ @@ -27,6 +30,8 @@ RUN tar -xzf ${llamacpp_version}.tar.gz \ -DCMAKE_CXX_COMPILER=clang++ \ -DCMAKE_CUDA_ARCHITECTURES=${cuda_arch} \ -DGGML_CUDA=${llamacpp_cuda} \ + -DGGML_NATIVE=${llamacpp_native} \ + -DGGML_CPU_ARM_ARCH=${llamacpp_cpu_arm_arch} \ -DLLAMA_BUILD_COMMON=OFF \ -DLLAMA_BUILD_TESTS=OFF \ -DLLAMA_BUILD_EXAMPLES=OFF \ @@ -48,16 +53,18 @@ FROM deps AS builder COPY --from=planner /app/recipe.json recipe.json RUN cargo chef cook \ --recipe-path recipe.json \ - --profile release-opt \ + --profile release \ --package text-generation-router-llamacpp COPY . . 
RUN cargo build \ - --profile release-opt \ + --profile release \ --package text-generation-router-llamacpp --frozen FROM nvidia/cuda:12.8.0-cudnn-runtime-ubuntu24.04 +WORKDIR /app -RUN apt update && apt install -y \ +ENV DEBIAN_FRONTEND=noninteractive +RUN apt update && apt upgrade -y && apt install -y \ python3-venv \ python3-pip @@ -65,11 +72,16 @@ RUN python3 -m venv /venv ENV PATH="/venv/bin:$PATH" COPY backends/llamacpp/requirements.txt requirements.txt -RUN pip3 install --no-cache-dir -r requirements.txt +COPY --from=builder /opt/src/llama.cpp/gguf-py gguf-py +COPY --from=builder /opt/src/llama.cpp/convert_hf_to_gguf.py /bin/ + +RUN pip3 install --no-cache-dir \ + -r requirements.txt \ + -e gguf-py COPY --from=builder /usr/lib/libllama.so /usr/lib/ COPY --from=builder /usr/lib/libggml*.so /usr/lib/ -COPY --from=builder /app/target/release-opt/text-generation-router-llamacpp /usr/bin/ +COPY --from=builder /app/target/release/text-generation-router-llamacpp /usr/bin/ ENV HF_HUB_ENABLE_HF_TRANSFER=1 diff --git a/backends/llamacpp/Cargo.toml b/backends/llamacpp/Cargo.toml index 18c2ed0a8..685a313f1 100644 --- a/backends/llamacpp/Cargo.toml +++ b/backends/llamacpp/Cargo.toml @@ -12,10 +12,11 @@ pkg-config = "0.3.31" [dependencies] async-trait = "0.1.85" clap = "4.5.27" +hf-hub.workspace = true num_cpus = "1.16.0" text-generation-router = { path = "../../router" } thiserror = "2.0.11" tokenizers.workspace = true -tokio = "1.43.0" +tokio = { version = "1.43.0", features = ["process"] } tokio-stream = "0.1.17" tracing = "0.1.41" diff --git a/backends/llamacpp/build.rs b/backends/llamacpp/build.rs index 499583cd4..8f00f3b5b 100644 --- a/backends/llamacpp/build.rs +++ b/backends/llamacpp/build.rs @@ -25,8 +25,9 @@ fn main() { for path in &llama.link_paths { println!("cargo:rustc-link-arg=-Wl,-rpath,{}", path.display()); } - println!("cargo:rustc-link-arg=-Wl,--disable-new-dtags"); - + if cfg!(target_os = "linux") { + println!("cargo:rustc-link-arg=-Wl,--disable-new-dtags"); + } let bindings = bindgen::Builder::default() .clang_args( llama diff --git a/backends/llamacpp/requirements.txt b/backends/llamacpp/requirements.txt index d19c9e5bd..293cd2055 100644 --- a/backends/llamacpp/requirements.txt +++ b/backends/llamacpp/requirements.txt @@ -1,3 +1,4 @@ transformers==4.49 huggingface-hub==0.28.1 hf-transfer==0.1.9 +torch==2.6.0 diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index 1566e1bf9..3405cfadd 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -1,10 +1,5 @@ -mod llamacpp { - #![allow(non_upper_case_globals)] - #![allow(non_camel_case_types)] - #![allow(non_snake_case)] - #![allow(dead_code)] - include!(concat!(env!("OUT_DIR"), "/llamacpp.rs")); -} +use crate::llamacpp; + use async_trait::async_trait; use std::ffi::CString; use std::mem::replace; diff --git a/backends/llamacpp/src/llamacpp.rs b/backends/llamacpp/src/llamacpp.rs new file mode 100644 index 000000000..fb206df27 --- /dev/null +++ b/backends/llamacpp/src/llamacpp.rs @@ -0,0 +1,5 @@ +#![allow(non_upper_case_globals)] +#![allow(non_camel_case_types)] +#![allow(non_snake_case)] +#![allow(dead_code)] +include!(concat!(env!("OUT_DIR"), "/llamacpp.rs")); diff --git a/backends/llamacpp/src/main.rs b/backends/llamacpp/src/main.rs index 5a07acdcd..b99e9591e 100644 --- a/backends/llamacpp/src/main.rs +++ b/backends/llamacpp/src/main.rs @@ -1,13 +1,21 @@ mod backend; +mod llamacpp; +mod quantize; + +use quantize::QuantizeType; use backend::{ BackendError, 
LlamacppBackend, LlamacppConfig, LlamacppGGMLType, LlamacppNuma, LlamacppSplitMode, }; use clap::Parser; +use hf_hub::api::tokio::ApiBuilder; +use hf_hub::{Repo, RepoType}; +use std::path::Path; use text_generation_router::{logging, server, usage_stats}; use thiserror::Error; -use tokenizers::{FromPretrainedParameters, Tokenizer}; +use tokenizers::Tokenizer; +use tokio::process::Command; use tokio::sync::oneshot::error::RecvError; use tracing::{error, warn}; @@ -25,7 +33,7 @@ struct Args { /// Path to the GGUF model file for inference. #[clap(long, env)] - model_gguf: String, // TODO Option() with hf->gguf & quantize + model_gguf: Option, /// Number of threads to use for generation. #[clap(long, env)] @@ -53,7 +61,7 @@ struct Args { /// Use memory mapping for the model. #[clap(long, env)] - use_mmap: bool, + disable_mmap: bool, /// Use memory locking to prevent swapping. #[clap(long, env)] @@ -61,11 +69,11 @@ struct Args { /// Enable offloading of KQV operations to the GPU. #[clap(long, env)] - offload_kqv: bool, + disable_offload_kqv: bool, /// Enable flash attention for faster inference. (EXPERIMENTAL) #[clap(long, env)] - flash_attention: bool, + disable_flash_attention: bool, /// Data type used for K cache. #[clap(default_value = "f16", value_enum, long, env)] @@ -194,35 +202,80 @@ async fn main() -> Result<(), RouterError> { )); } - // TODO: check if we use the same cache of Server - // check if llamacpp is faster - let tokenizer = { - let token = std::env::var("HF_TOKEN") - .or_else(|_| std::env::var("HUGGING_FACE_HUB_TOKEN")) - .ok(); - let params = FromPretrainedParameters { - revision: args.revision.clone(), - token, - ..Default::default() - }; - Tokenizer::from_pretrained(args.model_id.clone(), Some(params))? + let api_builder = || { + let mut builder = ApiBuilder::new().with_progress(true); + + if let Ok(cache_dir) = std::env::var("HUGGINGFACE_HUB_CACHE") { + builder = builder.with_cache_dir(cache_dir.into()); + } + if let Ok(token) = std::env::var("HF_TOKEN") { + builder = builder.with_token(token.into()); + } + if let Ok(origin) = std::env::var("HF_HUB_USER_AGENT_ORIGIN") { + builder = builder.with_user_agent("origin", origin.as_str()); + } + builder + }; + let api_repo = api_builder().build()?.repo(Repo::with_revision( + args.model_id.clone(), + RepoType::Model, + args.revision.clone(), + )); + + let tokenizer_path = api_repo.get("tokenizer.json").await?; + let tokenizer = Tokenizer::from_file(&tokenizer_path)?; + + let model_gguf = if let Some(model_gguf) = args.model_gguf { + model_gguf + } else { + let model_gguf = format!("models/{}/model.gguf", args.model_id); + let model_gguf_path = Path::new(&model_gguf); + + if !model_gguf_path.exists() { + let tmp_gguf = "models/tmp.gguf"; + + if let Some(parent) = Path::new(model_gguf_path).parent() { + std::fs::create_dir_all(parent)?; + } + let cache_path = tokenizer_path.parent().unwrap(); + + for sibling in api_repo.info().await?.siblings { + let _ = api_repo.get(&sibling.rfilename).await?; + } + let status = Command::new("convert_hf_to_gguf.py") + .arg("--outfile") + .arg(tmp_gguf) + .arg(cache_path) + .spawn()? 
+ .wait() + .await?; + + if !status.success() { + let exit_code = status.code().unwrap_or(-1); + error!("Failed to generate GGUF, exit code: {}", exit_code); + return Err(RouterError::CommandError(exit_code)); + } + quantize::model(tmp_gguf, &model_gguf, QuantizeType::MostlyQ4_0, n_threads) + .map_err(RouterError::QuantizeError)?; + } + model_gguf }; let (backend, ok, shutdown) = LlamacppBackend::new( LlamacppConfig { - model_gguf: args.model_gguf, + model_gguf, n_threads, n_threads_batch, n_gpu_layers: args.n_gpu_layers, split_mode: args.split_mode, defrag_threshold: args.defrag_threshold, numa: args.numa, - use_mmap: args.use_mmap, + use_mmap: !args.disable_mmap, use_mlock: args.use_mlock, - flash_attention: args.flash_attention, + flash_attention: !args.disable_flash_attention, type_k: args.type_k, type_v: args.type_v, - offload_kqv: args.offload_kqv, + offload_kqv: !args.disable_offload_kqv, max_batch_total_tokens, max_physical_batch_total_tokens, max_batch_size, @@ -281,4 +334,14 @@ enum RouterError { WebServer(#[from] server::WebServerError), #[error("Recv error: {0}")] RecvError(#[from] RecvError), + #[error("Io error: {0}")] + IoError(#[from] std::io::Error), + #[error("Var error: {0}")] + VarError(#[from] std::env::VarError), + #[error("Quantize error: {0}")] + QuantizeError(String), + #[error("Command error: {0}")] + CommandError(i32), + #[error("HF hub error: {0}")] + HubError(#[from] hf_hub::api::tokio::ApiError), } diff --git a/backends/llamacpp/src/quantize.rs b/backends/llamacpp/src/quantize.rs new file mode 100644 index 000000000..31307becf --- /dev/null +++ b/backends/llamacpp/src/quantize.rs @@ -0,0 +1,35 @@ +use crate::llamacpp; + +use std::ffi::CString; + +#[repr(u32)] +#[derive(Debug, Clone, Copy)] +pub enum QuantizeType { + MostlyQ4_0 = 2, +} + +pub fn model( + input_path: &str, + output_path: &str, + ftype: QuantizeType, + n_threads: usize, +) -> Result<(), String> { + let c_input_path = + CString::new(input_path).map_err(|e| format!("Failed to convert input path: {}", e))?; + + let c_output_path = + CString::new(output_path).map_err(|e| format!("Failed to convert output path: {}", e))?; + + let result = unsafe { + let mut params = llamacpp::model_quantize_default_params(); + params.nthread = n_threads as _; + params.ftype = ftype as _; + params.quantize_output_tensor = true; + llamacpp::model_quantize(c_input_path.as_ptr(), c_output_path.as_ptr(), ¶ms) + }; + if result == 0 { + Ok(()) + } else { + Err(format!("Quantization failed, error code: {}", result)) + } +} diff --git a/docs/source/backends/llamacpp.md b/docs/source/backends/llamacpp.md index dbd93e866..5cf0edf0c 100644 --- a/docs/source/backends/llamacpp.md +++ b/docs/source/backends/llamacpp.md @@ -25,9 +25,12 @@ You will find the best models on [Hugging Face][GGUF]. ## Build Docker image For optimal performance, the Docker image is compiled with native CPU -instructions, thus it's highly recommended to execute the container on -the host used during the build process. Efforts are ongoing to enhance -portability while maintaining high computational efficiency. +instructions by default. As a result, it is strongly recommended to run +the container on the same host architecture used during the build +process. Efforts are ongoing to improve portability across different +systems while preserving high computational efficiency. 
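If you are unsure whether the machine that will run the container exposes the same CPU features as the build host, a quick sanity check is to compare the instruction-set flags on both machines. This is only a sketch using standard Linux tools; the flag names listed (`avx2`, `avx512f`, `asimd`, `sve`) are examples and depend on your hardware:

```bash
# Print the CPU model and the SIMD extensions it advertises.
# Run this on both the build host and the runtime host and compare:
# any extension used at build time must also be present at runtime.
lscpu | grep -i 'model name' | head -n 1
grep -o -E 'avx2|avx512f|asimd|sve' /proc/cpuinfo | sort -u
```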
+ +To build the Docker image, use the following command: ```bash docker build \ @@ -38,20 +41,24 @@ docker build \ ### Build parameters -| Parameter | Description | -| ------------------------------------ | --------------------------------- | -| `--build-arg llamacpp_version=bXXXX` | Specific version of llama.cpp | -| `--build-arg llamacpp_cuda=ON` | Enables CUDA acceleration | -| `--build-arg cuda_arch=ARCH` | Defines target CUDA architecture | +| Parameter (with --build-arg) | Description | +| ----------------------------------------- | -------------------------------- | +| `llamacpp_version=bXXXX` | Specific version of llama.cpp | +| `llamacpp_cuda=ON` | Enables CUDA acceleration | +| `llamacpp_native=OFF` | Disable automatic CPU detection | +| `llamacpp_cpu_arm_arch=ARCH[+FEATURE]...` | Specific ARM CPU and features | +| `cuda_arch=ARCH` | Defines target CUDA architecture | -## Model preparation - -Retrieve a GGUF model and store it in a specific directory, for example: +For example, to target Graviton4 when building on another ARM +architecture: ```bash -mkdir -p ~/models -cd ~/models -curl -LOJ "https://huggingface.co/Qwen/Qwen2.5-3B-Instruct-GGUF/resolve/main/qwen2.5-3b-instruct-q4_0.gguf?download=true" +docker build \ + -t tgi-llamacpp \ + --build-arg llamacpp_native=OFF \ + --build-arg llamacpp_cpu_arm_arch=armv9-a+i8mm \ + https://github.com/huggingface/text-generation-inference.git \ + -f Dockerfile_llamacpp ``` ## Run Docker image @@ -62,10 +69,9 @@ curl -LOJ "https://huggingface.co/Qwen/Qwen2.5-3B-Instruct-GGUF/resolve/main/qwe docker run \ -p 3000:3000 \ -e "HF_TOKEN=$HF_TOKEN" \ - -v "$HOME/models:/models" \ + -v "$HOME/models:/app/models" \ tgi-llamacpp \ - --model-id "Qwen/Qwen2.5-3B-Instruct" \ - --model-gguf "/models/qwen2.5-3b-instruct-q4_0.gguf" + --model-id "Qwen/Qwen2.5-3B-Instruct" ``` ### GPU-Accelerated inference @@ -75,13 +81,31 @@ docker run \ --gpus all \ -p 3000:3000 \ -e "HF_TOKEN=$HF_TOKEN" \ - -v "$HOME/models:/models" \ + -v "$HOME/models:/app/models" \ tgi-llamacpp \ --n-gpu-layers 99 - --model-id "Qwen/Qwen2.5-3B-Instruct" \ - --model-gguf "/models/qwen2.5-3b-instruct-q4_0.gguf" + --model-id "Qwen/Qwen2.5-3B-Instruct" ``` +## Using a custom GGUF + +GGUF files are optional as they will be automatically generated at +startup if not already present in the `models` directory. However, if +the default GGUF generation is not suitable for your use case, you can +provide your own GGUF file with `--model-gguf`, for example: + +```bash +docker run \ + -p 3000:3000 \ + -e "HF_TOKEN=$HF_TOKEN" \ + -v "$HOME/models:/app/models" \ + tgi-llamacpp \ + --model-id "Qwen/Qwen2.5-3B-Instruct" \ + --model-gguf "models/qwen2.5-3b-instruct-q4_0.gguf" +``` + +Note that `--model-id` is still required. 
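If the automatic conversion does not fit your workflow, you can also produce the GGUF file yourself. The sketch below assumes a local snapshot of the model weights in `~/qwen2.5-3b-instruct/` (a hypothetical path) and uses llama.cpp's `convert_hf_to_gguf.py` script, which this image ships:

```bash
# Convert a local Hugging Face checkpoint to GGUF and place it where
# the container expects it (the host directory mounted at /app/models).
convert_hf_to_gguf.py \
    --outfile "$HOME/models/qwen2.5-3b-instruct.gguf" \
    "$HOME/qwen2.5-3b-instruct/"
```

The resulting file can then be passed to the container with `--model-gguf`, exactly as in the example above. Further quantization of the GGUF is optional; when TGI generates the file itself, it quantizes to Q4_0 automatically.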
+ ## Advanced parameters A full listing of configurable parameters is available in the `--help`: @@ -101,10 +125,10 @@ The table below summarizes key options: | `--split-mode` | Split the model across multiple GPUs | | `--defrag-threshold` | Defragment the KV cache if holes/size > threshold | | `--numa` | Enable NUMA optimizations | -| `--use-mmap` | Use memory mapping for the model | +| `--disable-mmap` | Disable memory mapping for the model | | `--use-mlock` | Use memory locking to prevent swapping | -| `--offload-kqv` | Enable offloading of KQV operations to the GPU | -| `--flash-attention` | Enable flash attention for faster inference | +| `--disable-offload-kqv` | Disable offloading of KQV operations to the GPU | +| `--disable-flash-attention` | Disable flash attention | | `--type-k` | Data type used for K cache | | `--type-v` | Data type used for V cache | | `--validation-workers` | Number of tokenizer workers used for payload validation and truncation | From b447f7e821ad246fe19a3116d675510e7debcd7d Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Tue, 11 Mar 2025 11:00:41 +0100 Subject: [PATCH 138/183] Fix qwen vl (#3096) * Fixing qwen2.5 VL. * Fixing the CI. --- Cargo.lock | 2 +- .../models/custom_modeling/qwen2_5_vl.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ca921c8ef..9b036e696 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4754,7 +4754,7 @@ dependencies = [ "async-trait", "bindgen 0.71.1", "clap 4.5.30", - "hf-hub 0.3.2", + "hf-hub 0.4.2", "num_cpus", "pkg-config", "text-generation-router", diff --git a/server/text_generation_server/models/custom_modeling/qwen2_5_vl.py b/server/text_generation_server/models/custom_modeling/qwen2_5_vl.py index 0ee3a8d97..e317c5b56 100644 --- a/server/text_generation_server/models/custom_modeling/qwen2_5_vl.py +++ b/server/text_generation_server/models/custom_modeling/qwen2_5_vl.py @@ -313,7 +313,6 @@ class Qwen2_5_VLVisionConfig(PretrainedConfig): class Qwen2_5_VLConfig(PretrainedConfig): - def __init__( self, vocab_size=152064, @@ -702,7 +701,6 @@ class Qwen2_5VisionModel(nn.Module): pixel_values: torch.Tensor, grid_thw: Optional[torch.LongTensor] = None, ) -> torch.Tensor: - # reshape the input tensor for processing shape = ( -1, @@ -967,6 +965,7 @@ class Qwen2_5VLForConditionalGeneration(nn.Module): max_s=max_s, true_max_s=max_s, prefill_cache_indices=prefill_cache_indices, + adapter_data=adapter_data, ) if lm_head_indices is not None: hidden_states = hidden_states[lm_head_indices] From ae4451c3da87c447c77b50ff026a814179d633de Mon Sep 17 00:00:00 2001 From: celsowm Date: Tue, 11 Mar 2025 07:05:21 -0300 Subject: [PATCH 139/183] Update README.md (#3095) space between param and value --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a24ed5f14..e06339807 100644 --- a/README.md +++ b/README.md @@ -264,7 +264,7 @@ After that you can run TGI with `nix run`: ```shell cd text-generation-inference -nix run --extra-experimental-features nix-command --extra-experimental-features flakes . -- --model-idmeta-llama/Llama-3.1-8B-Instruct +nix run --extra-experimental-features nix-command --extra-experimental-features flakes . 
-- --model-id meta-llama/Llama-3.1-8B-Instruct ``` **Note:** when you are using Nix on a non-NixOS system, you have to [make some symlinks](https://danieldk.eu/Nix-CUDA-on-non-NixOS-systems#make-runopengl-driverlib-and-symlink-the-driver-library) From f74c36fe0dbd3260ec28a3a0b208a21b8618ccb1 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Wed, 12 Mar 2025 09:22:53 +0100 Subject: [PATCH 140/183] Fix tool call3 (#3086) * Fixing the tool calling convention. * Update tehe doc. * Fixing some corner cases. * Fixing the tool call id. * Fmt. * Snapshot update with the new updated tool_call_id. * More qwen2. --- Cargo.toml | 2 +- docs/openapi.json | 3 - ...ash_llama_grammar_tools_auto_nostream.json | 4 +- ...h_llama_grammar_tools_choice_nostream.json | 4 +- ...ash_llama_grammar_tools_choice_stream.json | 468 +------ ...tools_insufficient_information_stream.json | 50 +- ...st_flash_llama_grammar_tools_nostream.json | 4 +- ...test_flash_llama_grammar_tools_openai.json | 470 +------ ...ammar_tools_sea_creatures_stream_auto.json | 194 ++- ..._sea_creatures_stream_function_object.json | 1233 +---------------- integration-tests/models/test_tools_llama.py | 52 +- router/src/chat.rs | 978 +++++++++++++ router/src/infer/chat_template.rs | 2 +- router/src/infer/mod.rs | 2 +- router/src/lib.rs | 103 +- router/src/server.rs | 275 +--- .../models/custom_modeling/qwen2_vl.py | 1 + 17 files changed, 1364 insertions(+), 2481 deletions(-) create mode 100644 router/src/chat.rs diff --git a/Cargo.toml b/Cargo.toml index 4e3ad0104..09a3a4b24 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -29,7 +29,7 @@ homepage = "https://github.com/huggingface/text-generation-inference" [workspace.dependencies] base64 = "0.22.0" tokenizers = { version = "0.20.0", features = ["http"] } -hf-hub = { version = "0.4.1", features = ["tokio"] } +hf-hub = { version = "0.4.2", features = ["tokio"] } metrics = { version = "0.23.0" } metrics-exporter-prometheus = { version = "0.15.1", features = [] } minijinja = { version = "2.2.0", features = ["json"] } diff --git a/docs/openapi.json b/docs/openapi.json index e1ce234ed..5888e9ff9 100644 --- a/docs/openapi.json +++ b/docs/openapi.json @@ -2148,9 +2148,6 @@ }, "StreamOptions": { "type": "object", - "required": [ - "include_usage" - ], "properties": { "include_usage": { "type": "boolean", diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_auto_nostream.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_auto_nostream.json index 06cf038a8..d97424970 100644 --- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_auto_nostream.json +++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_auto_nostream.json @@ -10,7 +10,7 @@ "tool_calls": [ { "function": { - "arguments": "{\"format\":\"fahrenheit\",\"location\":\"Brooklyn, NY\"}", + "arguments": "{\"location\":\"Brooklyn, NY\",\"format\":\"fahrenheit\"}", "description": null, "name": "get_current_weather" }, @@ -21,7 +21,7 @@ } } ], - "created": 1741263682, + "created": 1741372434, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion", diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_choice_nostream.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_choice_nostream.json index 0152ea70d..1c3a5db36 100644 --- 
a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_choice_nostream.json +++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_choice_nostream.json @@ -10,7 +10,7 @@ "tool_calls": [ { "function": { - "arguments": "{\"format\":\"fahrenheit\",\"location\":\"Brooklyn, NY\"}", + "arguments": "{\"location\":\"Brooklyn, NY\",\"format\":\"fahrenheit\"}", "description": null, "name": "get_current_weather" }, @@ -21,7 +21,7 @@ } } ], - "created": 1741263684, + "created": 1741372657, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion", diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_choice_stream.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_choice_stream.json index 8dab9a5be..f5a2e9556 100644 --- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_choice_stream.json +++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_choice_stream.json @@ -8,10 +8,10 @@ "tool_calls": [ { "function": { - "arguments": "{\"", - "name": null + "arguments": "{", + "name": "get_current_weather" }, - "id": "", + "id": "0", "index": 0, "type": "function" } @@ -22,187 +22,7 @@ "logprobs": null } ], - "created": 1741263685, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "function", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263685, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "\":", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263685, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": " {\"", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263685, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "_", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263685, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - 
"function": { - "arguments": "name", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263685, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "\":", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263685, + "created": 1741688515, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -221,7 +41,7 @@ "arguments": " \"", "name": null }, - "id": "", + "id": "0", "index": 0, "type": "function" } @@ -232,157 +52,7 @@ "logprobs": null } ], - "created": 1741263685, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "get", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263685, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "_current", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263685, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "_weather", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263685, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "\",", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263685, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": " \"", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263685, + "created": 1741688515, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -401,7 +71,7 @@ "arguments": "location", "name": null }, - "id": "", + "id": "0", 
"index": 0, "type": "function" } @@ -412,7 +82,7 @@ "logprobs": null } ], - "created": 1741263685, + "created": 1741688515, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -431,7 +101,7 @@ "arguments": "\":", "name": null }, - "id": "", + "id": "0", "index": 0, "type": "function" } @@ -442,7 +112,7 @@ "logprobs": null } ], - "created": 1741263685, + "created": 1741688515, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -461,7 +131,7 @@ "arguments": " \"", "name": null }, - "id": "", + "id": "0", "index": 0, "type": "function" } @@ -472,7 +142,7 @@ "logprobs": null } ], - "created": 1741263685, + "created": 1741688515, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -488,10 +158,10 @@ "tool_calls": [ { "function": { - "arguments": "Paris", + "arguments": "Bro", "name": null }, - "id": "", + "id": "0", "index": 0, "type": "function" } @@ -502,7 +172,37 @@ "logprobs": null } ], - "created": 1741263685, + "created": 1741688515, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": null, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": "oklyn", + "name": null + }, + "id": "0", + "index": 0, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741688515, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -521,7 +221,7 @@ "arguments": ",", "name": null }, - "id": "", + "id": "0", "index": 0, "type": "function" } @@ -532,7 +232,7 @@ "logprobs": null } ], - "created": 1741263685, + "created": 1741688515, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -548,10 +248,10 @@ "tool_calls": [ { "function": { - "arguments": " France", + "arguments": " NY", "name": null }, - "id": "", + "id": "0", "index": 0, "type": "function" } @@ -562,7 +262,7 @@ "logprobs": null } ], - "created": 1741263685, + "created": 1741688515, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -581,7 +281,7 @@ "arguments": "\",", "name": null }, - "id": "", + "id": "0", "index": 0, "type": "function" } @@ -592,7 +292,7 @@ "logprobs": null } ], - "created": 1741263685, + "created": 1741688515, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -611,7 +311,7 @@ "arguments": " \"", "name": null }, - "id": "", + "id": "0", "index": 0, "type": "function" } @@ -622,7 +322,7 @@ "logprobs": null } ], - "created": 1741263685, + "created": 1741688515, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -641,7 +341,7 @@ "arguments": "format", "name": null }, - "id": "", + "id": "0", "index": 0, "type": "function" } @@ -652,7 +352,7 @@ "logprobs": null } ], - "created": 1741263685, + "created": 1741688515, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -671,7 +371,7 @@ "arguments": "\":", "name": null }, - "id": "", + "id": "0", "index": 0, "type": "function" } @@ -682,7 +382,7 @@ "logprobs": null } ], - "created": 1741263685, + "created": 1741688515, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -701,7 +401,7 @@ "arguments": " \"", "name": null }, - "id": "", + "id": 
"0", "index": 0, "type": "function" } @@ -712,7 +412,7 @@ "logprobs": null } ], - "created": 1741263685, + "created": 1741688515, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -728,10 +428,10 @@ "tool_calls": [ { "function": { - "arguments": "c", + "arguments": "f", "name": null }, - "id": "", + "id": "0", "index": 0, "type": "function" } @@ -742,7 +442,7 @@ "logprobs": null } ], - "created": 1741263685, + "created": 1741688515, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -758,10 +458,10 @@ "tool_calls": [ { "function": { - "arguments": "elsius", + "arguments": "ahrenheit", "name": null }, - "id": "", + "id": "0", "index": 0, "type": "function" } @@ -772,7 +472,7 @@ "logprobs": null } ], - "created": 1741263685, + "created": 1741688515, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -788,10 +488,10 @@ "tool_calls": [ { "function": { - "arguments": "\"}}", + "arguments": "\"}", "name": null }, - "id": "", + "id": "0", "index": 0, "type": "function" } @@ -802,37 +502,7 @@ "logprobs": null } ], - "created": 1741263685, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "<|eot_id|>", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": "stop", - "index": 0, - "logprobs": null - } - ], - "created": 1741263685, + "created": 1741688515, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_insufficient_information_stream.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_insufficient_information_stream.json index b1d4fb87b..dc969ceea 100644 --- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_insufficient_information_stream.json +++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_insufficient_information_stream.json @@ -1,4 +1,24 @@ [ + { + "choices": [ + { + "delta": { + "content": "", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741364571, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, { "choices": [ { @@ -12,7 +32,7 @@ "logprobs": null } ], - "created": 1741263687, + "created": 1741364571, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -32,7 +52,7 @@ "logprobs": null } ], - "created": 1741263687, + "created": 1741364571, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -52,7 +72,7 @@ "logprobs": null } ], - "created": 1741263687, + "created": 1741364571, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -72,7 +92,7 @@ "logprobs": null } ], - "created": 1741263687, + "created": 1741364571, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -92,7 +112,27 @@ "logprobs": null } ], - "created": 1741263687, + "created": 1741364571, + "id": "", + "model": 
"meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": "!", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741364571, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_nostream.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_nostream.json index 3b22d83e8..436c2431d 100644 --- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_nostream.json +++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_nostream.json @@ -10,7 +10,7 @@ "tool_calls": [ { "function": { - "arguments": "{\"format\":\"fahrenheit\",\"location\":\"Brooklyn, NY\"}", + "arguments": "{\"location\":\"Brooklyn, NY\",\"format\":\"fahrenheit\"}", "description": null, "name": "get_current_weather" }, @@ -21,7 +21,7 @@ } } ], - "created": 1741263680, + "created": 1741372335, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion", diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_openai.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_openai.json index c8fc50a27..f21aa4bb8 100644 --- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_openai.json +++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_openai.json @@ -10,10 +10,10 @@ "tool_calls": [ { "function": { - "arguments": "{\"", - "name": null + "arguments": "{", + "name": "get_current_weather" }, - "id": "", + "id": "0", "index": 0, "type": "function" } @@ -24,205 +24,7 @@ "logprobs": null } ], - "created": 1741263681, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "service_tier": null, - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "function_call": null, - "refusal": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "function", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263681, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "service_tier": null, - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "function_call": null, - "refusal": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "\":", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263681, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "service_tier": null, - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "function_call": null, - "refusal": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": " {\"", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } 
- ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263681, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "service_tier": null, - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "function_call": null, - "refusal": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "_", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263681, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "service_tier": null, - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "function_call": null, - "refusal": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "name", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263681, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "service_tier": null, - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "function_call": null, - "refusal": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "\":", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263681, + "created": 1741689423, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -244,7 +46,7 @@ "arguments": " \"", "name": null }, - "id": "", + "id": "0", "index": 0, "type": "function" } @@ -255,172 +57,7 @@ "logprobs": null } ], - "created": 1741263681, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "service_tier": null, - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "function_call": null, - "refusal": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "get", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263681, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "service_tier": null, - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "function_call": null, - "refusal": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "_current", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263681, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "service_tier": null, - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "function_call": null, - "refusal": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "_weather", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, 
- "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263681, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "service_tier": null, - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "function_call": null, - "refusal": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "\",", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263681, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "service_tier": null, - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "function_call": null, - "refusal": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": " \"", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263681, + "created": 1741689423, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -442,7 +79,7 @@ "arguments": "location", "name": null }, - "id": "", + "id": "0", "index": 0, "type": "function" } @@ -453,7 +90,7 @@ "logprobs": null } ], - "created": 1741263681, + "created": 1741689423, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -475,7 +112,7 @@ "arguments": "\":", "name": null }, - "id": "", + "id": "0", "index": 0, "type": "function" } @@ -486,7 +123,7 @@ "logprobs": null } ], - "created": 1741263681, + "created": 1741689423, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -508,7 +145,7 @@ "arguments": " \"", "name": null }, - "id": "", + "id": "0", "index": 0, "type": "function" } @@ -519,7 +156,7 @@ "logprobs": null } ], - "created": 1741263681, + "created": 1741689423, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -541,7 +178,7 @@ "arguments": "Bro", "name": null }, - "id": "", + "id": "0", "index": 0, "type": "function" } @@ -552,7 +189,7 @@ "logprobs": null } ], - "created": 1741263681, + "created": 1741689423, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -574,7 +211,7 @@ "arguments": "oklyn", "name": null }, - "id": "", + "id": "0", "index": 0, "type": "function" } @@ -585,7 +222,7 @@ "logprobs": null } ], - "created": 1741263681, + "created": 1741689423, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -607,7 +244,7 @@ "arguments": ",", "name": null }, - "id": "", + "id": "0", "index": 0, "type": "function" } @@ -618,7 +255,7 @@ "logprobs": null } ], - "created": 1741263681, + "created": 1741689423, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -640,7 +277,7 @@ "arguments": " NY", "name": null }, - "id": "", + "id": "0", "index": 0, "type": "function" } @@ -651,7 +288,7 @@ "logprobs": null } ], - "created": 1741263681, + "created": 1741689423, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -673,7 +310,7 @@ "arguments": "\",", "name": null }, - "id": "", + "id": "0", "index": 0, "type": "function" } @@ -684,7 +321,7 @@ "logprobs": null } ], - "created": 1741263681, + 
"created": 1741689423, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -706,7 +343,7 @@ "arguments": " \"", "name": null }, - "id": "", + "id": "0", "index": 0, "type": "function" } @@ -717,7 +354,7 @@ "logprobs": null } ], - "created": 1741263681, + "created": 1741689423, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -739,7 +376,7 @@ "arguments": "format", "name": null }, - "id": "", + "id": "0", "index": 0, "type": "function" } @@ -750,7 +387,7 @@ "logprobs": null } ], - "created": 1741263681, + "created": 1741689423, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -772,7 +409,7 @@ "arguments": "\":", "name": null }, - "id": "", + "id": "0", "index": 0, "type": "function" } @@ -783,7 +420,7 @@ "logprobs": null } ], - "created": 1741263681, + "created": 1741689423, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -805,7 +442,7 @@ "arguments": " \"", "name": null }, - "id": "", + "id": "0", "index": 0, "type": "function" } @@ -816,7 +453,7 @@ "logprobs": null } ], - "created": 1741263681, + "created": 1741689423, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -838,7 +475,7 @@ "arguments": "f", "name": null }, - "id": "", + "id": "0", "index": 0, "type": "function" } @@ -849,7 +486,7 @@ "logprobs": null } ], - "created": 1741263681, + "created": 1741689423, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -871,7 +508,7 @@ "arguments": "ahrenheit", "name": null }, - "id": "", + "id": "0", "index": 0, "type": "function" } @@ -882,7 +519,7 @@ "logprobs": null } ], - "created": 1741263681, + "created": 1741689423, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -901,10 +538,10 @@ "tool_calls": [ { "function": { - "arguments": "\"}}", + "arguments": "\"}", "name": null }, - "id": "", + "id": "0", "index": 0, "type": "function" } @@ -915,40 +552,7 @@ "logprobs": null } ], - "created": 1741263681, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "service_tier": null, - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "function_call": null, - "refusal": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "<|eot_id|>", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": "stop", - "index": 0, - "logprobs": null - } - ], - "created": 1741263681, + "created": 1741689423, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream_auto.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream_auto.json index 4b0f5a07e..cdac9bc4a 100644 --- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream_auto.json +++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream_auto.json @@ -1,4 +1,24 @@ [ + { + "choices": [ + { + "delta": { + "content": "", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741371722, + "id": 
"", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, { "choices": [ { @@ -12,7 +32,7 @@ "logprobs": null } ], - "created": 1741263688, + "created": 1741371722, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -32,7 +52,7 @@ "logprobs": null } ], - "created": 1741263688, + "created": 1741371722, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -52,7 +72,7 @@ "logprobs": null } ], - "created": 1741263688, + "created": 1741371722, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -72,7 +92,7 @@ "logprobs": null } ], - "created": 1741263688, + "created": 1741371722, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -92,7 +112,7 @@ "logprobs": null } ], - "created": 1741263688, + "created": 1741371722, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -112,7 +132,7 @@ "logprobs": null } ], - "created": 1741263688, + "created": 1741371722, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -132,7 +152,7 @@ "logprobs": null } ], - "created": 1741263688, + "created": 1741371722, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -152,7 +172,7 @@ "logprobs": null } ], - "created": 1741263688, + "created": 1741371722, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -172,7 +192,7 @@ "logprobs": null } ], - "created": 1741263688, + "created": 1741371722, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -192,7 +212,7 @@ "logprobs": null } ], - "created": 1741263688, + "created": 1741371722, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -212,7 +232,7 @@ "logprobs": null } ], - "created": 1741263688, + "created": 1741371722, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -232,7 +252,7 @@ "logprobs": null } ], - "created": 1741263688, + "created": 1741371722, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -252,7 +272,7 @@ "logprobs": null } ], - "created": 1741263688, + "created": 1741371722, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -272,7 +292,7 @@ "logprobs": null } ], - "created": 1741263688, + "created": 1741371722, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -292,7 +312,7 @@ "logprobs": null } ], - "created": 1741263688, + "created": 1741371722, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -312,7 +332,7 @@ "logprobs": null } ], - "created": 1741263688, + "created": 1741371722, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -332,7 +352,7 @@ "logprobs": null } ], - "created": 1741263688, + "created": 1741371722, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -352,7 +372,7 @@ "logprobs": null } ], - "created": 1741263688, + "created": 1741371723, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -372,7 +392,7 @@ "logprobs": null } ], - "created": 1741263688, + "created": 1741371723, "id": "", "model": 
"meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -392,7 +412,7 @@ "logprobs": null } ], - "created": 1741263688, + "created": 1741371723, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -412,7 +432,7 @@ "logprobs": null } ], - "created": 1741263688, + "created": 1741371723, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -432,7 +452,7 @@ "logprobs": null } ], - "created": 1741263688, + "created": 1741371723, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -452,7 +472,7 @@ "logprobs": null } ], - "created": 1741263688, + "created": 1741371723, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -472,7 +492,7 @@ "logprobs": null } ], - "created": 1741263688, + "created": 1741371723, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -492,7 +512,7 @@ "logprobs": null } ], - "created": 1741263688, + "created": 1741371723, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -512,7 +532,7 @@ "logprobs": null } ], - "created": 1741263689, + "created": 1741371723, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -532,7 +552,7 @@ "logprobs": null } ], - "created": 1741263689, + "created": 1741371723, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -552,7 +572,7 @@ "logprobs": null } ], - "created": 1741263689, + "created": 1741371723, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -572,7 +592,7 @@ "logprobs": null } ], - "created": 1741263689, + "created": 1741371723, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -592,7 +612,7 @@ "logprobs": null } ], - "created": 1741263689, + "created": 1741371723, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -612,7 +632,7 @@ "logprobs": null } ], - "created": 1741263689, + "created": 1741371723, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -632,7 +652,7 @@ "logprobs": null } ], - "created": 1741263689, + "created": 1741371723, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -652,7 +672,7 @@ "logprobs": null } ], - "created": 1741263689, + "created": 1741371723, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -672,7 +692,7 @@ "logprobs": null } ], - "created": 1741263689, + "created": 1741371723, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -692,7 +712,7 @@ "logprobs": null } ], - "created": 1741263689, + "created": 1741371723, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -712,7 +732,7 @@ "logprobs": null } ], - "created": 1741263689, + "created": 1741371723, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -732,7 +752,7 @@ "logprobs": null } ], - "created": 1741263689, + "created": 1741371723, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -752,7 +772,7 @@ "logprobs": null } ], - "created": 1741263689, + "created": 1741371723, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -772,7 +792,7 @@ 
"logprobs": null } ], - "created": 1741263689, + "created": 1741371723, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -792,7 +812,7 @@ "logprobs": null } ], - "created": 1741263689, + "created": 1741371723, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -812,7 +832,7 @@ "logprobs": null } ], - "created": 1741263689, + "created": 1741371723, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -832,7 +852,7 @@ "logprobs": null } ], - "created": 1741263689, + "created": 1741371723, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -852,7 +872,7 @@ "logprobs": null } ], - "created": 1741263689, + "created": 1741371723, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -872,7 +892,7 @@ "logprobs": null } ], - "created": 1741263689, + "created": 1741371723, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -892,7 +912,7 @@ "logprobs": null } ], - "created": 1741263689, + "created": 1741371723, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -912,7 +932,7 @@ "logprobs": null } ], - "created": 1741263689, + "created": 1741371723, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -932,7 +952,7 @@ "logprobs": null } ], - "created": 1741263689, + "created": 1741371723, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -952,7 +972,7 @@ "logprobs": null } ], - "created": 1741263689, + "created": 1741371724, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -972,7 +992,7 @@ "logprobs": null } ], - "created": 1741263689, + "created": 1741371724, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -992,7 +1012,7 @@ "logprobs": null } ], - "created": 1741263689, + "created": 1741371724, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -1012,7 +1032,7 @@ "logprobs": null } ], - "created": 1741263689, + "created": 1741371724, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -1032,7 +1052,7 @@ "logprobs": null } ], - "created": 1741263689, + "created": 1741371724, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -1052,7 +1072,7 @@ "logprobs": null } ], - "created": 1741263689, + "created": 1741371724, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -1072,7 +1092,7 @@ "logprobs": null } ], - "created": 1741263689, + "created": 1741371724, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -1092,7 +1112,7 @@ "logprobs": null } ], - "created": 1741263690, + "created": 1741371724, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -1112,7 +1132,7 @@ "logprobs": null } ], - "created": 1741263690, + "created": 1741371724, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -1132,7 +1152,7 @@ "logprobs": null } ], - "created": 1741263690, + "created": 1741371724, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -1152,7 +1172,7 @@ "logprobs": null } ], - "created": 1741263690, + "created": 1741371724, "id": 
"", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -1172,7 +1192,7 @@ "logprobs": null } ], - "created": 1741263690, + "created": 1741371724, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -1192,7 +1212,7 @@ "logprobs": null } ], - "created": 1741263690, + "created": 1741371724, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -1212,7 +1232,7 @@ "logprobs": null } ], - "created": 1741263690, + "created": 1741371724, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -1232,7 +1252,7 @@ "logprobs": null } ], - "created": 1741263690, + "created": 1741371724, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -1252,7 +1272,7 @@ "logprobs": null } ], - "created": 1741263690, + "created": 1741371724, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -1272,7 +1292,7 @@ "logprobs": null } ], - "created": 1741263690, + "created": 1741371724, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -1292,7 +1312,7 @@ "logprobs": null } ], - "created": 1741263690, + "created": 1741371724, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -1312,7 +1332,7 @@ "logprobs": null } ], - "created": 1741263690, + "created": 1741371724, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -1332,7 +1352,7 @@ "logprobs": null } ], - "created": 1741263690, + "created": 1741371724, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -1352,7 +1372,7 @@ "logprobs": null } ], - "created": 1741263690, + "created": 1741371724, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -1372,7 +1392,7 @@ "logprobs": null } ], - "created": 1741263690, + "created": 1741371724, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -1392,7 +1412,7 @@ "logprobs": null } ], - "created": 1741263690, + "created": 1741371724, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -1412,7 +1432,7 @@ "logprobs": null } ], - "created": 1741263690, + "created": 1741371724, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -1432,7 +1452,7 @@ "logprobs": null } ], - "created": 1741263690, + "created": 1741371724, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -1452,7 +1472,7 @@ "logprobs": null } ], - "created": 1741263690, + "created": 1741371724, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -1472,7 +1492,7 @@ "logprobs": null } ], - "created": 1741263690, + "created": 1741371724, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -1492,7 +1512,7 @@ "logprobs": null } ], - "created": 1741263690, + "created": 1741371724, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -1512,7 +1532,7 @@ "logprobs": null } ], - "created": 1741263690, + "created": 1741371724, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -1532,7 +1552,27 @@ "logprobs": null } ], - "created": 1741263690, + "created": 1741371724, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + 
"object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": ".", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741371725, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream_function_object.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream_function_object.json index b253d465b..fe51488c7 100644 --- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream_function_object.json +++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream_function_object.json @@ -1,1232 +1 @@ -[ - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "{\"", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263698, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "function", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263698, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "\":", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263698, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": " {\"", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263698, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "_", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263698, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "n", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } 
- ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263698, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "am", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263698, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "e", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263698, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "\":", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263698, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": " \"", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263698, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "get", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263698, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "_n", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263698, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "_day", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263698, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": 
null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "_weather", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263698, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "_fore", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263698, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "cast", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263698, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "\",", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263698, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": " \"", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263698, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "location", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263698, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "\":", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263698, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": " \"", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - 
"finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263698, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "San", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263699, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": " Francisco", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263699, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": ",", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263699, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": " CA", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263699, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "\",", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263699, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": " \"", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263699, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "format", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263699, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": 
null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "\":", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263699, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": " \"", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263699, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "c", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263699, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "elsius", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263699, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "\",", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263699, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": " \"", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263699, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "num", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263699, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "_days", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": 
null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263699, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "\":", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263699, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": " ", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263699, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "3", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263699, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "}}", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741263699, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": null, - "role": "assistant", - "tool_calls": [ - { - "function": { - "arguments": "<|eot_id|>", - "name": null - }, - "id": "", - "index": 0, - "type": "function" - } - ] - }, - "finish_reason": "stop", - "index": 0, - "logprobs": null - } - ], - "created": 1741263699, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - } -] +[] diff --git a/integration-tests/models/test_tools_llama.py b/integration-tests/models/test_tools_llama.py index ebf69cb7c..bb4b308bd 100644 --- a/integration-tests/models/test_tools_llama.py +++ b/integration-tests/models/test_tools_llama.py @@ -108,7 +108,7 @@ async def test_flash_llama_grammar_tools_nostream( function=ChatCompletionOutputFunctionDefinition( description=None, name="get_current_weather", - arguments='{"format":"fahrenheit","location":"Brooklyn, NY"}', + arguments='{"location":"Brooklyn, NY","format":"fahrenheit"}', ), ) ] @@ -142,14 +142,15 @@ async def test_flash_llama_grammar_tools_openai( chunks = [] tool = "" + name = "" for chunk in stream: + if chunk.choices[0].delta.tool_calls[0].function.name: + name += chunk.choices[0].delta.tool_calls[0].function.name tool += chunk.choices[0].delta.tool_calls[0].function.arguments chunks.append(chunk) - assert ( - tool - == '{"function": {"_name": "get_current_weather", 
"location": "Brooklyn, NY", "format": "fahrenheit"}}<|eot_id|>' - ) + assert name == "get_current_weather" + assert tool == '{ "location": "Brooklyn, NY", "format": "fahrenheit"}' assert chunks == response_snapshot @@ -184,7 +185,7 @@ async def test_flash_llama_grammar_tools_auto_nostream( function=ChatCompletionOutputFunctionDefinition( description=None, name="get_current_weather", - arguments='{"format":"fahrenheit","location":"Brooklyn, NY"}', + arguments='{"location":"Brooklyn, NY","format":"fahrenheit"}', ), ) ] @@ -223,7 +224,7 @@ async def test_flash_llama_grammar_tools_choice_nostream( function=ChatCompletionOutputFunctionDefinition( description=None, name="get_current_weather", - arguments='{"format":"fahrenheit","location":"Brooklyn, NY"}', + arguments='{"location":"Brooklyn, NY","format":"fahrenheit"}', ), ) ] @@ -250,23 +251,24 @@ async def test_flash_llama_grammar_tools_choice_stream( }, { "role": "user", - "content": "What is the weather like in Paris, France?", + "content": "What is the weather like in Brooklyn, New York?", }, ], stream=True, ) - tool_calls_generated = "" + arguments = "" chunks = [] + name = "" for chunk in stream: - tool_calls_generated += chunk.choices[0].delta.tool_calls[0].function.arguments + if chunk.choices[0].delta.tool_calls[0].function.name: + name += chunk.choices[0].delta.tool_calls[0].function.name + arguments += chunk.choices[0].delta.tool_calls[0].function.arguments assert chunk.choices[0].delta.content is None chunks.append(chunk) - assert ( - tool_calls_generated - == '{"function": {"_name": "get_current_weather", "location": "Paris, France", "format": "celsius"}}<|eot_id|>' - ) + assert name == "get_current_weather" + assert arguments == '{ "location": "Brooklyn, NY", "format": "fahrenheit"}' assert chunks == response_snapshot @@ -297,8 +299,6 @@ async def test_flash_llama_grammar_tools_insufficient_information_nostream( content_generated = response.choices[0].message.content assert response.choices[0].message.tool_calls is None - ######## FIXME before MERGE ############################ - # TODO This is different from the streaming case, this is NOT normal. assert content_generated == "I am a helpful assistant!" assert response == response_snapshot @@ -334,7 +334,8 @@ async def test_flash_llama_grammar_tools_insufficient_information_stream( chunks.append(chunk) assert chunk.choices[0].delta.tool_calls is None - assert content_generated == "I am a helpful assistant" + ######## This is exactly the same as the non streaming case + assert content_generated == "I am a helpful assistant!" assert chunks == response_snapshot @@ -371,7 +372,7 @@ async def test_flash_llama_grammar_tools_sea_creatures_stream_auto( assert ( content_generated - == "There was a wise old octopus named Oracle. He lived in a cozy little cave beneath the waves with his best friend, a curious seahorse named Finley. One day, Finley met a playful dolphin named Daisy, and the three became inseparable. They spent their days exploring the ocean, playing hide-and-seek, and learning about the wonders of the sea from Oracle" + == "There was a wise old octopus named Oracle. He lived in a cozy little cave beneath the waves with his best friend, a curious seahorse named Finley. One day, Finley met a playful dolphin named Daisy, and the three became inseparable. They spent their days exploring the ocean, playing hide-and-seek, and learning about the wonders of the sea from Oracle." 
     )
 
     assert chunks == response_snapshot
@@ -401,14 +402,18 @@ async def test_flash_llama_grammar_tools_sea_creatures_stream_required(
     )
 
     tool_calls_generated = ""
+    name = ""
     chunks = []
     for chunk in stream:
         assert chunk.choices[0].delta.content is None
+        if chunk.choices[0].delta.tool_calls[0].function.name:
+            name += chunk.choices[0].delta.tool_calls[0].function.name
         tool_calls_generated += chunk.choices[0].delta.tool_calls[0].function.arguments
 
+    assert name == "get_n_day_weather_forecast"
     assert (
         tool_calls_generated
-        == '{"function": {"_name": "get_n_day_weather_forecast", "location": "San Francisco, CA", "format": "fahrenheit", "num_days":3}}<|eot_id|>'
+        == '{ "location": "San Francisco, CA", "format": "fahrenheit", "num_days":3}'
     )
 
     assert chunks == response_snapshot
@@ -479,12 +484,17 @@ async def test_flash_llama_grammar_tools_sea_creatures_stream_function_object(
     )
     chunks = []
     tool_calls_generated = ""
+    name = ""
     for chunk in stream:
+        assert chunk.choices[0].delta.content is None
+        if chunk.choices[0].delta.tool_calls[0].function.name:
+            name += chunk.choices[0].delta.tool_calls[0].function.name
         tool_calls_generated += chunk.choices[0].delta.tool_calls[0].function.arguments
-        chunks.append(chunk)
+
+    assert name == "get_n_day_weather_forecast"
     assert (
         tool_calls_generated
-        == '{"function": {"_name": "get_n_day_weather_forecast", "location": "San Francisco, CA", "format": "celsius", "num_days": 3}}<|eot_id|>'
+        == '{ "location": "San Francisco, CA", "format": "celsius", "num_days": 3}'
     )
 
     assert chunks == response_snapshot
diff --git a/router/src/chat.rs b/router/src/chat.rs
new file mode 100644
index 000000000..63bd53bf1
--- /dev/null
+++ b/router/src/chat.rs
@@ -0,0 +1,978 @@
+use crate::{
+    infer::InferError, ChatCompletionChoice, ChatCompletionChunk, ChatCompletionDelta,
+    ChatCompletionLogprobs, CompletionType, DeltaToolCall, Function, FunctionDefinition,
+    StreamOptions, StreamResponse, TextMessage, ToolCallDelta, Usage,
+};
+use serde::Deserialize;
+use serde_json::Value;
+
+#[derive(Debug, Deserialize)]
+#[serde(rename_all = "snake_case")]
+enum _NoTool {
+    NoTool,
+}
+
+#[derive(Debug, Deserialize)]
+struct NoToolCall {
+    _name: _NoTool,
+    content: String,
+}
+#[derive(Debug, Deserialize)]
+struct NoTool {
+    function: NoToolCall,
+}
+
+#[derive(Debug, Deserialize)]
+struct ToolCall {
+    _name: String,
+    #[serde(flatten, default)]
+    /// Using Map to preserve order
+    arguments: serde_json::Map<String, Value>,
+}
+#[derive(Debug, Deserialize)]
+struct Call {
+    function: ToolCall,
+}
+
+pub(crate) fn parse_output(
+    generated_text: &str,
+) -> Result<(Option<Vec<crate::ToolCall>>, Option<String>), InferError> {
+    let call: Call = serde_json::from_str(generated_text).map_err(|e| {
+        InferError::ToolError(format!(
+            "Failed to parse generated text: {} {:?}",
+            e, generated_text
+        ))
+    })?;
+    let name = call.function._name;
+
+    match &name[..] {
+        "no_tool" => {
+            // parse the content message
+            let content_message = call
+                .function
+                .arguments
+                .get("content")
+                .and_then(Value::as_str)
+                .ok_or_else(|| {
+                    InferError::ToolError("No `content` found in generated text".to_string())
+                })?
+ .to_string(); + Ok((None, Some(content_message))) + } + name => { + let tool_calls = vec![crate::ToolCall { + id: "0".to_string(), + r#type: "function".to_string(), + function: FunctionDefinition { + description: None, + name: name.to_string(), + arguments: serde_json::to_value(call.function.arguments).map_err(|err| { + InferError::ToolError(format!( + "Could not convert arguments to JSON map {err}" + )) + })?, + }, + }]; + Ok((Some(tool_calls), None)) + } + } +} + +/// Convert a StreamResponse into an Event to be sent over SSE +fn create_event_from_stream_token( + stream_token: &StreamResponse, + logprobs: bool, + inner_using_tools: bool, + system_fingerprint: String, + model_id: String, + function_name: Option, + id: String, +) -> CompletionType { + let current_time = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap_or_else(|_| std::time::Duration::from_secs(0)) + .as_secs(); + + let logprobs = logprobs.then(|| { + ChatCompletionLogprobs::from((stream_token.token.clone(), stream_token.top_tokens.clone())) + }); + + // replace the content with the tool calls if grammar is present + let content = if !stream_token.token.special { + Some(stream_token.token.text.clone()) + } else { + None + }; + let (content, tool_calls) = if inner_using_tools { + // Cast into a vec + (None, content) + } else { + (content, None) + }; + let finish_reason = stream_token + .details + .as_ref() + .map(|details| details.finish_reason.format(true)); + let delta = match (content, tool_calls) { + (Some(delta), _) => ChatCompletionDelta::Chat(TextMessage { + role: "assistant".to_string(), + content: delta, + ..Default::default() + }), + (None, Some(tool_calls)) => ChatCompletionDelta::Tool(ToolCallDelta { + role: "assistant".to_string(), + tool_calls: vec![DeltaToolCall { + index: 0, + id, + r#type: "function".to_string(), + function: Function { + name: function_name, + arguments: tool_calls, + }, + }], + }), + (None, None) => ChatCompletionDelta::Chat(TextMessage { + role: "assistant".to_string(), + content: "".to_string(), + ..Default::default() + }), + }; + let choices = vec![ChatCompletionChoice { + index: 0, + delta, + logprobs, + finish_reason, + }]; + CompletionType::ChatCompletionChunk(ChatCompletionChunk::new( + model_id, + system_fingerprint, + current_time, + choices, + None, + )) +} + +#[derive(Debug)] +enum StreamState { + /// Before the tools was parsed + Buffering, + /// We detected a tool call here + Tool, + /// During the `content` part of the tool call + NoTool, + /// Finishing frames of the ToolCall + NoToolFinish, + /// This is without tool calling + Content, +} + +pub struct ChatState { + state: StreamState, + text: String, + options: StreamOptions, + model_id: String, + fingerprint: String, + logprobs: bool, + id: String, +} + +impl ChatState { + pub fn new( + using_tools: bool, + options: StreamOptions, + fingerprint: String, + model_id: String, + logprobs: bool, + id: String, + ) -> Self { + let state = if using_tools { + StreamState::Buffering + } else { + StreamState::Content + }; + let text = String::new(); + Self { + state, + text, + options, + fingerprint, + model_id, + logprobs, + id, + } + } + + pub fn push(&mut self, mut stream_token: StreamResponse) -> Vec { + let mut events = vec![]; + let token_text = &stream_token.token.text; + match self.state { + StreamState::Buffering => { + self.text.push_str(token_text); + // We have a special match for `no_tool` in order to capture directly the `content` + // key which should be re-emitted as raw text. 
+                if let Ok(value) = serde_json::from_str::<NoTool>(&format!("{}\"}}}}", self.text)) {
+                    self.state = StreamState::NoTool;
+                    // Modify the content of the token to be whatever was captured by the JSON
+                    stream_token.token.text = value.function.content;
+                    let chat_complete = create_event_from_stream_token(
+                        &stream_token,
+                        self.logprobs,
+                        false,
+                        self.fingerprint.clone(),
+                        self.model_id.clone(),
+                        None,
+                        self.id.clone(),
+                    );
+
+                    events.push(chat_complete);
+                }
+                // XXX Caution: here we do not postfix the quote, so that the current output
+                // is necessarily finished with quotes for us to be able to parse.
+                let partial = &self.text;
+                let partial = partial.trim_end_matches(|c: char| c.is_whitespace() || c == ',');
+                if let Ok(call) = serde_json::from_str::<Call>(&format!("{}}}}}", partial)) {
+                    // This can be no_tool before the content has been emitted
+                    if call.function._name != "no_tool" {
+                        stream_token.token.text = "{".to_string();
+                        let chat_complete = create_event_from_stream_token(
+                            &stream_token,
+                            self.logprobs,
+                            true,
+                            self.fingerprint.clone(),
+                            self.model_id.clone(),
+                            Some(call.function._name),
+                            self.id.clone(),
+                        );
+
+                        events.push(chat_complete);
+                        self.state = StreamState::Tool;
+                    }
+                }
+            }
+            StreamState::Tool => {
+                self.text.push_str(token_text);
+                if serde_json::from_str::<Call>(&self.text).is_ok() {
+                    self.state = StreamState::Buffering;
+                    let mut text = stream_token.token.text.trim_end();
+                    // Effectively trimming only the last closing brace
+                    if text.ends_with('}') {
+                        text = &text[..text.len() - 1];
+                    }
+                    stream_token.token.text = text.to_string();
+                    let chat_complete = create_event_from_stream_token(
+                        &stream_token,
+                        self.logprobs,
+                        true,
+                        self.fingerprint.clone(),
+                        self.model_id.clone(),
+                        None,
+                        self.id.clone(),
+                    );
+                    events.push(chat_complete);
+                } else {
+                    let chat_complete = create_event_from_stream_token(
+                        &stream_token,
+                        self.logprobs,
+                        true,
+                        self.fingerprint.clone(),
+                        self.model_id.clone(),
+                        None,
+                        self.id.clone(),
+                    );
+                    events.push(chat_complete);
+                }
+            }
+            // if we skipped sending the buffer we need to avoid sending the following json key and quotes
+            // We have remainder tokens, ignore everything.
+            StreamState::NoToolFinish => {}
+            StreamState::NoTool => {
+                self.text.push_str(token_text);
+                if token_text.contains("\"") {
+                    let mut text = self
+                        .text
+                        .trim_end_matches(|c: char| c.is_whitespace() || c == '}');
+                    // Trim once
+                    if text.ends_with("\"") {
+                        // Verify we have actually trimmed something
+                        // The opposite can happen if the model is outputting inline JSON.
+                        text = &text[..text.len() - 1];
+                        if let Ok(_value) =
+                            serde_json::from_str::<NoTool>(&format!("{}\"}}}}", text))
+                        {
+                            let mut text = token_text
+                                .trim_end_matches(|c: char| c.is_whitespace() || c == '}');
+                            // Effectively trim_end_match('"', 1)
+                            // because we do not want to eventually trim finishing escaped quotes
+                            // {{"\"Something\""}}
+                            if text.ends_with("\"") {
+                                text = &text[..text.len() - 1];
+                            }
+                            stream_token.token.text = text.to_string();
+                            self.state = StreamState::NoToolFinish;
+                        }
+                    }
+                }
+                // This escaping is usually inline json escaping and we can therefore remove it.
+ stream_token.token.text = stream_token.token.text.replace("\\", ""); + let chat_complete = create_event_from_stream_token( + &stream_token, + self.logprobs, + false, + self.fingerprint.clone(), + self.model_id.clone(), + None, + self.id.clone(), + ); + + events.push(chat_complete); + } + StreamState::Content => { + let chat_complete = create_event_from_stream_token( + &stream_token, + self.logprobs, + false, + self.fingerprint.clone(), + self.model_id.clone(), + None, + self.id.clone(), + ); + + events.push(chat_complete); + } + } + + if self.options.include_usage { + if let Some(details) = stream_token.details { + let completion_tokens = details.generated_tokens; + let prompt_tokens = details.input_length; + let total_tokens = prompt_tokens + completion_tokens; + + let usage = Usage { + completion_tokens, + prompt_tokens, + total_tokens, + }; + let current_time = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap_or_else(|_| std::time::Duration::from_secs(0)) + .as_secs(); + + let chat_complete = CompletionType::ChatCompletionChunk(ChatCompletionChunk { + id: String::new(), + created: current_time, + model: self.model_id.clone(), + system_fingerprint: self.fingerprint.clone(), + choices: vec![], + usage: Some(Usage { + prompt_tokens: usage.prompt_tokens, + completion_tokens: usage.completion_tokens, + total_tokens: usage.total_tokens, + }), + }); + + events.push(chat_complete); + } + } + events + } +} + +#[cfg(test)] +mod tests { + use crate::{ + ChatCompletionChoice, ChatCompletionDelta, FinishReason, StreamDetails, TextMessage, Token, + }; + + use super::*; + + fn get_text_content(event: &CompletionType) -> &String { + match event { + CompletionType::ChatCompletionChunk(ChatCompletionChunk { choices, .. }) => { + assert_eq!(choices.len(), 1); + if let ChatCompletionChoice { + delta: ChatCompletionDelta::Chat(TextMessage { content, .. }), + .. + } = &choices[0] + { + content + } else { + panic!("Expected plain message"); + } + } + _ => panic!("Unexpected chunk"), + } + } + + fn get_tool_call_content(event: &CompletionType) -> (Option<&String>, &String) { + match event { + CompletionType::ChatCompletionChunk(ChatCompletionChunk { choices, .. }) => { + assert_eq!(choices.len(), 1); + if let ChatCompletionChoice { + delta: ChatCompletionDelta::Tool(ToolCallDelta { tool_calls, .. }), + .. + } = &choices[0] + { + assert_eq!(tool_calls.len(), 1); + let DeltaToolCall { + index, + id, + r#type, + function, + } = &tool_calls[0]; + assert_eq!(*index, 0); + assert_eq!(id, "0"); + assert_eq!(r#type, "function"); + (function.name.as_ref(), &function.arguments) + } else { + panic!("Expected plain message"); + } + } + _ => panic!("Unexpected chunk"), + } + } + + #[test] + fn test_chat_stream() { + let mut chat_state = ChatState::new( + false, + StreamOptions { + include_usage: false, + }, + "fingerprint".to_string(), + "model_id".to_string(), + false, + "0".to_string(), + ); + + let events = chat_state.push(StreamResponse { + generated_text: None, + token: Token { + id: 42, + text: "Hi".to_string(), + logprob: 0.0, + special: false, + }, + top_tokens: vec![], + index: 0, + details: None, + }); + assert_eq!(events.len(), 1); + match &events[0] { + CompletionType::ChatCompletionChunk(ChatCompletionChunk { choices, .. 
}) => { + assert_eq!( + choices, + &[ChatCompletionChoice { + index: 0, + delta: ChatCompletionDelta::Chat(TextMessage { + role: "assistant".to_string(), + content: "Hi".to_string(), + tool_call_id: None, + }), + logprobs: None, + finish_reason: None, + }] + ); + } + _ => panic!("Unexpected chunk"), + } + } + + #[test] + fn test_chat_stream_usage() { + let mut chat_state = ChatState::new( + false, + StreamOptions { + include_usage: true, + }, + "fingerprint".to_string(), + "model_id".to_string(), + false, + "0".to_string(), + ); + + let events = chat_state.push(StreamResponse { + generated_text: None, + token: Token { + id: 42, + text: "Hi".to_string(), + logprob: 0.0, + special: false, + }, + top_tokens: vec![], + index: 0, + details: Some(StreamDetails { + input_length: 2, + generated_tokens: 10, + seed: None, + finish_reason: FinishReason::Length, + }), + }); + assert_eq!(events.len(), 2); + match &events[0] { + CompletionType::ChatCompletionChunk(ChatCompletionChunk { choices, .. }) => { + assert_eq!( + choices, + &[ChatCompletionChoice { + index: 0, + delta: ChatCompletionDelta::Chat(TextMessage { + role: "assistant".to_string(), + content: "Hi".to_string(), + tool_call_id: None, + }), + logprobs: None, + // HAS A FINISH REASON + finish_reason: Some("length".to_string()), + }] + ); + } + _ => panic!("Unexpected chunk"), + } + match &events[1] { + CompletionType::ChatCompletionChunk(ChatCompletionChunk { usage, .. }) => { + assert_eq!( + *usage, + Some(Usage { + prompt_tokens: 2, + completion_tokens: 10, + total_tokens: 12, + }) + ); + } + _ => panic!("Unexpected chunk"), + } + } + + #[test] + fn test_chat_stream_tool_no_tool() { + let mut chat_state = ChatState::new( + true, + StreamOptions { + include_usage: true, + }, + "fingerprint".to_string(), + "model_id".to_string(), + false, + "0".to_string(), + ); + + let tokens = vec![ + "{\"".to_string(), + "function".to_string(), + "\":".to_string(), + " {\"".to_string(), + "_".to_string(), + "name".to_string(), + "\":".to_string(), + " \"".to_string(), + "no".to_string(), + "_tool".to_string(), + "\",".to_string(), + " \"".to_string(), + "content".to_string(), + "\":".to_string(), + " \"".to_string(), // Token 14 + "I".to_string(), // Event 1 + " am".to_string(), // Event 2 + " a".to_string(), // Event 3 + " helpful".to_string(), // Event 4 + " assistant".to_string(), // Event 5 + "!\"".to_string(), // Event 6 (with trailing quore removed) + "}".to_string(), + "}".to_string(), + ]; + let tokens: Vec<_> = tokens + .into_iter() + .map(|text| StreamResponse { + generated_text: None, + token: Token { + id: 42, + text: text.to_string(), + logprob: 0.0, + special: false, + }, + top_tokens: vec![], + index: 0, + details: None, + }) + .collect(); + + // Initial ignored output + for token in &tokens[..14] { + let events = chat_state.push(token.clone()); + assert_eq!(events.len(), 0); + } + + // No tool output + let mut output = String::new(); + for token in &tokens[14..14 + 7] { + let events = chat_state.push(token.clone()); + assert_eq!(events.len(), 1); + let content = get_text_content(&events[0]); + output.push_str(content); + } + + assert_eq!(output, "I am a helpful assistant!"); + + // No tool finish + for token in &tokens[14 + 7..] 
{ + let events = chat_state.push(token.clone()); + assert_eq!(events.len(), 0); + } + } + + #[test] + fn test_chat_stream_tool_no_tool_many_quotes() { + let mut chat_state = ChatState::new( + true, + StreamOptions { + include_usage: true, + }, + "fingerprint".to_string(), + "model_id".to_string(), + false, + "0".to_string(), + ); + + let tokens = vec![ + "{\"".to_string(), + "function".to_string(), + "\":".to_string(), + " {\"".to_string(), + "_".to_string(), + "name".to_string(), + "\":".to_string(), + " \"".to_string(), + "no".to_string(), + "_tool".to_string(), + "\",".to_string(), + " \"".to_string(), + "content".to_string(), + "\":".to_string(), + " \"".to_string(), // Token 14 + "I".to_string(), // Event 1 + " am".to_string(), // Event 2 + " a".to_string(), // Event 3 + " helpful".to_string(), // Event 4 + " assistant".to_string(), // Event 5 + "!\\\"\"".to_string(), // Extra inside the string quote that would get removed + "}".to_string(), + "}".to_string(), + ]; + + // Initial ignored output + for text in &tokens[..14] { + let events = chat_state.push(StreamResponse { + generated_text: None, + token: Token { + id: 42, + text: text.to_string(), + logprob: 0.0, + special: false, + }, + top_tokens: vec![], + index: 0, + details: None, + }); + assert_eq!(events.len(), 0); + } + + // No tool output + let mut output = String::new(); + for text in &tokens[14..14 + 7] { + let events = chat_state.push(StreamResponse { + generated_text: None, + token: Token { + id: 42, + text: text.to_string(), + logprob: 0.0, + special: false, + }, + top_tokens: vec![], + index: 0, + details: None, + }); + assert_eq!(events.len(), 1); + match &events[0] { + CompletionType::ChatCompletionChunk(ChatCompletionChunk { choices, .. }) => { + assert_eq!(choices.len(), 1); + if let ChatCompletionChoice { + delta: ChatCompletionDelta::Chat(TextMessage { content, .. }), + .. + } = &choices[0] + { + output.push_str(content); + } else { + panic!("Expected plain message"); + } + } + _ => panic!("Unexpected chunk"), + } + } + + assert_eq!(output, "I am a helpful assistant!\""); + + // No tool finish + for text in &tokens[14 + 7..] 
{ + let events = chat_state.push(StreamResponse { + generated_text: None, + token: Token { + id: 42, + text: text.to_string(), + logprob: 0.0, + special: false, + }, + top_tokens: vec![], + index: 0, + details: None, + }); + assert_eq!(events.len(), 0); + } + } + + #[test] + fn test_chat_stream_tool_no_tool_inline_json() { + let mut chat_state = ChatState::new( + true, + StreamOptions { + include_usage: true, + }, + "fingerprint".to_string(), + "model_id".to_string(), + false, + "0".to_string(), + ); + + let tokens = vec![ + "{\"".to_string(), + "function".to_string(), + "\":".to_string(), + " {\"".to_string(), + "_".to_string(), + "name".to_string(), + "\":".to_string(), + " \"".to_string(), + "no".to_string(), + "_tool".to_string(), + "\",".to_string(), + " \"".to_string(), + "content".to_string(), + "\":".to_string(), + " \"".to_string(), // Token 14 + "{\\\"".to_string(), // Event 1 + "a".to_string(), // Event 1 + "\\\":".to_string(), // Event 1 + "2".to_string(), // Event 2 + ",\\".to_string(), // Event 2 + "\"".to_string(), // Event 2 + "b".to_string(), // Event 3 + "\\\": ".to_string(), // Event 4 + "1".to_string(), // Event 5 + "}".to_string(), // Event 5 + "\"}".to_string(), // Extra inside the string quote that would get removed + "}".to_string(), + ]; + let tokens: Vec<_> = tokens + .into_iter() + .map(|text| StreamResponse { + generated_text: None, + token: Token { + id: 42, + text: text.to_string(), + logprob: 0.0, + special: false, + }, + top_tokens: vec![], + index: 0, + details: None, + }) + .collect(); + + // Initial ignored output + for token in &tokens[..14] { + let events = chat_state.push(token.clone()); + assert_eq!(events.len(), 0); + } + + // No tool output + let mut output = String::new(); + for token in &tokens[14..14 + 12] { + let events = chat_state.push(token.clone()); + assert_eq!(events.len(), 1, "Current text is {output:?}"); + let content = get_text_content(&events[0]); + output.push_str(content); + } + + assert_eq!(output, "{\"a\":2,\"b\": 1}"); + + // No tool finish + for token in &tokens[14 + 12..] 
{ + let events = chat_state.push(token.clone()); + assert_eq!(events.len(), 0, "Extra events {events:?}"); + } + } + + #[test] + fn test_chat_stream_tool_no_tool_empty() { + let mut chat_state = ChatState::new( + true, + StreamOptions { + include_usage: true, + }, + "fingerprint".to_string(), + "model_id".to_string(), + false, + "0".to_string(), + ); + + let tokens = vec![ + "{\"".to_string(), + "function".to_string(), + "\":".to_string(), + " {\"".to_string(), + "_".to_string(), + "name".to_string(), + "\":".to_string(), + " \"".to_string(), + "no".to_string(), + "_tool".to_string(), + "\",".to_string(), + " \"".to_string(), + "content".to_string(), + "\":\"".to_string(), + "\"}".to_string(), // Token 13 + "}".to_string(), // Event 1 + ]; + let tokens: Vec<_> = tokens + .into_iter() + .map(|text| StreamResponse { + generated_text: None, + token: Token { + id: 42, + text: text.to_string(), + logprob: 0.0, + special: false, + }, + top_tokens: vec![], + index: 0, + details: None, + }) + .collect(); + + // Initial ignored output + for token in &tokens[..13] { + let events = chat_state.push(token.clone()); + assert_eq!(events.len(), 0); + } + + // No tool output + let mut output = String::new(); + for token in &tokens[13..13 + 2] { + let events = chat_state.push(token.clone()); + assert_eq!(events.len(), 1, "Current text is {output:?}"); + let content = get_text_content(&events[0]); + output.push_str(content); + } + + assert_eq!(output, ""); + + // No tool finish + for token in &tokens[13 + 2..] { + let events = chat_state.push(token.clone()); + assert_eq!(events.len(), 0, "Extra events {events:?}"); + } + } + + #[test] + fn test_chat_stream_tool_get_weather() { + let mut chat_state = ChatState::new( + true, + StreamOptions { + include_usage: true, + }, + "fingerprint".to_string(), + "model_id".to_string(), + false, + "0".to_string(), + ); + + let tokens = vec![ + "{\"".to_string(), + "function".to_string(), + "\":".to_string(), + " {\"".to_string(), + "_".to_string(), + "name".to_string(), + "\":".to_string(), + " \"".to_string(), + "get".to_string(), + "_current".to_string(), + "_weather".to_string(), + "\",".to_string(), + // Event 1 is the function name + // Event 2 is the start of the arguments "{" + " \"".to_string(), // Event 3 + "location".to_string(), // Event 4 + "\":".to_string(), // Event 5 + " \"".to_string(), // Event 6 + "San".to_string(), // Event 7 + " Francisco".to_string(), // Event 8 + ",".to_string(), // Event 9 + " CA".to_string(), // Event 10 + "\",".to_string(), // Event 11 + " \"".to_string(), // Event 12 + "format".to_string(), // Event 13 + "\":".to_string(), // Event 14 + " \"".to_string(), // Event 15 + "c".to_string(), // Event 16 + "elsius".to_string(), // Event 17 + "\"}}".to_string(), // Event 18 retained (trailing brace removed) + ]; + let tokens: Vec<_> = tokens + .into_iter() + .map(|text| StreamResponse { + generated_text: None, + token: Token { + id: 42, + text: text.to_string(), + logprob: 0.0, + special: false, + }, + top_tokens: vec![], + index: 0, + details: None, + }) + .collect(); + + // Initial ignored output + for token in &tokens[..11] { + let events = chat_state.push(token.clone()); + assert_eq!(events.len(), 0, "{events:?}"); + } + + // No tool output + let mut output = String::new(); + let mut output_name = String::new(); + for token in &tokens[11..11 + 17] { + let events = chat_state.push(token.clone()); + assert_eq!(events.len(), 1); + let (name, arguments) = get_tool_call_content(&events[0]); + if let Some(name) = name { + assert_eq!(name, 
"get_current_weather"); + output_name.push_str(&name); + } + output.push_str(arguments); + } + + assert_eq!(output_name, "get_current_weather"); + assert_eq!( + output, + "{ \"location\": \"San Francisco, CA\", \"format\": \"celsius\"}" + ); + + // No tool finish + for token in &tokens[11 + 17..] { + let events = chat_state.push(token.clone()); + assert_eq!(events.len(), 0); + } + } +} diff --git a/router/src/infer/chat_template.rs b/router/src/infer/chat_template.rs index b179dd4d6..ebba5fd39 100644 --- a/router/src/infer/chat_template.rs +++ b/router/src/infer/chat_template.rs @@ -16,7 +16,7 @@ pub(crate) fn strftime_now(format_str: String) -> Result, bos_token: Option, diff --git a/router/src/infer/mod.rs b/router/src/infer/mod.rs index 7eb8a41be..cdce81883 100644 --- a/router/src/infer/mod.rs +++ b/router/src/infer/mod.rs @@ -52,7 +52,7 @@ pub struct Infer { /// Request backend backend: Arc, /// Chat template - chat_template: Option, + pub(crate) chat_template: Option, /// Inference limit limit_concurrent_requests: Arc, /// Backend health diff --git a/router/src/lib.rs b/router/src/lib.rs index 08c31b641..eac412763 100644 --- a/router/src/lib.rs +++ b/router/src/lib.rs @@ -8,6 +8,7 @@ pub mod validation; mod kserve; pub mod logging; +mod chat; mod sagemaker; pub mod usage_stats; mod vertex; @@ -20,6 +21,7 @@ use serde::{Deserialize, Serialize}; use tokenizers::Encoding; use tracing::warn; use utoipa::ToSchema; +use uuid::Uuid; use validation::Validation; #[allow(clippy::large_enum_variant)] @@ -541,6 +543,7 @@ pub(crate) struct Chunk { } #[derive(Clone, Deserialize, Serialize, ToSchema)] +#[cfg_attr(test, derive(Debug))] pub(crate) struct ChatCompletion { pub id: String, #[schema(example = "1706270835")] @@ -553,6 +556,7 @@ pub(crate) struct ChatCompletion { } #[derive(Clone, Deserialize, Serialize, ToSchema)] +#[cfg_attr(test, derive(Debug))] pub(crate) struct ChatCompletionComplete { pub index: u32, pub message: OutputMessage, @@ -561,6 +565,7 @@ pub(crate) struct ChatCompletionComplete { } #[derive(Clone, Deserialize, Serialize, ToSchema)] +#[cfg_attr(test, derive(Debug, PartialEq))] pub(crate) struct ChatCompletionLogprobs { content: Vec, } @@ -619,6 +624,7 @@ impl From<(Vec, Vec>)> for ChatCompletionLogprobs { } #[derive(Clone, Deserialize, Serialize, ToSchema)] +#[cfg_attr(test, derive(Debug, PartialEq))] pub(crate) struct ChatCompletionLogprob { token: String, logprob: f32, @@ -626,12 +632,14 @@ pub(crate) struct ChatCompletionLogprob { } #[derive(Clone, Deserialize, Serialize, ToSchema)] +#[cfg_attr(test, derive(Debug, PartialEq))] pub(crate) struct ChatCompletionTopLogprob { token: String, logprob: f32, } #[derive(Clone, Deserialize, Serialize, ToSchema, Default)] +#[cfg_attr(test, derive(Debug, PartialEq))] pub(crate) struct Usage { pub prompt_tokens: u32, pub completion_tokens: u32, @@ -640,6 +648,7 @@ pub(crate) struct Usage { #[derive(Clone, Serialize, ToSchema)] #[serde(tag = "object")] +#[cfg_attr(test, derive(Debug))] enum CompletionType { #[serde(rename = "chat.completion.chunk")] ChatCompletionChunk(ChatCompletionChunk), @@ -707,6 +716,7 @@ impl ChatCompletion { } } #[derive(Clone, Serialize, ToSchema)] +#[cfg_attr(test, derive(Debug))] pub(crate) struct ChatCompletionChunk { pub id: String, #[schema(example = "1706270978")] @@ -719,6 +729,7 @@ pub(crate) struct ChatCompletionChunk { } #[derive(Clone, Serialize, ToSchema)] +#[cfg_attr(test, derive(Debug, PartialEq))] pub(crate) struct ChatCompletionChoice { pub index: u32, pub delta: ChatCompletionDelta, @@ -735,6 
+746,7 @@ pub struct ToolCallDelta {
 
 #[derive(Clone, Debug, Serialize, ToSchema)]
 #[serde(untagged)]
+#[cfg_attr(test, derive(PartialEq))]
 enum ChatCompletionDelta {
     Chat(TextMessage),
     Tool(ToolCallDelta),
@@ -759,48 +771,17 @@ impl ChatCompletionChunk {
     pub(crate) fn new(
         model: String,
         system_fingerprint: String,
-        delta: Option<String>,
-        tool_calls: Option<Vec<String>>,
         created: u64,
-        logprobs: Option<ChatCompletionLogprobs>,
-        finish_reason: Option<String>,
+        choices: Vec<ChatCompletionChoice>,
+        usage: Option<Usage>,
     ) -> Self {
-        let delta = match (delta, tool_calls) {
-            (Some(delta), _) => ChatCompletionDelta::Chat(TextMessage {
-                role: "assistant".to_string(),
-                content: delta,
-                ..Default::default()
-            }),
-            (None, Some(tool_calls)) => ChatCompletionDelta::Tool(ToolCallDelta {
-                role: "assistant".to_string(),
-                tool_calls: vec![DeltaToolCall {
-                    index: 0,
-                    id: String::new(),
-                    r#type: "function".to_string(),
-                    function: Function {
-                        name: None,
-                        arguments: tool_calls[0].to_string(),
-                    },
-                }],
-            }),
-            (None, None) => ChatCompletionDelta::Chat(TextMessage {
-                role: "assistant".to_string(),
-                content: "".to_string(),
-                ..Default::default()
-            }),
-        };
         Self {
             id: String::new(),
             created,
             model,
             system_fingerprint,
-            choices: vec![ChatCompletionChoice {
-                index: 0,
-                delta,
-                logprobs,
-                finish_reason,
-            }],
-            usage: None,
+            choices,
+            usage,
         }
     }
 }
@@ -915,7 +896,7 @@ pub(crate) struct ChatRequest {
     /// Options for streaming response. Only set this when you set stream: true.
     #[serde(default)]
     #[schema(nullable = true, example = "null")]
-    pub stream_options: Option<StreamOptions>,
+    pub stream_options: StreamOptions,
 }
 
 impl ChatRequest {
@@ -1015,13 +996,37 @@ impl ChatRequest {
             using_tools,
         ))
     }
+
+    fn next_int_id(&self) -> Result<String, Box<dyn std::error::Error>> {
+        let mut id: usize = 0;
+        for message in &self.messages {
+            if let MessageBody::Tool { tool_calls } = &message.body {
+                for tool_call in tool_calls {
+                    let new_id: usize = tool_call.id.parse()?;
+                    id = std::cmp::max(id, new_id + 1);
+                }
+            }
+        }
+        Ok(id.to_string())
+    }
+
+    /// Try to have linearly increasing id
+    /// or resort to using Uuid if the initial
+    /// scheme is not understood
+    fn next_tool_call_id(&self) -> String {
+        self.next_int_id().unwrap_or_else(|_| {
+            let uid = Uuid::new_v4().to_string();
+            uid.to_string()
+        })
+    }
 }
 
-#[derive(Clone, Deserialize, ToSchema, Serialize)]
+#[derive(Clone, Deserialize, ToSchema, Serialize, Default)]
 #[cfg_attr(test, derive(Debug, PartialEq))]
 struct StreamOptions {
     /// If set, an additional chunk will be streamed before the data: [DONE] message. The usage field on this chunk shows the token usage statistics for the entire request, and the choices field will always be an empty array. All other chunks will also include a usage field, but with a null value.
#[schema(example = "true")] + #[serde(default)] include_usage: bool, } @@ -1445,7 +1450,7 @@ pub(crate) struct ChatTokenizeResponse { #[serde(transparent)] pub(crate) struct TokenizeResponse(Vec); -#[derive(Serialize, ToSchema)] +#[derive(Serialize, ToSchema, Clone)] pub(crate) struct StreamDetails { #[schema(example = "length")] pub finish_reason: FinishReason, @@ -1457,7 +1462,7 @@ pub(crate) struct StreamDetails { pub input_length: u32, } -#[derive(Serialize, ToSchema)] +#[derive(Serialize, ToSchema, Clone)] pub(crate) struct StreamResponse { pub index: u32, pub token: Token, @@ -1700,9 +1705,25 @@ mod tests { assert!(matches!( request.stream_options, - Some(StreamOptions { + StreamOptions { include_usage: true - }) + } + )); + + let json = json!({ + "model": "", + "messages": [{ + "role": "user", + "content": "Hello" + }] + }); + let request: ChatRequest = serde_json::from_str(json.to_string().as_str()).unwrap(); + + assert!(matches!( + request.stream_options, + StreamOptions { + include_usage: false + } )); } diff --git a/router/src/server.rs b/router/src/server.rs index df9e16ff3..689b2f502 100644 --- a/router/src/server.rs +++ b/router/src/server.rs @@ -1,3 +1,4 @@ +use crate::chat::ChatState; /// HTTP Server logic use crate::config::Config; use crate::infer::{Backend, Infer, InferError, InferResponse, InferStreamResponse}; @@ -47,8 +48,6 @@ use http::header::AUTHORIZATION; use metrics_exporter_prometheus::{Matcher, PrometheusBuilder, PrometheusHandle}; use pyo3::prelude::*; use pyo3::types::IntoPyDict; -use regex::Regex; -use serde_json::Value; use std::convert::Infallible; use std::fs::File; use std::io::BufReader; @@ -1114,62 +1113,6 @@ pub(crate) async fn completions( } } -enum StreamState { - Buffering, - BufferTrailing, - Content { skip_close_quote: bool }, -} - -/// Convert a StreamResponse into an Event to be sent over SSE -fn create_event_from_stream_token( - stream_token: &StreamResponse, - logprobs: bool, - inner_using_tools: bool, - system_fingerprint: String, - model_id: String, -) -> Event { - let event = Event::default(); - let current_time = std::time::SystemTime::now() - .duration_since(std::time::UNIX_EPOCH) - .unwrap_or_else(|_| std::time::Duration::from_secs(0)) - .as_secs(); - - let logprobs = logprobs.then(|| { - ChatCompletionLogprobs::from((stream_token.token.clone(), stream_token.top_tokens.clone())) - }); - - // replace the content with the tool calls if grammar is present - let (content, tool_calls) = if inner_using_tools { - (None, Some(vec![stream_token.token.text.clone()])) - } else { - let content = if !stream_token.token.special { - Some(stream_token.token.text.clone()) - } else { - None - }; - - (content, None) - }; - let finish_reason = stream_token - .details - .as_ref() - .map(|details| details.finish_reason.format(true)); - let chat_complete = CompletionType::ChatCompletionChunk(ChatCompletionChunk::new( - model_id.clone(), - system_fingerprint.clone(), - content, - tool_calls, - current_time, - logprobs, - finish_reason, - )); - - event.json_data(chat_complete).unwrap_or_else(|e| { - println!("Failed to serialize ChatCompletionChunk: {:?}", e); - Event::default() - }) -} - /// Generate tokens #[utoipa::path( post, @@ -1219,6 +1162,9 @@ pub(crate) async fn chat_completions( logprobs, .. 
} = chat.clone(); + + tracing::debug!("Got chat_template {:?}", infer.chat_template); + let id = chat.next_tool_call_id(); let (generate_request, using_tools): (GenerateRequest, bool) = chat.try_into_generate(&infer)?; span.record("parameters", format!("{:?}", generate_request.parameters)); @@ -1235,164 +1181,18 @@ pub(crate) async fn chat_completions( let (headers, response_stream) = generate_stream_internal(infer, compute_type, Json(generate_request), span).await; - // regex to match any function name - let function_regex = match Regex::new(r#"\{"function":\{"_name":"([^"]+)""#) { - Ok(regex) => regex, - Err(e) => { - return Err(( - StatusCode::INTERNAL_SERVER_ERROR, - Json(ErrorResponse { - error: format!("Failed to compile regex: {}", e), - error_type: "regex".to_string(), - }), - )) - } - }; - let response_stream = async_stream::stream! { let mut response_stream = Box::pin(response_stream); - let mut buffer = Vec::new(); - let mut json_buffer = String::new(); - let mut state = if using_tools { - StreamState::Buffering - } else { - StreamState::Content { - skip_close_quote: false, - } - }; - let mut response_as_tool = using_tools; + let mut state = ChatState::new(using_tools, stream_options, system_fingerprint, model_id, logprobs, id); while let Some(result) = response_stream.next().await { match result{ Ok(stream_token) => { - let token_text = &stream_token.token.text.clone(); - let usage = stream_token.details.as_ref().map(|details| { - let completion_tokens = details.generated_tokens; - let prompt_tokens = details.input_length; - let total_tokens = prompt_tokens + completion_tokens; - - Usage { - completion_tokens, - prompt_tokens, - total_tokens, - } - }); - match state { - StreamState::Buffering => { - json_buffer.push_str(&token_text.replace(" ", "")); - buffer.push(stream_token); - if let Some(captures) = function_regex.captures(&json_buffer) { - let function_name = captures[1].to_string(); - if function_name == "no_tool" { - state = StreamState::BufferTrailing; - response_as_tool = false; - buffer.clear(); - json_buffer.clear(); - } else { - state = StreamState::Content { - skip_close_quote: false, - }; - // send all the buffered messages - for stream_token in &buffer { - let event = create_event_from_stream_token( - stream_token, - logprobs, - response_as_tool, - system_fingerprint.clone(), - model_id.clone(), - ); - yield Ok::(event); - } - } - } - } - // if we skipped sending the buffer we need to avoid sending the following json key and quotes - StreamState::BufferTrailing => { - let infix_text = "\"content\":\""; - json_buffer.push_str(&token_text.replace(" ", "")); - // keep capturing until we find the infix text - match json_buffer.find(infix_text) { - Some(content_key_index) => { - json_buffer = - json_buffer[content_key_index + infix_text.len()..].to_string(); - } - None => { - continue; - } - } - // if there is leftover text after removing the infix text, we need to send it - if !json_buffer.is_empty() { - let event = Event::default(); - let current_time = std::time::SystemTime::now() - .duration_since(std::time::UNIX_EPOCH) - .unwrap_or_else(|_| std::time::Duration::from_secs(0)) - .as_secs(); - let chat_complete = - CompletionType::ChatCompletionChunk(ChatCompletionChunk::new( - model_id.clone(), - system_fingerprint.clone(), - Some(json_buffer.clone()), - None, - current_time, - None, - None, - )); - yield Ok(event.json_data(chat_complete).unwrap_or_else(|e| { - InferError::StreamSerializationError(e.to_string()).into() - })); - } - // cleanup the buffers - 
buffer.clear(); - json_buffer.clear(); - state = StreamState::Content { - skip_close_quote: true, - }; - } - StreamState::Content { skip_close_quote } => { - if skip_close_quote && token_text.contains('"') { - break; - } - - // send the content - let event = create_event_from_stream_token( - &stream_token, - logprobs, - response_as_tool, - system_fingerprint.clone(), - model_id.clone(), - ); - - yield Ok::(event); - } - } - - let should_send_usage = usage.is_some() - && stream_options - .as_ref() - .is_some_and(|opts| opts.include_usage); - - if should_send_usage { - let usage_data = usage.unwrap(); - let current_time = std::time::SystemTime::now() - .duration_since(std::time::UNIX_EPOCH) - .unwrap_or_else(|_| std::time::Duration::from_secs(0)) - .as_secs(); - - let chat_complete = CompletionType::ChatCompletionChunk(ChatCompletionChunk { - id: String::new(), - created: current_time, - model: model_id.clone(), - system_fingerprint: system_fingerprint.clone(), - choices: vec![], - usage: Some(Usage { - prompt_tokens: usage_data.prompt_tokens, - completion_tokens: usage_data.completion_tokens, - total_tokens: usage_data.total_tokens, - }), - }); - - yield Ok(Event::default() - .json_data(chat_complete) - .unwrap_or_else(|e| InferError::StreamSerializationError(e.to_string()).into())); + let events = state.push(stream_token); + for chat_complete in events{ + yield Ok(Event::default().json_data(chat_complete).unwrap_or_else(|e| { + tracing::error!("Failed to serialize ChatCompletionChunk: {:?}", e); + Event::default() + })); } } Err(err) => yield Ok(err.into_openai_event()) @@ -1413,56 +1213,7 @@ pub(crate) async fn chat_completions( .as_secs(); let (tool_calls, output) = if using_tools { - let gen_text_value: Value = - serde_json::from_str(&generation.generated_text).map_err(|e| { - InferError::ToolError(format!( - "Failed to parse generated text: {} {:?}", - e, generation.generated_text - )) - })?; - let function = gen_text_value.get("function").ok_or(InferError::ToolError( - "No function found in generated text".to_string(), - ))?; - - let name = function - .get("_name") - .and_then(Value::as_str) - .ok_or(InferError::ToolError( - "No _name found in generated text".to_string(), - ))? - .to_string(); - - let mut arguments = function.clone(); - if let Value::Object(ref mut props) = arguments { - props.remove("_name"); - } - match name.as_str() { - "no_tool" => { - // parse the content message - let content_message = arguments - .get("content") - .and_then(Value::as_str) - .ok_or_else(|| { - InferError::ToolError( - "No `content` found in generated text".to_string(), - ) - })? - .to_string(); - (None, Some(content_message)) - } - _ => { - let tool_calls = vec![ToolCall { - id: "0".to_string(), - r#type: "function".to_string(), - function: FunctionDefinition { - description: None, - name, - arguments, - }, - }]; - (Some(tool_calls), None) - } - } + crate::chat::parse_output(&generation.generated_text)? } else { (None, Some(generation.generated_text)) }; @@ -1817,6 +1568,7 @@ pub async fn run( ) } Type::Cache(cache) => { + tracing::info!("Cache {cache:?}"); let repo = cache.repo(Repo::with_revision( tokenizer_name.to_string(), RepoType::Model, @@ -1833,6 +1585,7 @@ pub async fn run( }; // Read the JSON contents of the file as an instance of 'HubTokenizerConfig'. 
+    tracing::warn!("Tokenizer_config {tokenizer_config_path:?} - {tokenizer_config_filename:?}");
     let tokenizer_config: Option<HubTokenizerConfig> = if let Some(filename) = tokenizer_config_path
     {
         HubTokenizerConfig::from_file(filename)
diff --git a/server/text_generation_server/models/custom_modeling/qwen2_vl.py b/server/text_generation_server/models/custom_modeling/qwen2_vl.py
index a72e0e556..26e6fede1 100644
--- a/server/text_generation_server/models/custom_modeling/qwen2_vl.py
+++ b/server/text_generation_server/models/custom_modeling/qwen2_vl.py
@@ -542,6 +542,7 @@ class Qwen2VLForConditionalGeneration(nn.Module):
             max_s=max_s,
             true_max_s=max_s,
             prefill_cache_indices=prefill_cache_indices,
+            adapter_data=adapter_data,
         )
         if lm_head_indices is not None:
             hidden_states = hidden_states[lm_head_indices]

From ed46c2c414460973a3a60fa47eb00e1842fb9130 Mon Sep 17 00:00:00 2001
From: Mohit Sharma
Date: Wed, 12 Mar 2025 13:55:51 +0530
Subject: [PATCH 141/183] Add gemma3 model (#3099)

---
 docs/source/supported_models.md | 2 +
 .../test_flash_gemma3/test_exceed_window.json | 133 +++
 .../test_flash_gemma3/test_flash_gemma3.json | 613 ++++++++++++
 .../test_flash_gemma3_image_cow.json | 26 +
 .../test_flash_gemma3_image_cow_dog.json | 26 +
 integration-tests/models/test_flash_gemma3.py | 90 ++
 launcher/src/main.rs | 1 +
 router/src/config.rs | 14 +
 router/src/infer/chat_template.rs | 109 ++-
 router/src/lib.rs | 12 +
 router/src/server.rs | 23 +-
 router/src/validation.rs | 13 +-
 .../text_generation_server/models/__init__.py | 98 ++
 .../custom_modeling/flash_gemma3_modeling.py | 922 ++++++++++++++++++
 .../flash_pali_gemma_modeling.py | 2 +-
 .../gemma3/configuration_gemma3.py | 313 ++++++
 .../gemma3/image_processing_gemma3.py | 463 +++++++++
 .../gemma3/processing_gemma3.py | 206 ++++
 .../models/custom_modeling/gemma3/utils.py | 61 ++
 .../models/custom_modeling/vlm.py | 19 +-
 .../models/vlm_causal_lm.py | 8 +
 21 files changed, 3145 insertions(+), 9 deletions(-)
 create mode 100644 integration-tests/models/__snapshots__/test_flash_gemma3/test_exceed_window.json
 create mode 100644 integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3.json
 create mode 100644 integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_cow.json
 create mode 100644 integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_cow_dog.json
 create mode 100644 integration-tests/models/test_flash_gemma3.py
 create mode 100644 server/text_generation_server/models/custom_modeling/flash_gemma3_modeling.py
 create mode 100644 server/text_generation_server/models/custom_modeling/gemma3/configuration_gemma3.py
 create mode 100644 server/text_generation_server/models/custom_modeling/gemma3/image_processing_gemma3.py
 create mode 100644 server/text_generation_server/models/custom_modeling/gemma3/processing_gemma3.py
 create mode 100644 server/text_generation_server/models/custom_modeling/gemma3/utils.py

diff --git a/docs/source/supported_models.md b/docs/source/supported_models.md
index 1f804d5a5..453b4f61a 100644
--- a/docs/source/supported_models.md
+++ b/docs/source/supported_models.md
@@ -14,6 +14,8 @@ Text Generation Inference enables serving optimized models.
The following sectio - [Gemma](https://huggingface.co/google/gemma-7b) - [PaliGemma](https://huggingface.co/google/paligemma-3b-pt-224) - [Gemma2](https://huggingface.co/collections/google/gemma-2-release-667d6600fd5220e7b967f315) +- [Gemma3](https://huggingface.co/collections/google/gemma-3) +- [Gemma3 Text](https://huggingface.co/collections/google/gemma-3) - [Cohere](https://huggingface.co/CohereForAI/c4ai-command-r-plus) - [Dbrx](https://huggingface.co/databricks/dbrx-instruct) - [Mamba](https://huggingface.co/state-spaces/mamba-2.8b-slimpj) diff --git a/integration-tests/models/__snapshots__/test_flash_gemma3/test_exceed_window.json b/integration-tests/models/__snapshots__/test_flash_gemma3/test_exceed_window.json new file mode 100644 index 000000000..ec8cd4f62 --- /dev/null +++ b/integration-tests/models/__snapshots__/test_flash_gemma3/test_exceed_window.json @@ -0,0 +1,133 @@ +{ + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 20, + "prefill": [], + "seed": null, + "tokens": [ + { + "id": 236764, + "logprob": -0.44726562, + "special": false, + "text": "," + }, + { + "id": 236743, + "logprob": -0.011413574, + "special": false, + "text": " " + }, + { + "id": 236812, + "logprob": -0.09814453, + "special": false, + "text": "4" + }, + { + "id": 236764, + "logprob": -0.044189453, + "special": false, + "text": "," + }, + { + "id": 236743, + "logprob": -0.15625, + "special": false, + "text": " " + }, + { + "id": 236810, + "logprob": -0.010864258, + "special": false, + "text": "5" + }, + { + "id": 236764, + "logprob": -0.040039062, + "special": false, + "text": "," + }, + { + "id": 236743, + "logprob": -0.26757812, + "special": false, + "text": " " + }, + { + "id": 236825, + "logprob": -0.0047302246, + "special": false, + "text": "6" + }, + { + "id": 236764, + "logprob": -0.026123047, + "special": false, + "text": "," + }, + { + "id": 236743, + "logprob": -0.265625, + "special": false, + "text": " " + }, + { + "id": 236832, + "logprob": -0.014160156, + "special": false, + "text": "7" + }, + { + "id": 236764, + "logprob": -0.013977051, + "special": false, + "text": "," + }, + { + "id": 236743, + "logprob": -0.103515625, + "special": false, + "text": " " + }, + { + "id": 236828, + "logprob": -0.008178711, + "special": false, + "text": "8" + }, + { + "id": 236764, + "logprob": -0.030151367, + "special": false, + "text": "," + }, + { + "id": 236743, + "logprob": -0.39453125, + "special": false, + "text": " " + }, + { + "id": 236819, + "logprob": -0.008728027, + "special": false, + "text": "9" + }, + { + "id": 236764, + "logprob": -0.020629883, + "special": false, + "text": "," + }, + { + "id": 236743, + "logprob": -0.08154297, + "special": false, + "text": " " + } + ], + "top_tokens": null + }, + "generated_text": ", 4, 5, 6, 7, 8, 9, " +} diff --git a/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3.json b/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3.json new file mode 100644 index 000000000..1324555aa --- /dev/null +++ b/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3.json @@ -0,0 +1,613 @@ +{ + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 100, + "prefill": [], + "seed": null, + "tokens": [ + { + "id": 1331, + "logprob": -0.32421875, + "special": false, + "text": " people" + }, + { + "id": 8390, + "logprob": -0.15332031, + "special": false, + "text": " died" + }, + { + "id": 528, + "logprob": -1.140625, + "special": 
false, + "text": " in" + }, + { + "id": 506, + "logprob": -0.42578125, + "special": false, + "text": " the" + }, + { + "id": 3640, + "logprob": -0.64453125, + "special": false, + "text": " United" + }, + { + "id": 4184, + "logprob": -0.0027770996, + "special": false, + "text": " States" + }, + { + "id": 236761, + "logprob": -0.37890625, + "special": false, + "text": "." + }, + { + "id": 108, + "logprob": -0.08300781, + "special": false, + "text": "\n\n" + }, + { + "id": 818, + "logprob": -1.1796875, + "special": false, + "text": "The" + }, + { + "id": 6816, + "logprob": -1.765625, + "special": false, + "text": " generally" + }, + { + "id": 10951, + "logprob": -0.14550781, + "special": false, + "text": " accepted" + }, + { + "id": 10967, + "logprob": -0.90625, + "special": false, + "text": " estimate" + }, + { + "id": 563, + "logprob": -0.49414062, + "special": false, + "text": " is" + }, + { + "id": 600, + "logprob": -0.65625, + "special": false, + "text": " that" + }, + { + "id": 236743, + "logprob": -1.1796875, + "special": false, + "text": " " + }, + { + "id": 236825, + "logprob": -0.0009918213, + "special": false, + "text": "6" + }, + { + "id": 236832, + "logprob": -6.532669e-05, + "special": false, + "text": "7" + }, + { + "id": 236810, + "logprob": -4.863739e-05, + "special": false, + "text": "5" + }, + { + "id": 236764, + "logprob": -0.00017929077, + "special": false, + "text": "," + }, + { + "id": 236771, + "logprob": -1.2397766e-05, + "special": false, + "text": "0" + }, + { + "id": 236771, + "logprob": -2.1457672e-06, + "special": false, + "text": "0" + }, + { + "id": 236771, + "logprob": 0.0, + "special": false, + "text": "0" + }, + { + "id": 1331, + "logprob": -0.50390625, + "special": false, + "text": " people" + }, + { + "id": 8390, + "logprob": -0.011474609, + "special": false, + "text": " died" + }, + { + "id": 528, + "logprob": -0.08496094, + "special": false, + "text": " in" + }, + { + "id": 506, + "logprob": -0.0003299713, + "special": false, + "text": " the" + }, + { + "id": 3640, + "logprob": -0.028442383, + "special": false, + "text": " United" + }, + { + "id": 4184, + "logprob": -0.00011014938, + "special": false, + "text": " States" + }, + { + "id": 236761, + "logprob": -1.1796875, + "special": false, + "text": "." 
+ }, + { + "id": 3153, + "logprob": -0.104003906, + "special": false, + "text": " However" + }, + { + "id": 236764, + "logprob": -0.009094238, + "special": false, + "text": "," + }, + { + "id": 1070, + "logprob": -0.88671875, + "special": false, + "text": " some" + }, + { + "id": 61806, + "logprob": -0.84765625, + "special": false, + "text": " historians" + }, + { + "id": 4646, + "logprob": -1.34375, + "special": false, + "text": " believe" + }, + { + "id": 506, + "logprob": -0.59375, + "special": false, + "text": " the" + }, + { + "id": 5396, + "logprob": -0.8046875, + "special": false, + "text": " actual" + }, + { + "id": 1548, + "logprob": -0.04321289, + "special": false, + "text": " number" + }, + { + "id": 1451, + "logprob": -0.60546875, + "special": false, + "text": " could" + }, + { + "id": 577, + "logprob": -0.091308594, + "special": false, + "text": " be" + }, + { + "id": 618, + "logprob": -0.61328125, + "special": false, + "text": " as" + }, + { + "id": 1494, + "logprob": -0.00033569336, + "special": false, + "text": " high" + }, + { + "id": 618, + "logprob": -0.0001411438, + "special": false, + "text": " as" + }, + { + "id": 236743, + "logprob": -0.001045227, + "special": false, + "text": " " + }, + { + "id": 236770, + "logprob": -0.21289062, + "special": false, + "text": "1" + }, + { + "id": 236771, + "logprob": -0.13378906, + "special": false, + "text": "0" + }, + { + "id": 3625, + "logprob": -0.0087890625, + "special": false, + "text": " million" + }, + { + "id": 236761, + "logprob": -0.2109375, + "special": false, + "text": "." + }, + { + "id": 108, + "logprob": -0.39453125, + "special": false, + "text": "\n\n" + }, + { + "id": 236777, + "logprob": -1.1328125, + "special": false, + "text": "I" + }, + { + "id": 1006, + "logprob": -1.4140625, + "special": false, + "text": " am" + }, + { + "id": 3182, + "logprob": -1.15625, + "special": false, + "text": " looking" + }, + { + "id": 573, + "logprob": -0.035888672, + "special": false, + "text": " for" + }, + { + "id": 919, + "logprob": -1.2734375, + "special": false, + "text": " more" + }, + { + "id": 1938, + "logprob": -1.2265625, + "special": false, + "text": " information" + }, + { + "id": 580, + "logprob": -0.7734375, + "special": false, + "text": " on" + }, + { + "id": 672, + "logprob": -0.77734375, + "special": false, + "text": " this" + }, + { + "id": 59725, + "logprob": -0.70703125, + "special": false, + "text": " discrepancy" + }, + { + "id": 532, + "logprob": -0.8515625, + "special": false, + "text": " and" + }, + { + "id": 506, + "logprob": -0.65625, + "special": false, + "text": " the" + }, + { + "id": 5872, + "logprob": -1.15625, + "special": false, + "text": " factors" + }, + { + "id": 600, + "logprob": -0.2265625, + "special": false, + "text": " that" + }, + { + "id": 19263, + "logprob": -1.125, + "special": false, + "text": " contributed" + }, + { + "id": 531, + "logprob": -0.001083374, + "special": false, + "text": " to" + }, + { + "id": 506, + "logprob": -0.2109375, + "special": false, + "text": " the" + }, + { + "id": 5777, + "logprob": -1.21875, + "special": false, + "text": " wide" + }, + { + "id": 2644, + "logprob": -0.018310547, + "special": false, + "text": " range" + }, + { + "id": 529, + "logprob": -0.12988281, + "special": false, + "text": " of" + }, + { + "id": 14287, + "logprob": -0.03564453, + "special": false, + "text": " estimates" + }, + { + "id": 236761, + "logprob": -0.010314941, + "special": false, + "text": "." 
+ }, + { + "id": 108, + "logprob": -0.060546875, + "special": false, + "text": "\n\n" + }, + { + "id": 8291, + "logprob": -0.734375, + "special": false, + "text": "Here" + }, + { + "id": 236789, + "logprob": -0.26367188, + "special": false, + "text": "'" + }, + { + "id": 236751, + "logprob": -1.1920929e-06, + "special": false, + "text": "s" + }, + { + "id": 496, + "logprob": -0.15527344, + "special": false, + "text": " a" + }, + { + "id": 25890, + "logprob": -0.08886719, + "special": false, + "text": " breakdown" + }, + { + "id": 529, + "logprob": -0.0020446777, + "special": false, + "text": " of" + }, + { + "id": 506, + "logprob": -0.17871094, + "special": false, + "text": " the" + }, + { + "id": 5872, + "logprob": -0.90234375, + "special": false, + "text": " factors" + }, + { + "id": 20894, + "logprob": -0.25976562, + "special": false, + "text": " contributing" + }, + { + "id": 531, + "logprob": -8.34465e-05, + "special": false, + "text": " to" + }, + { + "id": 506, + "logprob": -0.008544922, + "special": false, + "text": " the" + }, + { + "id": 5777, + "logprob": -0.62109375, + "special": false, + "text": " wide" + }, + { + "id": 2644, + "logprob": -0.0023345947, + "special": false, + "text": " range" + }, + { + "id": 529, + "logprob": -0.016723633, + "special": false, + "text": " of" + }, + { + "id": 14287, + "logprob": -0.011291504, + "special": false, + "text": " estimates" + }, + { + "id": 573, + "logprob": -0.29101562, + "special": false, + "text": " for" + }, + { + "id": 506, + "logprob": -0.21484375, + "special": false, + "text": " the" + }, + { + "id": 236743, + "logprob": -0.2890625, + "special": false, + "text": " " + }, + { + "id": 236770, + "logprob": -3.5762787e-07, + "special": false, + "text": "1" + }, + { + "id": 236819, + "logprob": 0.0, + "special": false, + "text": "9" + }, + { + "id": 236770, + "logprob": 0.0, + "special": false, + "text": "1" + }, + { + "id": 236828, + "logprob": 0.0, + "special": false, + "text": "8" + }, + { + "id": 7745, + "logprob": -0.70703125, + "special": false, + "text": " flu" + }, + { + "id": 10248, + "logprob": -0.01953125, + "special": false, + "text": " pandemic" + }, + { + "id": 4355, + "logprob": -0.78515625, + "special": false, + "text": " death" + }, + { + "id": 25363, + "logprob": -6.771088e-05, + "special": false, + "text": " toll" + }, + { + "id": 528, + "logprob": -0.08496094, + "special": false, + "text": " in" + }, + { + "id": 506, + "logprob": -7.033348e-06, + "special": false, + "text": " the" + }, + { + "id": 3640, + "logprob": -0.0067443848, + "special": false, + "text": " United" + }, + { + "id": 4184, + "logprob": 0.0, + "special": false, + "text": " States" + } + ], + "top_tokens": null + }, + "generated_text": " people died in the United States.\n\nThe generally accepted estimate is that 675,000 people died in the United States. 
However, some historians believe the actual number could be as high as 10 million.\n\nI am looking for more information on this discrepancy and the factors that contributed to the wide range of estimates.\n\nHere's a breakdown of the factors contributing to the wide range of estimates for the 1918 flu pandemic death toll in the United States" +} diff --git a/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_cow.json b/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_cow.json new file mode 100644 index 000000000..6c30ada41 --- /dev/null +++ b/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_cow.json @@ -0,0 +1,26 @@ +{ + "choices": [ + { + "finish_reason": "stop", + "index": 0, + "logprobs": null, + "message": { + "content": "Here's a description of what's shown in the image:\n\nThe image depicts a brown cow standing on a sandy beach. The beach has turquoise water and a distant island visible in the background. The sky is bright blue with some white clouds. \n\nIt's a humorous and unexpected sight of a cow enjoying a tropical beach!", + "name": null, + "role": "assistant", + "tool_calls": null + }, + "usage": null + } + ], + "created": 1741703756, + "id": "", + "model": "gg-hf-g/gemma-3-4b-it", + "object": "chat.completion", + "system_fingerprint": "3.1.2-dev0-native", + "usage": { + "completion_tokens": 70, + "prompt_tokens": 277, + "total_tokens": 347 + } +} diff --git a/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_cow_dog.json b/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_cow_dog.json new file mode 100644 index 000000000..fe67c9954 --- /dev/null +++ b/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_cow_dog.json @@ -0,0 +1,26 @@ +{ + "choices": [ + { + "finish_reason": "stop", + "index": 0, + "logprobs": null, + "message": { + "content": "Based on the image, the animal is a cow, not a dog! \n\nIt appears to be a **Brazilian cattle breed** known as a **Gir Cow**. They are recognized for their reddish-brown color and distinctive markings.", + "name": null, + "role": "assistant", + "tool_calls": null + }, + "usage": null + } + ], + "created": 1741703753, + "id": "", + "model": "gg-hf-g/gemma-3-4b-it", + "object": "chat.completion", + "system_fingerprint": "3.1.2-dev0-native", + "usage": { + "completion_tokens": 48, + "prompt_tokens": 281, + "total_tokens": 329 + } +} diff --git a/integration-tests/models/test_flash_gemma3.py b/integration-tests/models/test_flash_gemma3.py new file mode 100644 index 000000000..022d2c474 --- /dev/null +++ b/integration-tests/models/test_flash_gemma3.py @@ -0,0 +1,90 @@ +import pytest + + +@pytest.fixture(scope="module") +def flash_gemma3_handle(launcher): + with launcher("gg-hf-g/gemma-3-4b-it", num_shard=2) as handle: + yield handle + + +@pytest.fixture(scope="module") +async def flash_gemma3(flash_gemma3_handle): + await flash_gemma3_handle.health(300) + return flash_gemma3_handle.client + + +async def test_flash_gemma3(flash_gemma3, response_snapshot): + response = await flash_gemma3.generate( + "Hello I am doing a project on the 1918 flu pandemic and I am trying to find out how many", + seed=42, + max_new_tokens=100, + ) + + assert ( + response.generated_text + == " people died in the United States.\n\nThe generally accepted estimate is that 675,000 people died in the United States. 
However, some historians believe the actual number could be as high as 10 million.\n\nI am looking for more information on this discrepancy and the factors that contributed to the wide range of estimates.\n\nHere's a breakdown of the factors contributing to the wide range of estimates for the 1918 flu pandemic death toll in the United States" + ) + assert response.details.generated_tokens == 100 + assert response == response_snapshot + + +async def test_flash_gemma3_image_cow_dog(flash_gemma3, response_snapshot): + image_url = "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/cow_beach_1.png" + response = await flash_gemma3.chat( + seed=42, + messages=[ + { + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": image_url}}, + { + "type": "text", + "text": "What is the breed of the dog in the image?", + }, + ], + }, + ], + max_tokens=100, + ) + + assert ( + response.choices[0].message.content + == "Based on the image, the animal is a cow, not a dog! \n\nIt appears to be a **Brazilian cattle breed** known as a **Gir Cow**. They are recognized for their reddish-brown color and distinctive markings." + ) + assert response.usage["completion_tokens"] == 48 + assert response == response_snapshot + + +async def test_flash_gemma3_image_cow(flash_gemma3, response_snapshot): + image_url = "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/cow_beach_1.png" + response = await flash_gemma3.chat( + seed=42, + messages=[ + { + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": image_url}}, + {"type": "text", "text": "What is shown in this image?"}, + ], + }, + ], + max_tokens=100, + ) + assert ( + response.choices[0].message.content + == "Here's a description of what's shown in the image:\n\nThe image depicts a brown cow standing on a sandy beach. The beach has turquoise water and a distant island visible in the background. The sky is bright blue with some white clouds. \n\nIt's a humorous and unexpected sight of a cow enjoying a tropical beach!" + ) + assert response.usage["completion_tokens"] == 70 + assert response == response_snapshot + + +async def test_exceed_window(flash_gemma3, response_snapshot): + response = await flash_gemma3.generate( + "This is a nice place. 
" * 800 + "Now count: 1, 2, 3", + seed=42, + max_new_tokens=20, + ) + + assert response.generated_text == ", 4, 5, 6, 7, 8, 9, " + assert response.details.generated_tokens == 20 + assert response == response_snapshot diff --git a/launcher/src/main.rs b/launcher/src/main.rs index 321d7c69b..d1041e26e 100644 --- a/launcher/src/main.rs +++ b/launcher/src/main.rs @@ -2064,6 +2064,7 @@ fn main() -> Result<(), LauncherError> { let default_optimal = match config { Some(ref config) => match config.model_type.as_deref() { Some("qwen2_vl") | Some("qwen2_5_vl") => 10_000, + Some("gemma3") => 8000, _ => 4096, }, None => 4096, diff --git a/router/src/config.rs b/router/src/config.rs index a01359843..244bbb1f5 100644 --- a/router/src/config.rs +++ b/router/src/config.rs @@ -216,6 +216,19 @@ impl Qwen2_5Vl { } } +#[derive(Clone, Debug, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub struct Gemma3VisionConfig { + pub(crate) image_size: usize, + pub(crate) patch_size: usize, +} + +#[derive(Clone, Debug, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub struct Gemma3 { + vision_config: Gemma3VisionConfig, +} + #[derive(Clone, Debug, Serialize, Deserialize)] #[serde(tag = "model_type")] #[serde(rename_all = "snake_case")] @@ -249,6 +262,7 @@ pub enum Config { Paligemma(Paligemma), Gemma, Gemma2, + Gemma3(Gemma3), Cohere, Drbx, Falcon, diff --git a/router/src/infer/chat_template.rs b/router/src/infer/chat_template.rs index ebba5fd39..a66872037 100644 --- a/router/src/infer/chat_template.rs +++ b/router/src/infer/chat_template.rs @@ -33,7 +33,16 @@ impl ChatTemplate { let mut env = Box::new(Environment::new()); // enable things like .strip() or .capitalize() env.set_unknown_method_callback(pycompat::unknown_method_callback); - let template_str = template.into_boxed_str(); + + // TODO: replace with better solution + // hack to adjust gemma3 template for debug + // replace 'messages[0]['content'][0]['text']' with 'messages[0]['content']' + let mutated_template = template.replace( + "messages[0]['content'][0]['text']", + "messages[0]['content']", + ); + + let template_str = mutated_template.into_boxed_str(); env.add_function("raise_exception", raise_exception); env.add_function("strftime_now", strftime_now); tracing::debug!("Loading template: {}", template_str); @@ -123,8 +132,8 @@ mod tests { use crate::infer::chat_template::{raise_exception, strftime_now}; use crate::infer::ChatTemplate; use crate::{ - ChatTemplateInputs, Message, MessageBody, MessageContent, TextMessage, - TokenizerConfigToken, Tool, + ChatTemplateInputs, Message, MessageBody, MessageChunk, MessageContent, TextMessage, + TokenizerConfigToken, Tool, Url, }; use chrono::Local; use minijinja::Environment; @@ -1230,4 +1239,98 @@ TOOL CALL ID: 0 let expected = "<|start_header_id|>system<|end_header_id|>\n\nEnvironment: ipython\nCutting Knowledge Date: December 2023\nToday Date: 26 Jul 2024\n\nYoure a helpful assistant! Answer the users question best you can.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nGiven the following functions, please respond with a JSON for a function call with its proper arguments that best answers the given prompt.\n\nRespond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.Do not use variables.\n\n{\n \"function\": {\n \"arguments\": \"{\\\"type\\\":\\\"object\\\",\\\"properties\\\":{\\\"location\\\":{\\\"type\\\":\\\"string\\\",\\\"description\\\":\\\"The city and state, e.g. 
San Francisco, CA\\\"},\\\"format\\\":{\\\"type\\\":\\\"string\\\",\\\"enum\\\":[\\\"celsius\\\",\\\"fahrenheit\\\"],\\\"description\\\":\\\"The temperature unit to use. Infer this from the users location.\\\"}},\\\"required\\\":[\\\"location\\\",\\\"format\\\"]}\",\n \"description\": \"Get the current weather\",\n \"name\": \"get_current_weather\"\n },\n \"type\": \"function\"\n}\n\nWhat is the weather like in Brooklyn, New York?\n---\nThis default prompt will be used<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n".to_string(); assert_eq!(result.unwrap(), expected); } + + #[test] + fn test_chat_template_with_special_system_prompt() { + // chat template from gemma3 + let ct = ChatTemplate::new( + r#"{{ bos_token }} +{%- if messages[0]['role'] == 'system' -%} + {%- set first_user_prefix = messages[0]['content'][0]['text'] + ' + +' -%} + {%- set loop_messages = messages[1:] -%} +{%- else -%} + {%- set first_user_prefix = "" -%} + {%- set loop_messages = messages -%} +{%- endif -%} +{%- for message in loop_messages -%} + {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%} + {{ raise_exception("Conversation roles must alternate user/assistant/user/assistant/...") }} + {%- endif -%} + {%- if (message['role'] == 'assistant') -%} + {%- set role = "model" -%} + {%- else -%} + {%- set role = message['role'] -%} + {%- endif -%} + {{ '' + role + ' +' + (first_user_prefix if loop.first else "") }} + {%- if message['content'] is string -%} + {{ message['content'] | trim }} + {%- elif message['content'] is iterable -%} + {%- for item in message['content'] -%} + {%- if item['type'] == 'image' -%} + {{ '' }} + {%- elif item['type'] == 'text' -%} + {{ item['text'] | trim }} + {%- endif -%} + {%- endfor -%} + {%- else -%} + {{ raise_exception("Invalid content type") }} + {%- endif -%} + {{ ' +' }} +{%- endfor -%} +{%- if add_generation_prompt -%} + {{'model +'}} +{%- endif -%} +"# + .to_string(), + Some(TokenizerConfigToken::String("".to_string())), + Some(TokenizerConfigToken::String("".to_string())), + ); + let msgs: Vec = vec![ + Message { + name: None, + role: "system".to_string(), + body: MessageBody::Content { + content: MessageContent::MultipleChunks(vec![MessageChunk::Text { + text: "You are a helpful assistant.".to_string(), + }]), + }, + }, + Message { + name: None, + role: "user".to_string(), + body: MessageBody::Content { + content: MessageContent::MultipleChunks(vec![ + MessageChunk::Text { + text: "I'm already using this supplement ".to_string(), + }, + MessageChunk::ImageUrl { + image_url: Url { + url: "https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/IMG_3018.JPG".to_string() + }, + }, + MessageChunk::Text { + text: "and I want to use this one too ".to_string() + }, + MessageChunk::ImageUrl { + image_url: Url { + url: "https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/IMG_3015.jpg".to_string() + }, + }, + MessageChunk::Text { + text: " what are cautions?".to_string() + }, + ]), + }, + }, + ]; + + let result = ct.apply(msgs, None); + let expected = "user\nYou are a helpful assistant.\n\nI'm already using this supplement ![](https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/IMG_3018.JPG)and I want to use this one too ![](https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/IMG_3015.jpg) what are cautions?\nmodel\n".to_string(); + assert_eq!(result.unwrap(), expected); + } } diff --git a/router/src/lib.rs b/router/src/lib.rs index eac412763..e8b8f6632 100644 --- a/router/src/lib.rs +++ b/router/src/lib.rs 
@@ -152,6 +152,11 @@ impl HubTokenizerConfig {
     }
 }
 
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
+pub struct ChatTemplateStandalone {
+    pub chat_template: ChatTemplateVersions,
+}
+
 #[derive(Debug, Clone, Deserialize, Serialize, PartialEq)]
 #[serde(untagged)]
 pub enum TokenizerConfigToken {
@@ -173,6 +178,7 @@ impl TokenizerConfigToken {
 pub enum HubPreprocessorConfig {
     Idefics2Processor(Idefics2Preprocessor),
     Idefics3Processor(Idefics2Preprocessor),
+    Gemma3Processor(Gemma3Processor),
 }
 
 impl HubPreprocessorConfig {
@@ -188,6 +194,12 @@ pub struct Idefics2Preprocessor {
     do_image_splitting: bool,
 }
 
+#[derive(Clone, Debug, Serialize, Deserialize)]
+pub struct Gemma3Processor {
+    #[serde(default)]
+    do_image_splitting: bool,
+}
+
 #[derive(Debug, Clone, Deserialize, Default)]
 pub struct HubProcessorConfig {
     pub chat_template: Option<ChatTemplateVersions>,
diff --git a/router/src/server.rs b/router/src/server.rs
index 689b2f502..d0fa15d5f 100644
--- a/router/src/server.rs
+++ b/router/src/server.rs
@@ -1532,6 +1532,7 @@ pub async fn run(
         tokenizer_config_filename,
         preprocessor_config_filename,
         processor_config_filename,
+        chat_template_filename,
         model_info,
     ) = match api {
         Type::None => (
@@ -1539,6 +1540,7 @@ pub async fn run(
             Some(local_path.join("tokenizer_config.json")),
             Some(local_path.join("preprocessor_config.json")),
             Some(local_path.join("processor_config.json")),
+            Some(local_path.join("chat_template.json")),
             None,
         ),
         Type::Api(api) => {
@@ -1552,6 +1554,7 @@ pub async fn run(
             let tokenizer_config_filename = api_repo.get("tokenizer_config.json").await.ok();
             let preprocessor_config_filename = api_repo.get("preprocessor_config.json").await.ok();
             let processor_config_filename = api_repo.get("processor_config.json").await.ok();
+            let chat_template_filename = api_repo.get("chat_template.json").await.ok();
 
             let model_info = if let Some(model_info) = get_hub_model_info(&api_repo).await {
                 Some(model_info)
@@ -1564,6 +1567,7 @@ pub async fn run(
                 tokenizer_config_filename,
                 preprocessor_config_filename,
                 processor_config_filename,
+                chat_template_filename,
                 model_info,
             )
         }
@@ -1579,11 +1583,23 @@ pub async fn run(
                 repo.get("tokenizer_config.json"),
                 repo.get("preprocessor_config.json"),
                 repo.get("processor_config.json"),
+                repo.get("chat_template.json"),
                 None,
             )
         }
     };
 
+    // if chat_template_filename is present, load the chat template
+    let chat_template: Option<ChatTemplateVersions> = chat_template_filename
+        .and_then(|f| std::fs::read_to_string(f).ok())
+        .and_then(|c| {
+            let res = serde_json::from_str::<ChatTemplateStandalone>(&c);
+            if let Err(e) = &res {
+                tracing::warn!("Could not parse chat template {e:?}");
+            }
+            res.ok().map(|t| t.chat_template)
+        });
+
     // Read the JSON contents of the file as an instance of 'HubTokenizerConfig'.
tracing::warn!("Tokenizer_config {tokenizer_config_path:?} - {tokenizer_config_filename:?}"); let tokenizer_config: Option = if let Some(filename) = tokenizer_config_path @@ -1592,11 +1608,16 @@ pub async fn run( } else { tokenizer_config_filename.and_then(HubTokenizerConfig::from_file) }; - let tokenizer_config = tokenizer_config.unwrap_or_else(|| { + let mut tokenizer_config = tokenizer_config.unwrap_or_else(|| { tracing::warn!("Could not find tokenizer config locally and no API specified"); HubTokenizerConfig::default() }); + if chat_template.is_some() { + tracing::info!("Using chat template from chat_template.json"); + tokenizer_config.chat_template = chat_template; + } + let tokenizer: Result = { use pyo3::prelude::*; Python::with_gil(|py| -> PyResult<()> { diff --git a/router/src/validation.rs b/router/src/validation.rs index 320e7f03f..87b28eb74 100644 --- a/router/src/validation.rs +++ b/router/src/validation.rs @@ -18,6 +18,7 @@ use std::sync::Arc; use thiserror::Error; use tokio::sync::mpsc; use tokio::sync::oneshot; +use tracing::warn; use tracing::{instrument, Span}; use {once_cell::sync::Lazy, regex::Regex}; @@ -694,6 +695,14 @@ fn image_tokens( "<|vision_start|>{:?}<|vision_end|>", "<|image_pad|>".repeat(config.get_number_of_features(height, width)) ), + Gemma3(_config) => { + // TODO: prefer using the config to determine the number of features + let num_mm_soft_tokens_per_image = 256; + format!( + "\n\n{:?}\n\n", + "".repeat(num_mm_soft_tokens_per_image) + ) + } _ => unimplemented!("Images tokens are not supported for this model configuration"), } } @@ -721,8 +730,8 @@ fn prepare_input( static RE: Lazy = Lazy::new(|| Regex::new(r"!\[\]\([^\)]*\)").unwrap()); let (tokenizer_query, input_chunks) = match config { Some( - config @ (Idefics | Mllama | Idefics2(_) | Idefics3(_) | Paligemma(_) | LlavaNext(_) - | Qwen2Vl(_) | Qwen2_5Vl(_)), + config @ (Idefics | Mllama | Idefics2(_) | Idefics3(_) | Gemma3(_) | Paligemma(_) + | LlavaNext(_) | Qwen2Vl(_) | Qwen2_5Vl(_)), ) => { let mut input_chunks = Vec::new(); let mut tokenizer_query = String::with_capacity(inputs.len()); diff --git a/server/text_generation_server/models/__init__.py b/server/text_generation_server/models/__init__.py index ab12429be..0fdc009ca 100644 --- a/server/text_generation_server/models/__init__.py +++ b/server/text_generation_server/models/__init__.py @@ -106,6 +106,17 @@ try: from text_generation_server.models.custom_modeling.flash_gemma2_modeling import ( FlashGemma2ForCausalLM, ) + from text_generation_server.models.custom_modeling.flash_gemma3_modeling import ( + FlashGemma3ForCausalLM, + Gemma3ForConditionalGeneration, + ) + from text_generation_server.models.custom_modeling.gemma3.processing_gemma3 import ( + Gemma3Processor, + ) + from text_generation_server.models.custom_modeling.gemma3.configuration_gemma3 import ( + Gemma3Config, + Gemma3TextConfig, + ) from text_generation_server.models.custom_modeling.flash_dbrx_modeling import ( FlashDbrxForCausalLM, DbrxConfig, @@ -258,6 +269,16 @@ class ModelType(enum.Enum): "name": "Gemma2", "url": "https://huggingface.co/collections/google/gemma-2-release-667d6600fd5220e7b967f315", } + GEMMA3 = { + "type": "gemma3", + "name": "Gemma3", + "url": "https://huggingface.co/collections/google/gemma-3", + } + GEMMA3_TEXT = { + "type": "gemma3_text", + "name": "Gemma3 Text", + "url": "https://huggingface.co/collections/google/gemma-3", + } COHERE = { "type": "cohere", "name": "Cohere", @@ -1094,6 +1115,83 @@ def get_model( dtype=dtype, 
trust_remote_code=trust_remote_code, ) + elif model_type == GEMMA3_TEXT: + if FLASH_ATTENTION: + return FlashCausalLM( + model_id=model_id, + model_class=FlashGemma3ForCausalLM, + revision=revision, + quantize=quantize, + speculator=speculator, + dtype=dtype, + kv_cache_dtype=kv_cache_dtype, + # TODO: once implemented in transformers, use the config class + # and processor class from there. + config_class=Gemma3TextConfig, + # Works better for these models + default_dtype=torch.bfloat16, + trust_remote_code=trust_remote_code, + lora_adapter_ids=lora_adapter_ids, + ) + elif FLASH_TRANSFORMERS_BACKEND: + return TransformersFlashCausalLM.fallback( + model_id, + revision, + quantize=quantize, + speculator=speculator, + dtype=dtype, + trust_remote_code=trust_remote_code, + ) + elif sharded: + raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Sharded Gemma3")) + else: + return CausalLM.fallback( + model_id, + revision, + quantize=quantize, + speculator=speculator, + dtype=dtype, + trust_remote_code=trust_remote_code, + ) + elif model_type == GEMMA3: + if FLASH_ATTENTION: + # TODO: Use VlmCausalLM when image support is added. + return VlmCausalLM( + model_id=model_id, + model_class=Gemma3ForConditionalGeneration, + revision=revision, + quantize=quantize, + speculator=speculator, + dtype=dtype, + kv_cache_dtype=kv_cache_dtype, + # TODO: once implemented in transformers, use the config class + # and processor class from there. + config_class=Gemma3Config, + processor_class=Gemma3Processor, + default_dtype=torch.bfloat16, + trust_remote_code=trust_remote_code, + lora_adapter_ids=lora_adapter_ids, + ) + elif FLASH_TRANSFORMERS_BACKEND: + return TransformersFlashCausalLM.fallback( + model_id, + revision, + quantize=quantize, + speculator=speculator, + dtype=dtype, + trust_remote_code=trust_remote_code, + ) + elif sharded: + raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Sharded Gemma3")) + else: + return CausalLM.fallback( + model_id, + revision, + quantize=quantize, + speculator=speculator, + dtype=dtype, + trust_remote_code=trust_remote_code, + ) if model_type == COHERE: if FLASH_ATTENTION: diff --git a/server/text_generation_server/models/custom_modeling/flash_gemma3_modeling.py b/server/text_generation_server/models/custom_modeling/flash_gemma3_modeling.py new file mode 100644 index 000000000..085f57ef1 --- /dev/null +++ b/server/text_generation_server/models/custom_modeling/flash_gemma3_modeling.py @@ -0,0 +1,922 @@ +# coding=utf-8 +# Copyright 2024 HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import torch +import torch.distributed +from torch import nn +from typing import Optional, List, Tuple +import copy + +from text_generation_server.layers import ( + TensorParallelColumnLinear, + TensorParallelEmbedding, + TensorParallelRowLinear, + get_linear, + # + SpeculativeHead, + TensorParallelMultiAdapterLinear, + TensorParallelAdapterRowLinear, +) + +import torch +import torch.nn.functional as F + + +from text_generation_server.models.custom_modeling.vlm import ( + load_text_model, + load_vision_model, +) + + +from text_generation_server.layers.attention.kv_cache import get_kv_scales +from text_generation_server.layers.rotary import PositionRotaryEmbedding +from text_generation_server.layers.layernorm import ( + FastRMSNorm, +) +from text_generation_server.utils.weights import UnquantizedWeight +from transformers.activations import ACT2FN +from text_generation_server.layers.attention import ( + paged_attention, + attention, + Seqlen, +) + + +ATTENTION_TYPE_GLOBAL = "global" +ATTENTION_TYPE_LOCAL = "local_sliding" + + +class Gemma3FastRMSNorm(FastRMSNorm): + @classmethod + def load(cls, prefix: str, weights, eps=1e-6): + dtype = weights.dtype + weights.dtype = torch.float32 + weight = weights.get_tensor(f"{prefix}.weight") + 1 + weights.dtype = dtype + new = cls(weight, eps) + new.dtype = dtype + return new + + # perform the multiplication in full precision and downcast after + def forward(self, hidden_states, residual=None): + if residual is not None: + hidden_states += residual + residual = hidden_states + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + hidden_states = hidden_states * self.weight + return hidden_states.to(self.dtype), residual + + +def load_attention(config, prefix: str, weights): + if config.num_attention_heads != config.num_key_value_heads: + return _load_gqa(config, prefix, weights) + else: + return TensorParallelColumnLinear.load_multi( + config, + prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"], + dim=0, + weights=weights, + bias=False, + ) + + +def _load_gqa(config, prefix: str, weights): + assert config.num_attention_heads % weights.process_group.size() == 0 + + weight = weights.get_multi_weights_col( + prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"], + dim=0, + ) + + if isinstance(weight, UnquantizedWeight): + weight.weight = weight.weight.to(dtype=weights.dtype).to(device=weights.device) + + head_size = config.head_dim + num_heads = config.num_attention_heads // weights.process_group.size() + num_key_value_heads = config.num_key_value_heads // weights.process_group.size() + assert list(weight.weight.shape) == [ + (num_heads + 2 * num_key_value_heads) * head_size, + config.hidden_size, + ], f"{list(weight.weight.shape)} != {[(num_heads + 2 * config.num_key_value_heads) * head_size, config.hidden_size]}" + + return TensorParallelColumnLinear(get_linear(weight, bias=None)) + + +class FlashGemma3Attention(torch.nn.Module): + def __init__( + self, prefix: str, config, weights, layer_id, causal: bool, is_sliding: bool + ): + super().__init__() + self.num_heads = config.num_attention_heads + self.head_size = config.head_dim + self.causal = causal + if is_sliding: + self.window_size = config.sliding_window + # TODO: remove this hack to support local sliding window + config = copy.deepcopy(config) + config.rope_scaling = dict(rope_type="default") + self.rotary_emb = 
PositionRotaryEmbedding.static( + config=config, + dim=config.head_dim, + base=config.rope_local_base_freq, + device=weights.device, + ) + else: + self.window_size = -1 + self.rotary_emb = PositionRotaryEmbedding.static( + config=config, + dim=config.head_dim, + base=config.rope_theta, + device=weights.device, + ) + + self.softmax_scale = ( + config.query_pre_attn_scalar**-0.5 + if config.query_pre_attn_scalar is not None + else None + ) + if self.num_heads % weights.process_group.size() != 0: + raise ValueError( + f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} " + f"and `num_shards`: {weights.process_group.size()}" + ) + self.num_heads = self.num_heads // weights.process_group.size() + self.num_key_value_heads = ( + config.num_key_value_heads // weights.process_group.size() + ) + self.softcap = None # config.attn_logit_softcapping + + query_key_value = load_attention(config, prefix, weights) + self.query_key_value = TensorParallelMultiAdapterLinear.load( + query_key_value, + layer_id, + ["q_proj", "k_proj", "v_proj"], + sizes=[ + self.head_size * config.num_attention_heads, + self.head_size * config.num_key_value_heads, + self.head_size * config.num_key_value_heads, + ], + process_group=weights.process_group, + ) + self.kv_scales = get_kv_scales(weights, f"{prefix}") + + o_proj = TensorParallelRowLinear.load( + config, + prefix=f"{prefix}.o_proj", + weights=weights, + bias=False, + ) + self.o_proj = TensorParallelAdapterRowLinear.load( + o_proj, + layer_id, + "o_proj", + process_group=weights.process_group, + ) + + self.num_groups = self.num_heads // self.num_key_value_heads + self.kv_head_mapping = torch.arange( + 0, self.num_key_value_heads, dtype=torch.int32, device=weights.device + ).repeat_interleave(self.num_groups) + self.q_norm = Gemma3FastRMSNorm.load( + prefix=f"{prefix}.q_norm", weights=weights, eps=config.rms_norm_eps + ) + self.k_norm = Gemma3FastRMSNorm.load( + prefix=f"{prefix}.k_norm", weights=weights, eps=config.rms_norm_eps + ) + self.enable_gqa = self.num_heads != self.num_key_value_heads + + def forward( + self, + hidden_states, + cos, + sin, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + seqlen, + max_s, + adapter_data, + attention_mask, + ): + + qkv = self.query_key_value(hidden_states, adapter_data) + query, kv = qkv.split( + [ + self.head_size * self.num_heads, + 2 * self.head_size * self.num_key_value_heads, + ], + dim=1, + ) + + kv = kv.view(-1, 2, self.num_key_value_heads * self.head_size) + key = kv[:, 0] + value = kv[:, 1] + + query = query.reshape(-1, self.head_size) + key = key.reshape(-1, self.head_size) + + query, _ = self.q_norm(query.contiguous()) + key, _ = self.k_norm(key.contiguous()) + + query = query.view(-1, self.num_heads, self.head_size) + key = key.view(-1, self.num_key_value_heads, self.head_size) + value = value.view(-1, self.num_key_value_heads, self.head_size) + + self.rotary_emb(query, key, cos, sin) + + kv_cache.store( + key=key, + value=value, + slots=slots, + kv_scales=self.kv_scales, + ) + + # Prefill + if cu_seqlen_prefill is not None: + if attention_mask is None: + # flash attention + attn_output = attention( + query=query, + key=key, + value=value, + kv_cache=kv_cache, + kv_scales=self.kv_scales, + seqlen=seqlen, + block_tables=block_tables, + softmax_scale=self.softmax_scale, + window_size_left=self.window_size, + softcap=self.softcap, + ) + else: + lengths = cu_seqlen_prefill[1:] - cu_seqlen_prefill[:-1] + + # Split tensors using vectorized split + query_list = torch.split(query, 
lengths.tolist(), dim=0) + key_list = torch.split(key, lengths.tolist(), dim=0) + value_list = torch.split(value, lengths.tolist(), dim=0) + + padded_query = torch.nn.utils.rnn.pad_sequence( + query_list, batch_first=True + ) + padded_key = torch.nn.utils.rnn.pad_sequence(key_list, batch_first=True) + padded_value = torch.nn.utils.rnn.pad_sequence( + value_list, batch_first=True + ) + + padded_query = padded_query.transpose(1, 2).contiguous() + padded_key = padded_key.transpose(1, 2).contiguous() + padded_value = padded_value.transpose(1, 2).contiguous() + zeros_to_add = torch.zeros( + padded_key.size(0), + self.num_key_value_heads, + 1, + self.head_size, + dtype=padded_key.dtype, + device=padded_key.device, + ) + key_states = torch.cat([padded_key, zeros_to_add], dim=2) + value_states = torch.cat([padded_value, zeros_to_add], dim=2) + + # Compute attention + attn_output = F.scaled_dot_product_attention( + padded_query, + key_states, + value_states, + attn_mask=attention_mask, + scale=self.softmax_scale, + enable_gqa=self.enable_gqa, + ) + + attn_output = attn_output.transpose( + 1, 2 + ) # [batch_size, seq_len, num_heads, head_dim] + max_seq_len = padded_query.size(2) + seq_range = torch.arange( + max_seq_len, device=padded_query.device + ).unsqueeze(0) + lengths_tensor = torch.tensor( + lengths, device=padded_query.device + ).unsqueeze(1) + mask = seq_range < lengths_tensor # [batch, max_seq_len] + attn_output = attn_output[mask] # [total_seq_len, num_heads, head_dim] + + # Decode + else: + attn_output = paged_attention( + query, + kv_cache, + self.kv_head_mapping, + self.softmax_scale, + block_tables, + seqlen, + max_s, + softcap=self.softcap, + kv_scales=self.kv_scales, + ) + + return self.o_proj( + attn_output.view(-1, self.num_heads * self.head_size), adapter_data + ) + + +class Gemma3MLP(nn.Module): + def __init__(self, prefix, config, weights, layer_id): + super().__init__() + act = config.hidden_activation + self.act = ( + ACT2FN[act] + if "gelu" not in act + else lambda x: torch.nn.functional.gelu( + x, + approximate=( + "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none" + ), + ) + ) + # Fuse gate and up proj + gate_up_proj = TensorParallelColumnLinear.load_multi( + config, + prefixes=[f"{prefix}.gate_proj", f"{prefix}.up_proj"], + weights=weights, + dim=0, + bias=False, + ) + self.gate_up_proj = TensorParallelMultiAdapterLinear.load( + gate_up_proj, + layer_id, + ["gate_proj", "up_proj"], + sizes=[ + config.intermediate_size, + config.intermediate_size, + ], + process_group=weights.process_group, + ) + + down_proj = TensorParallelRowLinear.load( + config, + prefix=f"{prefix}.down_proj", + weights=weights, + bias=False, + ) + self.down_proj = TensorParallelAdapterRowLinear.load( + down_proj, + layer_id, + "down_proj", + process_group=weights.process_group, + ) + + self.intermediate_size = ( + config.intermediate_size // weights.process_group.size() + ) + + def forward(self, hidden_states, adapter_data): + gate_up_states = self.gate_up_proj(hidden_states, adapter_data) + gate_up_states = gate_up_states.view(-1, 2, self.intermediate_size) + return self.down_proj( + self.act(gate_up_states[:, 0]) * gate_up_states[:, 1], adapter_data + ) + + +class FlashGemma3Layer(nn.Module): + def __init__( + self, prefix: str, config, weights, layer_id, causal: bool, is_sliding: bool + ): + super().__init__() + self.self_attn = FlashGemma3Attention( + prefix=f"{prefix}.self_attn", + config=config, + weights=weights, + layer_id=layer_id, + causal=causal, + is_sliding=is_sliding, + ) + 
self.mlp = Gemma3MLP( + prefix=f"{prefix}.mlp", config=config, weights=weights, layer_id=layer_id + ) + + self.input_layernorm = Gemma3FastRMSNorm.load( + prefix=f"{prefix}.input_layernorm", weights=weights, eps=config.rms_norm_eps + ) + self.post_attention_layernorm = Gemma3FastRMSNorm.load( + prefix=f"{prefix}.post_attention_layernorm", + weights=weights, + eps=config.rms_norm_eps, + ) + self.pre_feedforward_layernorm = Gemma3FastRMSNorm.load( + prefix=f"{prefix}.pre_feedforward_layernorm", + weights=weights, + eps=config.rms_norm_eps, + ) + self.post_feedforward_layernorm = Gemma3FastRMSNorm.load( + prefix=f"{prefix}.post_feedforward_layernorm", + weights=weights, + eps=config.rms_norm_eps, + ) + + def forward( + self, + hidden_states, + residual, + cos, + sin, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + seqlen, + max_s, + adapter_data, + attention_mask, + ): + normed_hidden_states, res = self.input_layernorm(hidden_states, residual) + + # Self Attention + attn_output = self.self_attn( + normed_hidden_states, + cos, + sin, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + seqlen, + max_s, + adapter_data, + attention_mask, + ) + + # faster post attention rms norm + normed_attn_res_output, _ = self.post_attention_layernorm(attn_output) + normed_attn_res_output = normed_attn_res_output + res + res = normed_attn_res_output + + pre_normed, _ = self.pre_feedforward_layernorm(normed_attn_res_output) + mlp_output = self.mlp(pre_normed, adapter_data) + post_hidden_states, _ = self.post_feedforward_layernorm(mlp_output) + + return post_hidden_states, normed_attn_res_output + + +class FlashGemma3Model(torch.nn.Module): + def __init__(self, prefix: str, config, weights, causal: bool): + super().__init__() + + process_group = weights.process_group + self.tp_rank = process_group.rank() + self.tp_world_size = process_group.size() + + self.layers = nn.ModuleList( + [ + FlashGemma3Layer( + prefix=f"{prefix}.layers.{layer_id}", + config=config, + weights=weights, + layer_id=layer_id, + causal=causal, + is_sliding=bool((layer_id + 1) % config.sliding_window_pattern), + ) + for layer_id in range(config.num_hidden_layers) + ] + ) + self.norm = Gemma3FastRMSNorm.load( + prefix=f"{prefix}.norm", weights=weights, eps=config.rms_norm_eps + ) + + self.head_size = self.layers[0].self_attn.head_size + self.num_heads = self.layers[0].self_attn.num_heads + self.num_key_value_heads = self.layers[0].self_attn.num_key_value_heads + + def forward( + self, + inputs_embeds: torch.Tensor, + position_ids: torch.Tensor, + cu_seqlen_prefill: Optional[torch.Tensor], + kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], + block_tables: torch.Tensor, + slots: torch.Tensor, + seqlen: Seqlen, + max_s: int, + adapter_data: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + hidden_states = inputs_embeds + + # Get rotary cos and sin for this forward + # Avoid to index in each layer + + residual = None + for i, layer in enumerate(self.layers): + cos, sin = self.layers[i].self_attn.rotary_emb.get_cos_sin( + position_ids, max_s, hidden_states.dtype + ) + + # apply sliding window mask if needed + if layer.self_attn.window_size > 0 and attention_mask is not None: + min_dtype = torch.finfo(hidden_states.dtype).min + # prefill may be larger than sliding window + effective_seq_len = max( + position_ids.shape[0], self.layers[i].self_attn.window_size + ) + sliding_window_mask = torch.tril( + torch.ones_like(attention_mask, dtype=torch.bool), + 
diagonal=-self.layers[i].self_attn.window_size, + ) + attention_mask = torch.where( + sliding_window_mask, min_dtype, attention_mask + ) + offset = max(0, position_ids.shape[0] - effective_seq_len) + attention_mask = attention_mask[ + :, :, offset : offset + effective_seq_len + ] + + hidden_states, residual = layer( + hidden_states, + residual, + cos, + sin, + cu_seqlen_prefill, + kv_cache[i], + block_tables, + slots, + seqlen, + max_s, + adapter_data, + attention_mask, + ) + + hidden_states, _ = self.norm(hidden_states, residual) + + return hidden_states + + +class FlashGemma3ForCausalLM(torch.nn.Module): + def __init__(self, prefix: str, config, weights, *, causal: bool = True): + super().__init__() + + embed_norm = config.hidden_size**0.5 + if not prefix: + prefix = "model" + else: + prefix = f"{prefix}.model" + + self.embed_tokens = TensorParallelEmbedding( + prefix=f"{prefix}.embed_tokens", weights=weights + ) + self.embed_tokens.weight *= embed_norm + + self.model = FlashGemma3Model( + prefix=prefix, config=config, weights=weights, causal=causal + ) + self.lm_head = SpeculativeHead.load( + prefix=( + f"{prefix}.embed_tokens" + if config.tie_word_embeddings + else f"{prefix}.lm_head" + ), + config=config, + weights=weights, + ) + # self.softcap = config.attn_logit_softcapping + # assert isinstance(self.softcap, float) + self.softcap = None + + def forward( + self, + input_ids: torch.Tensor, + position_ids: torch.Tensor, + cu_seqlen_prefill: Optional[torch.Tensor], + kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], + block_tables: torch.Tensor, + slots: torch.Tensor, + seqlen: Seqlen, + max_s: int, + prefill_cache_indices: Optional[torch.Tensor], + lm_head_indices: Optional[torch.Tensor] = None, + adapter_data: Optional[torch.Tensor] = None, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + input_embeds = self.embed_tokens(input_ids) + + hidden_states = self.model( + input_embeds, + position_ids, + cu_seqlen_prefill, + kv_cache, + block_tables, + slots, + seqlen, + max_s, + adapter_data, + ) + if lm_head_indices is not None: + hidden_states = hidden_states[lm_head_indices] + logits, speculative_logits = self.lm_head(hidden_states) + + return logits, speculative_logits + + +class Gemma3MultimodalInputProjection(torch.nn.Module): + def __init__(self, prefix, config, weights): + super().__init__() + + self.mm_input_projection_weight = weights.get_tensor( + "multi_modal_projector.mm_input_projection_weight" + ) + + self.mm_soft_emb_norm = Gemma3FastRMSNorm.load( + prefix=f"{prefix}.mm_soft_emb_norm", + weights=weights, + eps=config.vision_config.layer_norm_eps, + ) + + self.patches_per_image = int( + config.vision_config.image_size // config.vision_config.patch_size + ) + self.tokens_per_side = int(config.mm_tokens_per_image**0.5) + self.kernel_size = self.patches_per_image // self.tokens_per_side + self.avg_pool = nn.AvgPool2d( + kernel_size=self.kernel_size, stride=self.kernel_size + ) + + def forward(self, vision_outputs: torch.Tensor): + batch_size, _, seq_length = vision_outputs.shape + + reshaped_vision_outputs = vision_outputs.transpose(1, 2) + reshaped_vision_outputs = reshaped_vision_outputs.reshape( + batch_size, seq_length, self.patches_per_image, self.patches_per_image + ) + reshaped_vision_outputs = reshaped_vision_outputs.contiguous() + + pooled_vision_outputs = self.avg_pool(reshaped_vision_outputs) + pooled_vision_outputs = pooled_vision_outputs.flatten(2) + pooled_vision_outputs = pooled_vision_outputs.transpose(1, 2) + + normed_vision_outputs, _ = 
self.mm_soft_emb_norm(pooled_vision_outputs) + + projected_vision_outputs = torch.matmul( + normed_vision_outputs, self.mm_input_projection_weight + ) + return projected_vision_outputs.type_as(vision_outputs) + + +class Gemma3ForConditionalGeneration(nn.Module): + def __init__(self, prefix, config, weights): + super().__init__() + + self.config = config + + if config.vision_config is not None: + + config.vision_config.quantize = config.quantize + + self.post_vision_model_layernorm = nn.LayerNorm.load( + prefix="vision_tower.vision_model.post_layernorm", + weights=weights, + eps=config.vision_config.layer_norm_eps, + ) + + self.multimodal_projector = Gemma3MultimodalInputProjection( + prefix="multi_modal_projector", + config=config, + weights=weights, + ) + + text_config = config.text_config + text_config.speculator = config.speculator + text_config.quantize = config.quantize + + self.vision_model = load_vision_model( + prefix="vision_tower" if not prefix else f"{prefix}.vision_tower", + config=config.vision_config, + weights=weights, + ) + + self.text_model = load_text_model( + prefix="language_model" if not prefix else f"{prefix}.language_model", + config=config.text_config, + weights=weights, + ) + else: + config.text_config.quantize = config.quantize + config.text_config.speculator = config.speculator + self.text_model = load_text_model( + prefix=prefix, + config=config.text_config, + weights=weights, + ) + + self.pad_token_id = ( + config.pad_token_id if config.pad_token_id is not None else -1 + ) + + def get_image_token_mask(self, input_ids): + device = input_ids.device + + start_token_id = self.config.boi_token_index + K = self.config.mm_tokens_per_image + + mask = torch.zeros_like(input_ids, dtype=torch.bool, device=device) + start_positions = (input_ids == start_token_id).nonzero(as_tuple=True)[0] + mask_indices = start_positions.unsqueeze(1) + torch.arange( + 1, K + 1, device=device + ).unsqueeze(0) + + valid_mask = mask_indices < input_ids.size(0) + mask_indices = mask_indices[valid_mask] + mask[mask_indices] = True + + return mask + + def get_attention_mask( + self, input_ids, max_s, cu_seqlen_prefill, dtype, image_token_mask + ): + device = input_ids.device + min_dtype = torch.finfo(dtype).min + + lengths = (cu_seqlen_prefill[1:] - cu_seqlen_prefill[:-1]).tolist() + batch_size = len(lengths) + + sequence_length = max(lengths) + target_length = max_s + # Create the padding mask from the computed lengths. + # pad_mask: [batch, sequence_length] where True indicates valid tokens. + seq_range = torch.arange(sequence_length, device=device).unsqueeze(0) + lengths_tensor = torch.tensor(lengths, device=device).unsqueeze(1) + pad_mask = seq_range < lengths_tensor # shape: [batch, sequence_length] + + # Build the base causal mask (for non-image tokens): + causal_mask = torch.tril( + torch.ones( + (sequence_length, sequence_length), dtype=torch.bool, device=device + ) + ) + base_mask = pad_mask.unsqueeze(2) & pad_mask.unsqueeze( + 1 + ) # [batch, sequence_length, sequence_length] + base_mask = base_mask & causal_mask.unsqueeze(0) # apply causal constraint + + image_token_mask = torch.nn.utils.rnn.pad_sequence( + torch.split(image_token_mask, lengths), batch_first=True, padding_value=0 + ) + bidirectional_mask = image_token_mask.unsqueeze(2) & image_token_mask.unsqueeze( + 1 + ) + + # Combine the causal base mask and the bidirectional mask. 
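+        # Text tokens keep the causal/padding constraint from base_mask, while
+        # image tokens of the same sequence may attend to one another in both
+        # directions through bidirectional_mask.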
+ combined_mask = torch.logical_or( + base_mask.unsqueeze(1), bidirectional_mask.unsqueeze(1) + ).to(device) + # combined_mask now has shape [batch, 1, sequence_length, sequence_length] + + full_attention_mask = torch.zeros( + (batch_size, 1, sequence_length, target_length), + device=device, + dtype=torch.bool, + ) + full_attention_mask[:, :, :, :sequence_length] = combined_mask + + final_attention_mask = torch.where(full_attention_mask, 0, min_dtype).to(device) + + return final_attention_mask + + def forward( + self, + input_ids: torch.Tensor, + position_ids: torch.Tensor, + cu_seqlen_prefill: Optional[torch.Tensor], + kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], + block_tables: torch.Tensor, + slots: torch.Tensor, + seqlen: Seqlen, + max_s: int, + prefill_cache_indices: Optional[torch.Tensor] = None, + lm_head_indices: Optional[torch.Tensor] = None, + pixel_values: torch.FloatTensor = None, + # Unused here + attention_mask: Optional[torch.BoolTensor] = None, + pixel_attention_mask: Optional[torch.BoolTensor] = None, + image_sizes: Optional[torch.Tensor] = None, + adapter_data: Optional[torch.Tensor] = None, + image_grid_thw: Optional[torch.LongTensor] = None, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + inputs_embeds = self.text_model.embed_tokens(input_ids) + if cu_seqlen_prefill is not None: + max_s += 1 + position_ids += 1 + + if pixel_values is not None: + pixel_values = pixel_values.to(dtype=inputs_embeds.dtype) + image_outputs = self.vision_model(pixel_values) + vision_outputs = self.post_vision_model_layernorm( + image_outputs.last_hidden_state + ) + image_features = self.multimodal_projector(vision_outputs) + + image_token_mask = (input_ids == self.config.image_token_index).to( + input_ids.device + ) + inputs_embeds[image_token_mask] = image_features.view( + -1, image_features.shape[-1] + ) + attention_mask = self.get_attention_mask( + input_ids, + max_s, + cu_seqlen_prefill, + inputs_embeds.dtype, + image_token_mask, + ) + # Use flash attention for text-only input + # else: + # if cu_seqlen_prefill is not None: + # min_dtype = torch.finfo(inputs_embeds.dtype).min + # lengths = (cu_seqlen_prefill[1:] - cu_seqlen_prefill[:-1]).tolist() + + # # Determine the maximum sequence length (after padding) from query. + # sequence_length = max(lengths) + # target_length = max_s + + # # Create the padding mask from the computed lengths. + # # pad_mask: [batch, sequence_length] where True indicates valid tokens. 
+ # seq_range = torch.arange( + # sequence_length, device=input_ids.device + # ).unsqueeze(0) + # lengths_tensor = torch.tensor( + # lengths, device=input_ids.device + # ).unsqueeze(1) + # pad_mask = seq_range < lengths_tensor # shape: [batch, sequence_length] + + # # Build the base causal mask (for non-image tokens): + # causal_mask = torch.tril( + # torch.ones( + # (sequence_length, sequence_length), + # dtype=torch.bool, + # device=input_ids.device, + # ) + # ) + # base_mask = pad_mask.unsqueeze(2) & pad_mask.unsqueeze( + # 1 + # ) # [batch, sequence_length, sequence_length] + # base_mask = base_mask & causal_mask.unsqueeze(0) + # attention_mask = base_mask.unsqueeze( + # 1 + # ) # [batch, 1, sequence_length, sequence_length] + # full_attention_mask = torch.zeros( + # (len(lengths), 1, sequence_length, target_length), + # device=input_ids.device, + # dtype=torch.bool, + # ) + # full_attention_mask[:, :, :, :sequence_length] = attention_mask + + # attention_mask = torch.where(full_attention_mask, 0, min_dtype).to( + # input_ids.device + # ) + + hidden_states = self.text_model.model( + inputs_embeds=inputs_embeds, + position_ids=position_ids, + cu_seqlen_prefill=cu_seqlen_prefill, + kv_cache=kv_cache, + block_tables=block_tables, + slots=slots, + seqlen=seqlen, + max_s=max_s, + attention_mask=attention_mask, + ) + + if lm_head_indices is not None: + hidden_states = hidden_states[lm_head_indices] + logits, speculative_logits = self.text_model.lm_head(hidden_states) + + # pad logit with 1 zero logit for the image token + if pixel_values is not None: + logits = torch.cat( + [logits, torch.zeros(logits.size(0), 1, device=logits.device)], dim=1 + ) + if speculative_logits is not None: + speculative_logits = torch.cat( + [ + speculative_logits, + torch.zeros( + speculative_logits.size(0), + 1, + device=speculative_logits.device, + ), + ], + dim=1, + ) + + return logits, speculative_logits diff --git a/server/text_generation_server/models/custom_modeling/flash_pali_gemma_modeling.py b/server/text_generation_server/models/custom_modeling/flash_pali_gemma_modeling.py index b1f89eff4..4ea604510 100644 --- a/server/text_generation_server/models/custom_modeling/flash_pali_gemma_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_pali_gemma_modeling.py @@ -31,7 +31,7 @@ class PaliGemmaForConditionalGeneration(nn.Module): super().__init__() config.vision_config.quantize = config.quantize self.vision_tower = load_vision_model( - prefix="vision_tower" if not prefix else f"{prefix}.vision_tower", + prefix="vision_model" if not prefix else f"{prefix}.vision_model", config=config.vision_config, weights=weights, ) diff --git a/server/text_generation_server/models/custom_modeling/gemma3/configuration_gemma3.py b/server/text_generation_server/models/custom_modeling/gemma3/configuration_gemma3.py new file mode 100644 index 000000000..08b468e2e --- /dev/null +++ b/server/text_generation_server/models/custom_modeling/gemma3/configuration_gemma3.py @@ -0,0 +1,313 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/gemma3/modular_gemma3.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_gemma3.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2025 Google Inc. HuggingFace Inc. team. All rights reserved. 
+# +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Optional + +from transformers.configuration_utils import PretrainedConfig +from transformers.modeling_rope_utils import rope_config_validation +from transformers.utils import logging +from transformers import SiglipVisionConfig + +logger = logging.get_logger(__name__) + + +class Gemma3TextConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Gemma3Model`]. It is used to instantiate a Gemma3 + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the Gemma3-4B. + e.g. [google/gemma-3-4b](https://huggingface.co/google/gemma-3-4b) + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vocab_size (`int`, *optional*, defaults to 262144): + Vocabulary size of the Gemma3 model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`Gemma3Model`] + hidden_size (`int`, *optional*, defaults to 2304): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 9216): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 26): + Number of hidden layers in the Transformer decoder. + num_attention_heads (`int`, *optional*, defaults to 8): + Number of attention heads for each attention layer in the Transformer decoder. + num_key_value_heads (`int`, *optional*, defaults to 4): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details checkout [this + paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to + `num_attention_heads`. + head_dim (`int`, *optional*, defaults to 256): + The attention head dimension. + sliding_window (`int`, *optional*, defaults to 4096): in Gemma3, every other layer uses sliding window + attention. This is the size of the sliding window. + query_pre_attn_scalar (`float`, *optional*): + The scaling factor used on the attention scores, not that + rope_theta (`float`, *optional*, defaults to 1000000.0): + The base period of the RoPE embeddings used for global attention. + rope_scaling (`Dict`, *optional*): + Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type + and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value + accordingly. 
+ Expected contents: + `rope_type` (`str`): + The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', + 'llama3'], with 'default' being the original RoPE implementation. + `factor` (`float`, *optional*): + Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In + most scaling types, a `factor` of x will enable the model to handle sequences of length x * + original maximum pre-trained length. + `original_max_position_embeddings` (`int`, *optional*): + Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during + pretraining. + `attention_factor` (`float`, *optional*): + Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention + computation. If unspecified, it defaults to value recommended by the implementation, using the + `factor` field to infer the suggested value. + `beta_fast` (`float`, *optional*): + Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear + ramp function. If unspecified, it defaults to 32. + `beta_slow` (`float`, *optional*): + Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear + ramp function. If unspecified, it defaults to 1. + `short_factor` (`List[float]`, *optional*): + Only used with 'longrope'. The scaling factor to be applied to short contexts (< + `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden + size divided by the number of attention heads divided by 2 + `long_factor` (`List[float]`, *optional*): + Only used with 'longrope'. The scaling factor to be applied to long contexts (< + `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden + size divided by the number of attention heads divided by 2 + `low_freq_factor` (`float`, *optional*): + Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE + `high_freq_factor` (`float`, *optional*): + Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE + rope_local_base_freq (float, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings for local attention. + sliding_window_pattern (`int`, *optional*, defaults to 6): + Pattern for the sliding window attention. + rms_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the rms normalization layers. + hidden_activation (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`): + The non-linear activation function (function or string) in the decoder. Will default to + `"gelu_pytorch_tanh"` if not specified. `"gelu_pytorch_tanh"` uses an approximation of the `"gelu"` + activation function. + pad_token_id (`int`, *optional*, defaults to 0): + Padding token id. + eos_token_id (`int`, *optional*, defaults to 1): + End of stream token id. + bos_token_id (`int`, *optional*, defaults to 2): + Beginning of stream token id. + tie_word_embeddings (`bool`, *optional*, defaults to `True`): + Whether to tie weight embeddings + max_position_embeddings (`int`, *optional*, defaults to 131072): + The maximum sequence length that this model might ever be used with. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 
+ attention_bias (`bool`, *optional*, defaults to `False`): + Whether to use a bias in the query, key, value and output projection layers during self-attention. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + final_logit_softcapping (`bool`, *optional*, defaults to `True`): + Whether to apply logit softcapping or nor + attn_logit_softcapping (`float`, *optional*, defaults to 50.0): + Scaling factor when applying tanh soft-capping on the attention scorexs. + cache_implementation (`str`, *optional*, defaults to `"hybrid"`): + The cache type to be used with `generate`. + + ```python + >>> from transformers import Gemma3Model, Gemma3TextConfig + >>> # Initializing a Gemma3 gemma3-4b style configuration + >>> configuration = Gemma3Config() + >>> # Initializing a model from the gemma3-4b style configuration + >>> model = Gemma3Model(configuration) + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "gemma3_text" + + def __init__( + self, + vocab_size: int = 262_144, + hidden_size: int = 2304, + intermediate_size: int = 9216, + num_hidden_layers: int = 26, + num_attention_heads: int = 8, + num_key_value_heads: int = 4, + head_dim: int = 256, + sliding_window: int = 4096, + query_pre_attn_scalar: Optional[float] = 256, + rope_theta: float = 1_000_000.0, + rope_scaling=None, + rope_local_base_freq: float = 10_000.0, + sliding_window_pattern: int = 6, + rms_norm_eps: float = 1e-6, + hidden_activation: str = "gelu_pytorch_tanh", + pad_token_id: int = 0, + eos_token_id: int = 1, + bos_token_id: int = 2, + tie_word_embeddings: bool = True, + max_position_embeddings: int = 131_072, + initializer_range: float = 0.02, + attention_bias: bool = False, + attention_dropout: float = 0.0, + use_cache: bool = True, + final_logit_softcapping=None, + attn_logit_softcapping=None, + cache_implementation: str = "hybrid", + **kwargs, + ): + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.head_dim = head_dim + self.num_key_value_heads = num_key_value_heads + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + self.rope_local_base_freq = rope_local_base_freq + # For configuring HybridCache to work with 5:1 attention pattern + self.sliding_window_pattern = sliding_window_pattern + self.attention_bias = attention_bias + self.attention_dropout = attention_dropout + self.hidden_activation = hidden_activation + self.query_pre_attn_scalar = query_pre_attn_scalar + self.sliding_window = sliding_window + self.final_logit_softcapping = final_logit_softcapping + self.attn_logit_softcapping = attn_logit_softcapping + self.cache_implementation = cache_implementation + rope_config_validation(self) + + +class Gemma3Config(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a 
[`Gemma3ForConditionalGeneration`]. It is used to instantiate an + Gemma3ForConditionalGeneration according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the PaliGemma-2B. + + e.g. [google/gemma-3-4b](https://huggingface.co/google/gemma-3-4b) + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + text_config (`Union[Gemma3TextConfig, dict]`, *optional*): + The config object of the text backbone. + vision_config (`Union[AutoConfig, dict]`, *optional*): + Custom vision config or dict. + mm_tokens_per_image (`int`, *optional*, defaults to 256): + The number of tokens per image embedding. + boi_token_index (`int`, *optional*, defaults to 255999): + The begin-of-image token index to wrap the image prompt. + eoi_token_index (`int`, *optional*, defaults to 256000): + The end-of-image token index to wrap the image prompt. + image_token_index (`int`, *optional*, defaults to 262144): + The image token index to encode the image prompt. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + + Example: + + ```python + >>> from transformers import Gemma3ForConditionalGeneration, Gemma3Config, SiglipVisionConfig, Gemma3TextConfig + + >>> # Initializing a Siglip-like vision config + >>> vision_config = SiglipVisionConfig() + + >>> # Initializing a Gemma3 Text config + >>> text_config = Gemma3TextConfig() + + >>> # Initializing a Gemma3 gemma-3-4b style configuration + >>> configuration = Gemma3Config(vision_config, text_config) + + >>> # Initializing a model from the gemma-3-4b style configuration + >>> model = Gemma3TextConfig(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "gemma3" + sub_configs = { + "text_config": Gemma3TextConfig, + "vision_config": SiglipVisionConfig, + } + + def __init__( + self, + text_config: Optional[Gemma3TextConfig] = None, + vision_config: Optional[SiglipVisionConfig] = None, + mm_tokens_per_image: int = 256, + boi_token_index: int = 255_999, + eoi_token_index: int = 256_000, + image_token_index: int = 262_144, + initializer_range: float = 0.02, + **kwargs, + ): + if text_config is None: + text_config = Gemma3TextConfig() + logger.info( + "text_config is None, using default Gemma3TextConfig vision config." + ) + elif isinstance(text_config, dict): + text_config = Gemma3TextConfig(**text_config) + + if isinstance(vision_config, dict): + vision_config = SiglipVisionConfig(**vision_config) + else: + vision_config = SiglipVisionConfig() + logger.info( + "vision_config is None or incompatible with Gemma3VisionConfig intialization. Gemma3 will be limited " + "to text tasks." 
+ ) + + self.text_config = text_config + self.vision_config = vision_config + self.mm_tokens_per_image = mm_tokens_per_image + self.boi_token_index = boi_token_index + self.eoi_token_index = eoi_token_index + self.image_token_index = image_token_index + self.initializer_range = initializer_range + + super().__init__(**kwargs) + + +__all__ = ["Gemma3Config", "Gemma3TextConfig"] diff --git a/server/text_generation_server/models/custom_modeling/gemma3/image_processing_gemma3.py b/server/text_generation_server/models/custom_modeling/gemma3/image_processing_gemma3.py new file mode 100644 index 000000000..803d81ead --- /dev/null +++ b/server/text_generation_server/models/custom_modeling/gemma3/image_processing_gemma3.py @@ -0,0 +1,463 @@ +# coding=utf-8 +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Image processor class for Gemma3.""" + +import itertools +import math +from typing import Dict, List, Optional, Union + +import numpy as np + +from transformers.image_processing_utils import ( + BaseImageProcessor, + BatchFeature, + get_size_dict, +) +from transformers.image_transforms import ( + convert_to_rgb, + resize, + to_channel_dimension_format, +) +from transformers.image_utils import ( + IMAGENET_STANDARD_MEAN, + IMAGENET_STANDARD_STD, + ChannelDimension, + ImageInput, + PILImageResampling, + get_image_size, + infer_channel_dimension_format, + is_scaled_image, + to_numpy_array, + valid_images, + validate_preprocess_arguments, +) +from transformers.utils import ( + TensorType, + filter_out_non_signature_kwargs, + is_vision_available, + logging, +) + +from .utils import make_nested_list_of_images + + +logger = logging.get_logger(__name__) + + +if is_vision_available(): + import PIL + + +class Gemma3ImageProcessor(BaseImageProcessor): + r""" + Constructs a SigLIP image processor. + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by + `do_resize` in the `preprocess` method. + size (`Dict[str, int]` *optional*, defaults to `{"height": 224, "width": 224}`): + Size of the image after resizing. Can be overridden by `size` in the `preprocess` method. + resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`): + Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in + the `preprocess` method. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess` + method. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether to normalize the image by the specified mean and standard deviation. Can be overridden by + `do_normalize` in the `preprocess` method. 
+ image_mean (`float` or `List[float]`, *optional*, defaults to `[0.5, 0.5, 0.5]`): + Mean to use if normalizing the image. This is a float or list of floats the length of the number of + channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. + image_std (`float` or `List[float]`, *optional*, defaults to `[0.5, 0.5, 0.5]`): + Standard deviation to use if normalizing the image. This is a float or list of floats the length of the + number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method. + Can be overridden by the `image_std` parameter in the `preprocess` method. + do_convert_rgb (`bool`, *optional*, defaults to `True`): + Whether to convert the image to RGB. + do_pan_and_scan (`bool`, *optional*): + Whether to apply `pan_and_scan` to images. + """ + + model_input_names = ["pixel_values", "num_crops"] + + def __init__( + self, + do_resize: bool = True, + size: Dict[str, int] = None, + resample: PILImageResampling = PILImageResampling.BILINEAR, + do_rescale: bool = False, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_convert_rgb: bool = None, + do_pan_and_scan: bool = None, + pan_and_scan_min_crop_size: int = None, + pan_and_scan_max_num_crops: int = None, + pan_and_scan_min_ratio_to_activate: float = None, + **kwargs, + ) -> None: + super().__init__(**kwargs) + size = size if size is not None else {"height": 224, "width": 224} + image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN + image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD + + self.do_resize = do_resize + self.size = size + self.resample = resample + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_normalize = do_normalize + self.image_mean = image_mean + self.image_std = image_std + self.do_convert_rgb = do_convert_rgb + self.do_pan_and_scan = do_pan_and_scan + self.pan_and_scan_min_crop_size = pan_and_scan_min_crop_size + self.pan_and_scan_max_num_crops = pan_and_scan_max_num_crops + self.pan_and_scan_min_ratio_to_activate = pan_and_scan_min_ratio_to_activate + + def pan_and_scan( + self, + image: np.ndarray, + pan_and_scan_min_crop_size: int, + pan_and_scan_max_num_crops: int, + pan_and_scan_min_ratio_to_activate: float, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ): + """ + Pan and Scan and image, whatever it means. TODO: write-up docs + + Args: + image (`np.ndarray`): + Image to resize. + pan_and_scan_min_crop_size (`int`): + Size of pan_and_scan_min_crop_size. + pan_and_scan_max_num_crops (`int`): + pan_and_scan_max_num_crops for the image. + pan_and_scan_min_ratio_to_activate (`int`): + pan_and_scan_min_ratio_to_activate for the image.. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred. + """ + height, width = get_image_size(image) + + # Square or landscape image. 
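+        # Landscape images are split into side-by-side crops (num_crops_h == 1);
+        # the portrait branch below mirrors this with vertically stacked crops
+        # (num_crops_w == 1).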
+ if width >= height: + # Only apply PaS if the image is sufficiently exaggerated + if width / height < pan_and_scan_min_ratio_to_activate: + return [] + + # Select ideal number of crops close to the image aspect ratio and such that crop_size > min_crop_size. + num_crops_w = int( + math.floor(width / height + 0.5) + ) # Half round up rounding. + num_crops_w = min( + int(math.floor(width / pan_and_scan_min_crop_size)), num_crops_w + ) + + # Make sure the number of crops is in range [2, pan_and_scan_max_num_crops]. + num_crops_w = max(2, num_crops_w) + num_crops_w = min(pan_and_scan_max_num_crops, num_crops_w) + num_crops_h = 1 + + # Portrait image. + else: + # Only apply PaS if the image is sufficiently exaggerated + if height / width < pan_and_scan_min_ratio_to_activate: + return [] + + # Select ideal number of crops close to the image aspect ratio and such that crop_size > min_crop_size. + num_crops_h = int(math.floor(height / width + 0.5)) + num_crops_h = min( + int(math.floor(height / pan_and_scan_min_crop_size)), num_crops_h + ) + + # Make sure the number of crops is in range [2, pan_and_scan_max_num_crops]. + num_crops_h = max(2, num_crops_h) + num_crops_h = min(pan_and_scan_max_num_crops, num_crops_h) + num_crops_w = 1 + + crop_size_w = int(math.ceil(width / num_crops_w)) + crop_size_h = int(math.ceil(height / num_crops_h)) + + # Don't apply PaS if crop size is too small. + if min(crop_size_w, crop_size_h) < pan_and_scan_min_crop_size: + return [] + + crop_positions_w = [crop_size_w * i for i in range(num_crops_w)] + crop_positions_h = [crop_size_h * i for i in range(num_crops_h)] + + if input_data_format == ChannelDimension.LAST: + image_crops = [ + image[pos_h : pos_h + crop_size_h, pos_w : pos_w + crop_size_w] + for pos_h, pos_w in itertools.product( + crop_positions_h, crop_positions_w + ) + ] + else: + image_crops = [ + image[:, pos_h : pos_h + crop_size_h, pos_w : pos_w + crop_size_w] + for pos_h, pos_w in itertools.product( + crop_positions_h, crop_positions_w + ) + ] + + return image_crops + + def _process_images_for_pas( + self, + images: List[np.ndarray], + do_pan_and_scan: bool, + pan_and_scan_min_crop_size: int, + pan_and_scan_max_num_crops: int, + pan_and_scan_min_ratio_to_activate: float, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ): + pas_images_list = [] + num_crops = [] + for image in images: + pas_images = self.pan_and_scan( + image=image, + pan_and_scan_min_crop_size=pan_and_scan_min_crop_size, + pan_and_scan_max_num_crops=pan_and_scan_max_num_crops, + pan_and_scan_min_ratio_to_activate=pan_and_scan_min_ratio_to_activate, + data_format=data_format, + input_data_format=input_data_format, + ) + pas_images_list.extend([image] + pas_images) + num_crops.append(len(pas_images)) + return pas_images_list, num_crops + + @filter_out_non_signature_kwargs() + def preprocess( + self, + images: ImageInput, + do_resize: bool = None, + size: Dict[str, int] = None, + resample: PILImageResampling = None, + do_rescale: bool = None, + rescale_factor: float = None, + do_normalize: bool = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + do_convert_rgb: bool = None, + do_pan_and_scan: bool = None, + pan_and_scan_min_crop_size: int = 
None, + pan_and_scan_max_num_crops: int = None, + pan_and_scan_min_ratio_to_activate: float = None, + ) -> PIL.Image.Image: + """ + Preprocess an image or batch of images. + + Args: + images (`ImageInput`): + Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If + passing in images with pixel values between 0 and 1, set `do_rescale=False`. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + size (`Dict[str, int]`, *optional*, defaults to `self.size`): + Size of the image after resizing. + resample (`int`, *optional*, defaults to `self.resample`): + Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only + has an effect if `do_resize` is set to `True`. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. + image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`): + Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`. + image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): + Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to + `True`. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: Use the channel dimension format of the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`): + Whether to convert the image to RGB. + do_pan_and_scan (`bool`, *optional*, defaults to `self.do_convert_rgb`): + Whether to apply `pan_and_scan` to images. 
+ """ + do_resize = do_resize if do_resize is not None else self.do_resize + size = size if size is not None else self.size + size = get_size_dict(size, param_name="size", default_to_square=False) + resample = resample if resample is not None else self.resample + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + rescale_factor = ( + rescale_factor if rescale_factor is not None else self.rescale_factor + ) + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + do_convert_rgb = ( + do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb + ) + do_pan_and_scan = ( + do_pan_and_scan if do_pan_and_scan is not None else self.do_pan_and_scan + ) + pan_and_scan_min_crop_size = ( + pan_and_scan_min_crop_size + if pan_and_scan_min_crop_size is not None + else self.pan_and_scan_min_crop_size + ) + pan_and_scan_max_num_crops = ( + pan_and_scan_max_num_crops + if pan_and_scan_max_num_crops is not None + else self.pan_and_scan_max_num_crops + ) + pan_and_scan_min_ratio_to_activate = ( + pan_and_scan_min_ratio_to_activate + if pan_and_scan_min_ratio_to_activate is not None + else self.pan_and_scan_min_ratio_to_activate + ) + + images_list = make_nested_list_of_images(images) + + if not valid_images(images_list[0]): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + + validate_preprocess_arguments( + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + do_resize=do_resize, + size=size, + resample=resample, + ) + if do_convert_rgb: + images_list = [ + [convert_to_rgb(image) for image in images] for images in images_list + ] + + # All transformations expect numpy arrays. + images_list = [ + [to_numpy_array(image) for image in images] for images in images_list + ] + + if do_rescale and is_scaled_image(images_list[0][0]): + logger.warning_once( + "It looks like you are trying to rescale already rescaled images. If the input" + " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." + ) + + if input_data_format is None: + # We assume that all images have the same channel dimension format. 
+ input_data_format = infer_channel_dimension_format(images_list[0][0]) + + if do_pan_and_scan: + images_list_and_num_crops = [ + self._process_images_for_pas( + images=images, + do_pan_and_scan=do_pan_and_scan, + pan_and_scan_min_crop_size=pan_and_scan_min_crop_size, + pan_and_scan_max_num_crops=pan_and_scan_max_num_crops, + pan_and_scan_min_ratio_to_activate=pan_and_scan_min_ratio_to_activate, + data_format=data_format, + input_data_format=input_data_format, + ) + for images in images_list + ] + images_list = [images for images, _ in images_list_and_num_crops] + num_crops = [num_crops for _, num_crops in images_list_and_num_crops] + else: + num_crops = [[0] for images in images_list] + + if do_resize: + height, width = size["height"], size["width"] + images_list = [ + [ + resize( + image=image, + size=(height, width), + resample=resample, + input_data_format=input_data_format, + ) + for image in images + ] + for images in images_list + ] + + if do_rescale: + images_list = [ + [ + self.rescale( + image=image, + scale=rescale_factor, + input_data_format=input_data_format, + ) + for image in images + ] + for images in images_list + ] + + if do_normalize: + images_list = [ + [ + self.normalize( + image=image, + mean=image_mean, + std=image_std, + input_data_format=input_data_format, + ) + for image in images + ] + for images in images_list + ] + + images = [ + to_channel_dimension_format( + image, data_format, input_channel_dim=input_data_format + ) + for images in images_list + for image in images + ] + + data = {"pixel_values": images, "num_crops": num_crops} + return BatchFeature(data=data, tensor_type=return_tensors) + + +__all__ = ["Gemma3ImageProcessor"] diff --git a/server/text_generation_server/models/custom_modeling/gemma3/processing_gemma3.py b/server/text_generation_server/models/custom_modeling/gemma3/processing_gemma3.py new file mode 100644 index 000000000..08e39a7c6 --- /dev/null +++ b/server/text_generation_server/models/custom_modeling/gemma3/processing_gemma3.py @@ -0,0 +1,206 @@ +# coding=utf-8 +# Copyright 2025 Google Inc. HuggingFace Inc. team. All rights reserved. +# +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
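As a quick illustration of the image processor added above (the `Gemma3Processor` further down wires it into TGI), here is a minimal standalone sketch. The 896x896 size, the 127.5 mean/std and the pan-and-scan thresholds mirror the values hard-coded or defaulted in `Gemma3Processor` below; the wide dummy image is an arbitrary assumption chosen to trigger pan-and-scan.

```python
import numpy as np
from PIL import Image

from text_generation_server.models.custom_modeling.gemma3.image_processing_gemma3 import (
    Gemma3ImageProcessor,
)

# Values mirror what Gemma3Processor below hard-codes / defaults to.
processor = Gemma3ImageProcessor(
    size={"height": 896, "width": 896},
    image_mean=(127.5,) * 3,
    image_std=(127.5,) * 3,
    do_rescale=False,
    do_pan_and_scan=True,
    pan_and_scan_min_crop_size=256,
    pan_and_scan_max_num_crops=4,
    pan_and_scan_min_ratio_to_activate=1.2,
)

# A deliberately wide (3:1) dummy image so pan-and-scan produces extra crops.
image = Image.fromarray(np.zeros((448, 1344, 3), dtype=np.uint8))
batch = processor.preprocess([image], return_tensors="np")

# pixel_values holds the resized original plus its crops; num_crops counts
# how many extra crops were generated for each input image.
print(batch["pixel_values"].shape, batch["num_crops"])
```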
+import re +from typing import List, Optional, Union + +from transformers.feature_extraction_utils import BatchFeature +from transformers.image_utils import ImageInput +from transformers.processing_utils import ( + ImagesKwargs, + ProcessingKwargs, + ProcessorMixin, + Unpack, +) +from transformers.tokenization_utils_base import PreTokenizedInput, TextInput +from transformers.utils import to_py_obj +from text_generation_server.models.custom_modeling.gemma3.image_processing_gemma3 import ( + Gemma3ImageProcessor, +) + +from transformers.image_utils import PILImageResampling + +from .utils import make_nested_list_of_images + + +class Gemma3ImagesKwargs(ImagesKwargs): + do_pan_and_scan: Optional[bool] + pan_and_scan_min_crop_size: Optional[int] + pan_and_scan_max_num_crops: Optional[int] + pan_and_scan_min_ratio_to_activate: Optional[float] + do_convert_rgb: Optional[bool] + + +class Gemma3ProcessorKwargs(ProcessingKwargs, total=False): + _defaults = { + "text_kwargs": { + "padding": False, + }, + "images_kwargs": { + "do_pan_and_scan": False, + "pan_and_scan_min_crop_size": 256, + "pan_and_scan_max_num_crops": 4, + "pan_and_scan_min_ratio_to_activate": 1.2, + }, + } + + +class Gemma3Processor(ProcessorMixin): + attributes = ["image_processor", "tokenizer"] + valid_kwargs = ["chat_template"] + # # image_processor_class = "Gemma3ImageProcessor" + image_processor_class = "AutoProcessor" + tokenizer_class = "AutoTokenizer" + + def __init__( + self, + image_processor, + tokenizer, + chat_template=None, + num_mm_soft_tokens_per_image: int = 256, + **kwargs, + ): + num_mm_soft_tokens_per_image = 256 + chat_template = None + + image_processor = Gemma3ImageProcessor( + image_mean=(127.5,) * 3, + image_std=(127.5,) * 3, + size={"height": 896, "width": 896}, + do_rescale=False, + resample=PILImageResampling.BILINEAR, + ) + # import ipdb; ipdb.set_trace() + self.image_token_id = tokenizer.image_token_id + image_tokens_expanded = "".join( + [tokenizer.image_token] * num_mm_soft_tokens_per_image + ) + self.full_image_sequence = ( + f"\n\n{tokenizer.boi_token}{image_tokens_expanded}{tokenizer.eoi_token}\n\n" + ) + + # import ipdb; ipdb.set_trace() + + self.image_processor = image_processor + self.tokenizer = tokenizer + self.chat_template = chat_template + + # super().__init__( + # image_processor=image_processor, + # tokenizer=tokenizer, + # chat_template=chat_template, + # **kwargs, + # ) + + def __call__( + self, + images: ImageInput = None, + text: Union[ + TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput] + ] = None, + videos=None, + audio=None, + **kwargs: Unpack[Gemma3ProcessorKwargs], + ) -> BatchFeature: + if text is None and images is None: + raise ValueError("Provide at least one of `text` or `images`.") + + output_kwargs = self._merge_kwargs( + Gemma3ProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) + + if isinstance(text, str): + text = [text] + elif not isinstance(text, list) and not isinstance(text[0], str): + raise ValueError( + "Invalid input text. 
Please provide a string, or a list of strings" + ) + + image_inputs = {} + if images is not None: + batched_images = make_nested_list_of_images(images) + image_inputs = self.image_processor( + batched_images, **output_kwargs["images_kwargs"] + ) + + # Create empty text to be replaced with placeholders + if not text: + text = [ + " ".join([""] * len(images)) for images in batched_images + ] + + if len(batched_images) != len(text): + raise ValueError( + f"Received inconsistently sized batches of images ({len(batched_images)}) and text ({len(text)})." + ) + + # Replace image tokens by the full expanded sequence + batch_num_crops = to_py_obj(image_inputs.pop("num_crops")) + for prompt, images, num_crops in zip(text, batched_images, batch_num_crops): + image_indexes = [m.start() for m in re.finditer("", prompt)] + + if len(images) != len(image_indexes): + raise ValueError( + f"Prompt contained {len(image_indexes)} image tokens but received {len(images)} images." + ) + + # Insert additional image tokens for Pan-and-Scan crops + for num, idx in reversed(list(zip(num_crops, image_indexes))): + if num: + formatted_image_text = ( + "Here is the original image and here are some crops to help you see better " + + " ".join([""] * num) + ) + prompt = ( + prompt[:idx] + + formatted_image_text + + prompt[idx + len("") :] + ) + + # Expand placeholder image tokens to the full image token sequence + text = [ + prompt.replace("", self.full_image_sequence) for prompt in text + ] + + text_input = self.tokenizer(text=text, **output_kwargs["text_kwargs"]) + return BatchFeature(data={**text_input, **image_inputs}) + + # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Gemma + def batch_decode(self, *args, **kwargs): + """ + This method forwards all its arguments to GemmaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please + refer to the docstring of this method for more information. + """ + return self.tokenizer.batch_decode(*args, **kwargs) + + # Copied from transformers.models.clip.processing_clip.CLIPProcessor.decode with CLIP->Gemma + def decode(self, *args, **kwargs): + """ + This method forwards all its arguments to GemmaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to + the docstring of this method for more information. + """ + return self.tokenizer.decode(*args, **kwargs) + + @property + # Copied from transformers.models.clip.processing_clip.CLIPProcessor.model_input_names with CLIP->PaliGemma + def model_input_names(self): + tokenizer_input_names = self.tokenizer.model_input_names + image_processor_input_names = self.image_processor.model_input_names + return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) + + +__all__ = ["Gemma3Processor"] diff --git a/server/text_generation_server/models/custom_modeling/gemma3/utils.py b/server/text_generation_server/models/custom_modeling/gemma3/utils.py new file mode 100644 index 000000000..8d431fb27 --- /dev/null +++ b/server/text_generation_server/models/custom_modeling/gemma3/utils.py @@ -0,0 +1,61 @@ +# coding=utf-8 +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List, Union + + +from transformers.image_utils import ImageInput, is_valid_image, is_pil_image + + +def is_valid_list_of_images(images: List): + return images and all(is_valid_image(image) for image in images) + + +def make_nested_list_of_images( + images: Union[List[ImageInput], ImageInput], +) -> ImageInput: + """ + Ensure that the output is a nested list of images. + Args: + images (`Union[List[ImageInput], ImageInput]`): + The input image. + Returns: + list: A list of list of images or a list of 4d array of images. + """ + # If it's a list of batches, it's already in the right format + if ( + isinstance(images, (list, tuple)) + and all(isinstance(images_i, (list, tuple)) for images_i in images) + and all(is_valid_list_of_images(images_i) for images_i in images) + ): + return images + + # If it's a list of images, it's a single batch, so convert it to a list of lists + if isinstance(images, (list, tuple)) and is_valid_list_of_images(images): + if is_pil_image(images[0]) or images[0].ndim == 3: + return [images] + if images[0].ndim == 4: + return [list(image) for image in images] + + # If it's a single image, convert it to a list of lists + if is_valid_image(images): + if is_pil_image(images) or images.ndim == 3: + return [[images]] + if images.ndim == 4: + return [list(images)] + + raise ValueError( + "Invalid input type. Must be a single image, a list of images, or a list of batches of images." 
+ ) diff --git a/server/text_generation_server/models/custom_modeling/vlm.py b/server/text_generation_server/models/custom_modeling/vlm.py index 94b8522d4..4447a73f3 100644 --- a/server/text_generation_server/models/custom_modeling/vlm.py +++ b/server/text_generation_server/models/custom_modeling/vlm.py @@ -23,6 +23,13 @@ def load_text_model(prefix, config, weights, name=None): ) return FlashGemma2ForCausalLM(prefix, config, weights) + + elif config.model_type == "gemma3" or config.model_type == "gemma3_text": + from text_generation_server.models.custom_modeling.flash_gemma3_modeling import ( + FlashGemma3ForCausalLM, + ) + + return FlashGemma3ForCausalLM(prefix, config, weights) elif config.model_type == "paligemma": from text_generation_server.models.custom_modeling.flash_gemma_modeling import ( FlashGemmaForCausalLM, @@ -42,13 +49,21 @@ def load_vision_model(prefix, config, weights): return CLIPVisionTransformer( prefix=f"{prefix}.vision_model", config=config, weights=weights ) - if config.model_type == "siglip_vision_model": + if ( + config.model_type == "siglip_vision_model" + or config.model_type == "gemma3_vision" + ): from text_generation_server.models.custom_modeling.siglip import ( SiglipVisionTransformer, ) + # TODO: ensure that using the prefix doesn't break any existing models + # that rely on the old prefix (update the old models if necessary) return SiglipVisionTransformer( - prefix="vision_tower.vision_model", config=config, weights=weights + # prefix="vision_model.vision_model", config=config, weights=weights + prefix=f"{prefix}.vision_model", + config=config, + weights=weights, ) else: raise RuntimeError(f"Unsupported model type {config.model_type}") diff --git a/server/text_generation_server/models/vlm_causal_lm.py b/server/text_generation_server/models/vlm_causal_lm.py index 39046f2a8..9111fdc00 100644 --- a/server/text_generation_server/models/vlm_causal_lm.py +++ b/server/text_generation_server/models/vlm_causal_lm.py @@ -128,6 +128,12 @@ def image_text_replacement(processor, image_input, config, image_id: int) -> str num_pads = grid_t * grid_h * grid_w // 4 padding = "<|image_pad|>" * num_pads return f"<|vision_start|>{padding}<|vision_end|>" + elif config.model_type == "gemma3": + # TODO: get correct number of features via reviewing the Gemma3 architecture + # and calculating the number of image tokens + num_pads = 256 + padding = "" * num_pads + return f"\n\n{padding}\n\n" else: raise RuntimeError(f"Unknown config {config.model_type} for multimodal") @@ -244,6 +250,8 @@ class VlmCausalLMBatch(FlashCausalLMBatch): if config.model_type == "llava_next": images.append(image) + elif config.model_type == "gemma3": + images.append(image) else: images.append([image]) else: From 5c5528e36287c2207fdbc6958f565da883f8d5d8 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Wed, 12 Mar 2025 09:28:47 +0100 Subject: [PATCH 142/183] Fix tool call4 (#3094) * Removing the no_tool content information. * Removing a lot of NO_TOOL shenanigans. * Update the tests. 
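For context on what the reworked tool handling returns, the refreshed snapshots below show plain-text answers arriving in `message.content` with `tool_calls` left `null`. A minimal client-side sketch of branching on that shape follows; the endpoint URL, port and the example weather tool are illustrative assumptions, while the field names match the snapshots.

```python
import requests

# Illustrative local endpoint; TGI exposes an OpenAI-compatible chat route.
url = "http://localhost:3000/v1/chat/completions"

tools = [
    {
        "type": "function",
        "function": {
            "name": "get_current_weather",
            "description": "Get the current weather for a location",
            "parameters": {
                "type": "object",
                "properties": {"location": {"type": "string"}},
                "required": ["location"],
            },
        },
    }
]

resp = requests.post(
    url,
    json={
        "model": "meta-llama/Llama-3.1-8B-Instruct",
        "messages": [{"role": "user", "content": "Who are you?"}],
        "tools": tools,
        "tool_choice": "auto",
        "max_tokens": 20,
    },
).json()

message = resp["choices"][0]["message"]
if message.get("tool_calls"):
    # The model chose to call a tool.
    for call in message["tool_calls"]:
        print(call["function"]["name"], call["function"]["arguments"])
else:
    # No tool was needed: the answer arrives as plain text content,
    # matching the updated snapshots below.
    print(message["content"])
```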
--- ...ols_insufficient_information_nostream.json | 10 +- ...tools_insufficient_information_stream.json | 320 ++++- ...ammar_tools_sea_creatures_stream_auto.json | 1250 +---------------- integration-tests/models/test_tools_llama.py | 18 +- router/src/chat.rs | 516 ++----- router/src/infer/tool_grammar.rs | 14 +- router/src/server.rs | 72 +- 7 files changed, 526 insertions(+), 1674 deletions(-) diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_insufficient_information_nostream.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_insufficient_information_nostream.json index 797c95788..6d8417479 100644 --- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_insufficient_information_nostream.json +++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_insufficient_information_nostream.json @@ -5,20 +5,20 @@ "index": 0, "logprobs": null, "message": { - "content": "I am a helpful assistant!", + "content": "I'm an artificial intelligence model known as a large language model (LLM) or conversational AI", "role": "assistant", "tool_calls": null } } ], - "created": 1741263686, + "created": 1741693957, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion", "system_fingerprint": "3.1.2-dev0-native", "usage": { - "completion_tokens": 23, - "prompt_tokens": 494, - "total_tokens": 517 + "completion_tokens": 12, + "prompt_tokens": 53, + "total_tokens": 65 } } diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_insufficient_information_stream.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_insufficient_information_stream.json index dc969ceea..47f23f4c9 100644 --- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_insufficient_information_stream.json +++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_insufficient_information_stream.json @@ -1,24 +1,4 @@ [ - { - "choices": [ - { - "delta": { - "content": "", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741364571, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, { "choices": [ { @@ -32,7 +12,7 @@ "logprobs": null } ], - "created": 1741364571, + "created": 1741694017, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -43,7 +23,7 @@ "choices": [ { "delta": { - "content": " am", + "content": "'m", "role": "assistant", "tool_calls": null }, @@ -52,7 +32,127 @@ "logprobs": null } ], - "created": 1741364571, + "created": 1741694017, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " an", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741694017, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " artificial", + "role": "assistant", + "tool_calls": null + }, + 
"finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741694017, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " intelligence", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741694017, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " model", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741694017, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " known", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741694017, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " as", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741694017, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -72,7 +172,7 @@ "logprobs": null } ], - "created": 1741364571, + "created": 1741694017, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -83,7 +183,7 @@ "choices": [ { "delta": { - "content": " helpful", + "content": " large", "role": "assistant", "tool_calls": null }, @@ -92,7 +192,7 @@ "logprobs": null } ], - "created": 1741364571, + "created": 1741694017, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -103,7 +203,7 @@ "choices": [ { "delta": { - "content": " assistant", + "content": " language", "role": "assistant", "tool_calls": null }, @@ -112,7 +212,7 @@ "logprobs": null } ], - "created": 1741364571, + "created": 1741694017, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -123,7 +223,7 @@ "choices": [ { "delta": { - "content": "!", + "content": " model", "role": "assistant", "tool_calls": null }, @@ -132,7 +232,167 @@ "logprobs": null } ], - "created": 1741364571, + "created": 1741694017, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " (", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741694017, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": "LL", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741694017, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + 
}, + { + "choices": [ + { + "delta": { + "content": "M", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741694017, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": ")", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741694017, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " or", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741694017, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " convers", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741694017, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": "ational", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null + } + ], + "created": 1741694017, + "id": "", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "object": "chat.completion.chunk", + "system_fingerprint": "3.1.2-dev0-native", + "usage": null + }, + { + "choices": [ + { + "delta": { + "content": " AI", + "role": "assistant", + "tool_calls": null + }, + "finish_reason": "length", + "index": 0, + "logprobs": null + } + ], + "created": 1741694017, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream_auto.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream_auto.json index cdac9bc4a..30f039200 100644 --- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream_auto.json +++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream_auto.json @@ -3,7 +3,7 @@ "choices": [ { "delta": { - "content": "", + "content": "Once", "role": "assistant", "tool_calls": null }, @@ -12,7 +12,7 @@ "logprobs": null } ], - "created": 1741371722, + "created": 1741695408, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -23,7 +23,7 @@ "choices": [ { "delta": { - "content": "There", + "content": " upon", "role": "assistant", "tool_calls": null }, @@ -32,27 +32,7 @@ "logprobs": null } ], - "created": 1741371722, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " was", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371722, + "created": 1741695408, "id": "", "model": 
"meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -72,7 +52,7 @@ "logprobs": null } ], - "created": 1741371722, + "created": 1741695408, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -83,7 +63,7 @@ "choices": [ { "delta": { - "content": " wise", + "content": " time", "role": "assistant", "tool_calls": null }, @@ -92,7 +72,7 @@ "logprobs": null } ], - "created": 1741371722, + "created": 1741695408, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -103,7 +83,7 @@ "choices": [ { "delta": { - "content": " old", + "content": ",", "role": "assistant", "tool_calls": null }, @@ -112,147 +92,7 @@ "logprobs": null } ], - "created": 1741371722, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " oct", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371722, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": "opus", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371722, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " named", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371722, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " Oracle", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371722, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": ".", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371722, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " He", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371722, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " lived", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371722, + "created": 1741695408, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -272,7 +112,7 @@ "logprobs": null } ], - "created": 1741371722, + "created": 1741695408, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": 
"chat.completion.chunk", @@ -292,7 +132,7 @@ "logprobs": null } ], - "created": 1741371722, + "created": 1741695408, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -303,7 +143,7 @@ "choices": [ { "delta": { - "content": " cozy", + "content": " vibrant", "role": "assistant", "tool_calls": null }, @@ -312,887 +152,7 @@ "logprobs": null } ], - "created": 1741371722, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " little", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371722, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " cave", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371722, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " beneath", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371723, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " the", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371723, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " waves", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371723, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " with", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371723, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " his", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371723, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " best", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371723, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " friend", - "role": "assistant", - 
"tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371723, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": ",", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371723, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " a", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371723, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " curious", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371723, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " se", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371723, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": "ah", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371723, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": "orse", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371723, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " named", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371723, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " Fin", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371723, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": "ley", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371723, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - 
"content": ".", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371723, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " One", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371723, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " day", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371723, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": ",", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371723, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " Fin", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371723, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": "ley", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371723, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " met", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371723, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " a", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371723, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " playful", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371723, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " dolphin", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371723, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": 
null - }, - { - "choices": [ - { - "delta": { - "content": " named", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371723, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " Daisy", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371723, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": ",", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371723, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " and", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371723, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " the", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371724, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " three", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371724, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " became", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371724, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " inse", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371724, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": "parable", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371724, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": ".", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371724, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - 
"system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " They", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371724, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " spent", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371724, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " their", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371724, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " days", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371724, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " exploring", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371724, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " the", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371724, + "created": 1741695408, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -1212,7 +172,7 @@ "logprobs": null } ], - "created": 1741371724, + "created": 1741695408, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -1223,7 +183,7 @@ "choices": [ { "delta": { - "content": ",", + "content": " filled", "role": "assistant", "tool_calls": null }, @@ -1232,7 +192,7 @@ "logprobs": null } ], - "created": 1741371724, + "created": 1741695408, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -1243,7 +203,7 @@ "choices": [ { "delta": { - "content": " playing", + "content": " with", "role": "assistant", "tool_calls": null }, @@ -1252,7 +212,7 @@ "logprobs": null } ], - "created": 1741371724, + "created": 1741695408, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -1263,7 +223,7 @@ "choices": [ { "delta": { - "content": " hide", + "content": " coral", "role": "assistant", "tool_calls": null }, @@ -1272,7 +232,7 @@ "logprobs": null } ], - "created": 1741371724, + "created": 1741695408, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -1283,7 +243,7 @@ "choices": [ { "delta": { - "content": "-and", + "content": " reefs", "role": "assistant", "tool_calls": null }, @@ -1292,67 +252,7 @@ "logprobs": null } ], - "created": 1741371724, - "id": "", - "model": 
"meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": "-se", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371724, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": "ek", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371724, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": ",", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371724, + "created": 1741695408, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -1372,7 +272,7 @@ "logprobs": null } ], - "created": 1741371724, + "created": 1741695408, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -1383,7 +283,7 @@ "choices": [ { "delta": { - "content": " learning", + "content": " schools", "role": "assistant", "tool_calls": null }, @@ -1392,67 +292,7 @@ "logprobs": null } ], - "created": 1741371724, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " about", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371724, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " the", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371724, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": " wonders", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371724, + "created": 1741695408, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -1472,7 +312,7 @@ "logprobs": null } ], - "created": 1741371724, + "created": 1741695408, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -1483,7 +323,7 @@ "choices": [ { "delta": { - "content": " the", + "content": " shimmer", "role": "assistant", "tool_calls": null }, @@ -1492,7 +332,7 @@ "logprobs": null } ], - "created": 1741371724, + "created": 1741695408, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -1503,7 +343,7 @@ "choices": [ { "delta": { - "content": " sea", + "content": "ing", "role": "assistant", "tool_calls": null }, @@ -1512,7 +352,7 @@ "logprobs": null } ], - "created": 1741371724, + "created": 1741695408, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": 
"chat.completion.chunk", @@ -1523,7 +363,7 @@ "choices": [ { "delta": { - "content": " from", + "content": " fish", "role": "assistant", "tool_calls": null }, @@ -1532,7 +372,7 @@ "logprobs": null } ], - "created": 1741371724, + "created": 1741695408, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", @@ -1543,36 +383,16 @@ "choices": [ { "delta": { - "content": " Oracle", + "content": ",", "role": "assistant", "tool_calls": null }, - "finish_reason": null, + "finish_reason": "length", "index": 0, "logprobs": null } ], - "created": 1741371724, - "id": "", - "model": "meta-llama/Llama-3.1-8B-Instruct", - "object": "chat.completion.chunk", - "system_fingerprint": "3.1.2-dev0-native", - "usage": null - }, - { - "choices": [ - { - "delta": { - "content": ".", - "role": "assistant", - "tool_calls": null - }, - "finish_reason": null, - "index": 0, - "logprobs": null - } - ], - "created": 1741371725, + "created": 1741695408, "id": "", "model": "meta-llama/Llama-3.1-8B-Instruct", "object": "chat.completion.chunk", diff --git a/integration-tests/models/test_tools_llama.py b/integration-tests/models/test_tools_llama.py index bb4b308bd..612fa6bd9 100644 --- a/integration-tests/models/test_tools_llama.py +++ b/integration-tests/models/test_tools_llama.py @@ -279,7 +279,7 @@ async def test_flash_llama_grammar_tools_insufficient_information_nostream( ): client = InferenceClient(base_url=f"{flash_llama_grammar_tools.base_url}/v1") response = client.chat_completion( - max_tokens=100, + max_tokens=20, seed=24, tools=tools, tool_choice="auto", @@ -299,7 +299,10 @@ async def test_flash_llama_grammar_tools_insufficient_information_nostream( content_generated = response.choices[0].message.content assert response.choices[0].message.tool_calls is None - assert content_generated == "I am a helpful assistant!" + assert ( + content_generated + == "I'm an artificial intelligence model known as a large language model (LLM) or conversational AI" + ) assert response == response_snapshot @@ -310,7 +313,7 @@ async def test_flash_llama_grammar_tools_insufficient_information_stream( ): client = InferenceClient(base_url=f"{flash_llama_grammar_tools.base_url}/v1") stream = client.chat_completion( - max_tokens=100, + max_tokens=20, seed=24, tools=tools, tool_choice="auto", @@ -335,7 +338,10 @@ async def test_flash_llama_grammar_tools_insufficient_information_stream( assert chunk.choices[0].delta.tool_calls is None ######## This is exactly the same as the non streaming case - assert content_generated == "I am a helpful assistant!" + assert ( + content_generated + == "I'm an artificial intelligence model known as a large language model (LLM) or conversational AI" + ) assert chunks == response_snapshot @@ -346,7 +352,7 @@ async def test_flash_llama_grammar_tools_sea_creatures_stream_auto( ): client = InferenceClient(base_url=f"{flash_llama_grammar_tools.base_url}/v1") stream = client.chat_completion( - max_tokens=100, + max_tokens=20, seed=24, tools=tools, tool_choice="auto", @@ -372,7 +378,7 @@ async def test_flash_llama_grammar_tools_sea_creatures_stream_auto( assert ( content_generated - == "There was a wise old octopus named Oracle. He lived in a cozy little cave beneath the waves with his best friend, a curious seahorse named Finley. One day, Finley met a playful dolphin named Daisy, and the three became inseparable. They spent their days exploring the ocean, playing hide-and-seek, and learning about the wonders of the sea from Oracle." 
+ == "Once upon a time, in a vibrant ocean filled with coral reefs and schools of shimmering fish," ) assert chunks == response_snapshot diff --git a/router/src/chat.rs b/router/src/chat.rs index 63bd53bf1..d5824fea0 100644 --- a/router/src/chat.rs +++ b/router/src/chat.rs @@ -6,22 +6,6 @@ use crate::{ use serde::Deserialize; use serde_json::Value; -#[derive(Debug, Deserialize)] -#[serde(rename_all = "snake_case")] -enum _NoTool { - NoTool, -} - -#[derive(Debug, Deserialize)] -struct NoToolCall { - _name: _NoTool, - content: String, -} -#[derive(Debug, Deserialize)] -struct NoTool { - function: NoToolCall, -} - #[derive(Debug, Deserialize)] struct ToolCall { _name: String, @@ -34,9 +18,19 @@ struct Call { function: ToolCall, } -pub(crate) fn parse_output( - generated_text: &str, -) -> Result<(Option>, Option), InferError> { +#[cfg_attr(test, derive(Debug))] +pub(crate) enum ChatEvent { + NoTool, + Events(Vec), +} + +#[cfg_attr(test, derive(Debug))] +pub(crate) enum ChatChoice { + NoTool, + ToolCalls(Vec), +} + +pub(crate) fn parse_output(generated_text: &str) -> Result { let call: Call = serde_json::from_str(generated_text).map_err(|e| { InferError::ToolError(format!( "Failed to parse generated text: {} {:?}", @@ -48,16 +42,7 @@ pub(crate) fn parse_output( match &name[..] { "no_tool" => { // parse the content message - let content_message = call - .function - .arguments - .get("content") - .and_then(Value::as_str) - .ok_or_else(|| { - InferError::ToolError("No `content` found in generated text".to_string()) - })? - .to_string(); - Ok((None, Some(content_message))) + Ok(ChatChoice::NoTool) } name => { let tool_calls = vec![crate::ToolCall { @@ -73,7 +58,7 @@ pub(crate) fn parse_output( })?, }, }]; - Ok((Some(tool_calls), None)) + Ok(ChatChoice::ToolCalls(tool_calls)) } } } @@ -158,10 +143,6 @@ enum StreamState { Buffering, /// We detected a tool call here Tool, - /// During the `content` part of the tool call - NoTool, - /// Finishing frames of the ToolCall - NoToolFinish, /// This is without tool calling Content, } @@ -202,34 +183,16 @@ impl ChatState { } } - pub fn push(&mut self, mut stream_token: StreamResponse) -> Vec { + pub fn push(&mut self, mut stream_token: StreamResponse) -> ChatEvent { let mut events = vec![]; let token_text = &stream_token.token.text; match self.state { StreamState::Buffering => { self.text.push_str(token_text); - // We have a special match for `no_tool` in order to capture directly the `content` - // key which should be re-emitted as raw text. - if let Ok(value) = serde_json::from_str::(&format!("{}\"}}}}", self.text)) { - self.state = StreamState::NoTool; - // Modifiy the content of the token to be whatever was captured by the JSON - stream_token.token.text = value.function.content; - let chat_complete = create_event_from_stream_token( - &stream_token, - self.logprobs, - false, - self.fingerprint.clone(), - self.model_id.clone(), - None, - self.id.clone(), - ); - - events.push(chat_complete); - } - // XXX Caution, here we do not postfix the quote, so that the current output - // Is necessarily finished with quotes for us to be able to parse. 
+ tracing::info!("Current text {:?}", self.text); let partial = &self.text; - let partial = partial.trim_end_matches(|c: char| c.is_whitespace() || c == ','); + let partial = + partial.trim_end_matches(|c: char| c.is_whitespace() || c == ',' || c == '}'); if let Ok(call) = serde_json::from_str::(&format!("{}}}}}", partial)) { // This can be no_tool before the content has been emitted if call.function._name != "no_tool" { @@ -246,6 +209,8 @@ impl ChatState { events.push(chat_complete); self.state = StreamState::Tool; + } else { + return ChatEvent::NoTool; } } } @@ -282,50 +247,6 @@ impl ChatState { events.push(chat_complete); } } - // if we skipped sending the buffer we need to avoid sending the following json key and quotes - // We have remainder tokens, ignore everying, - StreamState::NoToolFinish => {} - StreamState::NoTool => { - self.text.push_str(token_text); - if token_text.contains("\"") { - let mut text = self - .text - .trim_end_matches(|c: char| c.is_whitespace() || c == '}'); - // Trim once - if text.ends_with("\"") { - // Verify we have actually trimmed something - // The opposite can happen if the model is outputting inline JSON. - text = &text[..text.len() - 1]; - if let Ok(_value) = - serde_json::from_str::(&format!("{}\"}}}}", text)) - { - let mut text = token_text - .trim_end_matches(|c: char| c.is_whitespace() || c == '}'); - // Effectively trim_end_match('"', 1) - // because we do not want to eventually trim finishing escaped quotes - // {{"\"Something\""}} - if text.ends_with("\"") { - text = &text[..text.len() - 1]; - } - stream_token.token.text = text.to_string(); - self.state = StreamState::NoToolFinish; - } - } - } - // This escaping is usually inline json escaping and we can therefore remove it. - stream_token.token.text = stream_token.token.text.replace("\\", ""); - let chat_complete = create_event_from_stream_token( - &stream_token, - self.logprobs, - false, - self.fingerprint.clone(), - self.model_id.clone(), - None, - self.id.clone(), - ); - - events.push(chat_complete); - } StreamState::Content => { let chat_complete = create_event_from_stream_token( &stream_token, @@ -373,7 +294,7 @@ impl ChatState { events.push(chat_complete); } } - events + ChatEvent::Events(events) } } @@ -385,24 +306,6 @@ mod tests { use super::*; - fn get_text_content(event: &CompletionType) -> &String { - match event { - CompletionType::ChatCompletionChunk(ChatCompletionChunk { choices, .. }) => { - assert_eq!(choices.len(), 1); - if let ChatCompletionChoice { - delta: ChatCompletionDelta::Chat(TextMessage { content, .. }), - .. - } = &choices[0] - { - content - } else { - panic!("Expected plain message"); - } - } - _ => panic!("Unexpected chunk"), - } - } - fn get_tool_call_content(event: &CompletionType) -> (Option<&String>, &String) { match event { CompletionType::ChatCompletionChunk(ChatCompletionChunk { choices, .. }) => { @@ -456,24 +359,28 @@ mod tests { index: 0, details: None, }); - assert_eq!(events.len(), 1); - match &events[0] { - CompletionType::ChatCompletionChunk(ChatCompletionChunk { choices, .. }) => { - assert_eq!( - choices, - &[ChatCompletionChoice { - index: 0, - delta: ChatCompletionDelta::Chat(TextMessage { - role: "assistant".to_string(), - content: "Hi".to_string(), - tool_call_id: None, - }), - logprobs: None, - finish_reason: None, - }] - ); + if let ChatEvent::Events(events) = events { + assert_eq!(events.len(), 1); + match &events[0] { + CompletionType::ChatCompletionChunk(ChatCompletionChunk { choices, .. 
}) => { + assert_eq!( + choices, + &[ChatCompletionChoice { + index: 0, + delta: ChatCompletionDelta::Chat(TextMessage { + role: "assistant".to_string(), + content: "Hi".to_string(), + tool_call_id: None, + }), + logprobs: None, + finish_reason: None, + }] + ); + } + _ => panic!("Unexpected chunk"), } - _ => panic!("Unexpected chunk"), + } else { + panic!("Expected chat events"); } } @@ -507,43 +414,47 @@ mod tests { finish_reason: FinishReason::Length, }), }); - assert_eq!(events.len(), 2); - match &events[0] { - CompletionType::ChatCompletionChunk(ChatCompletionChunk { choices, .. }) => { - assert_eq!( - choices, - &[ChatCompletionChoice { - index: 0, - delta: ChatCompletionDelta::Chat(TextMessage { - role: "assistant".to_string(), - content: "Hi".to_string(), - tool_call_id: None, - }), - logprobs: None, - // HAS A FINISH REASON - finish_reason: Some("length".to_string()), - }] - ); + if let ChatEvent::Events(events) = events { + assert_eq!(events.len(), 2); + match &events[0] { + CompletionType::ChatCompletionChunk(ChatCompletionChunk { choices, .. }) => { + assert_eq!( + choices, + &[ChatCompletionChoice { + index: 0, + delta: ChatCompletionDelta::Chat(TextMessage { + role: "assistant".to_string(), + content: "Hi".to_string(), + tool_call_id: None, + }), + logprobs: None, + // HAS A FINISH REASON + finish_reason: Some("length".to_string()), + }] + ); + } + _ => panic!("Unexpected chunk"), } - _ => panic!("Unexpected chunk"), - } - match &events[1] { - CompletionType::ChatCompletionChunk(ChatCompletionChunk { usage, .. }) => { - assert_eq!( - *usage, - Some(Usage { - prompt_tokens: 2, - completion_tokens: 10, - total_tokens: 12, - }) - ); + match &events[1] { + CompletionType::ChatCompletionChunk(ChatCompletionChunk { usage, .. }) => { + assert_eq!( + *usage, + Some(Usage { + prompt_tokens: 2, + completion_tokens: 10, + total_tokens: 12, + }) + ); + } + _ => panic!("Unexpected chunk"), } - _ => panic!("Unexpected chunk"), + } else { + panic!("Expected chat events"); } } #[test] - fn test_chat_stream_tool_no_tool() { + fn test_chat_stream_tool_no_tool_simple() { let mut chat_state = ChatState::new( true, StreamOptions { @@ -597,217 +508,21 @@ mod tests { .collect(); // Initial ignored output - for token in &tokens[..14] { + for token in &tokens[..10] { let events = chat_state.push(token.clone()); - assert_eq!(events.len(), 0); - } - - // No tool output - let mut output = String::new(); - for token in &tokens[14..14 + 7] { - let events = chat_state.push(token.clone()); - assert_eq!(events.len(), 1); - let content = get_text_content(&events[0]); - output.push_str(content); - } - - assert_eq!(output, "I am a helpful assistant!"); - - // No tool finish - for token in &tokens[14 + 7..] 
{ - let events = chat_state.push(token.clone()); - assert_eq!(events.len(), 0); - } - } - - #[test] - fn test_chat_stream_tool_no_tool_many_quotes() { - let mut chat_state = ChatState::new( - true, - StreamOptions { - include_usage: true, - }, - "fingerprint".to_string(), - "model_id".to_string(), - false, - "0".to_string(), - ); - - let tokens = vec![ - "{\"".to_string(), - "function".to_string(), - "\":".to_string(), - " {\"".to_string(), - "_".to_string(), - "name".to_string(), - "\":".to_string(), - " \"".to_string(), - "no".to_string(), - "_tool".to_string(), - "\",".to_string(), - " \"".to_string(), - "content".to_string(), - "\":".to_string(), - " \"".to_string(), // Token 14 - "I".to_string(), // Event 1 - " am".to_string(), // Event 2 - " a".to_string(), // Event 3 - " helpful".to_string(), // Event 4 - " assistant".to_string(), // Event 5 - "!\\\"\"".to_string(), // Extra inside the string quote that would get removed - "}".to_string(), - "}".to_string(), - ]; - - // Initial ignored output - for text in &tokens[..14] { - let events = chat_state.push(StreamResponse { - generated_text: None, - token: Token { - id: 42, - text: text.to_string(), - logprob: 0.0, - special: false, - }, - top_tokens: vec![], - index: 0, - details: None, - }); - assert_eq!(events.len(), 0); - } - - // No tool output - let mut output = String::new(); - for text in &tokens[14..14 + 7] { - let events = chat_state.push(StreamResponse { - generated_text: None, - token: Token { - id: 42, - text: text.to_string(), - logprob: 0.0, - special: false, - }, - top_tokens: vec![], - index: 0, - details: None, - }); - assert_eq!(events.len(), 1); - match &events[0] { - CompletionType::ChatCompletionChunk(ChatCompletionChunk { choices, .. }) => { - assert_eq!(choices.len(), 1); - if let ChatCompletionChoice { - delta: ChatCompletionDelta::Chat(TextMessage { content, .. }), - .. - } = &choices[0] - { - output.push_str(content); - } else { - panic!("Expected plain message"); - } - } - _ => panic!("Unexpected chunk"), + if let ChatEvent::Events(events) = events { + assert_eq!(events.len(), 0, "{events:?}"); + } else { + panic!("Expected chat events"); } } - assert_eq!(output, "I am a helpful assistant!\""); - - // No tool finish - for text in &tokens[14 + 7..] 
{ - let events = chat_state.push(StreamResponse { - generated_text: None, - token: Token { - id: 42, - text: text.to_string(), - logprob: 0.0, - special: false, - }, - top_tokens: vec![], - index: 0, - details: None, - }); - assert_eq!(events.len(), 0); - } - } - - #[test] - fn test_chat_stream_tool_no_tool_inline_json() { - let mut chat_state = ChatState::new( - true, - StreamOptions { - include_usage: true, - }, - "fingerprint".to_string(), - "model_id".to_string(), - false, - "0".to_string(), - ); - - let tokens = vec![ - "{\"".to_string(), - "function".to_string(), - "\":".to_string(), - " {\"".to_string(), - "_".to_string(), - "name".to_string(), - "\":".to_string(), - " \"".to_string(), - "no".to_string(), - "_tool".to_string(), - "\",".to_string(), - " \"".to_string(), - "content".to_string(), - "\":".to_string(), - " \"".to_string(), // Token 14 - "{\\\"".to_string(), // Event 1 - "a".to_string(), // Event 1 - "\\\":".to_string(), // Event 1 - "2".to_string(), // Event 2 - ",\\".to_string(), // Event 2 - "\"".to_string(), // Event 2 - "b".to_string(), // Event 3 - "\\\": ".to_string(), // Event 4 - "1".to_string(), // Event 5 - "}".to_string(), // Event 5 - "\"}".to_string(), // Extra inside the string quote that would get removed - "}".to_string(), - ]; - let tokens: Vec<_> = tokens - .into_iter() - .map(|text| StreamResponse { - generated_text: None, - token: Token { - id: 42, - text: text.to_string(), - logprob: 0.0, - special: false, - }, - top_tokens: vec![], - index: 0, - details: None, - }) - .collect(); - - // Initial ignored output - for token in &tokens[..14] { - let events = chat_state.push(token.clone()); - assert_eq!(events.len(), 0); - } - // No tool output - let mut output = String::new(); - for token in &tokens[14..14 + 12] { - let events = chat_state.push(token.clone()); - assert_eq!(events.len(), 1, "Current text is {output:?}"); - let content = get_text_content(&events[0]); - output.push_str(content); - } - - assert_eq!(output, "{\"a\":2,\"b\": 1}"); - - // No tool finish - for token in &tokens[14 + 12..] { - let events = chat_state.push(token.clone()); - assert_eq!(events.len(), 0, "Extra events {events:?}"); + let events = chat_state.push(tokens[10].clone()); + if let ChatEvent::NoTool = events { + assert!(true); + } else { + panic!("Expected chat events"); } } @@ -859,26 +574,21 @@ mod tests { .collect(); // Initial ignored output - for token in &tokens[..13] { + for token in &tokens[..10] { let events = chat_state.push(token.clone()); - assert_eq!(events.len(), 0); + if let ChatEvent::Events(events) = events { + assert_eq!(events.len(), 0, "{events:?}"); + } else { + panic!("Expected chat events"); + } } // No tool output - let mut output = String::new(); - for token in &tokens[13..13 + 2] { - let events = chat_state.push(token.clone()); - assert_eq!(events.len(), 1, "Current text is {output:?}"); - let content = get_text_content(&events[0]); - output.push_str(content); - } - - assert_eq!(output, ""); - - // No tool finish - for token in &tokens[13 + 2..] 
{ - let events = chat_state.push(token.clone()); - assert_eq!(events.len(), 0, "Extra events {events:?}"); + let events = chat_state.push(tokens[10].clone()); + if let ChatEvent::NoTool = events { + assert!(true); + } else { + panic!("Expected chat events"); } } @@ -946,7 +656,11 @@ mod tests { // Initial ignored output for token in &tokens[..11] { let events = chat_state.push(token.clone()); - assert_eq!(events.len(), 0, "{events:?}"); + if let ChatEvent::Events(events) = events { + assert_eq!(events.len(), 0, "{events:?}"); + } else { + panic!("Expected chat events"); + } } // No tool output @@ -954,13 +668,17 @@ mod tests { let mut output_name = String::new(); for token in &tokens[11..11 + 17] { let events = chat_state.push(token.clone()); - assert_eq!(events.len(), 1); - let (name, arguments) = get_tool_call_content(&events[0]); - if let Some(name) = name { - assert_eq!(name, "get_current_weather"); - output_name.push_str(&name); + if let ChatEvent::Events(events) = events { + assert_eq!(events.len(), 1); + let (name, arguments) = get_tool_call_content(&events[0]); + if let Some(name) = name { + assert_eq!(name, "get_current_weather"); + output_name.push_str(&name); + } + output.push_str(arguments); + } else { + panic!("Expected chat events"); } - output.push_str(arguments); } assert_eq!(output_name, "get_current_weather"); @@ -972,7 +690,11 @@ mod tests { // No tool finish for token in &tokens[11 + 17..] { let events = chat_state.push(token.clone()); - assert_eq!(events.len(), 0); + if let ChatEvent::Events(events) = events { + assert_eq!(events.len(), 0, "{events:?}"); + } else { + panic!("Expected chat events"); + } } } } diff --git a/router/src/infer/tool_grammar.rs b/router/src/infer/tool_grammar.rs index 7770cd9d7..e4e208598 100644 --- a/router/src/infer/tool_grammar.rs +++ b/router/src/infer/tool_grammar.rs @@ -40,13 +40,13 @@ impl ToolGrammar { ), arguments: json!({ "type": "object", - "properties": { - "content": { - "type": "string", - "description": "The response content", - } - }, - "required": ["content"] + // "properties": { + // "content": { + // "type": "string", + // "description": "The response content", + // } + // }, + // "required": ["content"] }), }, })) diff --git a/router/src/server.rs b/router/src/server.rs index d0fa15d5f..0346b1f19 100644 --- a/router/src/server.rs +++ b/router/src/server.rs @@ -1,4 +1,4 @@ -use crate::chat::ChatState; +use crate::chat::{ChatChoice, ChatEvent, ChatState}; /// HTTP Server logic use crate::config::Config; use crate::infer::{Backend, Infer, InferError, InferResponse, InferStreamResponse}; @@ -1151,7 +1151,7 @@ pub(crate) async fn chat_completions( Extension(infer): Extension, Extension(compute_type): Extension, Extension(info): Extension, - Json(chat): Json, + Json(mut chat): Json, ) -> Result)> { let span = tracing::Span::current(); metrics::counter!("tgi_request_count").increment(1); @@ -1166,7 +1166,7 @@ pub(crate) async fn chat_completions( tracing::debug!("Got chat_template {:?}", infer.chat_template); let id = chat.next_tool_call_id(); let (generate_request, using_tools): (GenerateRequest, bool) = - chat.try_into_generate(&infer)?; + chat.clone().try_into_generate(&infer)?; span.record("parameters", format!("{:?}", generate_request.parameters)); let logprobs = logprobs.unwrap_or_default(); @@ -1178,21 +1178,41 @@ pub(crate) async fn chat_completions( let system_fingerprint = format!("{}-{}", info.version, info.docker_label.unwrap_or("native")); // switch on stream if stream { - let (headers, response_stream) = - 
generate_stream_internal(infer, compute_type, Json(generate_request), span).await; + let (headers, response_stream) = generate_stream_internal( + infer.clone(), + compute_type.clone(), + Json(generate_request), + span.clone(), + ) + .await; let response_stream = async_stream::stream! { let mut response_stream = Box::pin(response_stream); - let mut state = ChatState::new(using_tools, stream_options, system_fingerprint, model_id, logprobs, id); + let mut state = ChatState::new(using_tools, stream_options.clone(), system_fingerprint.clone(), model_id.clone(), logprobs, id.clone()); while let Some(result) = response_stream.next().await { match result{ Ok(stream_token) => { let events = state.push(stream_token); - for chat_complete in events{ - yield Ok(Event::default().json_data(chat_complete).unwrap_or_else(|e| { - tracing::error!("Failed to serialize ChatCompletionChunk: {:?}", e); - Event::default() - })); + match events{ + ChatEvent::NoTool => { + chat.tools = None; + chat.response_format = None; + let (generate_request, using_tools): (GenerateRequest, bool) = + chat.clone().try_into_generate(&infer).unwrap(); + assert!(!using_tools); + let (_headers, response_stream2) = + generate_stream_internal(infer.clone(), compute_type.clone(), Json(generate_request), span.clone()).await; + state = ChatState::new(using_tools, stream_options.clone(), system_fingerprint.clone(), model_id.clone(), logprobs, id.clone()); + response_stream = Box::pin(response_stream2); + } + ChatEvent::Events(events) => { + for chat_complete in events{ + yield Ok(Event::default().json_data(chat_complete).unwrap_or_else(|e| { + tracing::error!("Failed to serialize ChatCompletionChunk: {:?}", e); + Event::default() + })); + } + } } } Err(err) => yield Ok(err.into_openai_event()) @@ -1204,8 +1224,13 @@ pub(crate) async fn chat_completions( let sse = Sse::new(response_stream).keep_alive(KeepAlive::default()); Ok((headers, sse).into_response()) } else { - let (headers, input_length, Json(generation)) = - generate_internal(Extension(infer), compute_type, Json(generate_request), span).await?; + let (mut headers, mut input_length, Json(generation)) = generate_internal( + Extension(infer.clone()), + compute_type.clone(), + Json(generate_request), + span.clone(), + ) + .await?; let current_time = std::time::SystemTime::now() .duration_since(std::time::UNIX_EPOCH) @@ -1213,7 +1238,26 @@ pub(crate) async fn chat_completions( .as_secs(); let (tool_calls, output) = if using_tools { - crate::chat::parse_output(&generation.generated_text)? + match crate::chat::parse_output(&generation.generated_text)? 
{ + ChatChoice::NoTool => { + chat.tools = None; + chat.response_format = None; + let (generate_request, using_tools): (GenerateRequest, bool) = + chat.clone().try_into_generate(&infer)?; + assert!(!using_tools); + let (headers_final, input_length_final, Json(generation)) = generate_internal( + Extension(infer), + compute_type, + Json(generate_request), + span, + ) + .await?; + headers = headers_final; + input_length = input_length_final; + (None, Some(generation.generated_text)) + } + ChatChoice::ToolCalls(tool_calls) => (Some(tool_calls), None), + } } else { (None, Some(generation.generated_text)) }; From f01dc9e7435734118b15090b1680227c65b4c7da Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Wed, 12 Mar 2025 09:53:15 +0100 Subject: [PATCH 143/183] Update neuron backend (#3098) * feat(neuron): use AWS Neuron SDK 2.21.1 * feat(neuron): bump optimum-neuron version * feat(neuron): tag latest image for local tests * test(neuron): simplify sampling test --- Dockerfile.neuron | 24 +++++++++++------------ backends/neuron/Makefile | 1 + integration-tests/neuron/test_generate.py | 14 ++++--------- 3 files changed, 17 insertions(+), 22 deletions(-) diff --git a/Dockerfile.neuron b/Dockerfile.neuron index 5a22fab3d..b2e0eb2cf 100644 --- a/Dockerfile.neuron +++ b/Dockerfile.neuron @@ -5,7 +5,7 @@ RUN mkdir -p /tgi # Fetch the optimum-neuron sources directly to avoid relying on pypi deployments FROM alpine AS optimum-neuron RUN mkdir -p /optimum-neuron -ADD https://github.com/huggingface/optimum-neuron/archive/refs/tags/v0.0.28.tar.gz /optimum-neuron/sources.tar.gz +ADD https://github.com/huggingface/optimum-neuron/archive/refs/tags/v0.1.0.tar.gz /optimum-neuron/sources.tar.gz RUN tar -C /optimum-neuron -xf /optimum-neuron/sources.tar.gz --strip-components=1 # Build cargo components (adapted from TGI original Dockerfile) @@ -108,10 +108,10 @@ RUN wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEU # Install neuronx packages RUN apt-get update -y \ && apt-get install -y --no-install-recommends \ - aws-neuronx-dkms=2.18.20.0 \ - aws-neuronx-collectives=2.22.33.0-d2128d1aa \ - aws-neuronx-runtime-lib=2.22.19.0-5856c0b42 \ - aws-neuronx-tools=2.19.0.0 \ + aws-neuronx-dkms=2.19.64.0 \ + aws-neuronx-collectives=2.23.135.0-3e70920f2 \ + aws-neuronx-runtime-lib=2.23.112.0-9b5179492 \ + aws-neuronx-tools=2.20.204.0 \ libxml2 \ && rm -rf /var/lib/apt/lists/* \ && apt-get clean @@ -120,16 +120,16 @@ ENV PATH="/opt/bin/:/opt/aws/neuron/bin:${PATH}" # Install manually torch CPU version to avoid pulling CUDA RUN pip3 install \ - torch==2.1.2 \ - torchvision==0.16.2 \ + torch==2.5.1 \ + torchvision==0.20.1 \ --index-url https://download.pytorch.org/whl/cpu RUN pip3 install \ - neuronx-cc==2.15.143.0 \ - torch-neuronx==2.1.2.2.3.2 \ - transformers-neuronx==0.12.313 \ - neuronx-distributed==0.9.0 \ - libneuronxla==2.0.5347.0 \ + neuronx-cc==2.16.372.0 \ + torch-neuronx==2.5.1.2.4.0 \ + transformers-neuronx==0.13.322 \ + neuronx-distributed==0.10.1 \ + libneuronxla==2.1.681.0 \ --extra-index-url=https://pip.repos.neuron.amazonaws.com # Install HuggingFace packages diff --git a/backends/neuron/Makefile b/backends/neuron/Makefile index 6c5002ce8..066749713 100644 --- a/backends/neuron/Makefile +++ b/backends/neuron/Makefile @@ -25,6 +25,7 @@ image: --ulimit nofile=100000:100000 \ --build-arg VERSION=$(VERSION) \ -t text-generation-inference:$(VERSION)-neuron ${root_dir} + docker tag text-generation-inference:$(VERSION)-neuron text-generation-inference:latest-neuron install_server: make -C 
${mkfile_dir}/server install VERSION:=${VERSION}
diff --git a/integration-tests/neuron/test_generate.py b/integration-tests/neuron/test_generate.py
index 6a1b49904..f08043561 100644
--- a/integration-tests/neuron/test_generate.py
+++ b/integration-tests/neuron/test_generate.py
@@ -49,17 +49,11 @@ async def test_model_single_request(tgi_service):
 max_new_tokens=128,
 seed=42,
 )
- sample_expectations = {
- "gpt2": "Deep Learning",
- "llama": "Deep Learning",
- "mistral": "Deep learning",
- "qwen2": "Deep Learning",
- "granite": "Deep learning",
- }
- assert sample_expectations[service_name] in response
+ # The response must be different
+ assert not response.startswith(greedy_expectations[service_name])
- # Sampling with stop sequence
- stop_sequence = sample_expectations[service_name][-5:]
+ # Sampling with stop sequence (using one of the words returned from the previous test)
+ stop_sequence = response.split(" ")[-5]
 response = await tgi_service.client.text_generation(
 "What is Deep Learning?",
 do_sample=True,

From 4ac06ddf56ca30f13f8f89bdf009228d92c6ade4 Mon Sep 17 00:00:00 2001
From: Nicolas Patry
Date: Wed, 12 Mar 2025 10:11:33 +0100
Subject: [PATCH 144/183] Preparing release 3.2.0 (#3100)

* Preparing release 3.2.0

* Forgot the README.

* Update doc.
---
 Cargo.lock | 16 ++++++++--------
 Cargo.toml | 2 +-
 README.md | 6 +++---
 docs/openapi.json | 2 +-
 docs/source/backends/neuron.md | 2 +-
 .../source/basic_tutorials/gated_model_access.md | 2 +-
 docs/source/conceptual/quantization.md | 6 +++---
 docs/source/installation_amd.md | 2 +-
 docs/source/installation_intel.md | 4 ++--
 docs/source/installation_nvidia.md | 2 +-
 docs/source/quicktour.md | 4 ++--
 docs/source/reference/api_reference.md | 2 +-
 12 files changed, 25 insertions(+), 25 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 9b036e696..5e13e855c 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4617,7 +4617,7 @@ dependencies = [
 [[package]]
 name = "text-generation-backends-trtllm"
-version = "3.1.2-dev0"
+version = "3.2.1-dev0"
 dependencies = [
 "async-trait",
 "clap 4.5.30",
@@ -4638,7 +4638,7 @@ dependencies = [
 [[package]]
 name = "text-generation-benchmark"
-version = "3.1.2-dev0"
+version = "3.2.1-dev0"
 dependencies = [
 "average",
 "clap 4.5.30",
@@ -4658,7 +4658,7 @@ dependencies = [
 [[package]]
 name = "text-generation-client"
-version = "3.1.2-dev0"
+version = "3.2.1-dev0"
 dependencies = [
 "async-trait",
 "base64 0.22.1",
@@ -4676,7 +4676,7 @@ dependencies = [
 [[package]]
 name = "text-generation-launcher"
-version = "3.1.2-dev0"
+version = "3.2.1-dev0"
 dependencies = [
 "clap 4.5.30",
 "ctrlc",
@@ -4697,7 +4697,7 @@ dependencies = [
 [[package]]
 name = "text-generation-router"
-version = "3.1.2-dev0"
+version = "3.2.1-dev0"
 dependencies = [
 "anyhow",
 "async-stream",
@@ -4749,7 +4749,7 @@ dependencies = [
 [[package]]
 name = "text-generation-router-llamacpp"
-version = "3.1.2-dev0"
+version = "3.2.1-dev0"
 dependencies = [
 "async-trait",
 "bindgen 0.71.1",
@@ -4767,7 +4767,7 @@ dependencies = [
 [[package]]
 name = "text-generation-router-v2"
-version = "3.1.2-dev0"
+version = "3.2.1-dev0"
 dependencies = [
 "async-stream",
 "async-trait",
@@ -4816,7 +4816,7 @@ dependencies = [
 [[package]]
 name = "text-generation-router-v3"
-version = "3.1.2-dev0"
+version = "3.2.1-dev0"
 dependencies = [
 "async-stream",
 "async-trait",
diff --git a/Cargo.toml b/Cargo.toml
index 09a3a4b24..d52adec4e 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -21,7 +21,7 @@ default-members = [
 resolver = "2"
 [workspace.package]
-version = "3.1.2-dev0"
+version = "3.2.1-dev0"
edition = "2021" authors = ["Olivier Dehaene"] homepage = "https://github.com/huggingface/text-generation-inference" diff --git a/README.md b/README.md index e06339807..64499c078 100644 --- a/README.md +++ b/README.md @@ -84,7 +84,7 @@ model=HuggingFaceH4/zephyr-7b-beta volume=$PWD/data docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data \ - ghcr.io/huggingface/text-generation-inference:3.1.1 --model-id $model + ghcr.io/huggingface/text-generation-inference:3.2.0 --model-id $model ``` And then you can make requests like @@ -121,7 +121,7 @@ curl localhost:8080/v1/chat/completions \ **Note:** To use NVIDIA GPUs, you need to install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). We also recommend using NVIDIA drivers with CUDA version 12.2 or higher. For running the Docker container on a machine with no GPUs or CUDA support, it is enough to remove the `--gpus all` flag and add `--disable-custom-kernels`, please note CPU is not the intended platform for this project, so performance might be subpar. -**Note:** TGI supports AMD Instinct MI210 and MI250 GPUs. Details can be found in the [Supported Hardware documentation](https://huggingface.co/docs/text-generation-inference/installation_amd#using-tgi-with-amd-gpus). To use AMD GPUs, please use `docker run --device /dev/kfd --device /dev/dri --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.1.1-rocm --model-id $model` instead of the command above. +**Note:** TGI supports AMD Instinct MI210 and MI250 GPUs. Details can be found in the [Supported Hardware documentation](https://huggingface.co/docs/text-generation-inference/installation_amd#using-tgi-with-amd-gpus). To use AMD GPUs, please use `docker run --device /dev/kfd --device /dev/dri --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.2.0-rocm --model-id $model` instead of the command above. 
To see all options to serve your models (in the [code](https://github.com/huggingface/text-generation-inference/blob/main/launcher/src/main.rs) or in the cli): ``` @@ -152,7 +152,7 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading token= docker run --gpus all --shm-size 1g -e HF_TOKEN=$token -p 8080:80 -v $volume:/data \ - ghcr.io/huggingface/text-generation-inference:3.1.1 --model-id $model + ghcr.io/huggingface/text-generation-inference:3.2.0 --model-id $model ``` ### A note on Shared Memory (shm) diff --git a/docs/openapi.json b/docs/openapi.json index 5888e9ff9..85ca3f977 100644 --- a/docs/openapi.json +++ b/docs/openapi.json @@ -10,7 +10,7 @@ "name": "Apache 2.0", "url": "https://www.apache.org/licenses/LICENSE-2.0" }, - "version": "3.1.2-dev0" + "version": "3.2.1-dev0" }, "paths": { "/": { diff --git a/docs/source/backends/neuron.md b/docs/source/backends/neuron.md index e7ba88739..7a8b09e5f 100644 --- a/docs/source/backends/neuron.md +++ b/docs/source/backends/neuron.md @@ -31,7 +31,7 @@ deployment instructions in the model card: The service is launched simply by running the text-generation-inference container with two sets of parameters: ``` -docker run ghcr.io/huggingface/text-generation-inference:3.1.1-neuron +docker run ghcr.io/huggingface/text-generation-inference:3.2.0-neuron ``` - system parameters are used to map ports, volumes and devices between the host and the service, diff --git a/docs/source/basic_tutorials/gated_model_access.md b/docs/source/basic_tutorials/gated_model_access.md index 2cbd7e061..449969af6 100644 --- a/docs/source/basic_tutorials/gated_model_access.md +++ b/docs/source/basic_tutorials/gated_model_access.md @@ -19,6 +19,6 @@ docker run --gpus all \ --shm-size 1g \ -e HF_TOKEN=$token \ -p 8080:80 \ - -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.1.1 \ + -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.2.0 \ --model-id $model ``` diff --git a/docs/source/conceptual/quantization.md b/docs/source/conceptual/quantization.md index 77a64a88f..b86a4261c 100644 --- a/docs/source/conceptual/quantization.md +++ b/docs/source/conceptual/quantization.md @@ -19,7 +19,7 @@ bitsandbytes is a library used to apply 8-bit and 4-bit quantization to models. In TGI, you can use 8-bit quantization by adding `--quantize bitsandbytes` like below 👇 ```bash -docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.1.1 --model-id $model --quantize bitsandbytes +docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.2.0 --model-id $model --quantize bitsandbytes ``` 4-bit quantization is also possible with bitsandbytes. You can choose one of the following 4-bit data types: 4-bit float (`fp4`), or 4-bit `NormalFloat` (`nf4`). These data types were introduced in the context of parameter-efficient fine-tuning, but you can apply them for inference by automatically converting the model weights on load. 
@@ -27,7 +27,7 @@ docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingf In TGI, you can use 4-bit quantization by adding `--quantize bitsandbytes-nf4` or `--quantize bitsandbytes-fp4` like below 👇 ```bash -docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.1.1 --model-id $model --quantize bitsandbytes-nf4 +docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.2.0 --model-id $model --quantize bitsandbytes-nf4 ``` You can get more information about 8-bit quantization by reading this [blog post](https://huggingface.co/blog/hf-bitsandbytes-integration), and 4-bit quantization by reading [this blog post](https://huggingface.co/blog/4bit-transformers-bitsandbytes). @@ -48,7 +48,7 @@ $$({\hat{W}_{l}}^{*} = argmin_{\hat{W_{l}}} ||W_{l}X-\hat{W}_{l}X||^{2}_{2})$$ TGI allows you to both run an already GPTQ quantized model (see available models [here](https://huggingface.co/models?search=gptq)) or quantize a model of your choice using quantization script. You can run a quantized model by simply passing --quantize like below 👇 ```bash -docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.1.1 --model-id $model --quantize gptq +docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.2.0 --model-id $model --quantize gptq ``` Note that TGI's GPTQ implementation doesn't use [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) under the hood. However, models quantized using AutoGPTQ or Optimum can still be served by TGI. diff --git a/docs/source/installation_amd.md b/docs/source/installation_amd.md index 20ef26a8c..25b96fd3a 100644 --- a/docs/source/installation_amd.md +++ b/docs/source/installation_amd.md @@ -11,7 +11,7 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading docker run --rm -it --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \ --device=/dev/kfd --device=/dev/dri --group-add video \ --ipc=host --shm-size 256g --net host -v $volume:/data \ - ghcr.io/huggingface/text-generation-inference:3.1.1-rocm \ + ghcr.io/huggingface/text-generation-inference:3.2.0-rocm \ --model-id $model ``` diff --git a/docs/source/installation_intel.md b/docs/source/installation_intel.md index a0bf11d12..f075cc50b 100644 --- a/docs/source/installation_intel.md +++ b/docs/source/installation_intel.md @@ -12,7 +12,7 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading docker run --rm --privileged --cap-add=sys_nice \ --device=/dev/dri \ --ipc=host --shm-size 1g --net host -v $volume:/data \ - ghcr.io/huggingface/text-generation-inference:3.1.1-intel-xpu \ + ghcr.io/huggingface/text-generation-inference:3.2.0-intel-xpu \ --model-id $model --cuda-graphs 0 ``` @@ -29,7 +29,7 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading docker run --rm --privileged --cap-add=sys_nice \ --device=/dev/dri \ --ipc=host --shm-size 1g --net host -v $volume:/data \ - ghcr.io/huggingface/text-generation-inference:3.1.1-intel-cpu \ + ghcr.io/huggingface/text-generation-inference:3.2.0-intel-cpu \ --model-id $model --cuda-graphs 0 ``` diff --git a/docs/source/installation_nvidia.md b/docs/source/installation_nvidia.md index 3b20c7e1a..1adffb9a0 100644 --- a/docs/source/installation_nvidia.md +++ b/docs/source/installation_nvidia.md @@ -11,7 +11,7 @@ model=teknium/OpenHermes-2.5-Mistral-7B 
volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run docker run --gpus all --shm-size 64g -p 8080:80 -v $volume:/data \ - ghcr.io/huggingface/text-generation-inference:3.1.1 \ + ghcr.io/huggingface/text-generation-inference:3.2.0 \ --model-id $model ``` diff --git a/docs/source/quicktour.md b/docs/source/quicktour.md index be9051023..540a959e4 100644 --- a/docs/source/quicktour.md +++ b/docs/source/quicktour.md @@ -11,7 +11,7 @@ model=teknium/OpenHermes-2.5-Mistral-7B volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data \ - ghcr.io/huggingface/text-generation-inference:3.1.1 \ + ghcr.io/huggingface/text-generation-inference:3.2.0 \ --model-id $model ``` @@ -96,7 +96,7 @@ curl 127.0.0.1:8080/generate \ To see all possible deploy flags and options, you can use the `--help` flag. It's possible to configure the number of shards, quantization, generation parameters, and more. ```bash -docker run ghcr.io/huggingface/text-generation-inference:3.1.1 --help +docker run ghcr.io/huggingface/text-generation-inference:3.2.0 --help ``` diff --git a/docs/source/reference/api_reference.md b/docs/source/reference/api_reference.md index bc4029e46..cf5ff2d7d 100644 --- a/docs/source/reference/api_reference.md +++ b/docs/source/reference/api_reference.md @@ -163,7 +163,7 @@ hub = { # create Hugging Face Model Class huggingface_model = HuggingFaceModel( - image_uri=get_huggingface_llm_image_uri("huggingface",version="3.1.1"), + image_uri=get_huggingface_llm_image_uri("huggingface",version="3.2.0"), env=hub, role=role, ) From d4c6faa67be35c7db64072934b86649d93ceef10 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Wed, 12 Mar 2025 10:12:24 +0100 Subject: [PATCH 145/183] Try to fix on main CI color. (#3101) --- .github/workflows/build.yaml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index b7cc79556..e849f7c09 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -224,7 +224,12 @@ jobs: - name: Final id: final run: | - echo "docker_image=docker.io/huggingface/text-generation-inference-ci:sha-${{ env.GITHUB_SHA_SHORT}}${{ env.LABEL_EXTENSION }}" >> "$GITHUB_OUTPUT" + + if [ "${{ github.event_name }}" = "pull_request" ]; then + echo "docker_image=docker.io/huggingface/text-generation-inference-ci:sha-${{ env.GITHUB_SHA_SHORT}}${{ env.LABEL_EXTENSION }}" >> "$GITHUB_OUTPUT" + else + echo "docker_image=ghcr.io/huggingface/text-generation-inference:sha-${{ env.GITHUB_SHA_SHORT}}${{ env.LABEL_EXTENSION }}" >> "$GITHUB_OUTPUT" + fi echo "docker_devices=${{ env.DOCKER_DEVICES }}" >> "$GITHUB_OUTPUT" echo "docker_volume=${{ env.DOCKER_VOLUME }}" >> "$GITHUB_OUTPUT" echo "runs_on=${{ env.RUNS_ON }}" >> "$GITHUB_OUTPUT" From c73ae0bd88c47ad4daad294faa545d411cbae36e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Thu, 13 Mar 2025 10:36:29 +0100 Subject: [PATCH 146/183] Update to `kernels` 0.2.1 (#3084) * Update to `kernels` 0.2.1 The package was renamed from `hf-kernels` to `kernels`. The new version also updates the lockfile format. 
* Download kernels in `install-cuda` target --- .github/workflows/tests.yaml | 2 +- Dockerfile | 6 +- flake.lock | 8 +- flake.nix | 2 +- nix/server.nix | 4 +- server/Makefile | 1 + server/hf-kernels.lock | 7088 ----------------- server/kernels.lock | 272 + server/pyproject.toml | 4 +- .../text_generation_server/utils/kernels.py | 2 +- server/uv.lock | 30 +- 11 files changed, 303 insertions(+), 7116 deletions(-) delete mode 100644 server/hf-kernels.lock create mode 100644 server/kernels.lock diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index f8c17dc78..895b4dd4d 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -46,7 +46,7 @@ jobs: - name: Download locked kernels run: | source ./.venv/bin/activate - hf-kernels download server + kernels download server - name: Run server tests run: | source ./.venv/bin/activate diff --git a/Dockerfile b/Dockerfile index fb87968a1..cbb0977f2 100644 --- a/Dockerfile +++ b/Dockerfile @@ -183,12 +183,12 @@ COPY server server COPY server/Makefile server/Makefile ENV HF_KERNELS_CACHE=/kernels RUN cd server && \ - uv sync --frozen --extra gen --extra attention --extra bnb --extra accelerate --extra compressed-tensors --extra marlin --extra moe --extra quantize --extra peft --extra outlines --no-install-project --active && \ + uv sync --frozen --extra gen --extra bnb --extra accelerate --extra compressed-tensors --extra quantize --extra peft --extra outlines --no-install-project --active && \ make gen-server-raw && \ - hf-kernels download . + kernels download . RUN cd server && \ - uv sync --frozen --extra gen --extra attention --extra bnb --extra accelerate --extra compressed-tensors --extra marlin --extra moe --extra quantize --extra peft --extra outlines --active --python=${PYTHON_VERSION} && \ + uv sync --frozen --extra gen --extra bnb --extra accelerate --extra compressed-tensors --extra quantize --extra peft --extra outlines --active --python=${PYTHON_VERSION} && \ uv pip install nvidia-nccl-cu12==2.25.1 && \ pwd && \ text-generation-server --help diff --git a/flake.lock b/flake.lock index 719cdeea0..d049a71d7 100644 --- a/flake.lock +++ b/flake.lock @@ -978,16 +978,16 @@ "nixpkgs": "nixpkgs_6" }, "locked": { - "lastModified": 1740049068, - "narHash": "sha256-heYzYOt+TSnRKHIV24s74yEjLkTbBfjNCWHdQEX++eI=", + "lastModified": 1741617161, + "narHash": "sha256-cwKYAsIVSLtoLbG48+oi3NkSrvuZRLYs8lkJmpDsTw0=", "owner": "huggingface", "repo": "text-generation-inference-nix", - "rev": "143e8451efa22b120f97e6698508e9a0aed82769", + "rev": "5946021ec6cb6aae18158a9dc27f893cfbab2925", "type": "github" }, "original": { "owner": "huggingface", - "ref": "hub-rotary", + "ref": "kernels-0.2.0", "repo": "text-generation-inference-nix", "type": "github" } diff --git a/flake.nix b/flake.nix index 5058667a8..ac4331c24 100644 --- a/flake.nix +++ b/flake.nix @@ -5,7 +5,7 @@ inputs.nixpkgs.follows = "tgi-nix/nixpkgs"; }; nix-filter.url = "github:numtide/nix-filter"; - tgi-nix.url = "github:huggingface/text-generation-inference-nix/hub-rotary"; + tgi-nix.url = "github:huggingface/text-generation-inference-nix/kernels-0.2.0"; nixpkgs.follows = "tgi-nix/nixpkgs"; flake-utils.url = "github:numtide/flake-utils"; rust-overlay = { diff --git a/nix/server.nix b/nix/server.nix index 0640fe3a1..1d00b9780 100644 --- a/nix/server.nix +++ b/nix/server.nix @@ -16,8 +16,8 @@ grpcio-reflection, grpcio-status, grpcio-tools, - hf-kernels, hf-transfer, + kernels, loguru, mamba-ssm, moe, @@ -91,8 +91,8 @@ buildPythonPackage { grpcio-reflection 
grpcio-status grpcio-tools - hf-kernels hf-transfer + kernels loguru mamba-ssm moe diff --git a/server/Makefile b/server/Makefile index 0db6f89b4..3abc917e4 100644 --- a/server/Makefile +++ b/server/Makefile @@ -39,6 +39,7 @@ install: install-cuda install-cuda: install-server install-flash-attention-v2-cuda install-flash-attention uv pip install -e ".[attention,bnb,marlin,moe]" uv pip install nvidia-nccl-cu12==2.22.3 + kernels download . install-rocm: install-server install-flash-attention-v2-rocm install-vllm-rocm diff --git a/server/hf-kernels.lock b/server/hf-kernels.lock deleted file mode 100644 index 7dc759436..000000000 --- a/server/hf-kernels.lock +++ /dev/null @@ -1,7088 +0,0 @@ -[ - { - "repo_id": "kernels-community/paged-attention", - "sha": "331b7e63a6b592799c8bc992f681bb1ee2c865a2", - "files": [ - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/paged_attention/__init__.py", - "blob_id": "9de56043369487facc1f163df6bd319c9806e5ca" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/paged_attention/_custom_ops.py", - "blob_id": "a0c0b8db085468dee5100c98d14106a9ee917bf2" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/paged_attention/_ops.py", - "blob_id": "609570440c63122010e6254ac2f92d4e4e52ec02" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/paged_attention/_paged_attention_fao6f4gjjrpl6.abi3.so", - "blob_id": "a4e60f2c567eb63c84430e9b80acaa0aa6974b1e" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/paged_attention/platforms.py", - "blob_id": "aa06132e74cd7fb634044a76e528979b02a3559b" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/paged_attention/__init__.py", - "blob_id": "9de56043369487facc1f163df6bd319c9806e5ca" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/paged_attention/_custom_ops.py", - "blob_id": "a0c0b8db085468dee5100c98d14106a9ee917bf2" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/paged_attention/_ops.py", - "blob_id": "9e52382b912b4e2d07f84982f762345debdbbfc8" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/paged_attention/_paged_attention_eo7ts45r6k64y.abi3.so", - "blob_id": "c20f9501a41daa820dfda27434674d032931b51e" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/paged_attention/platforms.py", - "blob_id": "aa06132e74cd7fb634044a76e528979b02a3559b" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/paged_attention/__init__.py", - "blob_id": "9de56043369487facc1f163df6bd319c9806e5ca" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/paged_attention/_custom_ops.py", - "blob_id": "a0c0b8db085468dee5100c98d14106a9ee917bf2" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/paged_attention/_ops.py", - "blob_id": "5f01e3f8c4ae3a031f109f78e010014d34347647" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/paged_attention/_paged_attention_5odgyxqhwqtv2.abi3.so", - "blob_id": "74f9714690337f49661c641a4f60f6e1e1f56cfa" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/paged_attention/platforms.py", - "blob_id": "aa06132e74cd7fb634044a76e528979b02a3559b" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/paged_attention/__init__.py", - "blob_id": "9de56043369487facc1f163df6bd319c9806e5ca" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/paged_attention/_custom_ops.py", - "blob_id": "a0c0b8db085468dee5100c98d14106a9ee917bf2" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/paged_attention/_ops.py", - "blob_id": 
"a3016a6b1cd7ae051012084bbd39d6f2e0913ace" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/paged_attention/_paged_attention_uy2moinaww2jc.abi3.so", - "blob_id": "445652acd4719542710cda86a2d08c70a56c8094" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/paged_attention/platforms.py", - "blob_id": "aa06132e74cd7fb634044a76e528979b02a3559b" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/paged_attention/__init__.py", - "blob_id": "9de56043369487facc1f163df6bd319c9806e5ca" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/paged_attention/_custom_ops.py", - "blob_id": "a0c0b8db085468dee5100c98d14106a9ee917bf2" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/paged_attention/_ops.py", - "blob_id": "e2cd992a80d4b938f243f0e6060e863278aca7f6" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/paged_attention/_paged_attention_35dt23tewn2p2.abi3.so", - "blob_id": "1f6414c382a753edb7512927ac5f3e31b196531d" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/paged_attention/platforms.py", - "blob_id": "aa06132e74cd7fb634044a76e528979b02a3559b" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/paged_attention/__init__.py", - "blob_id": "9de56043369487facc1f163df6bd319c9806e5ca" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/paged_attention/_custom_ops.py", - "blob_id": "a0c0b8db085468dee5100c98d14106a9ee917bf2" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/paged_attention/_ops.py", - "blob_id": "150412d67365be8ae5668f83d1939148bb576050" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/paged_attention/_paged_attention_fhq57q56w3m5o.abi3.so", - "blob_id": "ee97eee26a4de8d14d7ccdadaf406eed8405de39" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/paged_attention/platforms.py", - "blob_id": "aa06132e74cd7fb634044a76e528979b02a3559b" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/paged_attention/__init__.py", - "blob_id": "9de56043369487facc1f163df6bd319c9806e5ca" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/paged_attention/_custom_ops.py", - "blob_id": "a0c0b8db085468dee5100c98d14106a9ee917bf2" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/paged_attention/_ops.py", - "blob_id": "2bfef111c96308e595eb628bc88ab660a443089c" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/paged_attention/_paged_attention_xvepb4loq5mm2.abi3.so", - "blob_id": "1ea51bd49f8ec76bbe306a261021da52fe6a980f" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/paged_attention/platforms.py", - "blob_id": "aa06132e74cd7fb634044a76e528979b02a3559b" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/paged_attention/__init__.py", - "blob_id": "9de56043369487facc1f163df6bd319c9806e5ca" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/paged_attention/_custom_ops.py", - "blob_id": "a0c0b8db085468dee5100c98d14106a9ee917bf2" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/paged_attention/_ops.py", - "blob_id": "8928daeec47128544cef187bf18f214fc2238019" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/paged_attention/_paged_attention_uyfdujhnc2xoe.abi3.so", - "blob_id": "cf8ebe40f27db0fa87c46d7b4066494e65843820" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/paged_attention/platforms.py", - "blob_id": "aa06132e74cd7fb634044a76e528979b02a3559b" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/paged_attention/__init__.py", - 
"blob_id": "9de56043369487facc1f163df6bd319c9806e5ca" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/paged_attention/_custom_ops.py", - "blob_id": "a0c0b8db085468dee5100c98d14106a9ee917bf2" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/paged_attention/_ops.py", - "blob_id": "dff8537df63e1ef37769a6b7ba6b8c58192d7faa" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/paged_attention/_paged_attention_pervvqmod6pi4.abi3.so", - "blob_id": "77eb42e3471e9aa84d1f5d9854995c9737ed6bf3" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/paged_attention/platforms.py", - "blob_id": "aa06132e74cd7fb634044a76e528979b02a3559b" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/paged_attention/__init__.py", - "blob_id": "9de56043369487facc1f163df6bd319c9806e5ca" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/paged_attention/_custom_ops.py", - "blob_id": "a0c0b8db085468dee5100c98d14106a9ee917bf2" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/paged_attention/_ops.py", - "blob_id": "543c64d1589cb1747d7dc1ac29bd8f2cbeb61ab7" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/paged_attention/_paged_attention_24rowhxd5ebcc.abi3.so", - "blob_id": "43ec3529d8eac816c31cc1eaad4cc2baa3cbd3d6" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/paged_attention/platforms.py", - "blob_id": "aa06132e74cd7fb634044a76e528979b02a3559b" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/paged_attention/__init__.py", - "blob_id": "9de56043369487facc1f163df6bd319c9806e5ca" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/paged_attention/_custom_ops.py", - "blob_id": "a0c0b8db085468dee5100c98d14106a9ee917bf2" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/paged_attention/_ops.py", - "blob_id": "1d62b9bb1cfb040d7f68cd108ac9067100b4cf2d" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/paged_attention/_paged_attention_5yleoqr3zje4w.abi3.so", - "blob_id": "ffed60cc0a3948bdea6aa7fb4d486d9b943215ec" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/paged_attention/platforms.py", - "blob_id": "aa06132e74cd7fb634044a76e528979b02a3559b" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/paged_attention/__init__.py", - "blob_id": "9de56043369487facc1f163df6bd319c9806e5ca" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/paged_attention/_custom_ops.py", - "blob_id": "a0c0b8db085468dee5100c98d14106a9ee917bf2" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/paged_attention/_ops.py", - "blob_id": "ee817d13be64b46e3cb44ad192af4a5f3817bbf7" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/paged_attention/_paged_attention_3rbp7xipfucgo.abi3.so", - "blob_id": "5d5b3ffda2fd6a830d12341bab26dc5ec03f4a86" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/paged_attention/platforms.py", - "blob_id": "aa06132e74cd7fb634044a76e528979b02a3559b" - } - ] - }, - { - "repo_id": "kernels-community/moe", - "sha": "605a216f507b9a97b543140dee8937a4622069a8", - "files": [ - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/__init__.py", - "blob_id": "cc806778863c03ccb3157343cd6331c1c6ca332c" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/_moe_2f2wzwk42r5t2.abi3.so", - "blob_id": "b1f0ac7d52d2cbb7b49dd4e3e23eaf0b6acd3364" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/_ops.py", - "blob_id": "83a6a6a42d633c9b40e263b40b028086d2609b80" - }, - { - "filename": 
"build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "56c1a4e3af0b4a93fff71028d8e04bf73f0abb29" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "d3677bebb82a7f3f19344ef6471626493cf2c5bb" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "265768fb900ccfe9612b4a0d25973e6618f22a79" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "d3be23dfc903ba61d3d4d79c0230952b24d2ead0" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "589f5d39f31418d5121e7cbb2e6f2894b0a7ed32" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", - "blob_id": "2c78bfaba7890772bf266721f5577202ea443882" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "4da841e74a79f9589fecac1fa557ea132d34805f" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "200356713c0d0a76e199671c7ec8f10d0e5ee0ac" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "e076615ee541a5043556f630ecf0946c4e2c1408" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "ee896554b921040d7810bb6e9368cc200777951d" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "05aed8b1c81492151d128ef251afc510d8cc8ed5" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json", - "blob_id": "9262a74a4a0e1e3789f260a3ef7f6cb9551f3f2b" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "d251f9b5accaec977fc87a0999cd56ee387fc650" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "0ecf814a28a9441e89f892eb3d63dcf8dcb0dd97" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "51ad5b299eb22465fa80530d12bdd5d7a03ce398" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "ee5119182556cf49434c10e56cf04e3baeb26408" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "68793c77b33c4f4b97d0a4b780fcbe8043c799de" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "612910720ed9439e56c4af4c03f30fee224fac80" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json", - 
"blob_id": "039a10ed127b77836a7f41c03513292613852b30" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "3793fcafee60bc7e8f5f12d601cb3192abfa9ca8" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "51d03d8607122d7b9bc20ba48d8432d62367fa00" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", - "blob_id": "26f9abd6b789e9dd0f83ec7721fd1bae8aa76bec" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "cd0cdbea0c3372674cb610870dd0b30325864549" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "64be6e6591422aa0f441c3747b6c49850929652e" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "0a6a6a73fa45e270f01ba7ebdc6d9d55bf9daad3" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "ba9041d008507e31ae4179ef2bc863a49c606582" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "7a7508aab04599cb06641c835d8b0a14f54d0716" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "dbf9a2dd6f048d8adee290961e2aea72035f7615" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", - "blob_id": "bbb2386046b1135a2cc7ab7cb26c1d0b039bcf3a" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "57055453aa24c831dad9ac8e37fdab707c63ef91" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "8cc6c643f236d2f7f9ad29354d9e469d00b20d3f" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "d4c9ddd12972ac0b5fd2be11a9cd1075906e3978" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "b2799ed3a866e25b78d60d92910c000ebb21ff71" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "b8d3be2313fa14025d8aeb2fd11e0d1ee997ffa6" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json", - "blob_id": "6a976788f9b10af19ebcfe582a69cbc627f9457b" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "3f3ccdafa88f3452a695efad4cb9622d6ae79e6a" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json", - "blob_id": "0a46390b2e31bba6a7c3ab2c9f6c8de6004857bb" - }, - { - "filename": 
"build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json", - "blob_id": "f4c0f8417b384870050a95e0cf57edbdf6352b23" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "5c8185cfdeec167ec4b88de51b4b395e28769cc5" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "97c9f4445b166657ad29f1db9fc8281f9c463ec4" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "0bb423b28f5ab3825929a4870b96393262a9dd9f" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "55571873395464a3b58f549523905f439a8f1716" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "26bcbf26970c7a77c99e2c8eacd83eefa86967bf" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json", - "blob_id": "91011e64c7de4505e9bb462bc70e6a3e7affa878" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json", - "blob_id": "b41f9d443e50678334f906b44fce6d018d69500e" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "edf2a38d12ad3f420f232d2cd61ab149ad138725" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "673bae2ba8ef80ed4d4930739ca7daf0e8f28ee1" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "b2100cebb7f589747430be9ca8c8db368c152d78" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json", - "blob_id": "d720deb4bdd73d194b1023c99e190b8fcfecdaef" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "dbc624731f5cb9afcdc9213183d00d1e5edd4a00" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "cc614e635ea57327c610ce79e99ae5339614f22e" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "32c0c9da471cbe479044095e0ed14a0f54b73620" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json", - "blob_id": "f807d4a5abaed9dd686df26837f2dd9f6161300f" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "f578c8d0160ac3ef85b53c8539d3675455a97173" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "918f6839620cbab1f30b0f9383a9129c2cf2cf3d" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "e341a67917d5177bacb3f6767e7b6d92539826ad" - }, - { - "filename": 
"build/torch25-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "34b916e574f88c65db1dac5889d74a990dc25e9b" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "6496a38fba8ae09b3025a75f357815b9d6a5e3f4" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "3618053b65831b95c4bb0f20ef3b9aa816b2d637" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "46a982f5ee9a4bd67ce244b101c576efeeb53b78" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "035ec027fa56622196b24a03a5042ce010deaebf" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "8b49f2781cb54d19a2789767ebb7e8c3fb55b981" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "851bc9f9f0b50b41451b929eaa518869b6a05412" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "d1227c2157990216d2ca51c69ad0944017f53b6a" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "77ba0d7477bdbcb036a43263e7aaa6b6913f8f4e" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "1c61451fb34e52deec827f8f63c80fb15830c202" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "63e661c80de6a7b1422f7a994a2ee7a4b724911c" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "cf354037903c0d1fcd077c4647aabce026a723fb" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "0a5d7bfdba4852da9ed08d1bc27cd7d521d09965" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "eccb86a76df0d7302b760ab6d83a8ceb9fa9d0d9" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "cb91a279d423d0ca25197e0edd5e8c2f4da58720" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "88af48431d8b8791af8df03429704606b670f1f7" - }, - { - "filename": 
"build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "dd069726d7ed4dcbb449af243f4f4af21815f854" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "7febe3d272b4bb76500f7c6b523396129fd53680" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "56b939e52fac3ed53a4e0ba640c40010cb3af30a" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "63d9a0bf5d79ddaaad547d44338ad4b959ad72b1" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "7fa398c15a2a535401709b0f25e20f6e4b23e58e" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "f15d8f64c7090bd71d0091a524c65d7818fec38e" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9d7658bfc41b2c8fd4daf3fbdf62d15936d3d546" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "cd3e07804fdec10c2cfb291c1ede3ba67b753f9c" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9d5a329d7466a37c0ca68a65a089fbb99f9327a9" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "03dba5ad15ba5f7f49100a5c78e8685e64334b2a" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "96e1594a3eabbaedc792b84b07f05ae8752b7251" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "5ffd367df833d773355590220598a3c7eceba4e0" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9a5ff48b8942957dde9b862aed848390dd267948" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "eabc423949a24c2a1fb2368a73e5249caf8d07df" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "386928de139ce718f28222b9c1a6555df3958491" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "51e237b91b8e775a36bcf783c078c2c1cecbcbd2" - }, - { - "filename": 
"build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "6280219c9ee7d26f7e2fd3625dc92d847ddc7982" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "40c01c0b92b4b26fe480879dda33f18c5eb59a6d" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "c6fd3659799bc31e17f3577e7f0e8d7268faf1fb" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "160f12ed3f95a6967439ff53bc3e3a2cdc97c700" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "e5c4a1d2c94e5c7864f462e083ea5f530b8efe3f" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "2bf5eb27e38208871d50348b170c8c74b80fc519" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9c908e80406587da4d246ce4e3a8a98a14c875b1" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "0a1e14cffbb2a894a701352193947d272427db0d" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "15b1c93f60fc5068ba11b82b6d5924dd2024a824" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "f78e7060e6840ff721d306db556636b0bbc8d9b3" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "8ff12e64c172f5a5d0fbdf900728fe60b33877e2" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "4532f93681e2be175b1bf94f81bfde711821cd60" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "1d3ce5c94c2d9a4a1637204efb3b14f7a5579bdb" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "ca7f32b9552b479dc05495792b7e426db5eb1b56" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "5acea242cc0ad094cba8ee5f568ff88afb1b41ae" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "3ab5796ee15b6ec8d4ab1f4ab5a594fecb30e4b4" - }, - { - "filename": 
"build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "58cdd93e90b8c29bc7a211861711565dbeeb529a" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "b72e0371d1421a1decc9d57860f83eea8f790942" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "3cb7eaa07c745fd3aa2b3242780a7061bedac1de" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "293adce387e066fce75b6e606d4b8b6a5aa10bdb" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/fp8.py", - "blob_id": "23bd7d6703104b0020671cc6ba6f78a6df37e4bf" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/fp8_utils.py", - "blob_id": "acb4f3e3bb1a34f209fdac9ecca8c123aaf67f12" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/fused_marlin_moe.py", - "blob_id": "b3e0a5c24599730faf973fad3cf3fb6031a30522" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/fused_moe.py", - "blob_id": "af2d798cbe5d7c3c1760ce79f717ab5f6d7700ba" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/platforms.py", - "blob_id": "735fab87f2add390f7bf6408ebe31d1f5de6d02b" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/scalar_type.py", - "blob_id": "ea749fe8247b6846620ccbba30ddf48d914ca4e1" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/utils/__init__.py", - "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/utils/marlin_utils.py", - "blob_id": "5037f774b8a8b7e88d822efacbb3b4ea5b95d356" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/utils/marlin_utils_test.py", - "blob_id": "83faac032ca93b3564c620c5b4b1ef63c74aaddf" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/moe/utils/quant_utils.py", - "blob_id": "5819ab753e57655185572ce1e49c24e6268171b4" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/__init__.py", - "blob_id": "cc806778863c03ccb3157343cd6331c1c6ca332c" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/_moe_qx7m4hiw6tx7s.abi3.so", - "blob_id": "cfdb823fabc296c258f58ce8e03a347be7eb558f" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/_ops.py", - "blob_id": "0d77c5d3e29106cf62a45153770fafbff59b2932" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "56c1a4e3af0b4a93fff71028d8e04bf73f0abb29" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "d3677bebb82a7f3f19344ef6471626493cf2c5bb" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "265768fb900ccfe9612b4a0d25973e6618f22a79" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "d3be23dfc903ba61d3d4d79c0230952b24d2ead0" - }, - { - "filename": 
"build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "589f5d39f31418d5121e7cbb2e6f2894b0a7ed32" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", - "blob_id": "2c78bfaba7890772bf266721f5577202ea443882" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "4da841e74a79f9589fecac1fa557ea132d34805f" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "200356713c0d0a76e199671c7ec8f10d0e5ee0ac" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "e076615ee541a5043556f630ecf0946c4e2c1408" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "ee896554b921040d7810bb6e9368cc200777951d" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "05aed8b1c81492151d128ef251afc510d8cc8ed5" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json", - "blob_id": "9262a74a4a0e1e3789f260a3ef7f6cb9551f3f2b" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "d251f9b5accaec977fc87a0999cd56ee387fc650" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "0ecf814a28a9441e89f892eb3d63dcf8dcb0dd97" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "51ad5b299eb22465fa80530d12bdd5d7a03ce398" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "ee5119182556cf49434c10e56cf04e3baeb26408" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "68793c77b33c4f4b97d0a4b780fcbe8043c799de" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "612910720ed9439e56c4af4c03f30fee224fac80" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "039a10ed127b77836a7f41c03513292613852b30" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "3793fcafee60bc7e8f5f12d601cb3192abfa9ca8" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "51d03d8607122d7b9bc20ba48d8432d62367fa00" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", - "blob_id": "26f9abd6b789e9dd0f83ec7721fd1bae8aa76bec" - }, - { - "filename": 
"build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "cd0cdbea0c3372674cb610870dd0b30325864549" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "64be6e6591422aa0f441c3747b6c49850929652e" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "0a6a6a73fa45e270f01ba7ebdc6d9d55bf9daad3" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "ba9041d008507e31ae4179ef2bc863a49c606582" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "7a7508aab04599cb06641c835d8b0a14f54d0716" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "dbf9a2dd6f048d8adee290961e2aea72035f7615" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", - "blob_id": "bbb2386046b1135a2cc7ab7cb26c1d0b039bcf3a" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "57055453aa24c831dad9ac8e37fdab707c63ef91" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "8cc6c643f236d2f7f9ad29354d9e469d00b20d3f" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "d4c9ddd12972ac0b5fd2be11a9cd1075906e3978" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "b2799ed3a866e25b78d60d92910c000ebb21ff71" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "b8d3be2313fa14025d8aeb2fd11e0d1ee997ffa6" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json", - "blob_id": "6a976788f9b10af19ebcfe582a69cbc627f9457b" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "3f3ccdafa88f3452a695efad4cb9622d6ae79e6a" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json", - "blob_id": "0a46390b2e31bba6a7c3ab2c9f6c8de6004857bb" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json", - "blob_id": "f4c0f8417b384870050a95e0cf57edbdf6352b23" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "5c8185cfdeec167ec4b88de51b4b395e28769cc5" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "97c9f4445b166657ad29f1db9fc8281f9c463ec4" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": 
"0bb423b28f5ab3825929a4870b96393262a9dd9f" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "55571873395464a3b58f549523905f439a8f1716" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "26bcbf26970c7a77c99e2c8eacd83eefa86967bf" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json", - "blob_id": "91011e64c7de4505e9bb462bc70e6a3e7affa878" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json", - "blob_id": "b41f9d443e50678334f906b44fce6d018d69500e" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "edf2a38d12ad3f420f232d2cd61ab149ad138725" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "673bae2ba8ef80ed4d4930739ca7daf0e8f28ee1" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "b2100cebb7f589747430be9ca8c8db368c152d78" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json", - "blob_id": "d720deb4bdd73d194b1023c99e190b8fcfecdaef" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "dbc624731f5cb9afcdc9213183d00d1e5edd4a00" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "cc614e635ea57327c610ce79e99ae5339614f22e" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "32c0c9da471cbe479044095e0ed14a0f54b73620" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json", - "blob_id": "f807d4a5abaed9dd686df26837f2dd9f6161300f" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "f578c8d0160ac3ef85b53c8539d3675455a97173" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "918f6839620cbab1f30b0f9383a9129c2cf2cf3d" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "e341a67917d5177bacb3f6767e7b6d92539826ad" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "34b916e574f88c65db1dac5889d74a990dc25e9b" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "6496a38fba8ae09b3025a75f357815b9d6a5e3f4" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "3618053b65831b95c4bb0f20ef3b9aa816b2d637" - }, - { - "filename": 
"build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "46a982f5ee9a4bd67ce244b101c576efeeb53b78" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "035ec027fa56622196b24a03a5042ce010deaebf" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "8b49f2781cb54d19a2789767ebb7e8c3fb55b981" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "851bc9f9f0b50b41451b929eaa518869b6a05412" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "d1227c2157990216d2ca51c69ad0944017f53b6a" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "77ba0d7477bdbcb036a43263e7aaa6b6913f8f4e" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "1c61451fb34e52deec827f8f63c80fb15830c202" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "63e661c80de6a7b1422f7a994a2ee7a4b724911c" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "cf354037903c0d1fcd077c4647aabce026a723fb" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "0a5d7bfdba4852da9ed08d1bc27cd7d521d09965" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "eccb86a76df0d7302b760ab6d83a8ceb9fa9d0d9" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "cb91a279d423d0ca25197e0edd5e8c2f4da58720" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "88af48431d8b8791af8df03429704606b670f1f7" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "dd069726d7ed4dcbb449af243f4f4af21815f854" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "7febe3d272b4bb76500f7c6b523396129fd53680" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "56b939e52fac3ed53a4e0ba640c40010cb3af30a" - }, - { - "filename": 
"build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "63d9a0bf5d79ddaaad547d44338ad4b959ad72b1" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "7fa398c15a2a535401709b0f25e20f6e4b23e58e" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "f15d8f64c7090bd71d0091a524c65d7818fec38e" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9d7658bfc41b2c8fd4daf3fbdf62d15936d3d546" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "cd3e07804fdec10c2cfb291c1ede3ba67b753f9c" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9d5a329d7466a37c0ca68a65a089fbb99f9327a9" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "03dba5ad15ba5f7f49100a5c78e8685e64334b2a" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "96e1594a3eabbaedc792b84b07f05ae8752b7251" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "5ffd367df833d773355590220598a3c7eceba4e0" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9a5ff48b8942957dde9b862aed848390dd267948" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "eabc423949a24c2a1fb2368a73e5249caf8d07df" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "386928de139ce718f28222b9c1a6555df3958491" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "51e237b91b8e775a36bcf783c078c2c1cecbcbd2" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "6280219c9ee7d26f7e2fd3625dc92d847ddc7982" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "40c01c0b92b4b26fe480879dda33f18c5eb59a6d" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "c6fd3659799bc31e17f3577e7f0e8d7268faf1fb" - }, - { - "filename": 
"build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "160f12ed3f95a6967439ff53bc3e3a2cdc97c700" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "e5c4a1d2c94e5c7864f462e083ea5f530b8efe3f" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "2bf5eb27e38208871d50348b170c8c74b80fc519" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9c908e80406587da4d246ce4e3a8a98a14c875b1" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "0a1e14cffbb2a894a701352193947d272427db0d" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "15b1c93f60fc5068ba11b82b6d5924dd2024a824" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "f78e7060e6840ff721d306db556636b0bbc8d9b3" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "8ff12e64c172f5a5d0fbdf900728fe60b33877e2" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "4532f93681e2be175b1bf94f81bfde711821cd60" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "1d3ce5c94c2d9a4a1637204efb3b14f7a5579bdb" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "ca7f32b9552b479dc05495792b7e426db5eb1b56" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "5acea242cc0ad094cba8ee5f568ff88afb1b41ae" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "3ab5796ee15b6ec8d4ab1f4ab5a594fecb30e4b4" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "58cdd93e90b8c29bc7a211861711565dbeeb529a" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "b72e0371d1421a1decc9d57860f83eea8f790942" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "3cb7eaa07c745fd3aa2b3242780a7061bedac1de" - }, - { - "filename": 
"build/torch25-cxx11-cu121-x86_64-linux/moe/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "293adce387e066fce75b6e606d4b8b6a5aa10bdb" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/fp8.py", - "blob_id": "23bd7d6703104b0020671cc6ba6f78a6df37e4bf" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/fp8_utils.py", - "blob_id": "acb4f3e3bb1a34f209fdac9ecca8c123aaf67f12" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/fused_marlin_moe.py", - "blob_id": "b3e0a5c24599730faf973fad3cf3fb6031a30522" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/fused_moe.py", - "blob_id": "af2d798cbe5d7c3c1760ce79f717ab5f6d7700ba" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/platforms.py", - "blob_id": "735fab87f2add390f7bf6408ebe31d1f5de6d02b" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/scalar_type.py", - "blob_id": "ea749fe8247b6846620ccbba30ddf48d914ca4e1" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/utils/__init__.py", - "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/utils/marlin_utils.py", - "blob_id": "5037f774b8a8b7e88d822efacbb3b4ea5b95d356" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/utils/marlin_utils_test.py", - "blob_id": "83faac032ca93b3564c620c5b4b1ef63c74aaddf" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/moe/utils/quant_utils.py", - "blob_id": "5819ab753e57655185572ce1e49c24e6268171b4" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/__init__.py", - "blob_id": "cc806778863c03ccb3157343cd6331c1c6ca332c" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/_moe_ctvji3e7dq64w.abi3.so", - "blob_id": "db5c3be15bc329bc0aa5b87d34223b747751484e" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/_ops.py", - "blob_id": "b22bd5b27938464e6c7359b1974db9b472effa6b" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "56c1a4e3af0b4a93fff71028d8e04bf73f0abb29" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "d3677bebb82a7f3f19344ef6471626493cf2c5bb" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "265768fb900ccfe9612b4a0d25973e6618f22a79" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "d3be23dfc903ba61d3d4d79c0230952b24d2ead0" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "589f5d39f31418d5121e7cbb2e6f2894b0a7ed32" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", - "blob_id": "2c78bfaba7890772bf266721f5577202ea443882" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "4da841e74a79f9589fecac1fa557ea132d34805f" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": 
"200356713c0d0a76e199671c7ec8f10d0e5ee0ac" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "e076615ee541a5043556f630ecf0946c4e2c1408" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "ee896554b921040d7810bb6e9368cc200777951d" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "05aed8b1c81492151d128ef251afc510d8cc8ed5" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json", - "blob_id": "9262a74a4a0e1e3789f260a3ef7f6cb9551f3f2b" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "d251f9b5accaec977fc87a0999cd56ee387fc650" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "0ecf814a28a9441e89f892eb3d63dcf8dcb0dd97" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "51ad5b299eb22465fa80530d12bdd5d7a03ce398" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "ee5119182556cf49434c10e56cf04e3baeb26408" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "68793c77b33c4f4b97d0a4b780fcbe8043c799de" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "612910720ed9439e56c4af4c03f30fee224fac80" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "039a10ed127b77836a7f41c03513292613852b30" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "3793fcafee60bc7e8f5f12d601cb3192abfa9ca8" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "51d03d8607122d7b9bc20ba48d8432d62367fa00" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", - "blob_id": "26f9abd6b789e9dd0f83ec7721fd1bae8aa76bec" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "cd0cdbea0c3372674cb610870dd0b30325864549" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "64be6e6591422aa0f441c3747b6c49850929652e" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "0a6a6a73fa45e270f01ba7ebdc6d9d55bf9daad3" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "ba9041d008507e31ae4179ef2bc863a49c606582" - }, - { - "filename": 
"build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "7a7508aab04599cb06641c835d8b0a14f54d0716" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "dbf9a2dd6f048d8adee290961e2aea72035f7615" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", - "blob_id": "bbb2386046b1135a2cc7ab7cb26c1d0b039bcf3a" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "57055453aa24c831dad9ac8e37fdab707c63ef91" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "8cc6c643f236d2f7f9ad29354d9e469d00b20d3f" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "d4c9ddd12972ac0b5fd2be11a9cd1075906e3978" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "b2799ed3a866e25b78d60d92910c000ebb21ff71" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "b8d3be2313fa14025d8aeb2fd11e0d1ee997ffa6" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json", - "blob_id": "6a976788f9b10af19ebcfe582a69cbc627f9457b" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "3f3ccdafa88f3452a695efad4cb9622d6ae79e6a" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json", - "blob_id": "0a46390b2e31bba6a7c3ab2c9f6c8de6004857bb" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json", - "blob_id": "f4c0f8417b384870050a95e0cf57edbdf6352b23" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "5c8185cfdeec167ec4b88de51b4b395e28769cc5" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "97c9f4445b166657ad29f1db9fc8281f9c463ec4" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "0bb423b28f5ab3825929a4870b96393262a9dd9f" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "55571873395464a3b58f549523905f439a8f1716" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "26bcbf26970c7a77c99e2c8eacd83eefa86967bf" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json", - "blob_id": "91011e64c7de4505e9bb462bc70e6a3e7affa878" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json", - "blob_id": "b41f9d443e50678334f906b44fce6d018d69500e" - }, - { - 
"filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "edf2a38d12ad3f420f232d2cd61ab149ad138725" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "673bae2ba8ef80ed4d4930739ca7daf0e8f28ee1" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "b2100cebb7f589747430be9ca8c8db368c152d78" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json", - "blob_id": "d720deb4bdd73d194b1023c99e190b8fcfecdaef" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "dbc624731f5cb9afcdc9213183d00d1e5edd4a00" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "cc614e635ea57327c610ce79e99ae5339614f22e" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "32c0c9da471cbe479044095e0ed14a0f54b73620" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json", - "blob_id": "f807d4a5abaed9dd686df26837f2dd9f6161300f" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "f578c8d0160ac3ef85b53c8539d3675455a97173" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "918f6839620cbab1f30b0f9383a9129c2cf2cf3d" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "e341a67917d5177bacb3f6767e7b6d92539826ad" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "34b916e574f88c65db1dac5889d74a990dc25e9b" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "6496a38fba8ae09b3025a75f357815b9d6a5e3f4" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "3618053b65831b95c4bb0f20ef3b9aa816b2d637" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "46a982f5ee9a4bd67ce244b101c576efeeb53b78" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "035ec027fa56622196b24a03a5042ce010deaebf" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "8b49f2781cb54d19a2789767ebb7e8c3fb55b981" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "851bc9f9f0b50b41451b929eaa518869b6a05412" 
- }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "d1227c2157990216d2ca51c69ad0944017f53b6a" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "77ba0d7477bdbcb036a43263e7aaa6b6913f8f4e" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "1c61451fb34e52deec827f8f63c80fb15830c202" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "63e661c80de6a7b1422f7a994a2ee7a4b724911c" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "cf354037903c0d1fcd077c4647aabce026a723fb" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "0a5d7bfdba4852da9ed08d1bc27cd7d521d09965" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "eccb86a76df0d7302b760ab6d83a8ceb9fa9d0d9" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "cb91a279d423d0ca25197e0edd5e8c2f4da58720" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "88af48431d8b8791af8df03429704606b670f1f7" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "dd069726d7ed4dcbb449af243f4f4af21815f854" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "7febe3d272b4bb76500f7c6b523396129fd53680" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "56b939e52fac3ed53a4e0ba640c40010cb3af30a" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "63d9a0bf5d79ddaaad547d44338ad4b959ad72b1" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "7fa398c15a2a535401709b0f25e20f6e4b23e58e" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "f15d8f64c7090bd71d0091a524c65d7818fec38e" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9d7658bfc41b2c8fd4daf3fbdf62d15936d3d546" - }, - { - "filename": 
"build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "cd3e07804fdec10c2cfb291c1ede3ba67b753f9c" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9d5a329d7466a37c0ca68a65a089fbb99f9327a9" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "03dba5ad15ba5f7f49100a5c78e8685e64334b2a" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "96e1594a3eabbaedc792b84b07f05ae8752b7251" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "5ffd367df833d773355590220598a3c7eceba4e0" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9a5ff48b8942957dde9b862aed848390dd267948" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "eabc423949a24c2a1fb2368a73e5249caf8d07df" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "386928de139ce718f28222b9c1a6555df3958491" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "51e237b91b8e775a36bcf783c078c2c1cecbcbd2" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "6280219c9ee7d26f7e2fd3625dc92d847ddc7982" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "40c01c0b92b4b26fe480879dda33f18c5eb59a6d" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "c6fd3659799bc31e17f3577e7f0e8d7268faf1fb" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "160f12ed3f95a6967439ff53bc3e3a2cdc97c700" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "e5c4a1d2c94e5c7864f462e083ea5f530b8efe3f" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "2bf5eb27e38208871d50348b170c8c74b80fc519" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9c908e80406587da4d246ce4e3a8a98a14c875b1" - }, - { - "filename": 
"build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "0a1e14cffbb2a894a701352193947d272427db0d" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "15b1c93f60fc5068ba11b82b6d5924dd2024a824" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "f78e7060e6840ff721d306db556636b0bbc8d9b3" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "8ff12e64c172f5a5d0fbdf900728fe60b33877e2" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "4532f93681e2be175b1bf94f81bfde711821cd60" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "1d3ce5c94c2d9a4a1637204efb3b14f7a5579bdb" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "ca7f32b9552b479dc05495792b7e426db5eb1b56" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "5acea242cc0ad094cba8ee5f568ff88afb1b41ae" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "3ab5796ee15b6ec8d4ab1f4ab5a594fecb30e4b4" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "58cdd93e90b8c29bc7a211861711565dbeeb529a" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "b72e0371d1421a1decc9d57860f83eea8f790942" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "3cb7eaa07c745fd3aa2b3242780a7061bedac1de" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "293adce387e066fce75b6e606d4b8b6a5aa10bdb" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/fp8.py", - "blob_id": "23bd7d6703104b0020671cc6ba6f78a6df37e4bf" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/fp8_utils.py", - "blob_id": "acb4f3e3bb1a34f209fdac9ecca8c123aaf67f12" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/fused_marlin_moe.py", - "blob_id": "b3e0a5c24599730faf973fad3cf3fb6031a30522" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/fused_moe.py", - "blob_id": "af2d798cbe5d7c3c1760ce79f717ab5f6d7700ba" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/platforms.py", - "blob_id": "735fab87f2add390f7bf6408ebe31d1f5de6d02b" - }, - { - "filename": 
"build/torch25-cxx11-cu124-x86_64-linux/moe/scalar_type.py", - "blob_id": "ea749fe8247b6846620ccbba30ddf48d914ca4e1" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/utils/__init__.py", - "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/utils/marlin_utils.py", - "blob_id": "5037f774b8a8b7e88d822efacbb3b4ea5b95d356" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/utils/marlin_utils_test.py", - "blob_id": "83faac032ca93b3564c620c5b4b1ef63c74aaddf" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/moe/utils/quant_utils.py", - "blob_id": "5819ab753e57655185572ce1e49c24e6268171b4" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/__init__.py", - "blob_id": "cc806778863c03ccb3157343cd6331c1c6ca332c" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/_moe_2dhx2xkm6c5wu.abi3.so", - "blob_id": "56dc9c91ddc02b281dcf7c996071bee341ef026c" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/_ops.py", - "blob_id": "97f4f6344f4a61a52c8077cdc7884400e56c558b" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "56c1a4e3af0b4a93fff71028d8e04bf73f0abb29" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "d3677bebb82a7f3f19344ef6471626493cf2c5bb" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "265768fb900ccfe9612b4a0d25973e6618f22a79" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "d3be23dfc903ba61d3d4d79c0230952b24d2ead0" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "589f5d39f31418d5121e7cbb2e6f2894b0a7ed32" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", - "blob_id": "2c78bfaba7890772bf266721f5577202ea443882" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "4da841e74a79f9589fecac1fa557ea132d34805f" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "200356713c0d0a76e199671c7ec8f10d0e5ee0ac" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "e076615ee541a5043556f630ecf0946c4e2c1408" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "ee896554b921040d7810bb6e9368cc200777951d" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "05aed8b1c81492151d128ef251afc510d8cc8ed5" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json", - "blob_id": "9262a74a4a0e1e3789f260a3ef7f6cb9551f3f2b" - }, - { - "filename": 
"build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "d251f9b5accaec977fc87a0999cd56ee387fc650" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "0ecf814a28a9441e89f892eb3d63dcf8dcb0dd97" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "51ad5b299eb22465fa80530d12bdd5d7a03ce398" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "ee5119182556cf49434c10e56cf04e3baeb26408" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "68793c77b33c4f4b97d0a4b780fcbe8043c799de" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "612910720ed9439e56c4af4c03f30fee224fac80" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "039a10ed127b77836a7f41c03513292613852b30" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "3793fcafee60bc7e8f5f12d601cb3192abfa9ca8" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "51d03d8607122d7b9bc20ba48d8432d62367fa00" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", - "blob_id": "26f9abd6b789e9dd0f83ec7721fd1bae8aa76bec" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "cd0cdbea0c3372674cb610870dd0b30325864549" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "64be6e6591422aa0f441c3747b6c49850929652e" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "0a6a6a73fa45e270f01ba7ebdc6d9d55bf9daad3" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "ba9041d008507e31ae4179ef2bc863a49c606582" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "7a7508aab04599cb06641c835d8b0a14f54d0716" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "dbf9a2dd6f048d8adee290961e2aea72035f7615" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", - "blob_id": "bbb2386046b1135a2cc7ab7cb26c1d0b039bcf3a" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "57055453aa24c831dad9ac8e37fdab707c63ef91" - }, - { - "filename": 
"build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "8cc6c643f236d2f7f9ad29354d9e469d00b20d3f" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "d4c9ddd12972ac0b5fd2be11a9cd1075906e3978" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "b2799ed3a866e25b78d60d92910c000ebb21ff71" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "b8d3be2313fa14025d8aeb2fd11e0d1ee997ffa6" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json", - "blob_id": "6a976788f9b10af19ebcfe582a69cbc627f9457b" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "3f3ccdafa88f3452a695efad4cb9622d6ae79e6a" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json", - "blob_id": "0a46390b2e31bba6a7c3ab2c9f6c8de6004857bb" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json", - "blob_id": "f4c0f8417b384870050a95e0cf57edbdf6352b23" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "5c8185cfdeec167ec4b88de51b4b395e28769cc5" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "97c9f4445b166657ad29f1db9fc8281f9c463ec4" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "0bb423b28f5ab3825929a4870b96393262a9dd9f" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "55571873395464a3b58f549523905f439a8f1716" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "26bcbf26970c7a77c99e2c8eacd83eefa86967bf" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json", - "blob_id": "91011e64c7de4505e9bb462bc70e6a3e7affa878" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json", - "blob_id": "b41f9d443e50678334f906b44fce6d018d69500e" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "edf2a38d12ad3f420f232d2cd61ab149ad138725" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "673bae2ba8ef80ed4d4930739ca7daf0e8f28ee1" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "b2100cebb7f589747430be9ca8c8db368c152d78" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json", - "blob_id": "d720deb4bdd73d194b1023c99e190b8fcfecdaef" - }, - { - "filename": 
"build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "dbc624731f5cb9afcdc9213183d00d1e5edd4a00" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "cc614e635ea57327c610ce79e99ae5339614f22e" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "32c0c9da471cbe479044095e0ed14a0f54b73620" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json", - "blob_id": "f807d4a5abaed9dd686df26837f2dd9f6161300f" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "f578c8d0160ac3ef85b53c8539d3675455a97173" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "918f6839620cbab1f30b0f9383a9129c2cf2cf3d" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "e341a67917d5177bacb3f6767e7b6d92539826ad" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "34b916e574f88c65db1dac5889d74a990dc25e9b" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "6496a38fba8ae09b3025a75f357815b9d6a5e3f4" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "3618053b65831b95c4bb0f20ef3b9aa816b2d637" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "46a982f5ee9a4bd67ce244b101c576efeeb53b78" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "035ec027fa56622196b24a03a5042ce010deaebf" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "8b49f2781cb54d19a2789767ebb7e8c3fb55b981" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "851bc9f9f0b50b41451b929eaa518869b6a05412" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "d1227c2157990216d2ca51c69ad0944017f53b6a" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "77ba0d7477bdbcb036a43263e7aaa6b6913f8f4e" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "1c61451fb34e52deec827f8f63c80fb15830c202" - }, - { - "filename": 
"build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "63e661c80de6a7b1422f7a994a2ee7a4b724911c" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "cf354037903c0d1fcd077c4647aabce026a723fb" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "0a5d7bfdba4852da9ed08d1bc27cd7d521d09965" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "eccb86a76df0d7302b760ab6d83a8ceb9fa9d0d9" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "cb91a279d423d0ca25197e0edd5e8c2f4da58720" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "88af48431d8b8791af8df03429704606b670f1f7" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "dd069726d7ed4dcbb449af243f4f4af21815f854" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "7febe3d272b4bb76500f7c6b523396129fd53680" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "56b939e52fac3ed53a4e0ba640c40010cb3af30a" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "63d9a0bf5d79ddaaad547d44338ad4b959ad72b1" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "7fa398c15a2a535401709b0f25e20f6e4b23e58e" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "f15d8f64c7090bd71d0091a524c65d7818fec38e" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9d7658bfc41b2c8fd4daf3fbdf62d15936d3d546" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "cd3e07804fdec10c2cfb291c1ede3ba67b753f9c" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9d5a329d7466a37c0ca68a65a089fbb99f9327a9" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "03dba5ad15ba5f7f49100a5c78e8685e64334b2a" - }, - { - "filename": 
"build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "96e1594a3eabbaedc792b84b07f05ae8752b7251" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "5ffd367df833d773355590220598a3c7eceba4e0" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9a5ff48b8942957dde9b862aed848390dd267948" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "eabc423949a24c2a1fb2368a73e5249caf8d07df" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "386928de139ce718f28222b9c1a6555df3958491" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "51e237b91b8e775a36bcf783c078c2c1cecbcbd2" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "6280219c9ee7d26f7e2fd3625dc92d847ddc7982" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "40c01c0b92b4b26fe480879dda33f18c5eb59a6d" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "c6fd3659799bc31e17f3577e7f0e8d7268faf1fb" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "160f12ed3f95a6967439ff53bc3e3a2cdc97c700" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "e5c4a1d2c94e5c7864f462e083ea5f530b8efe3f" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "2bf5eb27e38208871d50348b170c8c74b80fc519" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9c908e80406587da4d246ce4e3a8a98a14c875b1" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "0a1e14cffbb2a894a701352193947d272427db0d" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "15b1c93f60fc5068ba11b82b6d5924dd2024a824" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "f78e7060e6840ff721d306db556636b0bbc8d9b3" - }, - { - "filename": 
"build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "8ff12e64c172f5a5d0fbdf900728fe60b33877e2" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "4532f93681e2be175b1bf94f81bfde711821cd60" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "1d3ce5c94c2d9a4a1637204efb3b14f7a5579bdb" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "ca7f32b9552b479dc05495792b7e426db5eb1b56" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "5acea242cc0ad094cba8ee5f568ff88afb1b41ae" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "3ab5796ee15b6ec8d4ab1f4ab5a594fecb30e4b4" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "58cdd93e90b8c29bc7a211861711565dbeeb529a" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "b72e0371d1421a1decc9d57860f83eea8f790942" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "3cb7eaa07c745fd3aa2b3242780a7061bedac1de" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "293adce387e066fce75b6e606d4b8b6a5aa10bdb" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/fp8.py", - "blob_id": "23bd7d6703104b0020671cc6ba6f78a6df37e4bf" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/fp8_utils.py", - "blob_id": "acb4f3e3bb1a34f209fdac9ecca8c123aaf67f12" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/fused_marlin_moe.py", - "blob_id": "b3e0a5c24599730faf973fad3cf3fb6031a30522" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/fused_moe.py", - "blob_id": "af2d798cbe5d7c3c1760ce79f717ab5f6d7700ba" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/platforms.py", - "blob_id": "735fab87f2add390f7bf6408ebe31d1f5de6d02b" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/scalar_type.py", - "blob_id": "ea749fe8247b6846620ccbba30ddf48d914ca4e1" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/utils/__init__.py", - "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/utils/marlin_utils.py", - "blob_id": "5037f774b8a8b7e88d822efacbb3b4ea5b95d356" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/utils/marlin_utils_test.py", - "blob_id": "83faac032ca93b3564c620c5b4b1ef63c74aaddf" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/moe/utils/quant_utils.py", - "blob_id": 
"5819ab753e57655185572ce1e49c24e6268171b4" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/__init__.py", - "blob_id": "cc806778863c03ccb3157343cd6331c1c6ca332c" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/_moe_wfwoejktckaue.abi3.so", - "blob_id": "a679d38667a74beffaca30bb9c6628c6b7d0b1c0" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/_ops.py", - "blob_id": "9aebf2be86d230b6de5510163c7b53dcc3aa7c51" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "56c1a4e3af0b4a93fff71028d8e04bf73f0abb29" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "d3677bebb82a7f3f19344ef6471626493cf2c5bb" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "265768fb900ccfe9612b4a0d25973e6618f22a79" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "d3be23dfc903ba61d3d4d79c0230952b24d2ead0" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "589f5d39f31418d5121e7cbb2e6f2894b0a7ed32" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", - "blob_id": "2c78bfaba7890772bf266721f5577202ea443882" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "4da841e74a79f9589fecac1fa557ea132d34805f" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "200356713c0d0a76e199671c7ec8f10d0e5ee0ac" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "e076615ee541a5043556f630ecf0946c4e2c1408" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "ee896554b921040d7810bb6e9368cc200777951d" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "05aed8b1c81492151d128ef251afc510d8cc8ed5" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json", - "blob_id": "9262a74a4a0e1e3789f260a3ef7f6cb9551f3f2b" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "d251f9b5accaec977fc87a0999cd56ee387fc650" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "0ecf814a28a9441e89f892eb3d63dcf8dcb0dd97" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "51ad5b299eb22465fa80530d12bdd5d7a03ce398" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "ee5119182556cf49434c10e56cf04e3baeb26408" - }, - { - "filename": 
"build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "68793c77b33c4f4b97d0a4b780fcbe8043c799de" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "612910720ed9439e56c4af4c03f30fee224fac80" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "039a10ed127b77836a7f41c03513292613852b30" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "3793fcafee60bc7e8f5f12d601cb3192abfa9ca8" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "51d03d8607122d7b9bc20ba48d8432d62367fa00" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", - "blob_id": "26f9abd6b789e9dd0f83ec7721fd1bae8aa76bec" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "cd0cdbea0c3372674cb610870dd0b30325864549" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "64be6e6591422aa0f441c3747b6c49850929652e" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "0a6a6a73fa45e270f01ba7ebdc6d9d55bf9daad3" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "ba9041d008507e31ae4179ef2bc863a49c606582" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "7a7508aab04599cb06641c835d8b0a14f54d0716" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "dbf9a2dd6f048d8adee290961e2aea72035f7615" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", - "blob_id": "bbb2386046b1135a2cc7ab7cb26c1d0b039bcf3a" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "57055453aa24c831dad9ac8e37fdab707c63ef91" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "8cc6c643f236d2f7f9ad29354d9e469d00b20d3f" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "d4c9ddd12972ac0b5fd2be11a9cd1075906e3978" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "b2799ed3a866e25b78d60d92910c000ebb21ff71" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "b8d3be2313fa14025d8aeb2fd11e0d1ee997ffa6" - }, - { - "filename": 
"build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json", - "blob_id": "6a976788f9b10af19ebcfe582a69cbc627f9457b" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "3f3ccdafa88f3452a695efad4cb9622d6ae79e6a" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json", - "blob_id": "0a46390b2e31bba6a7c3ab2c9f6c8de6004857bb" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json", - "blob_id": "f4c0f8417b384870050a95e0cf57edbdf6352b23" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "5c8185cfdeec167ec4b88de51b4b395e28769cc5" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "97c9f4445b166657ad29f1db9fc8281f9c463ec4" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "0bb423b28f5ab3825929a4870b96393262a9dd9f" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "55571873395464a3b58f549523905f439a8f1716" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "26bcbf26970c7a77c99e2c8eacd83eefa86967bf" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json", - "blob_id": "91011e64c7de4505e9bb462bc70e6a3e7affa878" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json", - "blob_id": "b41f9d443e50678334f906b44fce6d018d69500e" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "edf2a38d12ad3f420f232d2cd61ab149ad138725" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "673bae2ba8ef80ed4d4930739ca7daf0e8f28ee1" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "b2100cebb7f589747430be9ca8c8db368c152d78" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json", - "blob_id": "d720deb4bdd73d194b1023c99e190b8fcfecdaef" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "dbc624731f5cb9afcdc9213183d00d1e5edd4a00" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "cc614e635ea57327c610ce79e99ae5339614f22e" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "32c0c9da471cbe479044095e0ed14a0f54b73620" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json", - "blob_id": "f807d4a5abaed9dd686df26837f2dd9f6161300f" - }, - { - "filename": 
"build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "f578c8d0160ac3ef85b53c8539d3675455a97173" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "918f6839620cbab1f30b0f9383a9129c2cf2cf3d" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "e341a67917d5177bacb3f6767e7b6d92539826ad" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "34b916e574f88c65db1dac5889d74a990dc25e9b" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "6496a38fba8ae09b3025a75f357815b9d6a5e3f4" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "3618053b65831b95c4bb0f20ef3b9aa816b2d637" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "46a982f5ee9a4bd67ce244b101c576efeeb53b78" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "035ec027fa56622196b24a03a5042ce010deaebf" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "8b49f2781cb54d19a2789767ebb7e8c3fb55b981" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "851bc9f9f0b50b41451b929eaa518869b6a05412" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "d1227c2157990216d2ca51c69ad0944017f53b6a" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "77ba0d7477bdbcb036a43263e7aaa6b6913f8f4e" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "1c61451fb34e52deec827f8f63c80fb15830c202" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "63e661c80de6a7b1422f7a994a2ee7a4b724911c" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "cf354037903c0d1fcd077c4647aabce026a723fb" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "0a5d7bfdba4852da9ed08d1bc27cd7d521d09965" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": 
"eccb86a76df0d7302b760ab6d83a8ceb9fa9d0d9" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "cb91a279d423d0ca25197e0edd5e8c2f4da58720" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "88af48431d8b8791af8df03429704606b670f1f7" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "dd069726d7ed4dcbb449af243f4f4af21815f854" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "7febe3d272b4bb76500f7c6b523396129fd53680" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "56b939e52fac3ed53a4e0ba640c40010cb3af30a" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "63d9a0bf5d79ddaaad547d44338ad4b959ad72b1" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "7fa398c15a2a535401709b0f25e20f6e4b23e58e" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "f15d8f64c7090bd71d0091a524c65d7818fec38e" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9d7658bfc41b2c8fd4daf3fbdf62d15936d3d546" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "cd3e07804fdec10c2cfb291c1ede3ba67b753f9c" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9d5a329d7466a37c0ca68a65a089fbb99f9327a9" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "03dba5ad15ba5f7f49100a5c78e8685e64334b2a" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "96e1594a3eabbaedc792b84b07f05ae8752b7251" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "5ffd367df833d773355590220598a3c7eceba4e0" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9a5ff48b8942957dde9b862aed848390dd267948" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "eabc423949a24c2a1fb2368a73e5249caf8d07df" - }, - { - "filename": 
"build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "386928de139ce718f28222b9c1a6555df3958491" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "51e237b91b8e775a36bcf783c078c2c1cecbcbd2" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "6280219c9ee7d26f7e2fd3625dc92d847ddc7982" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "40c01c0b92b4b26fe480879dda33f18c5eb59a6d" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "c6fd3659799bc31e17f3577e7f0e8d7268faf1fb" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "160f12ed3f95a6967439ff53bc3e3a2cdc97c700" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "e5c4a1d2c94e5c7864f462e083ea5f530b8efe3f" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "2bf5eb27e38208871d50348b170c8c74b80fc519" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9c908e80406587da4d246ce4e3a8a98a14c875b1" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "0a1e14cffbb2a894a701352193947d272427db0d" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "15b1c93f60fc5068ba11b82b6d5924dd2024a824" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "f78e7060e6840ff721d306db556636b0bbc8d9b3" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "8ff12e64c172f5a5d0fbdf900728fe60b33877e2" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "4532f93681e2be175b1bf94f81bfde711821cd60" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "1d3ce5c94c2d9a4a1637204efb3b14f7a5579bdb" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "ca7f32b9552b479dc05495792b7e426db5eb1b56" - }, - { - "filename": 
"build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "5acea242cc0ad094cba8ee5f568ff88afb1b41ae" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "3ab5796ee15b6ec8d4ab1f4ab5a594fecb30e4b4" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "58cdd93e90b8c29bc7a211861711565dbeeb529a" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "b72e0371d1421a1decc9d57860f83eea8f790942" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "3cb7eaa07c745fd3aa2b3242780a7061bedac1de" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "293adce387e066fce75b6e606d4b8b6a5aa10bdb" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/fp8.py", - "blob_id": "23bd7d6703104b0020671cc6ba6f78a6df37e4bf" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/fp8_utils.py", - "blob_id": "acb4f3e3bb1a34f209fdac9ecca8c123aaf67f12" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/fused_marlin_moe.py", - "blob_id": "b3e0a5c24599730faf973fad3cf3fb6031a30522" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/fused_moe.py", - "blob_id": "af2d798cbe5d7c3c1760ce79f717ab5f6d7700ba" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/platforms.py", - "blob_id": "735fab87f2add390f7bf6408ebe31d1f5de6d02b" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/scalar_type.py", - "blob_id": "ea749fe8247b6846620ccbba30ddf48d914ca4e1" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/utils/__init__.py", - "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/utils/marlin_utils.py", - "blob_id": "5037f774b8a8b7e88d822efacbb3b4ea5b95d356" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/utils/marlin_utils_test.py", - "blob_id": "83faac032ca93b3564c620c5b4b1ef63c74aaddf" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/moe/utils/quant_utils.py", - "blob_id": "5819ab753e57655185572ce1e49c24e6268171b4" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/__init__.py", - "blob_id": "cc806778863c03ccb3157343cd6331c1c6ca332c" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/_moe_plhnk6yxrdq3c.abi3.so", - "blob_id": "a16cd30c7ff53b3d73fa081369b6443efa5fb184" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/_ops.py", - "blob_id": "5b5d43b13c586ead5f177bcb71ba17c078eb016d" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "56c1a4e3af0b4a93fff71028d8e04bf73f0abb29" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "d3677bebb82a7f3f19344ef6471626493cf2c5bb" - }, - { - "filename": 
"build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "265768fb900ccfe9612b4a0d25973e6618f22a79" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "d3be23dfc903ba61d3d4d79c0230952b24d2ead0" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "589f5d39f31418d5121e7cbb2e6f2894b0a7ed32" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", - "blob_id": "2c78bfaba7890772bf266721f5577202ea443882" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "4da841e74a79f9589fecac1fa557ea132d34805f" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "200356713c0d0a76e199671c7ec8f10d0e5ee0ac" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "e076615ee541a5043556f630ecf0946c4e2c1408" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "ee896554b921040d7810bb6e9368cc200777951d" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "05aed8b1c81492151d128ef251afc510d8cc8ed5" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json", - "blob_id": "9262a74a4a0e1e3789f260a3ef7f6cb9551f3f2b" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "d251f9b5accaec977fc87a0999cd56ee387fc650" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "0ecf814a28a9441e89f892eb3d63dcf8dcb0dd97" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "51ad5b299eb22465fa80530d12bdd5d7a03ce398" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "ee5119182556cf49434c10e56cf04e3baeb26408" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "68793c77b33c4f4b97d0a4b780fcbe8043c799de" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "612910720ed9439e56c4af4c03f30fee224fac80" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "039a10ed127b77836a7f41c03513292613852b30" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "3793fcafee60bc7e8f5f12d601cb3192abfa9ca8" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - 
"blob_id": "51d03d8607122d7b9bc20ba48d8432d62367fa00" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", - "blob_id": "26f9abd6b789e9dd0f83ec7721fd1bae8aa76bec" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "cd0cdbea0c3372674cb610870dd0b30325864549" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "64be6e6591422aa0f441c3747b6c49850929652e" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "0a6a6a73fa45e270f01ba7ebdc6d9d55bf9daad3" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "ba9041d008507e31ae4179ef2bc863a49c606582" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "7a7508aab04599cb06641c835d8b0a14f54d0716" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "dbf9a2dd6f048d8adee290961e2aea72035f7615" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", - "blob_id": "bbb2386046b1135a2cc7ab7cb26c1d0b039bcf3a" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "57055453aa24c831dad9ac8e37fdab707c63ef91" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "8cc6c643f236d2f7f9ad29354d9e469d00b20d3f" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "d4c9ddd12972ac0b5fd2be11a9cd1075906e3978" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "b2799ed3a866e25b78d60d92910c000ebb21ff71" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "b8d3be2313fa14025d8aeb2fd11e0d1ee997ffa6" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json", - "blob_id": "6a976788f9b10af19ebcfe582a69cbc627f9457b" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "3f3ccdafa88f3452a695efad4cb9622d6ae79e6a" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json", - "blob_id": "0a46390b2e31bba6a7c3ab2c9f6c8de6004857bb" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json", - "blob_id": "f4c0f8417b384870050a95e0cf57edbdf6352b23" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "5c8185cfdeec167ec4b88de51b4b395e28769cc5" - }, - { - "filename": 
"build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "97c9f4445b166657ad29f1db9fc8281f9c463ec4" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "0bb423b28f5ab3825929a4870b96393262a9dd9f" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "55571873395464a3b58f549523905f439a8f1716" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "26bcbf26970c7a77c99e2c8eacd83eefa86967bf" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json", - "blob_id": "91011e64c7de4505e9bb462bc70e6a3e7affa878" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json", - "blob_id": "b41f9d443e50678334f906b44fce6d018d69500e" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "edf2a38d12ad3f420f232d2cd61ab149ad138725" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "673bae2ba8ef80ed4d4930739ca7daf0e8f28ee1" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "b2100cebb7f589747430be9ca8c8db368c152d78" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json", - "blob_id": "d720deb4bdd73d194b1023c99e190b8fcfecdaef" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "dbc624731f5cb9afcdc9213183d00d1e5edd4a00" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "cc614e635ea57327c610ce79e99ae5339614f22e" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "32c0c9da471cbe479044095e0ed14a0f54b73620" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json", - "blob_id": "f807d4a5abaed9dd686df26837f2dd9f6161300f" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "f578c8d0160ac3ef85b53c8539d3675455a97173" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "918f6839620cbab1f30b0f9383a9129c2cf2cf3d" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "e341a67917d5177bacb3f6767e7b6d92539826ad" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "34b916e574f88c65db1dac5889d74a990dc25e9b" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": 
"6496a38fba8ae09b3025a75f357815b9d6a5e3f4" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "3618053b65831b95c4bb0f20ef3b9aa816b2d637" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "46a982f5ee9a4bd67ce244b101c576efeeb53b78" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "035ec027fa56622196b24a03a5042ce010deaebf" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "8b49f2781cb54d19a2789767ebb7e8c3fb55b981" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "851bc9f9f0b50b41451b929eaa518869b6a05412" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "d1227c2157990216d2ca51c69ad0944017f53b6a" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "77ba0d7477bdbcb036a43263e7aaa6b6913f8f4e" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "1c61451fb34e52deec827f8f63c80fb15830c202" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "63e661c80de6a7b1422f7a994a2ee7a4b724911c" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "cf354037903c0d1fcd077c4647aabce026a723fb" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "0a5d7bfdba4852da9ed08d1bc27cd7d521d09965" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "eccb86a76df0d7302b760ab6d83a8ceb9fa9d0d9" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "cb91a279d423d0ca25197e0edd5e8c2f4da58720" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "88af48431d8b8791af8df03429704606b670f1f7" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "dd069726d7ed4dcbb449af243f4f4af21815f854" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "7febe3d272b4bb76500f7c6b523396129fd53680" - }, - { - "filename": 
"build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "56b939e52fac3ed53a4e0ba640c40010cb3af30a" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "63d9a0bf5d79ddaaad547d44338ad4b959ad72b1" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "7fa398c15a2a535401709b0f25e20f6e4b23e58e" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "f15d8f64c7090bd71d0091a524c65d7818fec38e" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9d7658bfc41b2c8fd4daf3fbdf62d15936d3d546" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "cd3e07804fdec10c2cfb291c1ede3ba67b753f9c" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9d5a329d7466a37c0ca68a65a089fbb99f9327a9" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "03dba5ad15ba5f7f49100a5c78e8685e64334b2a" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "96e1594a3eabbaedc792b84b07f05ae8752b7251" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "5ffd367df833d773355590220598a3c7eceba4e0" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9a5ff48b8942957dde9b862aed848390dd267948" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "eabc423949a24c2a1fb2368a73e5249caf8d07df" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "386928de139ce718f28222b9c1a6555df3958491" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "51e237b91b8e775a36bcf783c078c2c1cecbcbd2" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "6280219c9ee7d26f7e2fd3625dc92d847ddc7982" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "40c01c0b92b4b26fe480879dda33f18c5eb59a6d" - }, - { - "filename": 
"build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "c6fd3659799bc31e17f3577e7f0e8d7268faf1fb" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "160f12ed3f95a6967439ff53bc3e3a2cdc97c700" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "e5c4a1d2c94e5c7864f462e083ea5f530b8efe3f" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "2bf5eb27e38208871d50348b170c8c74b80fc519" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9c908e80406587da4d246ce4e3a8a98a14c875b1" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "0a1e14cffbb2a894a701352193947d272427db0d" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "15b1c93f60fc5068ba11b82b6d5924dd2024a824" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "f78e7060e6840ff721d306db556636b0bbc8d9b3" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "8ff12e64c172f5a5d0fbdf900728fe60b33877e2" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "4532f93681e2be175b1bf94f81bfde711821cd60" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "1d3ce5c94c2d9a4a1637204efb3b14f7a5579bdb" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "ca7f32b9552b479dc05495792b7e426db5eb1b56" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "5acea242cc0ad094cba8ee5f568ff88afb1b41ae" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "3ab5796ee15b6ec8d4ab1f4ab5a594fecb30e4b4" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "58cdd93e90b8c29bc7a211861711565dbeeb529a" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "b72e0371d1421a1decc9d57860f83eea8f790942" - }, - { - "filename": 
"build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "3cb7eaa07c745fd3aa2b3242780a7061bedac1de" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "293adce387e066fce75b6e606d4b8b6a5aa10bdb" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/fp8.py", - "blob_id": "23bd7d6703104b0020671cc6ba6f78a6df37e4bf" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/fp8_utils.py", - "blob_id": "acb4f3e3bb1a34f209fdac9ecca8c123aaf67f12" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/fused_marlin_moe.py", - "blob_id": "b3e0a5c24599730faf973fad3cf3fb6031a30522" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/fused_moe.py", - "blob_id": "af2d798cbe5d7c3c1760ce79f717ab5f6d7700ba" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/platforms.py", - "blob_id": "735fab87f2add390f7bf6408ebe31d1f5de6d02b" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/scalar_type.py", - "blob_id": "ea749fe8247b6846620ccbba30ddf48d914ca4e1" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/utils/__init__.py", - "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/utils/marlin_utils.py", - "blob_id": "5037f774b8a8b7e88d822efacbb3b4ea5b95d356" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/utils/marlin_utils_test.py", - "blob_id": "83faac032ca93b3564c620c5b4b1ef63c74aaddf" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/moe/utils/quant_utils.py", - "blob_id": "5819ab753e57655185572ce1e49c24e6268171b4" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/__init__.py", - "blob_id": "cc806778863c03ccb3157343cd6331c1c6ca332c" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/_moe_mrhezivmofzdg.abi3.so", - "blob_id": "5423719c9ac75f96528fc0b7386a108aedc996b1" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/_ops.py", - "blob_id": "f78c4d78eceaa86a9d245eea4d5562167db8f59b" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "56c1a4e3af0b4a93fff71028d8e04bf73f0abb29" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "d3677bebb82a7f3f19344ef6471626493cf2c5bb" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "265768fb900ccfe9612b4a0d25973e6618f22a79" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "d3be23dfc903ba61d3d4d79c0230952b24d2ead0" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "589f5d39f31418d5121e7cbb2e6f2894b0a7ed32" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", - "blob_id": "2c78bfaba7890772bf266721f5577202ea443882" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": 
"4da841e74a79f9589fecac1fa557ea132d34805f" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "200356713c0d0a76e199671c7ec8f10d0e5ee0ac" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "e076615ee541a5043556f630ecf0946c4e2c1408" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "ee896554b921040d7810bb6e9368cc200777951d" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "05aed8b1c81492151d128ef251afc510d8cc8ed5" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json", - "blob_id": "9262a74a4a0e1e3789f260a3ef7f6cb9551f3f2b" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "d251f9b5accaec977fc87a0999cd56ee387fc650" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "0ecf814a28a9441e89f892eb3d63dcf8dcb0dd97" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "51ad5b299eb22465fa80530d12bdd5d7a03ce398" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "ee5119182556cf49434c10e56cf04e3baeb26408" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "68793c77b33c4f4b97d0a4b780fcbe8043c799de" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "612910720ed9439e56c4af4c03f30fee224fac80" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "039a10ed127b77836a7f41c03513292613852b30" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "3793fcafee60bc7e8f5f12d601cb3192abfa9ca8" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "51d03d8607122d7b9bc20ba48d8432d62367fa00" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", - "blob_id": "26f9abd6b789e9dd0f83ec7721fd1bae8aa76bec" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "cd0cdbea0c3372674cb610870dd0b30325864549" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "64be6e6591422aa0f441c3747b6c49850929652e" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "0a6a6a73fa45e270f01ba7ebdc6d9d55bf9daad3" - }, - { - "filename": 
"build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "ba9041d008507e31ae4179ef2bc863a49c606582" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "7a7508aab04599cb06641c835d8b0a14f54d0716" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "dbf9a2dd6f048d8adee290961e2aea72035f7615" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", - "blob_id": "bbb2386046b1135a2cc7ab7cb26c1d0b039bcf3a" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "57055453aa24c831dad9ac8e37fdab707c63ef91" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "8cc6c643f236d2f7f9ad29354d9e469d00b20d3f" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "d4c9ddd12972ac0b5fd2be11a9cd1075906e3978" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "b2799ed3a866e25b78d60d92910c000ebb21ff71" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "b8d3be2313fa14025d8aeb2fd11e0d1ee997ffa6" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json", - "blob_id": "6a976788f9b10af19ebcfe582a69cbc627f9457b" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "3f3ccdafa88f3452a695efad4cb9622d6ae79e6a" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json", - "blob_id": "0a46390b2e31bba6a7c3ab2c9f6c8de6004857bb" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json", - "blob_id": "f4c0f8417b384870050a95e0cf57edbdf6352b23" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "5c8185cfdeec167ec4b88de51b4b395e28769cc5" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "97c9f4445b166657ad29f1db9fc8281f9c463ec4" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "0bb423b28f5ab3825929a4870b96393262a9dd9f" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "55571873395464a3b58f549523905f439a8f1716" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "26bcbf26970c7a77c99e2c8eacd83eefa86967bf" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json", - "blob_id": "91011e64c7de4505e9bb462bc70e6a3e7affa878" - }, 
- { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json", - "blob_id": "b41f9d443e50678334f906b44fce6d018d69500e" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "edf2a38d12ad3f420f232d2cd61ab149ad138725" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "673bae2ba8ef80ed4d4930739ca7daf0e8f28ee1" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "b2100cebb7f589747430be9ca8c8db368c152d78" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json", - "blob_id": "d720deb4bdd73d194b1023c99e190b8fcfecdaef" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "dbc624731f5cb9afcdc9213183d00d1e5edd4a00" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "cc614e635ea57327c610ce79e99ae5339614f22e" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "32c0c9da471cbe479044095e0ed14a0f54b73620" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json", - "blob_id": "f807d4a5abaed9dd686df26837f2dd9f6161300f" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "f578c8d0160ac3ef85b53c8539d3675455a97173" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "918f6839620cbab1f30b0f9383a9129c2cf2cf3d" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "e341a67917d5177bacb3f6767e7b6d92539826ad" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "34b916e574f88c65db1dac5889d74a990dc25e9b" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "6496a38fba8ae09b3025a75f357815b9d6a5e3f4" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "3618053b65831b95c4bb0f20ef3b9aa816b2d637" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "46a982f5ee9a4bd67ce244b101c576efeeb53b78" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "035ec027fa56622196b24a03a5042ce010deaebf" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "8b49f2781cb54d19a2789767ebb7e8c3fb55b981" - }, - { - "filename": 
"build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "851bc9f9f0b50b41451b929eaa518869b6a05412" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "d1227c2157990216d2ca51c69ad0944017f53b6a" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "77ba0d7477bdbcb036a43263e7aaa6b6913f8f4e" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "1c61451fb34e52deec827f8f63c80fb15830c202" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "63e661c80de6a7b1422f7a994a2ee7a4b724911c" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "cf354037903c0d1fcd077c4647aabce026a723fb" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "0a5d7bfdba4852da9ed08d1bc27cd7d521d09965" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "eccb86a76df0d7302b760ab6d83a8ceb9fa9d0d9" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "cb91a279d423d0ca25197e0edd5e8c2f4da58720" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "88af48431d8b8791af8df03429704606b670f1f7" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "dd069726d7ed4dcbb449af243f4f4af21815f854" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "7febe3d272b4bb76500f7c6b523396129fd53680" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "56b939e52fac3ed53a4e0ba640c40010cb3af30a" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "63d9a0bf5d79ddaaad547d44338ad4b959ad72b1" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "7fa398c15a2a535401709b0f25e20f6e4b23e58e" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "f15d8f64c7090bd71d0091a524c65d7818fec38e" - }, - { - "filename": 
"build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9d7658bfc41b2c8fd4daf3fbdf62d15936d3d546" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "cd3e07804fdec10c2cfb291c1ede3ba67b753f9c" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9d5a329d7466a37c0ca68a65a089fbb99f9327a9" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "03dba5ad15ba5f7f49100a5c78e8685e64334b2a" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "96e1594a3eabbaedc792b84b07f05ae8752b7251" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "5ffd367df833d773355590220598a3c7eceba4e0" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9a5ff48b8942957dde9b862aed848390dd267948" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "eabc423949a24c2a1fb2368a73e5249caf8d07df" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "386928de139ce718f28222b9c1a6555df3958491" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "51e237b91b8e775a36bcf783c078c2c1cecbcbd2" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "6280219c9ee7d26f7e2fd3625dc92d847ddc7982" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "40c01c0b92b4b26fe480879dda33f18c5eb59a6d" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "c6fd3659799bc31e17f3577e7f0e8d7268faf1fb" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "160f12ed3f95a6967439ff53bc3e3a2cdc97c700" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "e5c4a1d2c94e5c7864f462e083ea5f530b8efe3f" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "2bf5eb27e38208871d50348b170c8c74b80fc519" - }, - { - "filename": 
"build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9c908e80406587da4d246ce4e3a8a98a14c875b1" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "0a1e14cffbb2a894a701352193947d272427db0d" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "15b1c93f60fc5068ba11b82b6d5924dd2024a824" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "f78e7060e6840ff721d306db556636b0bbc8d9b3" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "8ff12e64c172f5a5d0fbdf900728fe60b33877e2" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "4532f93681e2be175b1bf94f81bfde711821cd60" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "1d3ce5c94c2d9a4a1637204efb3b14f7a5579bdb" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "ca7f32b9552b479dc05495792b7e426db5eb1b56" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "5acea242cc0ad094cba8ee5f568ff88afb1b41ae" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "3ab5796ee15b6ec8d4ab1f4ab5a594fecb30e4b4" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "58cdd93e90b8c29bc7a211861711565dbeeb529a" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "b72e0371d1421a1decc9d57860f83eea8f790942" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "3cb7eaa07c745fd3aa2b3242780a7061bedac1de" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "293adce387e066fce75b6e606d4b8b6a5aa10bdb" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/fp8.py", - "blob_id": "23bd7d6703104b0020671cc6ba6f78a6df37e4bf" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/fp8_utils.py", - "blob_id": "acb4f3e3bb1a34f209fdac9ecca8c123aaf67f12" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/fused_marlin_moe.py", - "blob_id": "b3e0a5c24599730faf973fad3cf3fb6031a30522" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/fused_moe.py", - "blob_id": 
"af2d798cbe5d7c3c1760ce79f717ab5f6d7700ba" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/platforms.py", - "blob_id": "735fab87f2add390f7bf6408ebe31d1f5de6d02b" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/scalar_type.py", - "blob_id": "ea749fe8247b6846620ccbba30ddf48d914ca4e1" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/utils/__init__.py", - "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/utils/marlin_utils.py", - "blob_id": "5037f774b8a8b7e88d822efacbb3b4ea5b95d356" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/utils/marlin_utils_test.py", - "blob_id": "83faac032ca93b3564c620c5b4b1ef63c74aaddf" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/moe/utils/quant_utils.py", - "blob_id": "5819ab753e57655185572ce1e49c24e6268171b4" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/__init__.py", - "blob_id": "cc806778863c03ccb3157343cd6331c1c6ca332c" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/_moe_lvqy7x44edhqo.abi3.so", - "blob_id": "51f9c5792c1d7bcb03a8906e9bc60e779ba1b343" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/_ops.py", - "blob_id": "510c892dcc2479877c6c2fc5c20f6a534dc90d51" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "56c1a4e3af0b4a93fff71028d8e04bf73f0abb29" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "d3677bebb82a7f3f19344ef6471626493cf2c5bb" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "265768fb900ccfe9612b4a0d25973e6618f22a79" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "d3be23dfc903ba61d3d4d79c0230952b24d2ead0" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "589f5d39f31418d5121e7cbb2e6f2894b0a7ed32" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", - "blob_id": "2c78bfaba7890772bf266721f5577202ea443882" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "4da841e74a79f9589fecac1fa557ea132d34805f" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "200356713c0d0a76e199671c7ec8f10d0e5ee0ac" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "e076615ee541a5043556f630ecf0946c4e2c1408" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "ee896554b921040d7810bb6e9368cc200777951d" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "05aed8b1c81492151d128ef251afc510d8cc8ed5" - }, - { - "filename": 
"build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json", - "blob_id": "9262a74a4a0e1e3789f260a3ef7f6cb9551f3f2b" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "d251f9b5accaec977fc87a0999cd56ee387fc650" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "0ecf814a28a9441e89f892eb3d63dcf8dcb0dd97" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "51ad5b299eb22465fa80530d12bdd5d7a03ce398" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "ee5119182556cf49434c10e56cf04e3baeb26408" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "68793c77b33c4f4b97d0a4b780fcbe8043c799de" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "612910720ed9439e56c4af4c03f30fee224fac80" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "039a10ed127b77836a7f41c03513292613852b30" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "3793fcafee60bc7e8f5f12d601cb3192abfa9ca8" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "51d03d8607122d7b9bc20ba48d8432d62367fa00" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", - "blob_id": "26f9abd6b789e9dd0f83ec7721fd1bae8aa76bec" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "cd0cdbea0c3372674cb610870dd0b30325864549" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "64be6e6591422aa0f441c3747b6c49850929652e" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "0a6a6a73fa45e270f01ba7ebdc6d9d55bf9daad3" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "ba9041d008507e31ae4179ef2bc863a49c606582" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "7a7508aab04599cb06641c835d8b0a14f54d0716" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "dbf9a2dd6f048d8adee290961e2aea72035f7615" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", - "blob_id": "bbb2386046b1135a2cc7ab7cb26c1d0b039bcf3a" - }, - { - "filename": 
"build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "57055453aa24c831dad9ac8e37fdab707c63ef91" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "8cc6c643f236d2f7f9ad29354d9e469d00b20d3f" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "d4c9ddd12972ac0b5fd2be11a9cd1075906e3978" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "b2799ed3a866e25b78d60d92910c000ebb21ff71" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "b8d3be2313fa14025d8aeb2fd11e0d1ee997ffa6" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json", - "blob_id": "6a976788f9b10af19ebcfe582a69cbc627f9457b" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "3f3ccdafa88f3452a695efad4cb9622d6ae79e6a" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json", - "blob_id": "0a46390b2e31bba6a7c3ab2c9f6c8de6004857bb" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json", - "blob_id": "f4c0f8417b384870050a95e0cf57edbdf6352b23" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "5c8185cfdeec167ec4b88de51b4b395e28769cc5" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "97c9f4445b166657ad29f1db9fc8281f9c463ec4" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "0bb423b28f5ab3825929a4870b96393262a9dd9f" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "55571873395464a3b58f549523905f439a8f1716" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "26bcbf26970c7a77c99e2c8eacd83eefa86967bf" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json", - "blob_id": "91011e64c7de4505e9bb462bc70e6a3e7affa878" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json", - "blob_id": "b41f9d443e50678334f906b44fce6d018d69500e" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "edf2a38d12ad3f420f232d2cd61ab149ad138725" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "673bae2ba8ef80ed4d4930739ca7daf0e8f28ee1" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "b2100cebb7f589747430be9ca8c8db368c152d78" - }, - { - "filename": 
"build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json", - "blob_id": "d720deb4bdd73d194b1023c99e190b8fcfecdaef" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "dbc624731f5cb9afcdc9213183d00d1e5edd4a00" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "cc614e635ea57327c610ce79e99ae5339614f22e" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "32c0c9da471cbe479044095e0ed14a0f54b73620" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json", - "blob_id": "f807d4a5abaed9dd686df26837f2dd9f6161300f" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "f578c8d0160ac3ef85b53c8539d3675455a97173" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "918f6839620cbab1f30b0f9383a9129c2cf2cf3d" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "e341a67917d5177bacb3f6767e7b6d92539826ad" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "34b916e574f88c65db1dac5889d74a990dc25e9b" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "6496a38fba8ae09b3025a75f357815b9d6a5e3f4" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "3618053b65831b95c4bb0f20ef3b9aa816b2d637" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "46a982f5ee9a4bd67ce244b101c576efeeb53b78" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "035ec027fa56622196b24a03a5042ce010deaebf" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "8b49f2781cb54d19a2789767ebb7e8c3fb55b981" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "851bc9f9f0b50b41451b929eaa518869b6a05412" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "d1227c2157990216d2ca51c69ad0944017f53b6a" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "77ba0d7477bdbcb036a43263e7aaa6b6913f8f4e" - }, - { - "filename": 
"build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "1c61451fb34e52deec827f8f63c80fb15830c202" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "63e661c80de6a7b1422f7a994a2ee7a4b724911c" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "cf354037903c0d1fcd077c4647aabce026a723fb" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "0a5d7bfdba4852da9ed08d1bc27cd7d521d09965" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "eccb86a76df0d7302b760ab6d83a8ceb9fa9d0d9" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "cb91a279d423d0ca25197e0edd5e8c2f4da58720" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "88af48431d8b8791af8df03429704606b670f1f7" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "dd069726d7ed4dcbb449af243f4f4af21815f854" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "7febe3d272b4bb76500f7c6b523396129fd53680" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "56b939e52fac3ed53a4e0ba640c40010cb3af30a" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "63d9a0bf5d79ddaaad547d44338ad4b959ad72b1" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "7fa398c15a2a535401709b0f25e20f6e4b23e58e" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "f15d8f64c7090bd71d0091a524c65d7818fec38e" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9d7658bfc41b2c8fd4daf3fbdf62d15936d3d546" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "cd3e07804fdec10c2cfb291c1ede3ba67b753f9c" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9d5a329d7466a37c0ca68a65a089fbb99f9327a9" - }, - { - "filename": 
"build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "03dba5ad15ba5f7f49100a5c78e8685e64334b2a" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "96e1594a3eabbaedc792b84b07f05ae8752b7251" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "5ffd367df833d773355590220598a3c7eceba4e0" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9a5ff48b8942957dde9b862aed848390dd267948" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "eabc423949a24c2a1fb2368a73e5249caf8d07df" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "386928de139ce718f28222b9c1a6555df3958491" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "51e237b91b8e775a36bcf783c078c2c1cecbcbd2" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "6280219c9ee7d26f7e2fd3625dc92d847ddc7982" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "40c01c0b92b4b26fe480879dda33f18c5eb59a6d" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "c6fd3659799bc31e17f3577e7f0e8d7268faf1fb" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "160f12ed3f95a6967439ff53bc3e3a2cdc97c700" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "e5c4a1d2c94e5c7864f462e083ea5f530b8efe3f" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "2bf5eb27e38208871d50348b170c8c74b80fc519" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9c908e80406587da4d246ce4e3a8a98a14c875b1" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "0a1e14cffbb2a894a701352193947d272427db0d" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "15b1c93f60fc5068ba11b82b6d5924dd2024a824" - }, - { - "filename": 
"build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "f78e7060e6840ff721d306db556636b0bbc8d9b3" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "8ff12e64c172f5a5d0fbdf900728fe60b33877e2" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "4532f93681e2be175b1bf94f81bfde711821cd60" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "1d3ce5c94c2d9a4a1637204efb3b14f7a5579bdb" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "ca7f32b9552b479dc05495792b7e426db5eb1b56" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "5acea242cc0ad094cba8ee5f568ff88afb1b41ae" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "3ab5796ee15b6ec8d4ab1f4ab5a594fecb30e4b4" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "58cdd93e90b8c29bc7a211861711565dbeeb529a" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "b72e0371d1421a1decc9d57860f83eea8f790942" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "3cb7eaa07c745fd3aa2b3242780a7061bedac1de" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "293adce387e066fce75b6e606d4b8b6a5aa10bdb" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/fp8.py", - "blob_id": "23bd7d6703104b0020671cc6ba6f78a6df37e4bf" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/fp8_utils.py", - "blob_id": "acb4f3e3bb1a34f209fdac9ecca8c123aaf67f12" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/fused_marlin_moe.py", - "blob_id": "b3e0a5c24599730faf973fad3cf3fb6031a30522" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/fused_moe.py", - "blob_id": "af2d798cbe5d7c3c1760ce79f717ab5f6d7700ba" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/platforms.py", - "blob_id": "735fab87f2add390f7bf6408ebe31d1f5de6d02b" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/scalar_type.py", - "blob_id": "ea749fe8247b6846620ccbba30ddf48d914ca4e1" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/utils/__init__.py", - "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/utils/marlin_utils.py", - "blob_id": "5037f774b8a8b7e88d822efacbb3b4ea5b95d356" - }, - { - "filename": 
"build/torch26-cxx11-cu124-x86_64-linux/moe/utils/marlin_utils_test.py", - "blob_id": "83faac032ca93b3564c620c5b4b1ef63c74aaddf" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/moe/utils/quant_utils.py", - "blob_id": "5819ab753e57655185572ce1e49c24e6268171b4" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/__init__.py", - "blob_id": "cc806778863c03ccb3157343cd6331c1c6ca332c" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/_moe_gasoel7noy6kw.abi3.so", - "blob_id": "cd914e2830fbe3fbdcf31c5fa2f37c384d2c36d5" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/_ops.py", - "blob_id": "c69fb498baf329dda803ec0f90dc4b7756fb1ff0" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "56c1a4e3af0b4a93fff71028d8e04bf73f0abb29" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "d3677bebb82a7f3f19344ef6471626493cf2c5bb" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "265768fb900ccfe9612b4a0d25973e6618f22a79" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "d3be23dfc903ba61d3d4d79c0230952b24d2ead0" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "589f5d39f31418d5121e7cbb2e6f2894b0a7ed32" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", - "blob_id": "2c78bfaba7890772bf266721f5577202ea443882" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "4da841e74a79f9589fecac1fa557ea132d34805f" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "200356713c0d0a76e199671c7ec8f10d0e5ee0ac" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "e076615ee541a5043556f630ecf0946c4e2c1408" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "ee896554b921040d7810bb6e9368cc200777951d" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "05aed8b1c81492151d128ef251afc510d8cc8ed5" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json", - "blob_id": "9262a74a4a0e1e3789f260a3ef7f6cb9551f3f2b" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "d251f9b5accaec977fc87a0999cd56ee387fc650" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "0ecf814a28a9441e89f892eb3d63dcf8dcb0dd97" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": 
"51ad5b299eb22465fa80530d12bdd5d7a03ce398" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "ee5119182556cf49434c10e56cf04e3baeb26408" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "68793c77b33c4f4b97d0a4b780fcbe8043c799de" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "612910720ed9439e56c4af4c03f30fee224fac80" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "039a10ed127b77836a7f41c03513292613852b30" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "3793fcafee60bc7e8f5f12d601cb3192abfa9ca8" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "51d03d8607122d7b9bc20ba48d8432d62367fa00" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", - "blob_id": "26f9abd6b789e9dd0f83ec7721fd1bae8aa76bec" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "cd0cdbea0c3372674cb610870dd0b30325864549" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "64be6e6591422aa0f441c3747b6c49850929652e" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "0a6a6a73fa45e270f01ba7ebdc6d9d55bf9daad3" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "ba9041d008507e31ae4179ef2bc863a49c606582" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "7a7508aab04599cb06641c835d8b0a14f54d0716" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "dbf9a2dd6f048d8adee290961e2aea72035f7615" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", - "blob_id": "bbb2386046b1135a2cc7ab7cb26c1d0b039bcf3a" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "57055453aa24c831dad9ac8e37fdab707c63ef91" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "8cc6c643f236d2f7f9ad29354d9e469d00b20d3f" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "d4c9ddd12972ac0b5fd2be11a9cd1075906e3978" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "b2799ed3a866e25b78d60d92910c000ebb21ff71" - }, - { - "filename": 
"build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "b8d3be2313fa14025d8aeb2fd11e0d1ee997ffa6" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json", - "blob_id": "6a976788f9b10af19ebcfe582a69cbc627f9457b" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "3f3ccdafa88f3452a695efad4cb9622d6ae79e6a" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json", - "blob_id": "0a46390b2e31bba6a7c3ab2c9f6c8de6004857bb" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json", - "blob_id": "f4c0f8417b384870050a95e0cf57edbdf6352b23" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "5c8185cfdeec167ec4b88de51b4b395e28769cc5" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "97c9f4445b166657ad29f1db9fc8281f9c463ec4" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "0bb423b28f5ab3825929a4870b96393262a9dd9f" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "55571873395464a3b58f549523905f439a8f1716" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "26bcbf26970c7a77c99e2c8eacd83eefa86967bf" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json", - "blob_id": "91011e64c7de4505e9bb462bc70e6a3e7affa878" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json", - "blob_id": "b41f9d443e50678334f906b44fce6d018d69500e" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "edf2a38d12ad3f420f232d2cd61ab149ad138725" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "673bae2ba8ef80ed4d4930739ca7daf0e8f28ee1" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "b2100cebb7f589747430be9ca8c8db368c152d78" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json", - "blob_id": "d720deb4bdd73d194b1023c99e190b8fcfecdaef" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "dbc624731f5cb9afcdc9213183d00d1e5edd4a00" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "cc614e635ea57327c610ce79e99ae5339614f22e" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "32c0c9da471cbe479044095e0ed14a0f54b73620" - }, - { - "filename": 
"build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json", - "blob_id": "f807d4a5abaed9dd686df26837f2dd9f6161300f" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "f578c8d0160ac3ef85b53c8539d3675455a97173" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "918f6839620cbab1f30b0f9383a9129c2cf2cf3d" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "e341a67917d5177bacb3f6767e7b6d92539826ad" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "34b916e574f88c65db1dac5889d74a990dc25e9b" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "6496a38fba8ae09b3025a75f357815b9d6a5e3f4" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "3618053b65831b95c4bb0f20ef3b9aa816b2d637" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "46a982f5ee9a4bd67ce244b101c576efeeb53b78" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "035ec027fa56622196b24a03a5042ce010deaebf" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "8b49f2781cb54d19a2789767ebb7e8c3fb55b981" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "851bc9f9f0b50b41451b929eaa518869b6a05412" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "d1227c2157990216d2ca51c69ad0944017f53b6a" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "77ba0d7477bdbcb036a43263e7aaa6b6913f8f4e" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "1c61451fb34e52deec827f8f63c80fb15830c202" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "63e661c80de6a7b1422f7a994a2ee7a4b724911c" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "cf354037903c0d1fcd077c4647aabce026a723fb" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "0a5d7bfdba4852da9ed08d1bc27cd7d521d09965" - }, - 
{ - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "eccb86a76df0d7302b760ab6d83a8ceb9fa9d0d9" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "cb91a279d423d0ca25197e0edd5e8c2f4da58720" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "88af48431d8b8791af8df03429704606b670f1f7" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "dd069726d7ed4dcbb449af243f4f4af21815f854" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "7febe3d272b4bb76500f7c6b523396129fd53680" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "56b939e52fac3ed53a4e0ba640c40010cb3af30a" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "63d9a0bf5d79ddaaad547d44338ad4b959ad72b1" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "7fa398c15a2a535401709b0f25e20f6e4b23e58e" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "f15d8f64c7090bd71d0091a524c65d7818fec38e" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9d7658bfc41b2c8fd4daf3fbdf62d15936d3d546" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "cd3e07804fdec10c2cfb291c1ede3ba67b753f9c" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9d5a329d7466a37c0ca68a65a089fbb99f9327a9" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "03dba5ad15ba5f7f49100a5c78e8685e64334b2a" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "96e1594a3eabbaedc792b84b07f05ae8752b7251" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "5ffd367df833d773355590220598a3c7eceba4e0" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9a5ff48b8942957dde9b862aed848390dd267948" - }, - { - "filename": 
"build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "eabc423949a24c2a1fb2368a73e5249caf8d07df" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "386928de139ce718f28222b9c1a6555df3958491" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "51e237b91b8e775a36bcf783c078c2c1cecbcbd2" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "6280219c9ee7d26f7e2fd3625dc92d847ddc7982" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "40c01c0b92b4b26fe480879dda33f18c5eb59a6d" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "c6fd3659799bc31e17f3577e7f0e8d7268faf1fb" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "160f12ed3f95a6967439ff53bc3e3a2cdc97c700" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "e5c4a1d2c94e5c7864f462e083ea5f530b8efe3f" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "2bf5eb27e38208871d50348b170c8c74b80fc519" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9c908e80406587da4d246ce4e3a8a98a14c875b1" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "0a1e14cffbb2a894a701352193947d272427db0d" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "15b1c93f60fc5068ba11b82b6d5924dd2024a824" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "f78e7060e6840ff721d306db556636b0bbc8d9b3" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "8ff12e64c172f5a5d0fbdf900728fe60b33877e2" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "4532f93681e2be175b1bf94f81bfde711821cd60" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "1d3ce5c94c2d9a4a1637204efb3b14f7a5579bdb" - }, - { - "filename": 
"build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "ca7f32b9552b479dc05495792b7e426db5eb1b56" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "5acea242cc0ad094cba8ee5f568ff88afb1b41ae" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "3ab5796ee15b6ec8d4ab1f4ab5a594fecb30e4b4" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "58cdd93e90b8c29bc7a211861711565dbeeb529a" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "b72e0371d1421a1decc9d57860f83eea8f790942" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "3cb7eaa07c745fd3aa2b3242780a7061bedac1de" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "293adce387e066fce75b6e606d4b8b6a5aa10bdb" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/fp8.py", - "blob_id": "23bd7d6703104b0020671cc6ba6f78a6df37e4bf" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/fp8_utils.py", - "blob_id": "acb4f3e3bb1a34f209fdac9ecca8c123aaf67f12" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/fused_marlin_moe.py", - "blob_id": "b3e0a5c24599730faf973fad3cf3fb6031a30522" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/fused_moe.py", - "blob_id": "af2d798cbe5d7c3c1760ce79f717ab5f6d7700ba" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/platforms.py", - "blob_id": "735fab87f2add390f7bf6408ebe31d1f5de6d02b" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/scalar_type.py", - "blob_id": "ea749fe8247b6846620ccbba30ddf48d914ca4e1" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/utils/__init__.py", - "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/utils/marlin_utils.py", - "blob_id": "5037f774b8a8b7e88d822efacbb3b4ea5b95d356" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/utils/marlin_utils_test.py", - "blob_id": "83faac032ca93b3564c620c5b4b1ef63c74aaddf" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/moe/utils/quant_utils.py", - "blob_id": "5819ab753e57655185572ce1e49c24e6268171b4" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/__init__.py", - "blob_id": "cc806778863c03ccb3157343cd6331c1c6ca332c" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/_moe_cobe53r755p6a.abi3.so", - "blob_id": "0082e984366e264ad72eb429f4e138d45f5cbcaf" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/_ops.py", - "blob_id": "153d250d92d9f1bf4b6e318287370706aa5cd385" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "56c1a4e3af0b4a93fff71028d8e04bf73f0abb29" - }, - { - 
"filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "d3677bebb82a7f3f19344ef6471626493cf2c5bb" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "265768fb900ccfe9612b4a0d25973e6618f22a79" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "d3be23dfc903ba61d3d4d79c0230952b24d2ead0" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "589f5d39f31418d5121e7cbb2e6f2894b0a7ed32" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", - "blob_id": "2c78bfaba7890772bf266721f5577202ea443882" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "4da841e74a79f9589fecac1fa557ea132d34805f" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "200356713c0d0a76e199671c7ec8f10d0e5ee0ac" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "e076615ee541a5043556f630ecf0946c4e2c1408" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "ee896554b921040d7810bb6e9368cc200777951d" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "05aed8b1c81492151d128ef251afc510d8cc8ed5" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json", - "blob_id": "9262a74a4a0e1e3789f260a3ef7f6cb9551f3f2b" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "d251f9b5accaec977fc87a0999cd56ee387fc650" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "0ecf814a28a9441e89f892eb3d63dcf8dcb0dd97" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "51ad5b299eb22465fa80530d12bdd5d7a03ce398" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "ee5119182556cf49434c10e56cf04e3baeb26408" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "68793c77b33c4f4b97d0a4b780fcbe8043c799de" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "612910720ed9439e56c4af4c03f30fee224fac80" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "039a10ed127b77836a7f41c03513292613852b30" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": 
"3793fcafee60bc7e8f5f12d601cb3192abfa9ca8" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "51d03d8607122d7b9bc20ba48d8432d62367fa00" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", - "blob_id": "26f9abd6b789e9dd0f83ec7721fd1bae8aa76bec" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "cd0cdbea0c3372674cb610870dd0b30325864549" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "64be6e6591422aa0f441c3747b6c49850929652e" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "0a6a6a73fa45e270f01ba7ebdc6d9d55bf9daad3" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "ba9041d008507e31ae4179ef2bc863a49c606582" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "7a7508aab04599cb06641c835d8b0a14f54d0716" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "dbf9a2dd6f048d8adee290961e2aea72035f7615" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", - "blob_id": "bbb2386046b1135a2cc7ab7cb26c1d0b039bcf3a" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "57055453aa24c831dad9ac8e37fdab707c63ef91" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "8cc6c643f236d2f7f9ad29354d9e469d00b20d3f" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "d4c9ddd12972ac0b5fd2be11a9cd1075906e3978" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "b2799ed3a866e25b78d60d92910c000ebb21ff71" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "b8d3be2313fa14025d8aeb2fd11e0d1ee997ffa6" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json", - "blob_id": "6a976788f9b10af19ebcfe582a69cbc627f9457b" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "3f3ccdafa88f3452a695efad4cb9622d6ae79e6a" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json", - "blob_id": "0a46390b2e31bba6a7c3ab2c9f6c8de6004857bb" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json", - "blob_id": "f4c0f8417b384870050a95e0cf57edbdf6352b23" - }, - { - "filename": 
"build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "5c8185cfdeec167ec4b88de51b4b395e28769cc5" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "97c9f4445b166657ad29f1db9fc8281f9c463ec4" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "0bb423b28f5ab3825929a4870b96393262a9dd9f" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "55571873395464a3b58f549523905f439a8f1716" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "26bcbf26970c7a77c99e2c8eacd83eefa86967bf" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json", - "blob_id": "91011e64c7de4505e9bb462bc70e6a3e7affa878" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json", - "blob_id": "b41f9d443e50678334f906b44fce6d018d69500e" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "edf2a38d12ad3f420f232d2cd61ab149ad138725" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "673bae2ba8ef80ed4d4930739ca7daf0e8f28ee1" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "b2100cebb7f589747430be9ca8c8db368c152d78" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json", - "blob_id": "d720deb4bdd73d194b1023c99e190b8fcfecdaef" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "dbc624731f5cb9afcdc9213183d00d1e5edd4a00" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "cc614e635ea57327c610ce79e99ae5339614f22e" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "32c0c9da471cbe479044095e0ed14a0f54b73620" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json", - "blob_id": "f807d4a5abaed9dd686df26837f2dd9f6161300f" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "f578c8d0160ac3ef85b53c8539d3675455a97173" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "918f6839620cbab1f30b0f9383a9129c2cf2cf3d" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "e341a67917d5177bacb3f6767e7b6d92539826ad" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "34b916e574f88c65db1dac5889d74a990dc25e9b" - }, - { - "filename": 
"build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "6496a38fba8ae09b3025a75f357815b9d6a5e3f4" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "3618053b65831b95c4bb0f20ef3b9aa816b2d637" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "46a982f5ee9a4bd67ce244b101c576efeeb53b78" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "035ec027fa56622196b24a03a5042ce010deaebf" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "8b49f2781cb54d19a2789767ebb7e8c3fb55b981" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "851bc9f9f0b50b41451b929eaa518869b6a05412" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "d1227c2157990216d2ca51c69ad0944017f53b6a" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "77ba0d7477bdbcb036a43263e7aaa6b6913f8f4e" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "1c61451fb34e52deec827f8f63c80fb15830c202" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "63e661c80de6a7b1422f7a994a2ee7a4b724911c" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "cf354037903c0d1fcd077c4647aabce026a723fb" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "0a5d7bfdba4852da9ed08d1bc27cd7d521d09965" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "eccb86a76df0d7302b760ab6d83a8ceb9fa9d0d9" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "cb91a279d423d0ca25197e0edd5e8c2f4da58720" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "88af48431d8b8791af8df03429704606b670f1f7" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "dd069726d7ed4dcbb449af243f4f4af21815f854" - }, - { - "filename": 
"build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "7febe3d272b4bb76500f7c6b523396129fd53680" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "56b939e52fac3ed53a4e0ba640c40010cb3af30a" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "63d9a0bf5d79ddaaad547d44338ad4b959ad72b1" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "7fa398c15a2a535401709b0f25e20f6e4b23e58e" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "f15d8f64c7090bd71d0091a524c65d7818fec38e" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9d7658bfc41b2c8fd4daf3fbdf62d15936d3d546" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "cd3e07804fdec10c2cfb291c1ede3ba67b753f9c" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9d5a329d7466a37c0ca68a65a089fbb99f9327a9" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "03dba5ad15ba5f7f49100a5c78e8685e64334b2a" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "96e1594a3eabbaedc792b84b07f05ae8752b7251" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "5ffd367df833d773355590220598a3c7eceba4e0" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9a5ff48b8942957dde9b862aed848390dd267948" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "eabc423949a24c2a1fb2368a73e5249caf8d07df" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "386928de139ce718f28222b9c1a6555df3958491" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "51e237b91b8e775a36bcf783c078c2c1cecbcbd2" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "6280219c9ee7d26f7e2fd3625dc92d847ddc7982" - }, - { - "filename": 
"build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "40c01c0b92b4b26fe480879dda33f18c5eb59a6d" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "c6fd3659799bc31e17f3577e7f0e8d7268faf1fb" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "160f12ed3f95a6967439ff53bc3e3a2cdc97c700" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "e5c4a1d2c94e5c7864f462e083ea5f530b8efe3f" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "2bf5eb27e38208871d50348b170c8c74b80fc519" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9c908e80406587da4d246ce4e3a8a98a14c875b1" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "0a1e14cffbb2a894a701352193947d272427db0d" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "15b1c93f60fc5068ba11b82b6d5924dd2024a824" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "f78e7060e6840ff721d306db556636b0bbc8d9b3" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "8ff12e64c172f5a5d0fbdf900728fe60b33877e2" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "4532f93681e2be175b1bf94f81bfde711821cd60" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "1d3ce5c94c2d9a4a1637204efb3b14f7a5579bdb" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "ca7f32b9552b479dc05495792b7e426db5eb1b56" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "5acea242cc0ad094cba8ee5f568ff88afb1b41ae" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "3ab5796ee15b6ec8d4ab1f4ab5a594fecb30e4b4" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "58cdd93e90b8c29bc7a211861711565dbeeb529a" - }, - { - "filename": 
"build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "b72e0371d1421a1decc9d57860f83eea8f790942" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "3cb7eaa07c745fd3aa2b3242780a7061bedac1de" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "293adce387e066fce75b6e606d4b8b6a5aa10bdb" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/fp8.py", - "blob_id": "23bd7d6703104b0020671cc6ba6f78a6df37e4bf" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/fp8_utils.py", - "blob_id": "acb4f3e3bb1a34f209fdac9ecca8c123aaf67f12" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/fused_marlin_moe.py", - "blob_id": "b3e0a5c24599730faf973fad3cf3fb6031a30522" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/fused_moe.py", - "blob_id": "af2d798cbe5d7c3c1760ce79f717ab5f6d7700ba" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/platforms.py", - "blob_id": "735fab87f2add390f7bf6408ebe31d1f5de6d02b" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/scalar_type.py", - "blob_id": "ea749fe8247b6846620ccbba30ddf48d914ca4e1" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/utils/__init__.py", - "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/utils/marlin_utils.py", - "blob_id": "5037f774b8a8b7e88d822efacbb3b4ea5b95d356" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/utils/marlin_utils_test.py", - "blob_id": "83faac032ca93b3564c620c5b4b1ef63c74aaddf" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/moe/utils/quant_utils.py", - "blob_id": "5819ab753e57655185572ce1e49c24e6268171b4" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/__init__.py", - "blob_id": "cc806778863c03ccb3157343cd6331c1c6ca332c" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/_moe_iqmfy23diekyw.abi3.so", - "blob_id": "25f80aad80865365eae32a1609be0219f7e6582e" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/_ops.py", - "blob_id": "73e0213d234c5717aea5d708cff8e0938f14bce9" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "56c1a4e3af0b4a93fff71028d8e04bf73f0abb29" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "d3677bebb82a7f3f19344ef6471626493cf2c5bb" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "265768fb900ccfe9612b4a0d25973e6618f22a79" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "d3be23dfc903ba61d3d4d79c0230952b24d2ead0" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "589f5d39f31418d5121e7cbb2e6f2894b0a7ed32" - }, - { - "filename": 
"build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", - "blob_id": "2c78bfaba7890772bf266721f5577202ea443882" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "4da841e74a79f9589fecac1fa557ea132d34805f" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "200356713c0d0a76e199671c7ec8f10d0e5ee0ac" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "e076615ee541a5043556f630ecf0946c4e2c1408" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "ee896554b921040d7810bb6e9368cc200777951d" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "05aed8b1c81492151d128ef251afc510d8cc8ed5" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json", - "blob_id": "9262a74a4a0e1e3789f260a3ef7f6cb9551f3f2b" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "d251f9b5accaec977fc87a0999cd56ee387fc650" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "0ecf814a28a9441e89f892eb3d63dcf8dcb0dd97" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "51ad5b299eb22465fa80530d12bdd5d7a03ce398" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "ee5119182556cf49434c10e56cf04e3baeb26408" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "68793c77b33c4f4b97d0a4b780fcbe8043c799de" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "612910720ed9439e56c4af4c03f30fee224fac80" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "039a10ed127b77836a7f41c03513292613852b30" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "3793fcafee60bc7e8f5f12d601cb3192abfa9ca8" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "51d03d8607122d7b9bc20ba48d8432d62367fa00" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", - "blob_id": "26f9abd6b789e9dd0f83ec7721fd1bae8aa76bec" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "cd0cdbea0c3372674cb610870dd0b30325864549" - }, - { - "filename": 
"build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "64be6e6591422aa0f441c3747b6c49850929652e" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "0a6a6a73fa45e270f01ba7ebdc6d9d55bf9daad3" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "ba9041d008507e31ae4179ef2bc863a49c606582" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "7a7508aab04599cb06641c835d8b0a14f54d0716" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "dbf9a2dd6f048d8adee290961e2aea72035f7615" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", - "blob_id": "bbb2386046b1135a2cc7ab7cb26c1d0b039bcf3a" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "57055453aa24c831dad9ac8e37fdab707c63ef91" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "8cc6c643f236d2f7f9ad29354d9e469d00b20d3f" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "d4c9ddd12972ac0b5fd2be11a9cd1075906e3978" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "b2799ed3a866e25b78d60d92910c000ebb21ff71" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "b8d3be2313fa14025d8aeb2fd11e0d1ee997ffa6" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json", - "blob_id": "6a976788f9b10af19ebcfe582a69cbc627f9457b" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "3f3ccdafa88f3452a695efad4cb9622d6ae79e6a" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json", - "blob_id": "0a46390b2e31bba6a7c3ab2c9f6c8de6004857bb" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json", - "blob_id": "f4c0f8417b384870050a95e0cf57edbdf6352b23" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "5c8185cfdeec167ec4b88de51b4b395e28769cc5" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "97c9f4445b166657ad29f1db9fc8281f9c463ec4" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "0bb423b28f5ab3825929a4870b96393262a9dd9f" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": 
"55571873395464a3b58f549523905f439a8f1716" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "26bcbf26970c7a77c99e2c8eacd83eefa86967bf" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json", - "blob_id": "91011e64c7de4505e9bb462bc70e6a3e7affa878" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json", - "blob_id": "b41f9d443e50678334f906b44fce6d018d69500e" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "edf2a38d12ad3f420f232d2cd61ab149ad138725" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "673bae2ba8ef80ed4d4930739ca7daf0e8f28ee1" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "b2100cebb7f589747430be9ca8c8db368c152d78" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json", - "blob_id": "d720deb4bdd73d194b1023c99e190b8fcfecdaef" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "dbc624731f5cb9afcdc9213183d00d1e5edd4a00" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "cc614e635ea57327c610ce79e99ae5339614f22e" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "32c0c9da471cbe479044095e0ed14a0f54b73620" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json", - "blob_id": "f807d4a5abaed9dd686df26837f2dd9f6161300f" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "f578c8d0160ac3ef85b53c8539d3675455a97173" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "918f6839620cbab1f30b0f9383a9129c2cf2cf3d" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "e341a67917d5177bacb3f6767e7b6d92539826ad" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "34b916e574f88c65db1dac5889d74a990dc25e9b" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "6496a38fba8ae09b3025a75f357815b9d6a5e3f4" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "3618053b65831b95c4bb0f20ef3b9aa816b2d637" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "46a982f5ee9a4bd67ce244b101c576efeeb53b78" - }, - { - "filename": 
"build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "035ec027fa56622196b24a03a5042ce010deaebf" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "8b49f2781cb54d19a2789767ebb7e8c3fb55b981" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "851bc9f9f0b50b41451b929eaa518869b6a05412" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "d1227c2157990216d2ca51c69ad0944017f53b6a" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "77ba0d7477bdbcb036a43263e7aaa6b6913f8f4e" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "1c61451fb34e52deec827f8f63c80fb15830c202" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "63e661c80de6a7b1422f7a994a2ee7a4b724911c" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "cf354037903c0d1fcd077c4647aabce026a723fb" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "0a5d7bfdba4852da9ed08d1bc27cd7d521d09965" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "eccb86a76df0d7302b760ab6d83a8ceb9fa9d0d9" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "cb91a279d423d0ca25197e0edd5e8c2f4da58720" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "88af48431d8b8791af8df03429704606b670f1f7" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "dd069726d7ed4dcbb449af243f4f4af21815f854" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "7febe3d272b4bb76500f7c6b523396129fd53680" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "56b939e52fac3ed53a4e0ba640c40010cb3af30a" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "63d9a0bf5d79ddaaad547d44338ad4b959ad72b1" - }, - { - "filename": 
"build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "7fa398c15a2a535401709b0f25e20f6e4b23e58e" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "f15d8f64c7090bd71d0091a524c65d7818fec38e" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9d7658bfc41b2c8fd4daf3fbdf62d15936d3d546" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "cd3e07804fdec10c2cfb291c1ede3ba67b753f9c" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9d5a329d7466a37c0ca68a65a089fbb99f9327a9" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "03dba5ad15ba5f7f49100a5c78e8685e64334b2a" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "96e1594a3eabbaedc792b84b07f05ae8752b7251" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "5ffd367df833d773355590220598a3c7eceba4e0" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9a5ff48b8942957dde9b862aed848390dd267948" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "eabc423949a24c2a1fb2368a73e5249caf8d07df" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "386928de139ce718f28222b9c1a6555df3958491" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "51e237b91b8e775a36bcf783c078c2c1cecbcbd2" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "6280219c9ee7d26f7e2fd3625dc92d847ddc7982" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "40c01c0b92b4b26fe480879dda33f18c5eb59a6d" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "c6fd3659799bc31e17f3577e7f0e8d7268faf1fb" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "160f12ed3f95a6967439ff53bc3e3a2cdc97c700" - }, - { - "filename": 
"build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "e5c4a1d2c94e5c7864f462e083ea5f530b8efe3f" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "2bf5eb27e38208871d50348b170c8c74b80fc519" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9c908e80406587da4d246ce4e3a8a98a14c875b1" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "0a1e14cffbb2a894a701352193947d272427db0d" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "15b1c93f60fc5068ba11b82b6d5924dd2024a824" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "f78e7060e6840ff721d306db556636b0bbc8d9b3" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "8ff12e64c172f5a5d0fbdf900728fe60b33877e2" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "4532f93681e2be175b1bf94f81bfde711821cd60" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "1d3ce5c94c2d9a4a1637204efb3b14f7a5579bdb" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "ca7f32b9552b479dc05495792b7e426db5eb1b56" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "5acea242cc0ad094cba8ee5f568ff88afb1b41ae" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "3ab5796ee15b6ec8d4ab1f4ab5a594fecb30e4b4" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "58cdd93e90b8c29bc7a211861711565dbeeb529a" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "b72e0371d1421a1decc9d57860f83eea8f790942" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "3cb7eaa07c745fd3aa2b3242780a7061bedac1de" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "293adce387e066fce75b6e606d4b8b6a5aa10bdb" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/fp8.py", - "blob_id": 
"23bd7d6703104b0020671cc6ba6f78a6df37e4bf" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/fp8_utils.py", - "blob_id": "acb4f3e3bb1a34f209fdac9ecca8c123aaf67f12" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/fused_marlin_moe.py", - "blob_id": "b3e0a5c24599730faf973fad3cf3fb6031a30522" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/fused_moe.py", - "blob_id": "af2d798cbe5d7c3c1760ce79f717ab5f6d7700ba" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/platforms.py", - "blob_id": "735fab87f2add390f7bf6408ebe31d1f5de6d02b" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/scalar_type.py", - "blob_id": "ea749fe8247b6846620ccbba30ddf48d914ca4e1" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/utils/__init__.py", - "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/utils/marlin_utils.py", - "blob_id": "5037f774b8a8b7e88d822efacbb3b4ea5b95d356" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/utils/marlin_utils_test.py", - "blob_id": "83faac032ca93b3564c620c5b4b1ef63c74aaddf" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/moe/utils/quant_utils.py", - "blob_id": "5819ab753e57655185572ce1e49c24e6268171b4" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/__init__.py", - "blob_id": "cc806778863c03ccb3157343cd6331c1c6ca332c" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/_moe_6xluzhr5x6fw4.abi3.so", - "blob_id": "63a2723397b7c031719534dd6c23e3eaa1a85a23" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/_ops.py", - "blob_id": "82fd79cf07706341ad17f8ea5d841b6d018cd676" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "56c1a4e3af0b4a93fff71028d8e04bf73f0abb29" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "d3677bebb82a7f3f19344ef6471626493cf2c5bb" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "265768fb900ccfe9612b4a0d25973e6618f22a79" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "d3be23dfc903ba61d3d4d79c0230952b24d2ead0" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "589f5d39f31418d5121e7cbb2e6f2894b0a7ed32" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", - "blob_id": "2c78bfaba7890772bf266721f5577202ea443882" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "4da841e74a79f9589fecac1fa557ea132d34805f" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "200356713c0d0a76e199671c7ec8f10d0e5ee0ac" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "e076615ee541a5043556f630ecf0946c4e2c1408" - }, - { - "filename": 
"build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "ee896554b921040d7810bb6e9368cc200777951d" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "05aed8b1c81492151d128ef251afc510d8cc8ed5" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json", - "blob_id": "9262a74a4a0e1e3789f260a3ef7f6cb9551f3f2b" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "d251f9b5accaec977fc87a0999cd56ee387fc650" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "0ecf814a28a9441e89f892eb3d63dcf8dcb0dd97" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "51ad5b299eb22465fa80530d12bdd5d7a03ce398" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "ee5119182556cf49434c10e56cf04e3baeb26408" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "68793c77b33c4f4b97d0a4b780fcbe8043c799de" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "612910720ed9439e56c4af4c03f30fee224fac80" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "039a10ed127b77836a7f41c03513292613852b30" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "3793fcafee60bc7e8f5f12d601cb3192abfa9ca8" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "51d03d8607122d7b9bc20ba48d8432d62367fa00" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", - "blob_id": "26f9abd6b789e9dd0f83ec7721fd1bae8aa76bec" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "cd0cdbea0c3372674cb610870dd0b30325864549" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "64be6e6591422aa0f441c3747b6c49850929652e" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "0a6a6a73fa45e270f01ba7ebdc6d9d55bf9daad3" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "ba9041d008507e31ae4179ef2bc863a49c606582" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json", - "blob_id": "7a7508aab04599cb06641c835d8b0a14f54d0716" - }, - { - "filename": 
"build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "dbf9a2dd6f048d8adee290961e2aea72035f7615" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json", - "blob_id": "bbb2386046b1135a2cc7ab7cb26c1d0b039bcf3a" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "57055453aa24c831dad9ac8e37fdab707c63ef91" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "8cc6c643f236d2f7f9ad29354d9e469d00b20d3f" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "d4c9ddd12972ac0b5fd2be11a9cd1075906e3978" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "b2799ed3a866e25b78d60d92910c000ebb21ff71" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "b8d3be2313fa14025d8aeb2fd11e0d1ee997ffa6" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json", - "blob_id": "6a976788f9b10af19ebcfe582a69cbc627f9457b" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "3f3ccdafa88f3452a695efad4cb9622d6ae79e6a" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json", - "blob_id": "0a46390b2e31bba6a7c3ab2c9f6c8de6004857bb" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json", - "blob_id": "f4c0f8417b384870050a95e0cf57edbdf6352b23" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "5c8185cfdeec167ec4b88de51b4b395e28769cc5" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "97c9f4445b166657ad29f1db9fc8281f9c463ec4" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "0bb423b28f5ab3825929a4870b96393262a9dd9f" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "55571873395464a3b58f549523905f439a8f1716" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "26bcbf26970c7a77c99e2c8eacd83eefa86967bf" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json", - "blob_id": "91011e64c7de4505e9bb462bc70e6a3e7affa878" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json", - "blob_id": "b41f9d443e50678334f906b44fce6d018d69500e" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "edf2a38d12ad3f420f232d2cd61ab149ad138725" - }, - { - "filename": 
"build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "673bae2ba8ef80ed4d4930739ca7daf0e8f28ee1" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "b2100cebb7f589747430be9ca8c8db368c152d78" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json", - "blob_id": "d720deb4bdd73d194b1023c99e190b8fcfecdaef" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "dbc624731f5cb9afcdc9213183d00d1e5edd4a00" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "cc614e635ea57327c610ce79e99ae5339614f22e" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "32c0c9da471cbe479044095e0ed14a0f54b73620" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json", - "blob_id": "f807d4a5abaed9dd686df26837f2dd9f6161300f" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json", - "blob_id": "f578c8d0160ac3ef85b53c8539d3675455a97173" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "918f6839620cbab1f30b0f9383a9129c2cf2cf3d" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json", - "blob_id": "e341a67917d5177bacb3f6767e7b6d92539826ad" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json", - "blob_id": "34b916e574f88c65db1dac5889d74a990dc25e9b" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "6496a38fba8ae09b3025a75f357815b9d6a5e3f4" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "3618053b65831b95c4bb0f20ef3b9aa816b2d637" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "46a982f5ee9a4bd67ce244b101c576efeeb53b78" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "035ec027fa56622196b24a03a5042ce010deaebf" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "8b49f2781cb54d19a2789767ebb7e8c3fb55b981" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "851bc9f9f0b50b41451b929eaa518869b6a05412" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": 
"d1227c2157990216d2ca51c69ad0944017f53b6a" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "77ba0d7477bdbcb036a43263e7aaa6b6913f8f4e" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "1c61451fb34e52deec827f8f63c80fb15830c202" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "63e661c80de6a7b1422f7a994a2ee7a4b724911c" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "cf354037903c0d1fcd077c4647aabce026a723fb" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "0a5d7bfdba4852da9ed08d1bc27cd7d521d09965" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "eccb86a76df0d7302b760ab6d83a8ceb9fa9d0d9" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "cb91a279d423d0ca25197e0edd5e8c2f4da58720" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "88af48431d8b8791af8df03429704606b670f1f7" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "dd069726d7ed4dcbb449af243f4f4af21815f854" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "7febe3d272b4bb76500f7c6b523396129fd53680" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "56b939e52fac3ed53a4e0ba640c40010cb3af30a" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "63d9a0bf5d79ddaaad547d44338ad4b959ad72b1" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "7fa398c15a2a535401709b0f25e20f6e4b23e58e" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "f15d8f64c7090bd71d0091a524c65d7818fec38e" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9d7658bfc41b2c8fd4daf3fbdf62d15936d3d546" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "cd3e07804fdec10c2cfb291c1ede3ba67b753f9c" - }, - { - "filename": 
"build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9d5a329d7466a37c0ca68a65a089fbb99f9327a9" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "03dba5ad15ba5f7f49100a5c78e8685e64334b2a" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "96e1594a3eabbaedc792b84b07f05ae8752b7251" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "5ffd367df833d773355590220598a3c7eceba4e0" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9a5ff48b8942957dde9b862aed848390dd267948" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "eabc423949a24c2a1fb2368a73e5249caf8d07df" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "386928de139ce718f28222b9c1a6555df3958491" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "51e237b91b8e775a36bcf783c078c2c1cecbcbd2" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "6280219c9ee7d26f7e2fd3625dc92d847ddc7982" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "40c01c0b92b4b26fe480879dda33f18c5eb59a6d" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "c6fd3659799bc31e17f3577e7f0e8d7268faf1fb" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "160f12ed3f95a6967439ff53bc3e3a2cdc97c700" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "e5c4a1d2c94e5c7864f462e083ea5f530b8efe3f" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "2bf5eb27e38208871d50348b170c8c74b80fc519" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "9c908e80406587da4d246ce4e3a8a98a14c875b1" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "0a1e14cffbb2a894a701352193947d272427db0d" - }, - { - "filename": 
"build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "15b1c93f60fc5068ba11b82b6d5924dd2024a824" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "f78e7060e6840ff721d306db556636b0bbc8d9b3" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "8ff12e64c172f5a5d0fbdf900728fe60b33877e2" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "4532f93681e2be175b1bf94f81bfde711821cd60" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "1d3ce5c94c2d9a4a1637204efb3b14f7a5579bdb" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "ca7f32b9552b479dc05495792b7e426db5eb1b56" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "5acea242cc0ad094cba8ee5f568ff88afb1b41ae" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "3ab5796ee15b6ec8d4ab1f4ab5a594fecb30e4b4" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "58cdd93e90b8c29bc7a211861711565dbeeb529a" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "b72e0371d1421a1decc9d57860f83eea8f790942" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "3cb7eaa07c745fd3aa2b3242780a7061bedac1de" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json", - "blob_id": "293adce387e066fce75b6e606d4b8b6a5aa10bdb" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/fp8.py", - "blob_id": "23bd7d6703104b0020671cc6ba6f78a6df37e4bf" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/fp8_utils.py", - "blob_id": "acb4f3e3bb1a34f209fdac9ecca8c123aaf67f12" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/fused_marlin_moe.py", - "blob_id": "b3e0a5c24599730faf973fad3cf3fb6031a30522" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/fused_moe.py", - "blob_id": "af2d798cbe5d7c3c1760ce79f717ab5f6d7700ba" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/platforms.py", - "blob_id": "735fab87f2add390f7bf6408ebe31d1f5de6d02b" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/scalar_type.py", - "blob_id": "ea749fe8247b6846620ccbba30ddf48d914ca4e1" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/utils/__init__.py", - "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" - 
}, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/utils/marlin_utils.py", - "blob_id": "5037f774b8a8b7e88d822efacbb3b4ea5b95d356" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/utils/marlin_utils_test.py", - "blob_id": "83faac032ca93b3564c620c5b4b1ef63c74aaddf" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/moe/utils/quant_utils.py", - "blob_id": "5819ab753e57655185572ce1e49c24e6268171b4" - } - ] - }, - { - "repo_id": "kernels-community/quantization", - "sha": "95272c71ca71b1ddbacb0105dab54e5d5240bd5c", - "files": [ - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/__init__.py", - "blob_id": "c3ab3b032c29f7bbafd549915dbc677c45a33837" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/_ops.py", - "blob_id": "07486f19bf899de0eef6e7de9a2a1b08f48e4530" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/_quantization_diycyzqnjjd5k.abi3.so", - "blob_id": "4f729314a4b4a86bc21b05b5e50f6677e7a83a06" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/compressed_tensors.py", - "blob_id": "c3ba30bac87979a307fc5061a46f5d2cbf0efbf9" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/cutlass.py", - "blob_id": "c378b846d0c59de183a321fcad4b403c47b3d750" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/marlin.py", - "blob_id": "44d5d28a2fb67af955c017af3cf1403feeecbd32" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/scalar_type.py", - "blob_id": "9d711b0debcd8aaa343818edc9d6bbca20587d0a" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/utils/__init__.py", - "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/utils/marlin_utils.py", - "blob_id": "b1c94c38858a5cd6f02eb134d1a94b99a2b15566" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/utils/marlin_utils_fp8.py", - "blob_id": "b269fa6a4cee316e8299ecc86c3e7594b336b499" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/utils/marlin_utils_test.py", - "blob_id": "7d4f5f3cfbb872bf7b32e0972d6143b43f354a5e" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/utils/marlin_utils_test_24.py", - "blob_id": "927fa9016ba25f381c09d768db0c468066193a76" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/utils/marlin_utils_test_qqq.py", - "blob_id": "cb58eb945836393c58c53f5c6d702d53861c33f9" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization/utils/quant_utils.py", - "blob_id": "d97e03913fa5980e0be73b160088c8e4f5f49a52" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization/__init__.py", - "blob_id": "c3ab3b032c29f7bbafd549915dbc677c45a33837" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization/_ops.py", - "blob_id": "5430a686ea7ef8be7217b48a28b5429606cf93d6" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization/_quantization_elb4wso45znfy.abi3.so", - "blob_id": "831303548db0b8341f67b8c281c5dff35d68c5e3" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization/compressed_tensors.py", - "blob_id": "c3ba30bac87979a307fc5061a46f5d2cbf0efbf9" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization/cutlass.py", - "blob_id": "c378b846d0c59de183a321fcad4b403c47b3d750" - }, - { - "filename": 
"build/torch25-cxx11-cu121-x86_64-linux/quantization/marlin.py", - "blob_id": "44d5d28a2fb67af955c017af3cf1403feeecbd32" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization/scalar_type.py", - "blob_id": "9d711b0debcd8aaa343818edc9d6bbca20587d0a" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization/utils/__init__.py", - "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization/utils/marlin_utils.py", - "blob_id": "b1c94c38858a5cd6f02eb134d1a94b99a2b15566" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization/utils/marlin_utils_fp8.py", - "blob_id": "b269fa6a4cee316e8299ecc86c3e7594b336b499" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization/utils/marlin_utils_test.py", - "blob_id": "7d4f5f3cfbb872bf7b32e0972d6143b43f354a5e" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization/utils/marlin_utils_test_24.py", - "blob_id": "927fa9016ba25f381c09d768db0c468066193a76" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization/utils/marlin_utils_test_qqq.py", - "blob_id": "cb58eb945836393c58c53f5c6d702d53861c33f9" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization/utils/quant_utils.py", - "blob_id": "d97e03913fa5980e0be73b160088c8e4f5f49a52" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/__init__.py", - "blob_id": "c3ab3b032c29f7bbafd549915dbc677c45a33837" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/_ops.py", - "blob_id": "f5217e75a63c8f07110478117396d92226c0c0ca" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/_quantization_unicgkq3a7la6.abi3.so", - "blob_id": "c16ea0b257847bdac8068a18921e0a3b5dce6b92" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/compressed_tensors.py", - "blob_id": "c3ba30bac87979a307fc5061a46f5d2cbf0efbf9" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/cutlass.py", - "blob_id": "c378b846d0c59de183a321fcad4b403c47b3d750" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/marlin.py", - "blob_id": "44d5d28a2fb67af955c017af3cf1403feeecbd32" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/scalar_type.py", - "blob_id": "9d711b0debcd8aaa343818edc9d6bbca20587d0a" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/utils/__init__.py", - "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/utils/marlin_utils.py", - "blob_id": "b1c94c38858a5cd6f02eb134d1a94b99a2b15566" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/utils/marlin_utils_fp8.py", - "blob_id": "b269fa6a4cee316e8299ecc86c3e7594b336b499" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/utils/marlin_utils_test.py", - "blob_id": "7d4f5f3cfbb872bf7b32e0972d6143b43f354a5e" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/utils/marlin_utils_test_24.py", - "blob_id": "927fa9016ba25f381c09d768db0c468066193a76" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/utils/marlin_utils_test_qqq.py", - "blob_id": "cb58eb945836393c58c53f5c6d702d53861c33f9" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization/utils/quant_utils.py", - "blob_id": "d97e03913fa5980e0be73b160088c8e4f5f49a52" - }, - 
{ - "filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/__init__.py", - "blob_id": "c3ab3b032c29f7bbafd549915dbc677c45a33837" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/_ops.py", - "blob_id": "1e16b3f78f15e21f94d1241338a5ad6fcf9fa6dc" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/_quantization_f4o2yj2oj7kni.abi3.so", - "blob_id": "bda41c03212d0ca513c15046c8b4ca07edf23544" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/compressed_tensors.py", - "blob_id": "c3ba30bac87979a307fc5061a46f5d2cbf0efbf9" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/cutlass.py", - "blob_id": "c378b846d0c59de183a321fcad4b403c47b3d750" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/marlin.py", - "blob_id": "44d5d28a2fb67af955c017af3cf1403feeecbd32" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/scalar_type.py", - "blob_id": "9d711b0debcd8aaa343818edc9d6bbca20587d0a" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/utils/__init__.py", - "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/utils/marlin_utils.py", - "blob_id": "b1c94c38858a5cd6f02eb134d1a94b99a2b15566" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/utils/marlin_utils_fp8.py", - "blob_id": "b269fa6a4cee316e8299ecc86c3e7594b336b499" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/utils/marlin_utils_test.py", - "blob_id": "7d4f5f3cfbb872bf7b32e0972d6143b43f354a5e" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/utils/marlin_utils_test_24.py", - "blob_id": "927fa9016ba25f381c09d768db0c468066193a76" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/utils/marlin_utils_test_qqq.py", - "blob_id": "cb58eb945836393c58c53f5c6d702d53861c33f9" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization/utils/quant_utils.py", - "blob_id": "d97e03913fa5980e0be73b160088c8e4f5f49a52" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/__init__.py", - "blob_id": "c3ab3b032c29f7bbafd549915dbc677c45a33837" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/_ops.py", - "blob_id": "0d812eef750061481eba1b7ed5fa708cfec31f42" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/_quantization_6nd6n6ctlfohq.abi3.so", - "blob_id": "3a69017602930aa71d55f179769d10bcba21b444" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/compressed_tensors.py", - "blob_id": "c3ba30bac87979a307fc5061a46f5d2cbf0efbf9" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/cutlass.py", - "blob_id": "c378b846d0c59de183a321fcad4b403c47b3d750" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/marlin.py", - "blob_id": "44d5d28a2fb67af955c017af3cf1403feeecbd32" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/scalar_type.py", - "blob_id": "9d711b0debcd8aaa343818edc9d6bbca20587d0a" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/utils/__init__.py", - "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/utils/marlin_utils.py", - "blob_id": "b1c94c38858a5cd6f02eb134d1a94b99a2b15566" - }, - { - "filename": 
"build/torch25-cxx98-cu121-x86_64-linux/quantization/utils/marlin_utils_fp8.py", - "blob_id": "b269fa6a4cee316e8299ecc86c3e7594b336b499" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/utils/marlin_utils_test.py", - "blob_id": "7d4f5f3cfbb872bf7b32e0972d6143b43f354a5e" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/utils/marlin_utils_test_24.py", - "blob_id": "927fa9016ba25f381c09d768db0c468066193a76" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/utils/marlin_utils_test_qqq.py", - "blob_id": "cb58eb945836393c58c53f5c6d702d53861c33f9" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization/utils/quant_utils.py", - "blob_id": "d97e03913fa5980e0be73b160088c8e4f5f49a52" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/__init__.py", - "blob_id": "c3ab3b032c29f7bbafd549915dbc677c45a33837" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/_ops.py", - "blob_id": "e5bdaf0f73a5c870ed0d8ae6345cbec989274e16" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/_quantization_cxckebwxmlb3i.abi3.so", - "blob_id": "b51fa64aa95165e295a32926f8ca1e9ecea61ed3" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/compressed_tensors.py", - "blob_id": "c3ba30bac87979a307fc5061a46f5d2cbf0efbf9" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/cutlass.py", - "blob_id": "c378b846d0c59de183a321fcad4b403c47b3d750" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/marlin.py", - "blob_id": "44d5d28a2fb67af955c017af3cf1403feeecbd32" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/scalar_type.py", - "blob_id": "9d711b0debcd8aaa343818edc9d6bbca20587d0a" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/utils/__init__.py", - "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/utils/marlin_utils.py", - "blob_id": "b1c94c38858a5cd6f02eb134d1a94b99a2b15566" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/utils/marlin_utils_fp8.py", - "blob_id": "b269fa6a4cee316e8299ecc86c3e7594b336b499" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/utils/marlin_utils_test.py", - "blob_id": "7d4f5f3cfbb872bf7b32e0972d6143b43f354a5e" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/utils/marlin_utils_test_24.py", - "blob_id": "927fa9016ba25f381c09d768db0c468066193a76" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/utils/marlin_utils_test_qqq.py", - "blob_id": "cb58eb945836393c58c53f5c6d702d53861c33f9" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization/utils/quant_utils.py", - "blob_id": "d97e03913fa5980e0be73b160088c8e4f5f49a52" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/quantization/__init__.py", - "blob_id": "c3ab3b032c29f7bbafd549915dbc677c45a33837" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/quantization/_ops.py", - "blob_id": "fd057ea2d2b103efdf01641c9767641e093ed947" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/quantization/_quantization_vsrdj55erbiei.abi3.so", - "blob_id": "eb6e2055b9b9330bcb04c316d2fb4570918d24c3" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/quantization/compressed_tensors.py", - "blob_id": 
"c3ba30bac87979a307fc5061a46f5d2cbf0efbf9" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/quantization/cutlass.py", - "blob_id": "c378b846d0c59de183a321fcad4b403c47b3d750" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/quantization/marlin.py", - "blob_id": "44d5d28a2fb67af955c017af3cf1403feeecbd32" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/quantization/scalar_type.py", - "blob_id": "9d711b0debcd8aaa343818edc9d6bbca20587d0a" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/quantization/utils/__init__.py", - "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/quantization/utils/marlin_utils.py", - "blob_id": "b1c94c38858a5cd6f02eb134d1a94b99a2b15566" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/quantization/utils/marlin_utils_fp8.py", - "blob_id": "b269fa6a4cee316e8299ecc86c3e7594b336b499" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/quantization/utils/marlin_utils_test.py", - "blob_id": "7d4f5f3cfbb872bf7b32e0972d6143b43f354a5e" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/quantization/utils/marlin_utils_test_24.py", - "blob_id": "927fa9016ba25f381c09d768db0c468066193a76" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/quantization/utils/marlin_utils_test_qqq.py", - "blob_id": "cb58eb945836393c58c53f5c6d702d53861c33f9" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/quantization/utils/quant_utils.py", - "blob_id": "d97e03913fa5980e0be73b160088c8e4f5f49a52" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/quantization/__init__.py", - "blob_id": "c3ab3b032c29f7bbafd549915dbc677c45a33837" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/quantization/_ops.py", - "blob_id": "0527a08325accf310054df153fbcaf26151b921e" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/quantization/_quantization_gsd2xjzq76rwy.abi3.so", - "blob_id": "46d5dc19eb270598aecb4654446449097da3ad8f" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/quantization/compressed_tensors.py", - "blob_id": "c3ba30bac87979a307fc5061a46f5d2cbf0efbf9" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/quantization/cutlass.py", - "blob_id": "c378b846d0c59de183a321fcad4b403c47b3d750" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/quantization/marlin.py", - "blob_id": "44d5d28a2fb67af955c017af3cf1403feeecbd32" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/quantization/scalar_type.py", - "blob_id": "9d711b0debcd8aaa343818edc9d6bbca20587d0a" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/quantization/utils/__init__.py", - "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/quantization/utils/marlin_utils.py", - "blob_id": "b1c94c38858a5cd6f02eb134d1a94b99a2b15566" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/quantization/utils/marlin_utils_fp8.py", - "blob_id": "b269fa6a4cee316e8299ecc86c3e7594b336b499" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/quantization/utils/marlin_utils_test.py", - "blob_id": "7d4f5f3cfbb872bf7b32e0972d6143b43f354a5e" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/quantization/utils/marlin_utils_test_24.py", - "blob_id": "927fa9016ba25f381c09d768db0c468066193a76" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/quantization/utils/marlin_utils_test_qqq.py", - 
"blob_id": "cb58eb945836393c58c53f5c6d702d53861c33f9" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/quantization/utils/quant_utils.py", - "blob_id": "d97e03913fa5980e0be73b160088c8e4f5f49a52" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/quantization/__init__.py", - "blob_id": "c3ab3b032c29f7bbafd549915dbc677c45a33837" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/quantization/_ops.py", - "blob_id": "83eb9dbc2c91b6b305544ea9f84d41ef73f5ea01" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/quantization/_quantization_hbfrcozzte6aq.abi3.so", - "blob_id": "d8feec129f933b3eecb28862aec4167876fd5ca8" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/quantization/compressed_tensors.py", - "blob_id": "c3ba30bac87979a307fc5061a46f5d2cbf0efbf9" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/quantization/cutlass.py", - "blob_id": "c378b846d0c59de183a321fcad4b403c47b3d750" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/quantization/marlin.py", - "blob_id": "44d5d28a2fb67af955c017af3cf1403feeecbd32" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/quantization/scalar_type.py", - "blob_id": "9d711b0debcd8aaa343818edc9d6bbca20587d0a" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/quantization/utils/__init__.py", - "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/quantization/utils/marlin_utils.py", - "blob_id": "b1c94c38858a5cd6f02eb134d1a94b99a2b15566" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/quantization/utils/marlin_utils_fp8.py", - "blob_id": "b269fa6a4cee316e8299ecc86c3e7594b336b499" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/quantization/utils/marlin_utils_test.py", - "blob_id": "7d4f5f3cfbb872bf7b32e0972d6143b43f354a5e" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/quantization/utils/marlin_utils_test_24.py", - "blob_id": "927fa9016ba25f381c09d768db0c468066193a76" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/quantization/utils/marlin_utils_test_qqq.py", - "blob_id": "cb58eb945836393c58c53f5c6d702d53861c33f9" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/quantization/utils/quant_utils.py", - "blob_id": "d97e03913fa5980e0be73b160088c8e4f5f49a52" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/quantization/__init__.py", - "blob_id": "c3ab3b032c29f7bbafd549915dbc677c45a33837" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/quantization/_ops.py", - "blob_id": "3b54f0bfc1030429b27a48d4e574778ebe86f820" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/quantization/_quantization_womr3pvjbirhe.abi3.so", - "blob_id": "f799e8bfc915d482daef4890ee6f8d6b342e0da1" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/quantization/compressed_tensors.py", - "blob_id": "c3ba30bac87979a307fc5061a46f5d2cbf0efbf9" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/quantization/cutlass.py", - "blob_id": "c378b846d0c59de183a321fcad4b403c47b3d750" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/quantization/marlin.py", - "blob_id": "44d5d28a2fb67af955c017af3cf1403feeecbd32" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/quantization/scalar_type.py", - "blob_id": "9d711b0debcd8aaa343818edc9d6bbca20587d0a" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/quantization/utils/__init__.py", - "blob_id": 
"e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/quantization/utils/marlin_utils.py", - "blob_id": "b1c94c38858a5cd6f02eb134d1a94b99a2b15566" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/quantization/utils/marlin_utils_fp8.py", - "blob_id": "b269fa6a4cee316e8299ecc86c3e7594b336b499" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/quantization/utils/marlin_utils_test.py", - "blob_id": "7d4f5f3cfbb872bf7b32e0972d6143b43f354a5e" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/quantization/utils/marlin_utils_test_24.py", - "blob_id": "927fa9016ba25f381c09d768db0c468066193a76" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/quantization/utils/marlin_utils_test_qqq.py", - "blob_id": "cb58eb945836393c58c53f5c6d702d53861c33f9" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/quantization/utils/quant_utils.py", - "blob_id": "d97e03913fa5980e0be73b160088c8e4f5f49a52" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/quantization/__init__.py", - "blob_id": "c3ab3b032c29f7bbafd549915dbc677c45a33837" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/quantization/_ops.py", - "blob_id": "c83bf352ab1ca3283391b3dd8c209c1bf6a60eb1" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/quantization/_quantization_55qjuxe2uqrp6.abi3.so", - "blob_id": "4d1b45a1c98552ca7e4ba4c0e583e266e9f70060" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/quantization/compressed_tensors.py", - "blob_id": "c3ba30bac87979a307fc5061a46f5d2cbf0efbf9" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/quantization/cutlass.py", - "blob_id": "c378b846d0c59de183a321fcad4b403c47b3d750" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/quantization/marlin.py", - "blob_id": "44d5d28a2fb67af955c017af3cf1403feeecbd32" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/quantization/scalar_type.py", - "blob_id": "9d711b0debcd8aaa343818edc9d6bbca20587d0a" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/quantization/utils/__init__.py", - "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/quantization/utils/marlin_utils.py", - "blob_id": "b1c94c38858a5cd6f02eb134d1a94b99a2b15566" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/quantization/utils/marlin_utils_fp8.py", - "blob_id": "b269fa6a4cee316e8299ecc86c3e7594b336b499" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/quantization/utils/marlin_utils_test.py", - "blob_id": "7d4f5f3cfbb872bf7b32e0972d6143b43f354a5e" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/quantization/utils/marlin_utils_test_24.py", - "blob_id": "927fa9016ba25f381c09d768db0c468066193a76" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/quantization/utils/marlin_utils_test_qqq.py", - "blob_id": "cb58eb945836393c58c53f5c6d702d53861c33f9" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/quantization/utils/quant_utils.py", - "blob_id": "d97e03913fa5980e0be73b160088c8e4f5f49a52" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/quantization/__init__.py", - "blob_id": "c3ab3b032c29f7bbafd549915dbc677c45a33837" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/quantization/_ops.py", - "blob_id": "7552c079934b6f76bb1c221358e0ac2f1ca449be" - }, - { - "filename": 
"build/torch26-cxx98-cu126-x86_64-linux/quantization/_quantization_cyhrq7sx4uskw.abi3.so", - "blob_id": "0a4c95a688fcacd6a10c04e101e1deec5a03ffc9" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/quantization/compressed_tensors.py", - "blob_id": "c3ba30bac87979a307fc5061a46f5d2cbf0efbf9" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/quantization/cutlass.py", - "blob_id": "c378b846d0c59de183a321fcad4b403c47b3d750" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/quantization/marlin.py", - "blob_id": "44d5d28a2fb67af955c017af3cf1403feeecbd32" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/quantization/scalar_type.py", - "blob_id": "9d711b0debcd8aaa343818edc9d6bbca20587d0a" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/quantization/utils/__init__.py", - "blob_id": "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/quantization/utils/marlin_utils.py", - "blob_id": "b1c94c38858a5cd6f02eb134d1a94b99a2b15566" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/quantization/utils/marlin_utils_fp8.py", - "blob_id": "b269fa6a4cee316e8299ecc86c3e7594b336b499" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/quantization/utils/marlin_utils_test.py", - "blob_id": "7d4f5f3cfbb872bf7b32e0972d6143b43f354a5e" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/quantization/utils/marlin_utils_test_24.py", - "blob_id": "927fa9016ba25f381c09d768db0c468066193a76" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/quantization/utils/marlin_utils_test_qqq.py", - "blob_id": "cb58eb945836393c58c53f5c6d702d53861c33f9" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/quantization/utils/quant_utils.py", - "blob_id": "d97e03913fa5980e0be73b160088c8e4f5f49a52" - } - ] - }, - { - "repo_id": "kernels-community/quantization-eetq", - "sha": "a80ce846d6270ddddeee109523ed947f594f246b", - "files": [ - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization_eetq/__init__.py", - "blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization_eetq/_ops.py", - "blob_id": "9c191845fb7acbd7ea6bae36ce8c237b168557e1" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization_eetq/_quantization_eetq_v7rnpcck3kry4.abi3.so", - "blob_id": "9edc9126b9ec8ce4f47a8e6688a5f0329c905329" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization_eetq/custom_ops.py", - "blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization_eetq/__init__.py", - "blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization_eetq/_ops.py", - "blob_id": "ccec58b06a2282da51356fe5d04dd1e2757ce80c" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization_eetq/_quantization_eetq_zcfiojfkx55be.abi3.so", - "blob_id": "ea27fb040515267ec631cec5545b878da680e7cc" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization_eetq/custom_ops.py", - "blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization_eetq/__init__.py", - "blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization_eetq/_ops.py", - "blob_id": "bb409419898138ffa9ade9ba505a167a067ea378" - }, - { - 
"filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization_eetq/_quantization_eetq_btymam4x7xvs6.abi3.so", - "blob_id": "0395dd048ccf10ed020a77fa04bcb026ba369d73" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization_eetq/custom_ops.py", - "blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization_eetq/__init__.py", - "blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization_eetq/_ops.py", - "blob_id": "f250a00832d2044f7bbb87557a1c878d9c8dd24d" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization_eetq/_quantization_eetq_yy3p6bsf622sq.abi3.so", - "blob_id": "c98d156835e442b039d38a82e9f111036750329c" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization_eetq/custom_ops.py", - "blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization_eetq/__init__.py", - "blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization_eetq/_ops.py", - "blob_id": "b5259247e8fb3ed9429cf005a525edc8bcae4903" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization_eetq/_quantization_eetq_imijtykkseqze.abi3.so", - "blob_id": "c46908ce00d02376ae8e18efebb7fee55afbc3ac" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization_eetq/custom_ops.py", - "blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization_eetq/__init__.py", - "blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization_eetq/_ops.py", - "blob_id": "79f8d42700ad34b9b46e6e328f90885d1ee9beab" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization_eetq/_quantization_eetq_4qerj3t7ddiry.abi3.so", - "blob_id": "9ba519d2fd4e347b784c21f4c171cbbab57c7774" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization_eetq/custom_ops.py", - "blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/quantization_eetq/__init__.py", - "blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/quantization_eetq/_ops.py", - "blob_id": "805ec785b7f5196f78dfe77b6cd7c2603c02490e" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/quantization_eetq/_quantization_eetq_j23ltbqvrnixg.abi3.so", - "blob_id": "77d53c16e57c658e8f9caa37b0084c4a3a7ffda1" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/quantization_eetq/custom_ops.py", - "blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/quantization_eetq/__init__.py", - "blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/quantization_eetq/_ops.py", - "blob_id": "7b590a5a6ede67e0ae13f97dbd7a82a4674e1b23" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/quantization_eetq/_quantization_eetq_p5neqtnhdgxv2.abi3.so", - "blob_id": "e3e5fbd8ce3232b6e9a7c3077eab9665b95bef49" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/quantization_eetq/custom_ops.py", - "blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88" - }, - { - "filename": 
"build/torch26-cxx11-cu126-x86_64-linux/quantization_eetq/__init__.py", - "blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/quantization_eetq/_ops.py", - "blob_id": "0be7ffcb2e9590899683a197b977ec0b39ca7cb7" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/quantization_eetq/_quantization_eetq_idk3dezy35dfk.abi3.so", - "blob_id": "61aa67cbe7ce810bf9792e6e8f19219c757ff181" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/quantization_eetq/custom_ops.py", - "blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/quantization_eetq/__init__.py", - "blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/quantization_eetq/_ops.py", - "blob_id": "998eba3eddd0520769a2b4ecb3402c024bde44ea" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/quantization_eetq/_quantization_eetq_fpjoxzd7nm2qa.abi3.so", - "blob_id": "31d835db1d0348e3f35c23e6a8f2532fd7e9fea7" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/quantization_eetq/custom_ops.py", - "blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/quantization_eetq/__init__.py", - "blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/quantization_eetq/_ops.py", - "blob_id": "6d5320b05b03f2f3ddfd299d6e2a72aa6116264f" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/quantization_eetq/_quantization_eetq_k7mlunxe2ye4s.abi3.so", - "blob_id": "1946e4c2fab63243d051012cb12e19895828145f" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/quantization_eetq/custom_ops.py", - "blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/quantization_eetq/__init__.py", - "blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/quantization_eetq/_ops.py", - "blob_id": "9b15d85f44e4223ce1f16df987feafd6640dcc62" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/quantization_eetq/_quantization_eetq_7m7hz3sbwkaio.abi3.so", - "blob_id": "eb1536ccd1dfa2655ea7de4445aa3c6790f3a0ae" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/quantization_eetq/custom_ops.py", - "blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88" - } - ] - }, - { - "repo_id": "kernels-community/rotary", - "sha": "4db658e027ec752840bb3f557ee076413b8db03f", - "files": [ - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/rotary/__init__.py", - "blob_id": "eba8039e210c8b710c5c663ef4e7930757f271be" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/rotary/_ops.py", - "blob_id": "4fe035c87ea1300ffedcfce17338167dd946e0e8" - }, - { - "filename": "build/torch25-cxx11-cu118-x86_64-linux/rotary/_rotary_5yzc45v7kk3yu.abi3.so", - "blob_id": "f315754ccb3e8b9dfb4d8954aefaac61b2a4e8bc" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/rotary/__init__.py", - "blob_id": "eba8039e210c8b710c5c663ef4e7930757f271be" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/rotary/_ops.py", - "blob_id": "d45359065a7cdc43e2d512d38fc0bfcd88138835" - }, - { - "filename": "build/torch25-cxx11-cu121-x86_64-linux/rotary/_rotary_tbiepw2a2ep3e.abi3.so", - "blob_id": "9bc986ca760b6a57d05e891c3def1769341a2c29" - }, - { - "filename": 
"build/torch25-cxx11-cu124-x86_64-linux/rotary/__init__.py", - "blob_id": "eba8039e210c8b710c5c663ef4e7930757f271be" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/rotary/_ops.py", - "blob_id": "7421978682125139f1169f3f71789e0cb44d3b45" - }, - { - "filename": "build/torch25-cxx11-cu124-x86_64-linux/rotary/_rotary_6w5syhrhmerj6.abi3.so", - "blob_id": "35caf7d755000ca2afac33cc7d4f344112751f9f" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/rotary/__init__.py", - "blob_id": "eba8039e210c8b710c5c663ef4e7930757f271be" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/rotary/_ops.py", - "blob_id": "d6f569b471eae628e738b3504c8a9a18b4973d97" - }, - { - "filename": "build/torch25-cxx98-cu118-x86_64-linux/rotary/_rotary_joujmbgvsytzg.abi3.so", - "blob_id": "30c4aaa83f2549f7363631a51b3341cdf0612f15" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/rotary/__init__.py", - "blob_id": "eba8039e210c8b710c5c663ef4e7930757f271be" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/rotary/_ops.py", - "blob_id": "f1f71f34bf0f3c5dffb7c147b48f6396f8054310" - }, - { - "filename": "build/torch25-cxx98-cu121-x86_64-linux/rotary/_rotary_mi2o7e7sishyw.abi3.so", - "blob_id": "e022e9c1101bdb89d43913979107f7a56717ea6d" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/rotary/__init__.py", - "blob_id": "eba8039e210c8b710c5c663ef4e7930757f271be" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/rotary/_ops.py", - "blob_id": "a46c19bd5adfb85d5b7795b3b9277e416f31d8ce" - }, - { - "filename": "build/torch25-cxx98-cu124-x86_64-linux/rotary/_rotary_rngiohfhfwuge.abi3.so", - "blob_id": "1621cba0150465f67aa931fe3a55e38928b48bcb" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/rotary/__init__.py", - "blob_id": "eba8039e210c8b710c5c663ef4e7930757f271be" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/rotary/_ops.py", - "blob_id": "3296d23431d1ec084e8644ff5d3d203a74d82ea1" - }, - { - "filename": "build/torch26-cxx11-cu118-x86_64-linux/rotary/_rotary_alv7mzltcxxpq.abi3.so", - "blob_id": "2f8b3b93bb7c8fae22c8e08c67771683f549f170" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/rotary/__init__.py", - "blob_id": "eba8039e210c8b710c5c663ef4e7930757f271be" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/rotary/_ops.py", - "blob_id": "0bae33b64c71d6a6ad748be66a410a662bb5b28a" - }, - { - "filename": "build/torch26-cxx11-cu124-x86_64-linux/rotary/_rotary_c4eyapeep6gty.abi3.so", - "blob_id": "3a15076dbd1f9a05f1089cc2cb13c03e5838f2b9" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/rotary/__init__.py", - "blob_id": "eba8039e210c8b710c5c663ef4e7930757f271be" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/rotary/_ops.py", - "blob_id": "5c5d7e4497962e0a3e9531ec6a5fdb18e995e0f8" - }, - { - "filename": "build/torch26-cxx11-cu126-x86_64-linux/rotary/_rotary_lodp6xeztste6.abi3.so", - "blob_id": "efc4a4a0001fba7b0743d1d4a9774d1fc9089ee5" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/rotary/__init__.py", - "blob_id": "eba8039e210c8b710c5c663ef4e7930757f271be" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/rotary/_ops.py", - "blob_id": "3cdda1ceda90e76b30b08cae6ad718aa2c2ec3ef" - }, - { - "filename": "build/torch26-cxx98-cu118-x86_64-linux/rotary/_rotary_z27mls7mz4e7m.abi3.so", - "blob_id": "43b0f08ea035cd2fadb6e119802e7d841a523246" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/rotary/__init__.py", - "blob_id": 
"eba8039e210c8b710c5c663ef4e7930757f271be" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/rotary/_ops.py", - "blob_id": "914f1bb6f9499d0245c3f47345f3b95582be28b2" - }, - { - "filename": "build/torch26-cxx98-cu124-x86_64-linux/rotary/_rotary_3bktke4p3hz3a.abi3.so", - "blob_id": "29c0bba399e6f348ad8baf93daa5166a6ec6994a" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/rotary/__init__.py", - "blob_id": "eba8039e210c8b710c5c663ef4e7930757f271be" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/rotary/_ops.py", - "blob_id": "11056cd0ed09530830b614b8207cbb7fa7ef3288" - }, - { - "filename": "build/torch26-cxx98-cu126-x86_64-linux/rotary/_rotary_fvednlzeqgg5s.abi3.so", - "blob_id": "8c7c34d9e603640576ba522dcbed341c0d780a9c" - } - ] - } -] diff --git a/server/kernels.lock b/server/kernels.lock new file mode 100644 index 000000000..9e11de684 --- /dev/null +++ b/server/kernels.lock @@ -0,0 +1,272 @@ +[ + { + "repo_id": "kernels-community/paged-attention", + "sha": "331b7e63a6b592799c8bc992f681bb1ee2c865a2", + "variants": { + "torch25-cxx11-cu118-x86_64-linux": { + "hash": "sha256-8e0aa39abab82f1d21b661d35e0470a24c3ebbdda38532ded805c18037a1ad1e", + "hash_type": "git_lfs_concat" + }, + "torch25-cxx11-cu121-x86_64-linux": { + "hash": "sha256-b0c3aef6c4c9aac627975cb1a2bfc46a70390763c8165575b89d1651d007c38a", + "hash_type": "git_lfs_concat" + }, + "torch25-cxx11-cu124-x86_64-linux": { + "hash": "sha256-960fbc8998439d779adb47fb2a37cce68c7dc075d8a49893bd487be9ca2d1389", + "hash_type": "git_lfs_concat" + }, + "torch25-cxx98-cu118-x86_64-linux": { + "hash": "sha256-9d6d60c411c55aa2f9d7c681c2be96f4262d56c96f592f3d4fb35ce4f4f1e18e", + "hash_type": "git_lfs_concat" + }, + "torch25-cxx98-cu121-x86_64-linux": { + "hash": "sha256-98c0a305b2cc9b7be757fab923d9aa406c686dcd0460e462926f87d051ef3d19", + "hash_type": "git_lfs_concat" + }, + "torch25-cxx98-cu124-x86_64-linux": { + "hash": "sha256-71e586416213c96ffbdeae0d077ba97bfde5b00005f2746d4cba2320cb53bf87", + "hash_type": "git_lfs_concat" + }, + "torch26-cxx11-cu118-x86_64-linux": { + "hash": "sha256-2f559312c54d558b33a4082ffc3fcf923f51da40ced19bfc8920e998ba2b71bf", + "hash_type": "git_lfs_concat" + }, + "torch26-cxx11-cu124-x86_64-linux": { + "hash": "sha256-6033b41a0f8a9509887c6171f0b42d9aa738490903b3fd5ea2c52703c5fb8fc3", + "hash_type": "git_lfs_concat" + }, + "torch26-cxx11-cu126-x86_64-linux": { + "hash": "sha256-3139f66a53f2bf0c314b4d309893095746bdc9c3914c904fc31adfdf553ed219", + "hash_type": "git_lfs_concat" + }, + "torch26-cxx98-cu118-x86_64-linux": { + "hash": "sha256-2173d77e384d8e2881fc38603992c09e8be7bcd9da4cafdd4f2a5ce0ce22caf4", + "hash_type": "git_lfs_concat" + }, + "torch26-cxx98-cu124-x86_64-linux": { + "hash": "sha256-7b1aaef81e01ecce83e03c50872910680ff2953f7c6ffd3ff15e8d9497ca9239", + "hash_type": "git_lfs_concat" + }, + "torch26-cxx98-cu126-x86_64-linux": { + "hash": "sha256-818b160a88b12b8e871099e40f76aa436ee828e2e060ecc35502dbe34a6ebd3b", + "hash_type": "git_lfs_concat" + } + } + }, + { + "repo_id": "kernels-community/moe", + "sha": "605a216f507b9a97b543140dee8937a4622069a8", + "variants": { + "torch25-cxx11-cu118-x86_64-linux": { + "hash": "sha256-855d92f02be3bfba0758161fa1266159d76c172e7c5d43d30816d22cfba76074", + "hash_type": "git_lfs_concat" + }, + "torch25-cxx11-cu121-x86_64-linux": { + "hash": "sha256-e6e780230477bbbc26fc40cc7fcff50298155998af4fc77a026c9f815ec984b1", + "hash_type": "git_lfs_concat" + }, + "torch25-cxx11-cu124-x86_64-linux": { + "hash": 
"sha256-52c1fb337033c4d1d7a279c5cb28aebbc7389976f21dc5803aeb16b2f7aeb94c", + "hash_type": "git_lfs_concat" + }, + "torch25-cxx98-cu118-x86_64-linux": { + "hash": "sha256-1fb654e8d02dda2a2382d1fb3a3ca9738d292eea674b30b80030cdcdfb6a0035", + "hash_type": "git_lfs_concat" + }, + "torch25-cxx98-cu121-x86_64-linux": { + "hash": "sha256-0cf235f1de85d4ce7490c79aa64220f608f886f313b676d91c331a6a2fd67bbb", + "hash_type": "git_lfs_concat" + }, + "torch25-cxx98-cu124-x86_64-linux": { + "hash": "sha256-3def11fee9bf1ea9b1579206fd5f5ecbcaad47ac478e2c3aa7b2c9c7fd5db934", + "hash_type": "git_lfs_concat" + }, + "torch26-cxx11-cu118-x86_64-linux": { + "hash": "sha256-3a49ee03f675190a79c7c74a45cc403d491eceb63a943f47d52064a11ca6ef6f", + "hash_type": "git_lfs_concat" + }, + "torch26-cxx11-cu124-x86_64-linux": { + "hash": "sha256-dbf20cb11db7d53e11147ab13641eefaa235f9ac2fde1beaf8f56f850c11bd54", + "hash_type": "git_lfs_concat" + }, + "torch26-cxx11-cu126-x86_64-linux": { + "hash": "sha256-8a07232ab316e8eab74747662cb7b86aac03f44ff158f275768fd59390df2525", + "hash_type": "git_lfs_concat" + }, + "torch26-cxx98-cu118-x86_64-linux": { + "hash": "sha256-cdd46301af997eeace5e016d8590969981b3a3f8647828d04baa5fa10c696746", + "hash_type": "git_lfs_concat" + }, + "torch26-cxx98-cu124-x86_64-linux": { + "hash": "sha256-c865188e9d2c17f3358f3d343fb40340232457572744bf85efd6b20af545d5f3", + "hash_type": "git_lfs_concat" + }, + "torch26-cxx98-cu126-x86_64-linux": { + "hash": "sha256-2a8b09f3272ea80491e78a39ff886680471d99f7ba571581809adfe918013898", + "hash_type": "git_lfs_concat" + } + } + }, + { + "repo_id": "kernels-community/quantization", + "sha": "95272c71ca71b1ddbacb0105dab54e5d5240bd5c", + "variants": { + "torch25-cxx11-cu118-x86_64-linux": { + "hash": "sha256-2d0a274cf0117bf7880d6040adafa1b70fe8bff3a00ef2834ed5435a6b525a49", + "hash_type": "git_lfs_concat" + }, + "torch25-cxx11-cu121-x86_64-linux": { + "hash": "sha256-116458beac63ea5eeb1e7fba7edc68d160cd8ac28f55b926d79035551aac7d5f", + "hash_type": "git_lfs_concat" + }, + "torch25-cxx11-cu124-x86_64-linux": { + "hash": "sha256-cace644c6fb04470384796c18987135cb051dfb90a14e902c51a3786fc07c599", + "hash_type": "git_lfs_concat" + }, + "torch25-cxx98-cu118-x86_64-linux": { + "hash": "sha256-104c6961cd3e1a74efdf14ea2172acc6647846852fccafe3698a27a6cf37941d", + "hash_type": "git_lfs_concat" + }, + "torch25-cxx98-cu121-x86_64-linux": { + "hash": "sha256-cdc95b41aa91a803f11f8cd53001895c2b69550b5af2fb278d6f124381229d0b", + "hash_type": "git_lfs_concat" + }, + "torch25-cxx98-cu124-x86_64-linux": { + "hash": "sha256-d5388469cb6074f196f20b1e1e4805bb3c967a8147b31ca2c0461aa87b50604e", + "hash_type": "git_lfs_concat" + }, + "torch26-cxx11-cu118-x86_64-linux": { + "hash": "sha256-70c4bb3792c4c3207d4963173d8d0ef3b2bda677151aef140662dd87bfa1b69f", + "hash_type": "git_lfs_concat" + }, + "torch26-cxx11-cu124-x86_64-linux": { + "hash": "sha256-bcacbb2232f49345f27e07fa821b48a7e3df643c01af37281fcafc74c471f682", + "hash_type": "git_lfs_concat" + }, + "torch26-cxx11-cu126-x86_64-linux": { + "hash": "sha256-344d20964f7eb133e5ec6fda976fa5ee62807b739a4361f236aca5ae53beb9ac", + "hash_type": "git_lfs_concat" + }, + "torch26-cxx98-cu118-x86_64-linux": { + "hash": "sha256-dfaec226550254fbce1a5c7e2f547e85700958a1a4087e1c873d22e6f71a5ceb", + "hash_type": "git_lfs_concat" + }, + "torch26-cxx98-cu124-x86_64-linux": { + "hash": "sha256-0abe6460d0a2202b0086e3663092595e5b93b9a9cbb85c10034180cc9bfebc6e", + "hash_type": "git_lfs_concat" + }, + "torch26-cxx98-cu126-x86_64-linux": { + "hash": 
"sha256-68e156f94c3c0c9523773b62eaeced93766e0d9ee67d8191fb9570fb5af30d5b", + "hash_type": "git_lfs_concat" + } + } + }, + { + "repo_id": "kernels-community/quantization-eetq", + "sha": "a80ce846d6270ddddeee109523ed947f594f246b", + "variants": { + "torch25-cxx11-cu118-x86_64-linux": { + "hash": "sha256-e06beb00799b1e656583eb0496f09fc0bf1b26f75e9864a2fe19ebd5b62c3671", + "hash_type": "git_lfs_concat" + }, + "torch25-cxx11-cu121-x86_64-linux": { + "hash": "sha256-c128d3ef6558cfedf045c4a713891792708851b7f6f027de835d9083cb3b297d", + "hash_type": "git_lfs_concat" + }, + "torch25-cxx11-cu124-x86_64-linux": { + "hash": "sha256-c7e2e14fc114788634b34a4f670f7bf4d27321e5ed40ff446f5a25eef70222c7", + "hash_type": "git_lfs_concat" + }, + "torch25-cxx98-cu118-x86_64-linux": { + "hash": "sha256-58dad53cfbf1315af464f9d8ba7be9012089c839d4f06a8d2cf8ce0deaf5949a", + "hash_type": "git_lfs_concat" + }, + "torch25-cxx98-cu121-x86_64-linux": { + "hash": "sha256-6519af49c0f689744a7b49497ad2bea1524b69e4095446087d7ab622b898aa30", + "hash_type": "git_lfs_concat" + }, + "torch25-cxx98-cu124-x86_64-linux": { + "hash": "sha256-94e0731b58a9ba0e5e2f37b100c8d987c80b5d349008ef625917d020b6c52d25", + "hash_type": "git_lfs_concat" + }, + "torch26-cxx11-cu118-x86_64-linux": { + "hash": "sha256-e5b04475538f49d7b4ffded080e4c9c86a658abc12667e3838ebcc410ab1eef4", + "hash_type": "git_lfs_concat" + }, + "torch26-cxx11-cu124-x86_64-linux": { + "hash": "sha256-783c02db737a6ec9958b3090f164b87888d3b26e30a4fb6e1cd0c1a635753fab", + "hash_type": "git_lfs_concat" + }, + "torch26-cxx11-cu126-x86_64-linux": { + "hash": "sha256-a3d81f82f9cfe9d8a6d46758758b3a1b3055d902f41917b4ef2976373db843d6", + "hash_type": "git_lfs_concat" + }, + "torch26-cxx98-cu118-x86_64-linux": { + "hash": "sha256-f1de67e17944a9816f778c72ae73bbbc90d795cb4885c2f9ee5e0b9a3c57583b", + "hash_type": "git_lfs_concat" + }, + "torch26-cxx98-cu124-x86_64-linux": { + "hash": "sha256-789b50d767a5121a7e5a52eaf0c8e897bf1787f049ca08faffb220e5053a5f10", + "hash_type": "git_lfs_concat" + }, + "torch26-cxx98-cu126-x86_64-linux": { + "hash": "sha256-7c7fe57fea7b9be253085d506f01b2487b2306f22bdffe1de44397fc9f8a3613", + "hash_type": "git_lfs_concat" + } + } + }, + { + "repo_id": "kernels-community/rotary", + "sha": "4db658e027ec752840bb3f557ee076413b8db03f", + "variants": { + "torch25-cxx11-cu118-x86_64-linux": { + "hash": "sha256-907df2035267a65793985bb7f69fb2a975955fb08c2bbc78c58def43d02801da", + "hash_type": "git_lfs_concat" + }, + "torch25-cxx11-cu121-x86_64-linux": { + "hash": "sha256-b614735ae61ee2c1825a3c823fa0cdd3aa07d0bb3f4106001b9e1a557c0ca9b9", + "hash_type": "git_lfs_concat" + }, + "torch25-cxx11-cu124-x86_64-linux": { + "hash": "sha256-f2e98ec72faaebc1cae25f83ccdbb151868b6902fb5a0623e09d700a514c2a7e", + "hash_type": "git_lfs_concat" + }, + "torch25-cxx98-cu118-x86_64-linux": { + "hash": "sha256-421214c5a576fac2e0b7998395dccd7f66010f65a6fc647ce06b106ea91105d2", + "hash_type": "git_lfs_concat" + }, + "torch25-cxx98-cu121-x86_64-linux": { + "hash": "sha256-9d1c464cf7f391975afa48f2254a639f41582155ad1b50c25bb122418ce8db58", + "hash_type": "git_lfs_concat" + }, + "torch25-cxx98-cu124-x86_64-linux": { + "hash": "sha256-82f8012d78304efaa7318f106907630294d10c8b5c9f56923c71df0b03e09f14", + "hash_type": "git_lfs_concat" + }, + "torch26-cxx11-cu118-x86_64-linux": { + "hash": "sha256-a3247919dcc392efc7e54725dfbce9ee8a796fe4ee53d113048b313de074d3da", + "hash_type": "git_lfs_concat" + }, + "torch26-cxx11-cu124-x86_64-linux": { + "hash": 
"sha256-a21c9734d15946f4cc967d0555d45d7effc6624990c6889fc49162af744fbbe9", + "hash_type": "git_lfs_concat" + }, + "torch26-cxx11-cu126-x86_64-linux": { + "hash": "sha256-01cdda160425b29db0d9bb084874ade4ac081735f9717f272aaefe5bcb379ae1", + "hash_type": "git_lfs_concat" + }, + "torch26-cxx98-cu118-x86_64-linux": { + "hash": "sha256-17be5b770418ad47101c49d8945b5aa32af9eb5a840bdffb0514d0e264edd860", + "hash_type": "git_lfs_concat" + }, + "torch26-cxx98-cu124-x86_64-linux": { + "hash": "sha256-3cd4b9f63cc903e01325b7e5b204e40fc6600c0685f2e19e6f1fa604a599d82d", + "hash_type": "git_lfs_concat" + }, + "torch26-cxx98-cu126-x86_64-linux": { + "hash": "sha256-c569f4a4f9b64792507c58d7cfa31dde1285b52125ef07cc98d9f23636af09ca", + "hash_type": "git_lfs_concat" + } + } + } +] diff --git a/server/pyproject.toml b/server/pyproject.toml index 07ea1048f..e3ec734a5 100644 --- a/server/pyproject.toml +++ b/server/pyproject.toml @@ -14,7 +14,7 @@ dependencies = [ "grpcio>=1.67.0", "grpcio-reflection>=1.67.0", "grpcio-status>=1.67.0", - "hf-kernels>=0.1.5", + "kernels>=0.2.1", "hf-transfer>=0.1.8", "loguru>=0.7.3", "numpy>=1.26,<3", @@ -36,7 +36,7 @@ dependencies = [ ] [build-system] -requires = ["hf-kernels>=0.1.2", "setuptools"] +requires = ["kernels>=0.1.7", "setuptools"] build-backend = "setuptools.build_meta" [tool.kernels.dependencies] diff --git a/server/text_generation_server/utils/kernels.py b/server/text_generation_server/utils/kernels.py index 42745c716..3943d9a5a 100644 --- a/server/text_generation_server/utils/kernels.py +++ b/server/text_generation_server/utils/kernels.py @@ -1,7 +1,7 @@ import importlib from loguru import logger -from hf_kernels import load_kernel as hf_load_kernel +from kernels import load_kernel as hf_load_kernel from text_generation_server.utils.log import log_once diff --git a/server/uv.lock b/server/uv.lock index a2293a5c2..d4bbb9555 100644 --- a/server/uv.lock +++ b/server/uv.lock @@ -1,4 +1,5 @@ version = 1 +revision = 1 requires-python = ">=3.9" resolution-markers = [ "python_full_version >= '3.12'", @@ -674,18 +675,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/be/66/7c1a552545a9597fbd33d77c817f1f0cc56736ca64aa0821948f945118d6/grpcio_tools-1.70.0-cp39-cp39-win_amd64.whl", hash = "sha256:840ec536ab933db2ef8d5acaa6b712d0e9e8f397f62907c852ec50a3f69cdb78", size = 1119339 }, ] -[[package]] -name = "hf-kernels" -version = "0.1.6" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "huggingface-hub" }, - { name = "packaging" }, - { name = "tomli", marker = "python_full_version < '3.11'" }, - { name = "torch" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/01/fe/5aa3ea1b66bcc7d81aff19683ea04d4a9cd414c8d4ff05b150fc1f196ccd/hf_kernels-0.1.6.tar.gz", hash = "sha256:5effee5046552ce226ff86d3870a799f4ecae399bcb2beb4046c28c2dd736d2f", size = 8704 } - [[package]] name = "hf-transfer" version = "0.1.9" @@ -814,6 +803,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d1/0f/8910b19ac0670a0f80ce1008e5e751c4a57e14d2c4c13a482aa6079fa9d6/jsonschema_specifications-2024.10.1-py3-none-any.whl", hash = "sha256:a09a0680616357d9a0ecf05c12ad234479f549239d0f5b55f3deea67475da9bf", size = 18459 }, ] +[[package]] +name = "kernels" +version = "0.2.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "huggingface-hub" }, + { name = "packaging" }, + { name = "tomli", marker = "python_full_version < '3.11'" }, + { name = "torch" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/26/99/41af9dce502bb1682977fee1bc487a73fa8418cebbce16b8d27733947375/kernels-0.2.1.tar.gz", hash = "sha256:918942332819b28377b9d07070daddecfd8a5e7bab574dd3dc64a209ca6008b2", size = 9395 } + [[package]] name = "lark" version = "1.2.2" @@ -2561,9 +2562,9 @@ dependencies = [ { name = "grpcio" }, { name = "grpcio-reflection" }, { name = "grpcio-status" }, - { name = "hf-kernels" }, { name = "hf-transfer" }, { name = "huggingface-hub" }, + { name = "kernels" }, { name = "loguru" }, { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, { name = "numpy", version = "2.2.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, @@ -2626,9 +2627,9 @@ requires-dist = [ { name = "grpcio-status", specifier = ">=1.67.0" }, { name = "grpcio-tools", marker = "extra == 'dev'", specifier = ">=1.51.1,<2.0" }, { name = "grpcio-tools", marker = "extra == 'gen'", specifier = ">=1.69.0" }, - { name = "hf-kernels", specifier = ">=0.1.5" }, { name = "hf-transfer", specifier = ">=0.1.8" }, { name = "huggingface-hub", specifier = ">=0.29.0" }, + { name = "kernels", specifier = ">=0.2.1" }, { name = "loguru", specifier = ">=0.7.3" }, { name = "mypy-protobuf", marker = "extra == 'gen'", specifier = ">=3.6.0" }, { name = "numpy", specifier = ">=1.26,<3" }, @@ -2651,6 +2652,7 @@ requires-dist = [ { name = "transformers", specifier = ">=4.49.0" }, { name = "typer", specifier = ">=0.15.1" }, ] +provides-extras = ["accelerate", "bnb", "compressed-tensors", "peft", "outlines", "dev", "quantize", "gen"] [[package]] name = "texttable" From 83b7b7bb9204e93f87784a14330ca4494e0e55c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Thu, 13 Mar 2025 10:41:33 +0100 Subject: [PATCH 147/183] Router: add `gemma3-text` model type (#3107) --- router/src/config.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/router/src/config.rs b/router/src/config.rs index 244bbb1f5..4460eb00e 100644 --- a/router/src/config.rs +++ b/router/src/config.rs @@ -263,6 +263,7 @@ pub enum Config { Gemma, Gemma2, Gemma3(Gemma3), + Gemma3Text, Cohere, Drbx, Falcon, From 83ef36417775ec779e96c04a469d77434173c5a3 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Thu, 13 Mar 2025 10:45:47 +0100 Subject: [PATCH 148/183] We need gcc during runtime to enable triton to compile kernels. (#3103) * We need gcc during runtime to enable triton to compile kernels. * Fixing the docker build. --- nix/docker.nix | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/nix/docker.nix b/nix/docker.nix index c4b1d8994..e89e5b99a 100644 --- a/nix/docker.nix +++ b/nix/docker.nix @@ -1,12 +1,18 @@ { + stdenv, dockerTools, cacert, text-generation-inference, + runCommand, stream ? 
false, }: let build = if stream then dockerTools.streamLayeredImage else dockerTools.buildLayeredImage; + tmp = runCommand "tmp" { } '' + mkdir $out + mkdir -m 1777 $out/tmp + ''; in build { name = "tgi-docker"; @@ -19,5 +25,9 @@ build { ]; }; - contents = [ cacert ]; + contents = [ + cacert + stdenv.cc + tmp + ]; } From 27ed8486768fc0ebd72a36a3a47183980e209135 Mon Sep 17 00:00:00 2001 From: Baptiste Colle <32412211+baptistecolle@users.noreply.github.com> Date: Thu, 13 Mar 2025 10:56:01 +0100 Subject: [PATCH 149/183] Release of Gaudi Backend for TGI (#3091) * feat(gaudi): release ready (docs, docker image and vlm ready) * fix(gaudi): add default argument for the dockerfile * fix(gaudi): remove use of latest for gaudi docker image + redid gaudi benchmarking section to include best practices --- .github/workflows/build.yaml | 9 + .github/workflows/ci_build.yaml | 3 +- .gitignore | 1 + Dockerfile_gaudi | 4 +- Makefile | 3 + .../docker_commands/docker_commands.md | 283 ++++++++++++++++ .../models/vlm_causal_lm.py | 25 +- docs/source/_toctree.yml | 2 + docs/source/backends/gaudi.mdx | 315 ++++++++++++++++++ docs/source/installation_gaudi.md | 2 +- 10 files changed, 635 insertions(+), 12 deletions(-) create mode 100644 backends/gaudi/examples/docker_commands/docker_commands.md create mode 100644 docs/source/backends/gaudi.mdx diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index e849f7c09..99f29d7eb 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -124,6 +124,15 @@ jobs: export extra_pytest="--neuron" export target="" ;; + gaudi) + export dockerfile="Dockerfile_gaudi" + export label_extension="-gaudi" + export docker_volume="/mnt/cache" + export docker_devices="" + export runs_on="ubuntu-latest" + export platform="" + export extra_pytest="" + export target="" esac echo $dockerfile echo "Dockerfile=${dockerfile}" diff --git a/.github/workflows/ci_build.yaml b/.github/workflows/ci_build.yaml index 752c6dddc..f0d39399b 100644 --- a/.github/workflows/ci_build.yaml +++ b/.github/workflows/ci_build.yaml @@ -21,6 +21,7 @@ on: - "Dockerfile_amd" - "Dockerfile_intel" - "Dockerfile.neuron" + - "Dockerfile_gaudi" branches: - "main" workflow_dispatch: @@ -38,7 +39,7 @@ jobs: # fail-fast is true by default fail-fast: false matrix: - hardware: ["cuda", "cuda-trtllm", "rocm", "intel-xpu", "intel-cpu", "neuron"] + hardware: ["cuda", "cuda-trtllm", "rocm", "intel-xpu", "intel-cpu", "neuron", "gaudi"] uses: ./.github/workflows/build.yaml # calls the one above ^ permissions: contents: write diff --git a/.gitignore b/.gitignore index 7d6c7564c..8a6bda722 100644 --- a/.gitignore +++ b/.gitignore @@ -28,3 +28,4 @@ server/fbgemmm hl-smi_log*.txt .graph_dumps out +hqt_output diff --git a/Dockerfile_gaudi b/Dockerfile_gaudi index 6d37c6ae5..c814e912e 100644 --- a/Dockerfile_gaudi +++ b/Dockerfile_gaudi @@ -1,6 +1,6 @@ # Those arguments are required to build the image -ARG HABANA_VERSION -ARG PYTORCH_VERSION +ARG HABANA_VERSION=1.19.0 +ARG PYTORCH_VERSION=2.5.1 # Rust builder FROM lukemathwalker/cargo-chef:latest-rust-1.85.0 AS chef diff --git a/Makefile b/Makefile index 3068a06f4..2ecdd45ca 100644 --- a/Makefile +++ b/Makefile @@ -53,3 +53,6 @@ run-falcon-7b-instruct-quantize: clean: rm -rf target aml + +preview_doc: + doc-builder preview text-generation-inference docs/source --not_python_module diff --git a/backends/gaudi/examples/docker_commands/docker_commands.md b/backends/gaudi/examples/docker_commands/docker_commands.md new file mode 100644 index 
000000000..597012892 --- /dev/null +++ b/backends/gaudi/examples/docker_commands/docker_commands.md @@ -0,0 +1,283 @@ +# Examples of Docker Commands for Gaudi Backend + +This page gives a list of examples of docker run commands for some of the most popular models. + +> **Note:** The parameters are chosen for Gaudi2 hardware to maximize performance on this given hardware, please adjust the parameters based on your hardware. For example, if you are using Gaudi3, you may want to increase the batch size. + +## Default Precision (BF16) + +### Llama3.1-8B on 1 card (BF16) + +```bash +model=meta-llama/Meta-Llama-3.1-8B-Instruct +hf_token=YOUR_ACCESS_TOKEN +volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run + +docker run -p 8080:80 \ + --runtime=habana \ + --cap-add=sys_nice \ + --ipc=host \ + -v $volume:/data \ + -e HF_TOKEN=$hf_token \ + -e MAX_TOTAL_TOKENS=2048 \ + -e PREFILL_BATCH_BUCKET_SIZE=2 \ + -e BATCH_BUCKET_SIZE=32 \ + -e PAD_SEQUENCE_TO_MULTIPLE_OF=256 \ + ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \ + --model-id $model \ + --max-input-tokens 1024 --max-total-tokens 2048 \ + --max-batch-prefill-tokens 2048 --max-batch-size 32 \ + --max-waiting-tokens 7 --waiting-served-ratio 1.2 --max-concurrent-requests 64 +``` + +### Llama3.1-70B 8 cards (BF16) + +```bash +model=meta-llama/Meta-Llama-3.1-70B-Instruct +hf_token=YOUR_ACCESS_TOKEN +volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run + +docker run -p 8080:80 \ + --runtime=habana \ + --cap-add=sys_nice \ + --ipc=host \ + -v $volume:/data \ + -e HF_TOKEN=$hf_token \ + -e MAX_TOTAL_TOKENS=2048 \ + -e BATCH_BUCKET_SIZE=256 \ + -e PREFILL_BATCH_BUCKET_SIZE=4 \ + -e PAD_SEQUENCE_TO_MULTIPLE_OF=64 \ + ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \ + --model-id $model \ + --sharded true --num-shard 8 \ + --max-input-tokens 1024 --max-total-tokens 2048 \ + --max-batch-prefill-tokens 4096 --max-batch-size 256 \ + --max-waiting-tokens 7 --waiting-served-ratio 1.2 --max-concurrent-requests 512 +``` + +### Llama2-7B on 1 Card (BF16) + +```bash +model=meta-llama/Llama-2-7b-chat-hf +hf_token=YOUR_ACCESS_TOKEN +volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run + +docker run -p 8080:80 \ + --runtime=habana \ + --cap-add=sys_nice \ + --ipc=host \ + -v $volume:/data \ + -e HF_TOKEN=$hf_token \ + -e MAX_TOTAL_TOKENS=2048 \ + -e PREFILL_BATCH_BUCKET_SIZE=2 \ + -e BATCH_BUCKET_SIZE=32 \ + -e PAD_SEQUENCE_TO_MULTIPLE_OF=256 \ + ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \ + --model-id $model \ + --max-input-tokens 1024 --max-total-tokens 2048 \ + --max-batch-prefill-tokens 2048 --max-batch-size 32 \ + --max-waiting-tokens 7 --waiting-served-ratio 1.2 --max-concurrent-requests 64 +``` + +### Llama2-70B on 8 cards (BF16) + +```bash +model=meta-llama/Llama-2-70b-chat-hf +hf_token=YOUR_ACCESS_TOKEN +volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run + +docker run -p 8080:80 \ + --runtime=habana \ + --cap-add=sys_nice \ + --ipc=host \ + -v $volume:/data \ + -e HF_TOKEN=$hf_token \ + -e MAX_TOTAL_TOKENS=2048 \ + -e BATCH_BUCKET_SIZE=256 \ + -e PREFILL_BATCH_BUCKET_SIZE=4 \ + -e PAD_SEQUENCE_TO_MULTIPLE_OF=64 \ + ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \ + --model-id $model \ + --sharded true --num-shard 8 \ + --max-input-tokens 1024 --max-total-tokens 2048 \ + --max-batch-prefill-tokens 4096 --max-batch-size 256 \ + 
--max-waiting-tokens 7 --waiting-served-ratio 1.2 --max-concurrent-requests 512 +``` + +### Llava-v1.6-Mistral-7B on 1 card (BF16) + +```bash +model=llava-hf/llava-v1.6-mistral-7b-hf +volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run + +docker run -p 8080:80 \ + --runtime=habana \ + --cap-add=sys_nice \ + --ipc=host \ + -v $volume:/data \ + -e PREFILL_BATCH_BUCKET_SIZE=1 \ + -e BATCH_BUCKET_SIZE=1 \ + ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \ + --model-id $model \ + --max-input-tokens 4096 --max-batch-prefill-tokens 16384 \ + --max-total-tokens 8192 --max-batch-size 4 +``` + +## FP8 Precision + +Please refer to the [FP8 Precision](https://huggingface.co/docs/text-generation-inference/backends/gaudi_new#how-to-use-different-precision-formats) section for more details. You need to measure the statistics of the model first before running the model in FP8 precision. + +## Llama3.1-8B on 1 Card (FP8) + +```bash +model=meta-llama/Meta-Llama-3.1-8B-Instruct +hf_token=YOUR_ACCESS_TOKEN +volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run + +docker run -p 8080:80 \ + --runtime=habana \ + --cap-add=sys_nice \ + --ipc=host \ + -v $volume:/data \ + -v $PWD/quantization_config:/usr/src/quantization_config \ + -v $PWD/hqt_output:/usr/src/hqt_output \ + -e QUANT_CONFIG=./quantization_config/maxabs_quant.json \ + -e HF_TOKEN=$hf_token \ + -e MAX_TOTAL_TOKENS=2048 \ + -e PREFILL_BATCH_BUCKET_SIZE=2 \ + -e BATCH_BUCKET_SIZE=32 \ + -e PAD_SEQUENCE_TO_MULTIPLE_OF=256 \ + ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \ + --model-id $model \ + --max-input-tokens 1024 --max-total-tokens 2048 \ + --max-batch-prefill-tokens 2048 --max-batch-size 32 \ + --max-waiting-tokens 7 --waiting-served-ratio 1.2 --max-concurrent-requests 64 +``` + +## Llama3.1-70B on 8 cards (FP8) + +```bash +model=meta-llama/Meta-Llama-3.1-70B-Instruct +hf_token=YOUR_ACCESS_TOKEN +volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run + +docker run -p 8080:80 \ + --runtime=habana \ + --cap-add=sys_nice \ + --ipc=host \ + -v $volume:/data \ + -v $PWD/quantization_config:/usr/src/quantization_config \ + -v $PWD/hqt_output:/usr/src/hqt_output \ + -e QUANT_CONFIG=./quantization_config/maxabs_quant.json \ + -e HF_TOKEN=$hf_token \ + -e MAX_TOTAL_TOKENS=2048 \ + -e BATCH_BUCKET_SIZE=256 \ + -e PREFILL_BATCH_BUCKET_SIZE=4 \ + -e PAD_SEQUENCE_TO_MULTIPLE_OF=64 \ + ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \ + --model-id $model \ + --sharded true --num-shard 8 \ + --max-input-tokens 1024 --max-total-tokens 2048 \ + --max-batch-prefill-tokens 4096 --max-batch-size 256 \ + --max-waiting-tokens 7 --waiting-served-ratio 1.2 --max-concurrent-requests 512 +``` + +## Llama2-7B on 1 Card (FP8) + +```bash +model=meta-llama/Llama-2-7b-chat-hf +hf_token=YOUR_ACCESS_TOKEN +volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run + +docker run -p 8080:80 \ + --runtime=habana \ + --cap-add=sys_nice \ + --ipc=host \ + -v $volume:/data \ + -v $PWD/quantization_config:/usr/src/quantization_config \ + -v $PWD/hqt_output:/usr/src/hqt_output \ + -e QUANT_CONFIG=./quantization_config/maxabs_quant.json \ + -e HF_TOKEN=$hf_token \ + -e MAX_TOTAL_TOKENS=2048 \ + -e PREFILL_BATCH_BUCKET_SIZE=2 \ + -e BATCH_BUCKET_SIZE=32 \ + -e PAD_SEQUENCE_TO_MULTIPLE_OF=256 \ + ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \ + --model-id $model \ + 
--max-input-tokens 1024 --max-total-tokens 2048 \ + --max-batch-prefill-tokens 2048 --max-batch-size 32 \ + --max-waiting-tokens 7 --waiting-served-ratio 1.2 --max-concurrent-requests 64 +``` + +## Llama2-70B on 8 Cards (FP8) + +```bash +model=meta-llama/Llama-2-70b-chat-hf +hf_token=YOUR_ACCESS_TOKEN +volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run + +docker run -p 8080:80 \ + --runtime=habana \ + --cap-add=sys_nice \ + --ipc=host \ + -v $volume:/data \ + -v $PWD/quantization_config:/usr/src/quantization_config \ + -v $PWD/hqt_output:/usr/src/hqt_output \ + -e QUANT_CONFIG=./quantization_config/maxabs_quant.json \ + -e HF_TOKEN=$hf_token \ + -e MAX_TOTAL_TOKENS=2048 \ + -e BATCH_BUCKET_SIZE=256 \ + -e PREFILL_BATCH_BUCKET_SIZE=4 \ + -e PAD_SEQUENCE_TO_MULTIPLE_OF=64 \ + ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \ + --model-id $model \ + --sharded true --num-shard 8 \ + --max-input-tokens 1024 --max-total-tokens 2048 \ + --max-batch-prefill-tokens 4096 --max-batch-size 256 \ + --max-waiting-tokens 7 --waiting-served-ratio 1.2 --max-concurrent-requests 512 +``` + +## Llava-v1.6-Mistral-7B on 1 Card (FP8) + +```bash +model=llava-hf/llava-v1.6-mistral-7b-hf +volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run + +docker run -p 8080:80 \ + --runtime=habana \ + --cap-add=sys_nice \ + --ipc=host \ + -v $volume:/data \ + -v $PWD/quantization_config:/usr/src/quantization_config \ + -v $PWD/hqt_output:/usr/src/hqt_output \ + -e QUANT_CONFIG=./quantization_config/maxabs_quant.json \ + -e PREFILL_BATCH_BUCKET_SIZE=1 \ + -e BATCH_BUCKET_SIZE=1 \ + ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \ + --model-id $model \ + --max-input-tokens 4096 --max-batch-prefill-tokens 16384 \ + --max-total-tokens 8192 --max-batch-size 4 +``` + +## Llava-v1.6-Mistral-7B on 8 Cards (FP8) + +```bash +model=llava-hf/llava-v1.6-mistral-7b-hf +volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run + +docker run -p 8080:80 \ + --runtime=habana \ + --cap-add=sys_nice \ + --ipc=host \ + -v $volume:/data \ + -v $PWD/quantization_config:/usr/src/quantization_config \ + -v $PWD/hqt_output:/usr/src/hqt_output \ + -e QUANT_CONFIG=./quantization_config/maxabs_quant.json \ + -e PREFILL_BATCH_BUCKET_SIZE=1 \ + -e BATCH_BUCKET_SIZE=1 \ + ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \ + --model-id $model \ + --sharded true --num-shard 8 \ + --max-input-tokens 4096 --max-batch-prefill-tokens 16384 \ + --max-total-tokens 8192 --max-batch-size 4 +``` diff --git a/backends/gaudi/server/text_generation_server/models/vlm_causal_lm.py b/backends/gaudi/server/text_generation_server/models/vlm_causal_lm.py index d4f4c1af5..cef761b41 100644 --- a/backends/gaudi/server/text_generation_server/models/vlm_causal_lm.py +++ b/backends/gaudi/server/text_generation_server/models/vlm_causal_lm.py @@ -68,10 +68,14 @@ IDEFICS2_IMAGE_TOKEN = "" IMAGES = re.compile(r"!\[[^\]]*\]\((.*?)\s*(\"(?:.*[^\"])\")?\s*\)") BASE_IMAGE_TOKENS = int(os.environ.get("BASE_IMAGE_TOKENS", 2048)) MAX_TOTAL_TOKENS = int(os.environ.get("MAX_TOTAL_TOKENS", 8192)) -MAX_BATCH_TOTAL_TOKENS = int(os.environ.get("MAX_BATCH_TOTAL_TOKENS", 131072)) PAD_SEQUENCE_TO_MULTIPLE_OF = int(os.environ.get("PAD_SEQUENCE_TO_MULTIPLE_OF", 256)) CHUNK_SIZES = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048] LAZY_MODE = int(os.environ.get("PT_HPU_LAZY_MODE", 1)) +MAX_BATCH_SIZE = ( + 
int(os.environ.get("MAX_BATCH_SIZE")) + if os.environ.get("MAX_BATCH_SIZE") is not None + else None +) PREFILL_WARMUP_BATCH_SIZE_LIST = [] PREFILL_WARMUP_SEQLEN_LIST = [] @@ -1197,7 +1201,9 @@ class VlmCausalLM(Model): return self.batch_from_pb(batch, is_warmup) - def warmup(self, request) -> None: + def warmup( + self, request: generate_pb2.WarmupRequest + ) -> Tuple[Optional[int], Optional[int], Optional[int]]: is_warmup = True batch = self.batch_from_pb(request.batch, is_warmup) @@ -1210,7 +1216,8 @@ class VlmCausalLM(Model): f"You need to decrease `--max-batch-prefill-tokens`" ) - global BASE_IMAGE_TOKENS, MAX_TOTAL_TOKENS, MAX_BATCH_TOTAL_TOKENS, PREFILL_WARMUP_BATCH_SIZE_LIST, PREFILL_WARMUP_SEQLEN_LIST, DECODE_WARMUP_BATCH_SIZE_LIST + global BASE_IMAGE_TOKENS, MAX_TOTAL_TOKENS, PREFILL_WARMUP_BATCH_SIZE_LIST, PREFILL_WARMUP_SEQLEN_LIST, DECODE_WARMUP_BATCH_SIZE_LIST + MAX_TOTAL_TOKENS = request.max_total_tokens max_input_length = batch.input_ids.shape[1] max_prefill_batch_size = batch.input_ids.shape[0] PREFILL_WARMUP_BATCH_SIZE_LIST = [] @@ -1264,7 +1271,7 @@ class VlmCausalLM(Model): f"Memory stats: {mem_stats} " ) - max_decode_batch_size = math.floor(MAX_BATCH_TOTAL_TOKENS / MAX_TOTAL_TOKENS) + max_decode_batch_size = MAX_BATCH_SIZE batch_size = max_prefill_batch_size * 2 # Decode warmup with bigger batch_size try: @@ -1310,14 +1317,12 @@ class VlmCausalLM(Model): batches.append(prefill_batch) _, decode_batch, _ = self.generate_token(batches, is_warmup) DECODE_WARMUP_BATCH_SIZE_LIST.append(max_decode_batch_size) - max_batch_total_tokens = max_decode_batch_size * MAX_TOTAL_TOKENS - MAX_BATCH_TOTAL_TOKENS = max_batch_total_tokens except Exception: raise RuntimeError( f"Not enough memory to handle batch_size({batch_size}) decode warmup." f"Decode batch size list:{DECODE_WARMUP_BATCH_SIZE_LIST}" f"max_decode_batch_size is {max_decode_batch_size}" - f"You need to decrease env `MAX_BATCH_TOTAL_TOKENS` or '--max_batch_total_tokens'" + f"You need to decrease env `MAX_BATCH_SIZE` or '--max_batch_size'" ) mem_stats = get_hpu_memory_stats(self.device) @@ -1327,4 +1332,8 @@ class VlmCausalLM(Model): f"Memory stats: {mem_stats}" ) - return MAX_BATCH_TOTAL_TOKENS + max_supported_total_tokens = MAX_BATCH_SIZE * MAX_TOTAL_TOKENS + max_input_tokens = max_input_length + max_total_tokens = MAX_TOTAL_TOKENS + + return max_supported_total_tokens, max_input_tokens, max_total_tokens diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index 37b57d6f4..4c6f0151d 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -52,6 +52,8 @@ - sections: - local: backends/neuron title: Neuron + - local: backends/gaudi + title: Gaudi - local: backends/trtllm title: TensorRT-LLM - local: backends/llamacpp diff --git a/docs/source/backends/gaudi.mdx b/docs/source/backends/gaudi.mdx new file mode 100644 index 000000000..c7f736189 --- /dev/null +++ b/docs/source/backends/gaudi.mdx @@ -0,0 +1,315 @@ +# Gaudi Backend for Text Generation Inference + +## Overview +Text Generation Inference (TGI) has been optimized to run on Gaudi hardware via the Gaudi backend for TGI. 
+ +## Supported Hardware +- **Gaudi1**: Available on [AWS EC2 DL1 instances](https://aws.amazon.com/ec2/instance-types/dl1/) +- **Gaudi2**: Available on [Intel Cloud](https://console.cloud.intel.com/docs/reference/ai_instances.html) +- **Gaudi3**: Available on [Intel Cloud](https://console.cloud.intel.com/docs/reference/ai_instances.html) + +## Tutorial: Getting Started with TGI on Gaudi + +### Basic Usage +The easiest way to run TGI on Gaudi is to use the official Docker image: + +```bash +model=meta-llama/Meta-Llama-3.1-8B-Instruct +volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run +hf_token=YOUR_HF_ACCESS_TOKEN + +docker run --runtime=habana --cap-add=sys_nice --ipc=host \ + -p 8080:80 -v $volume:/data -e HF_TOKEN=$hf_token \ + ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \ + --model-id $model +``` + +Once you see the `connected` log, the server is ready to accept requests: +> 2024-05-22T19:31:48.302239Z INFO text_generation_router: router/src/main.rs:378: Connected + +You can find your `YOUR_HF_ACCESS_TOKEN` at [https://huggingface.co/settings/tokens](https://huggingface.co/settings/tokens). This is necessary to access gated models like llama3.1. + +### Making Your First Request +You can send a request from a separate terminal: + +```bash +curl 127.0.0.1:8080/generate \ + -X POST \ + -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":32}}' \ + -H 'Content-Type: application/json' +``` + +## How-to Guides + +### How to Run Specific Models + +The following models have been validated on Gaudi2: + +| Model | Model ID | BF16 | | FP8 | | +|-----------------------|----------------------------------------|-------------|------------|-------------|------------| +| | | Single Card | Multi-Card | Single Card | Multi-Card | +| Llama2-7B | meta-llama/Llama-2-7b-chat-hf | ✔ | ✔ | ✔ | ✔ | +| Llama2-70B | meta-llama/Llama-2-70b-chat-hf | | ✔ | | ✔ | +| Llama3-8B | meta-llama/Meta-Llama-3.1-8B-Instruct | ✔ | ✔ | ✔ | ✔ | +| Llama3-70B | meta-llama/Meta-Llama-3-70B-Instruct | | ✔ | | ✔ | +| Llama3.1-8B | meta-llama/Meta-Llama-3.1-8B-Instruct | ✔ | ✔ | ✔ | ✔ | +| Llama3.1-70B | meta-llama/Meta-Llama-3.1-70B-Instruct | | ✔ | | ✔ | +| CodeLlama-13B | codellama/CodeLlama-13b-hf | ✔ | ✔ | ✔ | ✔ | +| Mixtral-8x7B | mistralai/Mixtral-8x7B-Instruct-v0.1 | ✔ | ✔ | ✔ | ✔ | +| Mistral-7B | mistralai/Mistral-7B-Instruct-v0.3 | ✔ | ✔ | ✔ | ✔ | +| Falcon-180B | tiiuae/falcon-180B-chat | | ✔ | | ✔ | +| Qwen2-72B | Qwen/Qwen2-72B-Instruct | | ✔ | | ✔ | +| Starcoder2-3b | bigcode/starcoder2-3b | ✔ | ✔ | ✔ | | +| Starcoder2-15b | bigcode/starcoder2-15b | ✔ | ✔ | ✔ | | +| Starcoder | bigcode/starcoder | ✔ | ✔ | ✔ | ✔ | +| Gemma-7b | google/gemma-7b-it | ✔ | ✔ | ✔ | ✔ | +| Llava-v1.6-Mistral-7B | llava-hf/llava-v1.6-mistral-7b-hf | ✔ | ✔ | ✔ | ✔ | + +To run any of these models: + +```bash +model=MODEL_ID_THAT_YOU_WANT_TO_RUN +volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run +hf_token=YOUR_ACCESS_TOKEN + +docker run --runtime=habana --cap-add=sys_nice --ipc=host \ + -p 8080:80 -v $volume:/data -e HF_TOKEN=$hf_token \ + ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \ + --model-id $model + +``` + +For the full list of service parameters, refer to the [launcher-arguments page](https://huggingface.co/docs/text-generation-inference/reference/launcher). 
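+Once one of the servers above is running, you can also query it programmatically instead of using `curl`. The snippet below is a minimal illustrative sketch rather than one of the validated commands: it assumes the server is reachable on `localhost:8080` and uses the `InferenceClient` from `huggingface_hub`, which talks to TGI's `generate` API.
+
+```python
+from huggingface_hub import InferenceClient
+
+# Assumes a TGI server started with one of the docker commands above,
+# listening on port 8080 of the local machine.
+client = InferenceClient("http://localhost:8080")
+
+# Mirrors the curl example earlier in this guide.
+output = client.text_generation(
+    "What is Deep Learning?",
+    max_new_tokens=32,
+)
+print(output)
+```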
+ +The validated docker commands can be found in the [examples/docker_commands folder](https://github.com/huggingface/text-generation-inference/tree/main/backends/gaudi/examples/docker_commands). + +> Note: `--runtime=habana --cap-add=sys_nice --ipc=host ` is required to enable docker to use the Gaudi hardware (more details [here](https://docs.habana.ai/en/latest/Installation_Guide/Additional_Installation/Docker_Installation.html)). + +### How to Enable Multi-Card Inference (Sharding) + +TGI-Gaudi supports sharding for multi-card inference, allowing you to distribute the load across multiple Gaudi cards. + +For example, on a machine with 8 Gaudi cards, you can run: + +```bash +docker run --runtime=habana --ipc=host --cap-add=sys_nice \ + -p 8080:80 -v $volume:/data -e HF_TOKEN=$hf_token \ + tgi-gaudi \ + --model-id $model --sharded true --num-shard 8 +``` + + +We recommend always using sharding when running on a multi-card machine. + + +### How to Use Different Precision Formats + +#### BF16 Precision (Default) +By default, all models run with BF16 precision on Gaudi hardware. + +#### FP8 Precision +TGI-Gaudi supports FP8 precision inference with [Intel Neural Compressor (INC)](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_FP8.html). + +To run FP8 Inference: + +1. Measure statistics using [Optimum Habana measurement script](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation#running-with-fp8) +2. Run the model in TGI with QUANT_CONFIG setting - e.g. `-e QUANT_CONFIG=./quantization_config/maxabs_quant.json`. + +The following command example for FP8 inference is based on the assumption that measurement is done via the first step above. + +Example for Llama3.1-70B on 8 cards with FP8 precision: + +```bash +model=meta-llama/Meta-Llama-3.1-70B-Instruct +hf_token=YOUR_ACCESS_TOKEN +volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run + +docker run -p 8080:80 \ + --runtime=habana \ + --cap-add=sys_nice \ + --ipc=host \ + -v $volume:/data \ + -v $PWD/quantization_config:/usr/src/quantization_config \ + -v $PWD/hqt_output:/usr/src/hqt_output \ + -e QUANT_CONFIG=./quantization_config/maxabs_quant.json \ + -e HF_TOKEN=$hf_token \ + -e MAX_TOTAL_TOKENS=2048 \ + -e BATCH_BUCKET_SIZE=256 \ + -e PREFILL_BATCH_BUCKET_SIZE=4 \ + -e PAD_SEQUENCE_TO_MULTIPLE_OF=64 \ + ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \ + --model-id $model \ + --sharded true --num-shard 8 \ + --max-input-tokens 1024 --max-total-tokens 2048 \ + --max-batch-prefill-tokens 4096 --max-batch-size 256 \ + --max-waiting-tokens 7 --waiting-served-ratio 1.2 --max-concurrent-requests 512 +``` + +### How to Run Vision-Language Models (VLMs) + +Gaudi supports VLM inference.
+ +Example for Llava-v1.6-Mistral-7B on 1 card: + +Start the TGI server via the following command: +```bash +model=llava-hf/llava-v1.6-mistral-7b-hf +volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run + +docker run -p 8080:80 \ + --runtime=habana \ + --cap-add=sys_nice \ + --ipc=host \ + -v $volume:/data \ + -e PREFILL_BATCH_BUCKET_SIZE=1 \ + -e BATCH_BUCKET_SIZE=1 \ + ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \ + --model-id $model \ + --max-input-tokens 4096 --max-batch-prefill-tokens 16384 \ + --max-total-tokens 8192 --max-batch-size 4 +``` + +You can then send a request to the server via the following command: +```bash +curl -N 127.0.0.1:8080/generate \ + -X POST \ + -d '{"inputs":"![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png)What is this a picture of?\n\n","parameters":{"max_new_tokens":32}}' \ + -H 'Content-Type: application/json' +``` + +> Note: In Llava-v1.6-Mistral-7B, an image usually accounts for 2000 input tokens. For example, an image of size 512x512 is represented by 2800 tokens. Thus, `max-input-tokens` must be larger than the number of tokens associated with the image. Otherwise the image may be truncated. We set `BASE_IMAGE_TOKENS=2048` as the default image token value. This is the minimum value of `max-input-tokens`. You can override the environment variable `BASE_IMAGE_TOKENS` to change this value. The warmup will generate graphs with input length from `BASE_IMAGE_TOKENS` to `max-input-tokens`. For Llava-v1.6-Mistral-7B, the value of `max-batch-prefill-tokens` is 16384, which is calculated as follows: `prefill_batch_size` = `max-batch-prefill-tokens` / `max-input-tokens`. + +### How to Benchmark Performance + +We recommend using the [inference-benchmarker tool](https://github.com/huggingface/inference-benchmarker) to benchmark performance on Gaudi hardware. + +This benchmark tool simulates user requests and measures the performance of the model on realistic scenarios. + +To run it on the same machine, you can do the following: +```bash +MODEL=meta-llama/Llama-3.1-8B-Instruct +HF_TOKEN= +# run a benchmark to evaluate the performance of the model for chat use case +# we mount results to the current directory +docker run \ + --rm \ + -it \ + --net host \ + -v $(pwd):/opt/inference-benchmarker/results \ + -e "HF_TOKEN=$HF_TOKEN" \ + ghcr.io/huggingface/inference-benchmarker:latest \ + inference-benchmarker \ + --tokenizer-name "$MODEL" \ + --url http://localhost:8080 \ + --profile chat +``` + +Please refer to the [inference-benchmarker README](https://github.com/huggingface/inference-benchmarker) for more details. + +### How to Profile Performance + +To collect performance profiling, you need to set the following environment variables: + +| Name               | Value(s)   | Default          | Description                                               | +|--------------------| :--------- | :--------------- | :------------------------------------------------------- | +| PROF_WAITSTEP      | integer    | 0                | Control profile wait steps                                | +| PROF_WARMUPSTEP    | integer    | 0                | Control profile warmup steps                              | +| PROF_STEP          | integer    | 0                | Enable/disable profile, control profile active steps      | +| PROF_PATH          | string     | /tmp/hpu_profile | Define profile folder                                     | +| PROF_RANKS         | string     | 0                | Comma-separated list of ranks to profile                  | +| PROF_RECORD_SHAPES | True/False | False            | Control record_shapes option in the profiler              | + +To use these environment variables, add them to your docker run command with the -e flag.
For example: + +```bash +docker run --runtime=habana --ipc=host --cap-add=sys_nice \ + -p 8080:80 -v $volume:/data -e HF_TOKEN=$hf_token \ + -e PROF_WAITSTEP=10 \ + -e PROF_WARMUPSTEP=10 \ + -e PROF_STEP=1 \ + -e PROF_PATH=/tmp/hpu_profile \ + -e PROF_RANKS=0 \ + -e PROF_RECORD_SHAPES=True \ + ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \ + --model-id $model +``` + +## Explanation: Understanding TGI on Gaudi + +### The Warmup Process + +To ensure optimal performance, warmup is performed at the beginning of each server run. This process creates queries with various input shapes based on provided parameters and runs basic TGI operations (prefill, decode, concatenate). + +Note: Model warmup can take several minutes, especially for FP8 inference. For faster subsequent runs, refer to [Disk Caching Eviction Policy](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Optimization_in_PyTorch_Models.html#disk-caching-eviction-policy). + +### Understanding Parameter Tuning + +#### Sequence Length Parameters +- `--max-input-tokens` is the maximum possible input prompt length. Default value is `4095`. +- `--max-total-tokens` is the maximum possible total length of the sequence (input and output). Default value is `4096`. + +#### Batch Size Parameters +- For prefill operation, please set `--max-batch-prefill-tokens` as `bs * max-input-tokens`, where `bs` is your expected maximum prefill batch size. +- For decode operation, please set `--max-batch-size` as `bs`, where `bs` is your expected maximum decode batch size. +- Please note that batch size will be always padded to the nearest multiplication of `BATCH_BUCKET_SIZE` and `PREFILL_BATCH_BUCKET_SIZE`. + +#### Performance and Memory Parameters +- `PAD_SEQUENCE_TO_MULTIPLE_OF` determines sizes of input length buckets. Since warmup creates several graphs for each bucket, it's important to adjust that value proportionally to input sequence length. Otherwise, some out of memory issues can be observed. +- `ENABLE_HPU_GRAPH` enables HPU graphs usage, which is crucial for performance results. Recommended value to keep is `true`. + +#### Sequence Length Parameters +- `--max-input-tokens`: Maximum possible input prompt length (default: 4095) +- `--max-total-tokens`: Maximum possible total sequence length (input + output) (default: 4096) + +#### Batch Size Parameters +- `--max-batch-prefill-tokens`: Set as `bs * max-input-tokens` where `bs` is your expected maximum prefill batch size +- `--max-batch-size`: Set as `bs` where `bs` is your expected maximum decode batch size +- Note: Batch sizes are padded to the nearest multiple of `BATCH_BUCKET_SIZE` and `PREFILL_BATCH_BUCKET_SIZE` + +## Reference + +This section contains reference information about the Gaudi backend. + +### Environment Variables + +The following table contains the environment variables that can be used to configure the Gaudi backend: + +| Name | Value(s) | Default | Description | Usage | +|-----------------------------| :--------- | :--------------- | :------------------------------------------------------------------------------------------------------------------------------- | :--------------------------- | +| ENABLE_HPU_GRAPH | True/False | True | Enable hpu graph or not | add -e in docker run command | +| LIMIT_HPU_GRAPH | True/False | True | Skip HPU graph usage for prefill to save memory, set to `True` for large sequence/decoding lengths(e.g. 
300/212) | add -e in docker run command | +| BATCH_BUCKET_SIZE | integer | 8 | Batch size for decode operation will be rounded to the nearest multiple of this number. This limits the number of cached graphs | add -e in docker run command | +| PREFILL_BATCH_BUCKET_SIZE | integer | 4 | Batch size for prefill operation will be rounded to the nearest multiple of this number. This limits the number of cached graphs | add -e in docker run command | +| PAD_SEQUENCE_TO_MULTIPLE_OF | integer | 128 | For prefill operation, sequences will be padded to a multiple of provided value. | add -e in docker run command | +| SKIP_TOKENIZER_IN_TGI | True/False | False | Skip tokenizer for input/output processing | add -e in docker run command | +| WARMUP_ENABLED | True/False | True | Enable warmup during server initialization to recompile all graphs. This can increase TGI setup time. | add -e in docker run command | +| QUEUE_THRESHOLD_MS | integer | 120 | Controls the threshold beyond which the request are considered overdue and handled with priority. Shorter requests are prioritized otherwise. | add -e in docker run command | +| USE_FLASH_ATTENTION | True/False | True | Whether to enable Habana Flash Attention, provided that the model supports it. Please refer to https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Optimization_in_PyTorch_Models.html?highlight=fusedsdpa#using-fused-scaled-dot-product-attention-fusedsdpa | add -e in docker run command | +| FLASH_ATTENTION_RECOMPUTE | True/False | True | Whether to enable Habana Flash Attention in recompute mode on first token generation. | add -e in docker run command | + +## Contributing + +Contributions to the TGI-Gaudi project are welcome. Please refer to the [contributing guide](https://github.com/huggingface/text-generation-inference/blob/main/CONTRIBUTING.md). + +### Building the Docker Image from Source + +To build the Docker image from source: + +```bash +make -C backends/gaudi image +``` + +This builds the image and saves it as `tgi-gaudi`. You can then run TGI-Gaudi with this image: + +```bash +model=meta-llama/Meta-Llama-3.1-8B-Instruct +volume=$PWD/data +hf_token=YOUR_ACCESS_TOKEN + +docker run --runtime=habana --ipc=host --cap-add=sys_nice \ + -p 8080:80 -v $volume:/data -e HF_TOKEN=$hf_token \ + tgi-gaudi \ + --model-id $model +``` + +For more details, see the [README of the Gaudi backend](https://github.com/huggingface/text-generation-inference/blob/main/backends/gaudi/README.md) and the [Makefile of the Gaudi backend](https://github.com/huggingface/text-generation-inference/blob/main/backends/gaudi/Makefile). diff --git a/docs/source/installation_gaudi.md b/docs/source/installation_gaudi.md index 1ddf2b472..51aa667dc 100644 --- a/docs/source/installation_gaudi.md +++ b/docs/source/installation_gaudi.md @@ -1,3 +1,3 @@ # Using TGI with Intel Gaudi -Check out this [repository](https://github.com/huggingface/tgi-gaudi) to serve models with TGI on Gaudi and Gaudi2 with [Optimum Habana](https://huggingface.co/docs/optimum/habana/index). +You can use TGI on Intel Gaudi using the [TGI gaudi backend](https://huggingface.co/docs/text-generation-inference/backends/gaudi). From 8b91f92978b6be529df6594a376a454e71009078 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Thu, 13 Mar 2025 11:26:44 +0100 Subject: [PATCH 150/183] Fixing the docker build. (#3108) * Fixing the docker build. 
* Apply suggestions from code review --- nix/docker.nix | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/nix/docker.nix b/nix/docker.nix index e89e5b99a..dc4c823d2 100644 --- a/nix/docker.nix +++ b/nix/docker.nix @@ -9,10 +9,6 @@ let build = if stream then dockerTools.streamLayeredImage else dockerTools.buildLayeredImage; - tmp = runCommand "tmp" { } '' - mkdir $out - mkdir -m 1777 $out/tmp - ''; in build { name = "tgi-docker"; @@ -25,9 +21,12 @@ build { ]; }; + extraCommands = '' + mkdir -p tmp + chmod -R 1777 tmp + ''; contents = [ cacert stdenv.cc - tmp ]; } From f91434e99b31294ba2bb13ce2de94472028ab3e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Thu, 13 Mar 2025 14:02:45 +0100 Subject: [PATCH 151/183] Make the Nix-based Docker container work on non-NixOS (#3109) On NixOS, the CUDA driver shim gets mounted on /run/opengl-driver, where Nix packages expect the shim to be. However, on other distributions, some FHS paths are mounted. This is a small change to make the dynamic loader find the shim. --- flake.nix | 8 ++++++-- nix/docker.nix | 4 ++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/flake.nix b/flake.nix index ac4331c24..7ddd3b92d 100644 --- a/flake.nix +++ b/flake.nix @@ -176,11 +176,15 @@ ''; }; - dockerImage = pkgs.callPackage nix/docker.nix { + # Use plain nixpkgs without overlays for dockerTools. dockerTools + # uses a Python package for computing the layers from the transitive + # closure. However, this needs a lot of rebuilds due to our overlay. + + dockerImage = nixpkgs.legacyPackages.${system}.callPackage nix/docker.nix { text-generation-inference = default; }; - dockerImageStreamed = pkgs.callPackage nix/docker.nix { + dockerImageStreamed = nixpkgs.legacyPackages.${system}.callPackage nix/docker.nix { text-generation-inference = default; stream = true; }; diff --git a/nix/docker.nix b/nix/docker.nix index dc4c823d2..a8fc256e6 100644 --- a/nix/docker.nix +++ b/nix/docker.nix @@ -18,6 +18,10 @@ build { Env = [ "HF_HOME=/data" "PORT=80" + # The CUDA container toolkit will mount the driver shim into the + # container. We just have to ensure that the dynamic loader finds + # the libraries. 
+ "LD_LIBRARY_PATH=/usr/lib64" ]; }; From 0b3e3db043e0373f97efe893218bada171708889 Mon Sep 17 00:00:00 2001 From: "Wang, Yi" Date: Mon, 17 Mar 2025 20:48:48 +0800 Subject: [PATCH 152/183] xpu 2.6 update (#3051) * xpu 2.6 update Signed-off-by: Wang, Yi A * install whl Signed-off-by: Wang, Yi A * update get xpu memory api Signed-off-by: Wang, Yi A * int Signed-off-by: Wang, Yi A * fix awq crash if modules_to_not_convert is None Signed-off-by: Wang, Yi A --------- Signed-off-by: Wang, Yi A --- Dockerfile_intel | 25 ++++++++----------- .../utils/import_utils.py | 10 +++----- .../utils/quantization.py | 2 ++ 3 files changed, 16 insertions(+), 21 deletions(-) diff --git a/Dockerfile_intel b/Dockerfile_intel index bdff02908..bacefd020 100644 --- a/Dockerfile_intel +++ b/Dockerfile_intel @@ -45,7 +45,7 @@ RUN cargo build --profile release-opt --frozen # Text Generation Inference base image for Intel -FROM intel/oneapi-basekit:2024.2.1-0-devel-ubuntu22.04 AS xpu +FROM intel/oneapi-basekit:2025.0.1-0-devel-ubuntu22.04 AS xpu USER root @@ -87,7 +87,7 @@ RUN echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https:/ RUN mv /tmp/intel-for-pytorch-gpu-dev.list /etc/apt/sources.list.d -RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt install -y xpu-smi cmake ninja-build pciutils intel-pti-dev-0.9 +RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt install -y xpu-smi cmake ninja-build pciutils intel-ocloc # Text Generation Inference base env ENV HF_HOME=/data \ @@ -96,13 +96,11 @@ ENV HF_HOME=/data \ -WORKDIR /usr/src -RUN pip install https://intel-optimized-pytorch.s3.cn-north-1.amazonaws.com.cn/ipex_dev/xpu/torch-2.5.0a0%2Bgite84e33f-cp311-cp311-linux_x86_64.whl --no-cache-dir -RUN pip install https://intel-optimized-pytorch.s3.cn-north-1.amazonaws.com.cn/ipex_dev/xpu/torchaudio-2.5.0a0%2B56bc006-cp311-cp311-linux_x86_64.whl --no-cache-dir -RUN pip install https://intel-optimized-pytorch.s3.cn-north-1.amazonaws.com.cn/ipex_dev/xpu/torchvision-0.20.0a0%2B8e8a208-cp311-cp311-linux_x86_64.whl --no-cache-dir -RUN pip install https://intel-optimized-pytorch.s3.cn-north-1.amazonaws.com.cn/ipex_dev/xpu/oneccl_bind_pt-2.5.0%2Bxpu-cp311-cp311-linux_x86_64.whl --no-cache-dir -RUN pip install triton-xpu==3.0.0b2 --no-cache-dir +WORKDIR /usr/src +RUN pip install torch==2.6.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/test/xpu + +RUN pip install triton-xpu==3.2.0b1 --no-cache-dir # Install server COPY proto proto @@ -114,15 +112,14 @@ RUN cd server && \ pip install -U pip uv && \ uv pip install -e ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir -ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/intel/oneapi/pti/0.9/lib:/opt/conda/lib +ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/conda/lib ENV CCL_ZE_IPC_EXCHANGE=sockets -#ENV TORCH_LLM_ALLREDUCE=1 -#ENV CCL_TOPO_FABRIC_VERTEX_CONNECTION_CHECK=0 +ENV TORCH_LLM_ALLREDUCE=1 +ENV CCL_TOPO_FABRIC_VERTEX_CONNECTION_CHECK=0 ENV TORCH_DEVICE_BACKEND_AUTOLOAD=0 -RUN git clone https://github.com/intel/intel-extension-for-pytorch && cd intel-extension-for-pytorch && git checkout 1ccf72b2d11cd00b47aef6d6cd054c088aa6f083 -RUN cd intel-extension-for-pytorch && git submodule update --init --recursive && USE_AOT_DEVLIST='pvc,ats-m150' BUILD_SEPARATE_OPS=OFF BUILD_WITH_CPU=OFF USE_XETLA=ON python setup.py install && rm -rf /usr/src/intel-extension-for-pytorch - +RUN pip install https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_stable/xpu/oneccl_bind_pt-2.6.0%2Bxpu-cp311-cp311-linux_x86_64.whl +RUN pip install 
https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_stable/xpu/intel_extension_for_pytorch-2.6.10%2Bxpu-cp311-cp311-linux_x86_64.whl # Install benchmarker COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark # Install router diff --git a/server/text_generation_server/utils/import_utils.py b/server/text_generation_server/utils/import_utils.py index b693258c8..730ac6cbe 100644 --- a/server/text_generation_server/utils/import_utils.py +++ b/server/text_generation_server/utils/import_utils.py @@ -18,14 +18,10 @@ def get_cuda_free_memory(device, memory_fraction): def get_xpu_free_memory(device, memory_fraction): - total_memory = torch.xpu.get_device_properties(device).total_memory - device_id = device.index - memory_fraction = float(os.getenv("XPU_MEMORY_FRACTION", "1.0")) + total_free_memory, total_xpu_memory = torch.xpu.mem_get_info(device) + memory_fraction = float(os.getenv("XPU_MEMORY_FRACTION", "0.9")) free_memory = max( - 0, - int( - total_memory * 0.9 * memory_fraction - torch.xpu.memory_reserved(device_id) - ), + 0, int(total_free_memory - (1 - memory_fraction) * total_xpu_memory) ) return free_memory diff --git a/server/text_generation_server/utils/quantization.py b/server/text_generation_server/utils/quantization.py index e460361af..92111b193 100644 --- a/server/text_generation_server/utils/quantization.py +++ b/server/text_generation_server/utils/quantization.py @@ -79,6 +79,8 @@ def _get_quantizer_config(model_id, revision): modules_to_not_convert = data["quantization_config"].get( "modules_to_not_convert", [] ) + if modules_to_not_convert is None: + modules_to_not_convert = [] except Exception: filename = "quantize_config.json" try: From 095775e05cf97234c7249e73de445541348e05c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Mon, 17 Mar 2025 18:19:37 +0100 Subject: [PATCH 153/183] launcher: correctly get the head dimension for VLMs (#3116) * launcher: correctly get the head dimension for VLMs For most (?) VLMs, the head dimension is in the `text_config` configuration section. However, since we only queried the top-level `head_dim` (which typically doesn't exist in VLMs), we would never use flashinfer. This change adds a method that gets the head dimension from the top-level `Config` struct or `text_config` when that fails. 
* fix: bump org name in gemma3 test --------- Co-authored-by: drbh --- integration-tests/models/test_flash_gemma3.py | 2 +- launcher/src/main.rs | 27 +++++++++++++++---- 2 files changed, 23 insertions(+), 6 deletions(-) diff --git a/integration-tests/models/test_flash_gemma3.py b/integration-tests/models/test_flash_gemma3.py index 022d2c474..ab812d644 100644 --- a/integration-tests/models/test_flash_gemma3.py +++ b/integration-tests/models/test_flash_gemma3.py @@ -3,7 +3,7 @@ import pytest @pytest.fixture(scope="module") def flash_gemma3_handle(launcher): - with launcher("gg-hf-g/gemma-3-4b-it", num_shard=2) as handle: + with launcher("google/gemma-3-4b-it", num_shard=2) as handle: yield handle diff --git a/launcher/src/main.rs b/launcher/src/main.rs index d1041e26e..fde1472f0 100644 --- a/launcher/src/main.rs +++ b/launcher/src/main.rs @@ -152,7 +152,7 @@ fn resolve_attention(config: &Option, lora_adapters: &Option) -> "flashdecoding" }; - match config.head_dim { + match config.get_head_dim() { Some(h) if h == 64 || h == 128 || h == 256 => { if lora_adapters.is_some() && prefix_caching.is_none() { tracing::info!("Disabling prefix caching because of lora adapters"); @@ -214,6 +214,7 @@ struct RawConfig { num_key_value_heads: Option, num_hidden_layers: Option, head_dim: Option, + text_config: Option, vision_config: Option, is_encoder_decoder: Option, #[serde(rename = "num_experts_per_tok")] @@ -233,6 +234,11 @@ struct QuantizationConfig { #[derive(Debug, Deserialize)] struct VisionConfig {} +#[derive(Debug, Deserialize)] +struct TextConfig { + head_dim: Option, +} + #[derive(Debug, Deserialize)] struct Config { max_position_embeddings: Option, @@ -244,6 +250,7 @@ struct Config { intermediate_size: Option, hidden_size: Option, model_type: Option, + text_config: Option, vision_config: Option, is_encoder_decoder: bool, num_experts_per_token: usize, @@ -253,6 +260,14 @@ struct Config { } impl Config { + fn get_head_dim(&self) -> Option { + self.head_dim.or_else(|| { + self.text_config + .as_ref() + .and_then(|text_config| text_config.head_dim) + }) + } + fn flop(&self) -> Option { if self.vision_config.is_some() { // VLM are much harder to predict and VRAM requirements @@ -261,7 +276,7 @@ impl Config { } let num_heads = self.num_heads? as u64; let num_kv_heads = self.num_kv_heads? as u64; - let head_dim = self.head_dim? as u64; + let head_dim = self.get_head_dim()? as u64; let hidden_size = self.hidden_size? as u64; let intermediate_size = (self.intermediate_size? * (self.num_experts_per_token + self.num_shared_experts)) @@ -289,7 +304,7 @@ impl Config { } // 2 for key and values // 2 for f16 dtype? - Some(self.num_kv_heads? * 2 * self.head_dim? * 2 * self.num_layers?) + Some(self.num_kv_heads? * 2 * self.get_head_dim()? * 2 * self.num_layers?) } fn mlp_vram_per_tok(&self) -> Option { @@ -310,8 +325,8 @@ impl Config { } fn model_vram(&self) -> Option { - let attn_vram = (self.num_heads? + 2 * self.num_kv_heads?) * self.head_dim?; - let o_vram = self.num_heads? * self.head_dim? * self.hidden_size?; + let attn_vram = (self.num_heads? + 2 * self.num_kv_heads?) * self.get_head_dim()?; + let o_vram = self.num_heads? * self.get_head_dim()? * self.hidden_size?; // gate + up + down = 3 let mlp_vram = 3 * self.intermediate_size? 
* self.num_experts * self.hidden_size?; let layer_vram = mlp_vram + attn_vram + o_vram; @@ -349,6 +364,7 @@ impl From for Config { let num_kv_heads = other.num_key_value_heads.or(other.num_attention_heads); let intermediate_size = other.intermediate_size; let model_type = other.model_type; + let text_config = other.text_config; let vision_config = other.vision_config; let is_encoder_decoder = other.is_encoder_decoder.unwrap_or(false); let num_experts_per_token = other.num_experts_per_token.unwrap_or(1); @@ -360,6 +376,7 @@ impl From for Config { quantize, head_dim, model_type, + text_config, vision_config, is_encoder_decoder, hidden_size, From 8c2c348f3c4d1101e862af493b44e4986119b557 Mon Sep 17 00:00:00 2001 From: Baptiste Colle <32412211+baptistecolle@users.noreply.github.com> Date: Tue, 18 Mar 2025 09:45:52 +0100 Subject: [PATCH 154/183] Gaudi: Sync TGI with the latest changes from the TGI-Gaudi fork (#3117) feat(gaudi): add all the changes from tgi-gaudi fork up to PR #289 --- Dockerfile_gaudi | 5 +- backends/gaudi/Makefile | 4 +- backends/gaudi/server/pyproject.toml | 2 +- backends/gaudi/server/requirements.txt | 17 +- .../text_generation_server/models/__init__.py | 22 +- .../models/causal_lm.py | 3 + .../models/custom_modeling/llava_next.py | 153 ++- .../models/custom_modeling/mllama.py | 1211 ++++------------- .../models/vlm_causal_lm.py | 738 ++++++---- docs/source/backends/gaudi.mdx | 2 + 10 files changed, 914 insertions(+), 1243 deletions(-) diff --git a/Dockerfile_gaudi b/Dockerfile_gaudi index c814e912e..9009f95b6 100644 --- a/Dockerfile_gaudi +++ b/Dockerfile_gaudi @@ -1,6 +1,6 @@ # Those arguments are required to build the image -ARG HABANA_VERSION=1.19.0 -ARG PYTORCH_VERSION=2.5.1 +ARG HABANA_VERSION=1.20.0 +ARG PYTORCH_VERSION=2.6.0 # Rust builder FROM lukemathwalker/cargo-chef:latest-rust-1.85.0 AS chef @@ -92,7 +92,6 @@ RUN cd server && \ make gen-server && \ pip install --no-deps -r requirements.txt && \ bash ./dill-0.3.8-patch.sh && \ - pip install outlines~=0.0.34 && \ pip install "git+https://github.com/HabanaAI/DeepSpeed.git@${HABANA_VERSION}" && \ BUILD_CUDA_EXT=0 pip install git+https://github.com/AutoGPTQ/AutoGPTQ.git@097dd04e --no-build-isolation && \ pip install . --no-cache-dir diff --git a/backends/gaudi/Makefile b/backends/gaudi/Makefile index ce3be25d9..6e38c19ed 100644 --- a/backends/gaudi/Makefile +++ b/backends/gaudi/Makefile @@ -2,8 +2,8 @@ mkfile_path := $(abspath $(lastword $(MAKEFILE_LIST))) mkfile_dir := $(dir $(mkfile_path)) root_dir := "${mkfile_dir}/../.." 
-HABANA_VERSION := 1.19.0 -PYTORCH_VERSION := 2.5.1 +HABANA_VERSION := 1.20.0 +PYTORCH_VERSION := 2.6.0 .PHONY: image run-local-dev-container install-dependencies install-server install-router install-launcher local-dev-install diff --git a/backends/gaudi/server/pyproject.toml b/backends/gaudi/server/pyproject.toml index c61ac030b..b38f45627 100644 --- a/backends/gaudi/server/pyproject.toml +++ b/backends/gaudi/server/pyproject.toml @@ -22,7 +22,7 @@ opentelemetry-instrumentation-grpc = "^0.36b0" hf-transfer = "^0.1.2" sentencepiece = "^0.1.97" peft = "^0.10" -optimum-habana = "1.15.0" +optimum-habana = "1.16.0" transformers = "4.45.2" numpy = "1.26.4" accelerate = "0.33.0" diff --git a/backends/gaudi/server/requirements.txt b/backends/gaudi/server/requirements.txt index 074144909..b71d29d95 100644 --- a/backends/gaudi/server/requirements.txt +++ b/backends/gaudi/server/requirements.txt @@ -46,7 +46,7 @@ opentelemetry-instrumentation==0.36b0 ; python_version >= "3.9" and python_versi opentelemetry-proto==1.15.0 ; python_version >= "3.9" and python_version < "3.13" opentelemetry-sdk==1.15.0 ; python_version >= "3.9" and python_version < "3.13" opentelemetry-semantic-conventions==0.36b0 ; python_version >= "3.9" and python_version < "3.13" -optimum-habana==1.15.0 ; python_version >= "3.9" and python_version < "3.13" +optimum-habana==1.16.0 ; python_version >= "3.9" and python_version < "3.13" optimum==1.23.2 ; python_version >= "3.9" and python_version < "3.13" packaging==24.1 ; python_version >= "3.9" and python_version < "3.13" pandas==2.2.3 ; python_version >= "3.9" and python_version < "3.13" @@ -87,3 +87,18 @@ wrapt==1.16.0 ; python_version >= "3.9" and python_version < "3.13" xxhash==3.5.0 ; python_version >= "3.9" and python_version < "3.13" yarl==1.16.0 ; python_version >= "3.9" and python_version < "3.13" zipp==3.20.2 ; python_version >= "3.9" and python_version < "3.13" +outlines==0.0.34 ; python_version >= "3.9" and python_version < "3.13" +interegular==0.3.3 ; python_version >= "3.9" and python_version < "3.13" +lark==1.2.2 ; python_version >= "3.9" and python_version < "3.13" +cloudpickle==3.1.0 ; python_version >= "3.9" and python_version < "3.13" +diskcache==5.6.3 ; python_version >= "3.9" and python_version < "3.13" +numba==0.60.0 ; python_version >= "3.9" and python_version < "3.13" +llvmlite==0.43.0 ; python_version >= "3.9" and python_version < "3.13" +jsonschema==4.23.0 ; python_version >= "3.9" and python_version < "3.13" +annotated-types==0.7.0 ; python_version >= "3.9" and python_version < "3.13" +jsonschema-specifications==2024.10.1 ; python_version >= "3.9" and python_version < "3.13" +nest-asyncio==1.6.0; python_version >= "3.9" and python_version < "3.13" +pydantic==2.10.6; python_version >= "3.9" and python_version < "3.13" +pydantic-core==2.27.2 ; python_version >= "3.9" and python_version < "3.13" +referencing==0.36.2 ; python_version >= "3.9" and python_version < "3.13" +rpds-py==0.22.3 ; python_version >= "3.9" and python_version < "3.13" diff --git a/backends/gaudi/server/text_generation_server/models/__init__.py b/backends/gaudi/server/text_generation_server/models/__init__.py index 651b71ecf..346016c21 100644 --- a/backends/gaudi/server/text_generation_server/models/__init__.py +++ b/backends/gaudi/server/text_generation_server/models/__init__.py @@ -17,16 +17,14 @@ from text_generation_server.models.causal_lm import CausalLM from text_generation_server.models.bloom import BLOOM from text_generation_server.models.starcoder import StarCoder from 
text_generation_server.models.vlm_causal_lm import VlmCausalLM - -# from text_generation_server.models.mllama_causal_lm import MllamaCausalLM +from text_generation_server.models.custom_modeling.mllama import ( + MllamaForConditionalGeneration, +) from text_generation_server.models.custom_modeling.llava_next import ( LlavaNextForConditionalGeneration, ) # from text_generation_server.models.mllama_causal_lm import MllamaCausalLMBatch -# from text_generation_server.models.custom_modeling.mllama import ( -# MllamaForConditionalGeneration, -# ) from text_generation_server.utils.adapter import ( AdapterParameters, build_layer_weight_lookup, @@ -39,6 +37,7 @@ from text_generation_server.adapters.lora import LoraWeights from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi +SDP_ON_BF16 = int(os.environ.get("SDP_ON_BF16", 0)) # Disable gradients torch.set_grad_enabled(False) @@ -55,6 +54,8 @@ def get_model( max_input_tokens: int, ) -> Model: adapt_transformers_to_gaudi() + if SDP_ON_BF16 == 1: + torch._C._set_math_sdp_allow_fp16_bf16_reduction(True) if speculate is not None: set_speculate(speculate) @@ -199,6 +200,17 @@ def get_model( trust_remote_code=trust_remote_code, ) + if model_type == "mllama": + return VlmCausalLM( + model_class=MllamaForConditionalGeneration, + model_id=model_id, + revision=revision, + quantize=None, + speculator=speculator, + dtype=dtype, + trust_remote_code=trust_remote_code, + ) + if model_type in modeling_auto.MODEL_FOR_CAUSAL_LM_MAPPING_NAMES: return CausalLM( model_id, diff --git a/backends/gaudi/server/text_generation_server/models/causal_lm.py b/backends/gaudi/server/text_generation_server/models/causal_lm.py index 8fda0517e..776c109f5 100644 --- a/backends/gaudi/server/text_generation_server/models/causal_lm.py +++ b/backends/gaudi/server/text_generation_server/models/causal_lm.py @@ -704,6 +704,9 @@ class CausalLM(Model): htorch.core.hpu_set_env() if world_size > 1: + os.environ.setdefault( + "DEEPSPEED_USE_HABANA_FRAMEWORKS_DETERMINISTIC_API", "1" + ) model = self.get_deepspeed_model(model_id, dtype, revision) model = hq_env.prepare_model_for_quantization(model) else: diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/llava_next.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/llava_next.py index f98dab91b..70449f6b7 100644 --- a/backends/gaudi/server/text_generation_server/models/custom_modeling/llava_next.py +++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/llava_next.py @@ -14,10 +14,11 @@ # limitations under the License. """ PyTorch Llava-NeXT model.""" -from typing import List, Optional +from typing import List, Optional, Union import torch import torch.utils.checkpoint +import numpy as np from transformers.models.llava_next.modeling_llava_next import ( unpad_image, @@ -49,6 +50,46 @@ def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size): return height // patch_size, width // patch_size +# Copied from https://github.com/huggingface/transformers/blob/6966fa190172b48b2fb46fe4552a13b943e692cf/src/transformers/models/llava_next/modeling_llava_next.py#L79 +def image_size_to_num_patches(image_size, grid_pinpoints, patch_size: int): + """ + Calculate the number of patches after the preprocessing for images of any resolution. + + Args: + image_size (`torch.LongTensor` or `np.ndarray` or `Tuple[int, int]`): + The size of the input image in the format (height, width). ? + grid_pinpoints (`List`): + A list containing possible resolutions. 
Each item in the list should be a tuple or list + of the form `(height, width)`. + patch_size (`int`): + The size of each image patch. + + Returns: + int: the number of patches + """ + if not isinstance(grid_pinpoints, list): + raise TypeError("grid_pinpoints should be a list of tuples or lists") + + # ! VERY IMPORTANT if image_size is tensor, must convert to into tuple, otherwise it will cause wrong calculate + if not isinstance(image_size, (list, tuple)): + if not isinstance(image_size, (torch.Tensor, np.ndarray)): + raise TypeError( + f"image_size invalid type {type(image_size)} with value {image_size}" + ) + image_size = image_size.tolist() + + best_resolution = select_best_resolution(image_size, grid_pinpoints) + height, width = best_resolution + num_patches = 0 + # consider change to ceil(height/patch_size)*ceil(width/patch_size) + 1 + for i in range(0, height, patch_size): + for j in range(0, width, patch_size): + num_patches += 1 + # add the base patch + num_patches += 1 + return num_patches + + class LlavaNextForConditionalGeneration(GaudiLlavaNextForConditionalGeneration): def _merge_input_ids_with_image_features( @@ -128,6 +169,76 @@ class LlavaNextForConditionalGeneration(GaudiLlavaNextForConditionalGeneration): return outputs + # Copied from https://github.com/huggingface/transformers/blob/6966fa190172b48b2fb46fe4552a13b943e692cf/src/transformers/models/llava_next/modeling_llava_next.py#L479 + def get_image_features( + self, + pixel_values: torch.FloatTensor, + image_sizes: torch.Tensor, + vision_feature_layer: Union[int, List[int]], + vision_feature_select_strategy: str, + ): + """ + Obtains image last hidden states from the vision tower and apply multimodal projection. + + Args: + pixel_values (`torch.FloatTensor]` of shape `(batch_size, num_patches, channels, height, width)`) + The tensors corresponding to the input images. + image_sizes (`torch.Tensor` of shape `(num_images, 2)`) + Actual image size of each images (H, W). + vision_feature_layer (`Union[int, List[int]]`): + The index of the layer to select the vision feature. If multiple indices are provided, + the vision feature of the corresponding indices will be concatenated to form the + vision features. + vision_feature_select_strategy (`str`): + The feature selection strategy used to select the vision feature from the vision backbone. + Can be one of `"default"` or `"full"` + Returns: + image_features (List[`torch.Tensor`]): List of image feature tensor, each contains all the visual feature of all patches + and are of shape `(num_patches, image_length, embed_dim)`). + """ + # ! 
infer image_num_patches from image_sizes + image_num_patches = [ + image_size_to_num_patches( + image_size=imsize, + grid_pinpoints=self.config.image_grid_pinpoints, + patch_size=self.config.vision_config.image_size, + ) + for imsize in image_sizes + ] + if pixel_values.dim() == 5: + # stacked if input is (batch_size, num_patches, num_channels, height, width) + _pixel_values_list = [ + pix_val[:num_patch] + for pix_val, num_patch in zip(pixel_values, image_num_patches) + ] + pixel_values = torch.cat(_pixel_values_list, dim=0) + elif pixel_values.dim() != 4: + # otherwise has to be stacked from list of (num_patches, num_channels, height, width) + raise ValueError( + f"pixel_values of shape {pixel_values.shape}, expect to be of 4 or 5 dimensions" + ) + + image_features = self.vision_tower(pixel_values, output_hidden_states=True) + # If we have one vision feature layer, return the corresponding hidden states, + # otherwise, select the hidden states of each feature layer and concatenate them + if isinstance(vision_feature_layer, int): + selected_image_feature = image_features.hidden_states[vision_feature_layer] + else: + hs_pool = [ + image_features.hidden_states[layer_idx] + for layer_idx in vision_feature_layer + ] + selected_image_feature = torch.cat(hs_pool, dim=-1) + + if vision_feature_select_strategy == "default": + selected_image_feature = selected_image_feature[:, 1:] + elif vision_feature_select_strategy == "full": + selected_image_feature = selected_image_feature + + image_features = self.multi_modal_projector(selected_image_feature) + image_features = torch.split(image_features, image_num_patches, dim=0) + return image_features + def prepare_inputs_for_generation( self, input_ids, @@ -184,35 +295,12 @@ class LlavaNextForConditionalGeneration(GaudiLlavaNextForConditionalGeneration): # 1. Extract the input embeddings inputs_embeds = self.get_input_embeddings()(input_ids) # 2. 
Merge text and images - batch_size, num_patches, num_channels, height, width = ( - pixel_values.shape + image_features = self.get_image_features( + pixel_values, + image_sizes, + vision_feature_layer=vision_feature_layer, + vision_feature_select_strategy=vision_feature_select_strategy, ) - reshaped_pixel_values = pixel_values.view( - batch_size * num_patches, num_channels, height, width - ) - image_features = self.vision_tower( - reshaped_pixel_values, - output_hidden_states=True, - use_flash_attention=use_flash_attention, - flash_attention_recompute=flash_attention_recompute, - ) - - selected_image_feature = image_features.hidden_states[ - vision_feature_layer - ] - - if vision_feature_select_strategy == "default": - selected_image_feature = selected_image_feature[:, 1:] - elif vision_feature_select_strategy == "full": - selected_image_feature = selected_image_feature - - image_features = self.multi_modal_projector(selected_image_feature) - - # split up image_features for each of the individual images - # hence we get a list of image_features, each of shape (5, num_patches, hidden_size) - # if we assume each image has 5 image features (base image + 4 patches) - split_sizes = [image.shape[0] for image in pixel_values] - image_features = torch.split(image_features, split_sizes, dim=0) # NOTE we only support multimodal_patch_merge_type == "spatial_unpad" height = width = ( @@ -266,13 +354,10 @@ class LlavaNextForConditionalGeneration(GaudiLlavaNextForConditionalGeneration): (image_feature, self.image_newline[None]), dim=0 ) new_image_features.append(image_feature) - image_features = torch.stack(new_image_features, dim=0) + image_features = torch.cat(new_image_features, dim=0) inputs_embeds = self._merge_input_ids_with_image_features( inputs_embeds, image_features, input_ids ) - self.image_offset = ( - image_features.shape[1] - 1 - ) # image_token has occupied 1 token position. # In case input_ids.shape[1] == 1 & pixel_values==None & past_key_values != None, we are in the case of # generation with cache elif past_key_values is not None: @@ -282,12 +367,10 @@ class LlavaNextForConditionalGeneration(GaudiLlavaNextForConditionalGeneration): # Retrieve the first layer to inspect the logits and mask out the hidden states # that are set to 0 first_layer_past_key_value = past_key_values[0][0][:, :, :, 0] - # Sum all dimensions of head_dim (-2) to avoid random errors such as: https://github.com/huggingface/transformers/pull/28032#issuecomment-1863691941 batch_index, non_attended_tokens = torch.where( first_layer_past_key_value.float().sum(-2) == 0 ) - # Get the target length past_length = first_layer_past_key_value.shape[-1] extended_attention_mask = torch.ones( diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/mllama.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/mllama.py index 73536bd65..6ba0ffff8 100644 --- a/backends/gaudi/server/text_generation_server/models/custom_modeling/mllama.py +++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/mllama.py @@ -14,982 +14,279 @@ # limitations under the License. 
"""PyTorch Mllama model.""" -from typing import Optional, Tuple, List +from typing import Optional, Tuple, List, Union import torch import torch.utils.checkpoint -from torch import nn -import flash_attn_2_cuda -from transformers.activations import ACT2FN -import torch.nn.functional as F - -from text_generation_server.layers import ( - TensorParallelColumnLinear, - TensorParallelEmbedding, - TensorParallelRowLinear, - FastLinear, -) -from text_generation_server.layers.attention import ( - Seqlen, -) -from text_generation_server.models.custom_modeling.flash_llama_modeling import ( - FlashLlamaForCausalLM, +from optimum.habana.transformers.models import GaudiMllamaForConditionalGeneration +from optimum.habana.transformers.models.mllama.modeling_mllama import ( + _prepare_cross_attention_mask, ) +from transformers.modeling_outputs import CausalLMOutputWithPast -def _prepare_aspect_ratio_attention_mask( - aspect_ratio_mask: torch.Tensor, - num_patches: int, - target_length: int, - dtype: torch.dtype, -) -> torch.Tensor: - # Expand aspect ratio mask to target_length - batch_size, max_num_tiles = aspect_ratio_mask.shape - attention_mask = aspect_ratio_mask.view(batch_size, max_num_tiles, 1, 1).to(dtype) - attention_mask = attention_mask.repeat(1, 1, target_length, 1) - - # Mask padding patches - pad_patches = target_length - num_patches - attention_mask[:, :, -pad_patches:] = 0 - - # Invert the mask (0 -> 1, 1 -> 0) - attention_mask = 1 - attention_mask - - # Reshape to 2D and create 4D attention mask - # (batch_size, 1, max_num_tiles * target_length, max_num_tiles * target_length) - attention_mask = attention_mask.reshape( - batch_size, max_num_tiles * target_length, 1 - ) - attention_mask = ( - attention_mask @ attention_mask.transpose(-1, -2) * torch.finfo(dtype).min - ) - attention_mask = attention_mask.unsqueeze(1) - - return attention_mask - - -# Copied from transformers.models.llama.modeling_llama._prepare_4d_causal_attention_mask_with_cache_position -def _prepare_4d_causal_attention_mask_with_cache_position( - attention_mask: torch.Tensor, - sequence_length: int, - target_length: int, - dtype: torch.dtype, - device: torch.device, - min_dtype: float, - cache_position: torch.Tensor, - batch_size: int, -): - """ - Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape - `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing. - - Args: - attention_mask (`torch.Tensor`): - A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`. - sequence_length (`int`): - The sequence length being processed. - target_length (`int`): - The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet. - dtype (`torch.dtype`): - The dtype to use for the 4D attention mask. - device (`torch.device`): - The device to plcae the 4D attention mask on. - min_dtype (`float`): - The minimum value representable with the dtype `dtype`. - cache_position (`torch.Tensor`): - Indices depicting the position of the input sequence tokens in the sequence. - batch_size (`torch.Tensor`): - Batch size. - """ - if attention_mask is not None and attention_mask.dim() == 4: - # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing. 
- causal_mask = attention_mask - else: - causal_mask = torch.full( - (sequence_length, target_length), - fill_value=min_dtype, - dtype=dtype, - device=device, - ) - if sequence_length != 1: - causal_mask = torch.triu(causal_mask, diagonal=1) - causal_mask *= torch.arange( - target_length, device=device - ) > cache_position.reshape(-1, 1) - causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) - if attention_mask is not None: - causal_mask = ( - causal_mask.clone() - ) # copy to contiguous memory for in-place edit - mask_length = attention_mask.shape[-1] - padding_mask = ( - causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :] - ) - padding_mask = padding_mask == 0 - causal_mask[:, :, :, :mask_length] = causal_mask[ - :, :, :, :mask_length - ].masked_fill(padding_mask, min_dtype) - - return causal_mask - - -def _prepare_cross_attention_mask( - cross_attention_mask: torch.Tensor, - num_vision_tokens: int, - dtype: str, -) -> Tuple[torch.Tensor, torch.Tensor]: - # reshape so it can be used by attn module - batch_size, text_total_length, *_ = cross_attention_mask.shape - cross_attention_mask = cross_attention_mask.repeat_interleave( - num_vision_tokens, dim=3 - ) - cross_attention_mask = cross_attention_mask.view(batch_size, text_total_length, -1) - cross_attention_mask = cross_attention_mask.unsqueeze(1) - - # invert the mask - inverted_cross_attn_mask = (1.0 - cross_attention_mask).to(dtype) - cross_attention_mask = inverted_cross_attn_mask.masked_fill( - inverted_cross_attn_mask.to(torch.bool), torch.finfo(dtype).min - ) - - # apply full-row bias, which return 4D tensor of shape [B, H, S1, 1] where value is 0 if the a full row in cross attn mask's - # last dimension contains negative infinity values, otherwise it's 1 - negative_inf_value = torch.finfo(dtype).min - full_text_row_masked_out_mask = ( - (cross_attention_mask != negative_inf_value) - .any(dim=-1) - .type_as(cross_attention_mask)[..., None] - ) - cross_attention_mask *= full_text_row_masked_out_mask - - return cross_attention_mask, full_text_row_masked_out_mask - - -# Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->MllamaVision -class MllamaVisionMLP(nn.Module): - def __init__(self, *, prefix, config, weights): - super().__init__() - self.config = config - self.activation_fn = ACT2FN[config.hidden_act] - self.fc1 = TensorParallelColumnLinear.load( - prefix=f"{prefix}.fc1", weights=weights, config=config, bias=True - ) - self.fc2 = TensorParallelRowLinear.load( - prefix=f"{prefix}.fc2", weights=weights, config=config, bias=True - ) - - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - hidden_states = self.fc1(hidden_states) - hidden_states = self.activation_fn(hidden_states) - hidden_states = self.fc2(hidden_states) - return hidden_states - - -class MllamaVisionSdpaAttention(nn.Module): - def __init__(self, *, prefix, config, weights): - super().__init__() - - self.embed_dim = config.hidden_size - self.head_dim = config.hidden_size // config.attention_heads - self.num_heads = config.attention_heads // weights.process_group.size() - - self.qkv_proj = TensorParallelColumnLinear.load_multi( - config, - prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"], - dim=0, - weights=weights, - bias=False, - ) - self.o_proj = TensorParallelRowLinear.load( - config, - prefix=f"{prefix}.o_proj", - weights=weights, - bias=False, - ) +class MllamaForConditionalGeneration(GaudiMllamaForConditionalGeneration): def forward( self, - hidden_state: torch.Tensor, 
+ input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + aspect_ratio_mask: Optional[torch.Tensor] = None, + aspect_ratio_ids: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - qkv = self.qkv_proj(hidden_state) - query, key, value = qkv.split( - [ - self.head_dim * self.num_heads, - self.head_dim * self.num_heads, - self.head_dim * self.num_heads, - ], - dim=2, - ) - - batch_size, q_seq_len, _ = query.shape - _, kv_seq_len, _ = key.shape - - query = query.view(batch_size, q_seq_len, self.num_heads, self.head_dim) - key = key.view(batch_size, kv_seq_len, self.num_heads, self.head_dim) - value = value.view(batch_size, kv_seq_len, self.num_heads, self.head_dim) - - query = query.transpose(1, 2) - key = key.transpose(1, 2) - value = value.transpose(1, 2) - - attn_output = F.scaled_dot_product_attention( - query, key, value, attn_mask=attention_mask - ) - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(batch_size, q_seq_len, -1) - - output = self.o_proj(attn_output) - return output - - -class MllamaVisionEncoderLayer(nn.Module): - def __init__(self, *, prefix, config, weights, is_gated: bool): - super().__init__() - - self.hidden_size = config.hidden_size - self.num_attention_heads = config.attention_heads - self.is_gated = is_gated - self.intermediate_size = config.intermediate_size - - self.self_attn = MllamaVisionSdpaAttention( - prefix=f"{prefix}.self_attn", config=config, weights=weights - ) - self.mlp = MllamaVisionMLP( - prefix=f"{prefix}.mlp", config=config, weights=weights - ) - - self.input_layernorm = nn.LayerNorm.load( - prefix=f"{prefix}.input_layernorm", weights=weights, eps=1e-05 - ) - self.post_attention_layernorm = nn.LayerNorm.load( - prefix=f"{prefix}.post_attention_layernorm", weights=weights, eps=1e-05 - ) - - # there used to be an if else here, no code path - if is_gated: - self.gate_attn = nn.Parameter( - weights.get_tensor(f"{prefix}.gate_attn"), requires_grad=False - ) - self.gate_ffn = nn.Parameter( - weights.get_tensor(f"{prefix}.gate_ffn"), requires_grad=False - ) - - def forward( - self, - hidden_state: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - ): - # Self Attention - residual = hidden_state - hidden_state = self.input_layernorm(hidden_state) - hidden_state = self.self_attn(hidden_state, attention_mask=attention_mask) - gate_attn = 1 if not self.is_gated else self.gate_attn.tanh() - hidden_state = residual + gate_attn * hidden_state - - # Feed forward - residual = hidden_state - hidden_state = self.post_attention_layernorm(hidden_state) - hidden_state = self.mlp(hidden_state) - gate_ffn = 1 if not self.is_gated else self.gate_ffn.tanh() - hidden_state = residual + gate_ffn * hidden_state - return hidden_state - - -class MllamaVisionEncoder(nn.Module): - def __init__(self, *, prefix, config, weights, is_gated: bool, num_layers: int): - super().__init__() - self.config = config - self.layers = [ - MllamaVisionEncoderLayer( - prefix=f"{prefix}.layers.{i}", - config=config, - weights=weights, - is_gated=is_gated, - ) - for i in range(num_layers) - ] - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - ): - encoder_states = [hidden_states] - for encoder_layer in self.layers: - layer_outputs = encoder_layer( - hidden_states, - attention_mask, - ) - - hidden_states = layer_outputs - encoder_states.append(hidden_states) - - return hidden_states, encoder_states 
- - -class MllamaPrecomputedAspectRatioEmbedding(nn.Module): - def __init__(self, *, prefix, config, weights): - super().__init__() - self.max_num_tiles = config.max_num_tiles - self.hidden_size = config.hidden_size - self.max_aspect_ratio_id = config.max_aspect_ratio_id - - self.embedding = TensorParallelEmbedding( - prefix=f"{prefix}.embedding", weights=weights - ) - self.gate = nn.Parameter( - weights.get_tensor(f"{prefix}.gate"), requires_grad=False - ) - - def forward( - self, hidden_state: torch.Tensor, aspect_ratio_ids: torch.Tensor - ) -> torch.Tensor: - embeddings = self.embedding(aspect_ratio_ids) - embeddings = embeddings.reshape(-1, self.max_num_tiles, 1, self.hidden_size) - - # Always gated. - embeddings = embeddings * self.gate.tanh() - - hidden_state = hidden_state + embeddings - return hidden_state - - -class MllamaPrecomputedPositionEmbedding(nn.Module): - def __init__(self, *, prefix, config, weights): - super().__init__() - self.max_num_tiles = config.max_num_tiles - self.max_aspect_ratio_id = config.max_aspect_ratio_id - self.num_patches = (config.image_size // config.patch_size) ** 2 + 1 - self.hidden_size = config.hidden_size - self.scale = config.hidden_size**-0.5 - - self.gate = nn.Parameter( - weights.get_tensor(f"{prefix}.gate"), requires_grad=False - ) - - # position embedding - embedding = nn.Parameter( - weights.get_tensor(f"{prefix}.embedding"), requires_grad=False - ) - self.gated_position_embedding = (1 - self.gate.tanh()) * embedding - self.tile_embedding = TensorParallelEmbedding( - prefix=f"{prefix}.tile_embedding", weights=weights - ) - - def forward( - self, hidden_state: torch.Tensor, aspect_ratio_ids: torch.Tensor - ) -> torch.Tensor: - # position embeddings - hidden_state = hidden_state + self.gated_position_embedding.view( - 1, 1, self.num_patches, self.hidden_size - ) - - # precomputed tile position embeddings - tile_position_embedding = self.tile_embedding(aspect_ratio_ids) - batch_size = hidden_state.shape[0] - tile_position_embedding = tile_position_embedding.reshape( - batch_size, self.max_num_tiles, self.num_patches, self.hidden_size - ) - gated_tile_position_embedding = self.gate.tanh() * tile_position_embedding - hidden_state = hidden_state + gated_tile_position_embedding - - return hidden_state - - -class MllamaVisionModel(nn.Module): - def __init__(self, *, prefix, config, weights): - super().__init__() - self.image_size = config.image_size - self.patch_size = config.patch_size - self.max_num_tiles = config.max_num_tiles - self.hidden_size = config.hidden_size - self.num_channels = config.num_channels - self.intermediate_layers_indices = config.intermediate_layers_indices - - self.num_patches = (self.image_size // self.patch_size) ** 2 + 1 - self.scale = config.hidden_size**-0.5 - self.dtype = weights.dtype - - self.patch_embedding = nn.Conv2d( - in_channels=config.num_channels, - out_channels=self.hidden_size, - kernel_size=self.patch_size, - stride=self.patch_size, - padding="valid", - bias=False, - ) - self.patch_embedding.weight = nn.Parameter( - weights.get_tensor(f"{prefix}.patch_embedding.weight"), requires_grad=False - ) - - self.class_embedding = nn.Parameter( - weights.get_tensor(f"{prefix}.class_embedding"), requires_grad=False - ) - - self.gated_positional_embedding = MllamaPrecomputedPositionEmbedding( - prefix=f"{prefix}.gated_positional_embedding", - config=config, - weights=weights, - ) - - self.pre_tile_positional_embedding = MllamaPrecomputedAspectRatioEmbedding( - prefix=f"{prefix}.pre_tile_positional_embedding", - 
config=config, - weights=weights, - ) - self.post_tile_positional_embedding = MllamaPrecomputedAspectRatioEmbedding( - prefix=f"{prefix}.post_tile_positional_embedding", - config=config, - weights=weights, - ) - - ## layer norms - self.layernorm_pre = nn.LayerNorm.load( - prefix=f"{prefix}.layernorm_pre", - weights=weights, - # torch default - eps=1e-05, - ) - self.layernorm_post = nn.LayerNorm.load( - prefix=f"{prefix}.layernorm_post", - weights=weights, - # torch default - eps=1e-05, - ) - - ## encoders - self.transformer = MllamaVisionEncoder( - prefix=f"{prefix}.transformer", - config=config, - weights=weights, - is_gated=False, - num_layers=config.num_hidden_layers, - ) - self.global_transformer = MllamaVisionEncoder( - prefix=f"{prefix}.global_transformer", - config=config, - weights=weights, - is_gated=True, - num_layers=config.num_global_layers, - ) - - def apply_class_embedding(self, hidden_state: torch.Tensor) -> torch.Tensor: - batch_size, _, hidden_size = hidden_state.shape - class_embedding = self.class_embedding.expand(batch_size, 1, hidden_size) - hidden_state = torch.cat([class_embedding, hidden_state], dim=1) - return hidden_state - - def forward( - self, - pixel_values: torch.Tensor, - aspect_ratio_ids: torch.Tensor, - attention_mask: torch.Tensor, - ) -> torch.Tensor: - batch_size, num_concurrent_media, num_tiles, num_channels, height, width = ( - pixel_values.shape - ) - - pixel_values = pixel_values.reshape( - batch_size * num_concurrent_media * num_tiles, num_channels, height, width - ) - aspect_ratio_ids = aspect_ratio_ids.reshape( - batch_size * num_concurrent_media, -1 - ) - - # patch embedding - patch_embeds = self.patch_embedding(pixel_values) - hidden_state = patch_embeds.flatten(2).transpose(1, 2) - - # tile embeddings - _, num_patches, dim = hidden_state.shape - hidden_state = hidden_state.reshape( - batch_size * num_concurrent_media, num_tiles, -1, dim - ) - hidden_state = self.pre_tile_positional_embedding( - hidden_state, aspect_ratio_ids - ) - - # apply cls token - hidden_state = hidden_state.reshape( - batch_size * num_concurrent_media * num_tiles, num_patches, dim - ) - hidden_state = self.apply_class_embedding(hidden_state) - num_patches += 1 - - # apply position embeddings - hidden_state = hidden_state.reshape( - batch_size * num_concurrent_media, num_tiles, num_patches, dim - ) - hidden_state = self.gated_positional_embedding(hidden_state, aspect_ratio_ids) - - # apply encoder - hidden_state = self.layernorm_pre(hidden_state) - - # Compute the number of tokens to pad - num_padding_patches = (8 - (hidden_state.shape[-2] % 8)) % 8 - # Compute padding tuple for pad function - padding = ( - 0, - 0, - 0, - num_padding_patches, - ) # (pad_left, pad_right, pad_left for dim -2, pad_right for dim -2) - # Pad the tensor - hidden_state = F.pad(hidden_state, padding, mode="constant", value=0) - slice_index = -num_padding_patches if num_padding_patches > 0 else None - - if attention_mask is not None: - attention_mask = attention_mask.reshape( - batch_size * num_concurrent_media, -1 - ) - attention_mask = _prepare_aspect_ratio_attention_mask( - aspect_ratio_mask=attention_mask, - num_patches=self.num_patches, - target_length=hidden_state.shape[2], - dtype=self.dtype, - ) - - hidden_state = hidden_state.view(batch_size * num_concurrent_media, -1, dim) - hidden_state, all_intermediate_hidden_states = self.transformer( - hidden_state, - attention_mask=attention_mask, - ) - intermediate_hidden_states = [ - hidden_state - for idx, hidden_state in 
enumerate(all_intermediate_hidden_states) - if idx in self.intermediate_layers_indices - ] - intermediate_hidden_states = torch.stack(intermediate_hidden_states, dim=-1) - - # apply global encoder - hidden_state = self.layernorm_post(hidden_state) - hidden_state = hidden_state.reshape( - batch_size * num_concurrent_media, - num_tiles, - num_patches + num_padding_patches, - dim, - ) - hidden_state = self.post_tile_positional_embedding( - hidden_state, aspect_ratio_ids - ) - hidden_state = hidden_state.reshape( - batch_size * num_concurrent_media, - num_tiles * (num_patches + num_padding_patches), - dim, - ) - hidden_state, _ = self.global_transformer( - hidden_state, attention_mask=attention_mask - ) - hidden_state = hidden_state.reshape( - batch_size * num_concurrent_media, - num_tiles, - num_patches + num_padding_patches, - dim, - ) - hidden_state = hidden_state[:, :, :slice_index] - - # adding intermediate layer outputs - hidden_state = hidden_state.reshape( - batch_size, num_concurrent_media, num_tiles, num_patches, dim - ) - intermediate_hidden_states = intermediate_hidden_states.reshape( - batch_size * num_concurrent_media, - num_tiles, - num_patches + num_padding_patches, - -1, - ) - intermediate_hidden_states = intermediate_hidden_states[:, :, :slice_index] - intermediate_hidden_states = intermediate_hidden_states.reshape( - batch_size, num_concurrent_media, num_tiles, num_patches, -1 - ) - hidden_state = torch.cat([hidden_state, intermediate_hidden_states], dim=-1) - return hidden_state - - -class MllamaTextCrossAttention(nn.Module): - """Multi-headed attention from 'Attention Is All You Need' paper""" - - def __init__(self, *, prefix, config, weights, layer_idx): - super().__init__() - self.config = config - self.num_heads = self.config.num_attention_heads - self.num_key_value_heads = self.config.num_key_value_heads - self.dropout = config.dropout - self.hidden_size = config.hidden_size - self.head_size = config.hidden_size // self.num_heads - self.num_key_value_groups = self.num_heads // self.num_key_value_heads - self.layer_idx = layer_idx - - self.num_heads = self.num_heads // weights.process_group.size() - self.num_key_value_heads = ( - self.num_key_value_heads // weights.process_group.size() - ) - - self.q_proj = TensorParallelColumnLinear.load( - config, - prefix=f"{prefix}.q_proj", - weights=weights, - bias=False, - ) - self.k_proj = TensorParallelColumnLinear.load( - config, - prefix=f"{prefix}.k_proj", - weights=weights, - bias=False, - ) - self.v_proj = TensorParallelColumnLinear.load( - config, - prefix=f"{prefix}.v_proj", - weights=weights, - bias=False, - ) - self.o_proj = TensorParallelRowLinear.load( - config, - prefix=f"{prefix}.o_proj", - weights=weights, - bias=False, - ) - - self.q_norm = MllamaTextRMSNorm.load( - prefix=f"{prefix}.q_norm", weights=weights, eps=config.rms_norm_eps - ) - self.k_norm = MllamaTextRMSNorm.load( - prefix=f"{prefix}.k_norm", weights=weights, eps=config.rms_norm_eps - ) - self.softmax_scale = self.head_size**-0.5 - - def forward( - self, - hidden_states: torch.Tensor, + cross_attention_mask: Optional[torch.Tensor] = None, cross_attention_states: Optional[torch.Tensor] = None, - # past_key_value=None, - # attention_mask: Optional[torch.Tensor] = None, - # cache_position: Optional[torch.LongTensor] = None, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - """Input shape: Batch x Time x Channel""" - # hidden_states = hidden_states.unsqueeze(0) - # bsz, q_len, _ = hidden_states.size() - query_states = 
self.q_proj(hidden_states) - query_states = query_states.view(-1, self.num_heads, self.head_size) - query_states = self.q_norm(query_states) - - ( - cross_attention_states, - cu_seqlen_q, - cu_seqlen_k, - max_q, - max_k, - indices, - ) = cross_attention_states - - key_states = self.k_proj(cross_attention_states) - value_states = self.v_proj(cross_attention_states) - key_states = key_states.view(-1, self.num_key_value_heads, self.head_size) - value_states = value_states.view(-1, self.num_key_value_heads, self.head_size) - key_states = self.k_norm(key_states) - - # key_states = key_states.repeat(1, self.num_key_value_groups, 1) - # value_states = value_states.repeat(1, self.num_key_value_groups, 1) - - causal = False - # logger.info( - # f"Q: {query_states.shape} -K {key_states.shape} - V{value_states.shape}" - # ) - attn_output = flash_attn_2_cuda.varlen_fwd( - query_states, - key_states, - value_states, - None, - cu_seqlen_q, - cu_seqlen_k, - None, - None, - None, # block_tables - None, - max_q, - max_k, - 0.0, - self.softmax_scale, - False, - causal, # Causal - -1, # window_size_left, - -1, - 0.0, # softcap - False, - None, - )[0] - attn_output = self.o_proj(attn_output.view(-1, self.num_heads * self.head_size)) - - return attn_output - - -# Copied from transformers.models.gemma2.modeling_gemma2.Gemma2MLP with Gemma2->MllamaText -class MllamaTextMLP(nn.Module): - def __init__(self, *, prefix, config, weights): - super().__init__() - self.config = config - self.hidden_size = config.hidden_size - self.intermediate_size = ( - config.intermediate_size // weights.process_group.size() + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + num_logits_to_keep: int = 0, + token_idx: Optional[torch.Tensor] = None, + use_flash_attention: Optional[bool] = True, + flash_attention_recompute: Optional[bool] = True, + **kwargs, + ) -> Union[Tuple, CausalLMOutputWithPast]: + """ + Copied from MllamaForConditionalGeneration::forward: https://github.com/huggingface/transformers/blob/v4.45.2/src/transformers/models/mllama/modeling_mllama.py#L2077 + The only differences are: + - add token_idx input + - add use_flash_attention and flash_attention_recompute + """ + full_text_row_masked_out_mask = kwargs.get( + "full_text_row_masked_out_mask", None ) - self.gate_up_proj = TensorParallelColumnLinear.load_multi( - config, - prefixes=[f"{prefix}.gate_proj", f"{prefix}.up_proj"], - weights=weights, - dim=0, - bias=False, - ) - self.down_proj = TensorParallelRowLinear.load( - config, - prefix=f"{prefix}.down_proj", - weights=weights, - bias=False, - ) - self.act_fn = ACT2FN[config.hidden_act] - - def forward(self, x): - shape = x.shape - gate_up_states = self.gate_up_proj(x) - gate_up_states = gate_up_states.view(*shape[:-1], 2, self.intermediate_size) - result = self.down_proj( - self.act_fn(gate_up_states[:, 0]) * gate_up_states[:, 1] - ) - return result - - -class FlashLlamaCrossLayer(torch.nn.Module): - """Cross-attention transformer block with tanh-gated attention and feedforward.""" - - def __init__(self, *, prefix, config, weights, index) -> None: - layer_idx = index - super().__init__() - self.cross_attn = MllamaTextCrossAttention( - 
prefix=f"{prefix}.cross_attn", - config=config, - weights=weights, - layer_idx=layer_idx, - ) - - self.input_layernorm = MllamaTextRMSNorm.load( - prefix=f"{prefix}.input_layernorm", weights=weights, eps=config.rms_norm_eps - ) - self.cross_attn_attn_gate = torch.nn.Parameter( - weights.get_tensor(f"{prefix}.cross_attn_attn_gate"), requires_grad=False - ) - - self.mlp = MllamaTextMLP(prefix=f"{prefix}.mlp", config=config, weights=weights) - self.post_attention_layernorm = MllamaTextRMSNorm.load( - prefix=f"{prefix}.post_attention_layernorm", - weights=weights, - eps=config.rms_norm_eps, - ) - self.cross_attn_mlp_gate = torch.nn.Parameter( - weights.get_tensor(f"{prefix}.cross_attn_mlp_gate"), requires_grad=False - ) - self.layer_idx = layer_idx - - def forward( - self, - hidden_states, - residual, - cos, - sin, - cu_seqlen_prefill, - kv_cache, - block_tables, - slots, - seqlen, - max_s, - adapter_data, - cross_attention_states, # [ IB, ...] - ) -> Tuple[torch.Tensor, torch.Tensor]: - if cross_attention_states is None: - return hidden_states, residual - if residual is not None: - hidden_states += residual - - indices = cross_attention_states[-1] - out_hidden_states = hidden_states[:] - if len(indices) > 0: - assert max(indices) < hidden_states.shape[0] - hidden_states = hidden_states[indices] - residual = hidden_states - hidden_states = self.input_layernorm(hidden_states) - - hidden_states = self.cross_attn( - hidden_states=hidden_states, - # attention_mask=cross_attention_mask, - cross_attention_states=cross_attention_states, - ) - hidden_states = residual + self.cross_attn_attn_gate.tanh() * hidden_states - - residual = hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = residual + self.cross_attn_mlp_gate.tanh() * hidden_states - - out_hidden_states[indices] = hidden_states - hidden_states = out_hidden_states - - return hidden_states, None - - -# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->MllamaText -class MllamaTextRMSNorm(nn.Module): - def __init__(self, weight, eps): - super().__init__() - self.weight = weight - self.variance_epsilon = eps - - @classmethod - def load(cls, *, prefix, weights, eps): - weight = nn.Parameter( - weights.get_tensor(f"{prefix}.weight"), requires_grad=False - ) - return cls(weight=weight, eps=eps) - - def forward(self, hidden_states): - input_dtype = hidden_states.dtype - hidden_states = hidden_states.to(torch.float32) - variance = hidden_states.pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) - return self.weight * hidden_states.to(input_dtype) - - def extra_repr(self): - return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}" - - -class MllamaForConditionalGeneration(nn.Module): - def __init__(self, prefix, config, weights): - super().__init__() - config.vision_config.quantize = None - config.vision_config.speculator = config.speculator - config.text_config.quantize = config.quantize - config.text_config.speculator = config.speculator - config.text_config._attn_implementation = "sdpa" - self.hidden_size = config.text_config.hidden_size - self.vision_model = MllamaVisionModel( - prefix="vision_model", config=config.vision_config, weights=weights - ) - self.multi_modal_projector = FastLinear.load( - prefix="multi_modal_projector", config=config, weights=weights, bias=True - ) - self.text_model = FlashLlamaForCausalLM( - prefix="language_model", config=config.text_config, 
weights=weights - ) - self.config = config - self.dtype = weights.dtype - self.device = weights.device - - def vision_forward(self, pixel_values, aspect_ratio_ids, aspect_ratio_mask): - if aspect_ratio_ids is None: + if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError( - "`aspect_ratio_ids` must be provided if `pixel_values` is provided" - ) - # logger.info(f"PIxel values {pixel_values.shape}") - batch_size = pixel_values.shape[0] - vision_states = self.vision_model( - pixel_values, aspect_ratio_ids, aspect_ratio_mask - ) - cross_attention_states = self.multi_modal_projector(vision_states).reshape( - -1, vision_states.shape[-2], self.hidden_size - ) - _, _, h = cross_attention_states.shape - cross_attention_states = cross_attention_states.view(batch_size, -1, h) - # logger.info(f"cross {cross_attention_states.shape}") - return cross_attention_states - - def forward( - self, - input_ids: torch.Tensor, - position_ids: torch.Tensor, - cu_seqlen_prefill: Optional[torch.Tensor], - kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], - block_tables: torch.Tensor, - slots: torch.Tensor, - seqlen: Seqlen, - max_s: int, - prefill_cache_indices: Optional[torch.Tensor], - lm_head_indices: Optional[torch.Tensor], - adapter_data: Optional[torch.Tensor] = None, - # XXX: Putting these as optional so that the cuda warmup calls can go through. - cross_attention_states: Optional[torch.Tensor] = None, - image_indices=None, - ): - if cross_attention_states is not None: - seqlen_q = len(image_indices) - n_images = cross_attention_states.shape[0] - seqlen_k = cross_attention_states.shape[1] - device = cross_attention_states.device - if cu_seqlen_prefill is not None: - offset = 0 - cu_q = [] - indices = [] - for index in image_indices: - cu_q.append(offset) - length = seqlen.input_lengths[index].item() - assert index < seqlen.cu_seqlen_q.shape[0] - input_ids_offset = seqlen.cu_seqlen_q[index] - indices.extend(range(input_ids_offset, input_ids_offset + length)) - offset += length - cu_q.append(offset) - cu_seqlen_q = torch.Tensor(cu_q).to(device=device, dtype=torch.int32) - - assert max(indices) < input_ids.shape[0] - - cu_seqlen_k = ( - torch.arange( - n_images + 1, - device=device, - dtype=torch.int32, - ) - * seqlen_k - ) - max_q = cu_seqlen_q[-1].item() - max_k = seqlen_k - else: - cu_seqlen_q = torch.arange( - seqlen_q + 1, device=device, dtype=torch.int32 - ) - seqlen_k = cross_attention_states.shape[1] - n_images = cross_attention_states.shape[0] - cu_seqlen_k = ( - torch.arange( - n_images + 1, - device=device, - dtype=torch.int32, - ) - * seqlen_k - ) - max_q = seqlen_q - max_k = seqlen_k - indices = image_indices[:] - - cross_attention_states = ( - cross_attention_states, - cu_seqlen_q, - cu_seqlen_k, - max_q, - max_k, - indices, + "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" ) - outputs = self.text_model( + outputs = self.language_model( input_ids=input_ids, + attention_mask=attention_mask, position_ids=position_ids, - cu_seqlen_prefill=cu_seqlen_prefill, - kv_cache=kv_cache, - block_tables=block_tables, - slots=slots, - seqlen=seqlen, - max_s=max_s, - prefill_cache_indices=prefill_cache_indices, - lm_head_indices=lm_head_indices, - adapter_data=adapter_data, cross_attention_states=cross_attention_states, + cross_attention_mask=cross_attention_mask, + full_text_row_masked_out_mask=full_text_row_masked_out_mask, + past_key_values=past_key_values, + use_cache=use_cache, + inputs_embeds=inputs_embeds, + labels=labels, + 
output_hidden_states=output_hidden_states, + output_attentions=output_attentions, + return_dict=return_dict, + cache_position=cache_position, + num_logits_to_keep=num_logits_to_keep, + token_idx=token_idx, + use_flash_attention=use_flash_attention, + flash_attention_recompute=flash_attention_recompute, ) + logits = outputs[0] + if not return_dict: + output = (logits,) + outputs[1:] + return output + return outputs + + def prepare_inputs_for_generation( + self, + input_ids=None, + inputs_embeds=None, + attention_mask=None, + position_ids=None, + pixel_values=None, + aspect_ratio_ids=None, + aspect_ratio_mask=None, + cross_attention_mask=None, + past_key_values=None, + use_cache=False, + cache_position=None, + num_logits_to_keep=None, + **kwargs, + ): + """ + Copied from MllamaForConditionalGeneration::prepare_inputs_for_generation: https://github.com/huggingface/transformers/blob/v4.45.2/src/transformers/models/mllama/modeling_mllama.py#L2208 + The only differences are: + - add token_idx handling + - add bucket_internal handling + - add use_flash_attention and flash_attention_recompute + """ + + token_idx = kwargs.get("token_idx", None) + if token_idx is None: + return super().prepare_inputs_for_generation( + input_ids=input_ids, + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + position_ids=position_ids, + pixel_values=pixel_values, + aspect_ratio_ids=aspect_ratio_ids, + aspect_ratio_mask=aspect_ratio_mask, + cross_attention_mask=cross_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + cache_position=cache_position, + **kwargs, + ) + else: + use_flash_attention = kwargs.get("use_flash_attention", True) + flash_attention_recompute = kwargs.get("flash_attention_recompute", True) + position_ids = kwargs.get("position_ids", None) + output_attentions = kwargs.get("output_attentions", None) + output_hidden_states = kwargs.get("output_hidden_states", None) + return_dict = kwargs.get("return_dict", None) + labels = kwargs.get("labels", None) + cross_attention_states = kwargs.get("cross_attention_states", None) + + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + bucket_internal = kwargs.get("bucket_internal", None) + + if past_key_values is not None: + if token_idx is not None: + input_ids = torch.index_select(input_ids, 1, token_idx - 1) + elif inputs_embeds is not None: # Exception 1 + input_ids = input_ids[:, -cache_position.shape[0] :] + elif ( + input_ids.shape[1] != cache_position.shape[0] + ): # Default case (the "else", a no op, is Exception 2) + input_ids = input_ids[:, cache_position] + elif bucket_internal and token_idx is not None: + # for the 1st token we can slice the inputs till token idx for the fwd pass. + input_ids = input_ids[:, :token_idx] + attention_mask = attention_mask[:, :token_idx] + if cross_attention_mask is not None: + cross_attention_mask = cross_attention_mask[:, :token_idx, ...] 
+ + # TODO: we have no attention_mask so this won't work, check if we really won't need attention mask and find another way + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + if token_idx is not None: + position_ids = torch.index_select( + position_ids, 1, token_idx - 1 + ) + else: + position_ids = position_ids[:, -input_ids.shape[1] :] + + # This `clone` call is needed to avoid recapturing cuda graphs with `torch.compile`'s `mode="reduce-overhead`, as otherwise the input `position_ids` would have various stride during the decoding. Here, simply using `.contiguous()` is not sufficient as in the batch size = 1 case, `position_ids` is already contiguous but with varying stride which retriggers a capture. + position_ids = position_ids.clone( + memory_format=torch.contiguous_format + ) + + if pixel_values is not None and inputs_embeds is not None: + raise ValueError( + "You cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one" + ) + + if pixel_values is not None and cross_attention_states is not None: + raise ValueError( + "`pixel_values` and `cross_attention_states` cannot be provided simultaneously" + ) + + if pixel_values is not None: + if aspect_ratio_ids is None: + raise ValueError( + "`aspect_ratio_ids` must be provided if `pixel_values` is provided" + ) + # get vision tokens from vision model + vision_outputs = self.vision_model( + pixel_values=pixel_values, + aspect_ratio_ids=aspect_ratio_ids, + aspect_ratio_mask=aspect_ratio_mask, + output_hidden_states=output_hidden_states, + output_attentions=output_attentions, + return_dict=return_dict, + use_flash_attention=use_flash_attention, + ) + cross_attention_states = vision_outputs[0] + cross_attention_states = self.multi_modal_projector( + cross_attention_states + ).reshape(-1, cross_attention_states.shape[-2], self.hidden_size) + + if cross_attention_mask is not None: + cross_attention_mask, full_text_row_masked_out_mask = ( + _prepare_cross_attention_mask( + cross_attention_mask, + num_vision_tokens=self.vision_model.num_patches, + dtype=self.dtype, + token_idx=token_idx, + ) + ) + else: + full_text_row_masked_out_mask = None + + if cross_attention_mask is not None: + if cache_position is not None: + cross_attention_mask = cross_attention_mask[:, :, cache_position] + full_text_row_masked_out_mask = full_text_row_masked_out_mask[ + :, :, cache_position + ] + elif past_key_values is not None: + if token_idx is not None: + cross_attention_mask = torch.index_select( + cross_attention_mask, -2, token_idx - 1 + ) + full_text_row_masked_out_mask = torch.index_select( + full_text_row_masked_out_mask, -2, token_idx - 1 + ) + else: + cross_attention_mask = cross_attention_mask[:, :, -1:] + full_text_row_masked_out_mask = full_text_row_masked_out_mask[ + :, :, -1: + ] + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None} + else: + # The clone here is for the same reason as for `position_ids`. 
+ model_inputs = { + "input_ids": input_ids.clone(memory_format=torch.contiguous_format), + "inputs_embeds": None, + } + + if num_logits_to_keep is not None: + model_inputs["num_logits_to_keep"] = num_logits_to_keep + + # keep cache_position implementation as None for HPU + cache_position = None + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + "token_idx": token_idx, + "labels": labels, + "return_dict": kwargs.get("return_dict"), + "full_text_row_masked_out_mask": full_text_row_masked_out_mask, + "use_flash_attention": use_flash_attention, + "cross_attention_mask": cross_attention_mask, + "cross_attention_states": cross_attention_states, + "output_attentions": output_attentions, + "flash_attention_recompute": flash_attention_recompute, + } + ) + + return model_inputs diff --git a/backends/gaudi/server/text_generation_server/models/vlm_causal_lm.py b/backends/gaudi/server/text_generation_server/models/vlm_causal_lm.py index cef761b41..66e00171b 100644 --- a/backends/gaudi/server/text_generation_server/models/vlm_causal_lm.py +++ b/backends/gaudi/server/text_generation_server/models/vlm_causal_lm.py @@ -8,7 +8,6 @@ from io import BytesIO from opentelemetry import trace from loguru import logger from typing import Iterable, Optional, Tuple, List, Type, Dict -import itertools import tempfile import copy from text_generation_server.models import Model @@ -19,7 +18,6 @@ from text_generation_server.models.causal_lm import ( CausalLMBatch, CausalLMRequest, remove_kv_cache_from_output, - biggest_single_chunk, ) from transformers.models.llava_next.modeling_llava_next import ( @@ -68,18 +66,19 @@ IDEFICS2_IMAGE_TOKEN = "" IMAGES = re.compile(r"!\[[^\]]*\]\((.*?)\s*(\"(?:.*[^\"])\")?\s*\)") BASE_IMAGE_TOKENS = int(os.environ.get("BASE_IMAGE_TOKENS", 2048)) MAX_TOTAL_TOKENS = int(os.environ.get("MAX_TOTAL_TOKENS", 8192)) -PAD_SEQUENCE_TO_MULTIPLE_OF = int(os.environ.get("PAD_SEQUENCE_TO_MULTIPLE_OF", 256)) +PAD_SEQUENCE_TO_MULTIPLE_OF = int(os.environ.get("PAD_SEQUENCE_TO_MULTIPLE_OF", 128)) CHUNK_SIZES = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048] LAZY_MODE = int(os.environ.get("PT_HPU_LAZY_MODE", 1)) -MAX_BATCH_SIZE = ( - int(os.environ.get("MAX_BATCH_SIZE")) - if os.environ.get("MAX_BATCH_SIZE") is not None - else None -) +max_batch_size_str = os.environ.get("MAX_BATCH_SIZE") +if max_batch_size_str is not None: + MAX_BATCH_SIZE = int(max_batch_size_str) +else: + raise ValueError("MAX_BATCH_SIZE is not set") PREFILL_WARMUP_BATCH_SIZE_LIST = [] PREFILL_WARMUP_SEQLEN_LIST = [] DECODE_WARMUP_BATCH_SIZE_LIST = [] +CROSS_ATTENTION_LAYERS = [] def round_up(warmup_list: list, num): @@ -87,7 +86,7 @@ def round_up(warmup_list: list, num): for i in warmup_list: if num <= i: break - return i + return i if i > 0 else num def split(string) -> List[Dict[str, str]]: @@ -107,20 +106,17 @@ def split(string) -> List[Dict[str, str]]: return parts -def image_text_replacement(processor, image_input, config, image_id: int) -> str: +def image_text_replacement(config) -> str: if config.model_type == "idefics2": image_seq_len = 64 image_str = f"{IDEFICS2_FAKE_TOKEN}{IDEFICS2_IMAGE_TOKEN * image_seq_len}{IDEFICS2_FAKE_TOKEN}" - if processor.image_processor.do_image_splitting: - image_str *= 5 return image_str elif config.model_type == "llava_next": - height, width = image_input["image_sizes"][image_id] - num_features = get_number_of_features(height, width, config) - return "" * num_features - 
+        return "<image>"
     elif config.model_type == "paligemma":
-        return "<image>" * config.text_config.num_image_tokens
+        return "<image>"
+    elif config.model_type == "mllama":
+        return "<|image|>"
     else:
         raise RuntimeError(f"Unknown config {config.model_type} for multimodal")
@@ -192,6 +188,100 @@ class VlmCausalLMBatch(CausalLMBatch):
     pixel_values: Optional[List[torch.Tensor]]
     pixel_attention_mask: Optional[List[torch.Tensor]]
     image_sizes: Optional[List[Tuple[int, int]]]
+    aspect_ratio_ids: Optional[torch.Tensor] = None
+    aspect_ratio_mask: Optional[torch.Tensor] = None
+    cross_attention_mask: Optional[torch.Tensor] = None
+    prefilling: bool = True
+    token_idx: torch.Tensor = None
+
+    def __init__(
+        self,
+        batch_id,
+        requests,
+        input_ids,
+        attention_mask,
+        position_ids,
+        past_key_values,
+        merged_kv_cache,
+        next_token_chooser,
+        top_n_tokens,
+        top_n_tokens_tensor,
+        input_length,
+        pixel_values: Optional[List[torch.Tensor]] = None,
+        pixel_attention_mask: Optional[List[torch.Tensor]] = None,
+        image_sizes: Optional[List[Tuple[int, int]]] = None,
+        aspect_ratio_ids: Optional[torch.Tensor] = None,
+        aspect_ratio_mask: Optional[torch.Tensor] = None,
+        cross_attention_mask: Optional[torch.Tensor] = None,
+        prefilling: Optional[bool] = True,
+    ):
+        super().__init__(
+            batch_id=batch_id,
+            requests=requests,
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            merged_kv_cache=merged_kv_cache,
+            next_token_chooser=next_token_chooser,
+            top_n_tokens=top_n_tokens,
+            top_n_tokens_tensor=top_n_tokens_tensor,
+            input_length=input_length,
+        )
+
+        self.pixel_values = pixel_values
+        self.pixel_attention_mask = pixel_attention_mask
+        self.image_sizes = image_sizes
+        self.aspect_ratio_ids = aspect_ratio_ids
+        self.aspect_ratio_mask = aspect_ratio_mask
+        self.cross_attention_mask = cross_attention_mask
+        self.prefilling = prefilling
+
+    @property
+    def token_idx(self):
+        if self.prefilling:
+            # no right padding for prefill
+            token_idx_scalar = self.attention_mask.shape[-1] - 1
+            return torch.tensor(token_idx_scalar).to(self.attention_mask.device)
+        else:
+            token_idx_scalar = self.attention_mask.shape[-1] - self.right_padding
+            return torch.tensor(token_idx_scalar).to(self.attention_mask.device)
+
+    def padding_process(self, pad_id: int):
+        # self.input_ids = torch.index_select(self.input_ids, 1, self.token_idx - 1)
+        right_padding = MAX_TOTAL_TOKENS - self.attention_mask.shape[1]
+        self.input_ids = torch.nn.functional.pad(
+            self.input_ids, (0, right_padding), value=pad_id
+        )
+        self.attention_mask = torch.nn.functional.pad(
+            self.attention_mask, (0, right_padding), value=0
+        )
+        # if self.position_ids is not None:
+        #     self.position_ids = torch.index_select(self.position_ids, 1, self.token_idx - 1) + 1
+        if self.cross_attention_mask is not None:
+            self.cross_attention_mask = torch.nn.functional.pad(
+                self.cross_attention_mask, (0, 0, 0, 0, 0, right_padding), value=0
+            )
+        if self.past is not None:
+            past_key_values_list = list(self.past_key_values)
+            for layer_id in range(len(self.past)):
+                past_key_value_list = list(self.past_key_values[layer_id])
+                if layer_id not in CROSS_ATTENTION_LAYERS:
+                    past_key_value_list[0] = torch.nn.functional.pad(
+                        self.past_key_values[layer_id][0],
+                        (0, 0, 0, right_padding),
+                        value=0,
+                    )
+                    past_key_value_list[1] = torch.nn.functional.pad(
+                        self.past_key_values[layer_id][1],
+                        (0, 0, 0, right_padding),
+                        value=0,
+                    )
+                past_key_values_list[layer_id] = tuple(past_key_value_list)
+            self.past_key_values =
tuple(past_key_values_list) + + self.prefilling = False + self.input_length = self.input_length @classmethod def from_tokenized( @@ -239,23 +329,23 @@ class VlmCausalLMBatch(CausalLMBatch): bucket_size = max_input_length left_padding = max_input_length - input_len if is_warmup is False: - if input_len < max_input_length: - rounded_seq_len = round_up(PREFILL_WARMUP_SEQLEN_LIST, input_len + 1) - if rounded_seq_len <= max_input_length: - bucket_size = rounded_seq_len - 1 - else: - bucket_size = max_input_length - 1 - left_padding = bucket_size - input_len + rounded_seq_len = round_up(PREFILL_WARMUP_SEQLEN_LIST, input_len + 1) + bucket_size = rounded_seq_len - 1 + left_padding = bucket_size - input_len input_ids = tokenized_inputs["input_ids"] attention_mask = tokenized_inputs["attention_mask"] + cross_attention_mask = tokenized_inputs.get("cross_attention_mask", None) # Allocate space for first token - if left_padding > 0: - input_ids = torch.nn.functional.pad( - input_ids, (left_padding, 1), value=tokenizer.pad_token_id - ) - attention_mask = torch.nn.functional.pad( - attention_mask, (left_padding, 1), value=0 + input_ids = torch.nn.functional.pad( + input_ids, (left_padding, 1), value=tokenizer.pad_token_id + ) + attention_mask = torch.nn.functional.pad( + attention_mask, (left_padding, 1), value=0 + ) + if cross_attention_mask is not None: + cross_attention_mask = torch.nn.functional.pad( + cross_attention_mask, (0, 0, 0, 0, left_padding, 1), value=0 ) all_input_ids = torch.nn.functional.pad( input_ids, (0, max_new_tokens), value=tokenizer.pad_token_id @@ -270,9 +360,13 @@ class VlmCausalLMBatch(CausalLMBatch): r.all_input_ids = all_input_ids[r.idx] input_ids = input_ids.to(device) attention_mask = attention_mask.to(device) + cross_attention_mask = ( + cross_attention_mask.to(device) + if cross_attention_mask is not None + else None + ) position_ids = attention_mask.long().cumsum(-1) - 1 position_ids.masked_fill_(attention_mask == 0, 1) - htorch.core.mark_step() return cls( @@ -287,6 +381,7 @@ class VlmCausalLMBatch(CausalLMBatch): top_n_tokens=top_n_tokens, top_n_tokens_tensor=top_n_tokens_tensor, input_length=input_len, + cross_attention_mask=cross_attention_mask, ) @classmethod @@ -298,46 +393,40 @@ class VlmCausalLMBatch(CausalLMBatch): config, is_warmup, ): - # Process images first. We need all of them so that the processor - # can make the image splits the same size. And we need the final - # sizes to insert correct number of image tokens. 
+ image_inputs = {} + texts = [] images = [] - for r in requests: + batch_tokenized_inputs = {} + + for i, r in enumerate(requests): + # Each input is encoded into a list, where each element of this input list is either a string or a URL + curr_text = "" + curr_image = None for chunk in r.input_chunks.chunks: chunk_type = chunk.WhichOneof("chunk") if chunk_type == "text": - pass + curr_text += chunk.text elif chunk_type == "image": image = Image.open(BytesIO(chunk.image.data)) - if config.model_type == "llava_next": - images.append(image) - else: - images.append([image]) + # TODO unsure about BOS + curr_image = image else: raise RuntimeError(f"Invalid chunk type {chunk_type}") - image_inputs = None - if images: - image_inputs = processor.image_processor(images, return_tensors="pt") - - batch_inputs = [] - max_truncation = 0 - image_id = 0 - for r in requests: - full_text = "" - for chunk in r.input_chunks.chunks: - chunk_type = chunk.WhichOneof("chunk") - if chunk_type == "text": - full_text += chunk.text - elif chunk_type == "image": - full_text += image_text_replacement( - processor, image_inputs, config, image_id + if image_text_replacement(config) not in curr_text: + if "" in curr_text: + curr_text = curr_text.replace( + "", image_text_replacement(config) ) - image_id += 1 - full_text = image_text_replacement_fixup(config, full_text) + else: + curr_text = image_text_replacement(config) + curr_text - batch_inputs.append(full_text) - max_truncation = max(max_truncation, r.truncate) + texts.append(curr_text) + if curr_image is not None: + if config.model_type == "mllama": + images.append([curr_image]) + else: + images.append(curr_image) missing_inputs = 0 dummy_images = None @@ -346,45 +435,48 @@ class VlmCausalLMBatch(CausalLMBatch): missing_inputs = new_bs - len(requests) if missing_inputs > 0: dummy_inputs = [] - if len(batch_inputs) > 0: - dummy_inputs = [batch_inputs[0]] * missing_inputs + if len(texts) > 0: + dummy_inputs = [texts[0]] * missing_inputs + dummy_images = [images[0]] * missing_inputs + texts += dummy_inputs + images += dummy_images - batch_inputs += dummy_inputs - - batch_tokenized_inputs = tokenizer( - batch_inputs, + processor_output = processor( + images, + texts, truncation=True, - max_length=max_truncation, - add_special_tokens=not config.model_type == "paligemma", + max_length=r.truncate, + add_special_tokens=r.add_special_tokens, return_tensors="pt", + padding_side="left", padding="longest", - return_token_type_ids=False, ) - - if missing_inputs > 0 and image_inputs is not None: - dummy_shape = list(image_inputs["pixel_values"].shape) - dummy_shape[0] = missing_inputs - dummy_images = torch.rand(dummy_shape) - new_image_inputs = { - "pixel_values": torch.cat( - (image_inputs["pixel_values"], dummy_images), dim=0 - ), - } - if "pixel_attention_mask" in image_inputs: - dummy_shape = list(image_inputs["pixel_attention_mask"].shape) - dummy_shape[0] = missing_inputs - dummy_attention = torch.zeros(dummy_shape) - new_image_inputs["pixel_attention_mask"] = torch.cat( - (image_inputs["pixel_attention_mask"], dummy_attention), dim=0 - ) - if "image_sizes" in image_inputs: - dummy_shape = list(list(image_inputs["image_sizes"])[0]) - dummy_shape = missing_inputs * [dummy_shape] - dummy_sizes = torch.IntTensor(dummy_shape) - new_image_inputs["image_sizes"] = torch.cat( - (image_inputs["image_sizes"], dummy_sizes), dim=0 - ) - image_inputs = new_image_inputs + if "input_ids" in processor_output: + batch_tokenized_inputs.update({"input_ids": processor_output["input_ids"]}) + 
if "attention_mask" in processor_output: + batch_tokenized_inputs.update( + {"attention_mask": processor_output["attention_mask"]} + ) + if "cross_attention_mask" in processor_output: + batch_tokenized_inputs.update( + {"cross_attention_mask": processor_output["cross_attention_mask"]} + ) + if "pixel_values" in processor_output: + image_inputs.update({"pixel_values": processor_output["pixel_values"]}) + if "pixel_attention_mask" in processor_output: + image_inputs.update( + {"pixel_attention_mask": processor_output["pixel_attention_mask"]} + ) + if "aspect_ratio_ids" in processor_output: + image_inputs.update( + {"aspect_ratio_ids": processor_output["aspect_ratio_ids"]} + ) + if "aspect_ratio_mask" in processor_output: + image_inputs.update( + {"aspect_ratio_mask": processor_output["aspect_ratio_mask"]} + ) + if "image_sizes" in processor_output: + image_inputs.update({"image_sizes": processor_output["image_sizes"]}) return batch_tokenized_inputs, image_inputs @@ -402,7 +494,9 @@ class VlmCausalLMBatch(CausalLMBatch): batch_tokenized_inputs, image_inputs = cls.batch_tokenized_inputs( pb.requests, tokenizer, processor, config, is_warmup ) - batch = cls.from_tokenized(pb, tokenizer, batch_tokenized_inputs, dtype, device) + batch = cls.from_tokenized( + pb, tokenizer, batch_tokenized_inputs, dtype, device, is_warmup=is_warmup + ) if image_inputs is not None: batch.pixel_values = image_inputs["pixel_values"].to(device=device) if "pixel_attention_mask" in image_inputs: @@ -415,10 +509,26 @@ class VlmCausalLMBatch(CausalLMBatch): batch.image_sizes = image_inputs["image_sizes"].to(device=device) else: batch.image_sizes = None + if "aspect_ratio_ids" in image_inputs: + batch.aspect_ratio_ids = image_inputs["aspect_ratio_ids"].to( + device=device + ) + else: + batch.aspect_ratio_ids = None + if "aspect_ratio_mask" in image_inputs: + batch.aspect_ratio_mask = image_inputs["aspect_ratio_mask"].to( + device=device + ) + else: + batch.aspect_ratio_mask = None else: batch.pixel_values = None batch.pixel_attention_mask = None batch.image_sizes = None + batch.aspect_ratio_ids = None + batch.aspect_ratio_mask = None + batch.cross_attention_mask = None + return batch @classmethod @@ -440,107 +550,231 @@ class VlmCausalLMBatch(CausalLMBatch): ) -> "VlmCausalLMBatch": if not all(b.past_key_values is not None for b in batches): raise ValueError("KV cache not allocated! Cannot recombine before prefill!") + # Used for padding total_requests = sum(len(b) for b in batches) new_bs = total_requests - if is_warmup is False: + if not is_warmup: new_bs = round_up(DECODE_WARMUP_BATCH_SIZE_LIST, total_requests) - batch_id = batches[0].batch_id - device = batches[0].input_ids.device - input_lengths = [b.input_length for b in batches] - max_input_length = max(input_lengths) - offsets = [max_input_length - b.input_length for b in batches] - - cur_padding = [b.right_padding for b in batches] - # For prefill there is a space allocated only for first token - # Need to add padding to the max total tokens before first decode - - moves_needed = [ - total_requests - len(b) if b.batch_size == new_bs else total_requests - for b in batches - ] - dst_batch_idx = min(enumerate(moves_needed), key=lambda idx_val: idx_val[1])[0] - reshape = batches[dst_batch_idx].batch_size < new_bs - - # TODO: Add support for changing max seq len, i.e. 
due to output length bucketing - # FIXME: max_seq_len for non optimized code if len(batches) > 1: scenario = "CONCAT" - elif reshape: - scenario = "RESHAPE" - elif cur_padding[dst_batch_idx] <= 0: + elif batches[0].prefilling: scenario = "SHIFT" - offsets = [ - biggest_single_chunk(b.max_input_length - max_input_length) - for b in batches - ] - max_input_length = max_input_length + offsets[dst_batch_idx] else: - # Nothing to do return batches[0] dbg_trace( scenario, f"bs:{[b.batch_size for b in batches]}->{new_bs}" - f" reqs:{[len(b) for b in batches]}" - f" offsets:{offsets}" - f" input_lengths:{input_lengths}" - f" cur_padding:{cur_padding}" - f" dst_batch:{dst_batch_idx}", + f" reqs:{[len(b) for b in batches]}", ) - grouped_requests = [[req for req in batch.requests] for batch in batches] - flat_requests = list(itertools.chain(*grouped_requests)) + if scenario == "SHIFT": + batch = batches[0] + batch.padding_process(pad_token_id) + return batch - for i in range(len(batches)): - target_bs = new_bs if i == dst_batch_idx else batches[i].batch_size - batches[i].merge_kv_cache_if_needed(target_bs, offsets[i]) - batches[i].realign(target_bs, offsets[i], pad_token_id) - batches[i].split_kv_cache_if_needed(i == dst_batch_idx) - batches[dst_batch_idx].expand_bs(new_bs) - batches[dst_batch_idx].move_data( - [batches[i] for i in range(len(batches)) if i != dst_batch_idx] - ) + total_batch_size = 0 + max_input_length = 0 + for i, batch in enumerate(batches): + total_batch_size += len(batch) + max_input_length = max(max_input_length, batch.input_length) + # Batch attributes + requests = [] + input_lengths = [] + top_n_tokens = [] + parameters = [] + fsm_grammar_states = [] - top_n_tokens = [r.data.top_n_tokens for r in flat_requests] - top_n_tokens_tensor = torch.tensor( - top_n_tokens, device=device, dtype=torch.int64 - ) + # Batch tensors + input_ids = None + attention_mask = None + position_ids = None + past_key_values = [] + top_n_tokens_tensor = None + cross_attention_mask = None + # Used for slicing correctly inside the tensors + # Equivalent to a cumsum on batch sizes + start_index = 0 + for i, batch in enumerate(batches): + keep_indices = [] + for req in batch.requests: + keep_indices.append(req.idx) - parameters = [r.data.parameters for r in flat_requests] - # append the dummy parameters for dummy requests - batch_size = batches[dst_batch_idx].batch_size - parameters = pad_next_token_chooser_parameters(parameters, batch_size) + requests.extend(batch.requests) + parameters.extend([r.data.parameters for r in batch.requests]) + fsm_grammar_states.extend( + [batch.next_token_chooser.fsm_grammar_states[i] for i in keep_indices] + ) + input_lengths.extend([batch.input_length]) + top_n_tokens.extend([batch.top_n_tokens[i] for i in keep_indices]) - # update past grammar states - fsm_grammar_states = [0] * batch_size - for batch in batches: - for i, req in enumerate(batch.requests): - fsm_grammar_states[req.idx] = ( - batch.next_token_chooser.fsm_grammar_states[i] + # Slicing end index for this batch + end_index = start_index + len(batch) + + # We only concatenate batches that did at least one step + if batch.past_key_values is None: + raise ValueError("only concatenate prefilled batches") + + # Create empty tensor + # input_ids is always of shape [batch_size, 1] + # We do not need to pad it + if input_ids is None: + input_ids = batch.input_ids.new_empty((new_bs, MAX_TOTAL_TOKENS)) + # # Copy to correct indices + + left_offset = max_input_length - batch.input_length + right_padding = 
MAX_TOTAL_TOKENS - max_input_length + input_ids[start_index:end_index, left_offset:-right_padding] = ( + batch.input_ids[keep_indices, : batch.input_length] + ) + + # Create padded tensor + if top_n_tokens_tensor is None: + top_n_tokens_tensor = batches[0].top_n_tokens_tensor.new_zeros( + new_bs, + ) + top_n_tokens_tensor[start_index:end_index] = batch.top_n_tokens_tensor[ + keep_indices + ] + + if attention_mask is None: + attention_mask = batch.attention_mask.new_zeros( + (new_bs, MAX_TOTAL_TOKENS), ) + attention_mask[ + start_index:end_index, + left_offset:-right_padding, + ] = batch.attention_mask[ + keep_indices, + : batch.input_length, + ] + + if batch.cross_attention_mask is not None: + cross_attention_mask_shape = list(batch.cross_attention_mask.shape) + cross_attention_mask_shape[1] = MAX_TOTAL_TOKENS + cross_attention_mask_shape[0] = new_bs + cross_attention_mask_shape = torch.Size(cross_attention_mask_shape) + if cross_attention_mask is None: + cross_attention_mask = batch.cross_attention_mask.new_zeros( + cross_attention_mask_shape, + ) + cross_attention_mask[ + start_index:end_index, + left_offset:-right_padding, + ] = batch.cross_attention_mask[ + keep_indices, + : batch.input_length, + ] + + # Create empty tensor + # position_ids is always of shape [batch_size, 1] + if position_ids is None: + position_ids = batch.position_ids.new_empty((new_bs, 1)) + position_ids[start_index:end_index] = batch.position_ids[keep_indices, :] + + # Shenanigans to get dimensions because BLOOM outputs a past with a different shape + # BLOOM Keys: [batch_size * num_heads, head_dim, seq_length] + # BLOOM Values: [batch_size * num_heads, seq_length, head_dim] + # And ensure that we can update tensors in-place + if isinstance(batch.past_key_values, tuple): + batch.past_key_values = [ + [t.view(batch.batch_size, -1, *t.shape[-2:]) for t in layer] + for layer in batch.past_key_values + ] + elif len(batch.past_key_values[0][0].shape) == 3: + for layer in batch.past_key_values: + for k, t in enumerate(layer): + layer[k] = t.view(batch.batch_size, -1, *t.shape[-2:]) + + start_index = end_index + + first_past_kvs = batches[0].past_key_values + _, num_heads, padded_sequence_length, head_dim = first_past_kvs[0][1].shape + past_key_values = [] + for layer_id in range(len(batches[0].past_key_values)): + if layer_id in CROSS_ATTENTION_LAYERS: + padded_past_keys_shape = list( + batches[0].past_key_values[layer_id][0].shape + ) + padded_past_keys_shape[0] = new_bs + padded_past_keys_shape = torch.Size(padded_past_keys_shape) + else: + padded_past_keys_shape = ( + new_bs, + num_heads, + MAX_TOTAL_TOKENS, + head_dim, + ) + + padded_past_keys = first_past_kvs[layer_id][0].new_zeros( + padded_past_keys_shape + ) + padded_past_values = first_past_kvs[layer_id][1].new_zeros( + padded_past_keys_shape + ) + start_index = 0 + for batch in batches: + keep_indices = [] + for req in batch.requests: + keep_indices.append(req.idx) + + left_offset = max_input_length - batch.input_length + right_padding = MAX_TOTAL_TOKENS - max_input_length + past_keys = batch.past_key_values[layer_id][0] + past_values = batch.past_key_values[layer_id][1] + # Clear reference to the original tensor + batch.past_key_values[layer_id] = None + + # Slicing end index for this batch + end_index = start_index + len(batch) + # We slice the keys to remove the padding from previous batches + if layer_id in CROSS_ATTENTION_LAYERS: + padded_past_keys[start_index:end_index, :, :, :] = past_keys[ + keep_indices, :, :, : + ] + 
padded_past_values[start_index:end_index, :, :, :] = past_values[ + keep_indices, :, :, : + ] + + else: + padded_past_keys[ + start_index:end_index, :, left_offset:-right_padding, : + ] = past_keys[keep_indices, :, : batch.input_length, :] + padded_past_values[ + start_index:end_index, :, left_offset:-right_padding, : + ] = past_values[keep_indices, :, : batch.input_length, :] + + start_index = end_index + + past_key_values.append(tuple([padded_past_keys, padded_past_values])) + past_key_values = tuple(past_key_values) + + batch_id = batches[0].batch_id + top_n_tokens.extend([-1] * (new_bs - total_batch_size)) + fsm_grammar_states.extend([-1] * (new_bs - total_batch_size)) + + for idx, req in enumerate(requests): + req.idx = idx + + parameters = pad_next_token_chooser_parameters(parameters, new_bs) next_token_chooser = HeterogeneousNextTokenChooser.from_pb( parameters, - batches[dst_batch_idx].next_token_chooser.dtype, - batches[dst_batch_idx].next_token_chooser.device, - batches[dst_batch_idx].next_token_chooser.tokenizer, + batches[0].next_token_chooser.dtype, + batches[0].next_token_chooser.device, + batches[0].next_token_chooser.tokenizer, fsm_grammar_states, quantization_enabled=hq_env.is_quantization_enabled, ) - - input_ids = batches[dst_batch_idx].input_ids - attention_mask = batches[dst_batch_idx].attention_mask - position_ids = batches[dst_batch_idx].position_ids - past_key_values = batches[dst_batch_idx].past_key_values input_length = max_input_length htorch.core.mark_step() return cls( batch_id=batch_id, - requests=flat_requests, + requests=requests, input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids, @@ -550,6 +784,13 @@ class VlmCausalLMBatch(CausalLMBatch): top_n_tokens=top_n_tokens, top_n_tokens_tensor=top_n_tokens_tensor, input_length=input_length, + pixel_values=None, + pixel_attention_mask=None, + image_sizes=None, + aspect_ratio_ids=None, + aspect_ratio_mask=None, + cross_attention_mask=cross_attention_mask, + prefilling=False, ) @@ -601,6 +842,9 @@ class VlmCausalLM(Model): htorch.core.hpu_set_env() if world_size > 1: + os.environ.setdefault( + "DEEPSPEED_USE_HABANA_FRAMEWORKS_DETERMINISTIC_API", "1" + ) model = self.get_deepspeed_model(model_class, model_id, dtype, revision) model = hq_env.prepare_model_for_quantization(model) else: @@ -678,6 +922,11 @@ class VlmCausalLM(Model): self.kwargs["flash_attention_recompute"] = True self.speculate = get_speculate() + if model.config.model_type == "mllama": + global CROSS_ATTENTION_LAYERS, BASE_IMAGE_TOKENS + CROSS_ATTENTION_LAYERS = model.config.text_config.cross_attention_layers + BASE_IMAGE_TOKENS = 0 + super(VlmCausalLM, self).__init__( model_id=model_id, model=model, @@ -806,39 +1055,39 @@ class VlmCausalLM(Model): def forward( self, - input_ids, - attention_mask, - position_ids, - token_idx, - past_key_values: Optional[List[Tuple]] = None, - pixel_values: Optional[List[torch.Tensor]] = None, - image_sizes: Optional[List[Tuple[int, int]]] = None, + batch: VlmCausalLMBatch, bypass_hpu_graph: Optional[bool] = None, ) -> Tuple[torch.Tensor, List[Tuple[torch.Tensor, torch.Tensor]]]: # Model Forward kwargs = { - "input_ids": input_ids, - "attention_mask": attention_mask, - "past_key_values": past_key_values, - "token_idx": token_idx, - "pixel_values": pixel_values, - "image_sizes": image_sizes, + "input_ids": batch.input_ids, + "attention_mask": batch.attention_mask, + "past_key_values": batch.past_key_values, + "token_idx": batch.token_idx, + "pixel_values": batch.pixel_values, } + if 
self.model.config.model_type == "mllama": + kwargs["aspect_ratio_ids"] = batch.aspect_ratio_ids + kwargs["aspect_ratio_mask"] = batch.aspect_ratio_mask + kwargs["cross_attention_mask"] = batch.cross_attention_mask + else: + kwargs["image_sizes"] = batch.image_sizes + hpu_kwargs = {} # Optimum Habana got "lazy_mode" key-val only supported for llama type of models if self.model.config.model_type == "llama": hpu_kwargs["lazy_mode"] = LAZY_MODE == 1 if self.has_position_ids: - kwargs["position_ids"] = position_ids - + kwargs["position_ids"] = batch.position_ids if bypass_hpu_graph is not None: hpu_kwargs["bypass_hpu_graphs"] = bypass_hpu_graph kwargs.update(self.kwargs) model_inputs = self.model.prepare_inputs_for_generation(**kwargs) - if past_key_values is not None: + + if batch.past_key_values is not None: return self.model.forward(**model_inputs, **hpu_kwargs) else: outputs = self.model.forward(**model_inputs, **hpu_kwargs) @@ -846,8 +1095,9 @@ class VlmCausalLM(Model): @tracer.start_as_current_span("generate_token") def generate_token( - self, batches: List[VlmCausalLMBatch], is_warmup: bool = False - ) -> Tuple[List[Generation], Optional[CausalLMBatch], Tuple[int, int]]: + self, batches: list[VlmCausalLMBatch], is_warmup: bool = False + ) -> Tuple[List[Generation], Optional[VlmCausalLMBatch], Tuple[int, int]]: + start = time.time_ns() # Results generations: List[Generation] = [] @@ -927,9 +1177,18 @@ class VlmCausalLM(Model): # Update attention_mask as we added a new token to input_ids batch.attention_mask.index_fill_(1, token_idx, 1) + # add cross-attn mask for new token + if batch.cross_attention_mask is not None: + cross_attention_mask_prev = batch.cross_attention_mask + if token_idx is not None: + mask = cross_attention_mask_prev[ + :, token_idx - 2 : token_idx - 1, ... 
+ ] + cross_attention_mask_prev.index_copy_(1, token_idx - 1, mask) + batch.cross_attention_mask = cross_attention_mask_prev + # Adjust lengths batch.input_length += 1 - # Update position_ids if prefill: batch.position_ids = ( @@ -955,7 +1214,7 @@ class VlmCausalLM(Model): # Check if we need to do any bookkeeping first if not prefill: - batch = batch.__class__.recombine( + batch = self.batch_type.recombine( [batch], self.tokenizer.pad_token_id, is_warmup ) @@ -977,38 +1236,34 @@ class VlmCausalLM(Model): # Execute batch if prefill: # no right padding for prefill - token_idx = torch.tensor(batch.attention_mask.shape[-1] - 1).to(self.device) + # token_idx = torch.tensor(batch.attention_mask.shape[-1] - 1).to(self.device) batch.logits, batch.past = self.forward( - batch.input_ids, - batch.attention_mask, - batch.position_ids, - token_idx, - batch.past_key_values, - batch.pixel_values, - batch.image_sizes, + batch, bypass_hpu_graph=( prefill and self.limit_hpu_graph if self.enable_hpu_graph else None ), ) + elif all([req.stopping_criteria.max_new_tokens == 1 for req in batch.requests]): # Don't schedule next forward if max_new_tokens for all requests equals 1 # - we've already generated the first and only needed token in the prefill phase pass else: - token_idx = torch.tensor( - batch.attention_mask.shape[-1] - batch.right_padding - ).to(self.device) + # token_idx = torch.tensor(batch.attention_mask.shape[-1] - batch.right_padding).to(self.device) batch.logits = self.forward( - batch.input_ids, - batch.attention_mask, - batch.position_ids, - token_idx, - batch.past_key_values, + batch, bypass_hpu_graph=( prefill and self.limit_hpu_graph if self.enable_hpu_graph else None ), ) + if batch.pixel_values is not None: + batch.pixel_values = None + if batch.aspect_ratio_ids is not None: + batch.aspect_ratio_ids = None + if batch.aspect_ratio_mask is not None: + batch.aspect_ratio_mask = None + htorch.core.mark_step() start_decode = time.time_ns() @@ -1181,7 +1436,7 @@ class VlmCausalLM(Model): return generations, batch if not stopped else None, (forward_ns, decode_ns) def batch_from_pb(self, batch, is_warmup): - return VlmCausalLMBatch.from_pb_processor( + return self.batch_type.from_pb_processor( batch, self.tokenizer, self.processor, @@ -1204,22 +1459,22 @@ class VlmCausalLM(Model): def warmup( self, request: generate_pb2.WarmupRequest ) -> Tuple[Optional[int], Optional[int], Optional[int]]: - is_warmup = True - batch = self.batch_from_pb(request.batch, is_warmup) + global MAX_TOTAL_TOKENS + MAX_TOTAL_TOKENS = request.max_total_tokens + batch = self.batch_from_pb(request.batch, is_warmup=True) + max_input_tokens = request.max_input_tokens + max_prefill_batch_size = batch.input_ids.shape[0] try: # max prefill batch size warmup - _, prefill_batch, _ = self.generate_token([batch], is_warmup) + _, prefill_batch, _ = self.generate_token([batch], is_warmup=True) except Exception: raise RuntimeError( f"Not enough memory to handle {len(batch.input_ids)} prefill tokens. 
" f"You need to decrease `--max-batch-prefill-tokens`" ) - global BASE_IMAGE_TOKENS, MAX_TOTAL_TOKENS, PREFILL_WARMUP_BATCH_SIZE_LIST, PREFILL_WARMUP_SEQLEN_LIST, DECODE_WARMUP_BATCH_SIZE_LIST - MAX_TOTAL_TOKENS = request.max_total_tokens - max_input_length = batch.input_ids.shape[1] - max_prefill_batch_size = batch.input_ids.shape[0] + global BASE_IMAGE_TOKENS, PREFILL_WARMUP_BATCH_SIZE_LIST, PREFILL_WARMUP_SEQLEN_LIST, DECODE_WARMUP_BATCH_SIZE_LIST PREFILL_WARMUP_BATCH_SIZE_LIST = [] batch_size = 1 while batch_size <= max_prefill_batch_size: @@ -1228,15 +1483,19 @@ class VlmCausalLM(Model): if PREFILL_WARMUP_BATCH_SIZE_LIST[-1] < max_prefill_batch_size: PREFILL_WARMUP_BATCH_SIZE_LIST.append(max_prefill_batch_size) - seq_len = BASE_IMAGE_TOKENS + if self.model.config.model_type == "mllama": + seq_len = PAD_SEQUENCE_TO_MULTIPLE_OF + else: + seq_len = BASE_IMAGE_TOKENS + PREFILL_WARMUP_SEQLEN_LIST = [] i = 0 - while seq_len <= max_input_length: + while seq_len <= max_input_tokens: PREFILL_WARMUP_SEQLEN_LIST.append(seq_len) seq_len += PAD_SEQUENCE_TO_MULTIPLE_OF * (2**i) i += 1 - if PREFILL_WARMUP_SEQLEN_LIST[-1] < max_input_length: - PREFILL_WARMUP_SEQLEN_LIST.append(max_input_length) + if PREFILL_WARMUP_SEQLEN_LIST[-1] < max_input_tokens: + PREFILL_WARMUP_SEQLEN_LIST.append(max_input_tokens) # Prefill and decode warmup DECODE_WARMUP_BATCH_SIZE_LIST = [] @@ -1246,10 +1505,13 @@ class VlmCausalLM(Model): for batch_size in PREFILL_WARMUP_BATCH_SIZE_LIST: for seq_len in PREFILL_WARMUP_SEQLEN_LIST: batch = self.generate_warmup_batch( - request, seq_len, batch_size, is_warmup + request, seq_len, batch_size, is_warmup=True + ) + _, prefill_batch, _ = self.generate_token([batch], is_warmup=True) + assert prefill_batch is not None + _, decode_batch, _ = self.generate_token( + [prefill_batch], is_warmup=True ) - _, prefill_batch, _ = self.generate_token([batch], is_warmup) - _, decode_batch, _ = self.generate_token([prefill_batch], is_warmup) DECODE_WARMUP_BATCH_SIZE_LIST.append(batch_size) @@ -1280,43 +1542,41 @@ class VlmCausalLM(Model): and batch_size <= max_decode_batch_size ): batches = [] - for i in range(int(batch_size / max_prefill_batch_size)): - batch = self.generate_warmup_batch( - request, - PREFILL_WARMUP_SEQLEN_LIST[0], - DECODE_WARMUP_BATCH_SIZE_LIST[-1], - is_warmup, - ) - _, prefill_batch, _ = self.generate_token([batch], is_warmup) - batches.append(prefill_batch) while batch_size <= max_decode_batch_size: - _, decode_batch, _ = self.generate_token(batches, is_warmup) + for i in range(int(batch_size / max_prefill_batch_size)): + batch = self.generate_warmup_batch( + request, + PREFILL_WARMUP_SEQLEN_LIST[0] - 1, + max_prefill_batch_size, + is_warmup=False, + ) + _, prefill_batch, _ = self.generate_token( + [batch], is_warmup=True + ) + batches.append(prefill_batch) + + _, decode_batch, _ = self.generate_token(batches, is_warmup=True) DECODE_WARMUP_BATCH_SIZE_LIST.append(batch_size) batch_size = batch_size * 2 batches.clear() - for i in range(int(batch_size / max_prefill_batch_size)): - batch = self.generate_warmup_batch( - request, - PREFILL_WARMUP_SEQLEN_LIST[0], - DECODE_WARMUP_BATCH_SIZE_LIST[-1], - is_warmup, - ) - _, prefill_batch, _ = self.generate_token([batch], is_warmup) - batches.append(prefill_batch) - - batches.clear() if DECODE_WARMUP_BATCH_SIZE_LIST[-1] < max_decode_batch_size: max_decode_batch_size = math.floor(max_decode_batch_size / 2) * 2 batch_size = max_decode_batch_size for i in range(int(max_decode_batch_size / 2)): batch = self.generate_warmup_batch( - request, 
PREFILL_WARMUP_SEQLEN_LIST[0], 2, is_warmup + request, + PREFILL_WARMUP_SEQLEN_LIST[0] - 1, + 2, + is_warmup=False, + ) + _, prefill_batch, _ = self.generate_token( + [batch], is_warmup=True ) - _, prefill_batch, _ = self.generate_token([batch], is_warmup) batches.append(prefill_batch) - _, decode_batch, _ = self.generate_token(batches, is_warmup) + _, decode_batch, _ = self.generate_token(batches, is_warmup=True) DECODE_WARMUP_BATCH_SIZE_LIST.append(max_decode_batch_size) + except Exception: raise RuntimeError( f"Not enough memory to handle batch_size({batch_size}) decode warmup." @@ -1333,7 +1593,7 @@ class VlmCausalLM(Model): ) max_supported_total_tokens = MAX_BATCH_SIZE * MAX_TOTAL_TOKENS - max_input_tokens = max_input_length + max_input_tokens = max_input_tokens max_total_tokens = MAX_TOTAL_TOKENS return max_supported_total_tokens, max_input_tokens, max_total_tokens diff --git a/docs/source/backends/gaudi.mdx b/docs/source/backends/gaudi.mdx index c7f736189..c3481f2ee 100644 --- a/docs/source/backends/gaudi.mdx +++ b/docs/source/backends/gaudi.mdx @@ -291,6 +291,8 @@ The following table contains the environment variables that can be used to confi Contributions to the TGI-Gaudi project are welcome. Please refer to the [contributing guide](https://github.com/huggingface/text-generation-inference/blob/main/CONTRIBUTING.md). +**Guidelines for contributing to Gaudi on TGI:** All changes should be made within the `backends/gaudi` folder. In general, you should avoid modifying the router, launcher, or benchmark to accommodate Gaudi hardware, as all Gaudi-specific logic should be contained within the `backends/gaudi` folder. + ### Building the Docker Image from Source To build the Docker image from source: From a35fbdb92555fd30cf9270493b65b54e217089ad Mon Sep 17 00:00:00 2001 From: Mohit Sharma Date: Tue, 18 Mar 2025 15:07:33 +0530 Subject: [PATCH 155/183] Bug Fix: Sliding Window Attention (#3112) * (fix) sliding window attention * (fix) flashinfer * (typo) collection link * Add window_size_left param ipex rocm * Update window size rocm flash decoding * fix: bump snapshots and improve exceed window test case * feat: add tests for image types and remove alpha from png * Upgrading `from_env` to get token from file when necessary + fix pali_gemma. * fix: add pillow dependency and bump lock+requirements * fix: bump org name in gemma3 test * Fix qwen2. 
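As an illustrative aside (not code from this patch): the bullets above concern how the left sliding window is plumbed through to the attention kernels. A minimal sketch follows, assuming the common convention that `window_size_left` bounds how many earlier positions each query may attend to; the function name and shapes are illustrative only, not this repository's API.

```python
# Hedged sketch: build a causal attention mask restricted by a left sliding window.
# Assumption: `window_size_left` means "attend to at most this many earlier tokens".
import torch


def sliding_window_causal_mask(seq_len: int, window_size_left: int) -> torch.Tensor:
    q = torch.arange(seq_len).unsqueeze(1)   # query positions, shape [seq_len, 1]
    k = torch.arange(seq_len).unsqueeze(0)   # key positions, shape [1, seq_len]
    causal = k <= q                          # a query never attends to future keys
    in_window = (q - k) <= window_size_left  # nor further back than the window allows
    return causal & in_window                # True = attention allowed


# Example: with seq_len=6 and window_size_left=2, query 5 may attend to keys 3, 4 and 5,
# which is why prompts longer than the window only "see" their most recent tokens.
mask = sliding_window_causal_mask(6, 2)
```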
--------- Co-authored-by: drbh Co-authored-by: Nicolas Patry --- docs/source/supported_models.md | 4 +- .../test_flash_gemma3/test_exceed_window.json | 136 ++++++------- .../test_flash_gemma3/test_flash_gemma3.json | 164 +++++++-------- ...est_flash_gemma3_image_base64_rgb_jpg.json | 26 +++ ...est_flash_gemma3_image_base64_rgb_png.json | 26 +++ .../test_flash_gemma3_image_base64_rgba.json | 26 +++ .../test_flash_gemma3_image_cow.json | 14 +- .../test_flash_gemma3_image_cow_dog.json | 14 +- integration-tests/models/test_flash_gemma3.py | 94 ++++++++- integration-tests/pyproject.toml | 1 + integration-tests/requirements.txt | 26 +-- integration-tests/uv.lock | 189 +++++++++++++++++- launcher/src/main.rs | 7 +- router/src/server.rs | 2 +- router/src/validation.rs | 2 +- .../text_generation_server/adapters/lora.py | 1 - .../layers/attention/cuda.py | 9 +- .../layers/attention/flashinfer.py | 4 - .../layers/attention/ipex.py | 1 + .../layers/attention/rocm.py | 7 +- .../text_generation_server/models/__init__.py | 4 +- .../custom_modeling/flash_gemma2_modeling.py | 1 + .../custom_modeling/flash_gemma3_modeling.py | 84 +++----- .../custom_modeling/flash_mistral_modeling.py | 1 + .../custom_modeling/flash_mixtral_modeling.py | 1 + .../flash_pali_gemma_modeling.py | 2 +- .../custom_modeling/flash_qwen2_modeling.py | 15 +- .../flash_starcoder2_modeling.py | 1 + .../gemma3/image_processing_gemma3.py | 2 +- .../gemma3/processing_gemma3.py | 4 +- .../models/custom_modeling/qwen2_5_vl.py | 2 +- .../models/flash_causal_lm.py | 40 +--- server/text_generation_server/models/model.py | 2 +- 33 files changed, 593 insertions(+), 319 deletions(-) create mode 100644 integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_base64_rgb_jpg.json create mode 100644 integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_base64_rgb_png.json create mode 100644 integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_base64_rgba.json diff --git a/docs/source/supported_models.md b/docs/source/supported_models.md index 453b4f61a..f168fd76b 100644 --- a/docs/source/supported_models.md +++ b/docs/source/supported_models.md @@ -14,8 +14,8 @@ Text Generation Inference enables serving optimized models. 
The following sectio - [Gemma](https://huggingface.co/google/gemma-7b) - [PaliGemma](https://huggingface.co/google/paligemma-3b-pt-224) - [Gemma2](https://huggingface.co/collections/google/gemma-2-release-667d6600fd5220e7b967f315) -- [Gemma3](https://huggingface.co/collections/google/gemma-3) -- [Gemma3 Text](https://huggingface.co/collections/google/gemma-3) +- [Gemma3](https://huggingface.co/collections/google/gemma-3-release-67c6c6f89c4f76621268bb6d) +- [Gemma3 Text](https://huggingface.co/collections/google/gemma-3-release-67c6c6f89c4f76621268bb6d) - [Cohere](https://huggingface.co/CohereForAI/c4ai-command-r-plus) - [Dbrx](https://huggingface.co/databricks/dbrx-instruct) - [Mamba](https://huggingface.co/state-spaces/mamba-2.8b-slimpj) diff --git a/integration-tests/models/__snapshots__/test_flash_gemma3/test_exceed_window.json b/integration-tests/models/__snapshots__/test_flash_gemma3/test_exceed_window.json index ec8cd4f62..5c6b4cb94 100644 --- a/integration-tests/models/__snapshots__/test_flash_gemma3/test_exceed_window.json +++ b/integration-tests/models/__snapshots__/test_flash_gemma3/test_exceed_window.json @@ -1,133 +1,109 @@ { "details": { "best_of_sequences": null, - "finish_reason": "length", - "generated_tokens": 20, + "finish_reason": "eos_token", + "generated_tokens": 16, "prefill": [], "seed": null, "tokens": [ + { + "id": 506, + "logprob": -1.3984375, + "special": false, + "text": " the" + }, + { + "id": 1331, + "logprob": -1.6953125, + "special": false, + "text": " people" + }, { "id": 236764, - "logprob": -0.44726562, + "logprob": -0.23535156, "special": false, "text": "," }, { - "id": 236743, - "logprob": -0.011413574, + "id": 532, + "logprob": -0.24316406, "special": false, - "text": " " + "text": " and" }, { - "id": 236812, - "logprob": -0.09814453, + "id": 506, + "logprob": -0.12109375, "special": false, - "text": "4" + "text": " the" }, { - "id": 236764, - "logprob": -0.044189453, + "id": 2780, + "logprob": -1.1640625, "special": false, - "text": "," + "text": " food" }, { - "id": 236743, - "logprob": -0.15625, + "id": 236761, + "logprob": -0.21386719, "special": false, - "text": " " + "text": "." }, { - "id": 236810, - "logprob": -0.010864258, + "id": 108, + "logprob": -0.64453125, "special": false, - "text": "5" + "text": "\n\n" }, { - "id": 236764, - "logprob": -0.040039062, + "id": 2094, + "logprob": -0.77734375, "special": false, - "text": "," + "text": "This" }, { - "id": 236743, - "logprob": -0.26757812, + "id": 563, + "logprob": -0.040283203, "special": false, - "text": " " + "text": " is" }, { - "id": 236825, - "logprob": -0.0047302246, + "id": 496, + "logprob": -0.03125, "special": false, - "text": "6" + "text": " a" }, { - "id": 236764, - "logprob": -0.026123047, + "id": 6290, + "logprob": -0.03515625, "special": false, - "text": "," + "text": " nice" }, { - "id": 236743, - "logprob": -0.265625, + "id": 1977, + "logprob": -0.0020751953, "special": false, - "text": " " + "text": " place" }, { - "id": 236832, - "logprob": -0.014160156, + "id": 236761, + "logprob": -0.0079956055, "special": false, - "text": "7" + "text": "." 
}, { - "id": 236764, - "logprob": -0.013977051, + "id": 107, + "logprob": -0.9921875, "special": false, - "text": "," + "text": "\n" }, { - "id": 236743, - "logprob": -0.103515625, - "special": false, - "text": " " - }, - { - "id": 236828, - "logprob": -0.008178711, - "special": false, - "text": "8" - }, - { - "id": 236764, - "logprob": -0.030151367, - "special": false, - "text": "," - }, - { - "id": 236743, - "logprob": -0.39453125, - "special": false, - "text": " " - }, - { - "id": 236819, - "logprob": -0.008728027, - "special": false, - "text": "9" - }, - { - "id": 236764, - "logprob": -0.020629883, - "special": false, - "text": "," - }, - { - "id": 236743, - "logprob": -0.08154297, - "special": false, - "text": " " + "id": 106, + "logprob": -0.45507812, + "special": true, + "text": "" } ], "top_tokens": null }, - "generated_text": ", 4, 5, 6, 7, 8, 9, " + "generated_text": " the people, and the food.\n\nThis is a nice place.\n" } diff --git a/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3.json b/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3.json index 1324555aa..859544c89 100644 --- a/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3.json +++ b/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3.json @@ -8,31 +8,31 @@ "tokens": [ { "id": 1331, - "logprob": -0.32421875, + "logprob": -0.34960938, "special": false, "text": " people" }, { "id": 8390, - "logprob": -0.15332031, + "logprob": -0.14746094, "special": false, "text": " died" }, { "id": 528, - "logprob": -1.140625, + "logprob": -1.2265625, "special": false, "text": " in" }, { "id": 506, - "logprob": -0.42578125, + "logprob": -0.47070312, "special": false, "text": " the" }, { "id": 3640, - "logprob": -0.64453125, + "logprob": -0.5859375, "special": false, "text": " United" }, @@ -44,31 +44,31 @@ }, { "id": 236761, - "logprob": -0.37890625, + "logprob": -0.34765625, "special": false, "text": "." 
}, { "id": 108, - "logprob": -0.08300781, + "logprob": -0.0859375, "special": false, "text": "\n\n" }, { "id": 818, - "logprob": -1.1796875, + "logprob": -1.1640625, "special": false, "text": "The" }, { "id": 6816, - "logprob": -1.765625, + "logprob": -1.890625, "special": false, "text": " generally" }, { "id": 10951, - "logprob": -0.14550781, + "logprob": -0.14648438, "special": false, "text": " accepted" }, @@ -86,49 +86,49 @@ }, { "id": 600, - "logprob": -0.65625, + "logprob": -0.65234375, "special": false, "text": " that" }, { "id": 236743, - "logprob": -1.1796875, + "logprob": -1.2109375, "special": false, "text": " " }, { "id": 236825, - "logprob": -0.0009918213, + "logprob": -0.00088119507, "special": false, "text": "6" }, { "id": 236832, - "logprob": -6.532669e-05, + "logprob": -6.580353e-05, "special": false, "text": "7" }, { "id": 236810, - "logprob": -4.863739e-05, + "logprob": -5.2690506e-05, "special": false, "text": "5" }, { "id": 236764, - "logprob": -0.00017929077, + "logprob": -0.0001745224, "special": false, "text": "," }, { "id": 236771, - "logprob": -1.2397766e-05, + "logprob": -1.180172e-05, "special": false, "text": "0" }, { "id": 236771, - "logprob": -2.1457672e-06, + "logprob": -1.7881393e-06, "special": false, "text": "0" }, @@ -140,7 +140,7 @@ }, { "id": 1331, - "logprob": -0.50390625, + "logprob": -0.44921875, "special": false, "text": " people" }, @@ -152,67 +152,67 @@ }, { "id": 528, - "logprob": -0.08496094, + "logprob": -0.084472656, "special": false, "text": " in" }, { "id": 506, - "logprob": -0.0003299713, + "logprob": -0.00034713745, "special": false, "text": " the" }, { "id": 3640, - "logprob": -0.028442383, + "logprob": -0.028564453, "special": false, "text": " United" }, { "id": 4184, - "logprob": -0.00011014938, + "logprob": -0.00012207031, "special": false, "text": " States" }, { "id": 236761, - "logprob": -1.1796875, + "logprob": -1.15625, "special": false, "text": "." }, { "id": 3153, - "logprob": -0.104003906, + "logprob": -0.103027344, "special": false, "text": " However" }, { "id": 236764, - "logprob": -0.009094238, + "logprob": -0.009155273, "special": false, "text": "," }, { "id": 1070, - "logprob": -0.88671875, + "logprob": -0.92578125, "special": false, "text": " some" }, { "id": 61806, - "logprob": -0.84765625, + "logprob": -0.91796875, "special": false, "text": " historians" }, { "id": 4646, - "logprob": -1.34375, + "logprob": -1.3828125, "special": false, "text": " believe" }, { "id": 506, - "logprob": -0.59375, + "logprob": -0.65234375, "special": false, "text": " the" }, @@ -230,7 +230,7 @@ }, { "id": 1451, - "logprob": -0.60546875, + "logprob": -0.66015625, "special": false, "text": " could" }, @@ -242,73 +242,73 @@ }, { "id": 618, - "logprob": -0.61328125, + "logprob": -0.57421875, "special": false, "text": " as" }, { "id": 1494, - "logprob": -0.00033569336, + "logprob": -0.00036239624, "special": false, "text": " high" }, { "id": 618, - "logprob": -0.0001411438, + "logprob": -0.0001335144, "special": false, "text": " as" }, { "id": 236743, - "logprob": -0.001045227, + "logprob": -0.0009689331, "special": false, "text": " " }, { "id": 236770, - "logprob": -0.21289062, + "logprob": -0.26367188, "special": false, "text": "1" }, { "id": 236771, - "logprob": -0.13378906, + "logprob": -0.17773438, "special": false, "text": "0" }, { "id": 3625, - "logprob": -0.0087890625, + "logprob": -0.012084961, "special": false, "text": " million" }, { "id": 236761, - "logprob": -0.2109375, + "logprob": -0.21289062, "special": false, "text": "." 
}, { "id": 108, - "logprob": -0.39453125, + "logprob": -0.37304688, "special": false, "text": "\n\n" }, { "id": 236777, - "logprob": -1.1328125, + "logprob": -1.078125, "special": false, "text": "I" }, { "id": 1006, - "logprob": -1.4140625, + "logprob": -1.3203125, "special": false, "text": " am" }, { "id": 3182, - "logprob": -1.15625, + "logprob": -1.078125, "special": false, "text": " looking" }, @@ -320,13 +320,13 @@ }, { "id": 919, - "logprob": -1.2734375, + "logprob": -1.25, "special": false, "text": " more" }, { "id": 1938, - "logprob": -1.2265625, + "logprob": -1.2421875, "special": false, "text": " information" }, @@ -338,169 +338,169 @@ }, { "id": 672, - "logprob": -0.77734375, + "logprob": -0.73046875, "special": false, "text": " this" }, { "id": 59725, - "logprob": -0.70703125, + "logprob": -0.75, "special": false, "text": " discrepancy" }, { "id": 532, - "logprob": -0.8515625, + "logprob": -0.83984375, "special": false, "text": " and" }, { "id": 506, - "logprob": -0.65625, + "logprob": -0.7109375, "special": false, "text": " the" }, { "id": 5872, - "logprob": -1.15625, + "logprob": -1.2734375, "special": false, "text": " factors" }, { "id": 600, - "logprob": -0.2265625, + "logprob": -0.22851562, "special": false, "text": " that" }, { "id": 19263, - "logprob": -1.125, + "logprob": -1.1640625, "special": false, "text": " contributed" }, { "id": 531, - "logprob": -0.001083374, + "logprob": -0.0010757446, "special": false, "text": " to" }, { "id": 506, - "logprob": -0.2109375, + "logprob": -0.18945312, "special": false, "text": " the" }, { "id": 5777, - "logprob": -1.21875, + "logprob": -1.2734375, "special": false, "text": " wide" }, { "id": 2644, - "logprob": -0.018310547, + "logprob": -0.01940918, "special": false, "text": " range" }, { "id": 529, - "logprob": -0.12988281, + "logprob": -0.14550781, "special": false, "text": " of" }, { "id": 14287, - "logprob": -0.03564453, + "logprob": -0.032470703, "special": false, "text": " estimates" }, { "id": 236761, - "logprob": -0.010314941, + "logprob": -0.010375977, "special": false, "text": "." 
}, { "id": 108, - "logprob": -0.060546875, + "logprob": -0.06591797, "special": false, "text": "\n\n" }, { "id": 8291, - "logprob": -0.734375, + "logprob": -0.8046875, "special": false, "text": "Here" }, { "id": 236789, - "logprob": -0.26367188, + "logprob": -0.23828125, "special": false, "text": "'" }, { "id": 236751, - "logprob": -1.1920929e-06, + "logprob": -1.0728836e-06, "special": false, "text": "s" }, { "id": 496, - "logprob": -0.15527344, + "logprob": -0.17480469, "special": false, "text": " a" }, { "id": 25890, - "logprob": -0.08886719, + "logprob": -0.087402344, "special": false, "text": " breakdown" }, { "id": 529, - "logprob": -0.0020446777, + "logprob": -0.0021209717, "special": false, "text": " of" }, { "id": 506, - "logprob": -0.17871094, + "logprob": -0.19140625, "special": false, "text": " the" }, { "id": 5872, - "logprob": -0.90234375, + "logprob": -1.0078125, "special": false, "text": " factors" }, { "id": 20894, - "logprob": -0.25976562, + "logprob": -0.26367188, "special": false, "text": " contributing" }, { "id": 531, - "logprob": -8.34465e-05, + "logprob": -9.250641e-05, "special": false, "text": " to" }, { "id": 506, - "logprob": -0.008544922, + "logprob": -0.008666992, "special": false, "text": " the" }, { "id": 5777, - "logprob": -0.62109375, + "logprob": -0.6171875, "special": false, "text": " wide" }, { "id": 2644, - "logprob": -0.0023345947, + "logprob": -0.0023956299, "special": false, "text": " range" }, @@ -512,25 +512,25 @@ }, { "id": 14287, - "logprob": -0.011291504, + "logprob": -0.011352539, "special": false, "text": " estimates" }, { "id": 573, - "logprob": -0.29101562, + "logprob": -0.30664062, "special": false, "text": " for" }, { "id": 506, - "logprob": -0.21484375, + "logprob": -0.21386719, "special": false, "text": " the" }, { "id": 236743, - "logprob": -0.2890625, + "logprob": -0.35351562, "special": false, "text": " " }, @@ -566,19 +566,19 @@ }, { "id": 10248, - "logprob": -0.01953125, + "logprob": -0.015258789, "special": false, "text": " pandemic" }, { "id": 4355, - "logprob": -0.78515625, + "logprob": -0.83203125, "special": false, "text": " death" }, { "id": 25363, - "logprob": -6.771088e-05, + "logprob": -7.43866e-05, "special": false, "text": " toll" }, @@ -590,13 +590,13 @@ }, { "id": 506, - "logprob": -7.033348e-06, + "logprob": -6.67572e-06, "special": false, "text": " the" }, { "id": 3640, - "logprob": -0.0067443848, + "logprob": -0.0059509277, "special": false, "text": " United" }, diff --git a/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_base64_rgb_jpg.json b/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_base64_rgb_jpg.json new file mode 100644 index 000000000..ae67e0060 --- /dev/null +++ b/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_base64_rgb_jpg.json @@ -0,0 +1,26 @@ +{ + "choices": [ + { + "finish_reason": "stop", + "index": 0, + "logprobs": null, + "message": { + "content": "Okay, let's analyze the image.\n\nThe image is a solid, bright white color. There is nothing else visible within it. \n\nIt's essentially a blank white canvas or a completely white square. 
\n\nIs there anything specific you'd like me to do with this image, such as describe it further or imagine what it might represent?", + "name": null, + "role": "assistant", + "tool_calls": null + }, + "usage": null + } + ], + "created": 1741965894, + "id": "", + "model": "google/gemma-3-4b-it", + "object": "chat.completion", + "system_fingerprint": "3.2.1-dev0-native", + "usage": { + "completion_tokens": 74, + "prompt_tokens": 277, + "total_tokens": 351 + } +} diff --git a/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_base64_rgb_png.json b/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_base64_rgb_png.json new file mode 100644 index 000000000..afbfba30a --- /dev/null +++ b/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_base64_rgb_png.json @@ -0,0 +1,26 @@ +{ + "choices": [ + { + "finish_reason": "stop", + "index": 0, + "logprobs": null, + "message": { + "content": "Okay, let's analyze the image. \n\nThe image is entirely white, with a very subtle, faint outline of a stylized, cartoonish figure. It appears to be a simplified depiction of a person, likely a child, with a wide-eyed expression and a small, rounded body. \n\nIt's almost like a minimalist, iconic representation. \n\nDo you want me to try and describe it in more detail or perhaps speculate about the context of the image?", + "name": null, + "role": "assistant", + "tool_calls": null + }, + "usage": null + } + ], + "created": 1741965892, + "id": "", + "model": "google/gemma-3-4b-it", + "object": "chat.completion", + "system_fingerprint": "3.2.1-dev0-native", + "usage": { + "completion_tokens": 98, + "prompt_tokens": 277, + "total_tokens": 375 + } +} diff --git a/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_base64_rgba.json b/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_base64_rgba.json new file mode 100644 index 000000000..1b97d2615 --- /dev/null +++ b/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_base64_rgba.json @@ -0,0 +1,26 @@ +{ + "choices": [ + { + "finish_reason": "stop", + "index": 0, + "logprobs": null, + "message": { + "content": "Okay, let's analyze the image. \n\nThe transparent image reveals a stylized depiction of **a human head**. It's a minimalist, geometric representation, showing the basic shapes of the skull, eye sockets, and head outline. \n\nDo you want me to describe any specific element of the image in more detail?", + "name": null, + "role": "assistant", + "tool_calls": null + }, + "usage": null + } + ], + "created": 1741966313, + "id": "", + "model": "google/gemma-3-4b-it", + "object": "chat.completion", + "system_fingerprint": "3.2.1-dev0-native", + "usage": { + "completion_tokens": 67, + "prompt_tokens": 277, + "total_tokens": 344 + } +} diff --git a/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_cow.json b/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_cow.json index 6c30ada41..cd786b3ce 100644 --- a/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_cow.json +++ b/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_cow.json @@ -5,7 +5,7 @@ "index": 0, "logprobs": null, "message": { - "content": "Here's a description of what's shown in the image:\n\nThe image depicts a brown cow standing on a sandy beach. 
The beach has turquoise water and a distant island visible in the background. The sky is bright blue with some white clouds. \n\nIt's a humorous and unexpected sight of a cow enjoying a tropical beach!", + "content": "Here's a description of what's shown in the image:\n\nThe image depicts a brown cow standing on a sandy beach. The beach has turquoise water and a distant island visible in the background. The sky is bright blue with some white clouds. \n\nIt's a quite a humorous and unusual scene – a cow enjoying a day at the beach!", "name": null, "role": "assistant", "tool_calls": null @@ -13,14 +13,14 @@ "usage": null } ], - "created": 1741703756, + "created": 1741964480, "id": "", - "model": "gg-hf-g/gemma-3-4b-it", + "model": "google/gemma-3-4b-it", "object": "chat.completion", - "system_fingerprint": "3.1.2-dev0-native", + "system_fingerprint": "3.2.1-dev0-native", "usage": { - "completion_tokens": 70, - "prompt_tokens": 277, - "total_tokens": 347 + "completion_tokens": 74, + "prompt_tokens": 275, + "total_tokens": 349 } } diff --git a/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_cow_dog.json b/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_cow_dog.json index fe67c9954..5ed2c4507 100644 --- a/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_cow_dog.json +++ b/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_cow_dog.json @@ -5,7 +5,7 @@ "index": 0, "logprobs": null, "message": { - "content": "Based on the image, the animal is a cow, not a dog! \n\nIt appears to be a **Brazilian cattle breed** known as a **Gir Cow**. They are recognized for their reddish-brown color and distinctive markings.", + "content": "That's a fantastic question! However, the image doesn't show a dog. It shows a **Brown Swiss cow** standing on a beach. \n\nBrown Swiss cows are known for their reddish-brown color and distinctive white markings. \n\nIf you'd like, you can send me another image and I’ll do my best to identify it!", "name": null, "role": "assistant", "tool_calls": null @@ -13,14 +13,14 @@ "usage": null } ], - "created": 1741703753, + "created": 1741964477, "id": "", - "model": "gg-hf-g/gemma-3-4b-it", + "model": "google/gemma-3-4b-it", "object": "chat.completion", - "system_fingerprint": "3.1.2-dev0-native", + "system_fingerprint": "3.2.1-dev0-native", "usage": { - "completion_tokens": 48, - "prompt_tokens": 281, - "total_tokens": 329 + "completion_tokens": 75, + "prompt_tokens": 279, + "total_tokens": 354 } } diff --git a/integration-tests/models/test_flash_gemma3.py b/integration-tests/models/test_flash_gemma3.py index ab812d644..5064f34d5 100644 --- a/integration-tests/models/test_flash_gemma3.py +++ b/integration-tests/models/test_flash_gemma3.py @@ -1,3 +1,7 @@ +import base64 +from io import BytesIO +from PIL import Image + import pytest @@ -49,9 +53,9 @@ async def test_flash_gemma3_image_cow_dog(flash_gemma3, response_snapshot): assert ( response.choices[0].message.content - == "Based on the image, the animal is a cow, not a dog! \n\nIt appears to be a **Brazilian cattle breed** known as a **Gir Cow**. They are recognized for their reddish-brown color and distinctive markings." + == "That's a fantastic question! However, the image doesn't show a dog. It shows a **Brown Swiss cow** standing on a beach. \n\nBrown Swiss cows are known for their reddish-brown color and distinctive white markings. 
\n\nIf you'd like, you can send me another image and I’ll do my best to identify it!" ) - assert response.usage["completion_tokens"] == 48 + assert response.usage["completion_tokens"] == 75 assert response == response_snapshot @@ -72,19 +76,95 @@ async def test_flash_gemma3_image_cow(flash_gemma3, response_snapshot): ) assert ( response.choices[0].message.content - == "Here's a description of what's shown in the image:\n\nThe image depicts a brown cow standing on a sandy beach. The beach has turquoise water and a distant island visible in the background. The sky is bright blue with some white clouds. \n\nIt's a humorous and unexpected sight of a cow enjoying a tropical beach!" + == "Here's a description of what's shown in the image:\n\nThe image depicts a brown cow standing on a sandy beach. The beach has turquoise water and a distant island visible in the background. The sky is bright blue with some white clouds. \n\nIt's a quite a humorous and unusual scene – a cow enjoying a day at the beach!" ) - assert response.usage["completion_tokens"] == 70 + assert response.usage["completion_tokens"] == 74 assert response == response_snapshot async def test_exceed_window(flash_gemma3, response_snapshot): response = await flash_gemma3.generate( - "This is a nice place. " * 800 + "Now count: 1, 2, 3", + "This is a nice place. " * 800 + "I really enjoy the scenery,", seed=42, max_new_tokens=20, ) - assert response.generated_text == ", 4, 5, 6, 7, 8, 9, " - assert response.details.generated_tokens == 20 + assert ( + response.generated_text + == " the people, and the food.\n\nThis is a nice place.\n" + ) + assert response.details.generated_tokens == 16 + assert response == response_snapshot + + +# Helper function to convert a Pillow image to a base64 data URL +def image_to_data_url(img: Image.Image, fmt: str) -> str: + buffer = BytesIO() + img.save(buffer, format=fmt) + img_data = buffer.getvalue() + b64_str = base64.b64encode(img_data).decode("utf-8") + mime_type = "image/png" if fmt.upper() == "PNG" else "image/jpeg" + return f"data:{mime_type};base64,{b64_str}" + + +async def test_flash_gemma3_image_base64_rgba(flash_gemma3, response_snapshot): + # Create an empty 100x100 PNG image with alpha (transparent background) + img = Image.new("RGBA", (100, 100), (0, 0, 0, 0)) + data_url = image_to_data_url(img, "PNG") + response = await flash_gemma3.chat( + seed=42, + messages=[ + { + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": data_url}}, + { + "type": "text", + "text": "What do you see in this transparent image?", + }, + ], + }, + ], + max_tokens=100, + ) + assert response == response_snapshot + + +async def test_flash_gemma3_image_base64_rgb_png(flash_gemma3, response_snapshot): + # Create an empty 100x100 PNG image without alpha (white background) + img = Image.new("RGB", (100, 100), (255, 255, 255)) + data_url = image_to_data_url(img, "PNG") + response = await flash_gemma3.chat( + seed=42, + messages=[ + { + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": data_url}}, + {"type": "text", "text": "What do you see in this plain image?"}, + ], + }, + ], + max_tokens=100, + ) + assert response == response_snapshot + + +async def test_flash_gemma3_image_base64_rgb_jpg(flash_gemma3, response_snapshot): + # Create an empty 100x100 JPEG image (white background) + img = Image.new("RGB", (100, 100), (255, 255, 255)) + data_url = image_to_data_url(img, "JPEG") + response = await flash_gemma3.chat( + seed=42, + messages=[ + { + "role": "user", + "content": 
[ + {"type": "image_url", "image_url": {"url": data_url}}, + {"type": "text", "text": "What do you see in this JPEG image?"}, + ], + }, + ], + max_tokens=100, + ) assert response == response_snapshot diff --git a/integration-tests/pyproject.toml b/integration-tests/pyproject.toml index 07aa43073..abe8cfeeb 100644 --- a/integration-tests/pyproject.toml +++ b/integration-tests/pyproject.toml @@ -15,6 +15,7 @@ dependencies = [ "numpy>=2.0", "openai>=1.65", "huggingface_hub>=0.29", + "pillow>=11.1.0", ] [tool.isort] diff --git a/integration-tests/requirements.txt b/integration-tests/requirements.txt index a85db4a5b..ca2dee938 100644 --- a/integration-tests/requirements.txt +++ b/integration-tests/requirements.txt @@ -1,8 +1,8 @@ # This file was autogenerated by uv via the following command: -# uv pip compile pyproject.toml -o requirements.txt -aiohappyeyeballs==2.4.6 +# uv pip compile pyproject.toml +aiohappyeyeballs==2.6.1 # via aiohttp -aiohttp==3.11.12 +aiohttp==3.11.13 # via text-generation aiosignal==1.3.2 # via aiohttp @@ -12,7 +12,7 @@ anyio==4.8.0 # via # httpx # openai -attrs==25.1.0 +attrs==25.3.0 # via aiohttp certifi==2025.1.31 # via @@ -25,13 +25,13 @@ distro==1.9.0 # via openai docker==7.1.0 # via text-generation-integration-tests (pyproject.toml) -filelock==3.17.0 +filelock==3.18.0 # via huggingface-hub frozenlist==1.5.0 # via # aiohttp # aiosignal -fsspec==2025.2.0 +fsspec==2025.3.0 # via huggingface-hub h11==0.14.0 # via httpcore @@ -39,7 +39,7 @@ httpcore==1.0.7 # via httpx httpx==0.28.1 # via openai -huggingface-hub==0.29.0 +huggingface-hub==0.29.3 # via # text-generation-integration-tests (pyproject.toml) # text-generation @@ -51,7 +51,7 @@ idna==3.10 # yarl iniconfig==2.0.0 # via pytest -jiter==0.8.2 +jiter==0.9.0 # via openai multidict==6.1.0 # via @@ -59,15 +59,17 @@ multidict==6.1.0 # yarl numpy==2.2.3 # via text-generation-integration-tests (pyproject.toml) -openai==1.65.3 +openai==1.66.3 # via text-generation-integration-tests (pyproject.toml) packaging==24.2 # via # huggingface-hub # pytest +pillow==11.1.0 + # via text-generation-integration-tests (pyproject.toml) pluggy==1.5.0 # via pytest -propcache==0.2.1 +propcache==0.3.0 # via # aiohttp # yarl @@ -78,7 +80,7 @@ pydantic==2.10.6 # text-generation pydantic-core==2.27.2 # via pydantic -pytest==8.3.4 +pytest==8.3.5 # via # text-generation-integration-tests (pyproject.toml) # pytest-asyncio @@ -95,7 +97,7 @@ sniffio==1.3.1 # via # anyio # openai -syrupy==4.8.1 +syrupy==4.9.0 # via text-generation-integration-tests (pyproject.toml) text-generation==0.7.0 # via text-generation-integration-tests (pyproject.toml) diff --git a/integration-tests/uv.lock b/integration-tests/uv.lock index 9f3765b87..bad6aa8f2 100644 --- a/integration-tests/uv.lock +++ b/integration-tests/uv.lock @@ -97,6 +97,21 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643 }, ] +[[package]] +name = "anyio" +version = "4.8.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "exceptiongroup", marker = "python_full_version < '3.11'" }, + { name = "idna" }, + { name = "sniffio" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a3/73/199a98fc2dae33535d6b8e8e6ec01f8c1d76c9adb096c6b7d64823038cde/anyio-4.8.0.tar.gz", hash = 
"sha256:1d9fe889df5212298c0c0723fa20479d1b94883a2df44bd3897aa91083316f7a", size = 181126 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/46/eb/e7f063ad1fec6b3178a3cd82d1a3c4de82cccf283fc42746168188e1cdd5/anyio-4.8.0-py3-none-any.whl", hash = "sha256:b5011f270ab5eb0abf13385f851315585cc37ef330dd88e27ec3d34d651fd47a", size = 96041 }, +] + [[package]] name = "async-timeout" version = "5.0.1" @@ -181,6 +196,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335 }, ] +[[package]] +name = "distro" +version = "1.9.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/fc/f8/98eea607f65de6527f8a2e8885fc8015d3e6f5775df186e443e0964a11c3/distro-1.9.0.tar.gz", hash = "sha256:2fa77c6fd8940f116ee1d6b94a2f90b13b5ea8d019b98bc8bafdcabcdd9bdbed", size = 60722 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/12/b3/231ffd4ab1fc9d679809f356cebee130ac7daa00d6d6f3206dd4fd137e9e/distro-1.9.0-py3-none-any.whl", hash = "sha256:7bffd925d65168f85027d8da9af6bddab658135b840670a223589bc0c8ef02b2", size = 20277 }, +] + [[package]] name = "docker" version = "7.1.0" @@ -276,6 +300,43 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e2/94/758680531a00d06e471ef649e4ec2ed6bf185356a7f9fbfbb7368a40bd49/fsspec-2025.2.0-py3-none-any.whl", hash = "sha256:9de2ad9ce1f85e1931858535bc882543171d197001a0a5eb2ddc04f1781ab95b", size = 184484 }, ] +[[package]] +name = "h11" +version = "0.14.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f5/38/3af3d3633a34a3316095b39c8e8fb4853a28a536e55d347bd8d8e9a14b03/h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d", size = 100418 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/95/04/ff642e65ad6b90db43e668d70ffb6736436c7ce41fcc549f4e9472234127/h11-0.14.0-py3-none-any.whl", hash = "sha256:e3fe4ac4b851c468cc8363d500db52c2ead036020723024a109d37346efaa761", size = 58259 }, +] + +[[package]] +name = "httpcore" +version = "1.0.7" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "certifi" }, + { name = "h11" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/6a/41/d7d0a89eb493922c37d343b607bc1b5da7f5be7e383740b4753ad8943e90/httpcore-1.0.7.tar.gz", hash = "sha256:8551cb62a169ec7162ac7be8d4817d561f60e08eaa485234898414bb5a8a0b4c", size = 85196 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/87/f5/72347bc88306acb359581ac4d52f23c0ef445b57157adedb9aee0cd689d2/httpcore-1.0.7-py3-none-any.whl", hash = "sha256:a3fff8f43dc260d5bd363d9f9cf1830fa3a458b332856f34282de498ed420edd", size = 78551 }, +] + +[[package]] +name = "httpx" +version = "0.28.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "anyio" }, + { name = "certifi" }, + { name = "httpcore" }, + { name = "idna" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b1/df/48c586a5fe32a0f01324ee087459e112ebb7224f646c0b5023f5e79e9956/httpx-0.28.1.tar.gz", hash = "sha256:75e98c5f16b0f35b567856f597f06ff2270a374470a5c2392242528e3e3e42fc", size = 141406 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = 
"sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517 }, +] + [[package]] name = "huggingface-hub" version = "0.29.0" @@ -312,6 +373,50 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ef/a6/62565a6e1cf69e10f5727360368e451d4b7f58beeac6173dc9db836a5b46/iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374", size = 5892 }, ] +[[package]] +name = "jiter" +version = "0.9.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/1e/c2/e4562507f52f0af7036da125bb699602ead37a2332af0788f8e0a3417f36/jiter-0.9.0.tar.gz", hash = "sha256:aadba0964deb424daa24492abc3d229c60c4a31bfee205aedbf1acc7639d7893", size = 162604 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b0/82/39f7c9e67b3b0121f02a0b90d433626caa95a565c3d2449fea6bcfa3f5f5/jiter-0.9.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:816ec9b60fdfd1fec87da1d7ed46c66c44ffec37ab2ef7de5b147b2fce3fd5ad", size = 314540 }, + { url = "https://files.pythonhosted.org/packages/01/07/7bf6022c5a152fca767cf5c086bb41f7c28f70cf33ad259d023b53c0b858/jiter-0.9.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9b1d3086f8a3ee0194ecf2008cf81286a5c3e540d977fa038ff23576c023c0ea", size = 321065 }, + { url = "https://files.pythonhosted.org/packages/6c/b2/de3f3446ecba7c48f317568e111cc112613da36c7b29a6de45a1df365556/jiter-0.9.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1339f839b91ae30b37c409bf16ccd3dc453e8b8c3ed4bd1d6a567193651a4a51", size = 341664 }, + { url = "https://files.pythonhosted.org/packages/13/cf/6485a4012af5d407689c91296105fcdb080a3538e0658d2abf679619c72f/jiter-0.9.0-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ffba79584b3b670fefae66ceb3a28822365d25b7bf811e030609a3d5b876f538", size = 364635 }, + { url = "https://files.pythonhosted.org/packages/0d/f7/4a491c568f005553240b486f8e05c82547340572d5018ef79414b4449327/jiter-0.9.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5cfc7d0a8e899089d11f065e289cb5b2daf3d82fbe028f49b20d7b809193958d", size = 406288 }, + { url = "https://files.pythonhosted.org/packages/d3/ca/f4263ecbce7f5e6bded8f52a9f1a66540b270c300b5c9f5353d163f9ac61/jiter-0.9.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e00a1a2bbfaaf237e13c3d1592356eab3e9015d7efd59359ac8b51eb56390a12", size = 397499 }, + { url = "https://files.pythonhosted.org/packages/ac/a2/522039e522a10bac2f2194f50e183a49a360d5f63ebf46f6d890ef8aa3f9/jiter-0.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d1d9870561eb26b11448854dce0ff27a9a27cb616b632468cafc938de25e9e51", size = 352926 }, + { url = "https://files.pythonhosted.org/packages/b1/67/306a5c5abc82f2e32bd47333a1c9799499c1c3a415f8dde19dbf876f00cb/jiter-0.9.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:9872aeff3f21e437651df378cb75aeb7043e5297261222b6441a620218b58708", size = 384506 }, + { url = "https://files.pythonhosted.org/packages/0f/89/c12fe7b65a4fb74f6c0d7b5119576f1f16c79fc2953641f31b288fad8a04/jiter-0.9.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:1fd19112d1049bdd47f17bfbb44a2c0001061312dcf0e72765bfa8abd4aa30e5", size = 520621 }, + { url = "https://files.pythonhosted.org/packages/c4/2b/d57900c5c06e6273fbaa76a19efa74dbc6e70c7427ab421bf0095dfe5d4a/jiter-0.9.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = 
"sha256:6ef5da104664e526836070e4a23b5f68dec1cc673b60bf1edb1bfbe8a55d0678", size = 512613 }, + { url = "https://files.pythonhosted.org/packages/89/05/d8b90bfb21e58097d5a4e0224f2940568366f68488a079ae77d4b2653500/jiter-0.9.0-cp310-cp310-win32.whl", hash = "sha256:cb12e6d65ebbefe5518de819f3eda53b73187b7089040b2d17f5b39001ff31c4", size = 206613 }, + { url = "https://files.pythonhosted.org/packages/2c/1d/5767f23f88e4f885090d74bbd2755518050a63040c0f59aa059947035711/jiter-0.9.0-cp310-cp310-win_amd64.whl", hash = "sha256:c43ca669493626d8672be3b645dbb406ef25af3f4b6384cfd306da7eb2e70322", size = 208371 }, + { url = "https://files.pythonhosted.org/packages/23/44/e241a043f114299254e44d7e777ead311da400517f179665e59611ab0ee4/jiter-0.9.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:6c4d99c71508912a7e556d631768dcdef43648a93660670986916b297f1c54af", size = 314654 }, + { url = "https://files.pythonhosted.org/packages/fb/1b/a7e5e42db9fa262baaa9489d8d14ca93f8663e7f164ed5e9acc9f467fc00/jiter-0.9.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:8f60fb8ce7df529812bf6c625635a19d27f30806885139e367af93f6e734ef58", size = 320909 }, + { url = "https://files.pythonhosted.org/packages/60/bf/8ebdfce77bc04b81abf2ea316e9c03b4a866a7d739cf355eae4d6fd9f6fe/jiter-0.9.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:51c4e1a4f8ea84d98b7b98912aa4290ac3d1eabfde8e3c34541fae30e9d1f08b", size = 341733 }, + { url = "https://files.pythonhosted.org/packages/a8/4e/754ebce77cff9ab34d1d0fa0fe98f5d42590fd33622509a3ba6ec37ff466/jiter-0.9.0-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5f4c677c424dc76684fea3e7285a7a2a7493424bea89ac441045e6a1fb1d7b3b", size = 365097 }, + { url = "https://files.pythonhosted.org/packages/32/2c/6019587e6f5844c612ae18ca892f4cd7b3d8bbf49461ed29e384a0f13d98/jiter-0.9.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2221176dfec87f3470b21e6abca056e6b04ce9bff72315cb0b243ca9e835a4b5", size = 406603 }, + { url = "https://files.pythonhosted.org/packages/da/e9/c9e6546c817ab75a1a7dab6dcc698e62e375e1017113e8e983fccbd56115/jiter-0.9.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3c7adb66f899ffa25e3c92bfcb593391ee1947dbdd6a9a970e0d7e713237d572", size = 396625 }, + { url = "https://files.pythonhosted.org/packages/be/bd/976b458add04271ebb5a255e992bd008546ea04bb4dcadc042a16279b4b4/jiter-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c98d27330fdfb77913c1097a7aab07f38ff2259048949f499c9901700789ac15", size = 351832 }, + { url = "https://files.pythonhosted.org/packages/07/51/fe59e307aaebec9265dbad44d9d4381d030947e47b0f23531579b9a7c2df/jiter-0.9.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:eda3f8cc74df66892b1d06b5d41a71670c22d95a1ca2cbab73654745ce9d0419", size = 384590 }, + { url = "https://files.pythonhosted.org/packages/db/55/5dcd2693794d8e6f4889389ff66ef3be557a77f8aeeca8973a97a7c00557/jiter-0.9.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:dd5ab5ddc11418dce28343123644a100f487eaccf1de27a459ab36d6cca31043", size = 520690 }, + { url = "https://files.pythonhosted.org/packages/54/d5/9f51dc90985e9eb251fbbb747ab2b13b26601f16c595a7b8baba964043bd/jiter-0.9.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:42f8a68a69f047b310319ef8e2f52fdb2e7976fb3313ef27df495cf77bcad965", size = 512649 }, + { url = 
"https://files.pythonhosted.org/packages/a6/e5/4e385945179bcf128fa10ad8dca9053d717cbe09e258110e39045c881fe5/jiter-0.9.0-cp311-cp311-win32.whl", hash = "sha256:a25519efb78a42254d59326ee417d6f5161b06f5da827d94cf521fed961b1ff2", size = 206920 }, + { url = "https://files.pythonhosted.org/packages/4c/47/5e0b94c603d8e54dd1faab439b40b832c277d3b90743e7835879ab663757/jiter-0.9.0-cp311-cp311-win_amd64.whl", hash = "sha256:923b54afdd697dfd00d368b7ccad008cccfeb1efb4e621f32860c75e9f25edbd", size = 210119 }, + { url = "https://files.pythonhosted.org/packages/af/d7/c55086103d6f29b694ec79156242304adf521577530d9031317ce5338c59/jiter-0.9.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:7b46249cfd6c48da28f89eb0be3f52d6fdb40ab88e2c66804f546674e539ec11", size = 309203 }, + { url = "https://files.pythonhosted.org/packages/b0/01/f775dfee50beb420adfd6baf58d1c4d437de41c9b666ddf127c065e5a488/jiter-0.9.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:609cf3c78852f1189894383cf0b0b977665f54cb38788e3e6b941fa6d982c00e", size = 319678 }, + { url = "https://files.pythonhosted.org/packages/ab/b8/09b73a793714726893e5d46d5c534a63709261af3d24444ad07885ce87cb/jiter-0.9.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d726a3890a54561e55a9c5faea1f7655eda7f105bd165067575ace6e65f80bb2", size = 341816 }, + { url = "https://files.pythonhosted.org/packages/35/6f/b8f89ec5398b2b0d344257138182cc090302854ed63ed9c9051e9c673441/jiter-0.9.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2e89dc075c1fef8fa9be219e249f14040270dbc507df4215c324a1839522ea75", size = 364152 }, + { url = "https://files.pythonhosted.org/packages/9b/ca/978cc3183113b8e4484cc7e210a9ad3c6614396e7abd5407ea8aa1458eef/jiter-0.9.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:04e8ffa3c353b1bc4134f96f167a2082494351e42888dfcf06e944f2729cbe1d", size = 406991 }, + { url = "https://files.pythonhosted.org/packages/13/3a/72861883e11a36d6aa314b4922125f6ae90bdccc225cd96d24cc78a66385/jiter-0.9.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:203f28a72a05ae0e129b3ed1f75f56bc419d5f91dfacd057519a8bd137b00c42", size = 395824 }, + { url = "https://files.pythonhosted.org/packages/87/67/22728a86ef53589c3720225778f7c5fdb617080e3deaed58b04789418212/jiter-0.9.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fca1a02ad60ec30bb230f65bc01f611c8608b02d269f998bc29cca8619a919dc", size = 351318 }, + { url = "https://files.pythonhosted.org/packages/69/b9/f39728e2e2007276806d7a6609cda7fac44ffa28ca0d02c49a4f397cc0d9/jiter-0.9.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:237e5cee4d5d2659aaf91bbf8ec45052cc217d9446070699441a91b386ae27dc", size = 384591 }, + { url = "https://files.pythonhosted.org/packages/eb/8f/8a708bc7fd87b8a5d861f1c118a995eccbe6d672fe10c9753e67362d0dd0/jiter-0.9.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:528b6b71745e7326eed73c53d4aa57e2a522242320b6f7d65b9c5af83cf49b6e", size = 520746 }, + { url = "https://files.pythonhosted.org/packages/95/1e/65680c7488bd2365dbd2980adaf63c562d3d41d3faac192ebc7ef5b4ae25/jiter-0.9.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:9f48e86b57bc711eb5acdfd12b6cb580a59cc9a993f6e7dcb6d8b50522dcd50d", size = 512754 }, + { url = "https://files.pythonhosted.org/packages/78/f3/fdc43547a9ee6e93c837685da704fb6da7dba311fc022e2766d5277dfde5/jiter-0.9.0-cp312-cp312-win32.whl", hash = "sha256:699edfde481e191d81f9cf6d2211debbfe4bd92f06410e7637dffb8dd5dfde06", size = 207075 
}, + { url = "https://files.pythonhosted.org/packages/cd/9d/742b289016d155f49028fe1bfbeb935c9bf0ffeefdf77daf4a63a42bb72b/jiter-0.9.0-cp312-cp312-win_amd64.whl", hash = "sha256:099500d07b43f61d8bd780466d429c45a7b25411b334c60ca875fa775f68ccb0", size = 207999 }, +] + [[package]] name = "multidict" version = "6.1.0" @@ -411,6 +516,25 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/17/7f/d322a4125405920401450118dbdc52e0384026bd669939484670ce8b2ab9/numpy-2.2.3-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:783145835458e60fa97afac25d511d00a1eca94d4a8f3ace9fe2043003c678e4", size = 12839607 }, ] +[[package]] +name = "openai" +version = "1.66.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "anyio" }, + { name = "distro" }, + { name = "httpx" }, + { name = "jiter" }, + { name = "pydantic" }, + { name = "sniffio" }, + { name = "tqdm" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a3/77/5172104ca1df35ed2ed8fb26dbc787f721c39498fc51d666c4db07756a0c/openai-1.66.3.tar.gz", hash = "sha256:8dde3aebe2d081258d4159c4cb27bdc13b5bb3f7ea2201d9bd940b9a89faf0c9", size = 397244 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/78/5a/e20182f7b6171642d759c548daa0ba20a1d3ac10d2bd0a13fd75704a9ac3/openai-1.66.3-py3-none-any.whl", hash = "sha256:a427c920f727711877ab17c11b95f1230b27767ba7a01e5b66102945141ceca9", size = 567400 }, +] + [[package]] name = "packaging" version = "24.2" @@ -420,6 +544,54 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/88/ef/eb23f262cca3c0c4eb7ab1933c3b1f03d021f2c48f54763065b6f0e321be/packaging-24.2-py3-none-any.whl", hash = "sha256:09abb1bccd265c01f4a3aa3f7a7db064b36514d2cba19a2f694fe6150451a759", size = 65451 }, ] +[[package]] +name = "pillow" +version = "11.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f3/af/c097e544e7bd278333db77933e535098c259609c4eb3b85381109602fb5b/pillow-11.1.0.tar.gz", hash = "sha256:368da70808b36d73b4b390a8ffac11069f8a5c85f29eff1f1b01bcf3ef5b2a20", size = 46742715 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/50/1c/2dcea34ac3d7bc96a1fd1bd0a6e06a57c67167fec2cff8d95d88229a8817/pillow-11.1.0-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:e1abe69aca89514737465752b4bcaf8016de61b3be1397a8fc260ba33321b3a8", size = 3229983 }, + { url = "https://files.pythonhosted.org/packages/14/ca/6bec3df25e4c88432681de94a3531cc738bd85dea6c7aa6ab6f81ad8bd11/pillow-11.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c640e5a06869c75994624551f45e5506e4256562ead981cce820d5ab39ae2192", size = 3101831 }, + { url = "https://files.pythonhosted.org/packages/d4/2c/668e18e5521e46eb9667b09e501d8e07049eb5bfe39d56be0724a43117e6/pillow-11.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a07dba04c5e22824816b2615ad7a7484432d7f540e6fa86af60d2de57b0fcee2", size = 4314074 }, + { url = "https://files.pythonhosted.org/packages/02/80/79f99b714f0fc25f6a8499ecfd1f810df12aec170ea1e32a4f75746051ce/pillow-11.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e267b0ed063341f3e60acd25c05200df4193e15a4a5807075cd71225a2386e26", size = 4394933 }, + { url = "https://files.pythonhosted.org/packages/81/aa/8d4ad25dc11fd10a2001d5b8a80fdc0e564ac33b293bdfe04ed387e0fd95/pillow-11.1.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:bd165131fd51697e22421d0e467997ad31621b74bfc0b75956608cb2906dda07", size = 4353349 }, + { url = 
"https://files.pythonhosted.org/packages/84/7a/cd0c3eaf4a28cb2a74bdd19129f7726277a7f30c4f8424cd27a62987d864/pillow-11.1.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:abc56501c3fd148d60659aae0af6ddc149660469082859fa7b066a298bde9482", size = 4476532 }, + { url = "https://files.pythonhosted.org/packages/8f/8b/a907fdd3ae8f01c7670dfb1499c53c28e217c338b47a813af8d815e7ce97/pillow-11.1.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:54ce1c9a16a9561b6d6d8cb30089ab1e5eb66918cb47d457bd996ef34182922e", size = 4279789 }, + { url = "https://files.pythonhosted.org/packages/6f/9a/9f139d9e8cccd661c3efbf6898967a9a337eb2e9be2b454ba0a09533100d/pillow-11.1.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:73ddde795ee9b06257dac5ad42fcb07f3b9b813f8c1f7f870f402f4dc54b5269", size = 4413131 }, + { url = "https://files.pythonhosted.org/packages/a8/68/0d8d461f42a3f37432203c8e6df94da10ac8081b6d35af1c203bf3111088/pillow-11.1.0-cp310-cp310-win32.whl", hash = "sha256:3a5fe20a7b66e8135d7fd617b13272626a28278d0e578c98720d9ba4b2439d49", size = 2291213 }, + { url = "https://files.pythonhosted.org/packages/14/81/d0dff759a74ba87715509af9f6cb21fa21d93b02b3316ed43bda83664db9/pillow-11.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:b6123aa4a59d75f06e9dd3dac5bf8bc9aa383121bb3dd9a7a612e05eabc9961a", size = 2625725 }, + { url = "https://files.pythonhosted.org/packages/ce/1f/8d50c096a1d58ef0584ddc37e6f602828515219e9d2428e14ce50f5ecad1/pillow-11.1.0-cp310-cp310-win_arm64.whl", hash = "sha256:a76da0a31da6fcae4210aa94fd779c65c75786bc9af06289cd1c184451ef7a65", size = 2375213 }, + { url = "https://files.pythonhosted.org/packages/dd/d6/2000bfd8d5414fb70cbbe52c8332f2283ff30ed66a9cde42716c8ecbe22c/pillow-11.1.0-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:e06695e0326d05b06833b40b7ef477e475d0b1ba3a6d27da1bb48c23209bf457", size = 3229968 }, + { url = "https://files.pythonhosted.org/packages/d9/45/3fe487010dd9ce0a06adf9b8ff4f273cc0a44536e234b0fad3532a42c15b/pillow-11.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:96f82000e12f23e4f29346e42702b6ed9a2f2fea34a740dd5ffffcc8c539eb35", size = 3101806 }, + { url = "https://files.pythonhosted.org/packages/e3/72/776b3629c47d9d5f1c160113158a7a7ad177688d3a1159cd3b62ded5a33a/pillow-11.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a3cd561ded2cf2bbae44d4605837221b987c216cff94f49dfeed63488bb228d2", size = 4322283 }, + { url = "https://files.pythonhosted.org/packages/e4/c2/e25199e7e4e71d64eeb869f5b72c7ddec70e0a87926398785ab944d92375/pillow-11.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f189805c8be5ca5add39e6f899e6ce2ed824e65fb45f3c28cb2841911da19070", size = 4402945 }, + { url = "https://files.pythonhosted.org/packages/c1/ed/51d6136c9d5911f78632b1b86c45241c712c5a80ed7fa7f9120a5dff1eba/pillow-11.1.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:dd0052e9db3474df30433f83a71b9b23bd9e4ef1de13d92df21a52c0303b8ab6", size = 4361228 }, + { url = "https://files.pythonhosted.org/packages/48/a4/fbfe9d5581d7b111b28f1d8c2762dee92e9821bb209af9fa83c940e507a0/pillow-11.1.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:837060a8599b8f5d402e97197d4924f05a2e0d68756998345c829c33186217b1", size = 4484021 }, + { url = "https://files.pythonhosted.org/packages/39/db/0b3c1a5018117f3c1d4df671fb8e47d08937f27519e8614bbe86153b65a5/pillow-11.1.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:aa8dd43daa836b9a8128dbe7d923423e5ad86f50a7a14dc688194b7be5c0dea2", size = 4287449 }, + { url = 
"https://files.pythonhosted.org/packages/d9/58/bc128da7fea8c89fc85e09f773c4901e95b5936000e6f303222490c052f3/pillow-11.1.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:0a2f91f8a8b367e7a57c6e91cd25af510168091fb89ec5146003e424e1558a96", size = 4419972 }, + { url = "https://files.pythonhosted.org/packages/5f/bb/58f34379bde9fe197f51841c5bbe8830c28bbb6d3801f16a83b8f2ad37df/pillow-11.1.0-cp311-cp311-win32.whl", hash = "sha256:c12fc111ef090845de2bb15009372175d76ac99969bdf31e2ce9b42e4b8cd88f", size = 2291201 }, + { url = "https://files.pythonhosted.org/packages/3a/c6/fce9255272bcf0c39e15abd2f8fd8429a954cf344469eaceb9d0d1366913/pillow-11.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:fbd43429d0d7ed6533b25fc993861b8fd512c42d04514a0dd6337fb3ccf22761", size = 2625686 }, + { url = "https://files.pythonhosted.org/packages/c8/52/8ba066d569d932365509054859f74f2a9abee273edcef5cd75e4bc3e831e/pillow-11.1.0-cp311-cp311-win_arm64.whl", hash = "sha256:f7955ecf5609dee9442cbface754f2c6e541d9e6eda87fad7f7a989b0bdb9d71", size = 2375194 }, + { url = "https://files.pythonhosted.org/packages/95/20/9ce6ed62c91c073fcaa23d216e68289e19d95fb8188b9fb7a63d36771db8/pillow-11.1.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:2062ffb1d36544d42fcaa277b069c88b01bb7298f4efa06731a7fd6cc290b81a", size = 3226818 }, + { url = "https://files.pythonhosted.org/packages/b9/d8/f6004d98579a2596c098d1e30d10b248798cceff82d2b77aa914875bfea1/pillow-11.1.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:a85b653980faad27e88b141348707ceeef8a1186f75ecc600c395dcac19f385b", size = 3101662 }, + { url = "https://files.pythonhosted.org/packages/08/d9/892e705f90051c7a2574d9f24579c9e100c828700d78a63239676f960b74/pillow-11.1.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9409c080586d1f683df3f184f20e36fb647f2e0bc3988094d4fd8c9f4eb1b3b3", size = 4329317 }, + { url = "https://files.pythonhosted.org/packages/8c/aa/7f29711f26680eab0bcd3ecdd6d23ed6bce180d82e3f6380fb7ae35fcf3b/pillow-11.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7fdadc077553621911f27ce206ffcbec7d3f8d7b50e0da39f10997e8e2bb7f6a", size = 4412999 }, + { url = "https://files.pythonhosted.org/packages/c8/c4/8f0fe3b9e0f7196f6d0bbb151f9fba323d72a41da068610c4c960b16632a/pillow-11.1.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:93a18841d09bcdd774dcdc308e4537e1f867b3dec059c131fde0327899734aa1", size = 4368819 }, + { url = "https://files.pythonhosted.org/packages/38/0d/84200ed6a871ce386ddc82904bfadc0c6b28b0c0ec78176871a4679e40b3/pillow-11.1.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:9aa9aeddeed452b2f616ff5507459e7bab436916ccb10961c4a382cd3e03f47f", size = 4496081 }, + { url = "https://files.pythonhosted.org/packages/84/9c/9bcd66f714d7e25b64118e3952d52841a4babc6d97b6d28e2261c52045d4/pillow-11.1.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:3cdcdb0b896e981678eee140d882b70092dac83ac1cdf6b3a60e2216a73f2b91", size = 4296513 }, + { url = "https://files.pythonhosted.org/packages/db/61/ada2a226e22da011b45f7104c95ebda1b63dcbb0c378ad0f7c2a710f8fd2/pillow-11.1.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:36ba10b9cb413e7c7dfa3e189aba252deee0602c86c309799da5a74009ac7a1c", size = 4431298 }, + { url = "https://files.pythonhosted.org/packages/e7/c4/fc6e86750523f367923522014b821c11ebc5ad402e659d8c9d09b3c9d70c/pillow-11.1.0-cp312-cp312-win32.whl", hash = "sha256:cfd5cd998c2e36a862d0e27b2df63237e67273f2fc78f47445b14e73a810e7e6", size = 2291630 }, + { url = 
"https://files.pythonhosted.org/packages/08/5c/2104299949b9d504baf3f4d35f73dbd14ef31bbd1ddc2c1b66a5b7dfda44/pillow-11.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:a697cd8ba0383bba3d2d3ada02b34ed268cb548b369943cd349007730c92bddf", size = 2626369 }, + { url = "https://files.pythonhosted.org/packages/37/f3/9b18362206b244167c958984b57c7f70a0289bfb59a530dd8af5f699b910/pillow-11.1.0-cp312-cp312-win_arm64.whl", hash = "sha256:4dd43a78897793f60766563969442020e90eb7847463eca901e41ba186a7d4a5", size = 2375240 }, + { url = "https://files.pythonhosted.org/packages/fa/c5/389961578fb677b8b3244fcd934f720ed25a148b9a5cc81c91bdf59d8588/pillow-11.1.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:8c730dc3a83e5ac137fbc92dfcfe1511ce3b2b5d7578315b63dbbb76f7f51d90", size = 3198345 }, + { url = "https://files.pythonhosted.org/packages/c4/fa/803c0e50ffee74d4b965229e816af55276eac1d5806712de86f9371858fd/pillow-11.1.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:7d33d2fae0e8b170b6a6c57400e077412240f6f5bb2a342cf1ee512a787942bb", size = 3072938 }, + { url = "https://files.pythonhosted.org/packages/dc/67/2a3a5f8012b5d8c63fe53958ba906c1b1d0482ebed5618057ef4d22f8076/pillow-11.1.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a8d65b38173085f24bc07f8b6c505cbb7418009fa1a1fcb111b1f4961814a442", size = 3400049 }, + { url = "https://files.pythonhosted.org/packages/e5/a0/514f0d317446c98c478d1872497eb92e7cde67003fed74f696441e647446/pillow-11.1.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:015c6e863faa4779251436db398ae75051469f7c903b043a48f078e437656f83", size = 3422431 }, + { url = "https://files.pythonhosted.org/packages/cd/00/20f40a935514037b7d3f87adfc87d2c538430ea625b63b3af8c3f5578e72/pillow-11.1.0-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:d44ff19eea13ae4acdaaab0179fa68c0c6f2f45d66a4d8ec1eda7d6cecbcc15f", size = 3446208 }, + { url = "https://files.pythonhosted.org/packages/28/3c/7de681727963043e093c72e6c3348411b0185eab3263100d4490234ba2f6/pillow-11.1.0-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:d3d8da4a631471dfaf94c10c85f5277b1f8e42ac42bade1ac67da4b4a7359b73", size = 3509746 }, + { url = "https://files.pythonhosted.org/packages/41/67/936f9814bdd74b2dfd4822f1f7725ab5d8ff4103919a1664eb4874c58b2f/pillow-11.1.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:4637b88343166249fe8aa94e7c4a62a180c4b3898283bb5d3d2fd5fe10d8e4e0", size = 2626353 }, +] + [[package]] name = "pluggy" version = "1.5.0" @@ -656,6 +828,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f9/9b/335f9764261e915ed497fcdeb11df5dfd6f7bf257d4a6a2a686d80da4d54/requests-2.32.3-py3-none-any.whl", hash = "sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6", size = 64928 }, ] +[[package]] +name = "sniffio" +version = "1.3.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a2/87/a6771e1546d97e7e041b6ae58d80074f81b7d5121207425c964ddf5cfdbd/sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc", size = 20372 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235 }, +] + [[package]] name = "syrupy" version = "4.8.1" @@ -688,7 +869,10 @@ version = "2.0.1" source = { virtual = "." 
} dependencies = [ { name = "docker" }, + { name = "huggingface-hub" }, { name = "numpy" }, + { name = "openai" }, + { name = "pillow" }, { name = "pydantic" }, { name = "pytest" }, { name = "pytest-asyncio" }, @@ -699,7 +883,10 @@ dependencies = [ [package.metadata] requires-dist = [ { name = "docker", specifier = ">=7" }, + { name = "huggingface-hub", specifier = ">=0.29" }, { name = "numpy", specifier = ">=2.0" }, + { name = "openai", specifier = ">=1.65" }, + { name = "pillow", specifier = ">=11.1.0" }, { name = "pydantic", specifier = ">2,<3" }, { name = "pytest", specifier = ">=8.3.0" }, { name = "pytest-asyncio", specifier = ">=0.23.1" }, @@ -741,7 +928,7 @@ name = "tqdm" version = "4.67.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "colorama", marker = "sys_platform == 'win32'" }, + { name = "colorama", marker = "platform_system == 'Windows'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/a8/4b/29b4ef32e036bb34e4ab51796dd745cdba7ed47ad142a9f4a1eb8e0c744d/tqdm-4.67.1.tar.gz", hash = "sha256:f8aef9c52c08c13a65f30ea34f4e5aac3fd1a34959879d7e59e63027286627f2", size = 169737 } wheels = [ diff --git a/launcher/src/main.rs b/launcher/src/main.rs index fde1472f0..e3abb843d 100644 --- a/launcher/src/main.rs +++ b/launcher/src/main.rs @@ -97,11 +97,10 @@ fn get_config( let filename = if !path.exists() { // Assume it's a hub id - let mut builder = if let Ok(token) = std::env::var("HF_TOKEN") { + let mut builder = ApiBuilder::from_env(); + if let Ok(token) = std::env::var("HF_TOKEN") { // env variable has precedence over on file token. - ApiBuilder::new().with_token(Some(token)) - } else { - ApiBuilder::new() + builder = builder.with_token(Some(token)) }; if let Ok(origin) = env::var("HF_HUB_USER_AGENT_ORIGIN") { builder = builder.with_user_agent("origin", origin.as_str()); diff --git a/router/src/server.rs b/router/src/server.rs index 0346b1f19..45d2b9f3c 100644 --- a/router/src/server.rs +++ b/router/src/server.rs @@ -1522,7 +1522,7 @@ pub async fn run( // Shared API builder initialization let api_builder = || { - let mut builder = ApiBuilder::new().with_progress(false); + let mut builder = ApiBuilder::from_env().with_progress(false); if let Some(token) = authorization_token { builder = builder.with_token(Some(token)); } diff --git a/router/src/validation.rs b/router/src/validation.rs index 87b28eb74..1119347dc 100644 --- a/router/src/validation.rs +++ b/router/src/validation.rs @@ -699,7 +699,7 @@ fn image_tokens( // TODO: prefer using the config to determine the number of features let num_mm_soft_tokens_per_image = 256; format!( - "\n\n{:?}\n\n", + "\n\n{}\n\n", "".repeat(num_mm_soft_tokens_per_image) ) } diff --git a/server/text_generation_server/adapters/lora.py b/server/text_generation_server/adapters/lora.py index cdcfe91b1..782d66e4e 100644 --- a/server/text_generation_server/adapters/lora.py +++ b/server/text_generation_server/adapters/lora.py @@ -205,7 +205,6 @@ class LoraWeights(AdapterWeights): lora_a_list = [None] * nlayers lora_b_list = [None] * nlayers - # import ipdb; ipdb.set_trace() for layer_id in range(nlayers): key = (layer_id, layer_type) if key not in target_to_layer: diff --git a/server/text_generation_server/layers/attention/cuda.py b/server/text_generation_server/layers/attention/cuda.py index 4f25cc192..fb50dda68 100644 --- a/server/text_generation_server/layers/attention/cuda.py +++ b/server/text_generation_server/layers/attention/cuda.py @@ -38,6 +38,7 @@ def paged_attention( *, kv_scales: KVScales, softcap: 
Optional[float] = None, + window_size_left: Optional[int] = -1, ): # Adapted from: https://github.com/vllm-project/vllm/blob/f8a1e39fae05ca610be8d5a78be9d40f5274e5fc/vllm/model_executor/layers/attention.py # Copyright 2023 The vLLM team. All rights @@ -79,12 +80,15 @@ def paged_attention( sm_scale=softmax_scale, k_scale=kv_scales.key_scale_cpu if can_scale else 1.0, v_scale=kv_scales.value_scale_cpu if can_scale else 1.0, + window_left=window_size_left, ) elif ATTENTION == "flashdecoding": max_q = 1 max_k = max_s import flash_attn_2_cuda + window_size_right = -1 if window_size_left == -1 else 0 + # TODO fixme when flash contains the fix. # Number of splits is not correctly handled # by the current path @@ -109,8 +113,8 @@ def paged_attention( softmax_scale, False, # zero_tensors True, # causal - -1, # Window_left - -1, # Window right + window_size_left, # Window_left + window_size_right, # Window right softcap, False, # return softmax None, # generator @@ -253,6 +257,7 @@ def attention( sm_scale=softmax_scale, k_scale=kv_scales.key_scale_cpu if can_scale else 1.0, v_scale=kv_scales.value_scale_cpu if can_scale else 1.0, + window_left=window_size_left, ) # If we are using flashdecoding or paged, we always use flash-attn for diff --git a/server/text_generation_server/layers/attention/flashinfer.py b/server/text_generation_server/layers/attention/flashinfer.py index d23451844..9479b6067 100644 --- a/server/text_generation_server/layers/attention/flashinfer.py +++ b/server/text_generation_server/layers/attention/flashinfer.py @@ -52,7 +52,6 @@ def use_prefill_with_paged_kv_state( page_size: int, kv_dtype: torch.dtype, q_dtype: torch.dtype, - window_left: int, ): """ Context manager to set the active flashinfer prefill state to the given @@ -95,7 +94,6 @@ def use_prefill_with_paged_kv_state( kv_data_type=kv_dtype, q_data_type=q_dtype, page_size=page_size, - window_left=-1 if window_left is None else window_left, ) yield finally: @@ -172,7 +170,6 @@ def use_decode_state( page_size: int, kv_cache_dtype: torch.dtype, q_dtype: torch.dtype, - window_left: int, ): """ Context manager to set the active flashinfer decoding state to the given @@ -209,7 +206,6 @@ def use_decode_state( page_size=page_size, data_type=kv_cache_dtype, q_data_type=q_dtype, - window_left=-1 if window_left is None else window_left, ) yield finally: diff --git a/server/text_generation_server/layers/attention/ipex.py b/server/text_generation_server/layers/attention/ipex.py index 54422308f..2b89060e9 100644 --- a/server/text_generation_server/layers/attention/ipex.py +++ b/server/text_generation_server/layers/attention/ipex.py @@ -78,6 +78,7 @@ def paged_attention( *, kv_scales: KVScales, softcap: Optional[float] = None, + window_size_left: Optional[int] = -1, ): if softcap is not None: raise NotImplementedError("softcap is not available in IPEX") diff --git a/server/text_generation_server/layers/attention/rocm.py b/server/text_generation_server/layers/attention/rocm.py index 65f3ea414..518e55eee 100644 --- a/server/text_generation_server/layers/attention/rocm.py +++ b/server/text_generation_server/layers/attention/rocm.py @@ -59,6 +59,7 @@ def paged_attention( *, kv_scales: KVScales, softcap: Optional[float] = None, + window_size_left: Optional[int] = -1, ): # Adapted from: https://github.com/vllm-project/vllm/blob/f8a1e39fae05ca610be8d5a78be9d40f5274e5fc/vllm/model_executor/layers/attention.py # Copyright 2023 The vLLM team. 
All rights @@ -82,6 +83,8 @@ def paged_attention( max_k = max_s import flash_attn_2_cuda + window_size_right = -1 if window_size_left == -1 else 0 + if softcap is None: softcap = 0.0 out = flash_attn_2_cuda.varlen_fwd( @@ -101,8 +104,8 @@ def paged_attention( softmax_scale, False, # zero_tensors True, # causal - -1, # Window_left - -1, # Window right + window_size_left, # Window_left + window_size_right, # Window right softcap, False, # return softmax None, # generator diff --git a/server/text_generation_server/models/__init__.py b/server/text_generation_server/models/__init__.py index 0fdc009ca..ab830b58b 100644 --- a/server/text_generation_server/models/__init__.py +++ b/server/text_generation_server/models/__init__.py @@ -272,12 +272,12 @@ class ModelType(enum.Enum): GEMMA3 = { "type": "gemma3", "name": "Gemma3", - "url": "https://huggingface.co/collections/google/gemma-3", + "url": "https://huggingface.co/collections/google/gemma-3-release-67c6c6f89c4f76621268bb6d", } GEMMA3_TEXT = { "type": "gemma3_text", "name": "Gemma3 Text", - "url": "https://huggingface.co/collections/google/gemma-3", + "url": "https://huggingface.co/collections/google/gemma-3-release-67c6c6f89c4f76621268bb6d", } COHERE = { "type": "cohere", diff --git a/server/text_generation_server/models/custom_modeling/flash_gemma2_modeling.py b/server/text_generation_server/models/custom_modeling/flash_gemma2_modeling.py index ebf1b80eb..2554bd269 100644 --- a/server/text_generation_server/models/custom_modeling/flash_gemma2_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_gemma2_modeling.py @@ -287,6 +287,7 @@ class FlashGemma2Attention(torch.nn.Module): max_s, softcap=self.softcap, kv_scales=self.kv_scales, + window_size_left=self.window_size, ) return self.o_proj( diff --git a/server/text_generation_server/models/custom_modeling/flash_gemma3_modeling.py b/server/text_generation_server/models/custom_modeling/flash_gemma3_modeling.py index 085f57ef1..70fe9a3db 100644 --- a/server/text_generation_server/models/custom_modeling/flash_gemma3_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_gemma3_modeling.py @@ -281,22 +281,12 @@ class FlashGemma3Attention(torch.nn.Module): padded_query = padded_query.transpose(1, 2).contiguous() padded_key = padded_key.transpose(1, 2).contiguous() padded_value = padded_value.transpose(1, 2).contiguous() - zeros_to_add = torch.zeros( - padded_key.size(0), - self.num_key_value_heads, - 1, - self.head_size, - dtype=padded_key.dtype, - device=padded_key.device, - ) - key_states = torch.cat([padded_key, zeros_to_add], dim=2) - value_states = torch.cat([padded_value, zeros_to_add], dim=2) # Compute attention attn_output = F.scaled_dot_product_attention( padded_query, - key_states, - value_states, + padded_key, + padded_value, attn_mask=attention_mask, scale=self.softmax_scale, enable_gqa=self.enable_gqa, @@ -327,6 +317,7 @@ class FlashGemma3Attention(torch.nn.Module): max_s, softcap=self.softcap, kv_scales=self.kv_scales, + window_size_left=self.window_size, ) return self.o_proj( @@ -513,6 +504,7 @@ class FlashGemma3Model(torch.nn.Module): max_s: int, adapter_data: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, + attention_mask_local: Optional[torch.Tensor] = None, ) -> torch.Tensor: hidden_states = inputs_embeds @@ -525,25 +517,6 @@ class FlashGemma3Model(torch.nn.Module): position_ids, max_s, hidden_states.dtype ) - # apply sliding window mask if needed - if layer.self_attn.window_size > 0 and attention_mask is 
not None: - min_dtype = torch.finfo(hidden_states.dtype).min - # prefill may be larger than sliding window - effective_seq_len = max( - position_ids.shape[0], self.layers[i].self_attn.window_size - ) - sliding_window_mask = torch.tril( - torch.ones_like(attention_mask, dtype=torch.bool), - diagonal=-self.layers[i].self_attn.window_size, - ) - attention_mask = torch.where( - sliding_window_mask, min_dtype, attention_mask - ) - offset = max(0, position_ids.shape[0] - effective_seq_len) - attention_mask = attention_mask[ - :, :, offset : offset + effective_seq_len - ] - hidden_states, residual = layer( hidden_states, residual, @@ -556,7 +529,11 @@ class FlashGemma3Model(torch.nn.Module): seqlen, max_s, adapter_data, - attention_mask, + ( + attention_mask + if self.layers[i].self_attn.window_size == -1 + else attention_mask_local + ), ) hidden_states, _ = self.norm(hidden_states, residual) @@ -723,24 +700,6 @@ class Gemma3ForConditionalGeneration(nn.Module): config.pad_token_id if config.pad_token_id is not None else -1 ) - def get_image_token_mask(self, input_ids): - device = input_ids.device - - start_token_id = self.config.boi_token_index - K = self.config.mm_tokens_per_image - - mask = torch.zeros_like(input_ids, dtype=torch.bool, device=device) - start_positions = (input_ids == start_token_id).nonzero(as_tuple=True)[0] - mask_indices = start_positions.unsqueeze(1) + torch.arange( - 1, K + 1, device=device - ).unsqueeze(0) - - valid_mask = mask_indices < input_ids.size(0) - mask_indices = mask_indices[valid_mask] - mask[mask_indices] = True - - return mask - def get_attention_mask( self, input_ids, max_s, cu_seqlen_prefill, dtype, image_token_mask ): @@ -751,7 +710,7 @@ class Gemma3ForConditionalGeneration(nn.Module): batch_size = len(lengths) sequence_length = max(lengths) - target_length = max_s + target_length = sequence_length # Create the padding mask from the computed lengths. # pad_mask: [batch, sequence_length] where True indicates valid tokens. seq_range = torch.arange(sequence_length, device=device).unsqueeze(0) @@ -847,7 +806,7 @@ class Gemma3ForConditionalGeneration(nn.Module): # # Determine the maximum sequence length (after padding) from query. # sequence_length = max(lengths) - # target_length = max_s + # target_length = sequence_length # # Create the padding mask from the computed lengths. # # pad_mask: [batch, sequence_length] where True indicates valid tokens. 
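This patch reworks Gemma3's sliding-window ("local") attention: the per-layer mask construction removed above is replaced by a single precomputed `attention_mask_local` (added in the next hunk), and each decoder layer then simply picks the global or local mask based on its `window_size`. Below is a minimal, standalone sketch of that masking step; the `[batch, heads, q_len, kv_len]` mask shape and the toy `sliding_window` value are illustrative assumptions, not values taken from the Gemma3 config.

```python
import torch


def make_local_attention_mask(
    attention_mask: torch.Tensor, sliding_window: int
) -> torch.Tensor:
    """Derive a sliding-window ("local") mask from a dense additive mask.

    `attention_mask` is assumed to be [batch, heads, q_len, kv_len] with 0 for
    visible positions and the dtype minimum for masked ones, mirroring the
    hunk that precomputes `attention_mask_local`.
    """
    min_dtype = torch.finfo(attention_mask.dtype).min
    q_len = attention_mask.shape[-2]
    # Prefill may be longer than the sliding window.
    effective_seq_len = max(q_len, sliding_window)
    # True wherever the key is more than `sliding_window` tokens behind the query.
    out_of_window = torch.tril(
        torch.ones_like(attention_mask, dtype=torch.bool),
        diagonal=-sliding_window,
    )
    local_mask = torch.where(out_of_window, min_dtype, attention_mask)
    # Keep only the last `effective_seq_len` key positions.
    offset = max(0, q_len - effective_seq_len)
    return local_mask[:, :, :, offset : offset + effective_seq_len]


if __name__ == "__main__":
    # Toy example: 8 query tokens, window of 4 (illustrative numbers only).
    q_len, sliding_window = 8, 4
    neg_inf = torch.finfo(torch.float32).min
    causal = torch.triu(torch.full((1, 1, q_len, q_len), neg_inf), diagonal=1)
    local = make_local_attention_mask(causal, sliding_window)
    print(local[0, 0])  # row i can only attend to keys i-3 ... i
```

Precomputing the local mask once per forward pass, as the patch does, avoids rebuilding it inside every decoder layer, which is what the removed per-layer branch used to do.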
@@ -885,6 +844,26 @@ class Gemma3ForConditionalGeneration(nn.Module): # input_ids.device # ) + if attention_mask is not None: + min_dtype = torch.finfo(inputs_embeds.dtype).min + # prefill may be larger than sliding window + effective_seq_len = max( + position_ids.shape[0], self.config.text_config.sliding_window + ) + sliding_window_mask = torch.tril( + torch.ones_like(attention_mask, dtype=torch.bool), + diagonal=-self.config.text_config.sliding_window, + ) + attention_mask_local = torch.where( + sliding_window_mask, min_dtype, attention_mask + ) + offset = max(0, position_ids.shape[0] - effective_seq_len) + attention_mask_local = attention_mask_local[ + :, :, :, offset : offset + effective_seq_len + ] + else: + attention_mask_local = None + hidden_states = self.text_model.model( inputs_embeds=inputs_embeds, position_ids=position_ids, @@ -895,6 +874,7 @@ class Gemma3ForConditionalGeneration(nn.Module): seqlen=seqlen, max_s=max_s, attention_mask=attention_mask, + attention_mask_local=attention_mask_local, ) if lm_head_indices is not None: diff --git a/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py b/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py index 0fa172d03..7ad294f4b 100644 --- a/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py @@ -242,6 +242,7 @@ class MistralAttention(torch.nn.Module): seqlen, max_s, kv_scales=self.kv_scales, + window_size_left=self.max_past, ) return self.o_proj( diff --git a/server/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py b/server/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py index a45dd1e61..e2a3e5860 100644 --- a/server/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py @@ -290,6 +290,7 @@ class MixtralAttention(torch.nn.Module): seqlen, max_s, kv_scales=self.kv_scales, + window_size_left=self.max_past, ) return self.o_proj(attn_output.view(-1, self.num_heads * self.head_size)) diff --git a/server/text_generation_server/models/custom_modeling/flash_pali_gemma_modeling.py b/server/text_generation_server/models/custom_modeling/flash_pali_gemma_modeling.py index 4ea604510..b1f89eff4 100644 --- a/server/text_generation_server/models/custom_modeling/flash_pali_gemma_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_pali_gemma_modeling.py @@ -31,7 +31,7 @@ class PaliGemmaForConditionalGeneration(nn.Module): super().__init__() config.vision_config.quantize = config.quantize self.vision_tower = load_vision_model( - prefix="vision_model" if not prefix else f"{prefix}.vision_model", + prefix="vision_tower" if not prefix else f"{prefix}.vision_tower", config=config.vision_config, weights=weights, ) diff --git a/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py b/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py index 9d9562222..f5e4e15ce 100644 --- a/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py @@ -74,7 +74,7 @@ class Qwen2Attention(torch.nn.Module): weights, ): super().__init__() - self.max_past = ( + self.window_size = ( config.sliding_window if config.sliding_window is not None else -1 ) self.num_heads = config.num_attention_heads @@ -172,7 
+172,7 @@ class Qwen2Attention(torch.nn.Module): seqlen=seqlen, block_tables=block_tables, softmax_scale=self.softmax_scale, - window_size_left=self.max_past, + window_size_left=self.window_size, ) # Decode else: @@ -185,6 +185,7 @@ class Qwen2Attention(torch.nn.Module): seqlen, max_s, kv_scales=self.kv_scales, + window_size_left=self.window_size, ) return self.o_proj( @@ -405,10 +406,10 @@ class Qwen2ForCausalLM(torch.nn.Module): weights=weights, ) - self.max_past = config.sliding_window - self.max_past_tensor = ( + self.window_size = config.sliding_window + self.window_size_tensor = ( torch.tensor(config.sliding_window, device=weights.device) - if self.max_past is not None + if self.window_size is not None else None ) @@ -430,10 +431,10 @@ class Qwen2ForCausalLM(torch.nn.Module): if prefill_cache_indices is not None: # Slots also need to be sliced as it has the same size as the whole kv tensor slots = slots[prefill_cache_indices] - elif self.max_past is not None: + elif self.window_size is not None: # Clamp in decode mode as paged attention requires clamped values whereas the flash attention # kernel requires the true values - seqlen = seqlen.clamp(max=self.max_past_tensor) + seqlen = seqlen.clamp(max=self.window_size_tensor) inputs_embeds = self.embed_tokens(input_ids) diff --git a/server/text_generation_server/models/custom_modeling/flash_starcoder2_modeling.py b/server/text_generation_server/models/custom_modeling/flash_starcoder2_modeling.py index 5e090369b..9508cc4f8 100644 --- a/server/text_generation_server/models/custom_modeling/flash_starcoder2_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_starcoder2_modeling.py @@ -291,6 +291,7 @@ class Starcoder2Attention(torch.nn.Module): seqlen, max_s, kv_scales=self.kv_scales, + window_size_left=self.max_past, ) return self.o_proj( diff --git a/server/text_generation_server/models/custom_modeling/gemma3/image_processing_gemma3.py b/server/text_generation_server/models/custom_modeling/gemma3/image_processing_gemma3.py index 803d81ead..2972abeab 100644 --- a/server/text_generation_server/models/custom_modeling/gemma3/image_processing_gemma3.py +++ b/server/text_generation_server/models/custom_modeling/gemma3/image_processing_gemma3.py @@ -263,7 +263,7 @@ class Gemma3ImageProcessor(BaseImageProcessor): return_tensors: Optional[Union[str, TensorType]] = None, data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, input_data_format: Optional[Union[str, ChannelDimension]] = None, - do_convert_rgb: bool = None, + do_convert_rgb: bool = True, do_pan_and_scan: bool = None, pan_and_scan_min_crop_size: int = None, pan_and_scan_max_num_crops: int = None, diff --git a/server/text_generation_server/models/custom_modeling/gemma3/processing_gemma3.py b/server/text_generation_server/models/custom_modeling/gemma3/processing_gemma3.py index 08e39a7c6..6bdf35c63 100644 --- a/server/text_generation_server/models/custom_modeling/gemma3/processing_gemma3.py +++ b/server/text_generation_server/models/custom_modeling/gemma3/processing_gemma3.py @@ -82,7 +82,7 @@ class Gemma3Processor(ProcessorMixin): do_rescale=False, resample=PILImageResampling.BILINEAR, ) - # import ipdb; ipdb.set_trace() + self.image_token_id = tokenizer.image_token_id image_tokens_expanded = "".join( [tokenizer.image_token] * num_mm_soft_tokens_per_image @@ -91,8 +91,6 @@ class Gemma3Processor(ProcessorMixin): f"\n\n{tokenizer.boi_token}{image_tokens_expanded}{tokenizer.eoi_token}\n\n" ) - # import ipdb; ipdb.set_trace() - self.image_processor = 
image_processor self.tokenizer = tokenizer self.chat_template = chat_template diff --git a/server/text_generation_server/models/custom_modeling/qwen2_5_vl.py b/server/text_generation_server/models/custom_modeling/qwen2_5_vl.py index e317c5b56..066de6a20 100644 --- a/server/text_generation_server/models/custom_modeling/qwen2_5_vl.py +++ b/server/text_generation_server/models/custom_modeling/qwen2_5_vl.py @@ -633,7 +633,7 @@ class Qwen2_5VisionModel(nn.Module): config=config, weights=weights, ) - # import ipdb; ipdb.set_trace() + self.temporal_patch_size = config.temporal_patch_size self.spatial_patch_size = config.spatial_patch_size self.in_channels = config.in_channels diff --git a/server/text_generation_server/models/flash_causal_lm.py b/server/text_generation_server/models/flash_causal_lm.py index e268af8b4..d3a83e271 100644 --- a/server/text_generation_server/models/flash_causal_lm.py +++ b/server/text_generation_server/models/flash_causal_lm.py @@ -83,24 +83,11 @@ from text_generation_server.models.metadata_kernels import ( tracer = trace.get_tracer(__name__) -# Will be set in init -SLIDING_WINDOW: Optional[int] = None - def small_power_of_2(n: int): return 1 << ((n - 1).bit_length() - 1) -def set_sliding_window(sliding_window: int): - global SLIDING_WINDOW - SLIDING_WINDOW = sliding_window - - -def get_sliding_windows() -> int: - global SLIDING_WINDOW - return SLIDING_WINDOW - - def init_cpu_threads_env(rank_id: int, world_size: int): import importlib.util @@ -1002,10 +989,8 @@ class FlashCausalLMBatch(Batch): self.slot_indices, ) - sliding_window = get_sliding_windows() position_ids = [] slot_indices = [] - prefill_cache_indices = [] all_prefill_logprobs = True no_prefill_logprobs = True prefill_cu_outlens = [0] @@ -1064,14 +1049,6 @@ class FlashCausalLMBatch(Batch): # Update cumulative_slot_tokens += len(request_slots) - # Create tensor to slice into the kv tensor in prefill - if sliding_window is not None: - request_prefill_cache_indices = torch.arange( - cumulative_length + max(0, input_length - sliding_window), - cumulative_length + input_length, - dtype=torch.int64, - ) - # Prefill logprobs is ignored if the request is done prefilling prefill_logprobs = r.prefill_logprobs and request_prefilling @@ -1085,9 +1062,6 @@ class FlashCausalLMBatch(Batch): prefill_cu_outlens.append(prefill_out_cumulative_length + 1) prefill_out_cumulative_length += 1 - if sliding_window is not None: - prefill_cache_indices.append(request_prefill_cache_indices) - ADAPTER_TO_INDEX = get_adapter_to_index() if ADAPTER_TO_INDEX: adapter_index = ADAPTER_TO_INDEX.get(r.adapter_id, 0) @@ -1151,24 +1125,18 @@ class FlashCausalLMBatch(Batch): position_ids = torch.cat(position_ids) if slot_indices: slot_indices = torch.cat(slot_indices) - if sliding_window is not None: - prefill_cache_indices = torch.cat(prefill_cache_indices) else: if position_ids: position_ids = position_ids[0] if slot_indices: slot_indices = slot_indices[0] - if sliding_window is not None: - prefill_cache_indices = prefill_cache_indices[0] if not has_triton(): self.position_ids = position_ids.to(device) self.slot_indices = slot_indices.to(device) self.prefill_cu_outlens = prefill_cu_outlens - self.prefill_cache_indices = ( - prefill_cache_indices.to(device) if sliding_window is not None else None - ) + self.prefill_cache_indices = None if all_prefill_logprobs: prefill_head_indices = None @@ -1306,9 +1274,7 @@ class FlashCausalLM(Model): if text_config is not None: config = text_config - if getattr(config, "sliding_window", None) is not None: - 
set_sliding_window(config.sliding_window) - else: + if getattr(config, "sliding_window", None) is None: config.sliding_window = None self.num_layers = config.num_hidden_layers @@ -2500,7 +2466,6 @@ class FlashCausalLM(Model): page_size=BLOCK_SIZE, kv_dtype=self.kv_cache_dtype, q_dtype=self.dtype, - window_left=self.sliding_window, ) else: assert input_lengths_tensor is not None @@ -2514,5 +2479,4 @@ class FlashCausalLM(Model): page_size=BLOCK_SIZE, kv_cache_dtype=self.kv_cache_dtype, q_dtype=self.dtype, - window_left=self.sliding_window, ) diff --git a/server/text_generation_server/models/model.py b/server/text_generation_server/models/model.py index af4d1f082..da317a628 100644 --- a/server/text_generation_server/models/model.py +++ b/server/text_generation_server/models/model.py @@ -110,7 +110,7 @@ class Model(ABC): requires_padding=self.requires_padding, dtype=str(self.dtype), device_type=self.device.type, - window_size=self.sliding_window, + window_size=None, # Setting this parameter to None disabled the block logic with sliding window. speculate=self.speculate, support_chunking=self.support_chunking, use_prefix_caching=PREFIX_CACHING, From 11f2eec10e65c245c986ef8d5697f162ac8b3351 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Tue, 18 Mar 2025 12:58:21 +0100 Subject: [PATCH 156/183] Publish nix docker image. (#3122) * Publish nix docker image. * Run during PR. * Something else. * Forgot to push. * Build zstd. * Pushing with skopeo * Testing the PR. * Runnign from nix. * Cleaner tags. --- .github/workflows/nix_build.yaml | 53 ++++++++++++++++++++++++++++++++ nix/docker.nix | 2 +- 2 files changed, 54 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/nix_build.yaml diff --git a/.github/workflows/nix_build.yaml b/.github/workflows/nix_build.yaml new file mode 100644 index 000000000..33c8ff143 --- /dev/null +++ b/.github/workflows/nix_build.yaml @@ -0,0 +1,53 @@ +name: "Nix Build Docker image" +on: + pull_request: + push: + branches: + - 'main' + tags: + - 'v*' +concurrency: + group: nix-image-${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +jobs: + build_nix_image: + runs-on: + group: aws-highmemory-32-plus-priv + steps: + - uses: actions/checkout@v4 + - uses: cachix/install-nix-action@v27 + with: + nix_path: nixpkgs=channel:nixos-unstable + - uses: cachix/cachix-action@v14 + with: + name: text-generation-inference + # If you chose signing key for write access + authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}' + env: + USER: github_runner + - name: Build + run: nix build .#dockerImage + - name: Initialize Docker Buildx + uses: docker/setup-buildx-action@v3 + with: + install: true + buildkitd-config: /tmp/buildkitd.toml + - name: Inject slug/short variables + uses: rlespinasse/github-slug-action@v4.4.1 + - name: Login to internal Container Registry + # if: github.event_name != 'pull_request' + uses: docker/login-action@v3 + with: + username: ${{ secrets.REGISTRY_USERNAME }} + password: ${{ secrets.REGISTRY_PASSWORD }} + registry: registry.internal.huggingface.tech + - name: Push to docker + run: | + if [ "${{ github.event_name }}" = "pull_request" ]; then + export TAG=nix-sha-${{ env.GITHUB_SHA_SHORT }} + else + export TAG=nix-{{ version }} + fi + export IMAGE=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:$TAG + nix-shell -p skopeo --command "skopeo --insecure-policy copy docker-archive:$(readlink -f ./result) docker://$IMAGE --dest-compress-format zstd" diff --git a/nix/docker.nix 
b/nix/docker.nix index a8fc256e6..ed9e56d28 100644 --- a/nix/docker.nix +++ b/nix/docker.nix @@ -3,7 +3,6 @@ dockerTools, cacert, text-generation-inference, - runCommand, stream ? false, }: @@ -13,6 +12,7 @@ in build { name = "tgi-docker"; tag = "latest"; + compressor = "zstd"; config = { EntryPoint = [ "${text-generation-inference}/bin/text-generation-inference" ]; Env = [ From 83fe45c15e94f568ca893283bbc8554754dba399 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Tue, 18 Mar 2025 15:11:55 +0100 Subject: [PATCH 157/183] Prepare for patch release. (#3124) --- README.md | 6 +++--- docs/source/backends/gaudi.mdx | 10 +++++----- docs/source/backends/neuron.md | 2 +- docs/source/basic_tutorials/gated_model_access.md | 2 +- docs/source/conceptual/quantization.md | 6 +++--- docs/source/installation_amd.md | 2 +- docs/source/installation_intel.md | 4 ++-- docs/source/installation_nvidia.md | 2 +- docs/source/quicktour.md | 4 ++-- docs/source/reference/api_reference.md | 2 +- 10 files changed, 20 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index 64499c078..274cd86c6 100644 --- a/README.md +++ b/README.md @@ -84,7 +84,7 @@ model=HuggingFaceH4/zephyr-7b-beta volume=$PWD/data docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data \ - ghcr.io/huggingface/text-generation-inference:3.2.0 --model-id $model + ghcr.io/huggingface/text-generation-inference:3.2.1 --model-id $model ``` And then you can make requests like @@ -121,7 +121,7 @@ curl localhost:8080/v1/chat/completions \ **Note:** To use NVIDIA GPUs, you need to install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). We also recommend using NVIDIA drivers with CUDA version 12.2 or higher. For running the Docker container on a machine with no GPUs or CUDA support, it is enough to remove the `--gpus all` flag and add `--disable-custom-kernels`, please note CPU is not the intended platform for this project, so performance might be subpar. -**Note:** TGI supports AMD Instinct MI210 and MI250 GPUs. Details can be found in the [Supported Hardware documentation](https://huggingface.co/docs/text-generation-inference/installation_amd#using-tgi-with-amd-gpus). To use AMD GPUs, please use `docker run --device /dev/kfd --device /dev/dri --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.2.0-rocm --model-id $model` instead of the command above. +**Note:** TGI supports AMD Instinct MI210 and MI250 GPUs. Details can be found in the [Supported Hardware documentation](https://huggingface.co/docs/text-generation-inference/installation_amd#using-tgi-with-amd-gpus). To use AMD GPUs, please use `docker run --device /dev/kfd --device /dev/dri --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.2.1-rocm --model-id $model` instead of the command above. 
To see all options to serve your models (in the [code](https://github.com/huggingface/text-generation-inference/blob/main/launcher/src/main.rs) or in the cli): ``` @@ -152,7 +152,7 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading token= docker run --gpus all --shm-size 1g -e HF_TOKEN=$token -p 8080:80 -v $volume:/data \ - ghcr.io/huggingface/text-generation-inference:3.2.0 --model-id $model + ghcr.io/huggingface/text-generation-inference:3.2.1 --model-id $model ``` ### A note on Shared Memory (shm) diff --git a/docs/source/backends/gaudi.mdx b/docs/source/backends/gaudi.mdx index c3481f2ee..7e0e69ab1 100644 --- a/docs/source/backends/gaudi.mdx +++ b/docs/source/backends/gaudi.mdx @@ -20,7 +20,7 @@ hf_token=YOUR_HF_ACCESS_TOKEN docker run --runtime=habana --cap-add=sys_nice --ipc=host \ -p 8080:80 -v $volume:/data -e HF_TOKEN=$hf_token \ - ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \ + ghcr.io/huggingface/text-generation-inference:3.2.1-gaudi \ --model-id $model ``` @@ -74,7 +74,7 @@ hf_token=YOUR_ACCESS_TOKEN docker run --runtime=habana --cap-add=sys_nice --ipc=host \ -p 8080:80 -v $volume:/data -e HF_TOKEN=$hf_token \ - ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \ + ghcr.io/huggingface/text-generation-inference:3.2.1-gaudi \ --model-id $model ``` @@ -137,7 +137,7 @@ docker run -p 8080:80 \ -e BATCH_BUCKET_SIZE=256 \ -e PREFILL_BATCH_BUCKET_SIZE=4 \ -e PAD_SEQUENCE_TO_MULTIPLE_OF=64 \ - ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \ + ghcr.io/huggingface/text-generation-inference:3.2.1-gaudi \ --model-id $model \ --sharded true --num-shard 8 \ --max-input-tokens 1024 --max-total-tokens 2048 \ @@ -163,7 +163,7 @@ docker run -p 8080:80 \ -v $volume:/data \ -e PREFILL_BATCH_BUCKET_SIZE=1 \ -e BATCH_BUCKET_SIZE=1 \ - ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \ + ghcr.io/huggingface/text-generation-inference:3.2.1-gaudi \ --model-id $model \ --max-input-tokens 4096 --max-batch-prefill-tokens 16384 \ --max-total-tokens 8192 --max-batch-size 4 @@ -230,7 +230,7 @@ docker run --runtime=habana --ipc=host --cap-add=sys_nice \ -e PROF_PATH=/tmp/hpu_profile \ -e PROF_RANKS=0 \ -e PROF_RECORD_SHAPES=True \ - ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \ + ghcr.io/huggingface/text-generation-inference:3.2.1-gaudi \ --model-id $model ``` diff --git a/docs/source/backends/neuron.md b/docs/source/backends/neuron.md index 7a8b09e5f..f3a5dac7a 100644 --- a/docs/source/backends/neuron.md +++ b/docs/source/backends/neuron.md @@ -31,7 +31,7 @@ deployment instructions in the model card: The service is launched simply by running the text-generation-inference container with two sets of parameters: ``` -docker run ghcr.io/huggingface/text-generation-inference:3.2.0-neuron +docker run ghcr.io/huggingface/text-generation-inference:3.2.1-neuron ``` - system parameters are used to map ports, volumes and devices between the host and the service, diff --git a/docs/source/basic_tutorials/gated_model_access.md b/docs/source/basic_tutorials/gated_model_access.md index 449969af6..29ced4f88 100644 --- a/docs/source/basic_tutorials/gated_model_access.md +++ b/docs/source/basic_tutorials/gated_model_access.md @@ -19,6 +19,6 @@ docker run --gpus all \ --shm-size 1g \ -e HF_TOKEN=$token \ -p 8080:80 \ - -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.2.0 \ + -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.2.1 \ --model-id $model ``` diff --git a/docs/source/conceptual/quantization.md 
b/docs/source/conceptual/quantization.md index b86a4261c..4288b0a22 100644 --- a/docs/source/conceptual/quantization.md +++ b/docs/source/conceptual/quantization.md @@ -19,7 +19,7 @@ bitsandbytes is a library used to apply 8-bit and 4-bit quantization to models. In TGI, you can use 8-bit quantization by adding `--quantize bitsandbytes` like below 👇 ```bash -docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.2.0 --model-id $model --quantize bitsandbytes +docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.2.1 --model-id $model --quantize bitsandbytes ``` 4-bit quantization is also possible with bitsandbytes. You can choose one of the following 4-bit data types: 4-bit float (`fp4`), or 4-bit `NormalFloat` (`nf4`). These data types were introduced in the context of parameter-efficient fine-tuning, but you can apply them for inference by automatically converting the model weights on load. @@ -27,7 +27,7 @@ docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingf In TGI, you can use 4-bit quantization by adding `--quantize bitsandbytes-nf4` or `--quantize bitsandbytes-fp4` like below 👇 ```bash -docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.2.0 --model-id $model --quantize bitsandbytes-nf4 +docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.2.1 --model-id $model --quantize bitsandbytes-nf4 ``` You can get more information about 8-bit quantization by reading this [blog post](https://huggingface.co/blog/hf-bitsandbytes-integration), and 4-bit quantization by reading [this blog post](https://huggingface.co/blog/4bit-transformers-bitsandbytes). @@ -48,7 +48,7 @@ $$({\hat{W}_{l}}^{*} = argmin_{\hat{W_{l}}} ||W_{l}X-\hat{W}_{l}X||^{2}_{2})$$ TGI allows you to both run an already GPTQ quantized model (see available models [here](https://huggingface.co/models?search=gptq)) or quantize a model of your choice using quantization script. You can run a quantized model by simply passing --quantize like below 👇 ```bash -docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.2.0 --model-id $model --quantize gptq +docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.2.1 --model-id $model --quantize gptq ``` Note that TGI's GPTQ implementation doesn't use [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) under the hood. However, models quantized using AutoGPTQ or Optimum can still be served by TGI. 
diff --git a/docs/source/installation_amd.md b/docs/source/installation_amd.md index 25b96fd3a..4242cffb8 100644 --- a/docs/source/installation_amd.md +++ b/docs/source/installation_amd.md @@ -11,7 +11,7 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading docker run --rm -it --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \ --device=/dev/kfd --device=/dev/dri --group-add video \ --ipc=host --shm-size 256g --net host -v $volume:/data \ - ghcr.io/huggingface/text-generation-inference:3.2.0-rocm \ + ghcr.io/huggingface/text-generation-inference:3.2.1-rocm \ --model-id $model ``` diff --git a/docs/source/installation_intel.md b/docs/source/installation_intel.md index f075cc50b..c0b40e02d 100644 --- a/docs/source/installation_intel.md +++ b/docs/source/installation_intel.md @@ -12,7 +12,7 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading docker run --rm --privileged --cap-add=sys_nice \ --device=/dev/dri \ --ipc=host --shm-size 1g --net host -v $volume:/data \ - ghcr.io/huggingface/text-generation-inference:3.2.0-intel-xpu \ + ghcr.io/huggingface/text-generation-inference:3.2.1-intel-xpu \ --model-id $model --cuda-graphs 0 ``` @@ -29,7 +29,7 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading docker run --rm --privileged --cap-add=sys_nice \ --device=/dev/dri \ --ipc=host --shm-size 1g --net host -v $volume:/data \ - ghcr.io/huggingface/text-generation-inference:3.2.0-intel-cpu \ + ghcr.io/huggingface/text-generation-inference:3.2.1-intel-cpu \ --model-id $model --cuda-graphs 0 ``` diff --git a/docs/source/installation_nvidia.md b/docs/source/installation_nvidia.md index 1adffb9a0..31fba9d65 100644 --- a/docs/source/installation_nvidia.md +++ b/docs/source/installation_nvidia.md @@ -11,7 +11,7 @@ model=teknium/OpenHermes-2.5-Mistral-7B volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run docker run --gpus all --shm-size 64g -p 8080:80 -v $volume:/data \ - ghcr.io/huggingface/text-generation-inference:3.2.0 \ + ghcr.io/huggingface/text-generation-inference:3.2.1 \ --model-id $model ``` diff --git a/docs/source/quicktour.md b/docs/source/quicktour.md index 540a959e4..9ed60efdf 100644 --- a/docs/source/quicktour.md +++ b/docs/source/quicktour.md @@ -11,7 +11,7 @@ model=teknium/OpenHermes-2.5-Mistral-7B volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data \ - ghcr.io/huggingface/text-generation-inference:3.2.0 \ + ghcr.io/huggingface/text-generation-inference:3.2.1 \ --model-id $model ``` @@ -96,7 +96,7 @@ curl 127.0.0.1:8080/generate \ To see all possible deploy flags and options, you can use the `--help` flag. It's possible to configure the number of shards, quantization, generation parameters, and more. 
```bash -docker run ghcr.io/huggingface/text-generation-inference:3.2.0 --help +docker run ghcr.io/huggingface/text-generation-inference:3.2.1 --help ``` diff --git a/docs/source/reference/api_reference.md b/docs/source/reference/api_reference.md index cf5ff2d7d..e563a9b1a 100644 --- a/docs/source/reference/api_reference.md +++ b/docs/source/reference/api_reference.md @@ -163,7 +163,7 @@ hub = { # create Hugging Face Model Class huggingface_model = HuggingFaceModel( - image_uri=get_huggingface_llm_image_uri("huggingface",version="3.2.0"), + image_uri=get_huggingface_llm_image_uri("huggingface",version="3.2.1"), env=hub, role=role, ) From 67ce543e04aed5abfc98a08249a42058645754b4 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Tue, 18 Mar 2025 15:12:11 +0100 Subject: [PATCH 158/183] Intel docker. (#3121) * Intel docker. * torchaudio ? * Fixing dockerfile ? --- Dockerfile_intel | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile_intel b/Dockerfile_intel index bacefd020..10ed914a7 100644 --- a/Dockerfile_intel +++ b/Dockerfile_intel @@ -98,7 +98,7 @@ ENV HF_HOME=/data \ WORKDIR /usr/src -RUN pip install torch==2.6.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/test/xpu +RUN pip install torch==2.6.0 --index-url https://download.pytorch.org/whl/test/xpu RUN pip install triton-xpu==3.2.0b1 --no-cache-dir From e497bc09f6107baae4f06d6d31fc18730d0970c3 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Tue, 18 Mar 2025 15:42:35 +0100 Subject: [PATCH 159/183] Minor fixes. (#3125) --- .github/workflows/nix_build.yaml | 2 +- tgi-entrypoint.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/nix_build.yaml b/.github/workflows/nix_build.yaml index 33c8ff143..5a10bd39c 100644 --- a/.github/workflows/nix_build.yaml +++ b/.github/workflows/nix_build.yaml @@ -47,7 +47,7 @@ jobs: if [ "${{ github.event_name }}" = "pull_request" ]; then export TAG=nix-sha-${{ env.GITHUB_SHA_SHORT }} else - export TAG=nix-{{ version }} + export TAG=nix-${{ github.ref_name }} fi export IMAGE=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:$TAG nix-shell -p skopeo --command "skopeo --insecure-policy copy docker-archive:$(readlink -f ./result) docker://$IMAGE --dest-compress-format zstd" diff --git a/tgi-entrypoint.sh b/tgi-entrypoint.sh index cd551ed5e..32eccea55 100755 --- a/tgi-entrypoint.sh +++ b/tgi-entrypoint.sh @@ -2,5 +2,5 @@ ldconfig 2>/dev/null || echo 'unable to refresh ld cache, not a big deal in most cases' -source ./.venv/bin/activate +source /usr/src/.venv/bin/activate exec text-generation-launcher $@ From e5503eba781143f6f583c06cfcb6f0bf11349940 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Erik=20Kaunism=C3=A4ki?= Date: Thu, 20 Mar 2025 14:25:56 +0100 Subject: [PATCH 160/183] configurable termination timeout (#3126) * make shard and webserver termination timeouts configurable * Updating documentation. * Fmt. 
--------- Co-authored-by: Nicolas Patry --- docs/source/reference/launcher.md | 9 +++++++++ launcher/src/main.rs | 24 ++++++++++++++++++++++-- 2 files changed, 31 insertions(+), 2 deletions(-) diff --git a/docs/source/reference/launcher.md b/docs/source/reference/launcher.md index 2d9b58d9b..6505a08dc 100644 --- a/docs/source/reference/launcher.md +++ b/docs/source/reference/launcher.md @@ -477,6 +477,15 @@ Options: [env: ENABLE_PREFILL_LOGPROBS=] +``` +## GRACEFUL_TERMINATION_TIMEOUT +```shell + -g, --graceful-termination-timeout + Change timeout of graceful termination of the TGI server + + [env: GRACEFUL_TERMINATION_TIMEOUT=] + [default: 90] + ``` ## HELP ```shell diff --git a/launcher/src/main.rs b/launcher/src/main.rs index e3abb843d..250613812 100644 --- a/launcher/src/main.rs +++ b/launcher/src/main.rs @@ -892,6 +892,10 @@ struct Args { /// Using this flag reallows users to ask for them. #[clap(long, env)] enable_prefill_logprobs: bool, + + /// Change timeout of graceful termination of the TGI server + #[clap(default_value = "90", long, short, env)] + graceful_termination_timeout: u64, } #[derive(Debug)] @@ -933,6 +937,7 @@ fn shard_manager( log_level: LevelFilter, status_sender: mpsc::Sender, shutdown: Arc, + graceful_termination_timeout: u64, _shutdown_sender: mpsc::Sender<()>, ) { // Enter shard-manager tracing span @@ -1206,7 +1211,12 @@ fn shard_manager( // We received a shutdown signal if shutdown.load(Ordering::SeqCst) { - terminate("shard", p, Duration::from_secs(90)).unwrap(); + terminate( + "shard", + p, + Duration::from_secs(graceful_termination_timeout), + ) + .unwrap(); return; } @@ -1545,6 +1555,7 @@ fn spawn_shards( status_receiver: &mpsc::Receiver, status_sender: mpsc::Sender, running: Arc, + graceful_termination_timeout: u64, ) -> Result<(), LauncherError> { // Start shard processes for rank in 0..num_shard { @@ -1612,6 +1623,7 @@ fn spawn_shards( max_log_level, status_sender, shutdown, + graceful_termination_timeout, shutdown_sender, ) }); @@ -1999,6 +2011,8 @@ fn main() -> Result<(), LauncherError> { // Pattern match configuration let args: Args = Args::parse(); + let graceful_termination_timeout = args.graceful_termination_timeout; + // Filter events with LOG_LEVEL let varname = "LOG_LEVEL"; let env_filter = if let Ok(log_level) = std::env::var(varname) { @@ -2263,6 +2277,7 @@ fn main() -> Result<(), LauncherError> { &status_receiver, status_sender, running.clone(), + graceful_termination_timeout, )?; // We might have received a termination signal @@ -2307,7 +2322,12 @@ fn main() -> Result<(), LauncherError> { } // Graceful termination - terminate("webserver", webserver, Duration::from_secs(90)).unwrap(); + terminate( + "webserver", + webserver, + Duration::from_secs(graceful_termination_timeout), + ) + .unwrap(); shutdown_shards(shutdown, &shutdown_receiver); exit_code From 2e60a8dd6587c8db267c4cc3e4a8c42e9b346ef4 Mon Sep 17 00:00:00 2001 From: Baptiste Colle <32412211+baptistecolle@users.noreply.github.com> Date: Thu, 20 Mar 2025 16:07:31 +0100 Subject: [PATCH 161/183] CI: enable server tests for backends (#3128) add test for backends --- .github/workflows/nix_tests.yaml | 1 + .github/workflows/tests.yaml | 1 + 2 files changed, 2 insertions(+) diff --git a/.github/workflows/nix_tests.yaml b/.github/workflows/nix_tests.yaml index f2209f8a4..d9b910483 100644 --- a/.github/workflows/nix_tests.yaml +++ b/.github/workflows/nix_tests.yaml @@ -7,6 +7,7 @@ on: - "proto/**" - "router/**" - "launcher/**" + - "backends/**" - "Cargo.lock" - "rust-toolchain.toml" 
concurrency: diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index 895b4dd4d..3e431c861 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -8,6 +8,7 @@ on: - "proto/**" - "router/**" - "launcher/**" + - "backends/**" - "Cargo.lock" - "rust-toolchain.toml" From 54d15462dcac62ac13d08148728fe727aa61dd3a Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Mon, 24 Mar 2025 11:55:49 +0100 Subject: [PATCH 162/183] Torch 2.6 (#3134) * Torch 2.6 * Upgrade the toolchain. * Don't upgrade just yet. * Upgrade toolchain. * Time upgrade. * TGI-nix main. * Upgrade to transformers 4.50 --- Cargo.lock | 763 +++++++++++++++++++++++++------------------- Dockerfile | 2 +- Dockerfile.neuron | 2 +- Dockerfile_amd | 2 +- Dockerfile_gaudi | 2 +- Dockerfile_intel | 2 +- Dockerfile_llamacpp | 2 +- Dockerfile_trtllm | 2 +- flake.lock | 13 +- flake.nix | 2 +- nix/overlay.nix | 4 +- rust-toolchain.toml | 2 +- 12 files changed, 449 insertions(+), 349 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5e13e855c..6cde83b55 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -119,9 +119,9 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.95" +version = "1.0.97" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34ac096ce696dc2fcabef30516bb13c0a68a11d30131d3df6f04711467681b04" +checksum = "dcfed56ad506cb2c684a14971b8861fdc3baaaae314b9e5f9bb532cbe3ba7a4f" [[package]] name = "arbitrary" @@ -143,7 +143,7 @@ checksum = "0ae92a5119aa49cdbcf6b9f893fe4e1d98b04ccbf82ee0584ad948a44a734dea" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -182,18 +182,18 @@ checksum = "c7c24de15d275a1ecfd47a380fb4d5ec9bfe0933f309ed5e705b775596a3574d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] name = "async-trait" -version = "0.1.86" +version = "0.1.88" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "644dd749086bf3771a2fbc5f256fdb982d53f011c7d5d560304eafeecebce79d" +checksum = "e539d3fca749fcee5236ab05e93a52867dd549cc157c8cb7f99595f3cedffdb5" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -246,9 +246,9 @@ dependencies = [ [[package]] name = "avif-serialize" -version = "0.8.2" +version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e335041290c43101ca215eed6f43ec437eb5a42125573f600fc3fa42b9bddd62" +checksum = "98922d6a4cfbcb08820c69d8eeccc05bb1f29bfa06b4f5b1dbfe9a868bd7608e" dependencies = [ "arrayvec", ] @@ -267,27 +267,25 @@ dependencies = [ [[package]] name = "aws-lc-rs" -version = "1.12.2" +version = "1.12.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c2b7ddaa2c56a367ad27a094ad8ef4faacf8a617c2575acb2ba88949df999ca" +checksum = "dabb68eb3a7aa08b46fddfd59a3d55c978243557a90ab804769f7e20e67d2b01" dependencies = [ "aws-lc-sys", - "paste", "zeroize", ] [[package]] name = "aws-lc-sys" -version = "0.25.1" +version = "0.27.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "54ac4f13dad353b209b34cbec082338202cbc01c8f00336b55c750c13ac91f8f" +checksum = "77926887776171ced7d662120a75998e444d3750c951abfe07f90da130514b1f" dependencies = [ "bindgen 0.69.5", "cc", "cmake", "dunce", "fs_extra", - "paste", ] [[package]] @@ -332,7 +330,7 @@ dependencies = [ "axum-core 0.4.5", "bytes", "futures-util", - "http 1.2.0", + "http 1.3.1", "http-body 1.0.1", "http-body-util", "hyper 1.6.0", @@ -382,7 +380,7 @@ dependencies = 
[ "async-trait", "bytes", "futures-util", - "http 1.2.0", + "http 1.3.1", "http-body 1.0.1", "http-body-util", "mime", @@ -403,7 +401,7 @@ dependencies = [ "axum 0.7.9", "futures-core", "futures-util", - "http 1.2.0", + "http 1.3.1", "opentelemetry 0.21.0", "pin-project-lite", "tower 0.4.13", @@ -451,7 +449,7 @@ version = "0.69.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "271383c67ccabffb7381723dea0672a673f292304fcb45c01cc648c7a8d58088" dependencies = [ - "bitflags 2.8.0", + "bitflags 2.9.0", "cexpr", "clang-sys", "itertools 0.12.1", @@ -464,7 +462,7 @@ dependencies = [ "regex", "rustc-hash 1.1.0", "shlex", - "syn 2.0.98", + "syn 2.0.100", "which", ] @@ -474,7 +472,7 @@ version = "0.71.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5f58bf3d7db68cfbac37cfc485a8d711e87e064c3d0fe0435b92f7a407f9d6b3" dependencies = [ - "bitflags 2.8.0", + "bitflags 2.9.0", "cexpr", "clang-sys", "itertools 0.13.0", @@ -485,7 +483,7 @@ dependencies = [ "regex", "rustc-hash 2.1.1", "shlex", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -517,9 +515,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "bitflags" -version = "2.8.0" +version = "2.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f68f53c83ab957f72c32642f3868eec03eb974d1fb82e453128456482613d36" +checksum = "5c8214115b7bf84099f1309324e63141d4c5d7cc26862f97a0a857dbefe165bd" [[package]] name = "bitstream-io" @@ -562,9 +560,9 @@ checksum = "5ce89b21cab1437276d2650d57e971f9d548a2d9037cc231abdc0562b97498ce" [[package]] name = "bytemuck" -version = "1.21.0" +version = "1.22.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ef657dfab802224e671f5818e9a4935f9b1957ed18e58292690cc39e7a4092a3" +checksum = "b6b1fc10dbac614ebc03540c9dbd60e83887fda27794998c6528f1782047d540" [[package]] name = "byteorder" @@ -580,9 +578,9 @@ checksum = "8f1fe948ff07f4bd06c30984e69f5b4899c516a3ef74f34df92a2df2ab535495" [[package]] name = "bytes" -version = "1.10.0" +version = "1.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f61dac84819c6588b558454b194026eb1f09c293b9036ae9b159e74e73ab6cf9" +checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a" [[package]] name = "camino" @@ -639,9 +637,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.2.14" +version = "1.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c3d1b2e905a3a7b00a6141adb0e4c0bb941d11caf55349d863942a1cc44e3c9" +checksum = "1fcb57c740ae1daf453ae85f16e37396f672b039e00d9d866e07ddb24e328e3a" dependencies = [ "jobserver", "libc", @@ -687,16 +685,16 @@ checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" [[package]] name = "chrono" -version = "0.4.39" +version = "0.4.40" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e36cc9d416881d2e24f9a963be5fb1cd90966419ac844274161d10488b3e825" +checksum = "1a7964611d71df112cb1730f2ee67324fcf4d0fc6606acbbe9bfe06df124637c" dependencies = [ "android-tzdata", "iana-time-zone", "js-sys", "num-traits", "wasm-bindgen", - "windows-targets 0.52.6", + "windows-link", ] [[package]] @@ -723,9 +721,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.30" +version = "4.5.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92b7b18d71fad5313a1e320fa9897994228ce274b60faa4d694fe0ea89cd9e6d" +checksum = 
"6088f3ae8c3608d19260cd7445411865a485688711b78b5be70d78cd96136f83" dependencies = [ "clap_builder", "clap_derive", @@ -733,9 +731,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.30" +version = "4.5.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a35db2071778a7344791a4fb4f95308b5673d219dee3ae348b86642574ecc90c" +checksum = "22a7ef7f676155edfb82daa97f99441f3ebf4a58d5e32f295a56259f1b6facc8" dependencies = [ "anstream", "anstyle", @@ -745,14 +743,14 @@ dependencies = [ [[package]] name = "clap_derive" -version = "4.5.28" +version = "4.5.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bf4ced95c6f4a675af3da73304b9ac4ed991640c36374e4b46795c49e17cf1ed" +checksum = "09176aae279615badda0765c0c0b3f6ed53f4709118af73cf4655d85d1530cd7" dependencies = [ "heck 0.5.0", "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -772,12 +770,13 @@ dependencies = [ [[package]] name = "codespan-reporting" -version = "0.11.1" +version = "0.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3538270d33cc669650c4b093848450d380def10c331d38c768e34cac80576e6e" +checksum = "fe6d2e5af09e8c8ad56c969f2157a3d4238cebc7c55f0a517728c38f7b200f81" dependencies = [ + "serde", "termcolor", - "unicode-width 0.1.14", + "unicode-width 0.2.0", ] [[package]] @@ -808,9 +807,9 @@ dependencies = [ [[package]] name = "console" -version = "0.15.10" +version = "0.15.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea3c6ecd8059b57859df5c69830340ed3c41d30e3da0c1cbed90a96ac853041b" +checksum = "054ccb5b10f9f2cbf51eb355ca1d05c2d279ce1804688d0db74b4733a5aeafd8" dependencies = [ "encode_unicode", "libc", @@ -939,11 +938,11 @@ version = "0.28.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "829d955a0bb380ef178a640b91779e3987da38c9aea133b20614cfed8cdea9c6" dependencies = [ - "bitflags 2.8.0", + "bitflags 2.9.0", "crossterm_winapi", "mio", "parking_lot", - "rustix", + "rustix 0.38.44", "signal-hook", "signal-hook-mio", "winapi", @@ -1007,9 +1006,9 @@ dependencies = [ [[package]] name = "cxx" -version = "1.0.141" +version = "1.0.150" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8bc580dceb395cae0efdde0a88f034cfd8a276897e40c693a7b87bed17971d33" +checksum = "6d1cf22155cf6a8e0b0536efc30c775eadd7a481c376d2d7e30daf0825a42ef9" dependencies = [ "cc", "cxxbridge-cmd", @@ -1021,47 +1020,47 @@ dependencies = [ [[package]] name = "cxx-build" -version = "1.0.141" +version = "1.0.150" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49d8c1baedad72a7efda12ad8d7ad687b3e7221dfb304a12443fd69e9de8bb30" +checksum = "db4e07e3a69db032f03450594e53785a5d6b1d787c2ad5b901d9347f0064af94" dependencies = [ "cc", "codespan-reporting", "proc-macro2", "quote", "scratch", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] name = "cxxbridge-cmd" -version = "1.0.141" +version = "1.0.150" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e43afb0e3b2ef293492a31ecd796af902112460d53e5f923f7804f348a769f9c" +checksum = "48e9ff9c627d3abe06190462f7db81fb6cc12f3424ea081c2a8c9ed7a8cc167a" dependencies = [ - "clap 4.5.30", + "clap 4.5.32", "codespan-reporting", "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] name = "cxxbridge-flags" -version = "1.0.141" +version = "1.0.150" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"0257ad2096a2474fe877e9e055ab69603851c3d6b394efcc7e0443899c2492ce" +checksum = "2e6417f4e1518ded330e088d5a66f50fbae9bbc96840e147058ae44970a2b51a" [[package]] name = "cxxbridge-macro" -version = "1.0.141" +version = "1.0.150" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b46cbd7358a46b760609f1cb5093683328e58ca50e594a308716f5403fdc03e5" +checksum = "856ff0dba6e023dd78189c8f4667126842dfe88392b5d4e94118bd18b8f2afbf" dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -1085,7 +1084,7 @@ dependencies = [ "proc-macro2", "quote", "strsim", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -1096,14 +1095,14 @@ checksum = "d336a2a514f6ccccaa3e09b02d41d35330c07ddf03a62165fcec10bb561c7806" dependencies = [ "darling_core", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] name = "deranged" -version = "0.3.11" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b42b6fa04a440b495c8b04d0e71b707c585f83cb9cb28cf8cd0d976c315e31b4" +checksum = "28cfac68e08048ae1883171632c2aef3ebc555621ae56fbccce1cbf22dd7f058" dependencies = [ "powerfmt", ] @@ -1126,7 +1125,7 @@ dependencies = [ "darling", "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -1136,7 +1135,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ab63b0e2bf4d5928aff72e83a7dace85d7bba5fe12dcc3c5a572d78caffd3f3c" dependencies = [ "derive_builder_core", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -1178,7 +1177,7 @@ checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -1198,9 +1197,9 @@ dependencies = [ [[package]] name = "either" -version = "1.13.0" +version = "1.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0" +checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" [[package]] name = "email_address" @@ -1258,7 +1257,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f83197f59927b46c04a183a619b7c29df34e63e63c7869320862268c0ef687e0" dependencies = [ "bit_field", - "half 2.4.1", + "half 2.5.0", "lebe", "miniz_oxide", "rayon-core", @@ -1300,9 +1299,9 @@ checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" [[package]] name = "flate2" -version = "1.0.35" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c936bfdafb507ebbf50b8074c54fa31c5be9a1e7e5f467dd659697041407d07c" +checksum = "11faaf5a5236997af9848be0bef4db95824b1d534ebc64d0f0c6cf3e67bd38dc" dependencies = [ "crc32fast", "miniz_oxide", @@ -1339,9 +1338,9 @@ checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" [[package]] name = "foldhash" -version = "0.1.4" +version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a0d2fde1f7b3d48b8395d5f2de76c18a528bd6a9cdde438df747bfcba3e05d6f" +checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" [[package]] name = "foreign-types" @@ -1439,7 +1438,7 @@ checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -1504,14 +1503,14 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.3.1" +version = "0.3.2" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "43a49c392881ce6d5c3b8cb70f98717b7c07aabbdff06687b9030dbfbe2725f8" +checksum = "73fea8450eea4bac3940448fb7ae50d91f034f941199fcd9d909a5a07aa455f0" dependencies = [ "cfg-if", "libc", - "wasi 0.13.3+wasi-0.2.2", - "windows-targets 0.52.6", + "r-efi", + "wasi 0.14.2+wasi-0.2.4", ] [[package]] @@ -1558,7 +1557,7 @@ dependencies = [ "futures-sink", "futures-util", "http 0.2.12", - "indexmap 2.7.1", + "indexmap 2.8.0", "slab", "tokio", "tokio-util", @@ -1567,17 +1566,17 @@ dependencies = [ [[package]] name = "h2" -version = "0.4.7" +version = "0.4.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ccae279728d634d083c00f6099cb58f01cc99c145b84b8be2f6c74618d79922e" +checksum = "5017294ff4bb30944501348f6f8e42e6ad28f42c8bbef7a74029aff064a4e3c2" dependencies = [ "atomic-waker", "bytes", "fnv", "futures-core", "futures-sink", - "http 1.2.0", - "indexmap 2.7.1", + "http 1.3.1", + "indexmap 2.8.0", "slab", "tokio", "tokio-util", @@ -1592,9 +1591,9 @@ checksum = "1b43ede17f21864e81be2fa654110bf1e793774238d86ef8555c37e6519c0403" [[package]] name = "half" -version = "2.4.1" +version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6dd08c532ae367adf81c312a4580bc67f1d0fe8bc9c460520283f4c0ff277888" +checksum = "7db2ff139bba50379da6aa0766b52fdcb62cb5b263009b09ed58ba604e14bbd1" dependencies = [ "cfg-if", "crunchy", @@ -1678,17 +1677,17 @@ checksum = "cc03dcb0b0a83ae3f3363ec811014ae669f083e4e499c66602f447c4828737a1" dependencies = [ "dirs", "futures", - "http 1.2.0", + "http 1.3.1", "indicatif", "libc", "log", "native-tls", "num_cpus", "rand 0.8.5", - "reqwest 0.12.12", + "reqwest 0.12.15", "serde", "serde_json", - "thiserror 2.0.11", + "thiserror 2.0.12", "tokio", "ureq", "windows-sys 0.59.0", @@ -1727,9 +1726,9 @@ dependencies = [ [[package]] name = "http" -version = "1.2.0" +version = "1.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f16ca2af56261c99fba8bac40a10251ce8188205a4c448fbb745a2e4daa76fea" +checksum = "f4a85d31aea989eead29a3aaf9e1115a180df8282431156e533de47660892565" dependencies = [ "bytes", "fnv", @@ -1754,27 +1753,27 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184" dependencies = [ "bytes", - "http 1.2.0", + "http 1.3.1", ] [[package]] name = "http-body-util" -version = "0.1.2" +version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "793429d76616a256bcb62c2a2ec2bed781c8307e797e2598c50010f2bee2544f" +checksum = "b021d93e26becf5dc7e1b75b1bed1fd93124b374ceb73f43d4d4eafec896a64a" dependencies = [ "bytes", - "futures-util", - "http 1.2.0", + "futures-core", + "http 1.3.1", "http-body 1.0.1", "pin-project-lite", ] [[package]] name = "httparse" -version = "1.10.0" +version = "1.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2d708df4e7140240a16cd6ab0ab65c972d7433ab77819ea693fde9c43811e2a" +checksum = "6dbf3de79e51f3d586ab4cb9d5c3e2c14aa28ed23d180cf89b4df0454a69cc87" [[package]] name = "httpdate" @@ -1815,8 +1814,8 @@ dependencies = [ "bytes", "futures-channel", "futures-util", - "h2 0.4.7", - "http 1.2.0", + "h2 0.4.8", + "http 1.3.1", "http-body 1.0.1", "httparse", "httpdate", @@ -1834,11 +1833,11 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2d191583f3da1305256f22463b9bb0471acad48a4e534a5218b9963e9c1f59b2" dependencies = [ 
"futures-util", - "http 1.2.0", + "http 1.3.1", "hyper 1.6.0", "hyper-util", "log", - "rustls 0.23.23", + "rustls 0.23.25", "rustls-native-certs", "rustls-pki-types", "tokio", @@ -1896,7 +1895,7 @@ dependencies = [ "bytes", "futures-channel", "futures-util", - "http 1.2.0", + "http 1.3.1", "http-body 1.0.1", "hyper 1.6.0", "pin-project-lite", @@ -2044,7 +2043,7 @@ checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -2125,9 +2124,9 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.7.1" +version = "2.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c9c992b02b5b4c94ea26e32fe5bccb7aa7d9f390ab5c1221ff895bc7ea8b652" +checksum = "3954d50fe15b02142bf25d3b8bdadb634ec3948f103d04ffe3031bc8fe9d7058" dependencies = [ "equivalent", "hashbrown 0.15.2", @@ -2149,9 +2148,9 @@ dependencies = [ [[package]] name = "indoc" -version = "2.0.5" +version = "2.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b248f5224d1d606005e02c97f5aa4e88eeb230488bcc03bc9ca4d7991399f2b5" +checksum = "f4c7245a08504955605670dbf141fceab975f15ca21570696aebe9d2e71576bd" [[package]] name = "init-tracing-opentelemetry" @@ -2176,7 +2175,7 @@ dependencies = [ "indoc", "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -2187,7 +2186,7 @@ checksum = "c34819042dc3d3971c46c2190835914dfbe0c3c13f61449b2997f4e9722dfa60" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -2240,9 +2239,9 @@ dependencies = [ [[package]] name = "itoa" -version = "1.0.14" +version = "1.0.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d75a2a4b1b190afb6f5425f10f6a8f959d2ea0b9c2b1d79553551850539e4674" +checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" [[package]] name = "jobserver" @@ -2288,7 +2287,7 @@ dependencies = [ "percent-encoding", "referencing", "regex-syntax 0.8.5", - "reqwest 0.12.12", + "reqwest 0.12.15", "serde", "serde_json", "uuid-simd", @@ -2314,9 +2313,9 @@ checksum = "03087c2bad5e1034e8cace5926dec053fb3790248370865f5117a7d0213354c8" [[package]] name = "libc" -version = "0.2.169" +version = "0.2.171" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a" +checksum = "c19937216e9d3aa9956d9bb8dfc0b0c8beb6058fc4f7a4dc4d850edf86a237d6" [[package]] name = "libfuzzer-sys" @@ -2350,15 +2349,15 @@ version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c0ff37bd590ca25063e35af745c343cb7a0271906fb7b37e4813e8f79f00268d" dependencies = [ - "bitflags 2.8.0", + "bitflags 2.9.0", "libc", ] [[package]] name = "link-cplusplus" -version = "1.0.9" +version = "1.0.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d240c6f7e1ba3a28b0249f774e6a9dd0175054b52dfbb61b16eb8505c3785c9" +checksum = "4a6f6da007f968f9def0d65a05b187e2960183de70c160204ecfccf0ee330212" dependencies = [ "cc", ] @@ -2370,10 +2369,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d26c52dbd32dccf2d10cac7725f8eae5296885fb5703b261f7d0a0739ec807ab" [[package]] -name = "litemap" -version = "0.7.4" +name = "linux-raw-sys" +version = "0.9.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ee93343901ab17bd981295f2cf0026d4ad018c7c31ba84549a4ddbb47a45104" +checksum = 
"fe7db12097d22ec582439daf8618b8fdd1a7bef6270e9af3b1ebcd30893cf413" + +[[package]] +name = "litemap" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "23fb14cb19457329c82206317a5663005a4d404783dc74f4252769b0d5f42856" [[package]] name = "lock_api" @@ -2387,9 +2392,9 @@ dependencies = [ [[package]] name = "log" -version = "0.4.25" +version = "0.4.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04cbf5b083de1c7e0222a7a51dbfdba1cbe1c6ab0b15e29fff3f6c077fd9cd9f" +checksum = "30bde2b3dc3671ae49d8e2e9f044c7c005836e7a023ee57cffa25ab82764bb9e" [[package]] name = "loop9" @@ -2492,7 +2497,7 @@ dependencies = [ "hyper 1.6.0", "hyper-rustls", "hyper-util", - "indexmap 2.7.1", + "indexmap 2.8.0", "ipnet", "metrics", "metrics-util", @@ -2535,9 +2540,9 @@ dependencies = [ [[package]] name = "minijinja" -version = "2.7.0" +version = "2.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cff7b8df5e85e30b87c2b0b3f58ba3a87b68e133738bf512a7713769326dbca9" +checksum = "6e36f1329330bb1614c94b78632b9ce45dd7d761f3304a1bed07b2990a7c5097" dependencies = [ "serde", "serde_json", @@ -2545,9 +2550,9 @@ dependencies = [ [[package]] name = "minijinja-contrib" -version = "2.7.0" +version = "2.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7ac3e47a9006ed0500425a092c9f8b2e56d10f8aeec8ce870c5e8a7c6ef2d7c3" +checksum = "8e807b6b15e36a4c808e92f78c2ac1f6776519a50d9cf6649819c759a8e7133c" dependencies = [ "minijinja", "serde", @@ -2561,9 +2566,9 @@ checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" [[package]] name = "miniz_oxide" -version = "0.8.4" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b3b1c9bd4fe1f0f8b387f6eb9eb3b4a1aa26185e5750efb9140301703f62cd1b" +checksum = "8e3e04debbb59698c15bacbb6d93584a8c0ca9cc3213cb423d31f760d8843ce5" dependencies = [ "adler2", "simd-adler32", @@ -2583,9 +2588,9 @@ dependencies = [ [[package]] name = "monostate" -version = "0.1.13" +version = "0.1.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d208407d7552cd041d8cdb69a1bc3303e029c598738177a3d87082004dc0e1e" +checksum = "aafe1be9d0c75642e3e50fedc7ecadf1ef1cbce6eb66462153fc44245343fbee" dependencies = [ "monostate-impl", "serde", @@ -2593,13 +2598,13 @@ dependencies = [ [[package]] name = "monostate-impl" -version = "0.1.13" +version = "0.1.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a7ce64b975ed4f123575d11afd9491f2e37bbd5813fbfbc0f09ae1fbddea74e0" +checksum = "c402a4092d5e204f32c9e155431046831fa712637043c58cb73bc6bc6c9663b5" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -2629,9 +2634,9 @@ dependencies = [ [[package]] name = "native-tls" -version = "0.2.13" +version = "0.2.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0dab59f8e050d5df8e4dd87d9206fb6f65a483e20ac9fda365ade4fab353196c" +checksum = "87de3442987e9dbec73158d5c715e7ad9072fda936bb03d19d7fa10e00520f0e" dependencies = [ "libc", "log", @@ -2687,7 +2692,7 @@ version = "0.28.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ab2156c4fce2f8df6c499cc1c763e4394b7482525bf2a9701c9d79d215f519e4" dependencies = [ - "bitflags 2.8.0", + "bitflags 2.9.0", "cfg-if", "cfg_aliases 0.1.1", "libc", @@ -2699,7 +2704,7 @@ version = "0.29.0" source = "registry+https://github.com/rust-lang/crates.io-index" 
checksum = "71e2746dc3a24dd78b3cfcb7be93368c6de9963d30f43a6a73998a9cf4b17b46" dependencies = [ - "bitflags 2.8.0", + "bitflags 2.9.0", "cfg-if", "cfg_aliases 0.2.1", "libc", @@ -2799,7 +2804,7 @@ checksum = "ed3955f1a9c7c0c15e092f9c887db08b1fc683305fdf6eb6684f22555355e202" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -2879,9 +2884,9 @@ dependencies = [ [[package]] name = "once_cell" -version = "1.20.3" +version = "1.21.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "945462a4b81e43c4e3ba96bd7b49d834c6f61198356aa858733bc4acf3cbe62e" +checksum = "d75b0bedcc4fe52caa0e03d9f1151a323e4aa5e2d78ba3580400cd3c9e2bc4bc" [[package]] name = "onig" @@ -2907,9 +2912,9 @@ dependencies = [ [[package]] name = "oorandom" -version = "11.1.4" +version = "11.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b410bbe7e14ab526a0e86877eb47c6996a2bd7746f027ba551028c925390e4e9" +checksum = "d6790f58c7ff633d8771f42965289203411a5e5c68388703c06e14f24770b41e" [[package]] name = "openssl" @@ -2917,7 +2922,7 @@ version = "0.10.71" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5e14130c6a98cd258fdcb0fb6d744152343ff729cbfcb28c656a9d12b999fbcd" dependencies = [ - "bitflags 2.8.0", + "bitflags 2.9.0", "cfg-if", "foreign-types", "libc", @@ -2934,7 +2939,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -2973,7 +2978,7 @@ checksum = "1e32339a5dc40459130b3bd269e9892439f55b33e772d2a9d402a789baaf4e8a" dependencies = [ "futures-core", "futures-sink", - "indexmap 2.7.1", + "indexmap 2.8.0", "js-sys", "once_cell", "pin-project-lite", @@ -3180,27 +3185,27 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b4c5cc86750666a3ed20bdaf5ca2a0344f9c67674cae0515bec2da16fbaa47db" dependencies = [ "fixedbitset", - "indexmap 2.7.1", + "indexmap 2.8.0", ] [[package]] name = "pin-project" -version = "1.1.9" +version = "1.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dfe2e71e1471fe07709406bf725f710b02927c9c54b2b5b2ec0e8087d97c327d" +checksum = "677f1add503faace112b9f1373e43e9e054bfdd22ff1a63c1bc485eaec6a6a8a" dependencies = [ "pin-project-internal", ] [[package]] name = "pin-project-internal" -version = "1.1.9" +version = "1.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6e859e6e5bd50440ab63c47e3ebabc90f26251f7c73c3d3e837b74a1cc3fa67" +checksum = "6e918e4ff8c4549eb882f14b3a4bc8c8bc93de829416eacf579f1207a8fbf861" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -3217,9 +3222,9 @@ checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" [[package]] name = "pkg-config" -version = "0.3.31" +version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "953ec861398dccce10c670dfeaf3ec4911ca479e9c02154b3a215178c5f566f2" +checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" [[package]] name = "plotters" @@ -3264,9 +3269,9 @@ dependencies = [ [[package]] name = "portable-atomic" -version = "1.10.0" +version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "280dc24453071f1b63954171985a0b0d30058d287960968b9b2aca264c8d4ee6" +checksum = "350e9b48cbc6b0e028b0473b114454c6316e57336ee184ceab6e53f72c178b3e" [[package]] name = "powerfmt" @@ 
-3276,21 +3281,21 @@ checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" [[package]] name = "ppv-lite86" -version = "0.2.20" +version = "0.2.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77957b295656769bb8ad2b6a6b09d897d94f05c41b069aede1fcdaa675eaea04" +checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9" dependencies = [ - "zerocopy 0.7.35", + "zerocopy 0.8.24", ] [[package]] name = "prettyplease" -version = "0.2.29" +version = "0.2.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6924ced06e1f7dfe3fa48d57b9f74f55d8915f5036121bef647ef4b204895fac" +checksum = "5316f57387668042f561aae71480de936257848f9c43ce528e311d89a07cadeb" dependencies = [ "proc-macro2", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -3319,9 +3324,9 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.93" +version = "1.0.94" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "60946a68e5f9d28b0dc1c21bb8a97ee7d018a8b322fa57838ba31cc878e22d99" +checksum = "a31971752e70b8b2686d7e46ec17fb38dad4051d94024c88df49b667caea9c84" dependencies = [ "unicode-ident", ] @@ -3342,7 +3347,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a65f2e60fbf1063868558d69c6beacf412dc755f9fc020f514b7955fc914fe30" dependencies = [ "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -3382,7 +3387,7 @@ dependencies = [ "prost 0.12.6", "prost-types", "regex", - "syn 2.0.98", + "syn 2.0.100", "tempfile", ] @@ -3409,7 +3414,7 @@ dependencies = [ "itertools 0.12.1", "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -3468,7 +3473,7 @@ dependencies = [ "proc-macro2", "pyo3-macros-backend", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -3481,7 +3486,7 @@ dependencies = [ "proc-macro2", "pyo3-build-config", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -3516,13 +3521,19 @@ checksum = "a993555f31e5a609f617c12db6250dedcac1b0a85076912c436e6fc9b2c8e6a3" [[package]] name = "quote" -version = "1.0.38" +version = "1.0.40" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0e4dccaaaf89514f546c693ddc140f729f958c247918a13380cccc6078391acc" +checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d" dependencies = [ "proc-macro2", ] +[[package]] +name = "r-efi" +version = "5.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "74765f6d916ee2faa39bc8e68e4f3ed8949b48cccdac59983d287a7cb71ce9c5" + [[package]] name = "rand" version = "0.8.5" @@ -3541,8 +3552,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3779b94aeb87e8bd4e834cee3650289ee9e0d5677f976ecdb6d219e5f4f6cd94" dependencies = [ "rand_chacha 0.9.0", - "rand_core 0.9.1", - "zerocopy 0.8.18", + "rand_core 0.9.3", + "zerocopy 0.8.24", ] [[package]] @@ -3562,7 +3573,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" dependencies = [ "ppv-lite86", - "rand_core 0.9.1", + "rand_core 0.9.3", ] [[package]] @@ -3576,12 +3587,11 @@ dependencies = [ [[package]] name = "rand_core" -version = "0.9.1" +version = "0.9.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a88e0da7a2c97baa202165137c158d0a2e824ac465d13d81046727b34cb247d3" +checksum = "99d9a13982dcf210057a8a78572b2217b667c3beacbf3a0d8b454f6f82837d38" dependencies = [ - "getrandom 0.3.1", - 
"zerocopy 0.8.18", + "getrandom 0.3.2", ] [[package]] @@ -3590,7 +3600,7 @@ version = "0.28.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fdef7f9be5c0122f890d58bdf4d964349ba6a6161f705907526d891efabba57d" dependencies = [ - "bitflags 2.8.0", + "bitflags 2.9.0", "cassowary", "compact_str", "crossterm", @@ -3657,11 +3667,11 @@ dependencies = [ [[package]] name = "raw-cpuid" -version = "11.4.0" +version = "11.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "529468c1335c1c03919960dfefdb1b3648858c20d7ec2d0663e728e4a717efbc" +checksum = "c6df7ab838ed27997ba19a4664507e6f82b41fe6e20be42929332156e5e85146" dependencies = [ - "bitflags 2.8.0", + "bitflags 2.9.0", ] [[package]] @@ -3697,11 +3707,11 @@ dependencies = [ [[package]] name = "redox_syscall" -version = "0.5.8" +version = "0.5.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03a862b389f93e68874fbf580b9de08dd02facb9a788ebadaf4a3fd33cf58834" +checksum = "0b8c0c260b63a8219631167be35e6a988e9554dbd323f8bd08439c8ed1302bd1" dependencies = [ - "bitflags 2.8.0", + "bitflags 2.9.0", ] [[package]] @@ -3717,22 +3727,22 @@ dependencies = [ [[package]] name = "ref-cast" -version = "1.0.23" +version = "1.0.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ccf0a6f84d5f1d581da8b41b47ec8600871962f2a528115b542b362d4b744931" +checksum = "4a0ae411dbe946a674d89546582cea4ba2bb8defac896622d6496f14c23ba5cf" dependencies = [ "ref-cast-impl", ] [[package]] name = "ref-cast-impl" -version = "1.0.23" +version = "1.0.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bcc303e793d3734489387d205e9b186fac9c6cfacedd98cbb2e8a5943595f3e6" +checksum = "1165225c21bff1f3bbce98f5a1f889949bc902d3575308cc7b0de30b4f6d27c7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -3834,9 +3844,9 @@ dependencies = [ [[package]] name = "reqwest" -version = "0.12.12" +version = "0.12.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43e734407157c3c2034e0258f5e4473ddb361b1e85f95a66690d67264d7cd1da" +checksum = "d19c46a6fdd48bc4dab94b6103fccc55d34c67cc0ad04653aad4ea2a07cd7bbb" dependencies = [ "base64 0.22.1", "bytes", @@ -3844,8 +3854,8 @@ dependencies = [ "futures-channel", "futures-core", "futures-util", - "h2 0.4.7", - "http 1.2.0", + "h2 0.4.8", + "http 1.3.1", "http-body 1.0.1", "http-body-util", "hyper 1.6.0", @@ -3902,9 +3912,9 @@ dependencies = [ [[package]] name = "ring" -version = "0.17.9" +version = "0.17.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e75ec5e92c4d8aede845126adc388046234541629e76029599ed35a003c7ed24" +checksum = "a4689e6c2294d81e88dc6261c768b63bc4fcdb852be6d1352498b114f61383b7" dependencies = [ "cc", "cfg-if", @@ -3916,9 +3926,9 @@ dependencies = [ [[package]] name = "rust-embed" -version = "8.5.0" +version = "8.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa66af4a4fdd5e7ebc276f115e895611a34739a9c1c01028383d612d550953c0" +checksum = "0b3aba5104622db5c9fc61098de54708feb732e7763d7faa2fa625899f00bf6f" dependencies = [ "rust-embed-impl", "rust-embed-utils", @@ -3927,22 +3937,22 @@ dependencies = [ [[package]] name = "rust-embed-impl" -version = "8.5.0" +version = "8.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6125dbc8867951125eec87294137f4e9c2c96566e61bf72c45095a7c77761478" +checksum = 
"1f198c73be048d2c5aa8e12f7960ad08443e56fd39cc26336719fdb4ea0ebaae" dependencies = [ "proc-macro2", "quote", "rust-embed-utils", - "syn 2.0.98", + "syn 2.0.100", "walkdir", ] [[package]] name = "rust-embed-utils" -version = "8.5.0" +version = "8.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e5347777e9aacb56039b0e1f28785929a8a3b709e87482e7442c72e7c12529d" +checksum = "5a2fcdc9f40c8dc2922842ca9add611ad19f332227fc651d015881ad1552bd9a" dependencies = [ "sha2", "walkdir", @@ -3981,10 +3991,23 @@ version = "0.38.44" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fdb5bc1ae2baa591800df16c9ca78619bf65c0488b41b96ccec5d11220d8c154" dependencies = [ - "bitflags 2.8.0", + "bitflags 2.9.0", "errno", "libc", - "linux-raw-sys", + "linux-raw-sys 0.4.15", + "windows-sys 0.59.0", +] + +[[package]] +name = "rustix" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e56a18552996ac8d29ecc3b190b4fdbb2d91ca4ec396de7bbffaf43f3d637e96" +dependencies = [ + "bitflags 2.9.0", + "errno", + "libc", + "linux-raw-sys 0.9.3", "windows-sys 0.59.0", ] @@ -4007,24 +4030,24 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bf4ef73721ac7bcd79b2b315da7779d8fc09718c6b3d2d1b2d94850eb8c18432" dependencies = [ "log", - "ring 0.17.9", + "ring 0.17.14", "rustls-pki-types", - "rustls-webpki", + "rustls-webpki 0.102.8", "subtle", "zeroize", ] [[package]] name = "rustls" -version = "0.23.23" +version = "0.23.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "47796c98c480fce5406ef69d1c76378375492c3b0a0de587be0c1d9feb12f395" +checksum = "822ee9188ac4ec04a2f0531e55d035fb2de73f18b41a63c70c2712503b6fb13c" dependencies = [ "aws-lc-rs", "log", "once_cell", "rustls-pki-types", - "rustls-webpki", + "rustls-webpki 0.103.0", "subtle", "zeroize", ] @@ -4070,24 +4093,35 @@ name = "rustls-webpki" version = "0.102.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "64ca1bc8749bd4cf37b5ce386cc146580777b4e8572c7b97baf22c83f444bee9" +dependencies = [ + "ring 0.17.14", + "rustls-pki-types", + "untrusted 0.9.0", +] + +[[package]] +name = "rustls-webpki" +version = "0.103.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0aa4eeac2588ffff23e9d7a7e9b3f971c5fb5b7ebc9452745e0c232c64f83b2f" dependencies = [ "aws-lc-rs", - "ring 0.17.9", + "ring 0.17.14", "rustls-pki-types", "untrusted 0.9.0", ] [[package]] name = "rustversion" -version = "1.0.19" +version = "1.0.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f7c45b9784283f1b2e7fb61b42047c2fd678ef0960d4f6f1eba131594cc369d4" +checksum = "eded382c5f5f786b989652c49544c4877d9f015cc22e145a5ea8ea66c2921cd2" [[package]] name = "ryu" -version = "1.0.19" +version = "1.0.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ea1a2d0a644769cc99faa24c3ad26b379b786fe7c36fd3c546254801650e6dd" +checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f" [[package]] name = "same-file" @@ -4115,9 +4149,9 @@ checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" [[package]] name = "scratch" -version = "1.0.7" +version = "1.0.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a3cf7c11c38cb994f3d40e8a8cde3bbd1f72a435e4c49e85d6553d8312306152" +checksum = "9f6280af86e5f559536da57a45ebc84948833b3bee313a7dd25232e09c878a52" [[package]] name = "sct" @@ -4125,7 +4159,7 @@ 
version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "da046153aa2352493d6cb7da4b6e5c0c057d8a1d0a9aa8560baffdd945acd414" dependencies = [ - "ring 0.17.9", + "ring 0.17.14", "untrusted 0.9.0", ] @@ -4135,7 +4169,7 @@ version = "2.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "897b2245f0b511c87893af39b033e5ca9cce68824c4d7e7630b5a1d339658d02" dependencies = [ - "bitflags 2.8.0", + "bitflags 2.9.0", "core-foundation 0.9.4", "core-foundation-sys", "libc", @@ -4148,7 +4182,7 @@ version = "3.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "271720403f46ca04f7ba6f55d438f8bd878d6b8ca0a1046e8228c4145bcbb316" dependencies = [ - "bitflags 2.8.0", + "bitflags 2.9.0", "core-foundation 0.10.0", "core-foundation-sys", "libc", @@ -4167,18 +4201,18 @@ dependencies = [ [[package]] name = "semver" -version = "1.0.25" +version = "1.0.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f79dfe2d285b0488816f30e700a7438c5a73d816b5b7d3ac72fbc48b0d185e03" +checksum = "56e6fa9c48d24d85fb3de5ad847117517440f6beceb7798af16b4a87d616b8d0" dependencies = [ "serde", ] [[package]] name = "serde" -version = "1.0.217" +version = "1.0.219" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "02fc4265df13d6fa1d00ecff087228cc0a2b5f3c0e87e258d8b94a156e984c70" +checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6" dependencies = [ "serde_derive", ] @@ -4205,22 +4239,22 @@ dependencies = [ [[package]] name = "serde_derive" -version = "1.0.217" +version = "1.0.219" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a9bf7cf98d04a2b28aead066b7496853d4779c9cc183c440dbac457641e19a0" +checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] name = "serde_json" -version = "1.0.138" +version = "1.0.140" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d434192e7da787e94a6ea7e9670b26a036d0ca41e0b7efb2676dd32bae872949" +checksum = "20068b6e96dc6c9bd23e01df8827e6c7e1f2fddd43c21810382803c136b99373" dependencies = [ - "indexmap 2.7.1", + "indexmap 2.8.0", "itoa", "memchr", "ryu", @@ -4229,9 +4263,9 @@ dependencies = [ [[package]] name = "serde_path_to_error" -version = "0.1.16" +version = "0.1.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af99884400da37c88f5e9146b7f1fd0fbcae8f6eec4e9da38b67d05486f814a6" +checksum = "59fab13f937fa393d08645bf3a84bdfe86e296747b506ada67bb15f10f218b2a" dependencies = [ "itoa", "serde", @@ -4435,7 +4469,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -4457,9 +4491,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.98" +version = "2.0.100" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "36147f1a48ae0ec2b5b3bc5b537d267457555a10dc06f3dbc8cb11ba3006d3b1" +checksum = "b09a44accad81e1ba1cd74a32461ba89dee89095ba17b32f5d03683b1b1fc2a0" dependencies = [ "proc-macro2", "quote", @@ -4489,7 +4523,7 @@ checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -4524,7 +4558,7 @@ version = "0.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3c879d448e9d986b661742763247d3693ed13609438cf3d006f51f5368a5ba6b" 
dependencies = [ - "bitflags 2.8.0", + "bitflags 2.9.0", "core-foundation 0.9.4", "system-configuration-sys 0.6.0", ] @@ -4594,15 +4628,14 @@ checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1" [[package]] name = "tempfile" -version = "3.17.1" +version = "3.19.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22e5a0acb1f3f55f65cc4a866c361b2fb2a0ff6366785ae6fbb5f85df07ba230" +checksum = "7437ac7763b9b123ccf33c338a5cc1bac6f69b45a136c19bdd8a65e3916435bf" dependencies = [ - "cfg-if", "fastrand", - "getrandom 0.3.1", + "getrandom 0.3.2", "once_cell", - "rustix", + "rustix 1.0.3", "windows-sys 0.59.0", ] @@ -4620,7 +4653,7 @@ name = "text-generation-backends-trtllm" version = "3.2.1-dev0" dependencies = [ "async-trait", - "clap 4.5.30", + "clap 4.5.32", "cmake", "cxx", "cxx-build", @@ -4641,7 +4674,7 @@ name = "text-generation-benchmark" version = "3.2.1-dev0" dependencies = [ "average", - "clap 4.5.30", + "clap 4.5.32", "float-ord", "hf-hub 0.4.2", "ratatui", @@ -4678,7 +4711,7 @@ dependencies = [ name = "text-generation-launcher" version = "3.2.1-dev0" dependencies = [ - "clap 4.5.30", + "clap 4.5.32", "ctrlc", "float_eq", "hf-hub 0.4.2", @@ -4706,7 +4739,7 @@ dependencies = [ "axum-tracing-opentelemetry", "base64 0.22.1", "chrono", - "clap 4.5.30", + "clap 4.5.32", "csv", "futures", "futures-util", @@ -4753,12 +4786,12 @@ version = "3.2.1-dev0" dependencies = [ "async-trait", "bindgen 0.71.1", - "clap 4.5.30", + "clap 4.5.32", "hf-hub 0.4.2", "num_cpus", "pkg-config", "text-generation-router", - "thiserror 2.0.11", + "thiserror 2.0.12", "tokenizers", "tokio", "tokio-stream", @@ -4774,7 +4807,7 @@ dependencies = [ "axum 0.7.9", "axum-tracing-opentelemetry", "base64 0.22.1", - "clap 4.5.30", + "clap 4.5.32", "futures", "futures-util", "grpc-metadata", @@ -4823,7 +4856,7 @@ dependencies = [ "axum 0.7.9", "axum-tracing-opentelemetry", "base64 0.22.1", - "clap 4.5.30", + "clap 4.5.32", "criterion", "futures", "futures-util", @@ -4886,11 +4919,11 @@ dependencies = [ [[package]] name = "thiserror" -version = "2.0.11" +version = "2.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d452f284b73e6d76dd36758a0c8684b1d5be31f92b89d07fd5822175732206fc" +checksum = "567b8a2dae586314f7be2a752ec7474332959c6460e02bde30d702a66d488708" dependencies = [ - "thiserror-impl 2.0.11", + "thiserror-impl 2.0.12", ] [[package]] @@ -4901,18 +4934,18 @@ checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] name = "thiserror-impl" -version = "2.0.11" +version = "2.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26afc1baea8a989337eeb52b6e72a039780ce45c3edfcc9c5b9d112feeb173c2" +checksum = "7f7cf42b4507d8ea322120659672cf1b9dbb93f8f2d4ecfd6e51350ff5b17a1d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -4938,9 +4971,9 @@ dependencies = [ [[package]] name = "time" -version = "0.3.37" +version = "0.3.41" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "35e7868883861bd0e56d9ac6efcaaca0d6d5d82a2a7ec8209ff492c07cf37b21" +checksum = "8a7619e19bc266e0f9c5e6686659d394bc57973859340060a69221e57dbc0c40" dependencies = [ "deranged", "itoa", @@ -4955,15 +4988,15 @@ dependencies = [ [[package]] name = "time-core" -version = "0.1.2" +version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3" +checksum = "c9e9a38711f559d9e3ce1cdb06dd7c5b8ea546bc90052da6d06bb76da74bb07c" [[package]] name = "time-macros" -version = "0.2.19" +version = "0.2.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2834e6017e3e5e4b9834939793b282bc03b37a3336245fa820e35e233e2a85de" +checksum = "3526739392ec93fd8b359c8e98514cb3e8e021beb4e5f597b00a0221f8ed8a49" dependencies = [ "num-conv", "time-core", @@ -5024,9 +5057,9 @@ dependencies = [ [[package]] name = "tokio" -version = "1.43.0" +version = "1.44.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d61fa4ffa3de412bfea335c6ecff681de2b609ba3c77ef3e00e521813a9ed9e" +checksum = "f382da615b842244d4b8738c82ed1275e6c5dd90c459a30941cd07080b06c91a" dependencies = [ "backtrace", "bytes", @@ -5058,7 +5091,7 @@ checksum = "6e06d43f1345a3bcd39f6a56dbb7dcab2ba47e68e8ac134855e7e2bdbaf8cab8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -5084,11 +5117,11 @@ dependencies = [ [[package]] name = "tokio-rustls" -version = "0.26.1" +version = "0.26.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f6d0975eaace0cf0fcadee4e4aaa5da15b5c079146f2cffb67c113be122bf37" +checksum = "8e727b36a1a0e8b74c376ac2211e40c2c8af09fb4013c60d910495810f008e9b" dependencies = [ - "rustls 0.23.23", + "rustls 0.23.25", "tokio", ] @@ -5105,9 +5138,9 @@ dependencies = [ [[package]] name = "tokio-util" -version = "0.7.13" +version = "0.7.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d7fcaa8d55a2bdd6b83ace262b016eca0d79ee02818c5c1bcdf0305114081078" +checksum = "6b9590b93e6fcc1739458317cccd391ad3955e2bde8913edf6f95f9e65a8f034" dependencies = [ "bytes", "futures-core", @@ -5144,7 +5177,7 @@ version = "0.22.24" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "17b4795ff5edd201c7cd6dca065ae59972ce77d1b80fa0a84d94950ece7d1474" dependencies = [ - "indexmap 2.7.1", + "indexmap 2.8.0", "serde", "serde_spanned", "toml_datetime", @@ -5216,7 +5249,7 @@ dependencies = [ "proc-macro2", "prost-build", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -5261,9 +5294,9 @@ version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e9cd434a998747dd2c4276bc96ee2e0c7a2eadf3cae88e52be55a05fa9053f5" dependencies = [ - "bitflags 2.8.0", + "bitflags 2.9.0", "bytes", - "http 1.2.0", + "http 1.3.1", "http-body 1.0.1", "http-body-util", "pin-project-lite", @@ -5303,7 +5336,7 @@ checksum = "395ae124c09f9e6918a2310af6038fba074bcf474ac352496d5910dd59a2226d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -5378,7 +5411,7 @@ version = "0.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9920abb6a3ee3a2af7d30c9ff02900f8481935d36723c3da95cf807468218e8c" dependencies = [ - "http 1.2.0", + "http 1.3.1", "opentelemetry 0.21.0", "tracing", "tracing-opentelemetry 0.22.0", @@ -5435,9 +5468,9 @@ checksum = "75b844d17643ee918803943289730bec8aac480150456169e647ed0b576ba539" [[package]] name = "unicode-ident" -version = "1.0.16" +version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a210d160f08b701c8721ba1c726c11662f877ea6b7094007e1ca9a1041945034" +checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" [[package]] name = "unicode-normalization-alignments" @@ -5485,9 +5518,9 @@ checksum = 
"39ec24b3121d976906ece63c9daad25b85969647682eee313cb5779fdd69e14e" [[package]] name = "unindent" -version = "0.2.3" +version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7de7d73e1754487cb58364ee906a499937a0dfabd86bcb980fa99ec8c8fa2ce" +checksum = "7264e107f553ccae879d21fbea1d6724ac785e8c3bfc762137959b5802826ef3" [[package]] name = "untrusted" @@ -5514,7 +5547,7 @@ dependencies = [ "once_cell", "rustls 0.22.4", "rustls-pki-types", - "rustls-webpki", + "rustls-webpki 0.102.8", "serde", "serde_json", "socks", @@ -5563,7 +5596,7 @@ version = "4.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c5afb1a60e207dca502682537fefcfd9921e71d0b83e9576060f09abc6efab23" dependencies = [ - "indexmap 2.7.1", + "indexmap 2.8.0", "serde", "serde_json", "utoipa-gen", @@ -5579,7 +5612,7 @@ dependencies = [ "proc-macro2", "quote", "regex", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -5600,24 +5633,24 @@ dependencies = [ [[package]] name = "uuid" -version = "1.13.2" +version = "1.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c1f41ffb7cf259f1ecc2876861a17e7142e63ead296f671f81f6ae85903e0d6" +checksum = "458f7a779bf54acc9f347480ac654f68407d3aab21269a6e3c9f922acd9e2da9" dependencies = [ - "getrandom 0.3.1", + "getrandom 0.3.2", "rand 0.9.0", "uuid-macro-internal", ] [[package]] name = "uuid-macro-internal" -version = "1.13.2" +version = "1.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f02e7142329d47ebfcb5fc40e357b867f971230181e6f78744e52d9325d8f052" +checksum = "72dcd78c4f979627a754f5522cea6e6a25e55139056535fe6e69c506cd64a862" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] @@ -5715,9 +5748,9 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] name = "wasi" -version = "0.13.3+wasi-0.2.2" +version = "0.14.2+wasi-0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26816d2e1a4a36a2940b96c5296ce403917633dff8f3440e9b236ed6f6bacad2" +checksum = "9683f9a5a998d873c0d21fcbe3c083009670149a8fab228644b8bd36b2c48cb3" dependencies = [ "wit-bindgen-rt", ] @@ -5744,7 +5777,7 @@ dependencies = [ "log", "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", "wasm-bindgen-shared", ] @@ -5779,7 +5812,7 @@ checksum = "8ae87ea40c9f689fc23f209965b6fb8a99ad69aeeb0231408be24920604395de" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -5842,7 +5875,7 @@ version = "0.22.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ed63aea5ce73d0ff405984102c42de94fc55a6b75765d621c65262469b3c9b53" dependencies = [ - "ring 0.17.9", + "ring 0.17.14", "untrusted 0.9.0", ] @@ -5870,7 +5903,7 @@ dependencies = [ "either", "home", "once_cell", - "rustix", + "rustix 0.38.44", ] [[package]] @@ -5924,33 +5957,38 @@ dependencies = [ ] [[package]] -name = "windows-registry" -version = "0.2.0" +name = "windows-link" +version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e400001bb720a623c1c69032f8e3e4cf09984deec740f007dd2b03ec864804b0" +checksum = "76840935b766e1b0a05c0066835fb9ec80071d4c09a16f6bd5f7e655e3c14c38" + +[[package]] +name = "windows-registry" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4286ad90ddb45071efd1a66dfa43eb02dd0dfbae1545ad6cc3c51cf34d7e8ba3" dependencies = [ "windows-result", 
"windows-strings", - "windows-targets 0.52.6", + "windows-targets 0.53.0", ] [[package]] name = "windows-result" -version = "0.2.0" +version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d1043d8214f791817bab27572aaa8af63732e11bf84aa21a45a78d6c317ae0e" +checksum = "c64fd11a4fd95df68efcfee5f44a294fe71b8bc6a91993e2791938abcc712252" dependencies = [ - "windows-targets 0.52.6", + "windows-link", ] [[package]] name = "windows-strings" -version = "0.1.0" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4cd9b125c486025df0eabcb585e62173c6c9eddcec5d117d3b6e8c30e2ee4d10" +checksum = "87fa48cc5d406560701792be122a10132491cff9d0aeb23583cc2dcafc847319" dependencies = [ - "windows-result", - "windows-targets 0.52.6", + "windows-link", ] [[package]] @@ -6028,13 +6066,29 @@ dependencies = [ "windows_aarch64_gnullvm 0.52.6", "windows_aarch64_msvc 0.52.6", "windows_i686_gnu 0.52.6", - "windows_i686_gnullvm", + "windows_i686_gnullvm 0.52.6", "windows_i686_msvc 0.52.6", "windows_x86_64_gnu 0.52.6", "windows_x86_64_gnullvm 0.52.6", "windows_x86_64_msvc 0.52.6", ] +[[package]] +name = "windows-targets" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1e4c7e8ceaaf9cb7d7507c974735728ab453b67ef8f18febdd7c11fe59dca8b" +dependencies = [ + "windows_aarch64_gnullvm 0.53.0", + "windows_aarch64_msvc 0.53.0", + "windows_i686_gnu 0.53.0", + "windows_i686_gnullvm 0.53.0", + "windows_i686_msvc 0.53.0", + "windows_x86_64_gnu 0.53.0", + "windows_x86_64_gnullvm 0.53.0", + "windows_x86_64_msvc 0.53.0", +] + [[package]] name = "windows_aarch64_gnullvm" version = "0.42.2" @@ -6053,6 +6107,12 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "86b8d5f90ddd19cb4a147a5fa63ca848db3df085e25fee3cc10b39b6eebae764" + [[package]] name = "windows_aarch64_msvc" version = "0.42.2" @@ -6071,6 +6131,12 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" +[[package]] +name = "windows_aarch64_msvc" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7651a1f62a11b8cbd5e0d42526e55f2c99886c77e007179efff86c2b137e66c" + [[package]] name = "windows_i686_gnu" version = "0.42.2" @@ -6089,12 +6155,24 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" +[[package]] +name = "windows_i686_gnu" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1dc67659d35f387f5f6c479dc4e28f1d4bb90ddd1a5d3da2e5d97b42d6272c3" + [[package]] name = "windows_i686_gnullvm" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" +[[package]] +name = "windows_i686_gnullvm" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ce6ccbdedbf6d6354471319e781c0dfef054c81fbc7cf83f338a4296c0cae11" + [[package]] name = "windows_i686_msvc" version = "0.42.2" @@ -6113,6 +6191,12 @@ version = "0.52.6" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" +[[package]] +name = "windows_i686_msvc" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "581fee95406bb13382d2f65cd4a908ca7b1e4c2f1917f143ba16efe98a589b5d" + [[package]] name = "windows_x86_64_gnu" version = "0.42.2" @@ -6131,6 +6215,12 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" +[[package]] +name = "windows_x86_64_gnu" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e55b5ac9ea33f2fc1716d1742db15574fd6fc8dadc51caab1c16a3d3b4190ba" + [[package]] name = "windows_x86_64_gnullvm" version = "0.42.2" @@ -6149,6 +6239,12 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0a6e035dd0599267ce1ee132e51c27dd29437f63325753051e71dd9e42406c57" + [[package]] name = "windows_x86_64_msvc" version = "0.42.2" @@ -6168,10 +6264,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" [[package]] -name = "winnow" -version = "0.7.2" +name = "windows_x86_64_msvc" +version = "0.53.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "59690dea168f2198d1a3b0cac23b8063efcd11012f10ae4698f284808c8ef603" +checksum = "271414315aff87387382ec3d271b52d7ae78726f5d44ac98b4f4030c91880486" + +[[package]] +name = "winnow" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0e97b544156e9bebe1a0ffbc03484fc1ffe3100cbce3ffb17eac35f7cdd7ab36" dependencies = [ "memchr", ] @@ -6188,11 +6290,11 @@ dependencies = [ [[package]] name = "wit-bindgen-rt" -version = "0.33.0" +version = "0.39.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3268f3d866458b787f390cf61f4bbb563b922d091359f9608842999eaee3943c" +checksum = "6f42320e61fe2cfd34354ecb597f86f413484a798ba44a8ca1165c58d42da6c1" dependencies = [ - "bitflags 2.8.0", + "bitflags 2.9.0", ] [[package]] @@ -6227,7 +6329,7 @@ checksum = "2380878cad4ac9aac1e2435f3eb4020e8374b5f13c296cb75b4620ff8e229154" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", "synstructure", ] @@ -6237,17 +6339,16 @@ version = "0.7.35" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0" dependencies = [ - "byteorder", "zerocopy-derive 0.7.35", ] [[package]] name = "zerocopy" -version = "0.8.18" +version = "0.8.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "79386d31a42a4996e3336b0919ddb90f81112af416270cff95b5f5af22b839c2" +checksum = "2586fea28e186957ef732a5f8b3be2da217d65c5969d4b1e17f973ebbe876879" dependencies = [ - "zerocopy-derive 0.8.18", + "zerocopy-derive 0.8.24", ] [[package]] @@ -6258,38 +6359,38 @@ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] name = "zerocopy-derive" -version = "0.8.18" +version = "0.8.24" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "76331675d372f91bf8d17e13afbd5fe639200b73d01f0fc748bb059f9cca2db7" +checksum = "a996a8f63c5c4448cd959ac1bab0aaa3306ccfd060472f85943ee0750f0169be" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] name = "zerofrom" -version = "0.1.5" +version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cff3ee08c995dee1859d998dea82f7374f2826091dd9cd47def953cae446cd2e" +checksum = "50cc42e0333e05660c3587f3bf9d0478688e15d870fab3346451ce7f8c9fbea5" dependencies = [ "zerofrom-derive", ] [[package]] name = "zerofrom-derive" -version = "0.1.5" +version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "595eed982f7d355beb85837f651fa22e90b3c044842dc7f2c2842c086f295808" +checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", "synstructure", ] @@ -6318,7 +6419,7 @@ checksum = "6eafa6dfb17584ea3e2bd6e76e0cc15ad7af12b09abdd1ca55961bed9b1063c6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.98", + "syn 2.0.100", ] [[package]] diff --git a/Dockerfile b/Dockerfile index cbb0977f2..f804c2b8a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,5 @@ # Rust builder -FROM lukemathwalker/cargo-chef:latest-rust-1.85.0 AS chef +FROM lukemathwalker/cargo-chef:latest-rust-1.85.1 AS chef WORKDIR /usr/src ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse diff --git a/Dockerfile.neuron b/Dockerfile.neuron index b2e0eb2cf..d22ca2228 100644 --- a/Dockerfile.neuron +++ b/Dockerfile.neuron @@ -18,7 +18,7 @@ RUN apt-get update -y \ && rm -rf /var/lib/apt/lists/* \ && apt-get clean -RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- --default-toolchain 1.85.0 --profile minimal -y +RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- --default-toolchain 1.85.1 --profile minimal -y ENV PATH="/root/.cargo/bin:${PATH}" RUN cargo install cargo-chef --locked diff --git a/Dockerfile_amd b/Dockerfile_amd index 9d035c2df..d03ae596d 100644 --- a/Dockerfile_amd +++ b/Dockerfile_amd @@ -1,5 +1,5 @@ # Rust builder -FROM lukemathwalker/cargo-chef:latest-rust-1.85.0 AS chef +FROM lukemathwalker/cargo-chef:latest-rust-1.85.1 AS chef WORKDIR /usr/src ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse diff --git a/Dockerfile_gaudi b/Dockerfile_gaudi index 9009f95b6..eff87ab65 100644 --- a/Dockerfile_gaudi +++ b/Dockerfile_gaudi @@ -3,7 +3,7 @@ ARG HABANA_VERSION=1.20.0 ARG PYTORCH_VERSION=2.6.0 # Rust builder -FROM lukemathwalker/cargo-chef:latest-rust-1.85.0 AS chef +FROM lukemathwalker/cargo-chef:latest-rust-1.85.1 AS chef WORKDIR /usr/src ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse diff --git a/Dockerfile_intel b/Dockerfile_intel index 10ed914a7..5bf7632ce 100644 --- a/Dockerfile_intel +++ b/Dockerfile_intel @@ -1,6 +1,6 @@ ARG PLATFORM=xpu -FROM lukemathwalker/cargo-chef:latest-rust-1.85.0 AS chef +FROM lukemathwalker/cargo-chef:latest-rust-1.85.1 AS chef WORKDIR /usr/src ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse diff --git a/Dockerfile_llamacpp b/Dockerfile_llamacpp index 4fc61347f..291ae88cb 100644 --- a/Dockerfile_llamacpp +++ b/Dockerfile_llamacpp @@ -41,7 +41,7 @@ RUN mkdir -p llama.cpp \ WORKDIR /app COPY rust-toolchain.toml rust-toolchain.toml -RUN curl -sSf https://sh.rustup.rs | sh -s -- --no-modify-path --default-toolchain 1.85.0 --profile minimal -y +RUN curl -sSf https://sh.rustup.rs | sh -s -- --no-modify-path 
--default-toolchain 1.85.1 --profile minimal -y ENV PATH="/root/.cargo/bin:$PATH" RUN cargo install cargo-chef --locked diff --git a/Dockerfile_trtllm b/Dockerfile_trtllm index d099ec875..7df2e3685 100644 --- a/Dockerfile_trtllm +++ b/Dockerfile_trtllm @@ -71,7 +71,7 @@ ARG actions_runtime_token # Install Rust ENV PATH="/root/.cargo/bin:$PATH" -RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- --default-toolchain 1.85.0 --profile minimal -y && \ +RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- --default-toolchain 1.85.1 --profile minimal -y && \ chmod -R a+w /root/.rustup && \ chmod -R a+w /root/.cargo && \ cargo install sccache --locked diff --git a/flake.lock b/flake.lock index d049a71d7..d3c5490bb 100644 --- a/flake.lock +++ b/flake.lock @@ -853,11 +853,11 @@ ] }, "locked": { - "lastModified": 1741141853, - "narHash": "sha256-FauVtC+FbOgkKpGVuQTNxSqrvgbmVc7hFkjn/DacwMo=", + "lastModified": 1742783666, + "narHash": "sha256-IwdSl51NL6V0f+mYXZR0UTKaGleOsk9zV3l6kt5SUWw=", "owner": "oxalica", "repo": "rust-overlay", - "rev": "02edad1f19d6dec824e0812e4cdc0aa7930ff8ae", + "rev": "60766d63c227d576510ecfb5edd3a687d56f6bc7", "type": "github" }, "original": { @@ -978,16 +978,15 @@ "nixpkgs": "nixpkgs_6" }, "locked": { - "lastModified": 1741617161, - "narHash": "sha256-cwKYAsIVSLtoLbG48+oi3NkSrvuZRLYs8lkJmpDsTw0=", + "lastModified": 1742807335, + "narHash": "sha256-580CXhhCcK1cXRWahk/mDKo6zUsOD2JvNOg5SBBMAKc=", "owner": "huggingface", "repo": "text-generation-inference-nix", - "rev": "5946021ec6cb6aae18158a9dc27f893cfbab2925", + "rev": "8d9ea4691f49369aa5d3230a51366ff6c744d658", "type": "github" }, "original": { "owner": "huggingface", - "ref": "kernels-0.2.0", "repo": "text-generation-inference-nix", "type": "github" } diff --git a/flake.nix b/flake.nix index 7ddd3b92d..c733cdd24 100644 --- a/flake.nix +++ b/flake.nix @@ -5,7 +5,7 @@ inputs.nixpkgs.follows = "tgi-nix/nixpkgs"; }; nix-filter.url = "github:numtide/nix-filter"; - tgi-nix.url = "github:huggingface/text-generation-inference-nix/kernels-0.2.0"; + tgi-nix.url = "github:huggingface/text-generation-inference-nix"; nixpkgs.follows = "tgi-nix/nixpkgs"; flake-utils.url = "github:numtide/flake-utils"; rust-overlay = { diff --git a/nix/overlay.nix b/nix/overlay.nix index 63398f075..7274d903c 100644 --- a/nix/overlay.nix +++ b/nix/overlay.nix @@ -18,8 +18,8 @@ final: prev: { src = final.fetchFromGitHub { owner = "huggingface"; repo = "transformers"; - rev = "v4.49.0"; - hash = "sha256-drq7RWoRaRejiQjCUHIYuzaKa9rA4eQZI2do74scp1c="; + rev = "v4.50.0"; + hash = "sha256-/scrMPUY43n+XAMbwWCtmiJKXscXGLrklyDg9XZTaqw="; }; } ); diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 8e51cc692..700c6d9d9 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,5 +1,5 @@ [toolchain] # Released on: 30 January, 2025 # https://releases.rs/docs/1.84.1/ -channel = "1.85.0" +channel = "1.85.1" components = ["rustfmt", "clippy"] From f5f14dc66074cec610a6813c9944dc12d101f324 Mon Sep 17 00:00:00 2001 From: Yuan Wu Date: Tue, 25 Mar 2025 22:08:15 +0800 Subject: [PATCH 163/183] Gaudi: Fix llava-next and mllama crash issue (#3127) Signed-off-by: yuanwu --- .../models/custom_modeling/llava_next.py | 180 +++++++++++------- .../models/vlm_causal_lm.py | 8 +- 2 files changed, 116 insertions(+), 72 deletions(-) diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/llava_next.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/llava_next.py index 
70449f6b7..00ecdf952 100644 --- a/backends/gaudi/server/text_generation_server/models/custom_modeling/llava_next.py +++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/llava_next.py @@ -20,6 +20,7 @@ import torch import torch.utils.checkpoint import numpy as np +from loguru import logger from transformers.models.llava_next.modeling_llava_next import ( unpad_image, ) @@ -92,23 +93,6 @@ def image_size_to_num_patches(image_size, grid_pinpoints, patch_size: int): class LlavaNextForConditionalGeneration(GaudiLlavaNextForConditionalGeneration): - def _merge_input_ids_with_image_features( - self, - inputs_embeds: torch.Tensor, - image_features: torch.Tensor, - input_ids: torch.Tensor, - ): - """In place merges in vision_embeddings with inputs_embeds.""" - mask = input_ids == self.config.image_token_index - # Let's pray we have enabled enough slots ! - try: - inputs_embeds[mask] = image_features.view(-1, image_features.shape[-1]) - except Exception as e: - raise RuntimeError( - f"Cannot fill images right now. If error happens at warmup, make sure you have enough `--max-input-tokens` to handle images. If error happens at regular runtime, please fill in an issue: {e}" - ) - return inputs_embeds - def forward( self, input_ids: torch.LongTensor = None, @@ -169,6 +153,92 @@ class LlavaNextForConditionalGeneration(GaudiLlavaNextForConditionalGeneration): return outputs + # Copied from https://github.com/huggingface/transformers/blob/6966fa190172b48b2fb46fe4552a13b943e692cf/src/transformers/models/llava_next/modeling_llava_next.py#L411 + def pack_image_features( + self, + image_features, + image_sizes, + vision_feature_select_strategy, + image_newline=None, + ): + """ + Reshape, unpad and then pack each image_feature into a single image_features tensor containing all visual vectors. + + Args: + image_features (`List[torch.Tensor]` of length num_images, each of shape `(num_patches, image_length, embed_dim)`) + List of image feature tensor, each contains all the visual feature of all patches. + image_sizes (`torch.Tensor` of shape `(num_images, 2)`) + Actual image size of each images (H, W). + vision_feature_select_strategy (`str`) + The feature selection strategy used to select the vision feature from the vision backbone. + image_newline (`torch.Tensor` of shape `(embed_dim)`) + New line embedding vector. + Returns: + image_features (`torch.Tensor` of shape `(all_feat_len, embed_dim)`) + feature_lens (`List[int]`) + token length of each image in image_features + """ + new_image_features = [] + feature_lens = [] + for image_idx, image_feature in enumerate(image_features): + if image_feature.shape[0] > 1: + base_image_feature = image_feature[0] + image_feature = image_feature[1:] + height = width = ( + self.config.vision_config.image_size + // self.config.vision_config.patch_size + ) + + num_patch_height, num_patch_width = get_anyres_image_grid_shape( + image_sizes[image_idx], + self.config.image_grid_pinpoints, + self.config.vision_config.image_size, + ) + + if ( + np.prod(image_feature.shape) + % (num_patch_height * num_patch_width * height * width) + != 0 + and vision_feature_select_strategy == "default" + ): + logger.warning_once( + "Image feature shape does not line up with the provided patch size. " + "You may be using the `default` vision_feature_select_strategy with a" + " visual encoder that does not have CLS." 
+ ) + + image_feature = image_feature.view( + num_patch_height, num_patch_width, height, width, -1 + ) + image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous() + image_feature = image_feature.flatten(1, 2).flatten(2, 3) + image_feature = unpad_image(image_feature, image_sizes[image_idx]) + if image_newline is not None: + image_feature = torch.cat( + ( + image_feature, + image_newline[:, None, None] + .expand(*image_feature.shape[:-1], 1) + .to(image_feature.device, image_feature.dtype), + ), + dim=-1, + ) + image_feature = image_feature.flatten(1, 2).transpose(0, 1) + image_feature = torch.cat((base_image_feature, image_feature), dim=0) + else: + image_feature = image_feature[0] + if image_newline is not None: + image_feature = torch.cat( + (image_feature, image_newline[None].to(image_feature)), dim=0 + ) + new_image_features.append(image_feature) + feature_lens.append(image_feature.size(0)) + image_features = torch.cat(new_image_features, dim=0) + feature_lens = torch.tensor( + feature_lens, dtype=torch.long, device=image_features.device + ) + return image_features, feature_lens + # Copied from https://github.com/huggingface/transformers/blob/6966fa190172b48b2fb46fe4552a13b943e692cf/src/transformers/models/llava_next/modeling_llava_next.py#L479 def get_image_features( self, @@ -303,61 +373,33 @@ class LlavaNextForConditionalGeneration(GaudiLlavaNextForConditionalGeneration): ) # NOTE we only support multimodal_patch_merge_type == "spatial_unpad" - height = width = ( - self.config.vision_config.image_size - // self.config.vision_config.patch_size + image_features, feature_lens = self.pack_image_features( + image_features, + image_sizes, + vision_feature_select_strategy=vision_feature_select_strategy, + image_newline=self.image_newline, ) - new_image_features = [] - for image_idx, image_feature in enumerate(image_features): - if image_feature.shape[0] > 1: - base_image_feature = image_feature[0] - image_feature = image_feature[1:] - - if height * width != base_image_feature.shape[0]: - raise ValueError( - "The number of patches is not consistent with the image size." 
- ) - - num_patch_height, num_patch_width = get_anyres_image_grid_shape( - image_sizes[image_idx].tolist(), - self.config.image_grid_pinpoints, - self.config.vision_config.image_size, - ) - - image_feature = image_feature.view( - num_patch_height, num_patch_width, height, width, -1 - ) - image_feature = image_feature.permute( - 4, 0, 2, 1, 3 - ).contiguous() - image_feature = image_feature.flatten(1, 2).flatten(2, 3) - image_feature = unpad_image( - image_feature, image_sizes[image_idx] - ) - image_feature = torch.cat( - ( - image_feature, - self.image_newline[:, None, None].expand( - *image_feature.shape[:-1], 1 - ), - ), - dim=-1, - ) - image_feature = image_feature.flatten(1, 2).transpose(0, 1) - image_feature = torch.cat( - (base_image_feature, image_feature), dim=0 - ) - else: - image_feature = image_feature[0] - image_feature = torch.cat( - (image_feature, self.image_newline[None]), dim=0 - ) - new_image_features.append(image_feature) - image_features = torch.cat(new_image_features, dim=0) - inputs_embeds = self._merge_input_ids_with_image_features( - inputs_embeds, image_features, input_ids + special_image_mask = ( + input_ids == self.config.image_token_index + ).unsqueeze(-1) + special_image_mask = special_image_mask.expand_as(inputs_embeds).to( + inputs_embeds.device ) + if inputs_embeds[special_image_mask].numel() != image_features.numel(): + n_image_tokens = (input_ids == self.config.image_token_index).sum() + n_image_features = image_features.shape[0] + raise ValueError( + f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" + ) + + image_features = image_features.to( + inputs_embeds.device, inputs_embeds.dtype + ) + inputs_embeds = inputs_embeds.masked_scatter( + special_image_mask, image_features + ) + # In case input_ids.shape[1] == 1 & pixel_values==None & past_key_values != None, we are in the case of # generation with cache elif past_key_values is not None: diff --git a/backends/gaudi/server/text_generation_server/models/vlm_causal_lm.py b/backends/gaudi/server/text_generation_server/models/vlm_causal_lm.py index 66e00171b..543b07e8e 100644 --- a/backends/gaudi/server/text_generation_server/models/vlm_causal_lm.py +++ b/backends/gaudi/server/text_generation_server/models/vlm_causal_lm.py @@ -428,6 +428,9 @@ class VlmCausalLMBatch(CausalLMBatch): else: images.append(curr_image) + if is_warmup is True: + images += [images[0]] * (len(texts) - len(images)) + missing_inputs = 0 dummy_images = None if is_warmup is False: @@ -1464,7 +1467,6 @@ class VlmCausalLM(Model): batch = self.batch_from_pb(request.batch, is_warmup=True) max_input_tokens = request.max_input_tokens max_prefill_batch_size = batch.input_ids.shape[0] - try: # max prefill batch size warmup _, prefill_batch, _ = self.generate_token([batch], is_warmup=True) @@ -1548,7 +1550,7 @@ class VlmCausalLM(Model): request, PREFILL_WARMUP_SEQLEN_LIST[0] - 1, max_prefill_batch_size, - is_warmup=False, + is_warmup=True, ) _, prefill_batch, _ = self.generate_token( [batch], is_warmup=True @@ -1568,7 +1570,7 @@ class VlmCausalLM(Model): request, PREFILL_WARMUP_SEQLEN_LIST[0] - 1, 2, - is_warmup=False, + is_warmup=True, ) _, prefill_batch, _ = self.generate_token( [batch], is_warmup=True From 014255009651cd3233c6c9fc0e5e3c8a3df9ff89 Mon Sep 17 00:00:00 2001 From: Corentin REGAL Date: Wed, 26 Mar 2025 15:36:43 +0100 Subject: [PATCH 164/183] nix-v3.2.1 -> v3.2.1-nix (#3129) make it easier to check for version using semver semantic (same major and minor) --- 
.github/workflows/nix_build.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/nix_build.yaml b/.github/workflows/nix_build.yaml index 5a10bd39c..71ad59d0b 100644 --- a/.github/workflows/nix_build.yaml +++ b/.github/workflows/nix_build.yaml @@ -47,7 +47,7 @@ jobs: if [ "${{ github.event_name }}" = "pull_request" ]; then export TAG=nix-sha-${{ env.GITHUB_SHA_SHORT }} else - export TAG=nix-${{ github.ref_name }} + export TAG=${{ github.ref_name }}-nix fi export IMAGE=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:$TAG nix-shell -p skopeo --command "skopeo --insecure-policy copy docker-archive:$(readlink -f ./result) docker://$IMAGE --dest-compress-format zstd" From 3d059f91ab6e2d643e9b1b7c0d8cad478317a541 Mon Sep 17 00:00:00 2001 From: Yuan Wu Date: Thu, 3 Apr 2025 16:34:53 +0800 Subject: [PATCH 165/183] Gaudi: Use exponential growth to replace BATCH_BUCKET_SIZE (#3131) * Gaudi: Use exponential growth to replace BATCH_BUCKET_SIZE Signed-off-by: yuanwu * Remove debug modifications Signed-off-by: yuanwu --------- Signed-off-by: yuanwu --- .../models/causal_lm.py | 38 ++++++++++--------- 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/backends/gaudi/server/text_generation_server/models/causal_lm.py b/backends/gaudi/server/text_generation_server/models/causal_lm.py index 776c109f5..c1ce3335f 100644 --- a/backends/gaudi/server/text_generation_server/models/causal_lm.py +++ b/backends/gaudi/server/text_generation_server/models/causal_lm.py @@ -57,8 +57,7 @@ MAX_TOTAL_TOKENS = int(os.environ.get("MAX_TOTAL_TOKENS", 2048)) PAD_SEQUENCE_TO_MULTIPLE_OF = int(os.environ.get("PAD_SEQUENCE_TO_MULTIPLE_OF", 256)) CHUNK_SIZES = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048] LAZY_MODE = int(os.environ.get("PT_HPU_LAZY_MODE", 1)) -BATCH_BUCKET_SIZE = int(os.environ.get("BATCH_BUCKET_SIZE", 8)) -PREFILL_BATCH_BUCKET_SIZE = int(os.environ.get("PREFILL_BATCH_BUCKET_SIZE", 2)) +BATCH_SIZE_EXPONENT_BASE = int(os.environ.get("BATCH_SIZE_EXPONENT_BASE", 2)) MAX_BATCH_SIZE = ( int(os.environ.get("MAX_BATCH_SIZE")) if os.environ.get("MAX_BATCH_SIZE") is not None @@ -74,10 +73,16 @@ def torch_compile_for_eager(func): ) -def round_up(number, k): +def round_up_seq(number, k): return (number + k - 1) // k * k +def round_up_batch(number): + return BATCH_SIZE_EXPONENT_BASE ** ( + math.ceil(math.log(number, BATCH_SIZE_EXPONENT_BASE)) + ) + + def to_tensor_indices(indices, device): return torch.tensor(indices, dtype=torch.long, device=device) @@ -399,7 +404,7 @@ class CausalLMBatch(Batch): total_requests = sum(len(b) for b in batches) new_bs = total_requests - new_bs = round_up(total_requests, BATCH_BUCKET_SIZE) + new_bs = round_up_batch(total_requests) batch_id = batches[0].batch_id device = batches[0].input_ids.device @@ -540,7 +545,7 @@ class CausalLMBatch(Batch): # TODO: by tokenizing all inputs at once we loose information on actual input lengths # this means that we cannot shift inputs to the left after a long input sequence # was filtered out - new_bs = round_up(len(requests), PREFILL_BATCH_BUCKET_SIZE) + new_bs = round_up_batch(len(requests)) missing_inputs = new_bs - len(inputs) dummy_inputs = ["?"] * missing_inputs parameters = [r.parameters for r in pb.requests] @@ -572,7 +577,7 @@ class CausalLMBatch(Batch): assert ( PAD_SEQUENCE_TO_MULTIPLE_OF <= max_input_length ), "PAD_SEQUENCE_TO_MULTIPLE_OF cannot be higher than max_input_length" - rounded_seq_len = round_up(input_len + 1, PAD_SEQUENCE_TO_MULTIPLE_OF) + 
rounded_seq_len = round_up_seq(input_len + 1, PAD_SEQUENCE_TO_MULTIPLE_OF) if rounded_seq_len <= max_input_length: bucket_size = rounded_seq_len - 1 else: @@ -1068,10 +1073,10 @@ class CausalLM(Model): if ( self.enable_hpu_graph and self.limit_hpu_graph - and round_up(batch.batch_size, BATCH_BUCKET_SIZE) != self.prev_bs + and round_up_batch(batch.batch_size) != self.prev_bs ): self.model.clear_cache() - self.prev_bs = round_up(batch.batch_size, BATCH_BUCKET_SIZE) + self.prev_bs = round_up_batch(batch.batch_size) dbg_trace( scenario, f"bs:{batch.batch_size} num_reqs:{len(batch.requests)} seq_len:{batch.seq_length} padding:{batch.right_padding}", @@ -1325,15 +1330,14 @@ class CausalLM(Model): # Warmup prefill batch_size max_input_tokens = request.max_input_tokens + max_exp = math.ceil(math.log(max_prefill_batch_size, BATCH_SIZE_EXPONENT_BASE)) prefill_batch_size_list = [ - batch - for batch in range( - PREFILL_BATCH_BUCKET_SIZE, - max_prefill_batch_size, - PREFILL_BATCH_BUCKET_SIZE, + BATCH_SIZE_EXPONENT_BASE**exp + for exp in range( + 0, + max_exp + 1, ) ] - prefill_batch_size_list.append(max_prefill_batch_size) prefill_seqlen_list = [ seq for seq in range( @@ -1370,12 +1374,10 @@ class CausalLM(Model): ) max_decode_batch_size = math.floor(MAX_BATCH_TOTAL_TOKENS / MAX_TOTAL_TOKENS) - max_decode_batch_size = round_up(max_decode_batch_size, BATCH_BUCKET_SIZE) + max_exp = math.ceil(math.log(max_decode_batch_size, BATCH_SIZE_EXPONENT_BASE)) decode_batch_size_list = [ - i - for i in range(BATCH_BUCKET_SIZE, max_decode_batch_size, BATCH_BUCKET_SIZE) + BATCH_SIZE_EXPONENT_BASE**exp for exp in range(0, max_exp + 1) ] - decode_batch_size_list.append(max_decode_batch_size) decode_batch_size_list.sort(reverse=True) try: From d9bb9bebc9eba54287c4f759cbef70543f63ef2f Mon Sep 17 00:00:00 2001 From: Mohit Sharma Date: Sun, 6 Apr 2025 13:50:22 +0530 Subject: [PATCH 166/183] Add llama4 (#3145) * initial changes * Add support for other vlm * cleanup comment * Improve attn_implementation * Add comments for support of models * add model * add model * fixes and improvements * update docker * Add cache position * Add tests * remove redundant changes * remove tr version * Upgrade doc + fix linting. * Fixing the CI. 
--------- Co-authored-by: Nicolas Patry --- Dockerfile | 5 +- docs/source/supported_models.md | 1 + .../test_flash_llama4.json | 613 ++++++++++++++++++ ...est_flash_llama4_image_base64_rgb_jpg.json | 26 + ...est_flash_llama4_image_base64_rgb_png.json | 26 + .../test_flash_llama4_image_base64_rgba.json | 26 + .../test_flash_llama4_image_cow.json | 26 + .../test_flash_llama4_image_cow_dog.json | 26 + .../models/test_transformers_llama4.py | 155 +++++ router/src/config.rs | 137 ++++ router/src/lib.rs | 7 + router/src/validation.rs | 45 +- server/pyproject.toml | 3 +- .../text_generation_server/models/__init__.py | 190 +++++- .../models/flash_causal_lm.py | 9 - .../models/transformers_flash_causal_lm.py | 11 +- .../models/transformers_flash_vlm.py | 566 ++++++++++++++++ .../models/vlm_causal_lm.py | 57 +- server/uv.lock | 25 +- 19 files changed, 1893 insertions(+), 61 deletions(-) create mode 100644 integration-tests/models/__snapshots__/test_transformers_llama4/test_flash_llama4.json create mode 100644 integration-tests/models/__snapshots__/test_transformers_llama4/test_flash_llama4_image_base64_rgb_jpg.json create mode 100644 integration-tests/models/__snapshots__/test_transformers_llama4/test_flash_llama4_image_base64_rgb_png.json create mode 100644 integration-tests/models/__snapshots__/test_transformers_llama4/test_flash_llama4_image_base64_rgba.json create mode 100644 integration-tests/models/__snapshots__/test_transformers_llama4/test_flash_llama4_image_cow.json create mode 100644 integration-tests/models/__snapshots__/test_transformers_llama4/test_flash_llama4_image_cow_dog.json create mode 100644 integration-tests/models/test_transformers_llama4.py create mode 100644 server/text_generation_server/models/transformers_flash_vlm.py diff --git a/Dockerfile b/Dockerfile index f804c2b8a..820468c94 100644 --- a/Dockerfile +++ b/Dockerfile @@ -65,7 +65,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-ins COPY --from=ghcr.io/astral-sh/uv:0.5.31 /uv /uvx /bin/ ENV PATH="$PATH:/root/.local/bin" RUN uv python install ${PYTHON_VERSION} -RUN uv venv --python ${PYTHON_VERSION} && uv pip install torch==${PYTORCH_VERSION} pip setuptools packaging +RUN uv venv --python ${PYTHON_VERSION} && uv pip install torch==${PYTORCH_VERSION} torchvision pip setuptools packaging ENV VIRTUAL_ENV=/usr/src/.venv/ ENV PATH="$PATH:/usr/src/.venv/bin/" @@ -193,6 +193,9 @@ RUN cd server && \ pwd && \ text-generation-server --help +# This shouldn't be necessary. +# RUN uv pip install torchvision --no-deps + # Copy build artifacts from flash attention builder COPY --from=flash-att-builder /usr/src/flash-attention/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages COPY --from=flash-att-builder /usr/src/flash-attention/csrc/layer_norm/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages diff --git a/docs/source/supported_models.md b/docs/source/supported_models.md index f168fd76b..5d855a5b9 100644 --- a/docs/source/supported_models.md +++ b/docs/source/supported_models.md @@ -9,6 +9,7 @@ Text Generation Inference enables serving optimized models. 
The following sectio - [Idefics 3](https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3) (Multimodal) - [Llava Next (1.6)](https://huggingface.co/llava-hf/llava-v1.6-vicuna-13b-hf) (Multimodal) - [Llama](https://huggingface.co/collections/meta-llama/llama-31-669fc079a0c406a149a5738f) +- [Llama4](https://huggingface.co/collections/meta-llama/llama-31-669fc079a0c406a149a5738f) - [Phi 3](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct) - [Granite](https://huggingface.co/ibm-granite/granite-3.0-8b-instruct) - [Gemma](https://huggingface.co/google/gemma-7b) diff --git a/integration-tests/models/__snapshots__/test_transformers_llama4/test_flash_llama4.json b/integration-tests/models/__snapshots__/test_transformers_llama4/test_flash_llama4.json new file mode 100644 index 000000000..eefcb353b --- /dev/null +++ b/integration-tests/models/__snapshots__/test_transformers_llama4/test_flash_llama4.json @@ -0,0 +1,613 @@ +{ + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 100, + "prefill": [], + "seed": null, + "tokens": [ + { + "id": 2721, + "logprob": -0.21582031, + "special": false, + "text": " people" + }, + { + "id": 21807, + "logprob": -0.26953125, + "special": false, + "text": " died" + }, + { + "id": 310, + "logprob": -0.95703125, + "special": false, + "text": " in" + }, + { + "id": 290, + "logprob": -1.3359375, + "special": false, + "text": " the" + }, + { + "id": 220, + "logprob": -1.3828125, + "special": false, + "text": " " + }, + { + "id": 7284, + "logprob": -0.011291504, + "special": false, + "text": "191" + }, + { + "id": 36, + "logprob": -0.011413574, + "special": false, + "text": "8" + }, + { + "id": 18938, + "logprob": -0.23242188, + "special": false, + "text": " flu" + }, + { + "id": 27650, + "logprob": -0.0010070801, + "special": false, + "text": " pandemic" + }, + { + "id": 26, + "logprob": -0.69140625, + "special": false, + "text": "." 
+ }, + { + "id": 114059, + "logprob": -1.4375, + "special": false, + "text": " Estimating" + }, + { + "id": 290, + "logprob": -0.24316406, + "special": false, + "text": " the" + }, + { + "id": 10593, + "logprob": -0.37304688, + "special": false, + "text": " death" + }, + { + "id": 49973, + "logprob": -0.025390625, + "special": false, + "text": " toll" + }, + { + "id": 323, + "logprob": -0.27539062, + "special": false, + "text": " of" + }, + { + "id": 290, + "logprob": -0.057617188, + "special": false, + "text": " the" + }, + { + "id": 220, + "logprob": -0.040527344, + "special": false, + "text": " " + }, + { + "id": 7284, + "logprob": -0.00050735474, + "special": false, + "text": "191" + }, + { + "id": 36, + "logprob": -9.298325e-06, + "special": false, + "text": "8" + }, + { + "id": 18938, + "logprob": -0.09863281, + "special": false, + "text": " flu" + }, + { + "id": 27650, + "logprob": -0.0011749268, + "special": false, + "text": " pandemic" + }, + { + "id": 373, + "logprob": -0.32421875, + "special": false, + "text": " is" + }, + { + "id": 8210, + "logprob": -0.58203125, + "special": false, + "text": " difficult" + }, + { + "id": 2895, + "logprob": -0.40429688, + "special": false, + "text": " because" + }, + { + "id": 323, + "logprob": -1.2734375, + "special": false, + "text": " of" + }, + { + "id": 49119, + "logprob": -0.51171875, + "special": false, + "text": " incomplete" + }, + { + "id": 13308, + "logprob": -0.38085938, + "special": false, + "text": " records" + }, + { + "id": 341, + "logprob": -0.55859375, + "special": false, + "text": " and" + }, + { + "id": 2895, + "logprob": -0.765625, + "special": false, + "text": " because" + }, + { + "id": 323, + "logprob": -1.0, + "special": false, + "text": " of" + }, + { + "id": 290, + "logprob": -0.828125, + "special": false, + "text": " the" + }, + { + "id": 2304, + "logprob": -1.015625, + "special": false, + "text": " fact" + }, + { + "id": 511, + "logprob": -0.004638672, + "special": false, + "text": " that" + }, + { + "id": 2233, + "logprob": -0.953125, + "special": false, + "text": " many" + }, + { + "id": 323, + "logprob": -0.87890625, + "special": false, + "text": " of" + }, + { + "id": 290, + "logprob": -0.60546875, + "special": false, + "text": " the" + }, + { + "id": 6759, + "logprob": -1.6484375, + "special": false, + "text": " extra" + }, + { + "id": 40657, + "logprob": -0.00022125244, + "special": false, + "text": " deaths" + }, + { + "id": 1610, + "logprob": -0.67578125, + "special": false, + "text": " were" + }, + { + "id": 702, + "logprob": -0.30664062, + "special": false, + "text": " not" + }, + { + "id": 48692, + "logprob": -0.1953125, + "special": false, + "text": " attributed" + }, + { + "id": 328, + "logprob": -0.0079956055, + "special": false, + "text": " to" + }, + { + "id": 290, + "logprob": -0.515625, + "special": false, + "text": " the" + }, + { + "id": 18938, + "logprob": -0.0040893555, + "special": false, + "text": " flu" + }, + { + "id": 26, + "logprob": -0.083496094, + "special": false, + "text": "." 
+ }, + { + "id": 13618, + "logprob": -0.515625, + "special": false, + "text": " Many" + }, + { + "id": 22215, + "logprob": -1.5703125, + "special": false, + "text": " experts" + }, + { + "id": 11081, + "logprob": -0.96875, + "special": false, + "text": " believe" + }, + { + "id": 511, + "logprob": -0.1171875, + "special": false, + "text": " that" + }, + { + "id": 290, + "logprob": -0.25195312, + "special": false, + "text": " the" + }, + { + "id": 220, + "logprob": -0.828125, + "special": false, + "text": " " + }, + { + "id": 7284, + "logprob": -0.00010967255, + "special": false, + "text": "191" + }, + { + "id": 36, + "logprob": -8.535385e-05, + "special": false, + "text": "8" + }, + { + "id": 18938, + "logprob": -0.056152344, + "special": false, + "text": " flu" + }, + { + "id": 27650, + "logprob": -0.0007095337, + "special": false, + "text": " pandemic" + }, + { + "id": 26132, + "logprob": -0.18847656, + "special": false, + "text": " killed" + }, + { + "id": 1867, + "logprob": -0.71484375, + "special": false, + "text": " between" + }, + { + "id": 220, + "logprob": -0.0062561035, + "special": false, + "text": " " + }, + { + "id": 1175, + "logprob": -0.009277344, + "special": false, + "text": "50" + }, + { + "id": 341, + "logprob": -0.15332031, + "special": false, + "text": " and" + }, + { + "id": 220, + "logprob": -8.34465e-07, + "special": false, + "text": " " + }, + { + "id": 1135, + "logprob": -0.00065612793, + "special": false, + "text": "100" + }, + { + "id": 5534, + "logprob": -1.4066696e-05, + "special": false, + "text": " million" + }, + { + "id": 2721, + "logprob": -0.0008392334, + "special": false, + "text": " people" + }, + { + "id": 26, + "logprob": -0.54296875, + "special": false, + "text": "." + }, + { + "id": 372, + "logprob": -1.8046875, + "special": false, + "text": " I" + }, + { + "id": 140680, + "logprob": -0.578125, + "special": false, + "text": "assistant" + }, + { + "id": 200006, + "logprob": 0.0, + "special": true, + "text": "<|header_end|>" + }, + { + "id": 368, + "logprob": 0.0, + "special": false, + "text": "\n\n" + }, + { + "id": 954, + "logprob": -0.032226562, + "special": false, + "text": "The" + }, + { + "id": 220, + "logprob": -4.4345856e-05, + "special": false, + "text": " " + }, + { + "id": 7284, + "logprob": 0.0, + "special": false, + "text": "191" + }, + { + "id": 36, + "logprob": 0.0, + "special": false, + "text": "8" + }, + { + "id": 18938, + "logprob": -0.015625, + "special": false, + "text": " flu" + }, + { + "id": 27650, + "logprob": 0.0, + "special": false, + "text": " pandemic" + }, + { + "id": 24, + "logprob": -0.0072021484, + "special": false, + "text": "," + }, + { + "id": 1437, + "logprob": -0.0001707077, + "special": false, + "text": " also" + }, + { + "id": 5711, + "logprob": 0.0, + "special": false, + "text": " known" + }, + { + "id": 486, + "logprob": 0.0, + "special": false, + "text": " as" + }, + { + "id": 290, + "logprob": -5.9604645e-07, + "special": false, + "text": " the" + }, + { + "id": 25836, + "logprob": -1.4305115e-06, + "special": false, + "text": " Spanish" + }, + { + "id": 18938, + "logprob": -0.0015029907, + "special": false, + "text": " flu" + }, + { + "id": 24, + "logprob": -0.0052490234, + "special": false, + "text": "," + }, + { + "id": 373, + "logprob": -0.3125, + "special": false, + "text": " is" + }, + { + "id": 26078, + "logprob": -0.21289062, + "special": false, + "text": " indeed" + }, + { + "id": 1085, + "logprob": -0.080078125, + "special": false, + "text": " one" + }, + { + "id": 323, + "logprob": 0.0, + 
"special": false, + "text": " of" + }, + { + "id": 290, + "logprob": 0.0, + "special": false, + "text": " the" + }, + { + "id": 2167, + "logprob": -0.20117188, + "special": false, + "text": " most" + }, + { + "id": 92679, + "logprob": -0.12695312, + "special": false, + "text": " devastating" + }, + { + "id": 854, + "logprob": -0.25976562, + "special": false, + "text": " public" + }, + { + "id": 4500, + "logprob": 0.0, + "special": false, + "text": " health" + }, + { + "id": 93079, + "logprob": -0.50390625, + "special": false, + "text": " crises" + }, + { + "id": 310, + "logprob": 0.0, + "special": false, + "text": " in" + }, + { + "id": 6023, + "logprob": -0.0015182495, + "special": false, + "text": " human" + }, + { + "id": 7068, + "logprob": 0.0, + "special": false, + "text": " history" + }, + { + "id": 26, + "logprob": -0.0012664795, + "special": false, + "text": "." + }, + { + "id": 114059, + "logprob": -0.004119873, + "special": false, + "text": " Estimating" + }, + { + "id": 290, + "logprob": -0.00033569336, + "special": false, + "text": " the" + }, + { + "id": 6318, + "logprob": -0.20117188, + "special": false, + "text": " exact" + } + ], + "top_tokens": null + }, + "generated_text": " people died in the 1918 flu pandemic. Estimating the death toll of the 1918 flu pandemic is difficult because of incomplete records and because of the fact that many of the extra deaths were not attributed to the flu. Many experts believe that the 1918 flu pandemic killed between 50 and 100 million people. Iassistant\n\nThe 1918 flu pandemic, also known as the Spanish flu, is indeed one of the most devastating public health crises in human history. Estimating the exact" +} diff --git a/integration-tests/models/__snapshots__/test_transformers_llama4/test_flash_llama4_image_base64_rgb_jpg.json b/integration-tests/models/__snapshots__/test_transformers_llama4/test_flash_llama4_image_base64_rgb_jpg.json new file mode 100644 index 000000000..cab307000 --- /dev/null +++ b/integration-tests/models/__snapshots__/test_transformers_llama4/test_flash_llama4_image_base64_rgb_jpg.json @@ -0,0 +1,26 @@ +{ + "choices": [ + { + "finish_reason": "stop", + "index": 0, + "logprobs": null, + "message": { + "content": "The image is a blank white space with no visible objects or features. 
It appears to be an empty or placeholder image, devoid of any content or visual elements.", + "name": null, + "role": "assistant", + "tool_calls": null + }, + "usage": null + } + ], + "created": 1743861910, + "id": "", + "model": "ll-re/Llama-4-Scout-17B-16E-Instruct", + "object": "chat.completion", + "system_fingerprint": "3.2.1-dev0-native", + "usage": { + "completion_tokens": 34, + "prompt_tokens": 166, + "total_tokens": 200 + } +} diff --git a/integration-tests/models/__snapshots__/test_transformers_llama4/test_flash_llama4_image_base64_rgb_png.json b/integration-tests/models/__snapshots__/test_transformers_llama4/test_flash_llama4_image_base64_rgb_png.json new file mode 100644 index 000000000..9d418e813 --- /dev/null +++ b/integration-tests/models/__snapshots__/test_transformers_llama4/test_flash_llama4_image_base64_rgb_png.json @@ -0,0 +1,26 @@ +{ + "choices": [ + { + "finish_reason": "stop", + "index": 0, + "logprobs": null, + "message": { + "content": "The image is a blank white space with no visible objects or features.", + "name": null, + "role": "assistant", + "tool_calls": null + }, + "usage": null + } + ], + "created": 1743861909, + "id": "", + "model": "ll-re/Llama-4-Scout-17B-16E-Instruct", + "object": "chat.completion", + "system_fingerprint": "3.2.1-dev0-native", + "usage": { + "completion_tokens": 15, + "prompt_tokens": 166, + "total_tokens": 181 + } +} diff --git a/integration-tests/models/__snapshots__/test_transformers_llama4/test_flash_llama4_image_base64_rgba.json b/integration-tests/models/__snapshots__/test_transformers_llama4/test_flash_llama4_image_base64_rgba.json new file mode 100644 index 000000000..dd34797dd --- /dev/null +++ b/integration-tests/models/__snapshots__/test_transformers_llama4/test_flash_llama4_image_base64_rgba.json @@ -0,0 +1,26 @@ +{ + "choices": [ + { + "finish_reason": "length", + "index": 0, + "logprobs": null, + "message": { + "content": "The image is a black background with no discernible objects or features. The image appears to be a blank or empty space, devoid of any visual elements.\n\n**Key Features:**\n\n* **Color:** The dominant color of the image is black.\n* **Objects:** There are no visible objects or shapes in the image.\n* **Background:** The background of the image is a solid black color.\n\n**Conclusion:**\nIn summary, the image is a simple and empty visual representation with a black background and no", + "name": null, + "role": "assistant", + "tool_calls": null + }, + "usage": null + } + ], + "created": 1743861909, + "id": "", + "model": "ll-re/Llama-4-Scout-17B-16E-Instruct", + "object": "chat.completion", + "system_fingerprint": "3.2.1-dev0-native", + "usage": { + "completion_tokens": 100, + "prompt_tokens": 166, + "total_tokens": 266 + } +} diff --git a/integration-tests/models/__snapshots__/test_transformers_llama4/test_flash_llama4_image_cow.json b/integration-tests/models/__snapshots__/test_transformers_llama4/test_flash_llama4_image_cow.json new file mode 100644 index 000000000..e89c99256 --- /dev/null +++ b/integration-tests/models/__snapshots__/test_transformers_llama4/test_flash_llama4_image_cow.json @@ -0,0 +1,26 @@ +{ + "choices": [ + { + "finish_reason": "stop", + "index": 0, + "logprobs": null, + "message": { + "content": "The image shows a brown cow standing on the beach with a white face and black and white marking on its ears. The cow has a white patch around its nose and mouth. 
The ocean and blue sky are in the background.", + "name": null, + "role": "assistant", + "tool_calls": null + }, + "usage": null + } + ], + "created": 1743863057, + "id": "", + "model": "ll-re/Llama-4-Scout-17B-16E-Instruct", + "object": "chat.completion", + "system_fingerprint": "3.2.1-dev0-native", + "usage": { + "completion_tokens": 46, + "prompt_tokens": 164, + "total_tokens": 210 + } +} diff --git a/integration-tests/models/__snapshots__/test_transformers_llama4/test_flash_llama4_image_cow_dog.json b/integration-tests/models/__snapshots__/test_transformers_llama4/test_flash_llama4_image_cow_dog.json new file mode 100644 index 000000000..f5a4a02d8 --- /dev/null +++ b/integration-tests/models/__snapshots__/test_transformers_llama4/test_flash_llama4_image_cow_dog.json @@ -0,0 +1,26 @@ +{ + "choices": [ + { + "finish_reason": "stop", + "index": 0, + "logprobs": null, + "message": { + "content": "The image does not depict a dog; it shows a cow standing on a beach. Therefore, there is no breed of a dog to identify.", + "name": null, + "role": "assistant", + "tool_calls": null + }, + "usage": null + } + ], + "created": 1743863056, + "id": "", + "model": "ll-re/Llama-4-Scout-17B-16E-Instruct", + "object": "chat.completion", + "system_fingerprint": "3.2.1-dev0-native", + "usage": { + "completion_tokens": 30, + "prompt_tokens": 168, + "total_tokens": 198 + } +} diff --git a/integration-tests/models/test_transformers_llama4.py b/integration-tests/models/test_transformers_llama4.py new file mode 100644 index 000000000..a20d32847 --- /dev/null +++ b/integration-tests/models/test_transformers_llama4.py @@ -0,0 +1,155 @@ +import base64 +from io import BytesIO +from PIL import Image + +import pytest + + +@pytest.fixture(scope="module") +def flash_llama4_handle(launcher): + with launcher("ll-re/Llama-4-Scout-17B-16E-Instruct", num_shard=8) as handle: + yield handle + + +@pytest.fixture(scope="module") +async def flash_llama4(flash_llama4_handle): + await flash_llama4_handle.health(300) + return flash_llama4_handle.client + + +async def test_flash_llama4(flash_llama4, response_snapshot): + response = await flash_llama4.generate( + "Hello I am doing a project on the 1918 flu pandemic and I am trying to find out how many", + seed=42, + max_new_tokens=100, + ) + + assert ( + response.generated_text + == " people died in the 1918 flu pandemic. Estimating the death toll of the 1918 flu pandemic is difficult because of incomplete records and because of the fact that many of the extra deaths were not attributed to the flu. Many experts believe that the 1918 flu pandemic killed between 50 and 100 million people. Iassistant\n\nThe 1918 flu pandemic, also known as the Spanish flu, is indeed one of the most devastating public health crises in human history. Estimating the exact" + ) + assert response.details.generated_tokens == 100 + assert response == response_snapshot + + +async def test_flash_llama4_image_cow_dog(flash_llama4, response_snapshot): + image_url = "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/cow_beach_1.png" + response = await flash_llama4.chat( + seed=42, + messages=[ + { + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": image_url}}, + { + "type": "text", + "text": "What is the breed of the dog in the image?", + }, + ], + }, + ], + max_tokens=100, + ) + + assert ( + response.choices[0].message.content + == "The image does not depict a dog; it shows a cow standing on a beach. Therefore, there is no breed of a dog to identify." 
+ ) + assert response.usage["completion_tokens"] == 30 + assert response == response_snapshot + + +async def test_flash_llama4_image_cow(flash_llama4, response_snapshot): + image_url = "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/cow_beach_1.png" + response = await flash_llama4.chat( + seed=42, + messages=[ + { + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": image_url}}, + {"type": "text", "text": "What is shown in this image?"}, + ], + }, + ], + max_tokens=100, + ) + assert ( + response.choices[0].message.content + == "The image shows a brown cow standing on the beach with a white face and black and white marking on its ears. The cow has a white patch around its nose and mouth. The ocean and blue sky are in the background." + ) + assert response.usage["completion_tokens"] == 46 + assert response == response_snapshot + + +# Helper function to convert a Pillow image to a base64 data URL +def image_to_data_url(img: Image.Image, fmt: str) -> str: + buffer = BytesIO() + img.save(buffer, format=fmt) + img_data = buffer.getvalue() + b64_str = base64.b64encode(img_data).decode("utf-8") + mime_type = "image/png" if fmt.upper() == "PNG" else "image/jpeg" + return f"data:{mime_type};base64,{b64_str}" + + +async def test_flash_llama4_image_base64_rgba(flash_llama4, response_snapshot): + # Create an empty 100x100 PNG image with alpha (transparent background) + img = Image.new("RGBA", (100, 100), (0, 0, 0, 0)) + data_url = image_to_data_url(img, "PNG") + response = await flash_llama4.chat( + seed=42, + messages=[ + { + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": data_url}}, + { + "type": "text", + "text": "What do you see in this transparent image?", + }, + ], + }, + ], + max_tokens=100, + ) + assert response == response_snapshot + + +async def test_flash_llama4_image_base64_rgb_png(flash_llama4, response_snapshot): + # Create an empty 100x100 PNG image without alpha (white background) + img = Image.new("RGB", (100, 100), (255, 255, 255)) + data_url = image_to_data_url(img, "PNG") + response = await flash_llama4.chat( + seed=42, + messages=[ + { + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": data_url}}, + {"type": "text", "text": "What do you see in this plain image?"}, + ], + }, + ], + max_tokens=100, + ) + assert response == response_snapshot + + +async def test_flash_llama4_image_base64_rgb_jpg(flash_llama4, response_snapshot): + # Create an empty 100x100 JPEG image (white background) + img = Image.new("RGB", (100, 100), (255, 255, 255)) + data_url = image_to_data_url(img, "JPEG") + response = await flash_llama4.chat( + seed=42, + messages=[ + { + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": data_url}}, + {"type": "text", "text": "What do you see in this JPEG image?"}, + ], + }, + ], + max_tokens=100, + ) + assert response == response_snapshot diff --git a/router/src/config.rs b/router/src/config.rs index 4460eb00e..8188e535d 100644 --- a/router/src/config.rs +++ b/router/src/config.rs @@ -1,4 +1,5 @@ use serde::{Deserialize, Serialize}; +use std::collections::{HashMap, HashSet}; #[derive(Clone, Debug, Serialize, Deserialize)] #[serde(tag = "model_type")] @@ -103,6 +104,141 @@ impl LlavaNext { } } +#[derive(Clone, Debug, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub struct Llama4VisionConfig { + image_size: usize, + patch_size: usize, + pixel_shuffle_ratio: f64, +} + +#[derive(Clone, Debug, Serialize, Deserialize)] 
+#[serde(rename_all = "snake_case")] +pub struct Llama4 { + text_config: TextConfig, + vision_config: Llama4VisionConfig, +} + +fn gcd(a: usize, b: usize) -> usize { + if b == 0 { + a + } else { + gcd(b, a % b) + } +} + +fn get_factors(dividend: usize) -> HashSet<usize> { + let mut factors_set = HashSet::new(); + + for i in 1..=((dividend as f64).sqrt() as usize) { + if dividend % i == 0 { + factors_set.insert(i); + factors_set.insert(dividend / i); + } + } + + factors_set +} + +fn find_supported_resolutions(max_num_chunks: usize, height: usize) -> Vec<(usize, usize)> { + let patch_size = height; + + let mut asp_dict: HashMap<(usize, usize), Vec<(usize, usize)>> = HashMap::new(); + + for chunk_size in (1..=max_num_chunks).rev() { + let mut _factors: Vec<_> = get_factors(chunk_size).into_iter().collect(); + _factors.sort(); + let _asp_ratios: Vec<(usize, usize)> = + _factors.iter().map(|&f| (f, chunk_size / f)).collect(); + + for (h, w) in _asp_ratios { + let divisor = gcd(h, w); + let key = (h / divisor, w / divisor); // reduced aspect ratio as key + + asp_dict.entry(key).or_default().push((h, w)); + } + } + + let mut possible_resolutions = vec![]; + + for (_key, value) in asp_dict { + for (h, w) in value { + possible_resolutions.push((h * patch_size, w * patch_size)); + } + } + + possible_resolutions +} + +fn get_best_fit( + original_height: usize, + original_width: usize, + possible_resolutions: &[(usize, usize)], + resize_to_max_canvas: bool, +) -> (usize, usize) { + let orig_h = original_height as f32; + let orig_w = original_width as f32; + + let mut scales = Vec::with_capacity(possible_resolutions.len()); + + for &(h, w) in possible_resolutions.iter() { + let scale_h = h as f32 / orig_h; + let scale_w = w as f32 / orig_w; + let scale = scale_h.min(scale_w); + scales.push(scale); + } + + let upscaling_options: Vec<f32> = scales.iter().copied().filter(|&s| s >= 1.0).collect(); + let selected_scale = if !upscaling_options.is_empty() { + if resize_to_max_canvas { + upscaling_options.into_iter().fold(f32::MIN, f32::max) + } else { + upscaling_options.into_iter().fold(f32::MAX, f32::min) + } + } else { + let downscaling_options: Vec<f32> = scales.iter().copied().filter(|&s| s < 1.0).collect(); + downscaling_options.into_iter().fold(f32::MIN, f32::max) + }; + + let chosen_canvas: Vec<(usize, usize)> = possible_resolutions + .iter() + .zip(scales.iter()) + .filter(|&(_, &s)| (s - selected_scale).abs() < f32::EPSILON) + .map(|(&(h, w), _)| (h, w)) + .collect(); + + if chosen_canvas.len() > 1 { + chosen_canvas + .into_iter() + .min_by_key(|(h, w)| h * w) + .unwrap() + } else { + chosen_canvas[0] + } +} + +impl Llama4 { + pub fn image_size(&self) -> usize { + self.vision_config.image_size + } + + pub fn patch_size(&self) -> usize { + self.vision_config.patch_size + } + + pub fn pixel_shuffle_ratio(&self) -> f64 { + self.vision_config.pixel_shuffle_ratio + } + pub fn get_aspect_ratios(&self, height: usize, width: usize) -> (usize, usize) { + let patch_size = self.vision_config.image_size; + // How to avoid hardcoding this? 
+ let max_chunks = 15; + let supported = find_supported_resolutions(max_chunks, patch_size); + let (target_h, target_w) = get_best_fit(height, width, &supported, false); + (target_h / patch_size, target_w / patch_size) + } +} + #[derive(Clone, Debug, Serialize, Deserialize)] #[serde(rename_all = "snake_case")] pub struct ClipVisionModel { @@ -258,6 +394,7 @@ pub enum Config { Phi3, Phimoe, Llama, + Llama4(Llama4), Baichuan, Paligemma(Paligemma), Gemma, diff --git a/router/src/lib.rs b/router/src/lib.rs index e8b8f6632..50adb5cf6 100644 --- a/router/src/lib.rs +++ b/router/src/lib.rs @@ -179,6 +179,7 @@ pub enum HubPreprocessorConfig { Idefics2Processor(Idefics2Preprocessor), Idefics3Processor(Idefics2Preprocessor), Gemma3Processor(Gemma3Processor), + Llama4Processor(Llama4Processor), } impl HubPreprocessorConfig { @@ -200,6 +201,12 @@ pub struct Gemma3Processor { do_image_splitting: bool, } +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct Llama4Processor { + #[serde(default)] + do_image_splitting: bool, +} + #[derive(Debug, Clone, Deserialize, Default)] pub struct HubProcessorConfig { pub chat_template: Option<ChatTemplateVersions>, diff --git a/router/src/validation.rs b/router/src/validation.rs index 1119347dc..2d1d9a3d8 100644 --- a/router/src/validation.rs +++ b/router/src/validation.rs @@ -687,6 +687,47 @@ fn image_tokens( } Paligemma(config) => "<image>".repeat(config.get_number_of_features(height, width)), LlavaNext(config) => "<image>".repeat(config.get_number_of_features(height, width)), + Llama4(config) => { + const IMAGE_START: &str = "<|image_start|>"; + const IMAGE: &str = "<|image|>"; + const IMAGE_END: &str = "<|image_end|>"; + const PATCH: &str = "<|patch|>"; + const TILE_X_SEP: &str = "<|tile_x_separator|>"; + const TILE_Y_SEP: &str = "<|tile_y_separator|>"; + + let image_height = config.image_size(); + let patch_size = config.patch_size(); + let pixel_shuffle_ratio = config.pixel_shuffle_ratio(); + let downsample_ratio = + (1.0 / (pixel_shuffle_ratio * pixel_shuffle_ratio)).round() as usize; + + let (ratio_h, ratio_w) = config.get_aspect_ratios(height, width); + let image_width = image_height; // Assuming pixel shape: [H][W][C] + + let num_patches_per_chunk = + (image_height / patch_size) * (image_width / patch_size) / downsample_ratio; + + let mut img_string = String::new(); + img_string.push_str(IMAGE_START); + + if ratio_h * ratio_w > 1 { + for _yy in 0..ratio_h { + for xx in 0..ratio_w { + img_string.push_str(&PATCH.repeat(num_patches_per_chunk)); + if xx < ratio_w - 1 { + img_string.push_str(TILE_X_SEP); + } + } + img_string.push_str(TILE_Y_SEP); + } + } + + img_string.push_str(IMAGE); + img_string.push_str(&PATCH.repeat(num_patches_per_chunk)); + img_string.push_str(IMAGE_END); + + img_string + } Qwen2Vl(config) => format!( "<|vision_start|>{:?}<|vision_end|>", "<|image_pad|>".repeat(config.get_number_of_features(height, width)) @@ -730,8 +771,8 @@ fn prepare_input( static RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"!\[\]\([^\)]*\)").unwrap()); let (tokenizer_query, input_chunks) = match config { Some( - config @ (Idefics | Mllama | Idefics2(_) | Idefics3(_) | Gemma3(_) | Paligemma(_) - | LlavaNext(_) | Qwen2Vl(_) | Qwen2_5Vl(_)), + config @ (Idefics | Mllama | Idefics2(_) | Idefics3(_) | Gemma3(_) | Llama4(_) + | Paligemma(_) | LlavaNext(_) | Qwen2Vl(_) | Qwen2_5Vl(_)), ) => { let mut input_chunks = Vec::new(); let mut tokenizer_query = String::with_capacity(inputs.len()); diff --git a/server/pyproject.toml b/server/pyproject.toml index e3ec734a5..86cb49223 100644 --- a/server/pyproject.toml +++ 
b/server/pyproject.toml @@ -32,7 +32,8 @@ dependencies = [ "tokenizers>=0.20.3", "typer>=0.15.1", "transformers>=4.49.0", - "huggingface-hub>=0.29.0", + "huggingface-hub>=0.30.1", + "hf-xet>=1.0.0", ] [build-system] diff --git a/server/text_generation_server/models/__init__.py b/server/text_generation_server/models/__init__.py index ab830b58b..84437bf32 100644 --- a/server/text_generation_server/models/__init__.py +++ b/server/text_generation_server/models/__init__.py @@ -206,7 +206,13 @@ try: from text_generation_server.models.transformers_flash_causal_lm import ( TransformersFlashCausalLM, ) -except ImportError: + from text_generation_server.models.transformers_flash_vlm import ( + TransformersFlashVlmCausalLM, + TransformersGemma3VlmCausalLM, + TransformersLlama4VlmCausalLM, + ) +except ImportError as e: + log_master(logger.warning, f"Could not import Flash Transformers Backend: {e}") FLASH_TRANSFORMERS_BACKEND = False @@ -244,6 +250,11 @@ class ModelType(enum.Enum): "name": "Llama", "url": "https://huggingface.co/collections/meta-llama/llama-31-669fc079a0c406a149a5738f", } + LLAMA4 = { + "type": "llama4", + "name": "Llama4", + "url": "https://huggingface.co/collections/meta-llama/llama-31-669fc079a0c406a149a5738f", + } PHI3 = { "type": "phi3", "name": "Phi 3", @@ -648,7 +659,6 @@ def get_model( raise ValueError( f"The backend {SYSTEM} does not support sliding window attention that is used by the model type {model_type}. To use this model nonetheless with the {SYSTEM} backend, please launch TGI with the argument `--max-input-tokens` smaller than sliding_window={sliding_window} (got here max_input_tokens={max_input_tokens})." ) - if model_type == DEEPSEEK_V2: if FLASH_ATTENTION: head_size = max( @@ -1017,7 +1027,23 @@ def get_model( dtype=dtype, trust_remote_code=trust_remote_code, ) + elif model_type == LLAMA4: + if FLASH_TRANSFORMERS_BACKEND: + from transformers import Llama4ForConditionalGeneration as Llama4Model + return TransformersLlama4VlmCausalLM.fallback( + model_id, + Llama4Model, + revision, + quantize=quantize, + speculator=speculator, + dtype=torch.bfloat16, + trust_remote_code=trust_remote_code, + processor_kwargs={ + "use_fast": True, + "size": {"height": 336, "width": 336}, + }, + ) elif model_type == BAICHUAN: if FLASH_ATTENTION: return FlashCausalLM( @@ -1155,7 +1181,6 @@ def get_model( ) elif model_type == GEMMA3: if FLASH_ATTENTION: - # TODO: Use VlmCausalLM when image support is added. 
return VlmCausalLM( model_id=model_id, model_class=Gemma3ForConditionalGeneration, @@ -1173,12 +1198,15 @@ def get_model( lora_adapter_ids=lora_adapter_ids, ) elif FLASH_TRANSFORMERS_BACKEND: - return TransformersFlashCausalLM.fallback( + from transformers import Gemma3ForConditionalGeneration as Gemma3Model + + return TransformersGemma3VlmCausalLM.fallback( model_id, + Gemma3Model, revision, quantize=quantize, speculator=speculator, - dtype=dtype, + dtype=torch.bfloat16, trust_remote_code=trust_remote_code, ) elif sharded: @@ -1483,33 +1511,65 @@ def get_model( else: raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Idefics")) if model_type == QWEN2_VL: - return VlmCausalLM( - model_id=model_id, - model_class=Qwen2VLForConditionalGeneration, - revision=revision, - quantize=quantize, - speculator=speculator, - dtype=dtype, - default_dtype=torch.bfloat16, - kv_cache_dtype=kv_cache_dtype, - trust_remote_code=trust_remote_code, - lora_adapter_ids=lora_adapter_ids, - ) + if FLASH_ATTENTION: + return VlmCausalLM( + model_id=model_id, + model_class=Qwen2VLForConditionalGeneration, + revision=revision, + quantize=quantize, + speculator=speculator, + dtype=dtype, + default_dtype=torch.bfloat16, + kv_cache_dtype=kv_cache_dtype, + trust_remote_code=trust_remote_code, + lora_adapter_ids=lora_adapter_ids, + ) + # TODO: Uncomment when transformers is refactored + # elif FLASH_TRANSFORMERS_BACKEND: + # from transformers import Qwen2VLForConditionalGeneration as Qwen2VLModel + + # return TransformersQwen2VlmCausalLM.fallback( + # model_id, + # Qwen2VLModel, + # revision, + # quantize=quantize, + # speculator=speculator, + # dtype=torch.bfloat16, + # trust_remote_code=trust_remote_code, + # ) + else: + raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Qwen2_VL")) if model_type == QWEN2_5_VL: - return VlmCausalLM( - model_id=model_id, - model_class=Qwen2_5VLForConditionalGeneration, - revision=revision, - quantize=quantize, - speculator=speculator, - dtype=dtype, - default_dtype=torch.bfloat16, - kv_cache_dtype=kv_cache_dtype, - trust_remote_code=trust_remote_code, - lora_adapter_ids=lora_adapter_ids, - config_class=Qwen2_5_VLConfig, - processor_class=Qwen2_5_VLProcessor, - ) + if FLASH_ATTENTION: + return VlmCausalLM( + model_id=model_id, + model_class=Qwen2_5VLForConditionalGeneration, + revision=revision, + quantize=quantize, + speculator=speculator, + dtype=dtype, + default_dtype=torch.bfloat16, + kv_cache_dtype=kv_cache_dtype, + trust_remote_code=trust_remote_code, + lora_adapter_ids=lora_adapter_ids, + config_class=Qwen2_5_VLConfig, + processor_class=Qwen2_5_VLProcessor, + ) + # TODO: Uncomment when transformers is refactored + # elif FLASH_TRANSFORMERS_BACKEND: + # return TransformersQwen2VlmCausalLM.fallback( + # model_id, + # Qwen2VLModel, + # revision, + # quantize=quantize, + # speculator=speculator, + # dtype=torch.bfloat16, + # trust_remote_code=trust_remote_code, + # config_class=Qwen2_5_VLConfig, + # processor_class=Qwen2_5_VLProcessor, + # ) + else: + raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Qwen2_5_VL")) if model_type == MLLAMA: if FLASH_ATTENTION: return MllamaCausalLM( @@ -1524,6 +1584,20 @@ def get_model( trust_remote_code=trust_remote_code, lora_adapter_ids=lora_adapter_ids, ) + # TODO: Uncomment when transformers is refactored and cross attn is added + # elif FLASH_TRANSFORMERS_BACKEND: + # from transformers import MllamaForConditionalGeneration as MllamaModel + + # return TransformersFlashVlmCausalLM.fallback( + # model_id, + # MllamaModel, + # 
revision, + # quantize=quantize, + # speculator=speculator, + # dtype=torch.bfloat16, + # trust_remote_code=trust_remote_code, + # batch_class=MllamaCausalLMBatch, + # ) else: raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Mllama")) if model_type == IDEFICS2: @@ -1542,6 +1616,19 @@ def get_model( # VRAM usage. processor_kwargs={"size": {"longest_edge": 448, "shortest_edge": 378}}, ) + elif FLASH_TRANSFORMERS_BACKEND: + from transformers import Idefics2ForConditionalGeneration as Idefics2Model + + return TransformersFlashVlmCausalLM.fallback( + model_id, + Idefics2Model, + revision, + quantize=quantize, + speculator=speculator, + dtype=dtype, + trust_remote_code=trust_remote_code, + processor_kwargs={"size": {"longest_edge": 448, "shortest_edge": 378}}, + ) else: raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Idefics")) if model_type == IDEFICS3: @@ -1560,6 +1647,19 @@ def get_model( # VRAM usage. processor_kwargs={"size": {"longest_edge": 1456}}, ) + elif FLASH_TRANSFORMERS_BACKEND: + from transformers import Idefics3ForConditionalGeneration as Idefics3Model + + return TransformersFlashVlmCausalLM.fallback( + model_id, + Idefics3Model, + revision, + quantize=quantize, + speculator=speculator, + dtype=torch.bfloat16, + trust_remote_code=trust_remote_code, + processor_kwargs={"size": {"longest_edge": 1456}}, + ) else: raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Idefics")) if model_type == PALIGEMMA: @@ -1578,9 +1678,21 @@ def get_model( lora_adapter_ids=lora_adapter_ids, batch_class=PaliGemmaBatch, ) - else: - raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Idefics")) + elif FLASH_TRANSFORMERS_BACKEND: + from transformers import PaliGemmaForConditionalGeneration as PaliGemmaModel + return TransformersFlashVlmCausalLM.fallback( + model_id, + PaliGemmaModel, + revision, + quantize=quantize, + speculator=speculator, + dtype=torch.bfloat16, + trust_remote_code=trust_remote_code, + batch_class=PaliGemmaBatch, + ) + else: + raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("PaliGemma")) if model_type == LLAVA_NEXT: if FLASH_ATTENTION: return VlmCausalLM( @@ -1593,6 +1705,18 @@ def get_model( kv_cache_dtype=kv_cache_dtype, trust_remote_code=trust_remote_code, ) + elif FLASH_TRANSFORMERS_BACKEND: + from transformers import LlavaNextForConditionalGeneration as LlavaNextModel + + return TransformersFlashVlmCausalLM.fallback( + model_id, + LlavaNextModel, + revision, + quantize=quantize, + speculator=speculator, + dtype=dtype, + trust_remote_code=trust_remote_code, + ) else: raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("LlavaNext")) diff --git a/server/text_generation_server/models/flash_causal_lm.py b/server/text_generation_server/models/flash_causal_lm.py index d3a83e271..c7c5a374b 100644 --- a/server/text_generation_server/models/flash_causal_lm.py +++ b/server/text_generation_server/models/flash_causal_lm.py @@ -1344,9 +1344,6 @@ class FlashCausalLM(Model): def batch_type(self) -> Type[FlashCausalLMBatch]: return FlashCausalLMBatch - def max_past(self) -> int: - return getattr(self.model, "max_past", None) - def init_kv_cache( self, num_blocks: int, @@ -1792,12 +1789,6 @@ class FlashCausalLM(Model): max_s = batch.max_current_length lm_head_indices = batch.prefill_head_indices - if cu_seqlen_prefill is None and self.max_past() is not None: - # In decode, not prefill, we're actually overwriting the KV-cache - # in a circular buffer mode. - # This makes sure the max_s for the decode pass is correct. 
- max_s = min(self.max_past(), max_s) - bs = input_ids.shape[0] sorted_padded_bs = sorted([k for k in self.cuda_graphs.keys() if k >= bs]) if sorted_padded_bs: diff --git a/server/text_generation_server/models/transformers_flash_causal_lm.py b/server/text_generation_server/models/transformers_flash_causal_lm.py index 8773bfd3a..77659dd0f 100644 --- a/server/text_generation_server/models/transformers_flash_causal_lm.py +++ b/server/text_generation_server/models/transformers_flash_causal_lm.py @@ -36,6 +36,7 @@ def tgi_flash_attention_forward( softcap: Optional[float] = None, **kwargs, # This is needed to "absorb" other args passed by Transformers modeling ): + kv_cache = kv_cache[module.layer_idx] query_states = query_states.transpose(1, 2).squeeze(dim=0) key_states = key_states.transpose(1, 2).squeeze(dim=0) @@ -72,6 +73,7 @@ def tgi_flash_attention_forward( max_s, kv_scales=kv_scales, softcap=softcap, + window_size_left=sliding_window, ) attn_output = attn_output.view(-1, num_heads * head_dim) @@ -157,7 +159,14 @@ class TransformersFlashCausalLM(FlashCausalLM): self.num_layers = model.config.num_hidden_layers self.num_heads = model.config.num_attention_heads self.num_kv_heads = model.config.num_key_value_heads - self.head_size = model.config.hidden_size // model.config.num_attention_heads + # Some models use GQA and different sizes for o_proj + # and q_proj, that allows for that. + if hasattr(model.config, "head_dim"): + self.head_size = model.config.head_dim + else: + self.head_size = ( + model.config.hidden_size // model.config.num_attention_heads + ) # Skip it for models in the exception list if model.config.model_type not in REPLICATED_ATTENTION_MODELS: diff --git a/server/text_generation_server/models/transformers_flash_vlm.py b/server/text_generation_server/models/transformers_flash_vlm.py new file mode 100644 index 000000000..a7beb68b3 --- /dev/null +++ b/server/text_generation_server/models/transformers_flash_vlm.py @@ -0,0 +1,566 @@ +import math +from typing import List, Optional + +import torch +from opentelemetry import trace +from transformers import AutoTokenizer, AutoProcessor +import transformers.modeling_utils + +from text_generation_server.models.flash_causal_lm import FlashCausalLM +from text_generation_server.models.vlm_causal_lm import VlmCausalLM, VlmCausalLMBatch +from text_generation_server.utils import initialize_torch_distributed + +from text_generation_server.layers.attention import paged_attention, attention, Seqlen +from text_generation_server.layers.attention.kv_cache import KVScales, KVCache +from text_generation_server.models.globals import ATTENTION +import torch.nn.functional as F + +tracer = trace.get_tracer(__name__) + +# The base TP plan of these models has replicated q/k/v. This means that each process will see the full states, +# hence we should not divide the number of heads by the world size. This is a known waste of VRAM (the cache +# will be fully replicated on each process) and GPU communication (additional all-gather operations), however due +# to internal constraints it was not (yet?) 
possible to circumvent +REPLICATED_ATTENTION_MODELS = [ + "olmo2", + "phi3", +] + + +# # Qwen2VL +# transformers.models.qwen2_vl.modeling_qwen2_vl.QWEN2_VL_VISION_ATTENTION_CLASSES[ +# "tgi" +# ] = transformers.models.qwen2_vl.modeling_qwen2_vl.QWEN2_VL_VISION_ATTENTION_CLASSES[ +# "eager" +# ] +def tgi_flash_attention_forward( + module, + query_states: torch.Tensor, + key_states: torch.Tensor, + value_states: torch.Tensor, + attention_mask: Optional[torch.Tensor], # This is a positional arg in Transformers + kv_cache: List[KVCache], + kv_head_mapping: torch.Tensor, + slots: torch.Tensor, + cu_seqlen_prefill: Optional[torch.Tensor], + seqlen: Seqlen, + block_tables: torch.Tensor, + max_s: int, + kv_scales: KVScales, + softmax_scale: Optional[float] = None, + sliding_window: Optional[int] = None, + softcap: Optional[float] = None, + use_sdpa: Optional[bool] = False, + **kwargs, # This is needed to "absorb" other args passed by Transformers modeling +): + kv_cache = kv_cache[module.layer_idx] + query_states = query_states.transpose(1, 2).squeeze(dim=0) + key_states = key_states.transpose(1, 2).squeeze(dim=0) + value_states = value_states.transpose(1, 2).squeeze(dim=0) + + # Take care of updating the cache in-place + kv_cache.store(key=key_states, value=value_states, slots=slots, kv_scales=kv_scales) + + _, num_heads, head_dim = query_states.shape + softmax_scale = 1 / math.sqrt(head_dim) if softmax_scale is None else softmax_scale + sliding_window = -1 if sliding_window is None else sliding_window + + if cu_seqlen_prefill is not None: + if not use_sdpa: + attn_output = attention( + query=query_states, + key=key_states, + value=value_states, + kv_cache=kv_cache, + kv_scales=kv_scales, + seqlen=seqlen, + block_tables=block_tables, + softmax_scale=softmax_scale, + window_size_left=sliding_window, + softcap=softcap, + ) + else: + lengths = cu_seqlen_prefill[1:] - cu_seqlen_prefill[:-1] + max_length = max(lengths) + attention_mask = attention_mask[:, :, :, :max_length] + enable_gqa = query_states.shape[1] != key_states.shape[1] + # Split tensors using vectorized split + query_list = torch.split(query_states, lengths.tolist(), dim=0) + key_list = torch.split(key_states, lengths.tolist(), dim=0) + value_list = torch.split(value_states, lengths.tolist(), dim=0) + + padded_query = torch.nn.utils.rnn.pad_sequence(query_list, batch_first=True) + padded_key = torch.nn.utils.rnn.pad_sequence(key_list, batch_first=True) + padded_value = torch.nn.utils.rnn.pad_sequence(value_list, batch_first=True) + + padded_query = padded_query.transpose(1, 2).contiguous() + padded_key = padded_key.transpose(1, 2).contiguous() + padded_value = padded_value.transpose(1, 2).contiguous() + + # Compute attention + attn_output = F.scaled_dot_product_attention( + padded_query, + padded_key, + padded_value, + attn_mask=attention_mask, + scale=softmax_scale, + enable_gqa=enable_gqa, + ) + + attn_output = attn_output.transpose( + 1, 2 + ) # [batch_size, seq_len, num_heads, head_dim] + max_seq_len = padded_query.size(2) + seq_range = torch.arange(max_seq_len, device=padded_query.device).unsqueeze( + 0 + ) + lengths_tensor = torch.tensor( + lengths, device=padded_query.device + ).unsqueeze(1) + mask = seq_range < lengths_tensor # [batch, max_seq_len] + attn_output = attn_output[mask] # [total_seq_len, num_heads, head_dim] + + else: + attn_output = paged_attention( + query_states, + kv_cache, + kv_head_mapping, + softmax_scale, + block_tables, + seqlen, + max_s, + kv_scales=kv_scales, + softcap=softcap, + 
window_size_left=sliding_window, + ) + + attn_output = attn_output.view(-1, num_heads * head_dim) + + return attn_output, None + + +transformers.modeling_utils.ALL_ATTENTION_FUNCTIONS["tgi"] = tgi_flash_attention_forward + + +# TODO: implement +# tgi_cross_attention_forward + + +class TransformersFlashVlmCausalLM(VlmCausalLM): + def __init__( + self, + model_id: str, + model_class, + revision: Optional[str] = None, + quantize: Optional[str] = None, + speculator: Optional[str] = None, + dtype: Optional[torch.dtype] = None, + default_dtype=torch.float16, + trust_remote_code: bool = False, + tokenizer_class=AutoTokenizer, + processor_class=AutoProcessor, + processor_kwargs=None, + kv_cache_dtype: Optional[torch.dtype] = None, + batch_class=VlmCausalLMBatch, + ): + self.batch_class = batch_class + self.quantize = quantize + self.process_group, rank, world_size = initialize_torch_distributed() + self.dtype = dtype + + if speculator: + raise RuntimeError("Speculator decoding is not enabled for AutoModel") + + if torch.cuda.is_available(): + device = torch.device(f"cuda:{rank}") + dtype = default_dtype if dtype is None else dtype + elif hasattr(torch, "xpu") and torch.xpu.is_available(): + device = torch.device("xpu") + dtype = default_dtype if dtype is None else dtype + else: + raise ValueError( + "Flash `Transformers` modeling backend is not available on cpu." + ) + + tokenizer = tokenizer_class.from_pretrained( + model_id, + revision=revision, + padding_side="left", + truncation_side="left", + trust_remote_code=trust_remote_code, + ) + + if processor_kwargs is None: + processor_kwargs = {} + + self.processor = processor_class.from_pretrained( + model_id, + revision=revision, + trust_remote_code=trust_remote_code, + **processor_kwargs, + ) + + attn_implementation = { + "text_config": "tgi", + "vision_config": "sdpa", + } + + model = model_class.from_pretrained( + model_id, + revision=revision, + torch_dtype=dtype, + load_in_8bit=quantize == "bitsandbytes", + trust_remote_code=trust_remote_code, + attn_implementation=attn_implementation, + device_map=device if world_size == 1 else None, + tp_plan="auto" if world_size > 1 else None, + ) + + torch.distributed.barrier(group=self.process_group) + self.config = model.config + config = model.config + + # VLM models define the config we care about in their text_config + text_config = getattr(model.config, "text_config", None) + if text_config is not None: + config = text_config + + if tokenizer.pad_token_id is None: + if model.config.pad_token_id is not None: + tokenizer.pad_token_id = model.config.pad_token_id + elif model.config.eos_token_id is not None and isinstance( + model.config.eos_token_id, int + ): + tokenizer.pad_token_id = model.config.eos_token_id + elif tokenizer.eos_token_id is not None: + tokenizer.pad_token_id = tokenizer.eos_token_id + else: + tokenizer.add_special_tokens({"pad_token": "[PAD]"}) + + self.num_layers = config.num_hidden_layers + self.num_heads = config.num_attention_heads + self.num_kv_heads = config.num_key_value_heads + # Some models use GQA and different sizes for o_proj + # and q_proj, that allows for that. 
+ if hasattr(config, "head_dim"): + self.head_size = config.head_dim + else: + self.head_size = config.hidden_size // config.num_attention_heads + + # Skip it for models in the exception list + if config.model_type not in REPLICATED_ATTENTION_MODELS: + self.num_heads = self.num_heads // self.process_group.size() + self.num_kv_heads = ( + self.num_kv_heads // self.process_group.size() + if self.num_kv_heads > 1 + else self.num_kv_heads + ) + + self.cuda_graphs = {} + self.kv_cache = [] + self.kv_cache_dtype = dtype if kv_cache_dtype is None else kv_cache_dtype + + if ATTENTION == "flashinfer": + from text_generation_server.layers.attention.flashinfer import ( + create_prefill_state, + create_decode_state, + create_prefill_with_paged_kv_state, + ) + + self.prefill_state = create_prefill_state(device=device) + self.prefill_with_paged_kv_state = create_prefill_with_paged_kv_state( + device=device + ) + + self.decode_state = create_decode_state( + device=device, + num_heads=self.num_heads, + num_kv_heads=self.num_kv_heads, + ) + + self.num_groups = self.num_heads // self.num_kv_heads + + # Those will never change and will be used in the forwards + self.kv_head_mapping = torch.arange( + 0, self.num_kv_heads, dtype=torch.int32, device=device + ).repeat_interleave(self.num_groups) + # This means no scale + self.kv_scales = KVScales( + torch.tensor(1.0, device=device), + torch.tensor(1.0, device=device), + ) + + # Skip FlashCausalLM init. + super(FlashCausalLM, self).__init__( + model_id=model_id, + model=model, + tokenizer=tokenizer, + requires_padding=False, + dtype=dtype, + device=device, + rank=rank, + world_size=world_size, + ) + + # Monkey patch of `self.model.forward` to match `FlashCausalLM`. It avoids duplicating a lot of code + # We first copy the original model.forward because we still need it in the monkey patch + self.model.original_forward = self.model.forward + self.model.forward = self._model_forward + self.model.get_position_ids = self.get_position_ids + + torch.distributed.barrier(group=self.process_group) + + def get_position_ids(self, input_ids, image_grid_thw, position_ids): + return position_ids + + def pre_process_inputs(self, input_ids, position_ids, cu_seqlen_prefill): + return { + "input_ids": input_ids.unsqueeze(0), + "position_ids": position_ids.unsqueeze(0), + } + + def post_process_outputs(self, logits, lm_head_indices): + return logits.squeeze(dim=0) + + @classmethod + def fallback( + cls, + model_id: str, + model_class, + revision: Optional[str] = None, + quantize: Optional[str] = None, + speculator: Optional[str] = None, + dtype: Optional[torch.dtype] = None, + trust_remote_code: bool = False, + batch_class: Optional[type] = VlmCausalLMBatch, + processor_kwargs: Optional[dict] = None, + ): + return cls( + model_id=model_id, + model_class=model_class, + revision=revision, + quantize=quantize, + speculator=speculator, + dtype=dtype, + trust_remote_code=trust_remote_code, + batch_class=batch_class, + processor_kwargs=processor_kwargs, + ) + + def _model_forward( + self, + input_ids: torch.Tensor, + position_ids: torch.Tensor, + cu_seqlen_prefill: Optional[torch.Tensor], + kv_cache: List[KVCache], + block_tables: torch.Tensor, + slots: torch.Tensor, + seqlen: Seqlen, + max_s: int, + lm_head_indices: Optional[torch.Tensor], + prefill_cache_indices=None, # not used, but passed to match original signature + adapter_data=None, # not supported, but passed to match original signature + pixel_values: torch.FloatTensor = None, + image_grid_thw: Optional[torch.LongTensor] = 
None, + pixel_attention_mask=None, + image_sizes: Optional[torch.LongTensor] = None, + ): + # A value of `None` (i.e. no logit slicing) translates to `0` in Transformers + logits_to_keep = lm_head_indices if lm_head_indices is not None else 0 + + inputs = self.pre_process_inputs( + input_ids=input_ids, + position_ids=position_ids, + cu_seqlen_prefill=cu_seqlen_prefill, + ) + # This is equivalent to `self.model.forward`, see the monkey patch in __init__ + logits = self.model.original_forward( + input_ids=inputs["input_ids"], + position_ids=inputs["position_ids"], + past_key_values=None, # we use self.kv_cache instead of transformers cache object + use_cache=False, # we use self.kv_cache instead of transformers cache object + logits_to_keep=logits_to_keep, + return_dict=True, + cu_seqlen_prefill=cu_seqlen_prefill, + kv_cache=kv_cache, + block_tables=block_tables, + slots=slots, + seqlen=seqlen, + max_s=max_s, + kv_head_mapping=self.kv_head_mapping, + kv_scales=self.kv_scales, + pixel_values=pixel_values, + pixel_attention_mask=pixel_attention_mask, + image_sizes=image_sizes, + image_grid_thw=image_grid_thw, + attention_mask=inputs.get("attention_mask", None), + use_sdpa=inputs.get("use_sdpa", False), + cache_position=inputs.get("cache_position", None), + ).logits + + logits = self.post_process_outputs(logits, lm_head_indices) + + return logits, None + + +class TransformersQwen2VlmCausalLM(TransformersFlashVlmCausalLM): + def get_position_ids(self, input_ids: torch.Tensor, image_grid_thw: torch.Tensor): + if image_grid_thw is None: + return ( + torch.arange(input_ids.shape[0], device=input_ids.device) + .unsqueeze(1) + .repeat(1, 3) + ) + + spatial_merge_size = self.config.vision_config.spatial_merge_size + vision_start_token_id = self.config.vision_start_token_id + vision_end_token_id = self.config.vision_end_token_id + device = input_ids.device + dtype = input_ids.dtype + input_ids_len = input_ids.shape[0] + + vision_starts = torch.where(input_ids == vision_start_token_id)[0] + vision_ends = torch.where(input_ids == vision_end_token_id)[0] + vision_segments = torch.stack((vision_starts, vision_ends), dim=1) + prev_vision_end = torch.cat( + [torch.zeros(1, device=vision_ends.device, dtype=dtype), vision_ends[:-1]] + ) + text_lengths_between_vision = vision_segments[:, 0] - prev_vision_end + 1 + vision_widths_max = torch.cat( + [ + torch.zeros(1, device=image_grid_thw.device, dtype=dtype), + image_grid_thw[:-1, 2] // spatial_merge_size, + ] + ) + vision_segment_lengths = vision_widths_max + text_lengths_between_vision + vision_segment_lengths = vision_segment_lengths.cumsum(dim=0) + text_segment_lengths = vision_segment_lengths - text_lengths_between_vision + + # create position ids for each vision segment based on the image grid + llm_pos_ids_list = [] + for i, _ in enumerate(vision_segments): + t, h, w = ( + image_grid_thw[i][0], + image_grid_thw[i][1] // spatial_merge_size, + image_grid_thw[i][2] // spatial_merge_size, + ) + t_indices = torch.arange(t, device=device).repeat_interleave(h * w) + h_indices = torch.arange(h, device=device).repeat_interleave(w).repeat(t) + w_indices = torch.arange(w, device=device).repeat(t * h) + image_position_ids = torch.stack([t_indices, h_indices, w_indices], dim=0) + + # offset by the position of the last vision segment + im = image_position_ids + vision_segment_lengths[i] + llm_pos_ids_list.append(im) + + # create position ids for each text segment + text_ranges = [ + torch.arange(seq_len, device=device).view(1, -1).expand(3, -1) + + 
text_segment_lengths[i] + for i, seq_len in enumerate(text_lengths_between_vision) + ] + + full_llm_pos_ids_list = [ + item for sublist in zip(text_ranges, llm_pos_ids_list) for item in sublist + ] + # import ipdb + + # ipdb.set_trace() + max_s = full_llm_pos_ids_list[-1].max() + 1 + final_text_len = input_ids_len - vision_ends[-1] + if final_text_len > 0: + m = torch.arange(final_text_len, device=device).view(1, -1).expand(3, -1) + full_llm_pos_ids_list.append(m + max_s) + + position_ids = ( + torch.cat(full_llm_pos_ids_list, dim=1).reshape(3, -1).transpose(0, 1) + ) + return position_ids + + def post_process_outputs(self, logits, lm_head_indices): + return logits.squeeze(dim=0)[lm_head_indices].unsqueeze(0) + + def pre_process_inputs(self, input_ids, position_ids, cu_seqlen_prefill): + input_ids = input_ids.unsqueeze(0) + position_ids = position_ids.transpose(0, 1).unsqueeze(1) + return {"input_ids": input_ids, "position_ids": position_ids} + + +class TransformersGemma3VlmCausalLM(TransformersFlashVlmCausalLM): + def get_attention_mask(self, input_ids, cu_seqlen_prefill): + device = input_ids.device + dtype = self.dtype + min_dtype = torch.finfo(dtype).min + + lengths = (cu_seqlen_prefill[1:] - cu_seqlen_prefill[:-1]).tolist() + batch_size = len(lengths) + + sequence_length = max(lengths) + target_length = sequence_length + # Create the padding mask from the computed lengths. + # pad_mask: [batch, sequence_length] where True indicates valid tokens. + seq_range = torch.arange(sequence_length, device=device).unsqueeze(0) + lengths_tensor = torch.tensor(lengths, device=device).unsqueeze(1) + pad_mask = seq_range < lengths_tensor # shape: [batch, sequence_length] + + # Build the base causal mask (for non-image tokens): + causal_mask = torch.tril( + torch.ones( + (sequence_length, sequence_length), dtype=torch.bool, device=device + ) + ) + base_mask = pad_mask.unsqueeze(2) & pad_mask.unsqueeze( + 1 + ) # [batch, sequence_length, sequence_length] + base_mask = base_mask & causal_mask.unsqueeze(0) # apply causal constraint + + image_token_mask = (input_ids == self.config.image_token_index).to( + input_ids.device + ) + + image_token_mask = torch.nn.utils.rnn.pad_sequence( + torch.split(image_token_mask, lengths), batch_first=True, padding_value=0 + ) + bidirectional_mask = image_token_mask.unsqueeze(2) & image_token_mask.unsqueeze( + 1 + ) + + # Combine the causal base mask and the bidirectional mask. 
+ combined_mask = torch.logical_or( + base_mask.unsqueeze(1), bidirectional_mask.unsqueeze(1) + ).to(device) + # combined_mask now has shape [batch, 1, sequence_length, sequence_length] + + full_attention_mask = torch.zeros( + (batch_size, 1, sequence_length, target_length), + device=device, + dtype=torch.bool, + ) + full_attention_mask[:, :, :, :sequence_length] = combined_mask + + final_attention_mask = torch.where(full_attention_mask, 0, min_dtype).to(device) + + return final_attention_mask + + def pre_process_inputs(self, input_ids, position_ids, cu_seqlen_prefill): + inputs = { + "input_ids": input_ids.unsqueeze(0), + "position_ids": position_ids.unsqueeze(0), + } + + if cu_seqlen_prefill is not None: + attention_mask = self.get_attention_mask( + input_ids.squeeze(0), cu_seqlen_prefill + ) + inputs["attention_mask"] = attention_mask + inputs["use_sdpa"] = True + + return inputs + + +class TransformersLlama4VlmCausalLM(TransformersFlashVlmCausalLM): + def pre_process_inputs(self, input_ids, position_ids, cu_seqlen_prefill): + inputs = super().pre_process_inputs(input_ids, position_ids, cu_seqlen_prefill) + inputs["cache_position"] = position_ids + inputs["attention_mask"] = torch.zeros((1, 1, 1, 1), device=input_ids.device) + return inputs diff --git a/server/text_generation_server/models/vlm_causal_lm.py b/server/text_generation_server/models/vlm_causal_lm.py index 9111fdc00..5f8eb9060 100644 --- a/server/text_generation_server/models/vlm_causal_lm.py +++ b/server/text_generation_server/models/vlm_causal_lm.py @@ -29,6 +29,33 @@ IDEFICS3_FAKE_IMAGE_TOKEN = "" IDEFICS3_GLOBAL_IMG_TOKEN = "" +def prompt_split_image_llama4(aspect_ratio, num_patches_per_chunk): + """ + Create a structured string representation of image tokens + + Args: + num_patches: Number of patches in the image + + Returns: + String with appropriate image tokens + """ + img_string = "<|image_start|>" + ratio_h, ratio_w = aspect_ratio + if ratio_h * ratio_w > 1: + for yy in range(ratio_h): + for xx in range(ratio_w): + img_string += "<|patch|>" * num_patches_per_chunk + if xx < ratio_w - 1: + img_string += "<|tile_x_separator|>" + + img_string += "<|tile_y_separator|>" + img_string += "<|image|>" + img_string += "<|patch|>" * num_patches_per_chunk + img_string += "<|image_end|>" + + return img_string + + # copied from: https://github.com/huggingface/transformers/blob/02ed609285c2448b3b54c31e362f2c389fa952ab/src/transformers/models/idefics3/processing_idefics3.py#L44-L60 def _prompt_split_image( *, @@ -134,6 +161,23 @@ def image_text_replacement(processor, image_input, config, image_id: int) -> str num_pads = 256 padding = "" * num_pads return f"\n\n{padding}\n\n" + elif config.model_type == "llama4": + patch_size = config.vision_config.patch_size + pixel_shuffle_ratio = config.vision_config.pixel_shuffle_ratio + downsample_ratio = int(round(1.0 / (pixel_shuffle_ratio**2))) + aspect_ratios = image_input["aspect_ratios"][image_id] + image_height, image_width = image_input["pixel_values"][image_id].shape[-2:] + + num_patches_per_chunk = int( + (image_height // patch_size) + * (image_width // patch_size) + // downsample_ratio + ) + tokens_for_this_image = prompt_split_image_llama4( + aspect_ratios, num_patches_per_chunk + ) + + return tokens_for_this_image else: raise RuntimeError(f"Unknown config {config.model_type} for multimodal") @@ -252,6 +296,8 @@ class VlmCausalLMBatch(FlashCausalLMBatch): images.append(image) elif config.model_type == "gemma3": images.append(image) + elif config.model_type == "llama4": + 
images.append(image) else: images.append([image]) else: @@ -285,7 +331,7 @@ class VlmCausalLMBatch(FlashCausalLMBatch): processor, image_inputs, config, image_id ) image_id += 1 - + # from pdb import set_trace; set_trace() full_text = image_text_replacement_fixup(config, full_text) input_ids = tokenizer( full_text, @@ -372,9 +418,6 @@ class VlmCausalLM(FlashCausalLM): def batch_type(self) -> Type[VlmCausalLMBatch]: return self.batch_class - def max_past(self) -> Optional[int]: - return getattr(self.model.text_model, "max_past", None) - def forward( self, batch: VlmCausalLMBatch, @@ -442,12 +485,6 @@ class VlmCausalLM(FlashCausalLM): ) batch.position_ids = position_ids - if cu_seqlen_prefill is None and self.max_past() is not None: - # In decode, not prefill, we're actually overwriting the KV-cache - # in a circular buffer mode. - # This makes sure the max_s for the decode pass is correct. - max_s = min(self.max_past(), max_s) - # Try to find an associated cuda graph bs = input_ids.shape[0] sorted_padded_bs = sorted([k for k in self.cuda_graphs.keys() if k >= bs]) diff --git a/server/uv.lock b/server/uv.lock index d4bbb9555..c3f3f0895 100644 --- a/server/uv.lock +++ b/server/uv.lock @@ -707,9 +707,24 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a1/14/f1e15b851d1c2af5b0b1a82bf8eb10bda2da62d98180220ba6fd8879bb5b/hf_transfer-0.1.9-cp38-abi3-win_amd64.whl", hash = "sha256:16f208fc678911c37e11aa7b586bc66a37d02e636208f18b6bc53d29b5df40ad", size = 1160240 }, ] +[[package]] +name = "hf-xet" +version = "1.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/64/46/db229dddc55121478105940b610fef1b466c414da02be9d4daa5602a2527/hf_xet-1.0.0.tar.gz", hash = "sha256:5e0ca891ce599fd753e7ffbdc182207d952a93e6252eeb92118475d6866bb093", size = 257192 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e7/0a/c16f8766fa3cd520292b1a765e9b50b8390bce4c2ed7657db9534551f5ed/hf_xet-1.0.0-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:6106304f92bbce7c9b8509f6f735f2e8ce95e4dc32af8876e874c48b15ca1903", size = 5001841 }, + { url = "https://files.pythonhosted.org/packages/e3/9f/cca55edd85d03fc98c743bcc093965740a7440e909779c558039d6838f03/hf_xet-1.0.0-cp37-abi3-macosx_11_0_arm64.whl", hash = "sha256:4d0bc7a3e6c1d21fcbb48e8726e3b19a2460e95971375e55e9a5f73ec7079a86", size = 4805318 }, + { url = "https://files.pythonhosted.org/packages/d1/0b/28bda7ac9d699dcfb96f628aa135ddca3f0f77e9716351aab2b83966f957/hf_xet-1.0.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:23dee64f114ea9a272ff71a6a755e025d7a075a6a0dbf6da0990fe9211831ccf", size = 53504907 }, + { url = "https://files.pythonhosted.org/packages/cb/04/ef1f7249a813841d193cbab2ef4d1d7d67c66c61d21d45223a72fdc5c88e/hf_xet-1.0.0-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:d5f160550508ed87783d1eca4288602a713d1c45ec517d937acb9d93120f0cab", size = 52410434 }, + { url = "https://files.pythonhosted.org/packages/81/b3/e7abec2619ecd9d1c743adfe79fa69cf84530f530969daf3dc804efef65b/hf_xet-1.0.0-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:5ebd79db87df0b9d3607e7c9a6bb0662c10e36992f522f66b1d2a7fe93f53f27", size = 53465113 }, + { url = "https://files.pythonhosted.org/packages/df/82/b51f3b6e5c6f33e91220c37b17760229704c58e79ab0fcfd0fd3b55803d3/hf_xet-1.0.0-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:8e6d2625971b4affad634835db82d5392f38de874205a9573e0dd3f0f9cb136f", size = 53461632 }, + { url = 
"https://files.pythonhosted.org/packages/95/d2/32defba26d995f7acdc4fe3e5911473b25aff5b75c5a2532786435a709e8/hf_xet-1.0.0-cp37-abi3-win_amd64.whl", hash = "sha256:b446964bd75eb7f6b4d983c47241b2023eadfad1f56137ed00e1ca6fc278faec", size = 4121808 }, +] + [[package]] name = "huggingface-hub" -version = "0.29.1" +version = "0.30.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "filelock" }, @@ -720,9 +735,9 @@ dependencies = [ { name = "tqdm" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/22/37/797d6476f13e5ef6af5fc48a5d641d32b39c37e166ccf40c3714c5854a85/huggingface_hub-0.29.1.tar.gz", hash = "sha256:9524eae42077b8ff4fc459ceb7a514eca1c1232b775276b009709fe2a084f250", size = 389776 } +sdist = { url = "https://files.pythonhosted.org/packages/78/be/049689a7197630e75c4bb53021cb209a56617c9bf39b3a0950650d1f96e1/huggingface_hub-0.30.1.tar.gz", hash = "sha256:f379e8b8d0791295602538856638460ae3cf679c7f304201eb80fb98c771950e", size = 400784 } wheels = [ - { url = "https://files.pythonhosted.org/packages/ae/05/75b90de9093de0aadafc868bb2fa7c57651fd8f45384adf39bd77f63980d/huggingface_hub-0.29.1-py3-none-any.whl", hash = "sha256:352f69caf16566c7b6de84b54a822f6238e17ddd8ae3da4f8f2272aea5b198d5", size = 468049 }, + { url = "https://files.pythonhosted.org/packages/99/e3/2232d0e726d4d6ea69643b9593d97d0e7e6ea69c2fe9ed5de34d476c1c47/huggingface_hub-0.30.1-py3-none-any.whl", hash = "sha256:0f6aa5ec5a4e68e5b9e45d556b4e5ea180c58f5a5ffa734e7f38c9d573028959", size = 481170 }, ] [[package]] @@ -2563,6 +2578,7 @@ dependencies = [ { name = "grpcio-reflection" }, { name = "grpcio-status" }, { name = "hf-transfer" }, + { name = "hf-xet" }, { name = "huggingface-hub" }, { name = "kernels" }, { name = "loguru" }, @@ -2628,7 +2644,8 @@ requires-dist = [ { name = "grpcio-tools", marker = "extra == 'dev'", specifier = ">=1.51.1,<2.0" }, { name = "grpcio-tools", marker = "extra == 'gen'", specifier = ">=1.69.0" }, { name = "hf-transfer", specifier = ">=0.1.8" }, - { name = "huggingface-hub", specifier = ">=0.29.0" }, + { name = "hf-xet", specifier = ">=1.0.0" }, + { name = "huggingface-hub", specifier = ">=0.30.1" }, { name = "kernels", specifier = ">=0.2.1" }, { name = "loguru", specifier = ">=0.7.3" }, { name = "mypy-protobuf", marker = "extra == 'gen'", specifier = ">=3.6.0" }, From d23b385eeeb1e7f2e2ca8449346936a2dd378856 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Sun, 6 Apr 2025 11:36:00 +0200 Subject: [PATCH 167/183] Preparing for release. (#3147) * Preparing for release. * Adding hf-xet dependency. * Merged tgi-nix update. 
--- Cargo.lock | 16 ++++++++-------- Cargo.toml | 2 +- README.md | 6 +++--- docs/openapi.json | 2 +- docs/source/backends/gaudi.mdx | 10 +++++----- docs/source/backends/neuron.md | 2 +- .../source/basic_tutorials/gated_model_access.md | 2 +- docs/source/conceptual/quantization.md | 6 +++--- docs/source/installation_amd.md | 2 +- docs/source/installation_intel.md | 4 ++-- docs/source/installation_nvidia.md | 2 +- docs/source/quicktour.md | 4 ++-- docs/source/reference/api_reference.md | 2 +- flake.lock | 6 +++--- nix/server.nix | 2 ++ 15 files changed, 35 insertions(+), 33 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 6cde83b55..d64634a66 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4650,7 +4650,7 @@ dependencies = [ [[package]] name = "text-generation-backends-trtllm" -version = "3.2.1-dev0" +version = "3.2.2-dev0" dependencies = [ "async-trait", "clap 4.5.32", @@ -4671,7 +4671,7 @@ dependencies = [ [[package]] name = "text-generation-benchmark" -version = "3.2.1-dev0" +version = "3.2.2-dev0" dependencies = [ "average", "clap 4.5.32", @@ -4691,7 +4691,7 @@ dependencies = [ [[package]] name = "text-generation-client" -version = "3.2.1-dev0" +version = "3.2.2-dev0" dependencies = [ "async-trait", "base64 0.22.1", @@ -4709,7 +4709,7 @@ dependencies = [ [[package]] name = "text-generation-launcher" -version = "3.2.1-dev0" +version = "3.2.2-dev0" dependencies = [ "clap 4.5.32", "ctrlc", @@ -4730,7 +4730,7 @@ dependencies = [ [[package]] name = "text-generation-router" -version = "3.2.1-dev0" +version = "3.2.2-dev0" dependencies = [ "anyhow", "async-stream", @@ -4782,7 +4782,7 @@ dependencies = [ [[package]] name = "text-generation-router-llamacpp" -version = "3.2.1-dev0" +version = "3.2.2-dev0" dependencies = [ "async-trait", "bindgen 0.71.1", @@ -4800,7 +4800,7 @@ dependencies = [ [[package]] name = "text-generation-router-v2" -version = "3.2.1-dev0" +version = "3.2.2-dev0" dependencies = [ "async-stream", "async-trait", @@ -4849,7 +4849,7 @@ dependencies = [ [[package]] name = "text-generation-router-v3" -version = "3.2.1-dev0" +version = "3.2.2-dev0" dependencies = [ "async-stream", "async-trait", diff --git a/Cargo.toml b/Cargo.toml index d52adec4e..9b818837c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,7 +21,7 @@ default-members = [ resolver = "2" [workspace.package] -version = "3.2.1-dev0" +version = "3.2.2-dev0" edition = "2021" authors = ["Olivier Dehaene"] homepage = "https://github.com/huggingface/text-generation-inference" diff --git a/README.md b/README.md index 274cd86c6..23d87f9a8 100644 --- a/README.md +++ b/README.md @@ -84,7 +84,7 @@ model=HuggingFaceH4/zephyr-7b-beta volume=$PWD/data docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data \ - ghcr.io/huggingface/text-generation-inference:3.2.1 --model-id $model + ghcr.io/huggingface/text-generation-inference:3.2.2 --model-id $model ``` And then you can make requests like @@ -121,7 +121,7 @@ curl localhost:8080/v1/chat/completions \ **Note:** To use NVIDIA GPUs, you need to install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). We also recommend using NVIDIA drivers with CUDA version 12.2 or higher. For running the Docker container on a machine with no GPUs or CUDA support, it is enough to remove the `--gpus all` flag and add `--disable-custom-kernels`, please note CPU is not the intended platform for this project, so performance might be subpar. -**Note:** TGI supports AMD Instinct MI210 and MI250 GPUs. 
Details can be found in the [Supported Hardware documentation](https://huggingface.co/docs/text-generation-inference/installation_amd#using-tgi-with-amd-gpus). To use AMD GPUs, please use `docker run --device /dev/kfd --device /dev/dri --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.2.1-rocm --model-id $model` instead of the command above. +**Note:** TGI supports AMD Instinct MI210 and MI250 GPUs. Details can be found in the [Supported Hardware documentation](https://huggingface.co/docs/text-generation-inference/installation_amd#using-tgi-with-amd-gpus). To use AMD GPUs, please use `docker run --device /dev/kfd --device /dev/dri --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.2.2-rocm --model-id $model` instead of the command above. To see all options to serve your models (in the [code](https://github.com/huggingface/text-generation-inference/blob/main/launcher/src/main.rs) or in the cli): ``` @@ -152,7 +152,7 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading token= docker run --gpus all --shm-size 1g -e HF_TOKEN=$token -p 8080:80 -v $volume:/data \ - ghcr.io/huggingface/text-generation-inference:3.2.1 --model-id $model + ghcr.io/huggingface/text-generation-inference:3.2.2 --model-id $model ``` ### A note on Shared Memory (shm) diff --git a/docs/openapi.json b/docs/openapi.json index 85ca3f977..55fc7ec62 100644 --- a/docs/openapi.json +++ b/docs/openapi.json @@ -10,7 +10,7 @@ "name": "Apache 2.0", "url": "https://www.apache.org/licenses/LICENSE-2.0" }, - "version": "3.2.1-dev0" + "version": "3.2.2-dev0" }, "paths": { "/": { diff --git a/docs/source/backends/gaudi.mdx b/docs/source/backends/gaudi.mdx index 7e0e69ab1..9c56ed58f 100644 --- a/docs/source/backends/gaudi.mdx +++ b/docs/source/backends/gaudi.mdx @@ -20,7 +20,7 @@ hf_token=YOUR_HF_ACCESS_TOKEN docker run --runtime=habana --cap-add=sys_nice --ipc=host \ -p 8080:80 -v $volume:/data -e HF_TOKEN=$hf_token \ - ghcr.io/huggingface/text-generation-inference:3.2.1-gaudi \ + ghcr.io/huggingface/text-generation-inference:3.2.2-gaudi \ --model-id $model ``` @@ -74,7 +74,7 @@ hf_token=YOUR_ACCESS_TOKEN docker run --runtime=habana --cap-add=sys_nice --ipc=host \ -p 8080:80 -v $volume:/data -e HF_TOKEN=$hf_token \ - ghcr.io/huggingface/text-generation-inference:3.2.1-gaudi \ + ghcr.io/huggingface/text-generation-inference:3.2.2-gaudi \ --model-id $model ``` @@ -137,7 +137,7 @@ docker run -p 8080:80 \ -e BATCH_BUCKET_SIZE=256 \ -e PREFILL_BATCH_BUCKET_SIZE=4 \ -e PAD_SEQUENCE_TO_MULTIPLE_OF=64 \ - ghcr.io/huggingface/text-generation-inference:3.2.1-gaudi \ + ghcr.io/huggingface/text-generation-inference:3.2.2-gaudi \ --model-id $model \ --sharded true --num-shard 8 \ --max-input-tokens 1024 --max-total-tokens 2048 \ @@ -163,7 +163,7 @@ docker run -p 8080:80 \ -v $volume:/data \ -e PREFILL_BATCH_BUCKET_SIZE=1 \ -e BATCH_BUCKET_SIZE=1 \ - ghcr.io/huggingface/text-generation-inference:3.2.1-gaudi \ + ghcr.io/huggingface/text-generation-inference:3.2.2-gaudi \ --model-id $model \ --max-input-tokens 4096 --max-batch-prefill-tokens 16384 \ --max-total-tokens 8192 --max-batch-size 4 @@ -230,7 +230,7 @@ docker run --runtime=habana --ipc=host --cap-add=sys_nice \ -e PROF_PATH=/tmp/hpu_profile \ -e PROF_RANKS=0 \ -e PROF_RECORD_SHAPES=True \ - ghcr.io/huggingface/text-generation-inference:3.2.1-gaudi \ + ghcr.io/huggingface/text-generation-inference:3.2.2-gaudi \ --model-id $model ``` diff --git a/docs/source/backends/neuron.md 
b/docs/source/backends/neuron.md index f3a5dac7a..e528197f2 100644 --- a/docs/source/backends/neuron.md +++ b/docs/source/backends/neuron.md @@ -31,7 +31,7 @@ deployment instructions in the model card: The service is launched simply by running the text-generation-inference container with two sets of parameters: ``` -docker run ghcr.io/huggingface/text-generation-inference:3.2.1-neuron +docker run ghcr.io/huggingface/text-generation-inference:3.2.2-neuron ``` - system parameters are used to map ports, volumes and devices between the host and the service, diff --git a/docs/source/basic_tutorials/gated_model_access.md b/docs/source/basic_tutorials/gated_model_access.md index 29ced4f88..48d3abfe1 100644 --- a/docs/source/basic_tutorials/gated_model_access.md +++ b/docs/source/basic_tutorials/gated_model_access.md @@ -19,6 +19,6 @@ docker run --gpus all \ --shm-size 1g \ -e HF_TOKEN=$token \ -p 8080:80 \ - -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.2.1 \ + -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.2.2 \ --model-id $model ``` diff --git a/docs/source/conceptual/quantization.md b/docs/source/conceptual/quantization.md index 4288b0a22..4790578d8 100644 --- a/docs/source/conceptual/quantization.md +++ b/docs/source/conceptual/quantization.md @@ -19,7 +19,7 @@ bitsandbytes is a library used to apply 8-bit and 4-bit quantization to models. In TGI, you can use 8-bit quantization by adding `--quantize bitsandbytes` like below 👇 ```bash -docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.2.1 --model-id $model --quantize bitsandbytes +docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.2.2 --model-id $model --quantize bitsandbytes ``` 4-bit quantization is also possible with bitsandbytes. You can choose one of the following 4-bit data types: 4-bit float (`fp4`), or 4-bit `NormalFloat` (`nf4`). These data types were introduced in the context of parameter-efficient fine-tuning, but you can apply them for inference by automatically converting the model weights on load. @@ -27,7 +27,7 @@ docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingf In TGI, you can use 4-bit quantization by adding `--quantize bitsandbytes-nf4` or `--quantize bitsandbytes-fp4` like below 👇 ```bash -docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.2.1 --model-id $model --quantize bitsandbytes-nf4 +docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.2.2 --model-id $model --quantize bitsandbytes-nf4 ``` You can get more information about 8-bit quantization by reading this [blog post](https://huggingface.co/blog/hf-bitsandbytes-integration), and 4-bit quantization by reading [this blog post](https://huggingface.co/blog/4bit-transformers-bitsandbytes). @@ -48,7 +48,7 @@ $$({\hat{W}_{l}}^{*} = argmin_{\hat{W_{l}}} ||W_{l}X-\hat{W}_{l}X||^{2}_{2})$$ TGI allows you to both run an already GPTQ quantized model (see available models [here](https://huggingface.co/models?search=gptq)) or quantize a model of your choice using quantization script. 
You can run a quantized model by simply passing --quantize like below 👇 ```bash -docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.2.1 --model-id $model --quantize gptq +docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.2.2 --model-id $model --quantize gptq ``` Note that TGI's GPTQ implementation doesn't use [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) under the hood. However, models quantized using AutoGPTQ or Optimum can still be served by TGI. diff --git a/docs/source/installation_amd.md b/docs/source/installation_amd.md index 4242cffb8..bd2f91486 100644 --- a/docs/source/installation_amd.md +++ b/docs/source/installation_amd.md @@ -11,7 +11,7 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading docker run --rm -it --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \ --device=/dev/kfd --device=/dev/dri --group-add video \ --ipc=host --shm-size 256g --net host -v $volume:/data \ - ghcr.io/huggingface/text-generation-inference:3.2.1-rocm \ + ghcr.io/huggingface/text-generation-inference:3.2.2-rocm \ --model-id $model ``` diff --git a/docs/source/installation_intel.md b/docs/source/installation_intel.md index c0b40e02d..60d2f896c 100644 --- a/docs/source/installation_intel.md +++ b/docs/source/installation_intel.md @@ -12,7 +12,7 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading docker run --rm --privileged --cap-add=sys_nice \ --device=/dev/dri \ --ipc=host --shm-size 1g --net host -v $volume:/data \ - ghcr.io/huggingface/text-generation-inference:3.2.1-intel-xpu \ + ghcr.io/huggingface/text-generation-inference:3.2.2-intel-xpu \ --model-id $model --cuda-graphs 0 ``` @@ -29,7 +29,7 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading docker run --rm --privileged --cap-add=sys_nice \ --device=/dev/dri \ --ipc=host --shm-size 1g --net host -v $volume:/data \ - ghcr.io/huggingface/text-generation-inference:3.2.1-intel-cpu \ + ghcr.io/huggingface/text-generation-inference:3.2.2-intel-cpu \ --model-id $model --cuda-graphs 0 ``` diff --git a/docs/source/installation_nvidia.md b/docs/source/installation_nvidia.md index 31fba9d65..0ed0da323 100644 --- a/docs/source/installation_nvidia.md +++ b/docs/source/installation_nvidia.md @@ -11,7 +11,7 @@ model=teknium/OpenHermes-2.5-Mistral-7B volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run docker run --gpus all --shm-size 64g -p 8080:80 -v $volume:/data \ - ghcr.io/huggingface/text-generation-inference:3.2.1 \ + ghcr.io/huggingface/text-generation-inference:3.2.2 \ --model-id $model ``` diff --git a/docs/source/quicktour.md b/docs/source/quicktour.md index 9ed60efdf..2ce9e686a 100644 --- a/docs/source/quicktour.md +++ b/docs/source/quicktour.md @@ -11,7 +11,7 @@ model=teknium/OpenHermes-2.5-Mistral-7B volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data \ - ghcr.io/huggingface/text-generation-inference:3.2.1 \ + ghcr.io/huggingface/text-generation-inference:3.2.2 \ --model-id $model ``` @@ -96,7 +96,7 @@ curl 127.0.0.1:8080/generate \ To see all possible deploy flags and options, you can use the `--help` flag. It's possible to configure the number of shards, quantization, generation parameters, and more. 
```bash -docker run ghcr.io/huggingface/text-generation-inference:3.2.1 --help +docker run ghcr.io/huggingface/text-generation-inference:3.2.2 --help ``` diff --git a/docs/source/reference/api_reference.md b/docs/source/reference/api_reference.md index e563a9b1a..7dc6768e9 100644 --- a/docs/source/reference/api_reference.md +++ b/docs/source/reference/api_reference.md @@ -163,7 +163,7 @@ hub = { # create Hugging Face Model Class huggingface_model = HuggingFaceModel( - image_uri=get_huggingface_llm_image_uri("huggingface",version="3.2.1"), + image_uri=get_huggingface_llm_image_uri("huggingface",version="3.2.2"), env=hub, role=role, ) diff --git a/flake.lock b/flake.lock index d3c5490bb..77593fd18 100644 --- a/flake.lock +++ b/flake.lock @@ -978,11 +978,11 @@ "nixpkgs": "nixpkgs_6" }, "locked": { - "lastModified": 1742807335, - "narHash": "sha256-580CXhhCcK1cXRWahk/mDKo6zUsOD2JvNOg5SBBMAKc=", + "lastModified": 1743931123, + "narHash": "sha256-MDQrbJkweLYsMYh44Gx+c1gAZOCR1fmZF1lkavAHDto=", "owner": "huggingface", "repo": "text-generation-inference-nix", - "rev": "8d9ea4691f49369aa5d3230a51366ff6c744d658", + "rev": "1ad3feaadfdedca90278ee7676bca15019519189", "type": "github" }, "original": { diff --git a/nix/server.nix b/nix/server.nix index 1d00b9780..e6493531e 100644 --- a/nix/server.nix +++ b/nix/server.nix @@ -17,6 +17,7 @@ grpcio-status, grpcio-tools, hf-transfer, + hf-xet, kernels, loguru, mamba-ssm, @@ -92,6 +93,7 @@ buildPythonPackage { grpcio-status grpcio-tools hf-transfer + hf-xet kernels loguru mamba-ssm From 9c26b52940f620e1228fced55b3892c43c9289fa Mon Sep 17 00:00:00 2001 From: Mohit Sharma Date: Mon, 7 Apr 2025 16:25:11 +0530 Subject: [PATCH 168/183] Use ROCM 6.3.1 (#3141) * update dockerfile * add updated makefile * fix docker * Lint. --------- Co-authored-by: Nicolas Patry --- Dockerfile_amd | 379 ++++++++++++++++++------------------------- server/Makefile-vllm | 5 +- 2 files changed, 163 insertions(+), 221 deletions(-) diff --git a/Dockerfile_amd b/Dockerfile_amd index d03ae596d..e3e9efda8 100644 --- a/Dockerfile_amd +++ b/Dockerfile_amd @@ -41,303 +41,244 @@ COPY backends backends COPY launcher launcher RUN cargo build --profile release-opt --frozen -# Text Generation Inference base image for RoCm -FROM rocm/dev-ubuntu-22.04:6.2 AS base +FROM rocm/dev-ubuntu-22.04:6.3.1-complete AS base +ARG HIPBLASLT_BRANCH="4d40e36" +ARG HIPBLAS_COMMON_BRANCH="7c1566b" +ARG LEGACY_HIPBLASLT_OPTION= +ARG RCCL_BRANCH="648a58d" +ARG RCCL_REPO="https://github.com/ROCm/rccl" +ARG TRITON_BRANCH="e5be006" +ARG TRITON_REPO="https://github.com/triton-lang/triton.git" +ARG PYTORCH_BRANCH="3a585126" +ARG PYTORCH_VISION_BRANCH="v0.19.1" +ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git" +ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git" +ARG FA_BRANCH="b7d29fb" +ARG FA_REPO="https://github.com/ROCm/flash-attention.git" +ARG AITER_BRANCH="21d47a9" +ARG AITER_REPO="https://github.com/ROCm/aiter.git" + +ENV PATH=/opt/rocm/llvm/bin:$PATH +ENV ROCM_PATH=/opt/rocm +ENV LD_LIBRARY_PATH=/opt/rocm/lib:/usr/local/lib: +ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942 +ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} + +ARG PYTHON_VERSION=3.11 + +RUN mkdir -p /app +WORKDIR /app +ENV DEBIAN_FRONTEND=noninteractive + +# Install Python and other dependencies RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ - build-essential \ - ca-certificates \ - ccache \ - curl \ - git \ - make \ - libmsgpack-dev \ - libssl-dev \ - llvm-dev \ - g++ \ - # Needed to build VLLM & 
flash. - rocthrust-dev \ - hipsparse-dev \ - hipblas-dev \ - hipcub-dev \ - rocblas-dev \ - hiprand-dev \ - hipfft-dev \ - rocrand-dev \ - miopen-hip-dev \ - hipsolver-dev \ - rccl-dev \ - cmake \ - python3.11-venv && \ - rm -rf /var/lib/apt/lists/* + build-essential \ + ca-certificates \ + ccache \ + curl \ + git \ + ninja-build \ + cmake \ + software-properties-common \ + python3.11-dev \ + python3.11-venv && \ + rm -rf /var/lib/apt/lists/* -# Keep in sync with `server/pyproject.toml -ARG MAMBA_VERSION=23.1.0-1 -ARG PYTHON_VERSION='3.11.10' -# Automatically set by buildx -ARG TARGETPLATFORM -ENV PATH=/opt/conda/bin:$PATH +COPY --from=ghcr.io/astral-sh/uv:0.5.31 /uv /uvx /bin/ +ENV PATH="$PATH:/root/.local/bin" +RUN uv python install ${PYTHON_VERSION} +RUN uv venv --python ${PYTHON_VERSION} && uv pip install pip setuptools packaging +ENV VIRTUAL_ENV=/usr/src/.venv/ +ENV PATH="$PATH:/usr/src/.venv/bin/" -ARG PYTORCH_ROCM_ARCH="gfx90a;gfx942" +RUN . .venv/bin/activate && pip install -U packaging cmake ninja wheel setuptools pybind11 Cython -# TGI seem to require libssl.so.1.1 instead of libssl.so.3 so we can't use ubuntu 22.04. Ubuntu 20.04 has python==3.8, and TGI requires python>=3.9, hence the need for miniconda. -# Install mamba -# translating Docker's TARGETPLATFORM into mamba arches -RUN case ${TARGETPLATFORM} in \ - "linux/arm64") MAMBA_ARCH=aarch64 ;; \ - *) MAMBA_ARCH=x86_64 ;; \ - esac && \ - curl -fsSL -v -o ~/mambaforge.sh -O "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh" -RUN chmod +x ~/mambaforge.sh && \ - bash ~/mambaforge.sh -b -p /opt/conda && \ - mamba init && \ - rm ~/mambaforge.sh - -# RUN conda install intel::mkl-static intel::mkl-include -# Install pytorch -# On arm64 we exit with an error code -RUN case ${TARGETPLATFORM} in \ - "linux/arm64") exit 1 ;; \ - *) /opt/conda/bin/conda update -y conda && \ - /opt/conda/bin/conda install -y "python=${PYTHON_VERSION}" ;; \ - esac && \ - /opt/conda/bin/conda clean -ya - -# Install flash-attention, torch dependencies -RUN python3 -m pip install --upgrade pip uv && pip install numpy einops ninja joblib msgpack cmake --no-cache-dir && rm -rf /var/lib/apt/lists/* - -RUN conda install mkl=2021 -ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib/:/opt/conda/lib/python3.11/site-packages/torch/lib:/opt/conda/lib/ - - -ARG COMMON_WORKDIR=/ -WORKDIR ${COMMON_WORKDIR} - - -# Install HIPBLASLt FROM base AS build_hipblaslt -ARG HIPBLASLT_BRANCH="e6da924" -RUN git clone https://github.com/ROCm/hipBLASLt.git \ - && cd hipBLASLt \ +ARG HIPBLASLT_BRANCH +ARG HIPBLAS_COMMON_BRANCH +# Set to "--legacy_hipblas_direct" for ROCm<=6.2 +ARG LEGACY_HIPBLASLT_OPTION +RUN git clone https://github.com/ROCm/hipBLAS-common.git +RUN . .venv/bin/activate && cd hipBLAS-common \ + && git checkout ${HIPBLAS_COMMON_BRANCH} \ + && mkdir build \ + && cd build \ + && cmake .. \ + && make package \ + && dpkg -i ./*.deb +RUN git clone https://github.com/ROCm/hipBLASLt +RUN . 
.venv/bin/activate && cd hipBLASLt \ && git checkout ${HIPBLASLT_BRANCH} \ - && SCCACHE_IDLE_TIMEOUT=1800 ./install.sh --architecture ${PYTORCH_ROCM_ARCH} --legacy_hipblas_direct \ + && ./install.sh -d --architecture ${PYTORCH_ROCM_ARCH} ${LEGACY_HIPBLASLT_OPTION} \ && cd build/release \ && make package +RUN mkdir -p /app/install && cp /app/hipBLASLt/build/release/*.deb /app/hipBLAS-common/build/*.deb /app/install -FROM scratch AS export_hipblaslt -ARG COMMON_WORKDIR -COPY --from=build_hipblaslt ${COMMON_WORKDIR}/hipBLASLt/build/release/*.deb / - -# RCCL build stages FROM base AS build_rccl -ARG RCCL_BRANCH="rocm-6.2.0" -RUN git clone https://github.com/ROCm/rccl \ - && cd rccl \ +ARG RCCL_BRANCH +ARG RCCL_REPO +RUN git clone ${RCCL_REPO} +RUN . .venv/bin/activate && cd rccl \ && git checkout ${RCCL_BRANCH} \ && ./install.sh -p --amdgpu_targets ${PYTORCH_ROCM_ARCH} -FROM scratch AS export_rccl -ARG COMMON_WORKDIR -COPY --from=build_rccl ${COMMON_WORKDIR}/rccl/build/release/*.deb / +RUN mkdir -p /app/install && cp /app/rccl/build/release/*.deb /app/install -# Triton build stages FROM base AS build_triton -ARG TRITON_BRANCH="e192dba" -ARG TRITON_REPO="https://github.com/triton-lang/triton.git" -RUN python3 -m pip install ninja cmake wheel pybind11 && git clone ${TRITON_REPO} \ - && cd triton \ +ARG TRITON_BRANCH +ARG TRITON_REPO +RUN git clone ${TRITON_REPO} +RUN . .venv/bin/activate && cd triton \ && git checkout ${TRITON_BRANCH} \ && cd python \ && python3 setup.py bdist_wheel --dist-dir=dist -FROM scratch AS export_triton -ARG COMMON_WORKDIR -COPY --from=build_triton ${COMMON_WORKDIR}/triton/python/dist/*.whl / +RUN mkdir -p /app/install && cp /app/triton/python/dist/*.whl /app/install -# # AMD-SMI build stages FROM base AS build_amdsmi -RUN cd /opt/rocm/share/amd_smi \ +RUN . .venv/bin/activate && cd /opt/rocm/share/amd_smi \ && pip wheel . --wheel-dir=dist -FROM scratch AS export_amdsmi -COPY --from=build_amdsmi /opt/rocm/share/amd_smi/dist/*.whl / +RUN mkdir -p /app/install && cp /opt/rocm/share/amd_smi/dist/*.whl /app/install +FROM base AS build_pytorch +ARG PYTORCH_BRANCH +ARG PYTORCH_VISION_BRANCH +ARG PYTORCH_REPO +ARG PYTORCH_VISION_REPO +ARG FA_BRANCH +ARG FA_REPO +RUN git clone ${PYTORCH_REPO} pytorch +RUN . .venv/bin/activate && cd pytorch && git checkout ${PYTORCH_BRANCH} && \ + pip install -r requirements.txt && git submodule update --init --recursive \ + && python3 tools/amd_build/build_amd.py \ + && CMAKE_PREFIX_PATH=$(python3 -c 'import sys; print(sys.prefix)') python3 setup.py bdist_wheel --dist-dir=dist \ + && pip install dist/*.whl +RUN git clone ${PYTORCH_VISION_REPO} vision +RUN . .venv/bin/activate && cd vision && git checkout ${PYTORCH_VISION_BRANCH} \ + && python3 setup.py bdist_wheel --dist-dir=dist \ + && pip install dist/*.whl +RUN git clone ${FA_REPO} +RUN . 
.venv/bin/activate && cd flash-attention \ + && git checkout ${FA_BRANCH} \ + && git submodule update --init \ + && MAX_JOBS=64 GPU_ARCHS=${PYTORCH_ROCM_ARCH} python3 setup.py bdist_wheel --dist-dir=dist +RUN mkdir -p /app/install && cp /app/pytorch/dist/*.whl /app/install \ + && cp /app/vision/dist/*.whl /app/install \ + && cp /app/flash-attention/dist/*.whl /app/install -FROM base as build_pytorch +FROM base AS final +RUN --mount=type=bind,from=build_hipblaslt,src=/app/install/,target=/install \ + dpkg -i /install/*deb \ + && sed -i 's/, hipblaslt-dev \(.*\), hipcub-dev/, hipcub-dev/g' /var/lib/dpkg/status \ + && sed -i 's/, hipblaslt \(.*\), hipfft/, hipfft/g' /var/lib/dpkg/status +RUN --mount=type=bind,from=build_rccl,src=/app/install/,target=/install \ + dpkg -i /install/*deb \ + && sed -i 's/, rccl-dev \(.*\), rocalution/, rocalution/g' /var/lib/dpkg/status \ + && sed -i 's/, rccl \(.*\), rocalution/, rocalution/g' /var/lib/dpkg/status +RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \ + . .venv/bin/activate && \ + pip install /install/*.whl +RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \ + . .venv/bin/activate && \ + pip install /install/*.whl +RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \ + . .venv/bin/activate && \ + pip install /install/*.whl -RUN --mount=type=bind,from=export_hipblaslt,src=/,target=/install \ - if ls /install/*.deb; then \ - dpkg -i /install/*.deb \ - && sed -i 's/, hipblaslt-dev \(.*\), hipcub-dev/, hipcub-dev/g' /var/lib/dpkg/status \ - && sed -i 's/, hipblaslt \(.*\), hipfft/, hipfft/g' /var/lib/dpkg/status; \ - fi +ARG AITER_REPO +ARG AITER_BRANCH +RUN git clone --recursive ${AITER_REPO} +RUN . .venv/bin/activate && cd aiter \ + && git checkout ${AITER_BRANCH} \ + && git submodule update --init --recursive \ + && pip install -r requirements.txt \ + && PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py develop && pip show aiter -ARG BUILD_ENVIRONMENT=pytorch-linux-jammy-rocm6.2-py3.11 -ARG PYTORCH_ROCM_ARCH="gfx90a;gfx942" - -# A commit to fix the output scaling factor issue in _scaled_mm -# Not yet in 2.5.0-rc1 -ARG PYTORCH_BRANCH="cedc116" -ARG PYTORCH_VISION_BRANCH="v0.19.1" -ARG PYTORCH_REPO="https://github.com/ROCm/pytorch.git" - -RUN git clone ${PYTORCH_REPO} pytorch \ - && cd pytorch && git checkout ${PYTORCH_BRANCH} && git submodule update --init --recursive \ - && pip install -r requirements.txt --no-cache-dir \ - && python tools/amd_build/build_amd.py \ - && CMAKE_PREFIX_PATH=$(python3 -c 'import sys; print(sys.prefix)') python3 setup.py bdist_wheel --dist-dir=dist -FROM scratch as export_pytorch -ARG COMMON_WORKDIR -COPY --from=build_pytorch ${COMMON_WORKDIR}/pytorch/dist/*.whl / - -FROM base AS install_deps - -ARG COMMON_WORKDIR - -# Install hipblaslt -RUN --mount=type=bind,from=export_hipblaslt,src=/,target=/install \ - if ls /install/*.deb; then \ - dpkg -i /install/*.deb \ - && sed -i 's/, hipblaslt-dev \(.*\), hipcub-dev/, hipcub-dev/g' /var/lib/dpkg/status \ - && sed -i 's/, hipblaslt \(.*\), hipfft/, hipfft/g' /var/lib/dpkg/status; \ - fi - -RUN --mount=type=bind,from=export_rccl,src=/,target=/install \ - if ls /install/*.deb; then \ - dpkg -i /install/*.deb \ - # RCCL needs to be installed twice - && dpkg -i /install/*.deb \ - && sed -i 's/, rccl-dev \(.*\), rocalution/, rocalution/g' /var/lib/dpkg/status \ - && sed -i 's/, rccl \(.*\), rocalution/, rocalution/g' /var/lib/dpkg/status; \ - fi - -RUN 
--mount=type=bind,from=export_triton,src=/,target=/install \ - if ls /install/*.whl; then \ - # Preemptively uninstall to prevent pip same-version no-installs - pip uninstall -y triton \ - && pip install /install/*.whl; \ - fi - -RUN --mount=type=bind,from=export_amdsmi,src=/,target=/install \ - # Preemptively uninstall to prevent pip same-version no-installs - pip uninstall -y amdsmi \ - && pip install /install/*.whl; - -RUN --mount=type=bind,from=export_pytorch,src=/,target=/install \ - if ls /install/*.whl; then \ - # Preemptively uninstall to prevent pip same-version no-installs - pip uninstall -y torch torchvision \ - && pip install /install/*.whl; \ - fi - -FROM install_deps AS kernel-builder +RUN rm -rf /var/lib/apt/lists/* +FROM final AS kernel-builder # # Build vllm kernels FROM kernel-builder AS vllm-builder -WORKDIR /usr/src COPY server/Makefile-vllm Makefile -RUN pip install setuptools_scm +RUN . .venv/bin/activate && pip install setuptools_scm # Build specific version of vllm -RUN make build-vllm-rocm - -# Build Flash Attention v2 kernels -FROM kernel-builder AS flash-att-v2-builder -WORKDIR /usr/src - -COPY server/Makefile-flash-att-v2 Makefile - -# Build specific version of flash attention v2 -RUN make build-flash-attention-v2-rocm +RUN . .venv/bin/activate && make build-vllm-rocm # Build Transformers CUDA kernels (gpt-neox and bloom) FROM kernel-builder AS custom-kernels-builder -WORKDIR /usr/src COPY server/custom_kernels/ . -RUN python setup.py build +RUN . .venv/bin/activate && python3 setup.py bdist_wheel --dist-dir=dist # Build exllama kernels FROM kernel-builder AS exllama-kernels-builder -WORKDIR /usr/src COPY server/exllama_kernels/ . - -RUN python setup.py build +RUN . .venv/bin/activate && python3 setup.py bdist_wheel --dist-dir=dist # Build exllama v2 kernels FROM kernel-builder AS exllamav2-kernels-builder -WORKDIR /usr/src COPY server/exllamav2_kernels/ . - -RUN python setup.py build +RUN . .venv/bin/activate && python3 setup.py bdist_wheel --dist-dir=dist FROM kernel-builder AS marlin-kernels -WORKDIR /usr/src ENV MARLIN_KERNELS_BRANCH=v0.3.6 ENV VLLM_TARGET_DEVICE=rocm -RUN git clone https://github.com/danieldk/marlin-kernels.git && \ +RUN . .venv/bin/activate && git clone https://github.com/danieldk/marlin-kernels.git && \ cd marlin-kernels && \ git checkout ${MARLIN_KERNELS_BRANCH} && \ - python setup.py install + python3 setup.py bdist_wheel --dist-dir=dist FROM kernel-builder AS moe-kernels -WORKDIR /usr/src ENV MOE_KERNELS_BRANCH=v0.8.2 ENV VLLM_TARGET_DEVICE=rocm -RUN git clone https://github.com/danieldk/moe-kernels.git && \ +RUN . 
.venv/bin/activate && git clone https://github.com/danieldk/moe-kernels.git && \ cd moe-kernels && \ git checkout ${MOE_KERNELS_BRANCH} && \ - python setup.py install + python3 setup.py bdist_wheel --dist-dir=dist -FROM install_deps AS base-copy +FROM final AS base-copy # Text Generation Inference base env ENV HF_HOME=/data \ HF_HUB_ENABLE_HF_TRANSFER=1 \ PORT=80 -# Copy builds artifacts from vllm builder -COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages - -# Copy build artifacts from flash attention v2 builder -COPY --from=flash-att-v2-builder /usr/src/flash-attention-v2/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages - -# Copy build artifacts from custom kernels builder -COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages - -# Copy build artifacts from exllama kernels builder -COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages - -# Copy build artifacts from exllamav2 kernels builder -COPY --from=exllamav2-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages - -# Copy build artifacts from marlin kernels -COPY --from=marlin-kernels /usr/src/marlin-kernels/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages - -# Copy build artifacts from moe kernels -COPY --from=moe-kernels /usr/src/moe-kernels/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages +ENV VIRTUAL_ENV=/app/.venv/ +ENV PATH="$PATH:/app/.venv/bin/" # Install server COPY proto proto COPY server server COPY server/Makefile server/Makefile -ENV UV_SYSTEM_PYTHON=1 RUN cd server && \ - pip install -U pip uv && \ - uv sync --frozen --extra gen --extra accelerate --extra compressed-tensors --extra quantize --extra peft --extra outlines --no-install-project && \ - . ./.venv/bin/activate && \ + uv pip install grpcio-tools mypy-protobuf && \ + uv pip install -e ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir && \ make gen-server-raw - RUN cd server && \ - uv sync --frozen --extra gen --extra accelerate --extra compressed-tensors --extra quantize --extra peft --extra outlines && \ - . 
./.venv/bin/activate && \ pwd && \ text-generation-server --help +RUN --mount=type=bind,from=vllm-builder,src=/app/vllm/dist,target=/install \ + uv pip install /install/*.whl +RUN --mount=type=bind,from=custom-kernels-builder,src=/app/dist,target=/install \ + uv pip install /install/*.whl +RUN --mount=type=bind,from=custom-kernels-builder,src=/app/dist,target=/install \ + uv pip install /install/*.whl +RUN --mount=type=bind,from=exllama-kernels-builder,src=/app/dist,target=/install \ + uv pip install /install/*.whl +RUN --mount=type=bind,from=exllamav2-kernels-builder,src=/app/dist,target=/install \ + uv pip install /install/*.whl +RUN --mount=type=bind,from=marlin-kernels,src=/app/marlin-kernels/dist,target=/install \ + uv pip install /install/*.whl +RUN --mount=type=bind,from=moe-kernels,src=/app/moe-kernels/dist,target=/install \ + uv pip install /install/*.whl + # Install benchmarker COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark # Install router COPY --from=builder /usr/src/target/release-opt/text-generation-router /usr/local/bin/text-generation-router # Install launcher COPY --from=builder /usr/src/target/release-opt/text-generation-launcher /usr/local/bin/text-generation-launcher -ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/conda/lib/" # AWS Sagemaker compatible image FROM base AS sagemaker @@ -368,4 +309,6 @@ COPY ./tgi-entrypoint.sh /tgi-entrypoint.sh RUN chmod +x /tgi-entrypoint.sh ENTRYPOINT ["/tgi-entrypoint.sh"] -CMD ["--json-output"] +ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/root/.local/share/uv/python/cpython-3.11.11-linux-x86_64-gnu/lib" +ENV PYTHONPATH=/app/.venv/lib/python3.11/site-packages +# CMD ["--json-output"] diff --git a/server/Makefile-vllm b/server/Makefile-vllm index 90da96d26..6936b1361 100644 --- a/server/Makefile-vllm +++ b/server/Makefile-vllm @@ -6,8 +6,7 @@ build-vllm-rocm: git clone https://github.com/mht-sharma/vllm.git vllm; \ fi cd vllm && git fetch && git checkout $(commit_rocm) && \ - PYTORCH_ROCM_ARCH="gfx90a;gfx942" python setup.py build + PYTORCH_ROCM_ARCH="gfx90a;gfx942" python3 setup.py bdist_wheel --dist-dir=dist install-vllm-rocm: build-vllm-rocm - cd vllm && git fetch && git checkout $(commit_rocm) && \ - PYTORCH_ROCM_ARCH="gfx90a;gfx942" pip install -e . + cd vllm && git fetch && git checkout $(commit_rocm) From 87a0af4ec2758e6e30572fcf8ff4277054ba311d Mon Sep 17 00:00:00 2001 From: Mohit Sharma Date: Mon, 7 Apr 2025 16:25:43 +0530 Subject: [PATCH 169/183] Update transformers to 4.51 (#3148) * update transformres * Upgrading the nix deps too. * Forcing torchvision to be in there. * Fixing bug in mllama. * Those tests cannot be run in CI. * Lint. 
--------- Co-authored-by: Pedro Cuenca Co-authored-by: Nicolas Patry --- Dockerfile | 12 +- flake.lock | 6 +- .../models/test_transformers_llama4.py | 310 +++++++++--------- nix/overlay.nix | 14 +- router/src/validation.rs | 4 +- server/pyproject.toml | 19 +- .../models/mllama_causal_lm.py | 6 - server/uv.lock | 232 +++++++++---- 8 files changed, 366 insertions(+), 237 deletions(-) diff --git a/Dockerfile b/Dockerfile index 820468c94..03840b971 100644 --- a/Dockerfile +++ b/Dockerfile @@ -165,8 +165,9 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-ins git \ && rm -rf /var/lib/apt/lists/* -RUN curl -LsSf https://astral.sh/uv/install.sh | sh -ENV PATH="$PATH:/root/.local/bin" +# RUN curl -LsSf https://astral.sh/uv/install.sh | sh +# ENV PATH="$PATH:/root/.local/bin" +COPY --from=ghcr.io/astral-sh/uv:0.5.31 /uv /uvx /bin/ # Install flash-attention dependencies # RUN pip install einops --no-cache-dir @@ -183,19 +184,16 @@ COPY server server COPY server/Makefile server/Makefile ENV HF_KERNELS_CACHE=/kernels RUN cd server && \ - uv sync --frozen --extra gen --extra bnb --extra accelerate --extra compressed-tensors --extra quantize --extra peft --extra outlines --no-install-project --active && \ + uv sync --frozen --extra gen --extra bnb --extra accelerate --extra compressed-tensors --extra quantize --extra peft --extra outlines --extra torch --no-install-project --active && \ make gen-server-raw && \ kernels download . RUN cd server && \ - uv sync --frozen --extra gen --extra bnb --extra accelerate --extra compressed-tensors --extra quantize --extra peft --extra outlines --active --python=${PYTHON_VERSION} && \ + uv sync --frozen --extra gen --extra bnb --extra accelerate --extra compressed-tensors --extra quantize --extra peft --extra outlines --extra torch --active --python=${PYTHON_VERSION} && \ uv pip install nvidia-nccl-cu12==2.25.1 && \ pwd && \ text-generation-server --help -# This shouldn't be necessary. 
-# RUN uv pip install torchvision --no-deps - # Copy build artifacts from flash attention builder COPY --from=flash-att-builder /usr/src/flash-attention/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages COPY --from=flash-att-builder /usr/src/flash-attention/csrc/layer_norm/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages diff --git a/flake.lock b/flake.lock index 77593fd18..6e1875100 100644 --- a/flake.lock +++ b/flake.lock @@ -853,11 +853,11 @@ ] }, "locked": { - "lastModified": 1742783666, - "narHash": "sha256-IwdSl51NL6V0f+mYXZR0UTKaGleOsk9zV3l6kt5SUWw=", + "lastModified": 1743993291, + "narHash": "sha256-u8GHvduU1gCtoFXvTS/wGjH1ouv5S/GRGq6MAT+sG/k=", "owner": "oxalica", "repo": "rust-overlay", - "rev": "60766d63c227d576510ecfb5edd3a687d56f6bc7", + "rev": "0cb3c8979c65dc6a5812dfe67499a8c7b8b4325b", "type": "github" }, "original": { diff --git a/integration-tests/models/test_transformers_llama4.py b/integration-tests/models/test_transformers_llama4.py index a20d32847..029a9fb16 100644 --- a/integration-tests/models/test_transformers_llama4.py +++ b/integration-tests/models/test_transformers_llama4.py @@ -1,155 +1,155 @@ -import base64 -from io import BytesIO -from PIL import Image - -import pytest - - -@pytest.fixture(scope="module") -def flash_llama4_handle(launcher): - with launcher("ll-re/Llama-4-Scout-17B-16E-Instruct", num_shard=8) as handle: - yield handle - - -@pytest.fixture(scope="module") -async def flash_llama4(flash_llama4_handle): - await flash_llama4_handle.health(300) - return flash_llama4_handle.client - - -async def test_flash_llama4(flash_llama4, response_snapshot): - response = await flash_llama4.generate( - "Hello I am doing a project on the 1918 flu pandemic and I am trying to find out how many", - seed=42, - max_new_tokens=100, - ) - - assert ( - response.generated_text - == " people died in the 1918 flu pandemic. Estimating the death toll of the 1918 flu pandemic is difficult because of incomplete records and because of the fact that many of the extra deaths were not attributed to the flu. Many experts believe that the 1918 flu pandemic killed between 50 and 100 million people. Iassistant\n\nThe 1918 flu pandemic, also known as the Spanish flu, is indeed one of the most devastating public health crises in human history. Estimating the exact" - ) - assert response.details.generated_tokens == 100 - assert response == response_snapshot - - -async def test_flash_llama4_image_cow_dog(flash_llama4, response_snapshot): - image_url = "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/cow_beach_1.png" - response = await flash_llama4.chat( - seed=42, - messages=[ - { - "role": "user", - "content": [ - {"type": "image_url", "image_url": {"url": image_url}}, - { - "type": "text", - "text": "What is the breed of the dog in the image?", - }, - ], - }, - ], - max_tokens=100, - ) - - assert ( - response.choices[0].message.content - == "The image does not depict a dog; it shows a cow standing on a beach. Therefore, there is no breed of a dog to identify." 
- ) - assert response.usage["completion_tokens"] == 30 - assert response == response_snapshot - - -async def test_flash_llama4_image_cow(flash_llama4, response_snapshot): - image_url = "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/cow_beach_1.png" - response = await flash_llama4.chat( - seed=42, - messages=[ - { - "role": "user", - "content": [ - {"type": "image_url", "image_url": {"url": image_url}}, - {"type": "text", "text": "What is shown in this image?"}, - ], - }, - ], - max_tokens=100, - ) - assert ( - response.choices[0].message.content - == "The image shows a brown cow standing on the beach with a white face and black and white marking on its ears. The cow has a white patch around its nose and mouth. The ocean and blue sky are in the background." - ) - assert response.usage["completion_tokens"] == 46 - assert response == response_snapshot - - -# Helper function to convert a Pillow image to a base64 data URL -def image_to_data_url(img: Image.Image, fmt: str) -> str: - buffer = BytesIO() - img.save(buffer, format=fmt) - img_data = buffer.getvalue() - b64_str = base64.b64encode(img_data).decode("utf-8") - mime_type = "image/png" if fmt.upper() == "PNG" else "image/jpeg" - return f"data:{mime_type};base64,{b64_str}" - - -async def test_flash_llama4_image_base64_rgba(flash_llama4, response_snapshot): - # Create an empty 100x100 PNG image with alpha (transparent background) - img = Image.new("RGBA", (100, 100), (0, 0, 0, 0)) - data_url = image_to_data_url(img, "PNG") - response = await flash_llama4.chat( - seed=42, - messages=[ - { - "role": "user", - "content": [ - {"type": "image_url", "image_url": {"url": data_url}}, - { - "type": "text", - "text": "What do you see in this transparent image?", - }, - ], - }, - ], - max_tokens=100, - ) - assert response == response_snapshot - - -async def test_flash_llama4_image_base64_rgb_png(flash_llama4, response_snapshot): - # Create an empty 100x100 PNG image without alpha (white background) - img = Image.new("RGB", (100, 100), (255, 255, 255)) - data_url = image_to_data_url(img, "PNG") - response = await flash_llama4.chat( - seed=42, - messages=[ - { - "role": "user", - "content": [ - {"type": "image_url", "image_url": {"url": data_url}}, - {"type": "text", "text": "What do you see in this plain image?"}, - ], - }, - ], - max_tokens=100, - ) - assert response == response_snapshot - - -async def test_flash_llama4_image_base64_rgb_jpg(flash_llama4, response_snapshot): - # Create an empty 100x100 JPEG image (white background) - img = Image.new("RGB", (100, 100), (255, 255, 255)) - data_url = image_to_data_url(img, "JPEG") - response = await flash_llama4.chat( - seed=42, - messages=[ - { - "role": "user", - "content": [ - {"type": "image_url", "image_url": {"url": data_url}}, - {"type": "text", "text": "What do you see in this JPEG image?"}, - ], - }, - ], - max_tokens=100, - ) - assert response == response_snapshot +# import base64 +# from io import BytesIO +# from PIL import Image +# +# import pytest +# +# +# @pytest.fixture(scope="module") +# def flash_llama4_handle(launcher): +# with launcher("ll-re/Llama-4-Scout-17B-16E-Instruct", num_shard=8) as handle: +# yield handle +# +# +# @pytest.fixture(scope="module") +# async def flash_llama4(flash_llama4_handle): +# await flash_llama4_handle.health(300) +# return flash_llama4_handle.client +# +# +# async def test_flash_llama4(flash_llama4, response_snapshot): +# response = await flash_llama4.generate( +# "Hello I am doing a project on the 1918 flu pandemic and 
I am trying to find out how many", +# seed=42, +# max_new_tokens=100, +# ) +# +# assert ( +# response.generated_text +# == " people died in the 1918 flu pandemic. Estimating the death toll of the 1918 flu pandemic is difficult because of incomplete records and because of the fact that many of the extra deaths were not attributed to the flu. Many experts believe that the 1918 flu pandemic killed between 50 and 100 million people. Iassistant\n\nThe 1918 flu pandemic, also known as the Spanish flu, is indeed one of the most devastating public health crises in human history. Estimating the exact" +# ) +# assert response.details.generated_tokens == 100 +# assert response == response_snapshot +# +# +# async def test_flash_llama4_image_cow_dog(flash_llama4, response_snapshot): +# image_url = "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/cow_beach_1.png" +# response = await flash_llama4.chat( +# seed=42, +# messages=[ +# { +# "role": "user", +# "content": [ +# {"type": "image_url", "image_url": {"url": image_url}}, +# { +# "type": "text", +# "text": "What is the breed of the dog in the image?", +# }, +# ], +# }, +# ], +# max_tokens=100, +# ) +# +# assert ( +# response.choices[0].message.content +# == "The image does not depict a dog; it shows a cow standing on a beach. Therefore, there is no breed of a dog to identify." +# ) +# assert response.usage["completion_tokens"] == 30 +# assert response == response_snapshot +# +# +# async def test_flash_llama4_image_cow(flash_llama4, response_snapshot): +# image_url = "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/cow_beach_1.png" +# response = await flash_llama4.chat( +# seed=42, +# messages=[ +# { +# "role": "user", +# "content": [ +# {"type": "image_url", "image_url": {"url": image_url}}, +# {"type": "text", "text": "What is shown in this image?"}, +# ], +# }, +# ], +# max_tokens=100, +# ) +# assert ( +# response.choices[0].message.content +# == "The image shows a brown cow standing on the beach with a white face and black and white marking on its ears. The cow has a white patch around its nose and mouth. The ocean and blue sky are in the background." 
+# ) +# assert response.usage["completion_tokens"] == 46 +# assert response == response_snapshot +# +# +# # Helper function to convert a Pillow image to a base64 data URL +# def image_to_data_url(img: Image.Image, fmt: str) -> str: +# buffer = BytesIO() +# img.save(buffer, format=fmt) +# img_data = buffer.getvalue() +# b64_str = base64.b64encode(img_data).decode("utf-8") +# mime_type = "image/png" if fmt.upper() == "PNG" else "image/jpeg" +# return f"data:{mime_type};base64,{b64_str}" +# +# +# async def test_flash_llama4_image_base64_rgba(flash_llama4, response_snapshot): +# # Create an empty 100x100 PNG image with alpha (transparent background) +# img = Image.new("RGBA", (100, 100), (0, 0, 0, 0)) +# data_url = image_to_data_url(img, "PNG") +# response = await flash_llama4.chat( +# seed=42, +# messages=[ +# { +# "role": "user", +# "content": [ +# {"type": "image_url", "image_url": {"url": data_url}}, +# { +# "type": "text", +# "text": "What do you see in this transparent image?", +# }, +# ], +# }, +# ], +# max_tokens=100, +# ) +# assert response == response_snapshot +# +# +# async def test_flash_llama4_image_base64_rgb_png(flash_llama4, response_snapshot): +# # Create an empty 100x100 PNG image without alpha (white background) +# img = Image.new("RGB", (100, 100), (255, 255, 255)) +# data_url = image_to_data_url(img, "PNG") +# response = await flash_llama4.chat( +# seed=42, +# messages=[ +# { +# "role": "user", +# "content": [ +# {"type": "image_url", "image_url": {"url": data_url}}, +# {"type": "text", "text": "What do you see in this plain image?"}, +# ], +# }, +# ], +# max_tokens=100, +# ) +# assert response == response_snapshot +# +# +# async def test_flash_llama4_image_base64_rgb_jpg(flash_llama4, response_snapshot): +# # Create an empty 100x100 JPEG image (white background) +# img = Image.new("RGB", (100, 100), (255, 255, 255)) +# data_url = image_to_data_url(img, "JPEG") +# response = await flash_llama4.chat( +# seed=42, +# messages=[ +# { +# "role": "user", +# "content": [ +# {"type": "image_url", "image_url": {"url": data_url}}, +# {"type": "text", "text": "What do you see in this JPEG image?"}, +# ], +# }, +# ], +# max_tokens=100, +# ) +# assert response == response_snapshot diff --git a/nix/overlay.nix b/nix/overlay.nix index 7274d903c..069fdd80d 100644 --- a/nix/overlay.nix +++ b/nix/overlay.nix @@ -18,8 +18,18 @@ final: prev: { src = final.fetchFromGitHub { owner = "huggingface"; repo = "transformers"; - rev = "v4.50.0"; - hash = "sha256-/scrMPUY43n+XAMbwWCtmiJKXscXGLrklyDg9XZTaqw="; + rev = "v4.51.0"; + hash = "sha256-dnVpc6fm1SYGcx7FegpwVVxUY6XRlsxLs5WOxYv11y8="; + }; + } + ); + huggingface-hub = python-super.huggingface-hub.overrideAttrs ( + _: _: { + src = final.fetchFromGitHub { + owner = "huggingface"; + repo = "huggingface_hub"; + rev = "v0.30.0"; + hash = "sha256-sz+n1uoWrSQPqJFiG/qCT6b4r08kD9MsoPZXbfWNB2o="; }; } ); diff --git a/router/src/validation.rs b/router/src/validation.rs index 2d1d9a3d8..dfe9dd4d2 100644 --- a/router/src/validation.rs +++ b/router/src/validation.rs @@ -566,7 +566,7 @@ fn fetch_image(input: &str) -> Result<(Vec, String, usize, usize), Validatio return Err(ValidationError::InvalidImageContent(content.to_string())); } - let data = STANDARD.decode(content["base64,".len()..].as_bytes())?; + let data = STANDARD.decode(&content["base64,".len()..])?; let img = if let Some(format) = format_from_mimetype(mimetype) { ImageReader::with_format(Cursor::new(&data), format).decode()? 
} else { @@ -603,7 +603,7 @@ fn image_tokens( let mut image_string = String::with_capacity(2 * FAKE.len() + slots * IMAGE.len()); image_string.push_str(FAKE); - image_string.extend(iter::repeat(IMAGE).take(slots)); + image_string.extend(iter::repeat_n(IMAGE, slots)); image_string.push_str(FAKE); if matches!( diff --git a/server/pyproject.toml b/server/pyproject.toml index 86cb49223..53347f521 100644 --- a/server/pyproject.toml +++ b/server/pyproject.toml @@ -31,11 +31,24 @@ dependencies = [ "sentencepiece>=0.2.0", "tokenizers>=0.20.3", "typer>=0.15.1", - "transformers>=4.49.0", + "transformers>=4.51.0", "huggingface-hub>=0.30.1", "hf-xet>=1.0.0", ] +[[tool.uv.index]] +name = "pytorch-cu124" +url = "https://download.pytorch.org/whl/cu124" +explicit = true + +[tool.uv.sources] +torch = [ + { index = "pytorch-cu124", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, +] +torchvision = [ + { index = "pytorch-cu124", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, +] + [build-system] requires = ["kernels>=0.1.7", "setuptools"] build-backend = "setuptools.build_meta" @@ -78,6 +91,10 @@ gen = [ "grpcio-tools>=1.69.0", "mypy-protobuf>=3.6.0", ] +torch = [ + "torch==2.6.0", + "torchvision==0.21.0", +] [tool.pytest.ini_options] markers = ["private: marks tests as requiring an admin hf token (deselect with '-m \"not private\"')"] diff --git a/server/text_generation_server/models/mllama_causal_lm.py b/server/text_generation_server/models/mllama_causal_lm.py index 28e7489ea..c268ff9a8 100644 --- a/server/text_generation_server/models/mllama_causal_lm.py +++ b/server/text_generation_server/models/mllama_causal_lm.py @@ -256,12 +256,6 @@ class MllamaCausalLM(VlmCausalLM): max_s = batch.max_current_length lm_head_indices = batch.prefill_head_indices - if cu_seqlen_prefill is None and self.max_past() is not None: - # In decode, not prefill, we're actually overwriting the KV-cache - # in a circular buffer mode. - # This makes sure the max_s for the decode pass is correct. 
- max_s = min(self.max_past(), max_s) - # Try to find an associated cuda graph bs = input_ids.shape[0] sorted_padded_bs = sorted([k for k in self.cuda_graphs.keys() if k >= bs]) diff --git a/server/uv.lock b/server/uv.lock index c3f3f0895..b4a95c133 100644 --- a/server/uv.lock +++ b/server/uv.lock @@ -2,10 +2,14 @@ version = 1 revision = 1 requires-python = ">=3.9" resolution-markers = [ - "python_full_version >= '3.12'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", - "python_full_version < '3.10'", + "(python_full_version >= '3.12' and sys_platform == 'linux') or (python_full_version >= '3.12' and sys_platform == 'win32')", + "python_full_version >= '3.12' and sys_platform != 'linux' and sys_platform != 'win32'", + "(python_full_version == '3.11.*' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform == 'win32')", + "python_full_version == '3.11.*' and sys_platform != 'linux' and sys_platform != 'win32'", + "(python_full_version == '3.10.*' and sys_platform == 'linux') or (python_full_version == '3.10.*' and sys_platform == 'win32')", + "python_full_version == '3.10.*' and sys_platform != 'linux' and sys_platform != 'win32'", + "(python_full_version < '3.10' and sys_platform == 'linux') or (python_full_version < '3.10' and sys_platform == 'win32')", + "python_full_version < '3.10' and sys_platform != 'linux' and sys_platform != 'win32'", ] [[package]] @@ -20,7 +24,8 @@ dependencies = [ { name = "psutil" }, { name = "pyyaml" }, { name = "safetensors" }, - { name = "torch" }, + { name = "torch", version = "2.6.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform != 'linux' and sys_platform != 'win32'" }, + { name = "torch", version = "2.6.0+cu124", source = { registry = "https://download.pytorch.org/whl/cu124" }, marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/85/15/0fab0260ab4069e5224e637d2e400538bb27b0dfc36f17daf68db9770d78/accelerate-1.3.0.tar.gz", hash = "sha256:518631c0adb80bd3d42fb29e7e2dc2256bcd7c786b0ba9119bbaa08611b36d9c", size = 342758 } wheels = [ @@ -189,7 +194,8 @@ source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, { name = "numpy", version = "2.2.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, - { name = "torch" }, + { name = "torch", version = "2.6.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform != 'linux' and sys_platform != 'win32'" }, + { name = "torch", version = "2.6.0+cu124", source = { registry = "https://download.pytorch.org/whl/cu124" }, marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/db/9d/9382259196d7ad7f3550702390081224e673a705e75b5660ee377b592fc0/bitsandbytes-0.45.2-py3-none-manylinux_2_24_x86_64.whl", hash = "sha256:ba3a720187f518b172ebce4081049c682ae3fd8284947e22499b256ff99a2bc3", size = 69680042 }, @@ -315,7 +321,8 @@ version = "0.9.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "pydantic" }, - { name = "torch" }, + { name = "torch", version = "2.6.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform != 'linux' and sys_platform != 'win32'" }, + { name = "torch", version = "2.6.0+cu124", source = { registry = 
"https://download.pytorch.org/whl/cu124" }, marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "transformers" }, ] sdist = { url = "https://files.pythonhosted.org/packages/40/e0/d9529aae2d2425d214e5a50497df4532d3f9e21c8d2023037c701f8a37d3/compressed-tensors-0.9.1.tar.gz", hash = "sha256:3cf5cd637f0186c184dd5bbbbf941356b1225199b49c6a45bf0909d65907f686", size = 63060 } @@ -826,7 +833,8 @@ dependencies = [ { name = "huggingface-hub" }, { name = "packaging" }, { name = "tomli", marker = "python_full_version < '3.11'" }, - { name = "torch" }, + { name = "torch", version = "2.6.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform != 'linux' and sys_platform != 'win32'" }, + { name = "torch", version = "2.6.0+cu124", source = { registry = "https://download.pytorch.org/whl/cu124" }, marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/26/99/41af9dce502bb1682977fee1bc487a73fa8418cebbce16b8d27733947375/kernels-0.2.1.tar.gz", hash = "sha256:918942332819b28377b9d07070daddecfd8a5e7bab574dd3dc64a209ca6008b2", size = 9395 } @@ -1084,7 +1092,8 @@ name = "networkx" version = "3.2.1" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version < '3.10'", + "(python_full_version < '3.10' and sys_platform == 'linux') or (python_full_version < '3.10' and sys_platform == 'win32')", + "python_full_version < '3.10' and sys_platform != 'linux' and sys_platform != 'win32'", ] sdist = { url = "https://files.pythonhosted.org/packages/c4/80/a84676339aaae2f1cfdf9f418701dd634aef9cc76f708ef55c36ff39c3ca/networkx-3.2.1.tar.gz", hash = "sha256:9f1bb5cf3409bf324e0a722c20bdb4c20ee39bf1c30ce8ae499c8502b0b5e0c6", size = 2073928 } wheels = [ @@ -1096,9 +1105,12 @@ name = "networkx" version = "3.4.2" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version >= '3.12'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", + "(python_full_version >= '3.12' and sys_platform == 'linux') or (python_full_version >= '3.12' and sys_platform == 'win32')", + "python_full_version >= '3.12' and sys_platform != 'linux' and sys_platform != 'win32'", + "(python_full_version == '3.11.*' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform == 'win32')", + "python_full_version == '3.11.*' and sys_platform != 'linux' and sys_platform != 'win32'", + "(python_full_version == '3.10.*' and sys_platform == 'linux') or (python_full_version == '3.10.*' and sys_platform == 'win32')", + "python_full_version == '3.10.*' and sys_platform != 'linux' and sys_platform != 'win32'", ] sdist = { url = "https://files.pythonhosted.org/packages/fd/1d/06475e1cd5264c0b870ea2cc6fdb3e37177c1e565c43f56ff17a10e3937f/networkx-3.4.2.tar.gz", hash = "sha256:307c3669428c5362aab27c8a1260aa8f47c4e91d3891f48be0141738d8d053e1", size = 2151368 } wheels = [ @@ -1110,7 +1122,8 @@ name = "numpy" version = "2.0.2" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version < '3.10'", + "(python_full_version < '3.10' and sys_platform == 'linux') or (python_full_version < '3.10' and sys_platform == 'win32')", + "python_full_version < '3.10' and sys_platform != 'linux' and sys_platform != 'win32'", ] sdist = { url = "https://files.pythonhosted.org/packages/a9/75/10dd1f8116a8b796cb2c737b674e02d02e80454bda953fa7e65d8c12b016/numpy-2.0.2.tar.gz", hash = 
"sha256:883c987dee1880e2a864ab0dc9892292582510604156762362d9326444636e78", size = 18902015 } wheels = [ @@ -1165,9 +1178,12 @@ name = "numpy" version = "2.2.2" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version >= '3.12'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", + "(python_full_version >= '3.12' and sys_platform == 'linux') or (python_full_version >= '3.12' and sys_platform == 'win32')", + "python_full_version >= '3.12' and sys_platform != 'linux' and sys_platform != 'win32'", + "(python_full_version == '3.11.*' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform == 'win32')", + "python_full_version == '3.11.*' and sys_platform != 'linux' and sys_platform != 'win32'", + "(python_full_version == '3.10.*' and sys_platform == 'linux') or (python_full_version == '3.10.*' and sys_platform == 'win32')", + "python_full_version == '3.10.*' and sys_platform != 'linux' and sys_platform != 'win32'", ] sdist = { url = "https://files.pythonhosted.org/packages/ec/d0/c12ddfd3a02274be06ffc71f3efc6d0e457b0409c4481596881e748cb264/numpy-2.2.2.tar.gz", hash = "sha256:ed6906f61834d687738d25988ae117683705636936cc605be0bb208b23df4d8f", size = 20233295 } wheels = [ @@ -1264,7 +1280,7 @@ name = "nvidia-cudnn-cu12" version = "9.1.0.70" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-cublas-cu12" }, + { name = "nvidia-cublas-cu12", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/9f/fd/713452cd72343f682b1c7b9321e23829f00b842ceaedcda96e742ea0b0b3/nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl", hash = "sha256:165764f44ef8c61fcdfdfdbe769d687e06374059fbb388b6c89ecb0e28793a6f", size = 664752741 }, @@ -1275,7 +1291,7 @@ name = "nvidia-cufft-cu12" version = "11.2.1.3" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-nvjitlink-cu12" }, + { name = "nvidia-nvjitlink-cu12", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/27/94/3266821f65b92b3138631e9c8e7fe1fb513804ac934485a8d05776e1dd43/nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl", hash = "sha256:f083fc24912aa410be21fa16d157fed2055dab1cc4b6934a0e03cba69eb242b9", size = 211459117 }, @@ -1294,9 +1310,9 @@ name = "nvidia-cusolver-cu12" version = "11.6.1.9" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-cublas-cu12" }, - { name = "nvidia-cusparse-cu12" }, - { name = "nvidia-nvjitlink-cu12" }, + { name = "nvidia-cublas-cu12", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "nvidia-cusparse-cu12", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "nvidia-nvjitlink-cu12", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/3a/e1/5b9089a4b2a4790dfdea8b3a006052cfecff58139d5a4e34cb1a51df8d6f/nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl", hash = "sha256:19e33fa442bcfd085b3086c4ebf7e8debc07cfe01e11513cc6d332fd918ac260", size = 127936057 }, @@ -1307,7 +1323,7 @@ name = "nvidia-cusparse-cu12" version = "12.3.1.170" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-nvjitlink-cu12" }, + { name = "nvidia-nvjitlink-cu12", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, ] wheels = [ { url = 
"https://files.pythonhosted.org/packages/db/f7/97a9ea26ed4bbbfc2d470994b8b4f338ef663be97b8f677519ac195e113d/nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl", hash = "sha256:ea4f11a2904e2a8dc4b1833cc1b5181cde564edd0d5cd33e3c168eff2d1863f1", size = 207454763 }, @@ -1509,7 +1525,8 @@ dependencies = [ { name = "pydantic" }, { name = "referencing" }, { name = "requests" }, - { name = "torch" }, + { name = "torch", version = "2.6.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform != 'linux' and sys_platform != 'win32'" }, + { name = "torch", version = "2.6.0+cu124", source = { registry = "https://download.pytorch.org/whl/cu124" }, marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "tqdm" }, { name = "typing-extensions" }, ] @@ -1632,7 +1649,8 @@ dependencies = [ { name = "psutil" }, { name = "pyyaml" }, { name = "safetensors" }, - { name = "torch" }, + { name = "torch", version = "2.6.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform != 'linux' and sys_platform != 'win32'" }, + { name = "torch", version = "2.6.0+cu124", source = { registry = "https://download.pytorch.org/whl/cu124" }, marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "tqdm" }, { name = "transformers" }, ] @@ -2400,7 +2418,8 @@ name = "scipy" version = "1.13.1" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version < '3.10'", + "(python_full_version < '3.10' and sys_platform == 'linux') or (python_full_version < '3.10' and sys_platform == 'win32')", + "python_full_version < '3.10' and sys_platform != 'linux' and sys_platform != 'win32'", ] dependencies = [ { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, @@ -2438,9 +2457,12 @@ name = "scipy" version = "1.15.1" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version >= '3.12'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", + "(python_full_version >= '3.12' and sys_platform == 'linux') or (python_full_version >= '3.12' and sys_platform == 'win32')", + "python_full_version >= '3.12' and sys_platform != 'linux' and sys_platform != 'win32'", + "(python_full_version == '3.11.*' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform == 'win32')", + "python_full_version == '3.11.*' and sys_platform != 'linux' and sys_platform != 'win32'", + "(python_full_version == '3.10.*' and sys_platform == 'linux') or (python_full_version == '3.10.*' and sys_platform == 'win32')", + "python_full_version == '3.10.*' and sys_platform != 'linux' and sys_platform != 'win32'", ] dependencies = [ { name = "numpy", version = "2.2.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, @@ -2629,6 +2651,12 @@ quantize = [ { name = "datasets" }, { name = "texttable" }, ] +torch = [ + { name = "torch", version = "2.6.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform != 'linux' and sys_platform != 'win32'" }, + { name = "torch", version = "2.6.0+cu124", source = { registry = "https://download.pytorch.org/whl/cu124" }, marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "torchvision", version = "0.21.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform != 'linux' and sys_platform != 'win32'" }, + { name = "torchvision", version = "0.21.0+cu124", source = { 
registry = "https://download.pytorch.org/whl/cu124" }, marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, +] [package.metadata] requires-dist = [ @@ -2666,10 +2694,14 @@ requires-dist = [ { name = "sentencepiece", specifier = ">=0.2.0" }, { name = "texttable", marker = "extra == 'quantize'", specifier = ">=1.6.7,<2" }, { name = "tokenizers", specifier = ">=0.20.3" }, - { name = "transformers", specifier = ">=4.49.0" }, + { name = "torch", marker = "(sys_platform == 'linux' and extra == 'torch') or (sys_platform == 'win32' and extra == 'torch')", specifier = "==2.6.0", index = "https://download.pytorch.org/whl/cu124" }, + { name = "torch", marker = "sys_platform != 'linux' and sys_platform != 'win32' and extra == 'torch'", specifier = "==2.6.0" }, + { name = "torchvision", marker = "(sys_platform == 'linux' and extra == 'torch') or (sys_platform == 'win32' and extra == 'torch')", specifier = "==0.21.0", index = "https://download.pytorch.org/whl/cu124" }, + { name = "torchvision", marker = "sys_platform != 'linux' and sys_platform != 'win32' and extra == 'torch'", specifier = "==0.21.0" }, + { name = "transformers", specifier = ">=4.51.0" }, { name = "typer", specifier = ">=0.15.1" }, ] -provides-extras = ["accelerate", "bnb", "compressed-tensors", "peft", "outlines", "dev", "quantize", "gen"] +provides-extras = ["accelerate", "bnb", "compressed-tensors", "peft", "outlines", "dev", "quantize", "gen", "torch"] [[package]] name = "texttable" @@ -2748,12 +2780,46 @@ wheels = [ name = "torch" version = "2.6.0" source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version >= '3.12' and sys_platform != 'linux' and sys_platform != 'win32'", + "python_full_version == '3.11.*' and sys_platform != 'linux' and sys_platform != 'win32'", + "python_full_version == '3.10.*' and sys_platform != 'linux' and sys_platform != 'win32'", + "python_full_version < '3.10' and sys_platform != 'linux' and sys_platform != 'win32'", +] dependencies = [ - { name = "filelock" }, - { name = "fsspec" }, - { name = "jinja2" }, - { name = "networkx", version = "3.2.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "networkx", version = "3.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "filelock", marker = "sys_platform != 'linux' and sys_platform != 'win32'" }, + { name = "fsspec", marker = "sys_platform != 'linux' and sys_platform != 'win32'" }, + { name = "jinja2", marker = "sys_platform != 'linux' and sys_platform != 'win32'" }, + { name = "networkx", version = "3.2.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10' and sys_platform != 'linux' and sys_platform != 'win32'" }, + { name = "networkx", version = "3.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10' and sys_platform != 'linux' and sys_platform != 'win32'" }, + { name = "setuptools", marker = "python_full_version >= '3.12' and sys_platform != 'linux' and sys_platform != 'win32'" }, + { name = "sympy", marker = "sys_platform != 'linux' and sys_platform != 'win32'" }, + { name = "typing-extensions", marker = "sys_platform != 'linux' and sys_platform != 'win32'" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/e5/16/ea1b7842413a7b8a5aaa5e99e8eaf3da3183cc3ab345ad025a07ff636301/torch-2.6.0-cp310-none-macosx_11_0_arm64.whl", hash = 
"sha256:09e06f9949e1a0518c5b09fe95295bc9661f219d9ecb6f9893e5123e10696628", size = 66520221 }, + { url = "https://files.pythonhosted.org/packages/0b/fa/f33a4148c6fb46ca2a3f8de39c24d473822d5774d652b66ed9b1214da5f7/torch-2.6.0-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:94fc63b3b4bedd327af588696559f68c264440e2503cc9e6954019473d74ae21", size = 66530713 }, + { url = "https://files.pythonhosted.org/packages/81/b4/605ae4173aa37fb5aa14605d100ff31f4f5d49f617928c9f486bb3aaec08/torch-2.6.0-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:9a610afe216a85a8b9bc9f8365ed561535c93e804c2a317ef7fabcc5deda0989", size = 66532538 }, + { url = "https://files.pythonhosted.org/packages/88/8b/d60c0491ab63634763be1537ad488694d316ddc4a20eaadd639cedc53971/torch-2.6.0-cp313-none-macosx_11_0_arm64.whl", hash = "sha256:ff96f4038f8af9f7ec4231710ed4549da1bdebad95923953a25045dcf6fd87e2", size = 66536783 }, + { url = "https://files.pythonhosted.org/packages/b3/17/41f681b87290a1d2f1394f943e470f8b0b3c2987b7df8dc078d8831fce5b/torch-2.6.0-cp39-none-macosx_11_0_arm64.whl", hash = "sha256:265f70de5fd45b864d924b64be1797f86e76c8e48a02c2a3a6fc7ec247d2226c", size = 66520446 }, +] + +[[package]] +name = "torch" +version = "2.6.0+cu124" +source = { registry = "https://download.pytorch.org/whl/cu124" } +resolution-markers = [ + "(python_full_version >= '3.12' and sys_platform == 'linux') or (python_full_version >= '3.12' and sys_platform == 'win32')", + "(python_full_version == '3.11.*' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform == 'win32')", + "(python_full_version == '3.10.*' and sys_platform == 'linux') or (python_full_version == '3.10.*' and sys_platform == 'win32')", + "(python_full_version < '3.10' and sys_platform == 'linux') or (python_full_version < '3.10' and sys_platform == 'win32')", +] +dependencies = [ + { name = "filelock", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "fsspec", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "jinja2", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "networkx", version = "3.2.1", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.10' and sys_platform == 'linux') or (python_full_version < '3.10' and sys_platform == 'win32')" }, + { name = "networkx", version = "3.4.2", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.10' and sys_platform == 'linux') or (python_full_version >= '3.10' and sys_platform == 'win32')" }, { name = "nvidia-cublas-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, { name = "nvidia-cuda-cupti-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, { name = "nvidia-cuda-nvrtc-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, @@ -2767,32 +2833,76 @@ dependencies = [ { name = "nvidia-nccl-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, { name = "nvidia-nvjitlink-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, { name = "nvidia-nvtx-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "setuptools", marker = "python_full_version >= '3.12'" }, - { name = "sympy" }, + { name = "setuptools", marker = "(python_full_version >= '3.12' and sys_platform == 'linux') or (python_full_version >= '3.12' and sys_platform == 'win32')" }, + { name = "sympy", marker = "sys_platform == 'linux' or sys_platform == 
'win32'" }, { name = "triton", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "typing-extensions" }, + { name = "typing-extensions", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, ] wheels = [ - { url = "https://files.pythonhosted.org/packages/37/81/aa9ab58ec10264c1abe62c8b73f5086c3c558885d6beecebf699f0dbeaeb/torch-2.6.0-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:6860df13d9911ac158f4c44031609700e1eba07916fff62e21e6ffa0a9e01961", size = 766685561 }, - { url = "https://files.pythonhosted.org/packages/86/86/e661e229df2f5bfc6eab4c97deb1286d598bbeff31ab0cdb99b3c0d53c6f/torch-2.6.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:c4f103a49830ce4c7561ef4434cc7926e5a5fe4e5eb100c19ab36ea1e2b634ab", size = 95751887 }, - { url = "https://files.pythonhosted.org/packages/20/e0/5cb2f8493571f0a5a7273cd7078f191ac252a402b5fb9cb6091f14879109/torch-2.6.0-cp310-cp310-win_amd64.whl", hash = "sha256:56eeaf2ecac90da5d9e35f7f35eb286da82673ec3c582e310a8d1631a1c02341", size = 204165139 }, - { url = "https://files.pythonhosted.org/packages/e5/16/ea1b7842413a7b8a5aaa5e99e8eaf3da3183cc3ab345ad025a07ff636301/torch-2.6.0-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:09e06f9949e1a0518c5b09fe95295bc9661f219d9ecb6f9893e5123e10696628", size = 66520221 }, - { url = "https://files.pythonhosted.org/packages/78/a9/97cbbc97002fff0de394a2da2cdfa859481fdca36996d7bd845d50aa9d8d/torch-2.6.0-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:7979834102cd5b7a43cc64e87f2f3b14bd0e1458f06e9f88ffa386d07c7446e1", size = 766715424 }, - { url = "https://files.pythonhosted.org/packages/6d/fa/134ce8f8a7ea07f09588c9cc2cea0d69249efab977707cf67669431dcf5c/torch-2.6.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:ccbd0320411fe1a3b3fec7b4d3185aa7d0c52adac94480ab024b5c8f74a0bf1d", size = 95759416 }, - { url = "https://files.pythonhosted.org/packages/11/c5/2370d96b31eb1841c3a0883a492c15278a6718ccad61bb6a649c80d1d9eb/torch-2.6.0-cp311-cp311-win_amd64.whl", hash = "sha256:46763dcb051180ce1ed23d1891d9b1598e07d051ce4c9d14307029809c4d64f7", size = 204164970 }, - { url = "https://files.pythonhosted.org/packages/0b/fa/f33a4148c6fb46ca2a3f8de39c24d473822d5774d652b66ed9b1214da5f7/torch-2.6.0-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:94fc63b3b4bedd327af588696559f68c264440e2503cc9e6954019473d74ae21", size = 66530713 }, - { url = "https://files.pythonhosted.org/packages/e5/35/0c52d708144c2deb595cd22819a609f78fdd699b95ff6f0ebcd456e3c7c1/torch-2.6.0-cp312-cp312-manylinux1_x86_64.whl", hash = "sha256:2bb8987f3bb1ef2675897034402373ddfc8f5ef0e156e2d8cfc47cacafdda4a9", size = 766624563 }, - { url = "https://files.pythonhosted.org/packages/01/d6/455ab3fbb2c61c71c8842753b566012e1ed111e7a4c82e0e1c20d0c76b62/torch-2.6.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:b789069020c5588c70d5c2158ac0aa23fd24a028f34a8b4fcb8fcb4d7efcf5fb", size = 95607867 }, - { url = "https://files.pythonhosted.org/packages/18/cf/ae99bd066571656185be0d88ee70abc58467b76f2f7c8bfeb48735a71fe6/torch-2.6.0-cp312-cp312-win_amd64.whl", hash = "sha256:7e1448426d0ba3620408218b50aa6ada88aeae34f7a239ba5431f6c8774b1239", size = 204120469 }, - { url = "https://files.pythonhosted.org/packages/81/b4/605ae4173aa37fb5aa14605d100ff31f4f5d49f617928c9f486bb3aaec08/torch-2.6.0-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:9a610afe216a85a8b9bc9f8365ed561535c93e804c2a317ef7fabcc5deda0989", size = 66532538 }, - { url = 
"https://files.pythonhosted.org/packages/24/85/ead1349fc30fe5a32cadd947c91bda4a62fbfd7f8c34ee61f6398d38fb48/torch-2.6.0-cp313-cp313-manylinux1_x86_64.whl", hash = "sha256:4874a73507a300a5d089ceaff616a569e7bb7c613c56f37f63ec3ffac65259cf", size = 766626191 }, - { url = "https://files.pythonhosted.org/packages/dd/b0/26f06f9428b250d856f6d512413e9e800b78625f63801cbba13957432036/torch-2.6.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:a0d5e1b9874c1a6c25556840ab8920569a7a4137afa8a63a32cee0bc7d89bd4b", size = 95611439 }, - { url = "https://files.pythonhosted.org/packages/c2/9c/fc5224e9770c83faed3a087112d73147cd7c7bfb7557dcf9ad87e1dda163/torch-2.6.0-cp313-cp313-win_amd64.whl", hash = "sha256:510c73251bee9ba02ae1cb6c9d4ee0907b3ce6020e62784e2d7598e0cfa4d6cc", size = 204126475 }, - { url = "https://files.pythonhosted.org/packages/88/8b/d60c0491ab63634763be1537ad488694d316ddc4a20eaadd639cedc53971/torch-2.6.0-cp313-none-macosx_11_0_arm64.whl", hash = "sha256:ff96f4038f8af9f7ec4231710ed4549da1bdebad95923953a25045dcf6fd87e2", size = 66536783 }, - { url = "https://files.pythonhosted.org/packages/40/bb/feb5644baa621fd8e1e88bf51f6fa38ab3f985d472a764144ff4867ac1d6/torch-2.6.0-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:9ea955317cfcd3852b1402b62af258ce735c2edeee42ca9419b6bc889e5ae053", size = 766680961 }, - { url = "https://files.pythonhosted.org/packages/ee/11/08774a8198a33263947c59e04b8a0bf85a61a44e82100c46cf833bbce35e/torch-2.6.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:bb2c6c3e65049f081940f5ab15c9136c7de40d3f01192541c920a07c7c585b7e", size = 95782656 }, - { url = "https://files.pythonhosted.org/packages/c1/0d/56fb07032accbfebb4555638b6002ec5678d0942da85497e40f9405ab756/torch-2.6.0-cp39-cp39-win_amd64.whl", hash = "sha256:683410f97984103148e31b38a8631acf31c3034c020c0f4d26171e7626d8317a", size = 204061417 }, - { url = "https://files.pythonhosted.org/packages/b3/17/41f681b87290a1d2f1394f943e470f8b0b3c2987b7df8dc078d8831fce5b/torch-2.6.0-cp39-none-macosx_11_0_arm64.whl", hash = "sha256:265f70de5fd45b864d924b64be1797f86e76c8e48a02c2a3a6fc7ec247d2226c", size = 66520446 }, + { url = "https://download.pytorch.org/whl/cu124/torch-2.6.0%2Bcu124-cp310-cp310-linux_x86_64.whl", hash = "sha256:7f2ba7f7c0459320a521696f6b5bccc187f59890b23c9dfb6c49b0b87c6bfc97" }, + { url = "https://download.pytorch.org/whl/cu124/torch-2.6.0%2Bcu124-cp310-cp310-win_amd64.whl", hash = "sha256:7cc45c5b39d74875cfafe908b7f55c544147cc16b01e795feb2fe766583efe78" }, + { url = "https://download.pytorch.org/whl/cu124/torch-2.6.0%2Bcu124-cp311-cp311-linux_x86_64.whl", hash = "sha256:d4c3e9a8d31a7c0fcbb9da17c31a1917e1fac26c566a4cfbd8c9568ad7cade79" }, + { url = "https://download.pytorch.org/whl/cu124/torch-2.6.0%2Bcu124-cp311-cp311-win_amd64.whl", hash = "sha256:6a1fb2714e9323f11edb6e8abf7aad5f79e45ad25c081cde87681a18d99c29eb" }, + { url = "https://download.pytorch.org/whl/cu124/torch-2.6.0%2Bcu124-cp312-cp312-linux_x86_64.whl", hash = "sha256:a393b506844035c0dac2f30ea8478c343b8e95a429f06f3b3cadfc7f53adb597" }, + { url = "https://download.pytorch.org/whl/cu124/torch-2.6.0%2Bcu124-cp312-cp312-win_amd64.whl", hash = "sha256:3313061c1fec4c7310cf47944e84513dcd27b6173b72a349bb7ca68d0ee6e9c0" }, + { url = "https://download.pytorch.org/whl/cu124/torch-2.6.0%2Bcu124-cp313-cp313-linux_x86_64.whl", hash = "sha256:0f3bc53c988ce9568cd876a2a5316761e84a8704135ec8068f5f81b4417979cb" }, + { url = "https://download.pytorch.org/whl/cu124/torch-2.6.0%2Bcu124-cp313-cp313-win_amd64.whl", hash = 
"sha256:519330eef09534acad8110b6f423d2fe58c1d8e9ada999ed077a637a0021f908" }, + { url = "https://download.pytorch.org/whl/cu124/torch-2.6.0%2Bcu124-cp313-cp313t-linux_x86_64.whl", hash = "sha256:35cba404c0d742406cdcba1609085874bc60facdfbc50e910c47a92405fef44c" }, + { url = "https://download.pytorch.org/whl/cu124/torch-2.6.0%2Bcu124-cp39-cp39-linux_x86_64.whl", hash = "sha256:e661267cd0242462ab100bdd67f651988aa9f67eb31609d6909afcac891df612" }, + { url = "https://download.pytorch.org/whl/cu124/torch-2.6.0%2Bcu124-cp39-cp39-win_amd64.whl", hash = "sha256:c2eb62b99161d87be486c88fd82441274cc892bce8c48dbc28c055cb147732ce" }, +] + +[[package]] +name = "torchvision" +version = "0.21.0" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version >= '3.12' and sys_platform != 'linux' and sys_platform != 'win32'", + "python_full_version == '3.11.*' and sys_platform != 'linux' and sys_platform != 'win32'", + "python_full_version == '3.10.*' and sys_platform != 'linux' and sys_platform != 'win32'", + "python_full_version < '3.10' and sys_platform != 'linux' and sys_platform != 'win32'", +] +dependencies = [ + { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10' and sys_platform != 'linux' and sys_platform != 'win32'" }, + { name = "numpy", version = "2.2.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10' and sys_platform != 'linux' and sys_platform != 'win32'" }, + { name = "pillow", marker = "sys_platform != 'linux' and sys_platform != 'win32'" }, + { name = "torch", version = "2.6.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform != 'linux' and sys_platform != 'win32'" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/8e/0d/143bd264876fad17c82096b6c2d433f1ac9b29cdc69ee45023096976ee3d/torchvision-0.21.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:044ea420b8c6c3162a234cada8e2025b9076fa82504758cd11ec5d0f8cd9fa37", size = 1784140 }, + { url = "https://files.pythonhosted.org/packages/29/88/00c69db213ee2443ada8886ec60789b227e06bb869d85ee324578221a7f7/torchvision-0.21.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:110d115333524d60e9e474d53c7d20f096dbd8a080232f88dddb90566f90064c", size = 1784141 }, + { url = "https://files.pythonhosted.org/packages/6e/1b/28f527b22d5e8800184d0bc847f801ae92c7573a8c15979d92b7091c0751/torchvision-0.21.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:97a5814a93c793aaf0179cfc7f916024f4b63218929aee977b645633d074a49f", size = 1784140 }, + { url = "https://files.pythonhosted.org/packages/f9/56/47d456b61c3bbce7bed4af3925c83d405bb87468e659fd3cf3d9840c3b51/torchvision-0.21.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:659b76c86757cb2ee4ca2db245e0740cfc3081fef46f0f1064d11adb4a8cee31", size = 1784141 }, + { url = "https://files.pythonhosted.org/packages/49/d5/d18c5d89cbe32015b033f1fa06918c7cdd5c0af0c03e55d72a3cc2d768f8/torchvision-0.21.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:5c22caeaae8b3c36d93459f1a5294e6f43306cff856ed243189a229331a404b4", size = 1784154 }, +] + +[[package]] +name = "torchvision" +version = "0.21.0+cu124" +source = { registry = "https://download.pytorch.org/whl/cu124" } +resolution-markers = [ + "(python_full_version >= '3.12' and sys_platform == 'linux') or (python_full_version >= '3.12' and sys_platform == 'win32')", + "(python_full_version == '3.11.*' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform == 
'win32')", + "(python_full_version == '3.10.*' and sys_platform == 'linux') or (python_full_version == '3.10.*' and sys_platform == 'win32')", + "(python_full_version < '3.10' and sys_platform == 'linux') or (python_full_version < '3.10' and sys_platform == 'win32')", +] +dependencies = [ + { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.10' and sys_platform == 'linux') or (python_full_version < '3.10' and sys_platform == 'win32')" }, + { name = "numpy", version = "2.2.2", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.10' and sys_platform == 'linux') or (python_full_version >= '3.10' and sys_platform == 'win32')" }, + { name = "pillow", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, + { name = "torch", version = "2.6.0+cu124", source = { registry = "https://download.pytorch.org/whl/cu124" }, marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, +] +wheels = [ + { url = "https://download.pytorch.org/whl/cu124/torchvision-0.21.0%2Bcu124-cp310-cp310-linux_x86_64.whl", hash = "sha256:3d3e74018eaa7837c73e3764dad3b7792b7544401c25a42977e9744303731bd3" }, + { url = "https://download.pytorch.org/whl/cu124/torchvision-0.21.0%2Bcu124-cp310-cp310-win_amd64.whl", hash = "sha256:0c6aefb70ab2b312065240c804e459ac7b0e449867afd469b38d2fd47f9391a7" }, + { url = "https://download.pytorch.org/whl/cu124/torchvision-0.21.0%2Bcu124-cp311-cp311-linux_x86_64.whl", hash = "sha256:137376805aca5ba57bd2c7a3ecb8569df961dbe82b128aac9b3b0a7125ef9385" }, + { url = "https://download.pytorch.org/whl/cu124/torchvision-0.21.0%2Bcu124-cp311-cp311-win_amd64.whl", hash = "sha256:000a013584ad2304ab30496318145f284ac364622addb5ee3a5abd2769ba146f" }, + { url = "https://download.pytorch.org/whl/cu124/torchvision-0.21.0%2Bcu124-cp312-cp312-linux_x86_64.whl", hash = "sha256:efb53ea0af7bf09b7b53e2a18b9be6d245f7d46a90b51d5cf97f37e9b929a991" }, + { url = "https://download.pytorch.org/whl/cu124/torchvision-0.21.0%2Bcu124-cp312-cp312-win_amd64.whl", hash = "sha256:ec63c2ee792757492da40590e34b14f2fceda29050558c215f0c1f3b08149c0f" }, + { url = "https://download.pytorch.org/whl/cu124/torchvision-0.21.0%2Bcu124-cp313-cp313-linux_x86_64.whl", hash = "sha256:4b70acf3b4b96a0ceb1374116626c9bef9e8be016b57b1284e482260ca1896d6" }, + { url = "https://download.pytorch.org/whl/cu124/torchvision-0.21.0%2Bcu124-cp313-cp313-win_amd64.whl", hash = "sha256:8fcf55321b206de70ff8e01c884fa42e57a60b1cb749341b96e0f22c8a7c9ec7" }, + { url = "https://download.pytorch.org/whl/cu124/torchvision-0.21.0%2Bcu124-cp39-cp39-linux_x86_64.whl", hash = "sha256:6afb21a22f5497e08ea4dbd4544472330d8249bf09dafd239302552cad6906b2" }, + { url = "https://download.pytorch.org/whl/cu124/torchvision-0.21.0%2Bcu124-cp39-cp39-win_amd64.whl", hash = "sha256:579b6a7fffc34a860c57a7131221ef125831f5961431f8da15760ab1ef752d44" }, ] [[package]] @@ -2809,7 +2919,7 @@ wheels = [ [[package]] name = "transformers" -version = "4.49.0" +version = "4.51.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "filelock" }, @@ -2824,9 +2934,9 @@ dependencies = [ { name = "tokenizers" }, { name = "tqdm" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/79/50/46573150944f46df8ec968eda854023165a84470b42f69f67c7d475dabc5/transformers-4.49.0.tar.gz", hash = "sha256:7e40e640b5b8dc3f48743f5f5adbdce3660c82baafbd3afdfc04143cdbd2089e", size = 8610952 } +sdist = { url = 
"https://files.pythonhosted.org/packages/38/75/6ebdae4d6f4574f47139a070445245537e43482d006f615af8e23d5bf05e/transformers-4.51.0.tar.gz", hash = "sha256:2d302563ff6c2cc2d0e88ef352cf059f9a21ce18102fd43662bb1246f70b8a84", size = 8925571 } wheels = [ - { url = "https://files.pythonhosted.org/packages/20/37/1f29af63e9c30156a3ed6ebc2754077016577c094f31de7b2631e5d379eb/transformers-4.49.0-py3-none-any.whl", hash = "sha256:6b4fded1c5fee04d384b1014495b4235a2b53c87503d7d592423c06128cbbe03", size = 9970275 }, + { url = "https://files.pythonhosted.org/packages/6f/db/7ee15028d5130929aa0b1b85bab6d8bafe806254d3b5c56c42a0066cceb8/transformers-4.51.0-py3-none-any.whl", hash = "sha256:2e6baa476735ab8adccbaee6961525a0d1ce8c21d49293af30ef5ee4b082f64d", size = 10362017 }, ] [[package]] From 37104acd75e40b19d26e2cf29fa9cbe2c946400b Mon Sep 17 00:00:00 2001 From: Baptiste Colle <32412211+baptistecolle@users.noreply.github.com> Date: Mon, 7 Apr 2025 16:55:03 +0200 Subject: [PATCH 170/183] Gaudi: Add Integration Test for Gaudi Backend (#3142) * feat(gaudi): add integration test * feat(test): add more models to integration tests * remove debug comments * fix typos --- backends/gaudi/Makefile | 15 +- backends/gaudi/README.md | 44 + .../capture_expected_outputs.py | 85 ++ .../server/integration-tests/conftest.py | 292 ++++ .../gaudi/server/integration-tests/pytest.ini | 2 + .../server/integration-tests/requirements.txt | 7 + .../server/integration-tests/test_model.py | 276 ++++ backends/gaudi/server/tests/conftest.py | 23 - .../gaudi/server/tests/models/test_bloom.py | 363 ----- .../server/tests/models/test_causal_lm.py | 354 ----- .../gaudi/server/tests/models/test_model.py | 83 -- .../server/tests/models/test_santacoder.py | 108 -- .../server/tests/models/test_seq2seq_lm.py | 365 ----- .../gaudi/server/tests/utils/test_adapter.py | 247 ---- .../gaudi/server/tests/utils/test_convert.py | 21 - backends/gaudi/server/tests/utils/test_hub.py | 103 -- .../gaudi/server/tests/utils/test_layers.py | 82 -- .../gaudi/server/tests/utils/test_tokens.py | 88 -- .../server/tests/utils/test_watermark.py | 54 - .../gaudi/server/tests/utils/test_weights.py | 1177 ----------------- docs/source/backends/gaudi.mdx | 69 +- 21 files changed, 763 insertions(+), 3095 deletions(-) create mode 100644 backends/gaudi/server/integration-tests/capture_expected_outputs.py create mode 100644 backends/gaudi/server/integration-tests/conftest.py create mode 100644 backends/gaudi/server/integration-tests/pytest.ini create mode 100644 backends/gaudi/server/integration-tests/requirements.txt create mode 100644 backends/gaudi/server/integration-tests/test_model.py delete mode 100644 backends/gaudi/server/tests/conftest.py delete mode 100644 backends/gaudi/server/tests/models/test_bloom.py delete mode 100644 backends/gaudi/server/tests/models/test_causal_lm.py delete mode 100644 backends/gaudi/server/tests/models/test_model.py delete mode 100644 backends/gaudi/server/tests/models/test_santacoder.py delete mode 100644 backends/gaudi/server/tests/models/test_seq2seq_lm.py delete mode 100644 backends/gaudi/server/tests/utils/test_adapter.py delete mode 100644 backends/gaudi/server/tests/utils/test_convert.py delete mode 100644 backends/gaudi/server/tests/utils/test_hub.py delete mode 100644 backends/gaudi/server/tests/utils/test_layers.py delete mode 100644 backends/gaudi/server/tests/utils/test_tokens.py delete mode 100644 backends/gaudi/server/tests/utils/test_watermark.py delete mode 100644 backends/gaudi/server/tests/utils/test_weights.py diff --git 
a/backends/gaudi/Makefile b/backends/gaudi/Makefile index 6e38c19ed..f760f4d6e 100644 --- a/backends/gaudi/Makefile +++ b/backends/gaudi/Makefile @@ -1,6 +1,6 @@ mkfile_path := $(abspath $(lastword $(MAKEFILE_LIST))) mkfile_dir := $(dir $(mkfile_path)) -root_dir := "${mkfile_dir}/../.." +root_dir := ${mkfile_dir}/../.. HABANA_VERSION := 1.20.0 PYTORCH_VERSION := 2.6.0 @@ -47,3 +47,16 @@ local-dev-install: install-dependencies make install-server && \ make install-router && \ make install-launcher' + +# In order to run the integration tests, you need to first build the image (make -C backends/gaudi image) +run-integration-tests: + uv pip install -r ${root_dir}/backends/gaudi/server/integration-tests/requirements.txt + DOCKER_VOLUME=${root_dir}/data \ + HF_TOKEN=`cat ${HOME}/.cache/huggingface/token` \ + uv run pytest --durations=0 -sv ${root_dir}/backends/gaudi/server/integration-tests + +# This is used to capture the expected outputs for the integration tests, offering an easy way to add more models to the integration tests +capture-expected-outputs-for-integration-tests: + DOCKER_VOLUME=${root_dir}/data \ + HF_TOKEN=`cat ${HOME}/.cache/huggingface/token` \ + uv run pytest --durations=0 -sv ${root_dir}/backends/gaudi/server/integration-tests/capture_expected_outputs.py diff --git a/backends/gaudi/README.md b/backends/gaudi/README.md index 765ad4475..ba890f0b1 100644 --- a/backends/gaudi/README.md +++ b/backends/gaudi/README.md @@ -96,3 +96,47 @@ curl 127.0.0.1:8080/generate \ -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":20}}' \ -H 'Content-Type: application/json' ``` + +### Integration tests + +To run the integration tests, you need to first build the image: +```bash +make -C backends/gaudi image +``` + +Then run the following command to run the integration tests: +```bash +make -C backends/gaudi run-integration-tests +``` + +To capture the expected outputs for the integration tests, you can run the following command: +```bash +make -C backends/gaudi capture-expected-outputs-for-integration-tests +``` + +#### How the integration tests work +The integration tests work as follows: + +1. Start a tgi server in a container, similar to the command: +```bash +docker run --runtime=habana --ipc=host --cap-add=sys_nice \ + -p 8080:80 -v $volume:/data \ + -e LOG_LEVEL=debug -e HF_TOKEN=$hf_token \ + tgi-gaudi --model-id $model \ + --max-input-tokens 512 --max-total-tokens 1024 --max-batch-size 4 --max-batch-prefill-tokens 2048 +``` + +2. Do a /generate request to the server, similar to the command: +```bash +curl 127.0.0.1:8080/generate \ + -X POST \ + -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":20}}' \ + -H 'Content-Type: application/json' +``` + +3. Check the output of the server against the expected output: +```python +assert curl_output == expected_output +``` + +This is then repeated for a set of models and configurations.
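+
+In practice, the batched outputs are not compared with strict equality: the test tolerates a couple of character-level differences by computing a Levenshtein distance against the expected string (the `Levenshtein` package is listed in the integration-test requirements). Below is a minimal sketch of that tolerance check; `generated_texts` and `expected_output` are placeholder names for the batched outputs and the reference string, not variables from the test suite itself:
+```python
+from Levenshtein import distance as levenshtein_distance
+
+# generated_texts / expected_output are placeholders for the batched outputs
+# and the expected reference string. Allow at most two substitutions or
+# additions instead of requiring exact equality.
+for text in generated_texts:
+    assert levenshtein_distance(text, expected_output) < 3
+```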
diff --git a/backends/gaudi/server/integration-tests/capture_expected_outputs.py b/backends/gaudi/server/integration-tests/capture_expected_outputs.py new file mode 100644 index 000000000..051b9d698 --- /dev/null +++ b/backends/gaudi/server/integration-tests/capture_expected_outputs.py @@ -0,0 +1,85 @@ +import json +import os +from typing import Dict, Any, Generator + +import pytest +from test_model import TEST_CONFIGS + +UNKNOWN_CONFIGS = { + name: config + for name, config in TEST_CONFIGS.items() + if config["expected_greedy_output"] == "unknown" + or config["expected_batch_output"] == "unknown" +} + + +@pytest.fixture(scope="module", params=UNKNOWN_CONFIGS.keys()) +def test_config(request) -> Dict[str, Any]: + """Fixture that provides model configurations for testing.""" + test_config = UNKNOWN_CONFIGS[request.param] + test_config["test_name"] = request.param + return test_config + + +@pytest.fixture(scope="module") +def test_name(test_config): + yield test_config["test_name"] + + +@pytest.fixture(scope="module") +def tgi_service(launcher, test_config, test_name) -> Generator: + """Fixture that provides a TGI service for testing.""" + with launcher(test_config["model_id"], test_name) as service: + yield service + + +@pytest.mark.asyncio +async def test_capture_expected_outputs(tgi_service, test_config, test_name): + """Test that captures expected outputs for models with unknown outputs.""" + print(f"Testing {test_name} with {test_config['model_id']}") + + # Wait for service to be ready + await tgi_service.health(1000) + client = tgi_service.client + + # Test single request (greedy) + print("Testing single request...") + response = await client.generate( + test_config["input"], + max_new_tokens=32, + ) + greedy_output = response.generated_text + + # Test multiple requests (batch) + print("Testing batch requests...") + responses = [] + for _ in range(4): + response = await client.generate( + test_config["input"], + max_new_tokens=32, + ) + responses.append(response.generated_text) + + # Store results in a JSON file + output_file = "server/integration-tests/expected_outputs.json" + results = {} + + # Try to load existing results if file exists + if os.path.exists(output_file): + with open(output_file, "r") as f: + results = json.load(f) + + # Update results for this model + results[test_name] = { + "model_id": test_config["model_id"], + "input": test_config["input"], + "greedy_output": greedy_output, + "batch_outputs": responses, + "args": test_config["args"], + } + + # Save updated results + with open(output_file, "w") as f: + json.dump(results, f, indent=2) + + print(f"\nResults for {test_name} saved to {output_file}") diff --git a/backends/gaudi/server/integration-tests/conftest.py b/backends/gaudi/server/integration-tests/conftest.py new file mode 100644 index 000000000..c7daf70e0 --- /dev/null +++ b/backends/gaudi/server/integration-tests/conftest.py @@ -0,0 +1,292 @@ +import asyncio +import contextlib +import os +import shlex +import subprocess +import sys +import threading +import time +from tempfile import TemporaryDirectory +from typing import List +import socket + +import docker +import pytest +from aiohttp import ClientConnectorError, ClientOSError, ServerDisconnectedError +from docker.errors import NotFound +from loguru import logger +from test_model import TEST_CONFIGS +from text_generation import AsyncClient +from text_generation.types import Response + +# Use the latest image from the local docker build +DOCKER_IMAGE = os.getenv("DOCKER_IMAGE", "tgi-gaudi") +DOCKER_VOLUME = 
os.getenv("DOCKER_VOLUME", None) +HF_TOKEN = os.getenv("HF_TOKEN", None) + +assert ( + HF_TOKEN is not None +), "HF_TOKEN is not set, please set it as some models are gated and thus the test will fail without it" + +if DOCKER_VOLUME is None: + logger.warning( + "DOCKER_VOLUME is not set, this will lead to the tests redownloading the models on each run, consider setting it to speed up testing" + ) + +LOG_LEVEL = os.getenv("LOG_LEVEL", "info") + +BASE_ENV = { + "HF_HUB_ENABLE_HF_TRANSFER": "1", + "LOG_LEVEL": LOG_LEVEL, + "HF_TOKEN": os.getenv("HF_TOKEN", None), +} + + +HABANA_RUN_ARGS = { + "runtime": "habana", + "ipc_mode": "host", + "cap_add": ["sys_nice"], +} + +logger.add( + sys.stderr, + format="{time:YYYY-MM-DD HH:mm:ss} | {level: <8} | {name}:{function}:{line} - {message}", + level="INFO", +) + + +def stream_container_logs(container, test_name): + """Stream container logs in a separate thread.""" + try: + for log in container.logs(stream=True, follow=True): + print( + f"[TGI Server Logs - {test_name}] {log.decode('utf-8')}", + end="", + file=sys.stderr, + flush=True, + ) + except Exception as e: + logger.error(f"Error streaming container logs: {str(e)}") + + +class LauncherHandle: + def __init__(self, port: int): + self.client = AsyncClient(f"http://localhost:{port}", timeout=3600) + + def _inner_health(self): + raise NotImplementedError + + async def health(self, timeout: int = 60): + assert timeout > 0 + start_time = time.time() + logger.info(f"Starting health check with timeout of {timeout}s") + + for attempt in range(timeout): + if not self._inner_health(): + logger.error("Launcher crashed during health check") + raise RuntimeError("Launcher crashed") + + try: + await self.client.generate("test") + elapsed = time.time() - start_time + logger.info(f"Health check passed after {elapsed:.1f}s") + return + except (ClientConnectorError, ClientOSError, ServerDisconnectedError) as e: + if attempt == timeout - 1: + logger.error(f"Health check failed after {timeout}s: {str(e)}") + raise RuntimeError(f"Health check failed: {str(e)}") + if attempt % 10 == 0 and attempt != 0: # Only log every 10th attempt + logger.debug( + f"Connection attempt {attempt}/{timeout} failed: {str(e)}" + ) + time.sleep(1) + except Exception as e: + logger.error(f"Unexpected error during health check: {str(e)}") + # Get full traceback for debugging + import traceback + + logger.error(f"Full traceback:\n{traceback.format_exc()}") + raise + + +class ContainerLauncherHandle(LauncherHandle): + def __init__(self, docker_client, container_name, port: int): + super(ContainerLauncherHandle, self).__init__(port) + self.docker_client = docker_client + self.container_name = container_name + + def _inner_health(self) -> bool: + try: + container = self.docker_client.containers.get(self.container_name) + status = container.status + if status not in ["running", "created"]: + logger.warning(f"Container status is {status}") + # Get container logs for debugging + logs = container.logs().decode("utf-8") + logger.debug(f"Container logs:\n{logs}") + return status in ["running", "created"] + except Exception as e: + logger.error(f"Error checking container health: {str(e)}") + return False + + +class ProcessLauncherHandle(LauncherHandle): + def __init__(self, process, port: int): + super(ProcessLauncherHandle, self).__init__(port) + self.process = process + + def _inner_health(self) -> bool: + return self.process.poll() is None + + +@pytest.fixture(scope="module") +def data_volume(): + tmpdir = TemporaryDirectory() + yield tmpdir.name + 
try: + # Cleanup the temporary directory using sudo as it contains root files created by the container + subprocess.run(shlex.split(f"sudo rm -rf {tmpdir.name}"), check=True) + except subprocess.CalledProcessError as e: + logger.error(f"Error cleaning up temporary directory: {str(e)}") + + +@pytest.fixture(scope="module") +def launcher(data_volume): + @contextlib.contextmanager + def docker_launcher( + model_id: str, + test_name: str, + ): + logger.info( + f"Starting docker launcher for model {model_id} and test {test_name}" + ) + + # Get a random available port + def get_free_port(): + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind(("", 0)) + s.listen(1) + port = s.getsockname()[1] + return port + + port = get_free_port() + logger.debug(f"Using port {port}") + + client = docker.from_env() + + container_name = f"tgi-gaudi-test-{test_name.replace('/', '-')}" + + try: + container = client.containers.get(container_name) + logger.info( + f"Stopping existing container {container_name} for test {test_name}" + ) + container.stop() + container.wait() + except NotFound: + pass + except Exception as e: + logger.error(f"Error handling existing container: {str(e)}") + + model_name = next( + name for name, cfg in TEST_CONFIGS.items() if cfg["model_id"] == model_id + ) + + tgi_args = TEST_CONFIGS[model_name]["args"].copy() + + env = BASE_ENV.copy() + + # Add model_id to env + env["MODEL_ID"] = model_id + + # Add env config that is definied in the fixture parameter + if "env_config" in TEST_CONFIGS[model_name]: + env.update(TEST_CONFIGS[model_name]["env_config"].copy()) + + volumes = [f"{DOCKER_VOLUME}:/data"] + logger.debug(f"Using volume {volumes}") + + try: + logger.info(f"Creating container with name {container_name}") + + # Log equivalent docker run command for debugging, this is not actually executed + container = client.containers.run( + DOCKER_IMAGE, + command=tgi_args, + name=container_name, + environment=env, + detach=True, + volumes=volumes, + ports={"80/tcp": port}, + **HABANA_RUN_ARGS, + ) + + logger.info(f"Container {container_name} started successfully") + + # Start log streaming in a background thread + log_thread = threading.Thread( + target=stream_container_logs, + args=(container, test_name), + daemon=True, # This ensures the thread will be killed when the main program exits + ) + log_thread.start() + + # Add a small delay to allow container to initialize + time.sleep(2) + + # Check container status after creation + status = container.status + logger.debug(f"Initial container status: {status}") + if status not in ["running", "created"]: + logs = container.logs().decode("utf-8") + logger.error(f"Container failed to start properly. 
Logs:\n{logs}") + + yield ContainerLauncherHandle(client, container.name, port) + + except Exception as e: + logger.error(f"Error starting container: {str(e)}") + # Get full traceback for debugging + import traceback + + logger.error(f"Full traceback:\n{traceback.format_exc()}") + raise + finally: + try: + container = client.containers.get(container_name) + logger.info(f"Stopping container {container_name}") + container.stop() + container.wait() + + container_output = container.logs().decode("utf-8") + print(container_output, file=sys.stderr) + + container.remove() + logger.info(f"Container {container_name} removed successfully") + except NotFound: + pass + except Exception as e: + logger.warning(f"Error cleaning up container: {str(e)}") + + return docker_launcher + + +@pytest.fixture(scope="module") +def generate_load(): + async def generate_load_inner( + client: AsyncClient, prompt: str, max_new_tokens: int, n: int + ) -> List[Response]: + try: + futures = [ + client.generate( + prompt, + max_new_tokens=max_new_tokens, + decoder_input_details=True, + ) + for _ in range(n) + ] + return await asyncio.gather(*futures) + except Exception as e: + logger.error(f"Error generating load: {str(e)}") + raise + + return generate_load_inner diff --git a/backends/gaudi/server/integration-tests/pytest.ini b/backends/gaudi/server/integration-tests/pytest.ini new file mode 100644 index 000000000..2f4c80e30 --- /dev/null +++ b/backends/gaudi/server/integration-tests/pytest.ini @@ -0,0 +1,2 @@ +[pytest] +asyncio_mode = auto diff --git a/backends/gaudi/server/integration-tests/requirements.txt b/backends/gaudi/server/integration-tests/requirements.txt new file mode 100644 index 000000000..b67d2d8cc --- /dev/null +++ b/backends/gaudi/server/integration-tests/requirements.txt @@ -0,0 +1,7 @@ +pytest >= 8.3.5 +pytest-asyncio >= 0.26.0 +docker >= 7.1.0 +Levenshtein >= 0.27.1 +loguru >= 0.7.3 +aiohttp >= 3.11.14 +text-generation diff --git a/backends/gaudi/server/integration-tests/test_model.py b/backends/gaudi/server/integration-tests/test_model.py new file mode 100644 index 000000000..cb2bf6a9f --- /dev/null +++ b/backends/gaudi/server/integration-tests/test_model.py @@ -0,0 +1,276 @@ +from typing import Any, Dict + +from text_generation import AsyncClient +import pytest +from Levenshtein import distance as levenshtein_distance + +# The "args" config is not optimized for speed but only check that the inference is working for the different models architectures +TEST_CONFIGS = { + "meta-llama/Llama-3.1-8B-Instruct-shared": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "input": "What is Deep Learning?", + "expected_greedy_output": " A Beginner’s Guide\nDeep learning is a subset of machine learning that involves the use", + "expected_batch_output": " A Beginner’s Guide\nDeep learning is a subset of machine learning that involves the use", + "args": [ + "--sharded", + "true", + "--num-shard", + "8", + "--max-input-tokens", + "512", + "--max-total-tokens", + "1024", + "--max-batch-size", + "8", + "--max-batch-prefill-tokens", + "2048", + ], + }, + "meta-llama/Llama-3.1-8B-Instruct": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "input": "What is Deep Learning?", + "expected_greedy_output": " A Beginner’s Guide\nDeep learning is a subset of machine learning that involves the use of artificial neural networks to analyze and interpret data. 
It is a type of", + "expected_batch_output": " A Beginner’s Guide\nDeep learning is a subset of machine learning that involves the use of artificial neural networks to analyze and interpret data. It is a type of", + "env_config": {}, + "args": [ + "--max-input-tokens", + "512", + "--max-total-tokens", + "1024", + "--max-batch-size", + "4", + "--max-batch-prefill-tokens", + "2048", + ], + }, + "meta-llama/Llama-2-7b-chat-hf": { + "model_id": "meta-llama/Llama-2-7b-chat-hf", + "input": "What is Deep Learning?", + "expected_greedy_output": "\n\nDeep learning (also known as deep structured learning) is part of a broader family of machine learning techniques based on artificial neural networks\u2014specific", + "expected_batch_output": "\n\nDeep learning (also known as deep structured learning) is part of a broader family of machine learning techniques based on artificial neural networks\u2014specific", + "args": [ + "--max-input-tokens", + "512", + "--max-total-tokens", + "1024", + "--max-batch-size", + "4", + "--max-batch-prefill-tokens", + "2048", + ], + }, + "mistralai/Mistral-7B-Instruct-v0.3": { + "model_id": "mistralai/Mistral-7B-Instruct-v0.3", + "input": "What is Deep Learning?", + "expected_greedy_output": "\n\nDeep learning is a subset of machine learning in artificial intelligence (AI) that has networks capable of learning unsupervised from data that is unstructured", + "expected_batch_output": "\n\nDeep learning is a subset of machine learning in artificial intelligence (AI) that has networks capable of learning unsupervised from data that is unstructured", + "args": [ + "--max-input-tokens", + "512", + "--max-total-tokens", + "1024", + "--max-batch-size", + "4", + "--max-batch-prefill-tokens", + "2048", + ], + }, + "bigcode/starcoder2-3b": { + "model_id": "bigcode/starcoder2-3b", + "input": "What is Deep Learning?", + "expected_greedy_output": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to perform tasks.\n\nNeural networks are a type of machine learning algorithm that", + "expected_batch_output": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to perform tasks.\n\nNeural networks are a type of machine learning algorithm that", + "args": [ + "--max-input-tokens", + "512", + "--max-total-tokens", + "1024", + "--max-batch-size", + "4", + "--max-batch-prefill-tokens", + "2048", + ], + }, + "google/gemma-7b-it": { + "model_id": "google/gemma-7b-it", + "input": "What is Deep Learning?", + "expected_greedy_output": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to learn from large amounts of data. Neural networks are inspired by the structure and function of", + "expected_batch_output": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to learn from large amounts of data. Neural networks are inspired by the structure and function of", + "args": [ + "--max-input-tokens", + "512", + "--max-total-tokens", + "1024", + "--max-batch-size", + "4", + "--max-batch-prefill-tokens", + "2048", + ], + }, + "Qwen/Qwen2-0.5B-Instruct": { + "model_id": "Qwen/Qwen2-0.5B-Instruct", + "input": "What is Deep Learning?", + "expected_greedy_output": " Deep Learning is a type of machine learning that is based on the principles of artificial neural networks. 
It is a type of machine learning that is used to train models", + "expected_batch_output": " Deep Learning is a type of machine learning that is based on the principles of artificial neural networks. It is a type of machine learning that is used to train models", + "args": [ + "--max-input-tokens", + "512", + "--max-total-tokens", + "1024", + "--max-batch-size", + "4", + "--max-batch-prefill-tokens", + "2048", + ], + }, + "tiiuae/falcon-7b-instruct": { + "model_id": "tiiuae/falcon-7b-instruct", + "input": "What is Deep Learning?", + "expected_greedy_output": "\nDeep learning is a branch of machine learning that uses artificial neural networks to learn and make decisions. It is based on the concept of hierarchical learning, where a", + "expected_batch_output": "\nDeep learning is a branch of machine learning that uses artificial neural networks to learn and make decisions. It is based on the concept of hierarchical learning, where a", + "args": [ + "--max-input-tokens", + "512", + "--max-total-tokens", + "1024", + "--max-batch-size", + "4", + ], + }, + "microsoft/phi-1_5": { + "model_id": "microsoft/phi-1_5", + "input": "What is Deep Learning?", + "expected_greedy_output": "\n\nDeep Learning is a subfield of Machine Learning that focuses on building neural networks with multiple layers of interconnected nodes. These networks are designed to learn from large", + "expected_batch_output": "\n\nDeep Learning is a subfield of Machine Learning that focuses on building neural networks with multiple layers of interconnected nodes. These networks are designed to learn from large", + "args": [ + "--max-input-tokens", + "512", + "--max-total-tokens", + "1024", + "--max-batch-size", + "4", + ], + }, + "openai-community/gpt2": { + "model_id": "openai-community/gpt2", + "input": "What is Deep Learning?", + "expected_greedy_output": "\n\nDeep learning is a new field of research that has been around for a long time. It is a new field of research that has been around for a", + "expected_batch_output": "\n\nDeep learning is a new field of research that has been around for a long time. It is a new field of research that has been around for a", + "args": [ + "--max-input-tokens", + "512", + "--max-total-tokens", + "1024", + "--max-batch-size", + "4", + ], + }, + "facebook/opt-125m": { + "model_id": "facebook/opt-125m", + "input": "What is Deep Learning?", + "expected_greedy_output": "\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout", + "expected_batch_output": "\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout", + "args": [ + "--max-input-tokens", + "512", + "--max-total-tokens", + "1024", + "--max-batch-size", + "4", + ], + }, + "EleutherAI/gpt-j-6b": { + "model_id": "EleutherAI/gpt-j-6b", + "input": "What is Deep Learning?", + "expected_greedy_output": "\n\nDeep learning is a subset of machine learning that is based on the idea of neural networks. Neural networks are a type of artificial intelligence that is inspired by", + "expected_batch_output": "\n\nDeep learning is a subset of machine learning that is based on the idea of neural networks. 
Neural networks are a type of artificial intelligence that is inspired by", + "args": [ + "--max-input-tokens", + "512", + "--max-total-tokens", + "1024", + "--max-batch-size", + "4", + ], + }, +} + +print(f"Testing {len(TEST_CONFIGS)} models") + + +@pytest.fixture(scope="module", params=TEST_CONFIGS.keys()) +def test_config(request) -> Dict[str, Any]: + """Fixture that provides model configurations for testing.""" + test_config = TEST_CONFIGS[request.param] + test_config["test_name"] = request.param + return test_config + + +@pytest.fixture(scope="module") +def model_id(test_config): + yield test_config["model_id"] + + +@pytest.fixture(scope="module") +def test_name(test_config): + yield test_config["test_name"] + + +@pytest.fixture(scope="module") +def expected_outputs(test_config): + return { + "greedy": test_config["expected_greedy_output"], + # "sampling": model_config["expected_sampling_output"], + "batch": test_config["expected_batch_output"], + } + + +@pytest.fixture(scope="module") +def input(test_config): + return test_config["input"] + + +@pytest.fixture(scope="module") +def tgi_service(launcher, model_id, test_name): + with launcher(model_id, test_name) as tgi_service: + yield tgi_service + + +@pytest.fixture(scope="module") +async def tgi_client(tgi_service) -> AsyncClient: + await tgi_service.health(1000) + return tgi_service.client + + +@pytest.mark.asyncio +async def test_model_single_request( + tgi_client: AsyncClient, expected_outputs: Dict[str, Any], input: str +): + # Bounded greedy decoding without input + response = await tgi_client.generate( + input, + max_new_tokens=32, + ) + assert response.details.generated_tokens == 32 + assert response.generated_text == expected_outputs["greedy"] + + +@pytest.mark.asyncio +async def test_model_multiple_requests( + tgi_client, generate_load, expected_outputs, input +): + num_requests = 4 + responses = await generate_load( + tgi_client, + input, + max_new_tokens=32, + n=num_requests, + ) + + assert len(responses) == 4 + expected = expected_outputs["batch"] + for r in responses: + assert r.details.generated_tokens == 32 + # Compute the similarity with the expectation using the levenshtein distance + # We should not have more than two substitutions or additions + assert levenshtein_distance(r.generated_text, expected) < 3 diff --git a/backends/gaudi/server/tests/conftest.py b/backends/gaudi/server/tests/conftest.py deleted file mode 100644 index d99771f8a..000000000 --- a/backends/gaudi/server/tests/conftest.py +++ /dev/null @@ -1,23 +0,0 @@ -import pytest -import os -from text_generation_server.pb import generate_pb2 - -os.environ["USE_PREFIX_CACHING"] = "1" -os.environ["ATTENTION"] = "flashinfer" - - -@pytest.fixture -def default_pb_parameters(): - return generate_pb2.NextTokenChooserParameters( - temperature=1.0, - repetition_penalty=1.0, - top_k=0, - top_p=1.0, - typical_p=1.0, - do_sample=False, - ) - - -@pytest.fixture -def default_pb_stop_parameters(): - return generate_pb2.StoppingCriteriaParameters(stop_sequences=[], max_new_tokens=10) diff --git a/backends/gaudi/server/tests/models/test_bloom.py b/backends/gaudi/server/tests/models/test_bloom.py deleted file mode 100644 index dff5239b3..000000000 --- a/backends/gaudi/server/tests/models/test_bloom.py +++ /dev/null @@ -1,363 +0,0 @@ -import pytest -import torch - -from copy import copy -from transformers import AutoTokenizer - -from text_generation_server.pb import generate_pb2 -from text_generation_server.models.causal_lm import CausalLMBatch -from 
text_generation_server.utils import weight_hub_files, download_weights -from text_generation_server.models.bloom import BloomCausalLMBatch, BLOOMSharded -from text_generation_server.models.custom_modeling.bloom_modeling import ( - BloomForCausalLM, -) - - -@pytest.fixture(scope="session") -def default_bloom(): - model_id = "bigscience/bloom-560m" - revision = "main" - filenames = weight_hub_files(model_id, revision, ".safetensors") - download_weights(filenames, model_id, revision) - return BLOOMSharded( - model_id, - model_class=BloomForCausalLM, - ) - - -@pytest.fixture(scope="session") -def bloom_560m_tokenizer(): - return AutoTokenizer.from_pretrained("bigscience/bloom-560m", padding_side="left") - - -@pytest.fixture -def default_pb_request(default_pb_parameters, default_pb_stop_parameters): - return generate_pb2.Request( - id=0, - inputs="Test", - input_chunks=generate_pb2.Input(chunks=[generate_pb2.InputChunk(text="Test")]), - prefill_logprobs=True, - truncate=100, - parameters=default_pb_parameters, - stopping_parameters=default_pb_stop_parameters, - ) - - -@pytest.fixture -def default_pb_batch(default_pb_request): - return generate_pb2.Batch(id=0, requests=[default_pb_request], size=1) - - -@pytest.fixture -def default_bloom_batch(default_pb_batch, bloom_560m_tokenizer): - return BloomCausalLMBatch.from_pb( - default_pb_batch, bloom_560m_tokenizer, torch.float32, torch.device("cpu") - ) - - -@pytest.fixture -def default_multi_requests_bloom_batch(default_pb_request, bloom_560m_tokenizer): - req_0 = copy(default_pb_request) - req_0.id = 1 - req_1 = default_pb_request - req_1.id = 2 - req_1.stopping_parameters.max_new_tokens = 5 - - batch_pb = generate_pb2.Batch(id=0, requests=[req_0, req_1], size=2) - return BloomCausalLMBatch.from_pb( - batch_pb, bloom_560m_tokenizer, torch.float32, torch.device("cpu") - ) - - -def test_batch_from_pb(default_pb_batch, default_bloom_batch): - batch = default_bloom_batch - - assert batch.batch_id == default_pb_batch.id - assert batch.requests == default_pb_batch.requests - - assert len(batch.input_ids) == default_pb_batch.size - assert batch.input_ids[0][-1] == 10264 - assert torch.all(batch.input_ids[0][:-1] == 3) - - assert batch.attention_mask[0][0] == 1 - assert torch.all(batch.attention_mask[0][1:] == 0) - - assert batch.past_key_values is None - - assert all( - [ - torch.equal(input_ids, all_input_ids[:, 0]) - for input_ids, all_input_ids in zip(batch.input_ids, batch.all_input_ids) - ] - ) - - assert batch.input_lengths == [1] - - assert len(batch) == default_pb_batch.size - assert len(batch.next_token_choosers) == len(batch.stopping_criterias) == len(batch) - - assert batch.max_input_length == batch.input_lengths[0] - - -def test_batch_concatenate_no_prefill(default_bloom_batch): - with pytest.raises(ValueError): - BloomCausalLMBatch.concatenate([default_bloom_batch, default_bloom_batch]) - - -def test_causal_lm_batch_type(default_bloom): - assert default_bloom.batch_type == BloomCausalLMBatch - - -def test_causal_lm_generate_token(default_bloom, default_bloom_batch): - sequence_length = len(default_bloom_batch.all_input_ids[0]) - generations, next_batch, _ = default_bloom.generate_token(default_bloom_batch) - - assert len(generations) == len(default_bloom_batch) - assert isinstance(next_batch, CausalLMBatch) - assert not next_batch.keys_head_dim_last - - assert len(next_batch.all_input_ids) == len(next_batch) - assert len(next_batch.all_input_ids[0]) == sequence_length + 1 - assert len(next_batch.attention_mask[0]) == 11 - assert 
torch.all(next_batch.all_input_ids[0][-2:] == 10264) - assert torch.all(next_batch.all_input_ids[0][:-2] == 3) - - assert torch.all(next_batch.attention_mask[0][:2] == 1) - assert torch.all(next_batch.attention_mask[0][2:] == 0) - - assert next_batch.input_ids.shape == (len(next_batch), 1) - assert next_batch.input_ids[0, 0] == 10264 - - assert next_batch.input_lengths == [2] - assert next_batch.max_input_length == next_batch.input_lengths[0] - - assert next_batch.past_key_values is not None - assert all( - [p[0].shape == (16, 64, sequence_length) for p in next_batch.past_key_values] - ) - assert all( - [p[1].shape == (16, sequence_length, 64) for p in next_batch.past_key_values] - ) - assert all([generation.generated_text is None for generation in generations]) - assert all([len(generation.prefill_tokens) == 1 for generation in generations]) - assert all( - [ - token_id.item() == 10264 - for generation in generations - for token_id in generation.tokens.token_ids - ] - ) - assert all( - [ - token_text == "Test" - for generation in generations - for token_text in generation.tokens.texts - ] - ) - assert generations[0].request_id == 0 - - -def test_causal_lm_generate_token_completion(default_bloom, default_bloom_batch): - next_batch = default_bloom_batch - for _ in range(default_bloom_batch.stopping_criterias[0].max_new_tokens - 1): - generations, next_batch, _ = default_bloom.generate_token(next_batch) - assert len(generations) == len(default_bloom_batch) - - generations, next_batch, _ = default_bloom.generate_token(next_batch) - assert next_batch is None - - assert len(generations) == 1 - assert ( - generations[0].generated_text.text == "TestTestTestTestTestTestTestTestTestTest" - ) - assert generations[0].request_id == default_bloom_batch.requests[0].id - assert ( - generations[0].generated_text.generated_tokens - == default_bloom_batch.stopping_criterias[0].max_new_tokens - ) - - -def test_causal_lm_generate_token_completion_multi( - default_bloom, default_multi_requests_bloom_batch -): - next_batch = default_multi_requests_bloom_batch - - for i in range( - default_multi_requests_bloom_batch.stopping_criterias[1].max_new_tokens - 1 - ): - generations, next_batch, _ = default_bloom.generate_token(next_batch) - assert len(generations) == len(default_multi_requests_bloom_batch) - - generations, next_batch, _ = default_bloom.generate_token(next_batch) - assert next_batch is not None - - assert len(generations) == 2 - assert generations[1].generated_text.text == "TestTestTestTestTest" - assert ( - generations[1].request_id == default_multi_requests_bloom_batch.requests[1].id - ) - assert ( - generations[1].generated_text.generated_tokens - == default_multi_requests_bloom_batch.stopping_criterias[1].max_new_tokens - ) - # Copy stopping_criterias before filtering - stopping_criterias = default_multi_requests_bloom_batch.stopping_criterias.copy() - - next_batch = next_batch.filter([next_batch.requests[0].id]) - - for _ in range( - stopping_criterias[0].max_new_tokens - stopping_criterias[1].max_new_tokens - 1 - ): - generations, next_batch, _ = default_bloom.generate_token(next_batch) - assert len(generations) == len(next_batch) - - generations, next_batch, _ = default_bloom.generate_token(next_batch) - assert next_batch is None - - assert len(generations) == 1 - assert ( - generations[0].generated_text.text == "TestTestTestTestTestTestTestTestTestTest" - ) - assert ( - generations[0].request_id == default_multi_requests_bloom_batch.requests[0].id - ) - assert ( - 
generations[0].generated_text.generated_tokens - == default_multi_requests_bloom_batch.stopping_criterias[0].max_new_tokens - ) - - -def test_batch_concatenate( - default_bloom, default_bloom_batch, default_multi_requests_bloom_batch -): - next_batch_0 = default_bloom_batch - _, next_batch_0, _ = default_bloom.generate_token(next_batch_0) - _, next_batch_0, _ = default_bloom.generate_token(next_batch_0) - - next_batch_1 = default_multi_requests_bloom_batch - _, next_batch_1, _ = default_bloom.generate_token(next_batch_1) - - # Clone past_key_values before concatenating to compare after, - # because they are removed from the concatenated batches - next_batch_0_past_key_values = [ - (k.clone(), v.clone()) for (k, v) in next_batch_0.past_key_values - ] - next_batch_1_past_key_values = [ - (k.clone(), v.clone()) for (k, v) in next_batch_1.past_key_values - ] - - next_batch = BloomCausalLMBatch.concatenate([next_batch_0, next_batch_1]) - - assert torch.equal(next_batch.all_input_ids[0], next_batch_0.all_input_ids[0]) - assert torch.equal(next_batch.all_input_ids[1], next_batch_1.all_input_ids[0]) - assert torch.equal(next_batch.all_input_ids[2], next_batch_1.all_input_ids[1]) - - assert torch.all( - next_batch.attention_mask[0, : -next_batch.padding_right_offset] == 1 - ) - assert torch.all( - next_batch.attention_mask[1:, 1 : -next_batch.padding_right_offset] == 1 - ) - assert torch.all(next_batch.attention_mask[1:, 3:] == 0) - - assert next_batch.batch_id == 0 - assert torch.all(next_batch.input_ids == 10264) - - assert next_batch.input_lengths == [3, 2, 2] - assert next_batch.max_input_length == 3 - - assert next_batch.requests[0] == next_batch_0.requests[0] - assert next_batch.requests[1:] == list(next_batch_1.requests) - - assert next_batch.next_token_choosers[0] == next_batch_0.next_token_choosers[0] - assert next_batch.next_token_choosers[1:] == next_batch_1.next_token_choosers - - assert next_batch.stopping_criterias[0] == next_batch_0.stopping_criterias[0] - assert next_batch.stopping_criterias[1:] == next_batch_1.stopping_criterias - - assert next_batch.past_key_values is not None - assert all([p[0].shape == (3, 16, 64, 2) for p in next_batch.past_key_values]) - assert all([p[1].shape == (3, 16, 2, 64) for p in next_batch.past_key_values]) - - for i, past in enumerate(next_batch.past_key_values): - assert torch.equal(next_batch_0_past_key_values[i][0][:, :, -2:], past[0][0]) - assert torch.equal( - next_batch_1_past_key_values[i][0][:, :, -1:], - past[0][1:, :, :, -1].reshape(-1, 64, 1), - ) - - assert torch.equal(next_batch_0_past_key_values[i][1][:, -2:, :], past[1][0]) - assert torch.equal( - next_batch_1_past_key_values[i][1][:, -1:, :], - past[1][1:, :, -1, :].reshape(-1, 1, 64), - ) - - for _ in range( - default_multi_requests_bloom_batch.stopping_criterias[1].max_new_tokens - 2 - ): - generations, next_batch, _ = default_bloom.generate_token(next_batch) - assert len(generations) == len(next_batch) - - generations, next_batch, _ = default_bloom.generate_token(next_batch) - assert next_batch is not None - - assert len(generations) == 3 - assert generations[2].generated_text.text == "TestTestTestTestTest" - assert ( - generations[2].request_id == default_multi_requests_bloom_batch.requests[1].id - ) - assert ( - generations[2].generated_text.generated_tokens - == default_multi_requests_bloom_batch.stopping_criterias[1].max_new_tokens - ) - - next_batch = next_batch.filter( - [next_batch.requests[0].id, next_batch.requests[1].id] - ) - - for _ in range( - 
default_bloom_batch.stopping_criterias[0].max_new_tokens - - default_multi_requests_bloom_batch.stopping_criterias[1].max_new_tokens - - 2 - ): - generations, next_batch, _ = default_bloom.generate_token(next_batch) - assert len(generations) == len(next_batch) - - generations, next_batch, _ = default_bloom.generate_token(next_batch) - assert next_batch is not None - - assert len(generations) == 2 - assert ( - generations[0].generated_text.text == "TestTestTestTestTestTestTestTestTestTest" - ) - assert generations[0].request_id == default_bloom_batch.requests[0].id - assert ( - generations[0].generated_text.generated_tokens - == default_bloom_batch.stopping_criterias[0].max_new_tokens - ) - - next_batch = next_batch.filter([next_batch.requests[1].id]) - - for _ in range( - default_multi_requests_bloom_batch.stopping_criterias[0].max_new_tokens - - default_bloom_batch.stopping_criterias[0].max_new_tokens - - default_multi_requests_bloom_batch.stopping_criterias[1].max_new_tokens - - 4 - ): - generations, next_batch, _ = default_bloom.generate_token(next_batch) - assert len(generations) == len(next_batch) - - generations, next_batch, _ = default_bloom.generate_token(next_batch) - assert next_batch is None - - assert len(generations) == 1 - assert ( - generations[0].generated_text.text == "TestTestTestTestTestTestTestTestTestTest" - ) - assert ( - generations[0].request_id == default_multi_requests_bloom_batch.requests[0].id - ) - assert ( - generations[0].generated_text.generated_tokens - == default_multi_requests_bloom_batch.stopping_criterias[0].max_new_tokens - ) diff --git a/backends/gaudi/server/tests/models/test_causal_lm.py b/backends/gaudi/server/tests/models/test_causal_lm.py deleted file mode 100644 index 11ca5efe3..000000000 --- a/backends/gaudi/server/tests/models/test_causal_lm.py +++ /dev/null @@ -1,354 +0,0 @@ -import pytest -import torch - -from copy import copy -from transformers import AutoTokenizer - -from text_generation_server.pb import generate_pb2 -from text_generation_server.models.causal_lm import CausalLM, CausalLMBatch - - -@pytest.fixture(scope="session") -def default_causal_lm(): - return CausalLM.fallback("gpt2") - - -@pytest.fixture(scope="session") -def gpt2_tokenizer(): - tokenizer = AutoTokenizer.from_pretrained("gpt2", padding_side="left") - tokenizer.pad_token_id = 50256 - return tokenizer - - -@pytest.fixture -def default_pb_request(default_pb_parameters, default_pb_stop_parameters): - return generate_pb2.Request( - id=0, - inputs="Test", - input_chunks=generate_pb2.Input(chunks=[generate_pb2.InputChunk(text="Test")]), - prefill_logprobs=True, - truncate=100, - parameters=default_pb_parameters, - stopping_parameters=default_pb_stop_parameters, - ) - - -@pytest.fixture -def default_pb_batch(default_pb_request): - return generate_pb2.Batch(id=0, requests=[default_pb_request], size=1) - - -@pytest.fixture -def default_causal_lm_batch(default_pb_batch, gpt2_tokenizer): - return CausalLMBatch.from_pb( - default_pb_batch, gpt2_tokenizer, torch.float32, torch.device("cpu") - ) - - -@pytest.fixture -def default_multi_requests_causal_lm_batch(default_pb_request, gpt2_tokenizer): - req_0 = copy(default_pb_request) - req_0.id = 1 - req_1 = default_pb_request - req_1.id = 2 - req_1.stopping_parameters.max_new_tokens = 5 - - batch_pb = generate_pb2.Batch(id=1, requests=[req_0, req_1], size=2) - return CausalLMBatch.from_pb( - batch_pb, gpt2_tokenizer, torch.float32, torch.device("cpu") - ) - - -def test_batch_from_pb(default_pb_batch, default_causal_lm_batch): - batch 
= default_causal_lm_batch - - assert batch.batch_id == default_pb_batch.id - assert batch.requests == default_pb_batch.requests - - assert len(batch.input_ids) == default_pb_batch.size - assert batch.input_ids[0][-1] == 14402 - assert torch.all(batch.input_ids[0][:-1] == 50256) - - assert batch.attention_mask[0, 0] == 1 - assert torch.all(batch.attention_mask[0, 1:] == 0) - - assert batch.past_key_values is None - - assert all( - [ - torch.equal(input_ids, all_input_ids[:, 0]) - for input_ids, all_input_ids in zip(batch.input_ids, batch.all_input_ids) - ] - ) - - assert batch.input_lengths == [1] - - assert len(batch) == default_pb_batch.size - assert len(batch.next_token_choosers) == len(batch.stopping_criterias) == len(batch) - - assert batch.max_input_length == batch.input_lengths[0] - - -def test_batch_concatenate_no_prefill(default_causal_lm_batch): - with pytest.raises(ValueError): - CausalLMBatch.concatenate([default_causal_lm_batch, default_causal_lm_batch]) - - -def test_causal_lm_batch_type(default_causal_lm): - assert default_causal_lm.batch_type == CausalLMBatch - - -def test_causal_lm_generate_token(default_causal_lm, default_causal_lm_batch): - sequence_length = len(default_causal_lm_batch.all_input_ids[0]) - generations, next_batch, _ = default_causal_lm.generate_token( - default_causal_lm_batch - ) - - assert len(generations) == len(next_batch) - assert isinstance(next_batch, CausalLMBatch) - - assert len(next_batch.all_input_ids) == len(next_batch) - assert len(next_batch.all_input_ids[0]) == sequence_length + 1 - assert len(next_batch.attention_mask[0]) == 11 - assert next_batch.all_input_ids[0][-1] == 13 - assert next_batch.all_input_ids[0][-2] == 14402 - assert torch.all(next_batch.all_input_ids[0][:-2] == 50256) - - assert torch.all(next_batch.attention_mask[0][0:2] == 1) - assert torch.all(next_batch.attention_mask[0][2:] == 0) - - assert next_batch.input_ids.shape == (len(next_batch), 1) - assert next_batch.input_ids[0, 0] == 13 - - assert next_batch.input_lengths == [2] - assert next_batch.max_input_length == next_batch.input_lengths[0] - - assert next_batch.past_key_values is not None - assert all( - [p[0].shape == (1, 12, sequence_length, 64) for p in next_batch.past_key_values] - ) - assert all( - [p[1].shape == (1, 12, sequence_length, 64) for p in next_batch.past_key_values] - ) - assert all([generation.generated_text is None for generation in generations]) - assert all([len(generation.prefill_tokens) == 1 for generation in generations]) - assert all( - [ - token_id.item() == 13 - for generation in generations - for token_id in generation.tokens.token_ids - ] - ) - assert all( - [ - token_text == "." - for generation in generations - for token_text in generation.tokens.texts - ] - ) - assert generations[0].request_id == 0 - - -def test_causal_lm_generate_token_completion( - default_causal_lm, default_causal_lm_batch -): - next_batch = default_causal_lm_batch - for _ in range(default_causal_lm_batch.stopping_criterias[0].max_new_tokens - 1): - generations, next_batch, _ = default_causal_lm.generate_token(next_batch) - assert len(generations) == len(next_batch) - - generations, next_batch, _ = default_causal_lm.generate_token(next_batch) - assert next_batch is None - - assert len(generations) == 1 - assert generations[0].generated_text.text == ".java:784) at net.minecraft." 
- assert generations[0].request_id == default_causal_lm_batch.requests[0].id - assert ( - generations[0].generated_text.generated_tokens - == default_causal_lm_batch.stopping_criterias[0].max_new_tokens - ) - - -def test_causal_lm_generate_token_completion_multi( - default_causal_lm, default_multi_requests_causal_lm_batch -): - next_batch = default_multi_requests_causal_lm_batch - - for i in range( - default_multi_requests_causal_lm_batch.stopping_criterias[1].max_new_tokens - 1 - ): - generations, next_batch, _ = default_causal_lm.generate_token(next_batch) - assert len(generations) == len(next_batch) - - generations, next_batch, _ = default_causal_lm.generate_token(next_batch) - assert next_batch is not None - - assert len(generations) == 2 - assert generations[1].generated_text.text == ".java:784)" - assert ( - generations[1].request_id - == default_multi_requests_causal_lm_batch.requests[1].id - ) - assert ( - generations[1].generated_text.generated_tokens - == default_multi_requests_causal_lm_batch.stopping_criterias[1].max_new_tokens - ) - # Copy stopping_criterias before filtering - stopping_criterias = ( - default_multi_requests_causal_lm_batch.stopping_criterias.copy() - ) - - next_batch = next_batch.filter([next_batch.requests[0].id]) - - for _ in range( - stopping_criterias[0].max_new_tokens - stopping_criterias[1].max_new_tokens - 1 - ): - generations, next_batch, _ = default_causal_lm.generate_token(next_batch) - assert len(generations) == len(next_batch) - - generations, next_batch, _ = default_causal_lm.generate_token(next_batch) - assert next_batch is None - - assert len(generations) == 1 - assert generations[0].generated_text.text == ".java:784) at net.minecraft." - assert ( - generations[0].request_id - == default_multi_requests_causal_lm_batch.requests[0].id - ) - assert ( - generations[0].generated_text.generated_tokens - == default_multi_requests_causal_lm_batch.stopping_criterias[0].max_new_tokens - ) - - -def test_batch_concatenate( - default_causal_lm, default_causal_lm_batch, default_multi_requests_causal_lm_batch -): - next_batch_0 = default_causal_lm_batch - _, next_batch_0, _ = default_causal_lm.generate_token(next_batch_0) - _, next_batch_0, _ = default_causal_lm.generate_token(next_batch_0) - - next_batch_1 = default_multi_requests_causal_lm_batch - _, next_batch_1, _ = default_causal_lm.generate_token(next_batch_1) - - # Clone past_key_values before concatenating to compare after, - # because they are removed from the concatenated batches - next_batch_0_past_key_values = [ - (k.clone(), v.clone()) for (k, v) in next_batch_0.past_key_values - ] - next_batch_1_past_key_values = [ - (k.clone(), v.clone()) for (k, v) in next_batch_1.past_key_values - ] - - next_batch = CausalLMBatch.concatenate([next_batch_0, next_batch_1]) - - assert torch.equal(next_batch.all_input_ids[0], next_batch_0.all_input_ids[0]) - assert torch.equal(next_batch.all_input_ids[1], next_batch_1.all_input_ids[0]) - assert torch.equal(next_batch.all_input_ids[2], next_batch_1.all_input_ids[1]) - - assert torch.all( - next_batch.attention_mask[0, : -next_batch.padding_right_offset] == 1 - ) - assert torch.all( - next_batch.attention_mask[1:, 1 : -next_batch.padding_right_offset] == 1 - ) - assert torch.all(next_batch.attention_mask[1:, 3:] == 0) - - assert next_batch.batch_id == 0 - assert next_batch.input_ids[0, 0] == 12355 - assert torch.all(next_batch.input_ids[1:] == 13) - - assert next_batch.input_lengths == [3, 2, 2] - assert next_batch.max_input_length == 3 - - assert 
next_batch.requests[0] == next_batch_0.requests[0] - assert next_batch.requests[1:] == list(next_batch_1.requests) - - assert next_batch.next_token_choosers[0] == next_batch_0.next_token_choosers[0] - assert next_batch.next_token_choosers[1:] == next_batch_1.next_token_choosers - - assert next_batch.stopping_criterias[0] == next_batch_0.stopping_criterias[0] - assert next_batch.stopping_criterias[1:] == next_batch_1.stopping_criterias - - assert next_batch.past_key_values is not None - assert all([p[0].shape == (3, 12, 2, 64) for p in next_batch.past_key_values]) - assert all([p[1].shape == (3, 12, 2, 64) for p in next_batch.past_key_values]) - - for i, past in enumerate(next_batch.past_key_values): - assert torch.equal(next_batch_0_past_key_values[i][0][0, :, -2:], past[0][0]) - assert torch.equal( - next_batch_1_past_key_values[i][0][:, :, -1:], past[0][1:, :, -1:, :] - ) - - assert torch.equal(next_batch_0_past_key_values[i][1][0, :, -2:], past[1][0]) - assert torch.equal( - next_batch_1_past_key_values[i][1][:, :, -1:], past[1][1:, :, -1:, :] - ) - - for _ in range( - default_multi_requests_causal_lm_batch.stopping_criterias[1].max_new_tokens - 2 - ): - generations, next_batch, _ = default_causal_lm.generate_token(next_batch) - assert len(generations) == len(next_batch) - - generations, next_batch, _ = default_causal_lm.generate_token(next_batch) - assert next_batch is not None - - assert len(generations) == 3 - assert generations[2].generated_text.text == ".java:784)" - assert ( - generations[2].request_id - == default_multi_requests_causal_lm_batch.requests[1].id - ) - assert ( - generations[2].generated_text.generated_tokens - == default_multi_requests_causal_lm_batch.stopping_criterias[1].max_new_tokens - ) - - next_batch = next_batch.filter( - [next_batch.requests[0].id, next_batch.requests[1].id] - ) - - for _ in range( - default_causal_lm_batch.stopping_criterias[0].max_new_tokens - - default_multi_requests_causal_lm_batch.stopping_criterias[1].max_new_tokens - - 2 - ): - generations, next_batch, _ = default_causal_lm.generate_token(next_batch) - assert len(generations) == len(next_batch) - - generations, next_batch, _ = default_causal_lm.generate_token(next_batch) - assert next_batch is not None - - assert len(generations) == 2 - assert generations[0].generated_text.text == ".java:784) at net.minecraft." - assert generations[0].request_id == default_causal_lm_batch.requests[0].id - assert ( - generations[0].generated_text.generated_tokens - == default_causal_lm_batch.stopping_criterias[0].max_new_tokens - ) - - next_batch = next_batch.filter([next_batch.requests[1].id]) - - for _ in range( - default_multi_requests_causal_lm_batch.stopping_criterias[0].max_new_tokens - - default_causal_lm_batch.stopping_criterias[0].max_new_tokens - - default_multi_requests_causal_lm_batch.stopping_criterias[1].max_new_tokens - - 4 - ): - generations, next_batch, _ = default_causal_lm.generate_token(next_batch) - assert len(generations) == len(next_batch) - - generations, next_batch, _ = default_causal_lm.generate_token(next_batch) - assert next_batch is None - - assert len(generations) == 1 - assert generations[0].generated_text.text == ".java:784) at net.minecraft." 
- assert ( - generations[0].request_id - == default_multi_requests_causal_lm_batch.requests[0].id - ) - assert ( - generations[0].generated_text.generated_tokens - == default_multi_requests_causal_lm_batch.stopping_criterias[0].max_new_tokens - ) diff --git a/backends/gaudi/server/tests/models/test_model.py b/backends/gaudi/server/tests/models/test_model.py deleted file mode 100644 index 8441e8c6e..000000000 --- a/backends/gaudi/server/tests/models/test_model.py +++ /dev/null @@ -1,83 +0,0 @@ -import pytest -import torch - -from transformers import AutoTokenizer - -from text_generation_server.models import Model - - -def get_test_model(): - class TestModel(Model): - def batch_type(self): - raise NotImplementedError - - def generate_token(self, batch): - raise NotImplementedError - - tokenizer = AutoTokenizer.from_pretrained("huggingface/llama-7b") - - model = TestModel( - "test_model_id", - torch.nn.Linear(1, 1), - tokenizer, - False, - torch.float32, - torch.device("cpu"), - ) - return model - - -@pytest.mark.private -def test_decode_streaming_english_spaces(): - model = get_test_model() - truth = "Hello here, this is a simple test" - all_input_ids = [15043, 1244, 29892, 445, 338, 263, 2560, 1243] - assert ( - all_input_ids == model.tokenizer(truth, add_special_tokens=False)["input_ids"] - ) - - decoded_text = "" - offset = 0 - token_offset = 0 - for i in range(len(all_input_ids)): - text, offset, token_offset = model.decode_token( - all_input_ids[: i + 1], offset, token_offset - ) - decoded_text += text - - assert decoded_text == truth - - -@pytest.mark.private -def test_decode_streaming_chinese_utf8(): - model = get_test_model() - truth = "我很感谢你的热情" - all_input_ids = [ - 30672, - 232, - 193, - 139, - 233, - 135, - 162, - 235, - 179, - 165, - 30919, - 30210, - 234, - 134, - 176, - 30993, - ] - - decoded_text = "" - offset = 0 - token_offset = 0 - for i in range(len(all_input_ids)): - text, offset, token_offset = model.decode_token( - all_input_ids[: i + 1], offset, token_offset - ) - decoded_text += text - - assert decoded_text == truth diff --git a/backends/gaudi/server/tests/models/test_santacoder.py b/backends/gaudi/server/tests/models/test_santacoder.py deleted file mode 100644 index d5c91bffc..000000000 --- a/backends/gaudi/server/tests/models/test_santacoder.py +++ /dev/null @@ -1,108 +0,0 @@ -import pytest - -from text_generation_server.pb import generate_pb2 -from text_generation_server.models.causal_lm import CausalLMBatch, CausalLM - - -@pytest.fixture(scope="session") -def default_santacoder(): - return CausalLM.fallback(model_id="bigcode/santacoder") - - -@pytest.fixture -def default_pb_request(default_pb_parameters, default_pb_stop_parameters): - return generate_pb2.Request( - id=0, - inputs="def", - input_chunks=generate_pb2.Input(chunks=[generate_pb2.InputChunk(text="def")]), - prefill_logprobs=True, - truncate=100, - parameters=default_pb_parameters, - stopping_parameters=default_pb_stop_parameters, - ) - - -@pytest.fixture -def default_pb_batch(default_pb_request): - return generate_pb2.Batch(id=0, requests=[default_pb_request], size=1) - - -@pytest.fixture -def default_fim_pb_request(default_pb_parameters, default_pb_stop_parameters): - return generate_pb2.Request( - id=0, - inputs="defworld", - input_chunks=generate_pb2.Input( - chunks=[ - generate_pb2.InputChunk( - text="defworld" - ) - ] - ), - prefill_logprobs=True, - truncate=100, - parameters=default_pb_parameters, - stopping_parameters=default_pb_stop_parameters, - ) - - -@pytest.fixture -def 
default_fim_pb_batch(default_fim_pb_request): - return generate_pb2.Batch(id=0, requests=[default_fim_pb_request], size=1) - - -@pytest.mark.skip -def test_santacoder_generate_token_completion(default_santacoder, default_pb_batch): - batch = CausalLMBatch.from_pb( - default_pb_batch, - default_santacoder.tokenizer, - default_santacoder.dtype, - default_santacoder.device, - ) - next_batch = batch - - for _ in range(batch.stopping_criterias[0].max_new_tokens - 1): - generations, next_batch, _ = default_santacoder.generate_token(next_batch) - assert len(generations) == len(next_batch) - - generations, next_batch, _ = default_santacoder.generate_token(next_batch) - assert next_batch is None - - assert len(generations) == 1 - assert generations[0].generated_text.text == " test_get_all_users_with_" - assert generations[0].request_id == batch.requests[0].id - assert ( - generations[0].generated_text.generated_tokens - == batch.stopping_criterias[0].max_new_tokens - ) - - -@pytest.mark.skip -def test_fim_santacoder_generate_token_completion( - default_santacoder, default_fim_pb_batch -): - batch = CausalLMBatch.from_pb( - default_fim_pb_batch, - default_santacoder.tokenizer, - default_santacoder.dtype, - default_santacoder.device, - ) - next_batch = batch - - for _ in range(batch.stopping_criterias[0].max_new_tokens - 1): - generations, next_batch, _ = default_santacoder.generate_token(next_batch) - assert len(generations) == len(next_batch) - - generations, next_batch, _ = default_santacoder.generate_token(next_batch) - assert next_batch is None - - assert len(generations) == 1 - assert ( - generations[0].generated_text.text - == """ineProperty(exports, "__esModule", { value""" - ) - assert generations[0].request_id == batch.requests[0].id - assert ( - generations[0].generated_text.generated_tokens - == batch.stopping_criterias[0].max_new_tokens - ) diff --git a/backends/gaudi/server/tests/models/test_seq2seq_lm.py b/backends/gaudi/server/tests/models/test_seq2seq_lm.py deleted file mode 100644 index 5ba7c64ce..000000000 --- a/backends/gaudi/server/tests/models/test_seq2seq_lm.py +++ /dev/null @@ -1,365 +0,0 @@ -import pytest -import torch - -from copy import copy - -from transformers import AutoTokenizer - -from text_generation_server.pb import generate_pb2 -from text_generation_server.models.seq2seq_lm import Seq2SeqLM, Seq2SeqLMBatch - - -@pytest.fixture(scope="session") -def mt0_small_tokenizer(): - tokenizer = AutoTokenizer.from_pretrained( - "bigscience/mt0-small", padding_side="left" - ) - tokenizer.bos_token_id = 0 - return tokenizer - - -@pytest.fixture(scope="session") -def default_seq2seq_lm(): - return Seq2SeqLM.fallback("bigscience/mt0-small") - - -@pytest.fixture -def default_pb_request(default_pb_parameters, default_pb_stop_parameters): - return generate_pb2.Request( - id=0, - inputs="Test", - input_chunks=generate_pb2.Input(chunks=[generate_pb2.InputChunk(text="Test")]), - prefill_logprobs=True, - truncate=100, - parameters=default_pb_parameters, - stopping_parameters=default_pb_stop_parameters, - ) - - -@pytest.fixture -def default_pb_batch(default_pb_request): - return generate_pb2.Batch(id=0, requests=[default_pb_request], size=1) - - -@pytest.fixture -def default_seq2seq_lm_batch(default_pb_batch, mt0_small_tokenizer): - return Seq2SeqLMBatch.from_pb( - default_pb_batch, mt0_small_tokenizer, torch.float32, torch.device("cpu") - ) - - -@pytest.fixture -def default_multi_requests_seq2seq_lm_batch(default_pb_request, mt0_small_tokenizer): - req_0 = copy(default_pb_request) - 
req_0.id = 1 - req_1 = default_pb_request - req_1.id = 2 - req_1.stopping_parameters.max_new_tokens = 5 - - batch_pb = generate_pb2.Batch(id=0, requests=[req_0, req_1], size=2) - return Seq2SeqLMBatch.from_pb( - batch_pb, mt0_small_tokenizer, torch.float32, torch.device("cpu") - ) - - -def test_batch_from_pb(default_pb_batch, default_seq2seq_lm_batch): - batch = default_seq2seq_lm_batch - sequence_length = len(default_seq2seq_lm_batch.input_ids[0]) - - assert batch.batch_id == default_pb_batch.id - assert batch.requests == default_pb_batch.requests - - assert batch.input_ids.shape == (default_pb_batch.size, sequence_length) - assert batch.input_ids[0][-2] == 4268 - assert batch.input_ids[0][-1] == 1 - assert torch.all(batch.input_ids[0][:-2] == 0) - - assert torch.all(batch.attention_mask[0][-2:] == 1) - assert torch.all(batch.attention_mask[0][:-2] == 0) - - assert len(batch.decoder_input_ids) == default_pb_batch.size - assert batch.decoder_attention_mask is None - assert batch.encoder_last_hidden_state is None - - assert batch.past_key_values is None - - assert batch.input_lengths == [2] - assert batch.decoder_input_lengths == [1] - - assert len(batch) == default_pb_batch.size - assert len(batch.next_token_choosers) == len(batch.stopping_criterias) == len(batch) - - assert batch.max_input_length == batch.input_lengths[0] - assert batch.max_decoder_input_length == batch.decoder_input_lengths[0] - - -def test_batch_concatenate_no_prefill(default_seq2seq_lm_batch): - with pytest.raises(ValueError): - Seq2SeqLMBatch.concatenate([default_seq2seq_lm_batch, default_seq2seq_lm_batch]) - - -def test_seq2seq_lm_batch_type(default_seq2seq_lm): - assert default_seq2seq_lm.batch_type == Seq2SeqLMBatch - - -def test_seq2seq_lm_generate_token(default_seq2seq_lm, default_seq2seq_lm_batch): - sequence_length = len(default_seq2seq_lm_batch.input_ids[0]) - generations, next_batch, _ = default_seq2seq_lm.generate_token( - default_seq2seq_lm_batch - ) - - assert len(generations) == len(next_batch) - assert isinstance(next_batch, Seq2SeqLMBatch) - - assert next_batch.input_ids is None - assert torch.equal( - next_batch.attention_mask, default_seq2seq_lm_batch.attention_mask - ) - assert next_batch.input_lengths == default_seq2seq_lm_batch.input_lengths - assert next_batch.max_input_length == default_seq2seq_lm_batch.max_input_length - assert ( - next_batch.next_token_choosers == default_seq2seq_lm_batch.next_token_choosers - ) - assert next_batch.stopping_criterias == default_seq2seq_lm_batch.stopping_criterias - - assert len(next_batch.decoder_input_ids) == len(next_batch) - assert next_batch.all_decoder_input_ids[0][0] == 0 - assert next_batch.all_decoder_input_ids[0][1] == 259 - assert next_batch.decoder_attention_mask is None - assert next_batch.encoder_last_hidden_state.shape == (1, sequence_length, 512) - - assert next_batch.decoder_input_lengths == [2] - assert next_batch.max_decoder_input_length == 2 - - assert next_batch.past_key_values is not None - assert all( - [p[0].shape == (len(next_batch), 6, 1, 64) for p in next_batch.past_key_values] - ) - assert all( - [p[1].shape == (len(next_batch), 6, 1, 64) for p in next_batch.past_key_values] - ) - assert all( - [ - p[2].shape == (len(next_batch), 6, sequence_length, 64) - for p in next_batch.past_key_values - ] - ) - assert all( - [ - p[3].shape == (len(next_batch), 6, sequence_length, 64) - for p in next_batch.past_key_values - ] - ) - assert all([generation.generated_text is None for generation in generations]) - assert 
all([len(generation.prefill_tokens) == 1 for generation in generations]) - assert all( - [ - token_id.item() == 259 - for generation in generations - for token_id in generation.tokens.token_ids - ] - ) - assert all( - [ - token_text == " " - for generation in generations - for token_text in generation.tokens.texts - ] - ) - assert generations[0].request_id == 0 - - -def test_seq2seq_lm_generate_token_completion( - default_seq2seq_lm, default_seq2seq_lm_batch -): - next_batch = default_seq2seq_lm_batch - for _ in range(6): - generations, next_batch, _ = default_seq2seq_lm.generate_token(next_batch) - assert len(generations) == len(next_batch) - - generations, next_batch, _ = default_seq2seq_lm.generate_token(next_batch) - assert next_batch is None - - assert len(generations) == 1 - assert generations[0].generated_text.text == "a few weeks" - assert generations[0].request_id == default_seq2seq_lm_batch.requests[0].id - assert generations[0].generated_text.generated_tokens == 7 - - -def test_seq2seq_lm_generate_token_completion_multi( - default_seq2seq_lm, default_multi_requests_seq2seq_lm_batch -): - next_batch = default_multi_requests_seq2seq_lm_batch - - for i in range(4): - generations, next_batch, _ = default_seq2seq_lm.generate_token(next_batch) - assert len(generations) == len(next_batch) - - generations, next_batch, _ = default_seq2seq_lm.generate_token(next_batch) - assert next_batch is not None - - assert len(generations) == 2 - assert generations[1].generated_text.text == "a few " - assert ( - generations[1].request_id - == default_multi_requests_seq2seq_lm_batch.requests[1].id - ) - assert generations[1].generated_text.generated_tokens == 5 - - next_batch = next_batch.filter([next_batch.requests[0].id]) - - generations, next_batch, _ = default_seq2seq_lm.generate_token(next_batch) - assert len(generations) == len(next_batch) - - generations, next_batch, _ = default_seq2seq_lm.generate_token(next_batch) - assert next_batch is None - - assert len(generations) == 1 - assert generations[0].generated_text.text == "a few weeks" - assert ( - generations[0].request_id - == default_multi_requests_seq2seq_lm_batch.requests[0].id - ) - assert generations[0].generated_text.generated_tokens == 7 - - -def test_batch_concatenate( - default_seq2seq_lm, - default_seq2seq_lm_batch, - default_multi_requests_seq2seq_lm_batch, -): - next_batch_0 = default_seq2seq_lm_batch - _, next_batch_0, _ = default_seq2seq_lm.generate_token(next_batch_0) - _, next_batch_0, _ = default_seq2seq_lm.generate_token(next_batch_0) - - next_batch_1 = default_multi_requests_seq2seq_lm_batch - _, next_batch_1, _ = default_seq2seq_lm.generate_token(next_batch_1) - - # Copy hidden state because it is removed from the concatenated branches - next_batch_0_encoder_last_hidden_state = next_batch_0.encoder_last_hidden_state - next_batch_1_encoder_last_hidden_state = next_batch_1.encoder_last_hidden_state - - # Clone past_key_values before concatenating to compare after, - # because they are removed from the concatenated batches - next_batch_0_past_key_values = [ - [t.clone() for t in layer] for layer in next_batch_0.past_key_values - ] - next_batch_1_past_key_values = [ - [t.clone() for t in layer] for layer in next_batch_1.past_key_values - ] - - next_batch = Seq2SeqLMBatch.concatenate([next_batch_0, next_batch_1]) - - assert next_batch.batch_id == 0 - - assert torch.equal( - next_batch.decoder_input_ids[0], next_batch_0.decoder_input_ids[0] - ) - assert next_batch.all_decoder_input_ids[1][0] == 0 - assert 
next_batch.all_decoder_input_ids[2][0] == 0 - assert torch.equal( - next_batch.decoder_input_ids[1:, -2:], next_batch_1.decoder_input_ids - ) - - assert torch.all(next_batch.decoder_attention_mask[0, :3] == 1) - assert torch.all(next_batch.decoder_attention_mask[0, 3:] == 0) - assert torch.all(next_batch.decoder_attention_mask[1:, 0] == 0) - assert torch.all(next_batch.decoder_attention_mask[1:, 1:3] == 1) - - assert torch.equal( - next_batch.encoder_last_hidden_state[0], - next_batch_0_encoder_last_hidden_state[0, -2:], - ) - assert torch.equal( - next_batch.encoder_last_hidden_state[1:], - next_batch_1_encoder_last_hidden_state[:, -2:], - ) - - assert next_batch.input_lengths == [2, 2, 2] - assert next_batch.decoder_input_lengths == [3, 2, 2] - assert next_batch.max_input_length == 2 - assert next_batch.max_decoder_input_length == 3 - - assert next_batch.requests[0] == next_batch_0.requests[0] - assert next_batch.requests[1:] == list(next_batch_1.requests) - - assert next_batch.next_token_choosers[0] == next_batch_0.next_token_choosers[0] - assert next_batch.next_token_choosers[1:] == next_batch_1.next_token_choosers - - assert next_batch.stopping_criterias[0] == next_batch_0.stopping_criterias[0] - assert next_batch.stopping_criterias[1:] == next_batch_1.stopping_criterias - - assert next_batch.past_key_values is not None - assert all( - [p[0].shape == (len(next_batch), 6, 2, 64) for p in next_batch.past_key_values] - ) - assert all( - [p[1].shape == (len(next_batch), 6, 2, 64) for p in next_batch.past_key_values] - ) - assert all( - [p[2].shape == (len(next_batch), 6, 2, 64) for p in next_batch.past_key_values] - ) - assert all( - [p[3].shape == (len(next_batch), 6, 2, 64) for p in next_batch.past_key_values] - ) - - for i, past in enumerate(next_batch.past_key_values): - assert torch.equal(next_batch_0_past_key_values[i][0][0, :, -2:, :], past[0][0]) - assert torch.equal( - next_batch_1_past_key_values[i][0][:, :, -1:, :], past[0][1:, :, -1:, :] - ) - - assert torch.equal(next_batch_0_past_key_values[i][1][0, :, -2:, :], past[1][0]) - assert torch.equal( - next_batch_1_past_key_values[i][1][:, :, -1:, :], past[1][1:, :, -1:, :] - ) - - assert torch.equal(next_batch_0_past_key_values[i][2][0, :, -2:, :], past[2][0]) - assert torch.equal( - next_batch_1_past_key_values[i][2][:, :, -2:, :], past[2][1:] - ) - - assert torch.equal(next_batch_0_past_key_values[i][3][0, :, -2:, :], past[3][0]) - assert torch.equal( - next_batch_1_past_key_values[i][3][:, :, -2:, :], past[3][1:] - ) - - for _ in range(3): - generations, next_batch, _ = default_seq2seq_lm.generate_token(next_batch) - assert len(generations) == len(next_batch) - - generations, next_batch, _ = default_seq2seq_lm.generate_token(next_batch) - assert next_batch is not None - - assert len(generations) == 3 - assert generations[2].generated_text.text == "a few " - assert ( - generations[2].request_id - == default_multi_requests_seq2seq_lm_batch.requests[1].id - ) - assert generations[2].generated_text.generated_tokens == 5 - - next_batch = next_batch.filter( - [next_batch.requests[0].id, next_batch.requests[1].id] - ) - - generations, next_batch, _ = default_seq2seq_lm.generate_token(next_batch) - assert next_batch is not None - - assert len(generations) == 2 - assert generations[0].generated_text.text == "a few weeks" - assert generations[0].request_id == default_seq2seq_lm_batch.requests[0].id - assert generations[0].generated_text.generated_tokens == 7 - - next_batch = next_batch.filter([next_batch.requests[1].id]) - - 
generations, next_batch, _ = default_seq2seq_lm.generate_token(next_batch) - assert next_batch is None - - assert len(generations) == 1 - assert generations[0].generated_text.text == "a few weeks" - assert ( - generations[0].request_id - == default_multi_requests_seq2seq_lm_batch.requests[0].id - ) - assert generations[0].generated_text.generated_tokens == 7 diff --git a/backends/gaudi/server/tests/utils/test_adapter.py b/backends/gaudi/server/tests/utils/test_adapter.py deleted file mode 100644 index a27c10551..000000000 --- a/backends/gaudi/server/tests/utils/test_adapter.py +++ /dev/null @@ -1,247 +0,0 @@ -import pytest -from unittest.mock import Mock -from text_generation_server.utils.adapter import ( - get_attn_weights, - get_mlp_weights, - parse_lora_adapters, - AdapterInfo, -) - - -def test_parse_lora_adapters_empty(): - assert parse_lora_adapters(None) == [] - assert parse_lora_adapters("") == [] - - -def test_parse_lora_adapters_single(): - result = parse_lora_adapters("adapter1") - assert result == [AdapterInfo(id="adapter1", path=None, revision=None)] - - -def test_parse_lora_adapters_with_path(): - result = parse_lora_adapters("adapter1=path/to/adapter1") - assert result == [ - AdapterInfo(id="adapter1", path="path/to/adapter1", revision=None) - ] - - -def test_parse_lora_adapters_with_path_and_revision(): - result = parse_lora_adapters("adapter1=path/to/adapter1@main") - assert result == [ - AdapterInfo(id="adapter1", path="path/to/adapter1", revision="main") - ] - - -def test_parse_lora_adapters_multiple(): - result = parse_lora_adapters( - "adapter1,adapter2=path/to/adapter2,adapter3=path/to/adapter3@dev" - ) - assert result == [ - AdapterInfo(id="adapter1", path=None, revision=None), - AdapterInfo(id="adapter2", path="path/to/adapter2", revision=None), - AdapterInfo(id="adapter3", path="path/to/adapter3", revision="dev"), - ] - - -def test_parse_lora_adapters_invalid_format(): - try: - parse_lora_adapters("adapter1,invalid=format=test,adapter3") - assert False, "Should have raised ValueError" - except ValueError as e: - assert str(e) == "Invalid LoRA adapter format: invalid=format=test" - - -def test_get_attn_weights(): - # create a mock layer - mock_layer = Mock() - mock_layer.self_attn.query_key_value = Mock() - mock_layer.self_attn.o_proj = Mock() - - # call the function - result = get_attn_weights(2, mock_layer) - - # assert the result - expected = { - (2, "q_proj"): ( - "model.layers.2.self_attn.q_proj", - mock_layer.self_attn.query_key_value, - ), - (2, "k_proj"): ( - "model.layers.2.self_attn.k_proj", - mock_layer.self_attn.query_key_value, - ), - (2, "qkv_proj"): ( - "model.layers.2.self_attn.qkv_proj", - mock_layer.self_attn.query_key_value, - ), - (2, "v_proj"): ( - "model.layers.2.self_attn.v_proj", - mock_layer.self_attn.query_key_value, - ), - (2, "o_proj"): ("model.layers.2.self_attn.o_proj", mock_layer.self_attn.o_proj), - } - assert result == expected - - -def test_get_mlp_weights_with_gate_up_proj(): - # create a mock layer with gate_up_proj - mock_layer = Mock() - mock_layer.mlp.gate_up_proj = Mock() - mock_layer.mlp.down_proj = Mock() - - # call the function - result = get_mlp_weights(3, mock_layer) - - # assert the result - expected = { - (3, "gate_proj"): ("model.layers.3.mlp.gate_proj", mock_layer.mlp.gate_up_proj), - (3, "up_proj"): ("model.layers.3.mlp.up_proj", mock_layer.mlp.gate_up_proj), - (3, "down_proj"): ("model.layers.3.mlp.down_proj", mock_layer.mlp.down_proj), - } - assert result == expected - - -def 
test_get_mlp_weights_without_gate_up_proj(): - # create a mock layer without gate_up_proj - mock_layer = Mock() - mock_layer.mlp = Mock(spec=[]) - - # call the function - result = get_mlp_weights(1, mock_layer) - - # assert the result - assert result == {} - - -@pytest.mark.parametrize("layer_index", [0, 1, 5]) -def test_get_attn_weights_different_layers(layer_index): - mock_layer = Mock() - mock_layer.self_attn.query_key_value = Mock() - mock_layer.self_attn.o_proj = Mock() - - result = get_attn_weights(layer_index, mock_layer) - - for k in ["q", "k", "v"]: - assert (layer_index, f"{k}_proj") in result - assert ( - result[(layer_index, f"{k}_proj")][0] - == f"model.layers.{layer_index}.self_attn.{k}_proj" - ) - - assert (layer_index, "o_proj") in result - assert ( - result[(layer_index, "o_proj")][0] - == f"model.layers.{layer_index}.self_attn.o_proj" - ) - - -@pytest.mark.parametrize("layer_index", [0, 1, 5]) -def test_get_mlp_weights_different_layers(layer_index): - mock_layer = Mock() - mock_layer.mlp.gate_up_proj = Mock() - mock_layer.mlp.down_proj = Mock() - - result = get_mlp_weights(layer_index, mock_layer) - - for k in ["gate", "up", "down"]: - assert (layer_index, f"{k}_proj") in result - assert ( - result[(layer_index, f"{k}_proj")][0] - == f"model.layers.{layer_index}.mlp.{k}_proj" - ) - - -def test_get_attn_weights_llama_compatibility(): - mock_layer = Mock() - mock_layer.self_attn.query_key_value = Mock() - mock_layer.self_attn.o_proj = Mock() - - result = get_attn_weights(2, mock_layer) - - expected = { - (2, "q_proj"): ( - "model.layers.2.self_attn.q_proj", - mock_layer.self_attn.query_key_value, - ), - (2, "k_proj"): ( - "model.layers.2.self_attn.k_proj", - mock_layer.self_attn.query_key_value, - ), - (2, "qkv_proj"): ( - "model.layers.2.self_attn.qkv_proj", - mock_layer.self_attn.query_key_value, - ), - (2, "v_proj"): ( - "model.layers.2.self_attn.v_proj", - mock_layer.self_attn.query_key_value, - ), - (2, "o_proj"): ("model.layers.2.self_attn.o_proj", mock_layer.self_attn.o_proj), - } - assert result == expected - - -def test_get_mlp_weights_llama_compatibility(): - mock_layer = Mock() - mock_layer.mlp.gate_up_proj = Mock() - mock_layer.mlp.down_proj = Mock() - - result = get_mlp_weights(3, mock_layer) - - expected = { - (3, "gate_proj"): ("model.layers.3.mlp.gate_proj", mock_layer.mlp.gate_up_proj), - (3, "up_proj"): ("model.layers.3.mlp.up_proj", mock_layer.mlp.gate_up_proj), - (3, "down_proj"): ("model.layers.3.mlp.down_proj", mock_layer.mlp.down_proj), - } - assert result == expected - - -def test_get_attn_weights_gemma_compatibility(): - mock_layer = Mock() - mock_layer.self_attn.query_key_value = Mock() - mock_layer.self_attn.o_proj = Mock() - - result = get_attn_weights(2, mock_layer) - - expected = { - (2, "q_proj"): ( - "model.layers.2.self_attn.q_proj", - mock_layer.self_attn.query_key_value, - ), - (2, "k_proj"): ( - "model.layers.2.self_attn.k_proj", - mock_layer.self_attn.query_key_value, - ), - (2, "qkv_proj"): ( - "model.layers.2.self_attn.qkv_proj", - mock_layer.self_attn.query_key_value, - ), - (2, "v_proj"): ( - "model.layers.2.self_attn.v_proj", - mock_layer.self_attn.query_key_value, - ), - (2, "o_proj"): ("model.layers.2.self_attn.o_proj", mock_layer.self_attn.o_proj), - } - assert result == expected - - -def test_get_mlp_weights_gemma_compatibility(): - mock_layer = Mock() - mock_layer.mlp.gate_proj = Mock() - mock_layer.mlp.up_proj = Mock() - mock_layer.mlp.down_proj = Mock() - - # ensure that the mock_layer.mlp.gate_up_proj attribute does not 
exist. - # This is necessary because the use of `Mock` automatically creates any - # attributes that are accessed, even if they don't exist in the actual - # implementation. If `gate_up_proj` were created, `get_mlp_weights` might - # follow the wrong execution path and return an incorrect result. - del mock_layer.mlp.gate_up_proj - - result = get_mlp_weights(3, mock_layer) - - expected = { - (3, "gate_proj"): ("model.layers.3.mlp.gate_proj", mock_layer.mlp.gate_proj), - (3, "up_proj"): ("model.layers.3.mlp.up_proj", mock_layer.mlp.up_proj), - (3, "down_proj"): ("model.layers.3.mlp.down_proj", mock_layer.mlp.down_proj), - } - assert result == expected diff --git a/backends/gaudi/server/tests/utils/test_convert.py b/backends/gaudi/server/tests/utils/test_convert.py deleted file mode 100644 index ba6c57023..000000000 --- a/backends/gaudi/server/tests/utils/test_convert.py +++ /dev/null @@ -1,21 +0,0 @@ -from text_generation_server.utils.hub import ( - download_weights, - weight_hub_files, - weight_files, -) - -from text_generation_server.utils.convert import convert_files - - -def test_convert_files(): - model_id = "bigscience/bloom-560m" - pt_filenames = weight_hub_files(model_id, extension=".bin") - local_pt_files = download_weights(pt_filenames, model_id) - local_st_files = [ - p.parent / f"{p.stem.lstrip('pytorch_')}.safetensors" for p in local_pt_files - ] - convert_files(local_pt_files, local_st_files, discard_names=[]) - - found_st_files = weight_files(model_id) - - assert all([p in found_st_files for p in local_st_files]) diff --git a/backends/gaudi/server/tests/utils/test_hub.py b/backends/gaudi/server/tests/utils/test_hub.py deleted file mode 100644 index 291a41b05..000000000 --- a/backends/gaudi/server/tests/utils/test_hub.py +++ /dev/null @@ -1,103 +0,0 @@ -import os -import tempfile - -import pytest - -import huggingface_hub.constants - -import text_generation_server.utils.hub -from text_generation_server.utils.hub import ( - weight_hub_files, - download_weights, - weight_files, - EntryNotFoundError, - LocalEntryNotFoundError, - RevisionNotFoundError, -) - - -@pytest.fixture() -def offline(): - current_value = text_generation_server.utils.hub.HF_HUB_OFFLINE - text_generation_server.utils.hub.HF_HUB_OFFLINE = True - yield "offline" - text_generation_server.utils.hub.HF_HUB_OFFLINE = current_value - - -@pytest.fixture() -def fresh_cache(): - with tempfile.TemporaryDirectory() as d: - current_value = huggingface_hub.constants.HUGGINGFACE_HUB_CACHE - huggingface_hub.constants.HUGGINGFACE_HUB_CACHE = d - text_generation_server.utils.hub.HUGGINGFACE_HUB_CACHE = d - os.environ["HUGGINGFACE_HUB_CACHE"] = d - yield - huggingface_hub.constants.HUGGINGFACE_HUB_CACHE = current_value - os.environ["HUGGINGFACE_HUB_CACHE"] = current_value - text_generation_server.utils.hub.HUGGINGFACE_HUB_CACHE = current_value - - -@pytest.fixture() -def prefetched(): - model_id = "bert-base-uncased" - huggingface_hub.snapshot_download( - repo_id=model_id, - revision="main", - local_files_only=False, - repo_type="model", - allow_patterns=["*.safetensors"], - ) - yield model_id - - -def test_weight_hub_files_offline_error(offline, fresh_cache): - # If the model is not prefetched then it will raise an error - with pytest.raises(EntryNotFoundError): - weight_hub_files("gpt2") - - -def test_weight_hub_files_offline_ok(prefetched, offline): - # If the model is prefetched then we should be able to get the weight files from local cache - filenames = weight_hub_files(prefetched) - root = None - assert len(filenames) == 
1 - for f in filenames: - curroot, filename = os.path.split(f) - if root is None: - root = curroot - else: - assert root == curroot - assert filename == "model.safetensors" - - -def test_weight_hub_files(): - filenames = weight_hub_files("bigscience/bloom-560m") - assert filenames == ["model.safetensors"] - - -def test_weight_hub_files_llm(): - filenames = weight_hub_files("bigscience/bloom") - assert filenames == [f"model_{i:05d}-of-00072.safetensors" for i in range(1, 73)] - - -def test_weight_hub_files_empty(): - with pytest.raises(EntryNotFoundError): - weight_hub_files("bigscience/bloom", extension=".errors") - - -def test_download_weights(): - model_id = "bigscience/bloom-560m" - filenames = weight_hub_files(model_id) - files = download_weights(filenames, model_id) - local_files = weight_files("bigscience/bloom-560m") - assert files == local_files - - -def test_weight_files_revision_error(): - with pytest.raises(RevisionNotFoundError): - weight_files("bigscience/bloom-560m", revision="error") - - -def test_weight_files_not_cached_error(fresh_cache): - with pytest.raises(LocalEntryNotFoundError): - weight_files("bert-base-uncased") diff --git a/backends/gaudi/server/tests/utils/test_layers.py b/backends/gaudi/server/tests/utils/test_layers.py deleted file mode 100644 index 118540eed..000000000 --- a/backends/gaudi/server/tests/utils/test_layers.py +++ /dev/null @@ -1,82 +0,0 @@ -import torch -from text_generation_server.layers import ( - TensorParallelEmbedding, -) - - -class ProcessGroup: - def __init__(self, rank: int, world_size: int): - self._rank = rank - self.world_size = world_size - - def size(self) -> int: - return self.world_size - - def rank(self) -> int: - return self._rank - - -class Weights: - def __init__(self, rank: int, world_size: int, vocab_size: int, hidden_dim: int): - self.weight = ( - torch.arange(vocab_size * hidden_dim).float().view(vocab_size, hidden_dim) - ) - self.process_group = ProcessGroup(rank, world_size) - - def get_partial_sharded(self, name: str, dim: int): - assert dim == 0 - - rank = self.process_group.rank() - world_size = self.process_group.size() - size = self.weight.shape[dim] - - block_size = (size + world_size - 1) // world_size - start = rank * block_size - stop = (rank + 1) * block_size - return self.weight[start:stop] - - def get_shape(self, name: str): - return self.weight.shape - - -def test_weight_hub_files_offline_error(): - - vocab_size = 17 - weights = Weights( - rank=0, - world_size=1, - vocab_size=vocab_size, - hidden_dim=256, - ) - embeddings = TensorParallelEmbedding("", weights) - - input_ids = torch.arange(vocab_size) - output = embeddings.forward(input_ids) - assert embeddings.min_id == 0 - assert embeddings.max_id == 17 - torch.testing.assert_close(output, torch.arange(256 * 17).float().view(17, 256)) - - weights_0_2 = Weights(rank=0, world_size=2, vocab_size=vocab_size, hidden_dim=256) - weights_1_2 = Weights(rank=1, world_size=2, vocab_size=vocab_size, hidden_dim=256) - embeddings_0_2 = TensorParallelEmbedding("", weights_0_2, reduce=False) - assert embeddings_0_2.min_id == 0 - assert embeddings_0_2.max_id == 9 - torch.testing.assert_close( - embeddings_0_2.weight, - torch.cat([torch.arange(9 * 256), torch.zeros(256)], dim=0) - .view(10, 256) - .float(), - ) - embeddings_1_2 = TensorParallelEmbedding("", weights_1_2, reduce=False) - assert embeddings_1_2.min_id == 9 - assert embeddings_1_2.max_id == 17 - torch.testing.assert_close( - embeddings_1_2.weight, - torch.cat([torch.arange(8 * 256) + 9 * 256, torch.zeros(256)], 
dim=0) - .view(9, 256) - .float(), - ) - output_tp_0 = embeddings_0_2.forward(input_ids) - output_tp_1 = embeddings_1_2.forward(input_ids) - - torch.testing.assert_close(output, output_tp_0 + output_tp_1) diff --git a/backends/gaudi/server/tests/utils/test_tokens.py b/backends/gaudi/server/tests/utils/test_tokens.py deleted file mode 100644 index 5db327767..000000000 --- a/backends/gaudi/server/tests/utils/test_tokens.py +++ /dev/null @@ -1,88 +0,0 @@ -import torch -from text_generation_server.utils.tokens import ( - StopSequenceCriteria, - StoppingCriteria, - FinishReason, - batch_top_tokens, -) - - -def test_stop_sequence_criteria(): - criteria = StopSequenceCriteria("/test;") - - assert not criteria("/") - assert not criteria("/test") - assert criteria("/test;") - assert not criteria("/test; ") - - -def test_stop_sequence_criteria_escape(): - criteria = StopSequenceCriteria("<|stop|>") - - assert not criteria("<") - assert not criteria("<|stop") - assert criteria("<|stop|>") - assert not criteria("<|stop|> ") - - -def test_stopping_criteria(): - criteria = StoppingCriteria(0, [StopSequenceCriteria("/test;")], max_new_tokens=5) - assert criteria(65827, "/test") == (False, None) - assert criteria(30, ";") == (True, FinishReason.FINISH_REASON_STOP_SEQUENCE) - - -def test_stopping_criteria_eos(): - criteria = StoppingCriteria(0, [StopSequenceCriteria("/test;")], max_new_tokens=5) - assert criteria(1, "") == (False, None) - assert criteria(0, "") == (True, FinishReason.FINISH_REASON_EOS_TOKEN) - - -def test_stopping_criteria_max(): - criteria = StoppingCriteria(0, [StopSequenceCriteria("/test;")], max_new_tokens=5) - assert criteria(1, "") == (False, None) - assert criteria(1, "") == (False, None) - assert criteria(1, "") == (False, None) - assert criteria(1, "") == (False, None) - assert criteria(1, "") == (True, FinishReason.FINISH_REASON_LENGTH) - - -def test_batch_top_tokens(): - top_n_tokens = [0, 2, 3, 4, 5] - top_n_tokens_tensor = torch.tensor(top_n_tokens) - inp_logprobs = torch.tensor([[-1.0, -3.0, -4.0, -2.0, -3.0]] * 5) - accepted_ids = torch.ones_like(top_n_tokens_tensor) - - topn_tok_ids, topn_tok_logprobs = batch_top_tokens( - top_n_tokens, top_n_tokens_tensor, inp_logprobs, accepted_ids - ) - - assert topn_tok_ids[0] == [[]] - assert topn_tok_ids[1] == [[0, 3]] - assert topn_tok_ids[2] == [[0, 3, 1, 4]] - assert topn_tok_ids[3] == [[0, 3, 1, 4]] - assert topn_tok_ids[4] == [[0, 3, 1, 4, 2]] - - assert topn_tok_logprobs[0] == [[]] - assert topn_tok_logprobs[1] == [[-1, -2]] - assert topn_tok_logprobs[2] == [[-1, -2, -3, -3]] - assert topn_tok_logprobs[3] == [[-1, -2, -3, -3]] - assert topn_tok_logprobs[4] == [[-1, -2, -3, -3, -4]] - - # Now let's make second member of the batch be speculated - inp_logprobs = torch.tensor([[-1.0, -3.0, -4.0, -2.0, -3.0]] * 5 * 2) - accepted_ids[1] = 2 - topn_tok_ids, topn_tok_logprobs = batch_top_tokens( - top_n_tokens, top_n_tokens_tensor, inp_logprobs, accepted_ids - ) - - assert topn_tok_ids[0] == [[]] - assert topn_tok_ids[1] == [[0, 3], [0, 3]] - assert topn_tok_ids[2] == [[0, 3, 1, 4]] - assert topn_tok_ids[3] == [[0, 3, 1, 4]] - assert topn_tok_ids[4] == [[0, 3, 1, 4, 2]] - - assert topn_tok_logprobs[0] == [[]] - assert topn_tok_logprobs[1] == [[-1, -2], [-1, -2]] - assert topn_tok_logprobs[2] == [[-1, -2, -3, -3]] - assert topn_tok_logprobs[3] == [[-1, -2, -3, -3]] - assert topn_tok_logprobs[4] == [[-1, -2, -3, -3, -4]] diff --git a/backends/gaudi/server/tests/utils/test_watermark.py b/backends/gaudi/server/tests/utils/test_watermark.py 
deleted file mode 100644 index 5d0aaf050..000000000 --- a/backends/gaudi/server/tests/utils/test_watermark.py +++ /dev/null @@ -1,54 +0,0 @@ -# test_watermark_logits_processor.py - -import os -import numpy as np -import torch -from text_generation_server.utils.watermark import WatermarkLogitsProcessor - - -GAMMA = os.getenv("WATERMARK_GAMMA", 0.5) -DELTA = os.getenv("WATERMARK_DELTA", 2.0) - - -def test_seed_rng(): - input_ids = [101, 2036, 3731, 102, 2003, 103] - processor = WatermarkLogitsProcessor() - processor._seed_rng(input_ids) - assert isinstance(processor.rng, torch.Generator) - - -def test_get_greenlist_ids(): - input_ids = [101, 2036, 3731, 102, 2003, 103] - processor = WatermarkLogitsProcessor() - result = processor._get_greenlist_ids(input_ids, 10, torch.device("cpu")) - assert max(result) <= 10 - assert len(result) == int(10 * 0.5) - - -def test_calc_greenlist_mask(): - processor = WatermarkLogitsProcessor() - scores = torch.tensor([[0.5, 0.3, 0.2, 0.8], [0.1, 0.2, 0.7, 0.9]]) - greenlist_token_ids = torch.tensor([2, 3]) - result = processor._calc_greenlist_mask(scores, greenlist_token_ids) - assert result.tolist() == [[False, False, False, False], [False, False, True, True]] - assert result.shape == scores.shape - - -def test_bias_greenlist_logits(): - processor = WatermarkLogitsProcessor() - scores = torch.tensor([[0.5, 0.3, 0.2, 0.8], [0.1, 0.2, 0.7, 0.9]]) - green_tokens_mask = torch.tensor( - [[False, False, True, True], [False, False, False, True]] - ) - greenlist_bias = 2.0 - result = processor._bias_greenlist_logits(scores, green_tokens_mask, greenlist_bias) - assert np.allclose(result.tolist(), [[0.5, 0.3, 2.2, 2.8], [0.1, 0.2, 0.7, 2.9]]) - assert result.shape == scores.shape - - -def test_call(): - input_ids = [101, 2036, 3731, 102, 2003, 103] - processor = WatermarkLogitsProcessor() - scores = torch.tensor([[0.5, 0.3, 0.2, 0.8], [0.1, 0.2, 0.7, 0.9]]) - result = processor(input_ids, scores) - assert result.shape == scores.shape diff --git a/backends/gaudi/server/tests/utils/test_weights.py b/backends/gaudi/server/tests/utils/test_weights.py deleted file mode 100644 index 556fcea1e..000000000 --- a/backends/gaudi/server/tests/utils/test_weights.py +++ /dev/null @@ -1,1177 +0,0 @@ -import pytest -import torch -from text_generation_server.utils.weights import ( - DefaultWeightsLoader, - Weights, - WeightsLoader, -) -from text_generation_server.layers.gptq import GPTQWeight, GPTQWeightsLoader -from text_generation_server.layers.exl2 import Exl2Weight, Exl2WeightsLoader -from text_generation_server.layers.marlin.marlin import ( - MarlinWeight, - MarlinWeightsLoader, -) -from types import SimpleNamespace -from typing import List, Optional, Dict, Union -from pathlib import Path - - -@pytest.fixture -def gptq_weights_loader(): - return GPTQWeightsLoader( - bits=4, - groupsize=-1, - desc_act=False, - quant_method="gptq", - quantize="gptq", - sym=True, - ) - - -@pytest.fixture -def gptq_weights_loader_awq(): - return GPTQWeightsLoader( - bits=4, - groupsize=-1, - desc_act=False, - quant_method="awq", - quantize="awq", - sym=True, - ) - - -@pytest.fixture -def marlin_weights_loader(): - return MarlinWeightsLoader(bits=4, is_marlin_24=False) - - -dummy_file_system = { - "test_weights": { - "layer.0.weight": torch.tensor( - [ - [1, 2], - [3, 4], - ], - dtype=torch.float32, - ), - }, - "test_weights_2": { - "layer.1337.weight": torch.tensor( - [ - [1, 2, 3, 4], - [5, 6, 7, 8], - ], - dtype=torch.float32, - ), - }, - "test_get_weights_col_packed": { - "weight.weight": 
torch.tensor( - [ - [1, 2], - [3, 4], - [5, 6], - [7, 8], - ], - dtype=torch.float32, - ), - }, - "test_get_multi_weights_col": { - "weight.weight": torch.tensor( - [ - [1, 2], - [3, 4], - [5, 6], - [7, 8], - ], - dtype=torch.float32, - ), - }, - "test_get_weights_row": { - "weight.weight": torch.tensor( - [ - [1, 2], - [3, 4], - [5, 6], - [7, 8], - ], - dtype=torch.float32, - ), - }, - "test_get_weights_col_gptq": { - "weight.qweight": torch.tensor( - [ - [1, 2], - [3, 4], - [5, 6], - [7, 8], - ], - dtype=torch.float32, - ), - "weight.g_idx": torch.tensor([0, 1, 0, 1], dtype=torch.int32), - "weight.qzeros": torch.tensor( - [ - [0, 1], - [1, 0], - ], - dtype=torch.int32, - ), - "weight.scales": torch.tensor( - [ - [100.0, 100.0], - [100.0, 100.0], - ], - dtype=torch.float16, - ), - "gptq_bits": torch.tensor([8], dtype=torch.float32), - "gptq_groupsize": torch.tensor([2], dtype=torch.float32), - }, - "test_get_weights_col_marlin": { - "weight.B": torch.tensor([[1, 2], [3, 4]], dtype=torch.int32), - "weight.s": torch.tensor([[0.5000], [0.2500]], dtype=torch.float16), - }, - "test_get_weights_row_gptq": { - "weight.qweight": torch.tensor( - [ - [1, 2], - [3, 4], - [5, 6], - [7, 8], - ], - dtype=torch.int32, - ), - "weight.g_idx": torch.tensor([0, 1, 0, 1], dtype=torch.int32), - "weight.qzeros": torch.tensor( - [ - [0, 1], - [1, 0], - ], - dtype=torch.int32, - ), - "weight.scales": torch.tensor( - [ - [100.0, 100.0], - [100.0, 100.0], - ], - dtype=torch.float16, - ), - "gptq_bits": torch.tensor([8], dtype=torch.float32), - "gptq_groupsize": torch.tensor([2], dtype=torch.float32), - }, - "test_get_multi_weights_col_gptq": { - "weight.qweight": torch.tensor( - [ - [1, 2], - [3, 4], - [5, 6], - [7, 8], - ], - dtype=torch.int32, - ), - "weight.g_idx": torch.tensor([0, 1, 0, 1], dtype=torch.int32), - "weight.qzeros": torch.tensor( - [ - [0, 1], - [1, 0], - ], - dtype=torch.int32, - ), - "weight.scales": torch.tensor( - [ - [100.0, 100.0], - [100.0, 100.0], - ], - dtype=torch.float16, - ), - "gptq_bits": torch.tensor([8], dtype=torch.float32), - "gptq_groupsize": torch.tensor([2], dtype=torch.float32), - }, - "test_get_weights_col_packed_gptq": { - "weight.qweight": torch.tensor( - [ - [1, 2], - [3, 4], - [5, 6], - [7, 8], - ], - dtype=torch.int32, - ), - "weight.g_idx": torch.tensor([0, 1, 0, 1], dtype=torch.int32), - "weight.qzeros": torch.tensor( - [ - [0, 1], - [1, 0], - ], - dtype=torch.int32, - ), - "weight.scales": torch.tensor( - [ - [100.0, 100.0], - [100.0, 100.0], - ], - dtype=torch.float16, - ), - "gptq_bits": torch.tensor([8], dtype=torch.float32), - "gptq_groupsize": torch.tensor([2], dtype=torch.float32), - }, - "test_get_weights_col_packed_exl2": { - "weight.q_weight": torch.tensor( - [ - [1, 2], - [3, 4], - [5, 6], - [7, 8], - ], - dtype=torch.int32, - ), - "weight.q_scale": torch.tensor([8], dtype=torch.int32), - "weight.q_invperm": torch.tensor([1, 0, 3, 2], dtype=torch.int32), - "weight.q_scale_max": torch.tensor([100], dtype=torch.float16), - "weight.q_groups": torch.tensor([4], dtype=torch.int16), - }, - "test_get_weights_row_exl2": { - "weight.q_weight": torch.tensor( - [ - [1, 2], - [3, 4], - [5, 6], - [7, 8], - ], - dtype=torch.int32, - ), - "weight.q_scale": torch.tensor([8], dtype=torch.int32), - "weight.q_invperm": torch.tensor([1, 0, 3, 2], dtype=torch.int32), - "weight.q_scale_max": torch.tensor([100], dtype=torch.float16), - "weight.q_groups": torch.tensor([4], dtype=torch.int16), - }, - "test_get_multi_weights_col_exl2": { - "weight.q_weight": torch.tensor( - [ - [1, 
2], - [3, 4], - [5, 6], - [7, 8], - ], - dtype=torch.int32, - ), - "weight.q_scale": torch.tensor([8], dtype=torch.int32), - "weight.q_invperm": torch.tensor([1, 0, 3, 2], dtype=torch.int32), - "weight.q_scale_max": torch.tensor([100], dtype=torch.float16), - "weight.q_groups": torch.tensor([4], dtype=torch.int16), - }, - "test_get_weights_col_exl2": { - "weight.q_weight": torch.tensor( - [ - [1, 2], - [3, 4], - [5, 6], - [7, 8], - ], - dtype=torch.int32, - ), - "weight.q_scale": torch.tensor([8], dtype=torch.int32), - "weight.q_invperm": torch.tensor([1, 0, 3, 2], dtype=torch.int32), - "weight.q_scale_max": torch.tensor([100], dtype=torch.float16), - "weight.q_groups": torch.tensor([4], dtype=torch.int16), - }, - "test_get_weights_row_marlin": { - "weight.B": torch.tensor([[1, 2], [3, 4]], dtype=torch.int32), - "weight.s": torch.tensor([[0.5], [0.25]], dtype=torch.float16), - }, - "test_get_multi_weights_col_marlin": { - "weight.B": torch.tensor([[1, 2], [3, 4]], dtype=torch.int32), - "weight.s": torch.tensor([[0.5], [0.25]], dtype=torch.float16), - }, - "test_get_weights_col_packed_marlin": { - "weight.B": torch.tensor([[1, 2], [3, 4]], dtype=torch.int32), - "weight.s": torch.tensor([[0.5], [0.25]], dtype=torch.float16), - }, -} - - -class MockSlice: - def __init__(self, tensor): - self.tensor = tensor - - def get_shape(self): - return self.tensor.shape - - def __getitem__(self, idx): - return self.tensor[idx] - - -def mock_get_slice(tensor_name, filename): - tensor = dummy_file_system[filename][tensor_name] - return MockSlice(tensor) - - -def mock_handle(filename, device, dtype): - return SimpleNamespace( - get_slice=lambda tensor_name: mock_get_slice(tensor_name, filename) - ) - - -class MockSafeOpen: - def __init__(self, filename, framework, dummy_fs): - self.filename = filename - self.framework = framework - self.dummy_fs = dummy_fs - - def keys(self): - return list(self.dummy_fs[self.filename].keys()) - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - pass - - -class MockWeights(Weights): - def __init__( - self, - filenames: List[Union[Path, str]], - device, - dtype, - process_group, - dummy_fs, - aliases: Optional[Dict[str, List[str]]] = None, - prefix: Optional[str] = None, - weights_loader: Optional[WeightsLoader] = None, - ): - routing = {} - self.dummy_fs = dummy_fs - for filename in filenames: - with MockSafeOpen(filename, framework="pytorch", dummy_fs=dummy_fs) as f: - for k in f.keys(): - if k in routing: - raise RuntimeError( - f"Key {k} was found in multiple files: {filename} and {routing[k]}" - ) - routing[k] = filename - if aliases is None: - aliases = {} - self.aliases = aliases - self.routing = routing - self.device = device - self.dtype = dtype - self.process_group = process_group - self.prefix = prefix - self.weights_loader = ( - # We don't need to get linear layers, so just wrap raw tensors. 
- DefaultWeightsLoader(lambda x: x) - if weights_loader is None - else weights_loader - ) - self._handles = {} - - def _get_handle(self, filename: Union[Path, str]): - if filename in self._handles: - return self._handles[filename] - else: - handle = mock_handle(filename, self.device, self.dtype) - self._handles[filename] = handle - return handle - - def get_shape(self, tensor_name: str): - filename, _ = self.get_filename(tensor_name) - handle = self._get_handle(filename) - return handle.get_slice(tensor_name).get_shape() - - def get_tensor(self, tensor_name: str): - filename, _ = self.get_filename(tensor_name) - handle = self._get_handle(filename) - return handle.get_slice(tensor_name).tensor - - -dummy_process_group = SimpleNamespace(rank=lambda: 0, size=lambda: 1) - - -def test_weights(): - weights = MockWeights( - [ - "test_weights", - "test_weights_2", - ], - device="cpu", - dtype=torch.float32, - process_group=dummy_process_group, - dummy_fs=dummy_file_system, - ) - assert weights.get_shape("layer.0.weight") == (2, 2) - assert weights.get_tensor("layer.1337.weight").shape == (2, 4) - - -def test_get_tensor(): - weights = MockWeights( - [ - "test_weights", - "test_weights_2", - ], - device="cpu", - dtype=torch.float32, - process_group=dummy_process_group, - dummy_fs=dummy_file_system, - ) - assert torch.allclose( - weights.get_tensor("layer.0.weight"), - torch.tensor( - [ - [1, 2], - [3, 4], - ], - dtype=torch.float32, - ), - ) - assert torch.allclose( - weights.get_tensor("layer.1337.weight"), - torch.tensor( - [ - [1, 2, 3, 4], - [5, 6, 7, 8], - ], - dtype=torch.float32, - ), - ) - - -def test_get_weights_col_packed(): - - weights = MockWeights( - [ - "test_get_weights_col_packed", - ], - device="cpu", - dtype=torch.float32, - process_group=dummy_process_group, - dummy_fs=dummy_file_system, - ) - - prefix = "weight" - block_sizes = 1 - - w = weights.get_weights_col_packed( - prefix=prefix, - block_sizes=block_sizes, - ) - - assert torch.allclose( - w, - torch.tensor( - [ - [1, 2], - [3, 4], - [5, 6], - [7, 8], - ], - dtype=torch.float32, - ), - ) - - -def test_get_weights_col_packed_block_size(): - - weights = MockWeights( - [ - "test_get_weights_col_packed", - ], - device="cpu", - dtype=torch.float32, - process_group=dummy_process_group, - dummy_fs=dummy_file_system, - ) - - prefix = "weight" - block_sizes = 2 - - w = weights.get_weights_col_packed( - prefix=prefix, - block_sizes=block_sizes, - ) - - assert torch.allclose( - w, - torch.tensor( - [ - [1, 2], - [3, 4], - [5, 6], - [7, 8], - ], - dtype=torch.float32, - ), - ) - - -def test_get_weights_col_packed_block_size_arr(): - - weights = MockWeights( - [ - "test_get_weights_col_packed", - ], - device="cpu", - dtype=torch.float32, - process_group=dummy_process_group, - dummy_fs=dummy_file_system, - ) - - prefix = "weight" - block_sizes = [1, 1] - - w = weights.get_weights_col_packed( - prefix=prefix, - block_sizes=block_sizes, - ) - - assert torch.allclose( - w, - torch.tensor( - [ - [1, 2], - [3, 4], - [5, 6], - [7, 8], - ], - dtype=torch.float32, - ), - ) - - -def test_get_multi_weights_col(): - weights = MockWeights( - [ - "test_get_multi_weights_col", - ], - device="cpu", - dtype=torch.float32, - process_group=dummy_process_group, - dummy_fs=dummy_file_system, - ) - - prefixes = ["weight", "weight"] - - w = weights.get_multi_weights_col( - prefixes=prefixes, - dim=0, - ) - - assert torch.allclose( - w, - torch.tensor( - [ - [1, 2], - [3, 4], - [5, 6], - [7, 8], - [1, 2], - [3, 4], - [5, 6], - [7, 8], - ], - 
dtype=torch.float32, - ), - ) - - -def test_get_weights_row(): - weights = MockWeights( - [ - "test_get_weights_row", - ], - device="cpu", - dtype=torch.float32, - process_group=dummy_process_group, - dummy_fs=dummy_file_system, - ) - - prefix = "weight" - - w = weights.get_weights_row( - prefix=prefix, - ) - - assert torch.allclose( - w, - torch.tensor( - [[1.0, 2.0], [3.0, 4.0], [5.0, 6.0], [7.0, 8.0]], - dtype=torch.float32, - ), - ) - - -# test_get_weights_col - - -def test_get_weights_col_awq(gptq_weights_loader_awq): - weights = MockWeights( - [ - "test_get_weights_col_gptq", - ], - device="cpu", - dtype=torch.float32, - process_group=dummy_process_group, - dummy_fs=dummy_file_system, - weights_loader=gptq_weights_loader_awq, - ) - - prefix = "weight" - - w = weights.get_weights_col( - prefix=prefix, - ) - - expected_weight = GPTQWeight( - qweight=torch.tensor([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0], [7.0, 8.0]]), - qzeros=torch.tensor([[0, 1], [1, 0]], dtype=torch.int32), - scales=torch.tensor( - [[100.0, 100.0], [100.0, 100.0]], - dtype=torch.float16, - ), - g_idx=None, - bits=8.0, - groupsize=2.0, - use_awq_kernel=True, - use_exllama=False, - ) - - assert torch.allclose(w.qweight, expected_weight.qweight), "qweight mismatch" - assert torch.allclose(w.qzeros, expected_weight.qzeros), "qzeros mismatch" - assert torch.allclose(w.scales, expected_weight.scales), "scales mismatch" - assert w.g_idx == expected_weight.g_idx, "g_idx mismatch" - assert w.bits == expected_weight.bits, "bits mismatch" - assert w.groupsize == expected_weight.groupsize, "groupsize mismatch" - assert w.use_awq_kernel == expected_weight.use_awq_kernel, "use_awq_kernel mismatch" - assert w.use_exllama == expected_weight.use_exllama, "use_exllama mismatch" - - -def test_get_weights_col_gtpq(gptq_weights_loader): - weights = MockWeights( - [ - "test_get_weights_col_gptq", - ], - device="cpu", - dtype=torch.float32, - process_group=dummy_process_group, - dummy_fs=dummy_file_system, - weights_loader=gptq_weights_loader, - ) - - prefix = "weight" - - w = weights.get_weights_col( - prefix=prefix, - ) - - expected_weight = GPTQWeight( - qweight=torch.tensor([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0], [7.0, 8.0]]), - qzeros=torch.tensor([[0, 1], [1, 0]], dtype=torch.int32), - scales=torch.tensor([[100.0, 100.0], [100.0, 100.0]], dtype=torch.float16), - g_idx=torch.tensor([0, 1, 0, 1], dtype=torch.int32), - bits=8.0, - groupsize=2.0, - use_awq_kernel=False, - use_exllama=False, - ) - - assert torch.allclose(w.qweight, expected_weight.qweight), "qweight mismatch" - assert torch.allclose(w.qzeros, expected_weight.qzeros), "qzeros mismatch" - assert torch.allclose(w.scales, expected_weight.scales), "scales mismatch" - assert torch.allclose(w.g_idx, expected_weight.g_idx), "g_idx mismatch" - assert w.bits == expected_weight.bits, "bits mismatch" - assert w.groupsize == expected_weight.groupsize, "groupsize mismatch" - assert w.use_awq_kernel == expected_weight.use_awq_kernel, "use_awq_kernel mismatch" - assert w.use_exllama == expected_weight.use_exllama, "use_exllama mismatch" - - -def test_get_weights_col_exl2(): - weights = MockWeights( - [ - "test_get_weights_col_exl2", - ], - device="cpu", - dtype=torch.float32, - process_group=dummy_process_group, - dummy_fs=dummy_file_system, - weights_loader=Exl2WeightsLoader(), - ) - - prefix = "weight" - - w = weights.get_weights_col( - prefix=prefix, - ) - - scaled_scale_max = 0.3906 * 256 - expected_weight = Exl2Weight( - q_weight=torch.tensor([[1, 2], [3, 4], [5, 6], [7, 8]], 
dtype=torch.int32), - q_scale=torch.tensor([8], dtype=torch.int32), - q_invperm=torch.tensor([1, 0, 3, 2], dtype=torch.int16), - q_scale_max=torch.tensor([scaled_scale_max], dtype=torch.float16), - q_groups=torch.tensor([4], dtype=torch.int16), - ) - - assert torch.allclose(w.q_weight, expected_weight.q_weight), "q_weight mismatch" - assert torch.allclose(w.q_scale, expected_weight.q_scale), "q_scale mismatch" - assert torch.allclose(w.q_invperm, expected_weight.q_invperm), "q_invperm mismatch" - assert torch.allclose( - w.q_scale_max, expected_weight.q_scale_max - ), "q_scale_max mismatch" - assert torch.allclose(w.q_groups, expected_weight.q_groups), "q_groups mismatch" - - -def test_get_weights_col_marlin(marlin_weights_loader): - weights = MockWeights( - [ - "test_get_weights_col_marlin", - ], - device="cpu", - dtype=torch.float16, - process_group=dummy_process_group, - dummy_fs=dummy_file_system, - weights_loader=marlin_weights_loader, - ) - - prefix = "weight" - - w = weights.get_weights_col( - prefix=prefix, - ) - - expected_weight = MarlinWeight( - B=torch.tensor([[1, 2], [3, 4]], dtype=torch.int32), - s=torch.tensor([[0.5000], [0.2500]], dtype=torch.float16), - ) - - assert torch.allclose(w.B, expected_weight.B), "B mismatch" - assert torch.allclose(w.s, expected_weight.s), "s mismatch" - - -# test_get_weights_col_packed - - -def test_get_weights_col_packed_awq(gptq_weights_loader_awq): - weights = MockWeights( - [ - "test_get_weights_col_packed_gptq", - ], - device="cpu", - dtype=torch.float32, - process_group=dummy_process_group, - dummy_fs=dummy_file_system, - weights_loader=gptq_weights_loader_awq, - ) - - prefix = "weight" - block_sizes = 1 - - w = weights.get_weights_col_packed( - prefix=prefix, - block_sizes=block_sizes, - ) - - expected_weight = GPTQWeight( - qweight=torch.tensor([[1, 2], [3, 4], [5, 6], [7, 8]], dtype=torch.int32), - qzeros=torch.tensor([[0, 1], [1, 0]], dtype=torch.int32), - scales=torch.tensor([[100.0, 100.0], [100.0, 100.0]], dtype=torch.float16), - g_idx=None, - bits=8.0, - groupsize=2.0, - use_awq_kernel=True, - use_exllama=False, - ) - - assert torch.allclose(w.qweight, expected_weight.qweight), "qweight mismatch" - assert torch.allclose(w.qzeros, expected_weight.qzeros), "qzeros mismatch" - assert torch.allclose(w.scales, expected_weight.scales), "scales mismatch" - assert w.g_idx == expected_weight.g_idx, "g_idx mismatch" - assert w.bits == expected_weight.bits, "bits mismatch" - assert w.groupsize == expected_weight.groupsize, "groupsize mismatch" - assert w.use_awq_kernel == expected_weight.use_awq_kernel, "use_awq_kernel mismatch" - assert w.use_exllama == expected_weight.use_exllama, "use_exllama mismatch" - - -@pytest.mark.skip(reason="Review expected functionality") -def test_get_weights_col_packed_exl2(): - weights = MockWeights( - [ - "test_get_weights_col_packed_exl2", - ], - device="cpu", - dtype=torch.float32, - process_group=dummy_process_group, - dummy_fs=dummy_file_system, - weights_loader=Exl2WeightsLoader(), - ) - - prefix = "weight" - block_sizes = 1 - - w = weights.get_weights_col_packed( - prefix=prefix, - block_sizes=block_sizes, - ) - - scaled_scale_max = 0.3906 * 256 - expected_weight = Exl2Weight( - q_weight=torch.tensor([[1, 2], [3, 4], [5, 6], [7, 8]], dtype=torch.int32), - q_scale=torch.tensor([8], dtype=torch.int32), - q_invperm=torch.tensor([1], dtype=torch.int16), - q_scale_max=torch.tensor([scaled_scale_max], dtype=torch.float16), - q_groups=torch.tensor([4], dtype=torch.int16), - ) - - assert 
torch.allclose(w.q_weight, expected_weight.q_weight), "q_weight mismatch" - assert torch.allclose(w.q_scale, expected_weight.q_scale), "q_scale mismatch" - assert torch.allclose(w.q_invperm, expected_weight.q_invperm), "q_invperm mismatch" - assert torch.allclose( - w.q_scale_max, expected_weight.q_scale_max - ), "q_scale_max mismatch" - assert torch.allclose(w.q_groups, expected_weight.q_groups), "q_groups mismatch" - - -def test_get_weights_col_packed_gptq(gptq_weights_loader): - weights = MockWeights( - [ - "test_get_weights_col_packed_gptq", - ], - device="cpu", - dtype=torch.float32, - process_group=dummy_process_group, - dummy_fs=dummy_file_system, - weights_loader=gptq_weights_loader, - ) - - prefixes = ["weight"] - - w = weights.get_multi_weights_col( - prefixes=prefixes, - dim=0, - ) - - expected_weight = GPTQWeight( - qweight=torch.tensor([[1, 2], [3, 4], [5, 6], [7, 8]], dtype=torch.int32), - qzeros=torch.tensor([[0, 1], [1, 0]], dtype=torch.int32), - scales=torch.tensor([[100.0, 100.0], [100.0, 100.0]], dtype=torch.float16), - g_idx=torch.tensor([0, 1, 0, 1], dtype=torch.int32), - bits=8.0, - groupsize=2.0, - use_awq_kernel=False, - use_exllama=False, - ) - - assert torch.allclose(w.qweight, expected_weight.qweight), "qweight mismatch" - assert torch.allclose(w.qzeros, expected_weight.qzeros), "qzeros mismatch" - assert torch.allclose(w.scales, expected_weight.scales), "scales mismatch" - assert torch.allclose(w.g_idx, expected_weight.g_idx), "g_idx mismatch" - assert w.bits == expected_weight.bits, "bits mismatch" - assert w.groupsize == expected_weight.groupsize, "groupsize mismatch" - assert w.use_awq_kernel == expected_weight.use_awq_kernel, "use_awq_kernel mismatch" - assert w.use_exllama == expected_weight.use_exllama, "use_exllama mismatch" - - -def test_get_weights_col_packed_marlin(marlin_weights_loader): - weights = MockWeights( - [ - "test_get_weights_col_packed_marlin", - ], - device="cpu", - dtype=torch.float16, - process_group=dummy_process_group, - dummy_fs=dummy_file_system, - weights_loader=marlin_weights_loader, - ) - - prefix = "weight" - - w = weights.get_multi_weights_col( - prefixes=[prefix], - dim=0, - ) - - expected_weight = MarlinWeight( - B=torch.tensor([[1, 2], [3, 4]], dtype=torch.int32), - s=torch.tensor([[0.5000], [0.2500]], dtype=torch.float16), - ) - - print(expected_weight) - - assert torch.allclose(w.B, expected_weight.B), "B mismatch" - assert torch.allclose(w.s, expected_weight.s), "s mismatch" - - -# test_get_multi_weights_col - - -def test_get_multi_weights_col_awq(gptq_weights_loader_awq): - weights = MockWeights( - [ - "test_get_multi_weights_col_gptq", - ], - device="cpu", - dtype=torch.float32, - process_group=dummy_process_group, - dummy_fs=dummy_file_system, - weights_loader=gptq_weights_loader_awq, - ) - - prefixes = ["weight"] - - w = weights.get_multi_weights_col( - prefixes=prefixes, - dim=0, - ) - - expected_weight = GPTQWeight( - qweight=torch.tensor([[1, 2], [3, 4], [5, 6], [7, 8]], dtype=torch.int32), - qzeros=torch.tensor([[0, 1], [1, 0]], dtype=torch.int32), - scales=torch.tensor([[100.0, 100.0], [100.0, 100.0]], dtype=torch.float16), - g_idx=None, - bits=8.0, - groupsize=2.0, - use_awq_kernel=True, - use_exllama=False, - ) - - assert torch.allclose(w.qweight, expected_weight.qweight), "qweight mismatch" - assert torch.allclose(w.qzeros, expected_weight.qzeros), "qzeros mismatch" - assert torch.allclose(w.scales, expected_weight.scales), "scales mismatch" - assert w.g_idx == expected_weight.g_idx, "g_idx mismatch" - assert 
w.bits == expected_weight.bits, "bits mismatch" - assert w.groupsize == expected_weight.groupsize, "groupsize mismatch" - assert w.use_awq_kernel == expected_weight.use_awq_kernel, "use_awq_kernel mismatch" - assert w.use_exllama == expected_weight.use_exllama, "use_exllama mismatch" - - -def test_get_multi_weights_col_exl2(): - weights = MockWeights( - [ - "test_get_multi_weights_col_exl2", - ], - device="cpu", - dtype=torch.float32, - process_group=dummy_process_group, - dummy_fs=dummy_file_system, - weights_loader=Exl2WeightsLoader(), - ) - - prefix = "weight" - - try: - weights.get_multi_weights_col( - prefixes=[prefix], - dim=0, - ) - except ValueError as e: - assert e.args[0] == "get_multi_weights_col is not supported for exl2" - - -def test_get_multi_weights_col_gptq(gptq_weights_loader): - weights = MockWeights( - [ - "test_get_multi_weights_col_gptq", - ], - device="cpu", - dtype=torch.float32, - process_group=dummy_process_group, - dummy_fs=dummy_file_system, - weights_loader=gptq_weights_loader, - ) - - prefixes = ["weight"] - - w = weights.get_multi_weights_col( - prefixes=prefixes, - dim=0, - ) - - expected_weight = GPTQWeight( - qweight=torch.tensor([[1, 2], [3, 4], [5, 6], [7, 8]], dtype=torch.int32), - qzeros=torch.tensor([[0, 1], [1, 0]], dtype=torch.int32), - scales=torch.tensor([[100.0, 100.0], [100.0, 100.0]], dtype=torch.float16), - g_idx=torch.tensor([0, 1, 0, 1], dtype=torch.int32), - bits=8.0, - groupsize=2.0, - use_awq_kernel=False, - use_exllama=False, - ) - - assert torch.allclose(w.qweight, expected_weight.qweight), "qweight mismatch" - assert torch.allclose(w.qzeros, expected_weight.qzeros), "qzeros mismatch" - assert torch.allclose(w.scales, expected_weight.scales), "scales mismatch" - assert torch.allclose(w.g_idx, expected_weight.g_idx), "g_idx mismatch" - assert w.bits == expected_weight.bits, "bits mismatch" - assert w.groupsize == expected_weight.groupsize, "groupsize mismatch" - assert w.use_awq_kernel == expected_weight.use_awq_kernel, "use_awq_kernel mismatch" - assert w.use_exllama == expected_weight.use_exllama, "use_exllama mismatch" - - -def test_get_multi_weights_col_marlin(marlin_weights_loader): - weights = MockWeights( - [ - "test_get_multi_weights_col_marlin", - ], - device="cpu", - dtype=torch.float16, - process_group=dummy_process_group, - dummy_fs=dummy_file_system, - weights_loader=marlin_weights_loader, - ) - - prefix = "weight" - - w = weights.get_multi_weights_col( - prefixes=[prefix], - dim=0, - ) - - expected_weight = MarlinWeight( - B=torch.tensor([[1, 2], [3, 4]], dtype=torch.int32), - s=torch.tensor([[0.5000], [0.2500]], dtype=torch.float16), - ) - - assert torch.allclose(w.B, expected_weight.B), "B mismatch" - assert torch.allclose(w.s, expected_weight.s), "s mismatch" - - -# test_get_weights_row - - -def test_get_weights_row_awq(gptq_weights_loader_awq): - weights = MockWeights( - [ - "test_get_weights_row_gptq", - ], - device="cpu", - dtype=torch.float32, - process_group=dummy_process_group, - dummy_fs=dummy_file_system, - weights_loader=gptq_weights_loader_awq, - ) - - prefix = "weight" - - w = weights.get_weights_row( - prefix=prefix, - ) - - expected_weight = GPTQWeight( - qweight=torch.tensor([[1, 2], [3, 4], [5, 6], [7, 8]], dtype=torch.int32), - qzeros=torch.tensor([[0, 1], [1, 0]], dtype=torch.int32), - scales=torch.tensor([[100.0, 100.0], [100.0, 100.0]], dtype=torch.float16), - g_idx=None, - bits=8.0, - groupsize=2.0, - use_awq_kernel=True, - use_exllama=False, - ) - - assert torch.allclose(w.qweight, 
expected_weight.qweight), "qweight mismatch" - assert torch.allclose(w.qzeros, expected_weight.qzeros), "qzeros mismatch" - assert torch.allclose(w.scales, expected_weight.scales), "scales mismatch" - assert w.g_idx == expected_weight.g_idx, "g_idx mismatch" - assert w.bits == expected_weight.bits, "bits mismatch" - assert w.groupsize == expected_weight.groupsize, "groupsize mismatch" - assert w.use_awq_kernel == expected_weight.use_awq_kernel, "use_awq_kernel mismatch" - assert w.use_exllama == expected_weight.use_exllama, "use_exllama mismatch" - - -def test_get_weights_row_exl2(): - weights = MockWeights( - [ - "test_get_weights_row_exl2", - ], - device="cpu", - dtype=torch.float32, - process_group=dummy_process_group, - dummy_fs=dummy_file_system, - weights_loader=Exl2WeightsLoader(), - ) - - prefix = "weight" - - w = weights.get_weights_row( - prefix=prefix, - ) - print(w) - - scaled_scale_max = 0.3906 * 256 - expected_weight = Exl2Weight( - q_weight=torch.tensor([[1, 2], [3, 4], [5, 6], [7, 8]], dtype=torch.int32), - q_scale=torch.tensor([8], dtype=torch.int32), - q_invperm=torch.tensor([1, 0, 3, 2], dtype=torch.int16), - q_scale_max=torch.tensor([scaled_scale_max], dtype=torch.float16), - q_groups=torch.tensor([4], dtype=torch.int16), - ) - - assert torch.allclose(w.q_weight, expected_weight.q_weight), "q_weight mismatch" - assert torch.allclose(w.q_scale, expected_weight.q_scale), "q_scale mismatch" - assert torch.allclose(w.q_invperm, expected_weight.q_invperm), "q_invperm mismatch" - assert torch.allclose( - w.q_scale_max, expected_weight.q_scale_max - ), "q_scale_max mismatch" - assert torch.allclose(w.q_groups, expected_weight.q_groups), "q_groups mismatch" - - -def test_get_weights_row_gptq(gptq_weights_loader): - weights = MockWeights( - [ - "test_get_weights_row_gptq", - ], - device="cpu", - dtype=torch.float32, - process_group=dummy_process_group, - dummy_fs=dummy_file_system, - weights_loader=gptq_weights_loader, - ) - - prefix = "weight" - - w = weights.get_weights_row( - prefix=prefix, - ) - - expected_weight = GPTQWeight( - qweight=torch.tensor([[1, 2], [3, 4], [5, 6], [7, 8]], dtype=torch.int32), - qzeros=torch.tensor([[0, 1], [1, 0]], dtype=torch.int32), - scales=torch.tensor([[100.0, 100.0], [100.0, 100.0]], dtype=torch.float16), - g_idx=torch.tensor([0, 1, 0, 1], dtype=torch.int32), - bits=8.0, - groupsize=2.0, - use_awq_kernel=False, - use_exllama=False, - ) - - assert torch.allclose(w.qweight, expected_weight.qweight), "qweight mismatch" - assert torch.allclose(w.qzeros, expected_weight.qzeros), "qzeros mismatch" - assert torch.allclose(w.scales, expected_weight.scales), "scales mismatch" - assert torch.allclose(w.g_idx, expected_weight.g_idx), "g_idx mismatch" - assert w.bits == expected_weight.bits, "bits mismatch" - assert w.groupsize == expected_weight.groupsize, "groupsize mismatch" - assert w.use_awq_kernel == expected_weight.use_awq_kernel, "use_awq_kernel mismatch" - assert w.use_exllama == expected_weight.use_exllama, "use_exllama mismatch" - - -def test_get_weights_row_marlin(marlin_weights_loader): - weights = MockWeights( - [ - "test_get_weights_row_marlin", - ], - device="cpu", - dtype=torch.float16, - process_group=dummy_process_group, - dummy_fs=dummy_file_system, - weights_loader=marlin_weights_loader, - ) - - prefix = "weight" - - w = weights.get_weights_row( - prefix=prefix, - ) - - expected_weight = MarlinWeight( - B=torch.tensor([[1, 2], [3, 4]], dtype=torch.int32), - s=torch.tensor([[0.5000], [0.2500]], dtype=torch.float16), - ) - - assert 
torch.allclose(w.B, expected_weight.B), "B mismatch" - assert torch.allclose(w.s, expected_weight.s), "s mismatch" diff --git a/docs/source/backends/gaudi.mdx b/docs/source/backends/gaudi.mdx index 9c56ed58f..6dcf973dc 100644 --- a/docs/source/backends/gaudi.mdx +++ b/docs/source/backends/gaudi.mdx @@ -41,34 +41,12 @@ curl 127.0.0.1:8080/generate \ ## How-to Guides -### How to Run Specific Models +You can view the full list of supported models in the [Supported Models](https://huggingface.co/docs/text-generation-inference/backends/gaudi#supported-models) section. -The following models have been validated on Gaudi2: - -| Model | Model ID | BF16 | | FP8 | | -|-----------------------|----------------------------------------|-------------|------------|-------------|------------| -| | | Single Card | Multi-Card | Single Card | Multi-Card | -| Llama2-7B | meta-llama/Llama-2-7b-chat-hf | ✔ | ✔ | ✔ | ✔ | -| Llama2-70B | meta-llama/Llama-2-70b-chat-hf | | ✔ | | ✔ | -| Llama3-8B | meta-llama/Meta-Llama-3.1-8B-Instruct | ✔ | ✔ | ✔ | ✔ | -| Llama3-70B | meta-llama/Meta-Llama-3-70B-Instruct | | ✔ | | ✔ | -| Llama3.1-8B | meta-llama/Meta-Llama-3.1-8B-Instruct | ✔ | ✔ | ✔ | ✔ | -| Llama3.1-70B | meta-llama/Meta-Llama-3.1-70B-Instruct | | ✔ | | ✔ | -| CodeLlama-13B | codellama/CodeLlama-13b-hf | ✔ | ✔ | ✔ | ✔ | -| Mixtral-8x7B | mistralai/Mixtral-8x7B-Instruct-v0.1 | ✔ | ✔ | ✔ | ✔ | -| Mistral-7B | mistralai/Mistral-7B-Instruct-v0.3 | ✔ | ✔ | ✔ | ✔ | -| Falcon-180B | tiiuae/falcon-180B-chat | | ✔ | | ✔ | -| Qwen2-72B | Qwen/Qwen2-72B-Instruct | | ✔ | | ✔ | -| Starcoder2-3b | bigcode/starcoder2-3b | ✔ | ✔ | ✔ | | -| Starcoder2-15b | bigcode/starcoder2-15b | ✔ | ✔ | ✔ | | -| Starcoder | bigcode/starcoder | ✔ | ✔ | ✔ | ✔ | -| Gemma-7b | google/gemma-7b-it | ✔ | ✔ | ✔ | ✔ | -| Llava-v1.6-Mistral-7B | llava-hf/llava-v1.6-mistral-7b-hf | ✔ | ✔ | ✔ | ✔ | - -To run any of these models: +For example, to run Llama3.1-8B, you can use the following command: ```bash -model=MODEL_ID_THAT_YOU_WANT_TO_RUN +model=meta-llama/Meta-Llama-3.1-8B-Instruct volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run hf_token=YOUR_ACCESS_TOKEN @@ -87,7 +65,7 @@ The validated docker commands can be found in the [examples/docker_commands fold ### How to Enable Multi-Card Inference (Sharding) -TGI-Gaudi supports sharding for multi-card inference, allowing you to distribute the load across multiple Gaudi cards. +TGI-Gaudi supports sharding for multi-card inference, allowing you to distribute the load across multiple Gaudi cards. This is recommended to run large models and to speed up inference. For example, on a machine with 8 Gaudi cards, you can run: @@ -270,6 +248,45 @@ Note: Model warmup can take several minutes, especially for FP8 inference. For f This section contains reference information about the Gaudi backend. +### Supported Models + +Text Generation Inference enables serving optimized models on Gaudi hardware. The following sections list which models (VLMs & LLMs) are supported on Gaudi. 
+
+**Large Language Models (LLMs)**
+- [Llama2-7B](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf)
+- [Llama2-70B](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf)
+- [Llama3-8B](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct)
+- [Llama3-70B](https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct)
+- [Llama3.1-8B](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct)
+- [Llama3.1-70B](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct)
+- [CodeLlama-13B](https://huggingface.co/codellama/CodeLlama-13b-hf)
+- [Opt-125m](https://huggingface.co/facebook/opt-125m)
+- [OpenAI-gpt2](https://huggingface.co/openai-community/gpt2)
+- [Mixtral-8x7B](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1)
+- [Mistral-7B](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3)
+- [Qwen2-72B](https://huggingface.co/Qwen/Qwen2-72B-Instruct)
+- [Qwen2-7B](https://huggingface.co/Qwen/Qwen2-7B-Instruct)
+- [Phi-1.5](https://huggingface.co/microsoft/phi-1_5)
+- [Gemma-7b](https://huggingface.co/google/gemma-7b-it)
+- [Starcoder2-3b](https://huggingface.co/bigcode/starcoder2-3b)
+- [Starcoder2-15b](https://huggingface.co/bigcode/starcoder2-15b)
+- [Starcoder](https://huggingface.co/bigcode/starcoder)
+- [falcon-7b-instruct](https://huggingface.co/tiiuae/falcon-7b-instruct)
+- [Falcon-180B](https://huggingface.co/tiiuae/falcon-180B-chat)
+- [GPT-2](https://huggingface.co/openai-community/gpt2)
+- [gpt-j-6b](https://huggingface.co/EleutherAI/gpt-j-6b)
+
+**Vision-Language Models (VLMs)**
+- [LLaVA-v1.6-Mistral-7B](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf)
+- [Mllama (Multimodal Llama from Meta)](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct)
+- [Idefics](https://huggingface.co/HuggingFaceM4/idefics-9b)
+- [Idefics 2](https://huggingface.co/HuggingFaceM4/idefics2-8b)
+- [Idefics 2.5](https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3)
+- [Qwen2-VL-2B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct)
+- [Qwen/Qwen2.5-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct)
+
+We also support, on a best-effort basis, models with a different parameter count that use one of the architectures listed above; those variants have not been explicitly tested. For example, the Gaudi backend supports `meta-llama/Llama-3.2-1B` because it uses the standard Llama 3 architecture. If you run into an issue with a model, please open an issue on the [Gaudi backend repository](https://github.com/huggingface/text-generation-inference/issues).
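For readers who want to try one of the VLMs listed above, a minimal launch sketch follows, reusing the same Docker pattern shown for Llama3.1-8B earlier in this guide; the model choice and image tag here are illustrative, not prescriptive.

```bash
# Minimal sketch: serve a listed VLM (here Qwen2-VL-2B-Instruct) on Gaudi.
# Assumes the same runtime flags as the text-only example above; adjust the
# image tag to the TGI Gaudi release you are actually running (e.g. 3.2.3-gaudi).
model=Qwen/Qwen2-VL-2B-Instruct
volume=$PWD/data           # share a volume to avoid re-downloading weights
hf_token=YOUR_ACCESS_TOKEN

docker run --runtime=habana --cap-add=sys_nice --ipc=host \
  -p 8080:80 -v $volume:/data -e HF_TOKEN=$hf_token \
  ghcr.io/huggingface/text-generation-inference:3.2.3-gaudi \
  --model-id $model
```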
+ ### Environment Variables The following table contains the environment variables that can be used to configure the Gaudi backend: From 24bec29ffc0e092ca1e6f892b8d7092a8bb72ecf Mon Sep 17 00:00:00 2001 From: oOraph <13552058+oOraph@users.noreply.github.com> Date: Mon, 7 Apr 2025 17:24:11 +0200 Subject: [PATCH 171/183] fix: compute type typo (#3150) Signed-off-by: Raphael Glon Co-authored-by: Raphael Glon --- launcher/src/main.rs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/launcher/src/main.rs b/launcher/src/main.rs index 250613812..acff85730 100644 --- a/launcher/src/main.rs +++ b/launcher/src/main.rs @@ -1709,16 +1709,16 @@ impl From<&str> for Gpu { impl std::fmt::Display for Gpu { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { - Gpu::RTX4090 => write!(f, "nvida-4090"), - Gpu::T4 => write!(f, "nvida-t4"), - Gpu::L4 => write!(f, "nvida-l4"), - Gpu::L40 => write!(f, "nvida-l40"), - Gpu::L40S => write!(f, "nvida-l40s"), + Gpu::RTX4090 => write!(f, "nvidia-4090"), + Gpu::T4 => write!(f, "nvidia-t4"), + Gpu::L4 => write!(f, "nvidia-l4"), + Gpu::L40 => write!(f, "nvidia-l40"), + Gpu::L40S => write!(f, "nvidia-l40s"), Gpu::A10G => write!(f, "nvidia-a10g"), Gpu::A40 => write!(f, "nvidia-a40"), Gpu::H100 => write!(f, "nvidia-h100-80fb-hbm3"), - Gpu::A100 => write!(f, "nvida-a100-sxm4-80gb"), - Gpu::H200 => write!(f, "nvida-h200"), + Gpu::A100 => write!(f, "nvidia-a100-sxm4-80gb"), + Gpu::H200 => write!(f, "nvidia-h200"), Gpu::Unknown(card) => write!(f, "{}", card), } } From 0b28aabb94e89360e762f2ac2945e8ae3f8e5dc0 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Tue, 8 Apr 2025 10:16:37 +0200 Subject: [PATCH 172/183] 3.2.3 (#3151) --- Cargo.lock | 16 ++++++++-------- Cargo.toml | 2 +- README.md | 6 +++--- docs/openapi.json | 2 +- docs/source/backends/gaudi.mdx | 10 +++++----- docs/source/backends/neuron.md | 2 +- .../source/basic_tutorials/gated_model_access.md | 2 +- docs/source/conceptual/quantization.md | 6 +++--- docs/source/installation_amd.md | 2 +- docs/source/installation_intel.md | 4 ++-- docs/source/installation_nvidia.md | 2 +- docs/source/quicktour.md | 4 ++-- docs/source/reference/api_reference.md | 2 +- 13 files changed, 30 insertions(+), 30 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d64634a66..3cac30fb6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4650,7 +4650,7 @@ dependencies = [ [[package]] name = "text-generation-backends-trtllm" -version = "3.2.2-dev0" +version = "3.2.3-dev0" dependencies = [ "async-trait", "clap 4.5.32", @@ -4671,7 +4671,7 @@ dependencies = [ [[package]] name = "text-generation-benchmark" -version = "3.2.2-dev0" +version = "3.2.3-dev0" dependencies = [ "average", "clap 4.5.32", @@ -4691,7 +4691,7 @@ dependencies = [ [[package]] name = "text-generation-client" -version = "3.2.2-dev0" +version = "3.2.3-dev0" dependencies = [ "async-trait", "base64 0.22.1", @@ -4709,7 +4709,7 @@ dependencies = [ [[package]] name = "text-generation-launcher" -version = "3.2.2-dev0" +version = "3.2.3-dev0" dependencies = [ "clap 4.5.32", "ctrlc", @@ -4730,7 +4730,7 @@ dependencies = [ [[package]] name = "text-generation-router" -version = "3.2.2-dev0" +version = "3.2.3-dev0" dependencies = [ "anyhow", "async-stream", @@ -4782,7 +4782,7 @@ dependencies = [ [[package]] name = "text-generation-router-llamacpp" -version = "3.2.2-dev0" +version = "3.2.3-dev0" dependencies = [ "async-trait", "bindgen 0.71.1", @@ -4800,7 +4800,7 @@ dependencies = [ [[package]] name = "text-generation-router-v2" 
-version = "3.2.2-dev0" +version = "3.2.3-dev0" dependencies = [ "async-stream", "async-trait", @@ -4849,7 +4849,7 @@ dependencies = [ [[package]] name = "text-generation-router-v3" -version = "3.2.2-dev0" +version = "3.2.3-dev0" dependencies = [ "async-stream", "async-trait", diff --git a/Cargo.toml b/Cargo.toml index 9b818837c..1bc736bab 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,7 +21,7 @@ default-members = [ resolver = "2" [workspace.package] -version = "3.2.2-dev0" +version = "3.2.3-dev0" edition = "2021" authors = ["Olivier Dehaene"] homepage = "https://github.com/huggingface/text-generation-inference" diff --git a/README.md b/README.md index 23d87f9a8..ed7b4809c 100644 --- a/README.md +++ b/README.md @@ -84,7 +84,7 @@ model=HuggingFaceH4/zephyr-7b-beta volume=$PWD/data docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data \ - ghcr.io/huggingface/text-generation-inference:3.2.2 --model-id $model + ghcr.io/huggingface/text-generation-inference:3.2.3 --model-id $model ``` And then you can make requests like @@ -121,7 +121,7 @@ curl localhost:8080/v1/chat/completions \ **Note:** To use NVIDIA GPUs, you need to install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). We also recommend using NVIDIA drivers with CUDA version 12.2 or higher. For running the Docker container on a machine with no GPUs or CUDA support, it is enough to remove the `--gpus all` flag and add `--disable-custom-kernels`, please note CPU is not the intended platform for this project, so performance might be subpar. -**Note:** TGI supports AMD Instinct MI210 and MI250 GPUs. Details can be found in the [Supported Hardware documentation](https://huggingface.co/docs/text-generation-inference/installation_amd#using-tgi-with-amd-gpus). To use AMD GPUs, please use `docker run --device /dev/kfd --device /dev/dri --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.2.2-rocm --model-id $model` instead of the command above. +**Note:** TGI supports AMD Instinct MI210 and MI250 GPUs. Details can be found in the [Supported Hardware documentation](https://huggingface.co/docs/text-generation-inference/installation_amd#using-tgi-with-amd-gpus). To use AMD GPUs, please use `docker run --device /dev/kfd --device /dev/dri --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.2.3-rocm --model-id $model` instead of the command above. 
To see all options to serve your models (in the [code](https://github.com/huggingface/text-generation-inference/blob/main/launcher/src/main.rs) or in the cli): ``` @@ -152,7 +152,7 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading token= docker run --gpus all --shm-size 1g -e HF_TOKEN=$token -p 8080:80 -v $volume:/data \ - ghcr.io/huggingface/text-generation-inference:3.2.2 --model-id $model + ghcr.io/huggingface/text-generation-inference:3.2.3 --model-id $model ``` ### A note on Shared Memory (shm) diff --git a/docs/openapi.json b/docs/openapi.json index 55fc7ec62..ad5124798 100644 --- a/docs/openapi.json +++ b/docs/openapi.json @@ -10,7 +10,7 @@ "name": "Apache 2.0", "url": "https://www.apache.org/licenses/LICENSE-2.0" }, - "version": "3.2.2-dev0" + "version": "3.2.3-dev0" }, "paths": { "/": { diff --git a/docs/source/backends/gaudi.mdx b/docs/source/backends/gaudi.mdx index 6dcf973dc..5c54d19d1 100644 --- a/docs/source/backends/gaudi.mdx +++ b/docs/source/backends/gaudi.mdx @@ -20,7 +20,7 @@ hf_token=YOUR_HF_ACCESS_TOKEN docker run --runtime=habana --cap-add=sys_nice --ipc=host \ -p 8080:80 -v $volume:/data -e HF_TOKEN=$hf_token \ - ghcr.io/huggingface/text-generation-inference:3.2.2-gaudi \ + ghcr.io/huggingface/text-generation-inference:3.2.3-gaudi \ --model-id $model ``` @@ -52,7 +52,7 @@ hf_token=YOUR_ACCESS_TOKEN docker run --runtime=habana --cap-add=sys_nice --ipc=host \ -p 8080:80 -v $volume:/data -e HF_TOKEN=$hf_token \ - ghcr.io/huggingface/text-generation-inference:3.2.2-gaudi \ + ghcr.io/huggingface/text-generation-inference:3.2.3-gaudi \ --model-id $model ``` @@ -115,7 +115,7 @@ docker run -p 8080:80 \ -e BATCH_BUCKET_SIZE=256 \ -e PREFILL_BATCH_BUCKET_SIZE=4 \ -e PAD_SEQUENCE_TO_MULTIPLE_OF=64 \ - ghcr.io/huggingface/text-generation-inference:3.2.2-gaudi \ + ghcr.io/huggingface/text-generation-inference:3.2.3-gaudi \ --model-id $model \ --sharded true --num-shard 8 \ --max-input-tokens 1024 --max-total-tokens 2048 \ @@ -141,7 +141,7 @@ docker run -p 8080:80 \ -v $volume:/data \ -e PREFILL_BATCH_BUCKET_SIZE=1 \ -e BATCH_BUCKET_SIZE=1 \ - ghcr.io/huggingface/text-generation-inference:3.2.2-gaudi \ + ghcr.io/huggingface/text-generation-inference:3.2.3-gaudi \ --model-id $model \ --max-input-tokens 4096 --max-batch-prefill-tokens 16384 \ --max-total-tokens 8192 --max-batch-size 4 @@ -208,7 +208,7 @@ docker run --runtime=habana --ipc=host --cap-add=sys_nice \ -e PROF_PATH=/tmp/hpu_profile \ -e PROF_RANKS=0 \ -e PROF_RECORD_SHAPES=True \ - ghcr.io/huggingface/text-generation-inference:3.2.2-gaudi \ + ghcr.io/huggingface/text-generation-inference:3.2.3-gaudi \ --model-id $model ``` diff --git a/docs/source/backends/neuron.md b/docs/source/backends/neuron.md index e528197f2..c8e3876e1 100644 --- a/docs/source/backends/neuron.md +++ b/docs/source/backends/neuron.md @@ -31,7 +31,7 @@ deployment instructions in the model card: The service is launched simply by running the text-generation-inference container with two sets of parameters: ``` -docker run ghcr.io/huggingface/text-generation-inference:3.2.2-neuron +docker run ghcr.io/huggingface/text-generation-inference:3.2.3-neuron ``` - system parameters are used to map ports, volumes and devices between the host and the service, diff --git a/docs/source/basic_tutorials/gated_model_access.md b/docs/source/basic_tutorials/gated_model_access.md index 48d3abfe1..a45190f76 100644 --- a/docs/source/basic_tutorials/gated_model_access.md +++ b/docs/source/basic_tutorials/gated_model_access.md @@ -19,6 +19,6 @@ 
docker run --gpus all \ --shm-size 1g \ -e HF_TOKEN=$token \ -p 8080:80 \ - -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.2.2 \ + -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.2.3 \ --model-id $model ``` diff --git a/docs/source/conceptual/quantization.md b/docs/source/conceptual/quantization.md index 4790578d8..3aa7bb955 100644 --- a/docs/source/conceptual/quantization.md +++ b/docs/source/conceptual/quantization.md @@ -19,7 +19,7 @@ bitsandbytes is a library used to apply 8-bit and 4-bit quantization to models. In TGI, you can use 8-bit quantization by adding `--quantize bitsandbytes` like below 👇 ```bash -docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.2.2 --model-id $model --quantize bitsandbytes +docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.2.3 --model-id $model --quantize bitsandbytes ``` 4-bit quantization is also possible with bitsandbytes. You can choose one of the following 4-bit data types: 4-bit float (`fp4`), or 4-bit `NormalFloat` (`nf4`). These data types were introduced in the context of parameter-efficient fine-tuning, but you can apply them for inference by automatically converting the model weights on load. @@ -27,7 +27,7 @@ docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingf In TGI, you can use 4-bit quantization by adding `--quantize bitsandbytes-nf4` or `--quantize bitsandbytes-fp4` like below 👇 ```bash -docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.2.2 --model-id $model --quantize bitsandbytes-nf4 +docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.2.3 --model-id $model --quantize bitsandbytes-nf4 ``` You can get more information about 8-bit quantization by reading this [blog post](https://huggingface.co/blog/hf-bitsandbytes-integration), and 4-bit quantization by reading [this blog post](https://huggingface.co/blog/4bit-transformers-bitsandbytes). @@ -48,7 +48,7 @@ $$({\hat{W}_{l}}^{*} = argmin_{\hat{W_{l}}} ||W_{l}X-\hat{W}_{l}X||^{2}_{2})$$ TGI allows you to both run an already GPTQ quantized model (see available models [here](https://huggingface.co/models?search=gptq)) or quantize a model of your choice using quantization script. You can run a quantized model by simply passing --quantize like below 👇 ```bash -docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.2.2 --model-id $model --quantize gptq +docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.2.3 --model-id $model --quantize gptq ``` Note that TGI's GPTQ implementation doesn't use [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) under the hood. However, models quantized using AutoGPTQ or Optimum can still be served by TGI. 
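As a concrete instance of the GPTQ path described above, here is a hedged sketch of serving an already-quantized checkpoint; the model id is only an example, and any GPTQ-quantized repository with compatible weights should work the same way.

```bash
# Sketch: serve a pre-quantized GPTQ checkpoint with TGI (example model id).
model=TheBloke/Llama-2-7B-Chat-GPTQ
volume=$PWD/data   # share a volume to avoid re-downloading weights

docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data \
    ghcr.io/huggingface/text-generation-inference:3.2.3 \
    --model-id $model --quantize gptq
```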
diff --git a/docs/source/installation_amd.md b/docs/source/installation_amd.md index bd2f91486..cec52acb3 100644 --- a/docs/source/installation_amd.md +++ b/docs/source/installation_amd.md @@ -11,7 +11,7 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading docker run --rm -it --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \ --device=/dev/kfd --device=/dev/dri --group-add video \ --ipc=host --shm-size 256g --net host -v $volume:/data \ - ghcr.io/huggingface/text-generation-inference:3.2.2-rocm \ + ghcr.io/huggingface/text-generation-inference:3.2.3-rocm \ --model-id $model ``` diff --git a/docs/source/installation_intel.md b/docs/source/installation_intel.md index 60d2f896c..12fdd9f56 100644 --- a/docs/source/installation_intel.md +++ b/docs/source/installation_intel.md @@ -12,7 +12,7 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading docker run --rm --privileged --cap-add=sys_nice \ --device=/dev/dri \ --ipc=host --shm-size 1g --net host -v $volume:/data \ - ghcr.io/huggingface/text-generation-inference:3.2.2-intel-xpu \ + ghcr.io/huggingface/text-generation-inference:3.2.3-intel-xpu \ --model-id $model --cuda-graphs 0 ``` @@ -29,7 +29,7 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading docker run --rm --privileged --cap-add=sys_nice \ --device=/dev/dri \ --ipc=host --shm-size 1g --net host -v $volume:/data \ - ghcr.io/huggingface/text-generation-inference:3.2.2-intel-cpu \ + ghcr.io/huggingface/text-generation-inference:3.2.3-intel-cpu \ --model-id $model --cuda-graphs 0 ``` diff --git a/docs/source/installation_nvidia.md b/docs/source/installation_nvidia.md index 0ed0da323..0c3a0bd9b 100644 --- a/docs/source/installation_nvidia.md +++ b/docs/source/installation_nvidia.md @@ -11,7 +11,7 @@ model=teknium/OpenHermes-2.5-Mistral-7B volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run docker run --gpus all --shm-size 64g -p 8080:80 -v $volume:/data \ - ghcr.io/huggingface/text-generation-inference:3.2.2 \ + ghcr.io/huggingface/text-generation-inference:3.2.3 \ --model-id $model ``` diff --git a/docs/source/quicktour.md b/docs/source/quicktour.md index 2ce9e686a..e7f60e527 100644 --- a/docs/source/quicktour.md +++ b/docs/source/quicktour.md @@ -11,7 +11,7 @@ model=teknium/OpenHermes-2.5-Mistral-7B volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data \ - ghcr.io/huggingface/text-generation-inference:3.2.2 \ + ghcr.io/huggingface/text-generation-inference:3.2.3 \ --model-id $model ``` @@ -96,7 +96,7 @@ curl 127.0.0.1:8080/generate \ To see all possible deploy flags and options, you can use the `--help` flag. It's possible to configure the number of shards, quantization, generation parameters, and more. 
```bash -docker run ghcr.io/huggingface/text-generation-inference:3.2.2 --help +docker run ghcr.io/huggingface/text-generation-inference:3.2.3 --help ``` diff --git a/docs/source/reference/api_reference.md b/docs/source/reference/api_reference.md index 7dc6768e9..8e04b2a8b 100644 --- a/docs/source/reference/api_reference.md +++ b/docs/source/reference/api_reference.md @@ -163,7 +163,7 @@ hub = { # create Hugging Face Model Class huggingface_model = HuggingFaceModel( - image_uri=get_huggingface_llm_image_uri("huggingface",version="3.2.2"), + image_uri=get_huggingface_llm_image_uri("huggingface",version="3.2.3"), env=hub, role=role, ) From 5861da1ad7cdbec1bce2ecb4d5b8115b7df8e7a4 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Wed, 9 Apr 2025 17:07:30 +0200 Subject: [PATCH 173/183] Fixing Qwen 2.5 VL (32B). (#3157) Reduce the config constraints, and use common ground between the 8B and 32B. --- router/src/config.rs | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/router/src/config.rs b/router/src/config.rs index 8188e535d..fcda26122 100644 --- a/router/src/config.rs +++ b/router/src/config.rs @@ -323,20 +323,20 @@ impl Qwen2Vl { #[derive(Clone, Debug, Serialize, Deserialize)] #[serde(rename_all = "snake_case")] pub struct Qwen2_5VlVisionConfig { - pub(crate) depth: usize, - pub(crate) hidden_act: String, - pub(crate) hidden_size: usize, - pub(crate) intermediate_size: usize, - pub(crate) num_heads: usize, - pub(crate) in_chans: usize, - pub(crate) out_hidden_size: usize, - pub(crate) patch_size: usize, - pub(crate) spatial_merge_size: usize, + // pub(crate) depth: usize, + // pub(crate) hidden_act: String, + // pub(crate) hidden_size: usize, + // pub(crate) intermediate_size: usize, + // pub(crate) num_heads: usize, + // pub(crate) in_chans: usize, + // pub(crate) out_hidden_size: usize, + // pub(crate) patch_size: usize, + // pub(crate) spatial_merge_size: usize, pub(crate) spatial_patch_size: usize, - pub(crate) window_size: usize, - pub(crate) fullatt_block_indexes: Vec, - pub(crate) tokens_per_second: usize, - pub(crate) temporal_patch_size: usize, + // pub(crate) window_size: usize, + // pub(crate) fullatt_block_indexes: Vec, + // pub(crate) tokens_per_second: usize, + // pub(crate) temporal_patch_size: usize, } #[derive(Clone, Debug, Serialize, Deserialize)] @@ -348,7 +348,7 @@ pub struct Qwen2_5Vl { impl Qwen2_5Vl { pub fn get_number_of_features(&self, height: usize, width: usize) -> usize { let num_pixels = height * width; - num_pixels / self.vision_config.patch_size.pow(2) + num_pixels / self.vision_config.spatial_patch_size.pow(2) } } From 9a8d0462e1737238752aa6a7d9c526db87ad6e20 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Wed, 9 Apr 2025 18:42:25 +0200 Subject: [PATCH 174/183] =?UTF-8?q?Fixing=20tokenization=20like=20https://?= =?UTF-8?q?github.com/huggingface/text-embeddin=E2=80=A6=20(#3156)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixing tokenization like https://github.com/huggingface/text-embeddings-inference/issues/525 --- router/src/server.rs | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/router/src/server.rs b/router/src/server.rs index 45d2b9f3c..077c41022 100644 --- a/router/src/server.rs +++ b/router/src/server.rs @@ -74,11 +74,8 @@ fn encoding_to_tokens(encoding: &tokenizers::Encoding, input: &str) -> Vec(); + let text: Vec = input.bytes().skip(start).take(stop - start).collect(); + let text: String = 
String::from_utf8_lossy(&text).to_string(); SimpleToken { id, text, From d62c941c563c5aa31d178c5b6f60caf898ec085e Mon Sep 17 00:00:00 2001 From: "Wang, Yi" Date: Mon, 14 Apr 2025 21:58:13 +0800 Subject: [PATCH 175/183] Gaudi: clean cuda/rocm code in hpu backend, enable flat_hpu (#3113) * clean cuda/rocm code in hpu backend, enable flat_hpu Signed-off-by: Wang, Yi A * fix TP in pageattn Signed-off-by: Wang, Yi A * adjust block table in hpu to improve performance Signed-off-by: Wang, Yi A * enable all the model. not testet yet Signed-off-by: Wang, Yi A * use tensor cache in hpu graph to avoid replay issue Signed-off-by: Wang, Yi A * add moe support, fix qwen/mistral/mixtral crash Signed-off-by: Wang, Yi A * fix phimoe issue Signed-off-by: Wang, Yi A * gpt_bigcode could also go pageattn Signed-off-by: Wang, Yi A * enable dbrx remove some unused code Signed-off-by: Wang, Yi A * multi-modality initial PR Signed-off-by: Wang, Yi A * adjust warmup and enable vlm Signed-off-by: Wang, Yi A * fix incorrect output in qwen2 idefics if hpu graph is used Signed-off-by: Wang, Yi A * remove unused quantization code and enable awq/gptq int4 Signed-off-by: Wang, Yi A * fix gptq issue Signed-off-by: Wang, Yi A * enable fp8 Signed-off-by: Wang, Yi A * warmup prefill remove model where pageattn is not used, set block table to None since it's not used Signed-off-by: Wang, Yi A * add warmup_decode Signed-off-by: Wang, Yi A * warmup decode Signed-off-by: Wang, Yi A * remove block_tables and prefill_cache_indices which will lead to dynamic shape Signed-off-by: Wang, Yi A * fix comment Signed-off-by: Wang, Yi A * missing gptj change... Signed-off-by: Wang, Yi A * fix some issue Signed-off-by: Wang, Yi A * remove torch.where to fix incorrect output in hpu graph model Signed-off-by: Wang, Yi A * match the latest vllm_extension ops Signed-off-by: Wang, Yi A --------- Signed-off-by: Wang, Yi A --- Dockerfile_gaudi | 2 +- .../server/text_generation_server/cli.py | 11 +- .../layers/attention/__init__.py | 51 +- .../layers/attention/common.py | 195 +- .../layers/attention/cuda.py | 357 --- .../layers/attention/flash_attn_triton.py | 813 ------ .../layers/attention/flashinfer.py | 251 -- .../layers/attention/hpu.py | 95 + .../layers/attention/ipex.py | 82 - .../layers/attention/kv_cache.py | 139 + .../layers/attention/rocm.py | 308 --- .../layers/awq/quantize/__init__.py | 3 + .../layers/awq/quantize/hpu.py | 134 + .../layers/awq/quantize/qmodule.py | 49 - .../text_generation_server/layers/eetq.py | 43 - .../text_generation_server/layers/fp8.py | 381 ++- .../layers/gptq/__init__.py | 113 +- .../layers/gptq/custom_autotune.py | 261 -- .../layers/gptq/exllama.py | 134 - .../layers/gptq/exllamav2.py | 267 -- .../text_generation_server/layers/gptq/hpu.py | 186 ++ .../layers/gptq/quant_linear.py | 359 --- .../layers/gptq/quantize.py | 17 +- .../layers/layernorm.py | 149 +- .../text_generation_server/layers/linear.py | 90 +- .../layers/marlin/__init__.py | 15 - .../layers/marlin/fp8.py | 140 - .../layers/marlin/gptq.py | 464 ---- .../layers/marlin/marlin.py | 346 --- .../layers/marlin/util.py | 141 - .../layers/moe/__init__.py | 55 +- .../text_generation_server/layers/moe/fp8.py | 173 ++ .../moe/{fused_moe_rocm.py => fused_moe.py} | 17 +- .../layers/moe/gptq_marlin.py | 215 -- .../layers/moe/unquantized.py | 41 +- .../text_generation_server/layers/rotary.py | 162 +- .../layers/tensor_parallel.py | 41 +- .../text_generation_server/models/__init__.py | 680 ++++- .../models/custom_modeling/bloom_modeling.py | 4 +- 
.../custom_modeling/flash_cohere_modeling.py | 168 +- .../custom_modeling/flash_dbrx_modeling.py | 85 +- .../flash_deepseek_v2_modeling.py | 94 +- .../flash_deepseek_v3_modeling.py | 642 +++++ .../custom_modeling/flash_gemma2_modeling.py | 63 +- .../custom_modeling/flash_gemma_modeling.py | 62 +- .../custom_modeling/flash_gpt2_modeling.py | 64 +- .../custom_modeling/flash_gptj_modeling.py | 117 +- .../custom_modeling/flash_llama_modeling.py | 200 +- .../custom_modeling/flash_llava_next.py | 285 +++ .../custom_modeling/flash_mistral_modeling.py | 118 +- .../custom_modeling/flash_mixtral_modeling.py | 81 +- .../models/custom_modeling/flash_mllama.py | 986 +++++++ .../custom_modeling/flash_neox_modeling.py | 63 +- .../flash_pali_gemma_modeling.py | 12 +- .../custom_modeling/flash_phi_modeling.py | 60 +- .../custom_modeling/flash_qwen2_modeling.py | 117 +- .../custom_modeling/flash_rw_modeling.py | 105 +- .../flash_santacoder_modeling.py | 60 +- .../flash_starcoder2_modeling.py | 199 +- .../models/custom_modeling/idefics2.py | 31 +- .../models/custom_modeling/idefics3.py | 596 +++++ .../custom_modeling/idefics_modeling.py | 104 +- .../models/custom_modeling/mamba_modeling.py | 10 +- .../models/custom_modeling/mpt_modeling.py | 1215 --------- .../models/custom_modeling/neox_modeling.py | 796 ------ .../models/custom_modeling/opt_modeling.py | 857 ------- .../models/custom_modeling/phi_modeling.py | 336 --- .../models/custom_modeling/qwen2_5_vl.py | 946 +++++++ .../models/custom_modeling/qwen2_vl.py | 519 ++++ .../models/custom_modeling/t5_modeling.py | 1227 --------- .../models/custom_modeling/vlm.py | 10 +- .../models/flash_causal_lm.py | 2266 +++++++++-------- .../models/flash_vlm_causal_lm.py | 489 ++++ .../text_generation_server/models/globals.py | 28 +- .../models/idefics_causal_lm.py | 21 +- .../models/mllama_causal_lm.py | 179 +- .../text_generation_server/models/model.py | 1 + .../models/pali_gemma.py | 6 +- .../models/seq2seq_lm.py | 20 +- .../models/vlm_causal_lm.py | 12 +- .../server/text_generation_server/server.py | 61 +- .../text_generation_server/utils/dist.py | 51 +- .../utils/import_utils.py | 69 +- .../text_generation_server/utils/kernels.py | 22 + .../utils/prefill_chunking.py | 24 + .../utils/quantization.py | 57 +- .../text_generation_server/utils/weights.py | 20 +- backends/v3/src/block_allocator.rs | 12 +- launcher/src/env_runtime.rs | 4 +- launcher/src/main.rs | 4 +- router/src/usage_stats.rs | 59 + 91 files changed, 8804 insertions(+), 11813 deletions(-) delete mode 100644 backends/gaudi/server/text_generation_server/layers/attention/cuda.py delete mode 100644 backends/gaudi/server/text_generation_server/layers/attention/flash_attn_triton.py delete mode 100644 backends/gaudi/server/text_generation_server/layers/attention/flashinfer.py create mode 100644 backends/gaudi/server/text_generation_server/layers/attention/hpu.py delete mode 100644 backends/gaudi/server/text_generation_server/layers/attention/ipex.py create mode 100644 backends/gaudi/server/text_generation_server/layers/attention/kv_cache.py delete mode 100644 backends/gaudi/server/text_generation_server/layers/attention/rocm.py create mode 100644 backends/gaudi/server/text_generation_server/layers/awq/quantize/__init__.py create mode 100644 backends/gaudi/server/text_generation_server/layers/awq/quantize/hpu.py delete mode 100644 backends/gaudi/server/text_generation_server/layers/awq/quantize/qmodule.py delete mode 100644 backends/gaudi/server/text_generation_server/layers/eetq.py delete mode 100644 
backends/gaudi/server/text_generation_server/layers/gptq/custom_autotune.py delete mode 100644 backends/gaudi/server/text_generation_server/layers/gptq/exllama.py delete mode 100644 backends/gaudi/server/text_generation_server/layers/gptq/exllamav2.py create mode 100644 backends/gaudi/server/text_generation_server/layers/gptq/hpu.py delete mode 100644 backends/gaudi/server/text_generation_server/layers/gptq/quant_linear.py delete mode 100644 backends/gaudi/server/text_generation_server/layers/marlin/__init__.py delete mode 100644 backends/gaudi/server/text_generation_server/layers/marlin/fp8.py delete mode 100644 backends/gaudi/server/text_generation_server/layers/marlin/gptq.py delete mode 100644 backends/gaudi/server/text_generation_server/layers/marlin/marlin.py delete mode 100644 backends/gaudi/server/text_generation_server/layers/marlin/util.py create mode 100644 backends/gaudi/server/text_generation_server/layers/moe/fp8.py rename backends/gaudi/server/text_generation_server/layers/moe/{fused_moe_rocm.py => fused_moe.py} (80%) delete mode 100644 backends/gaudi/server/text_generation_server/layers/moe/gptq_marlin.py create mode 100644 backends/gaudi/server/text_generation_server/models/custom_modeling/flash_deepseek_v3_modeling.py create mode 100644 backends/gaudi/server/text_generation_server/models/custom_modeling/flash_llava_next.py create mode 100644 backends/gaudi/server/text_generation_server/models/custom_modeling/flash_mllama.py create mode 100644 backends/gaudi/server/text_generation_server/models/custom_modeling/idefics3.py delete mode 100644 backends/gaudi/server/text_generation_server/models/custom_modeling/mpt_modeling.py delete mode 100644 backends/gaudi/server/text_generation_server/models/custom_modeling/neox_modeling.py delete mode 100644 backends/gaudi/server/text_generation_server/models/custom_modeling/opt_modeling.py delete mode 100644 backends/gaudi/server/text_generation_server/models/custom_modeling/phi_modeling.py create mode 100644 backends/gaudi/server/text_generation_server/models/custom_modeling/qwen2_5_vl.py create mode 100644 backends/gaudi/server/text_generation_server/models/custom_modeling/qwen2_vl.py delete mode 100644 backends/gaudi/server/text_generation_server/models/custom_modeling/t5_modeling.py create mode 100644 backends/gaudi/server/text_generation_server/models/flash_vlm_causal_lm.py create mode 100644 backends/gaudi/server/text_generation_server/utils/kernels.py create mode 100644 backends/gaudi/server/text_generation_server/utils/prefill_chunking.py diff --git a/Dockerfile_gaudi b/Dockerfile_gaudi index eff87ab65..06073fe40 100644 --- a/Dockerfile_gaudi +++ b/Dockerfile_gaudi @@ -95,7 +95,7 @@ RUN cd server && \ pip install "git+https://github.com/HabanaAI/DeepSpeed.git@${HABANA_VERSION}" && \ BUILD_CUDA_EXT=0 pip install git+https://github.com/AutoGPTQ/AutoGPTQ.git@097dd04e --no-build-isolation && \ pip install . 
--no-cache-dir - +RUN pip install git+https://github.com/sywangyi/vllm-hpu-extension.git # Install benchmarker COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark # Install router diff --git a/backends/gaudi/server/text_generation_server/cli.py b/backends/gaudi/server/text_generation_server/cli.py index 700f763e9..53837ef71 100644 --- a/backends/gaudi/server/text_generation_server/cli.py +++ b/backends/gaudi/server/text_generation_server/cli.py @@ -16,15 +16,9 @@ app = typer.Typer() class Quantization(str, Enum): - bitsandbytes = "bitsandbytes" - bitsandbytes_nf4 = "bitsandbytes-nf4" - bitsandbytes_fp4 = "bitsandbytes-fp4" gptq = "gptq" awq = "awq" - eetq = "eetq" - exl2 = "exl2" fp8 = "fp8" - marlin = "marlin" class Dtype(str, Enum): @@ -105,6 +99,9 @@ def serve( "bitsandbytes", "bitsandbytes-nf4", "bitsandbytes-fp4", + "gptq", + "awq", + "fp8", }: raise RuntimeError( "Only 1 can be set between `dtype` and `quantize`, as they both decide how goes the final model." @@ -112,7 +109,7 @@ def serve( logger.info("CLI SHARDED = {} DTYPE = {}".format(sharded, dtype)) - if sharded: + if sharded and os.getenv("ATTENTION", "default") not in {"paged"}: tgi_file = Path(__file__).resolve().parent / "tgi_service.py" num_shard = int(os.getenv("WORLD_SIZE", "1")) logger.info("CLI SHARDED = {}".format(num_shard)) diff --git a/backends/gaudi/server/text_generation_server/layers/attention/__init__.py b/backends/gaudi/server/text_generation_server/layers/attention/__init__.py index 4d83a11fc..9ba9f6e08 100644 --- a/backends/gaudi/server/text_generation_server/layers/attention/__init__.py +++ b/backends/gaudi/server/text_generation_server/layers/attention/__init__.py @@ -1,43 +1,28 @@ -from text_generation_server.utils.import_utils import SYSTEM -import os +from .common import ( + Seqlen, + HPUPagedAttentionMetadata, + trim_attn_metadata, + trim_seqlen_metadata, +) -from .common import Seqlen +from .hpu import ( + SUPPORTS_WINDOWING, + attention, + paged_attention, +) -if os.getenv("USE_FLASH_ATTENTION", "true").lower() == "false": - raise ImportError("`USE_FLASH_ATTENTION` is false.") -if SYSTEM == "cuda": - from .cuda import ( - attention, - paged_attention, - reshape_and_cache, - SUPPORTS_WINDOWING, - PREFILL_IN_KV_CACHE, - ) -elif SYSTEM == "rocm": - from .rocm import ( - attention, - paged_attention, - reshape_and_cache, - PREFILL_IN_KV_CACHE, - SUPPORTS_WINDOWING, - ) -elif SYSTEM == "ipex": - from .ipex import ( - attention, - paged_attention, - reshape_and_cache, - PREFILL_IN_KV_CACHE, - SUPPORTS_WINDOWING, - ) -else: - raise ImportError(f"System {SYSTEM} doesn't support flash/paged attention") +# KVCache needs `reshape_and_cache`, so ensure that it is defined already. 
+from .kv_cache import KVCache, get_kv_scales __all__ = [ "attention", + "get_kv_scales", "paged_attention", - "reshape_and_cache", - "PREFILL_IN_KV_CACHE", "SUPPORTS_WINDOWING", + "KVCache", "Seqlen", + "HPUPagedAttentionMetadata", + "trim_seqlen_metadata", + "trim_attn_metadata", ] diff --git a/backends/gaudi/server/text_generation_server/layers/attention/common.py b/backends/gaudi/server/text_generation_server/layers/attention/common.py index d6e512c01..8ec9fb461 100644 --- a/backends/gaudi/server/text_generation_server/layers/attention/common.py +++ b/backends/gaudi/server/text_generation_server/layers/attention/common.py @@ -1,72 +1,147 @@ from dataclasses import dataclass -from text_generation_server.utils.import_utils import SYSTEM -from text_generation_server.models.globals import ATTENTION import torch -from typing import Optional +from typing import Optional, List, Dict +import collections + +_TYPE_CACHE = {} -if ATTENTION in {"flashinfer", "flashdecoding"}: +@dataclass +class HPUPagedAttentionMetadata: + """Metadata for PagedAttention.""" - @dataclass - class Seqlen: - input_lengths: torch.Tensor - prefix_lengths: torch.Tensor - cu_seqlen_q: Optional[torch.Tensor] - cu_seqlen_k: Optional[torch.Tensor] - max_q: int - max_k: int + block_list: Optional[torch.Tensor] + block_mapping: Optional[torch.Tensor] + block_usage: Optional[torch.Tensor] + block_scales: Optional[torch.Tensor] + block_groups: Optional[torch.Tensor] + attn_bias: Optional[torch.Tensor] - def __init__( - self, - input_lengths, - prefix_lengths, - cu_seqlen_q=None, - max_q=None, - max_k=None, - ): - self.input_lengths = input_lengths - self.prefix_lengths = prefix_lengths - device = self.input_lengths.device - shape = self.input_lengths.shape - if cu_seqlen_q is None: - cu_seqlen_q = torch.arange( - shape[0] + 1, - device=device, - dtype=torch.int32, - ) - max_q = 1 - else: - assert max_q is not None - assert max_k is not None - cu_seqlen_k = torch.zeros(shape[-1] + 1, device=device, dtype=torch.int32) - # cuda graphs don't like this and this is necessary to clamp within mistral - # Although FA2 might not want the clamping - # cu_seqlen_k[0] = 0 - total = self.input_lengths + self.prefix_lengths - torch.cumsum(total, -1, out=cu_seqlen_k[1:]) +def subtuple( + obj: object, + typename: str, + to_copy: List[str], + to_override: Optional[Dict[str, object]] = None, +): + if obj is None: + return None + if to_override is None: + to_override = {} + fields = set(to_copy) | set(to_override.keys()) + if isinstance(obj, dict): + values = {key: obj[key] for key in fields if key in obj} + else: + values = {f: to_override.get(f, getattr(obj, f)) for f in fields} + if typename not in _TYPE_CACHE: + _TYPE_CACHE[typename] = collections.namedtuple(typename, " ".join(fields)) + return _TYPE_CACHE[typename](**values) - self.cu_seqlen_q = cu_seqlen_q - self.cu_seqlen_k = cu_seqlen_k - self.max_q = max_q - self.max_k = max_k - def clamp(self, max): - # Flash decoding doesn't need to clamp - return self +def trim_attn_metadata(metadata: HPUPagedAttentionMetadata) -> object: + # NOTE(kzawora): To anyone working on this in the future: + # Trimming metadata is required when using HPUGraphs. + # Attention metadata is going to be hashed by PT bridge, and + # appropriate HPUGraphs will be matched based on all inputs' hash. -else: + # Before you put more keys in here, make sure you know their + # value type and make sure you know how it's going to be hashed. 
+ # You can find that information in input_hash function + # in habana_frameworks/torch/hpu/graphs.py. You can also hash + # it manually with torch.hpu.graphs.input_hash(attention_metadata) - @dataclass - class Seqlen: - input_lengths: torch.Tensor - prefix_lengths: torch.Tensor - cu_seqlen_q: torch.Tensor - max_q: int - max_k: int + # If you use primitive types here - they will get hashed based + # on their value. You *will* get lots of excessive graph captures + # (and an OOM eventually) if you decide to put something like + # seq_len int here. + # If you absolutely need a scalar, put it in a tensor. Tensors + # get hashed using their metadata, not their values: + # input_hash(torch.tensor(123)) == input_hash(torch.tensor(321)) + # input_hash(123) != input_hash(321) + # input_hash("abc") != input_hash("cba") + attention_metadata = subtuple( + metadata, + "TrimmedAttentionMetadata", + [ + "block_list", + "block_mapping", + "block_usage", + "block_scales", + "block_groups", + "attn_bias", + ], + ) + return attention_metadata - def clamp(self, max): - if SYSTEM == "rocm": - return self - raise NotImplementedError("Not implemented seqlen for paged") - return Seqlen(torch.clamp(self.input_lengths, max=max)) + +@dataclass +class Seqlen: + input_lengths: torch.Tensor + cache_lengths: torch.Tensor + cu_seqlen_q: Optional[torch.Tensor] + cu_seqlen_k: Optional[torch.Tensor] + + def __init__( + self, + input_lengths, + cache_lengths, + cu_seqlen_q=None, + ): + self.input_lengths = input_lengths + self.cache_lengths = cache_lengths + device = self.input_lengths.device + shape = self.input_lengths.shape + if cu_seqlen_q is None: + cu_seqlen_q = torch.arange( + shape[0] + 1, + device=device, + dtype=torch.int32, + ) + cu_seqlen_k = torch.zeros(shape[-1] + 1, device=device, dtype=torch.int32) + + # cuda graphs don't like this and this is necessary to clamp within mistral + # Although FA2 might not want the clamping + # cu_seqlen_k[0] = 0 + total = self.input_lengths + self.cache_lengths + torch.cumsum(total, -1, out=cu_seqlen_k[1:]) + + self.cu_seqlen_q = cu_seqlen_q + self.cu_seqlen_k = cu_seqlen_k + + def clamp(self, max): + # Flash decoding doesn't need to clamp + return self + + +def trim_seqlen_metadata(metadata: Seqlen) -> object: + # NOTE(kzawora): To anyone working on this in the future: + # Trimming metadata is required when using HPUGraphs. + # Attention metadata is going to be hashed by PT bridge, and + # appropriate HPUGraphs will be matched based on all inputs' hash. + + # Before you put more keys in here, make sure you know their + # value type and make sure you know how it's going to be hashed. + # You can find that information in input_hash function + # in habana_frameworks/torch/hpu/graphs.py. You can also hash + # it manually with torch.hpu.graphs.input_hash(attention_metadata) + + # If you use primitive types here - they will get hashed based + # on their value. You *will* get lots of excessive graph captures + # (and an OOM eventually) if you decide to put something like + # seq_len int here. + # If you absolutely need a scalar, put it in a tensor. 
Tensors + # get hashed using their metadata, not their values: + # input_hash(torch.tensor(123)) == input_hash(torch.tensor(321)) + # input_hash(123) != input_hash(321) + # input_hash("abc") != input_hash("cba") + attention_metadata = subtuple( + metadata, + "TrimmedSeqlen", + [ + "input_lengths", + "cache_lengths", + "cu_seqlen_q", + "cu_seqlen_k", + ], + ) + return attention_metadata diff --git a/backends/gaudi/server/text_generation_server/layers/attention/cuda.py b/backends/gaudi/server/text_generation_server/layers/attention/cuda.py deleted file mode 100644 index 51af928d5..000000000 --- a/backends/gaudi/server/text_generation_server/layers/attention/cuda.py +++ /dev/null @@ -1,357 +0,0 @@ -import torch -from text_generation_server.utils.import_utils import SYSTEM -from text_generation_server.models.globals import ( - ATTENTION, - BLOCK_SIZE, -) -from text_generation_server.layers.attention import Seqlen -from typing import Optional - -major, minor = torch.cuda.get_device_capability() -is_sm75 = major == 7 and minor == 5 -_PARTITION_SIZE = 512 - -try: - from vllm._C import cache_ops -except Exception as e: - raise ImportError( - f"Could not import vllm paged attention. Make sure your installation is correct. Complete error: {e}" - ) - - -def reshape_and_cache( - key: torch.Tensor, - value: torch.Tensor, - key_cache: torch.Tensor, - value_cache: torch.Tensor, - slots: torch.Tensor, -): - if ATTENTION in {"flashdecoding", "flashinfer"}: - shape = key_cache.shape - key_cache.view(-1, shape[-2], shape[-1])[slots] = key - value_cache.view(-1, shape[-2], shape[-1])[slots] = value - else: - cache_ops.reshape_and_cache( - key, value, key_cache, value_cache, slots, "auto", 1.0 - ) - - -def paged_attention( - query: torch.Tensor, - key_cache: torch.Tensor, - value_cache: torch.Tensor, - kv_head_mapping: torch.Tensor, - softmax_scale: float, - block_tables: torch.Tensor, - seqlen: Seqlen, - max_s: int, - softcap: Optional[float] = None, -): - # Adapted from: https://github.com/vllm-project/vllm/blob/f8a1e39fae05ca610be8d5a78be9d40f5274e5fc/vllm/model_executor/layers/attention.py - # Copyright 2023 The vLLM team. All rights - # reserved. - # - # Licensed under the Apache License, Version 2.0 (the "License"); - # you may not use this file except in compliance with the License. - # You may obtain a copy of the License at - # - # http://www.apache.org/licenses/LICENSE-2.0 - # - # Unless required by applicable law or agreed to in writing, software - # distributed under the License is distributed on an "AS IS" BASIS, - # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - # See the License for the specific language governing permissions and - # limitations under the License. - # - - # value_cache => [num_blocks, num_heads, head_size, block_size] - # block_size = value_cache.shape[3] - block_size = BLOCK_SIZE - num_seqs, num_heads, head_size = query.shape - max_num_partitions = (max_s + _PARTITION_SIZE - 1) // _PARTITION_SIZE - - # NOTE(woosuk): We use a simple heuristic to decide whether to use - # PagedAttention V1 or V2. If the number of partitions is 1, we use - # V1 to avoid the overhead of reduction. Also, if the number of - # sequences or heads is large, we use V1 since there is enough work - # to parallelize. 
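For context on the new `Seqlen` introduced in common.py above, the following minimal sketch (plain PyTorch, with made-up request lengths that are not part of this patch) shows how the cumulative sequence-length tensors are derived from per-request input and cache lengths:

import torch

# Illustrative only: two requests with 3 and 5 new tokens and 0 and 2 cached tokens.
input_lengths = torch.tensor([3, 5], dtype=torch.int32)
cache_lengths = torch.tensor([0, 2], dtype=torch.int32)

# Decode-style cu_seqlen_q: one query token per request when no value is passed in,
# mirroring the default branch of Seqlen.__init__.
cu_seqlen_q = torch.arange(input_lengths.shape[0] + 1, dtype=torch.int32)

# cu_seqlen_k is the running total of input + cache lengths, as in Seqlen.__init__.
cu_seqlen_k = torch.zeros(input_lengths.shape[-1] + 1, dtype=torch.int32)
torch.cumsum(input_lengths + cache_lengths, -1, out=cu_seqlen_k[1:])

print(cu_seqlen_q.tolist())  # [0, 1, 2]
print(cu_seqlen_k.tolist())  # [0, 3, 10]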
- if ATTENTION == "flashinfer": - from text_generation_server.layers.attention.flashinfer import decode_state - - return decode_state.get().forward( - query.contiguous(), - paged_kv_cache=(key_cache, value_cache), - logits_soft_cap=softcap, - sm_scale=softmax_scale, - ) - elif ATTENTION == "flashdecoding": - max_q = 1 - max_k = max_s - import flash_attn_2_cuda - - # TODO fixme when flash contains the fix. - # Number of splits is not correctly handled - # by the current path - # https://github.com/Dao-AILab/flash-attention/blob/320fb59487658f033f56711efd3d61b7c7a6f8f3/csrc/flash_attn/flash_api.cpp#L577 - # This fails becuase we're using causal, therefore window_right is set to 0 and the split logic is never applied. - if softcap is None: - softcap = 0.0 - out = flash_attn_2_cuda.varlen_fwd( - query, - key_cache, - value_cache, - None, - seqlen.cu_seqlen_q, - seqlen.cu_seqlen_k, - None, # pad_k - None, - block_tables, - None, - max_q, - max_k, - 0.0, # dropout - softmax_scale, - False, # zero_tensors - True, # causal - -1, # Window_left - -1, # Window right - softcap, - False, # return softmax - None, # generator - ) - return out[0] - else: - if softcap is not None: - raise RuntimeError("Paged attention doesn't support softcapping") - input_lengths = seqlen.input_lengths - from vllm._C import ops - - out = torch.empty_like(query) - - use_v1 = max_s <= 8192 and ( - max_num_partitions == 1 or num_seqs * num_heads > 512 - ) - if use_v1: - ops.paged_attention_v1( - out, - query, - key_cache, - value_cache, - kv_head_mapping, - softmax_scale, - block_tables, - input_lengths, - block_size, - max_s, - None, - "auto", - 1.0, - ) - else: - # Run PagedAttention V2. - assert _PARTITION_SIZE % block_size == 0 - tmp_output = torch.empty( - size=(num_seqs, num_heads, max_num_partitions, head_size), - dtype=out.dtype, - device=out.device, - ) - exp_sums = torch.empty( - size=(num_seqs, num_heads, max_num_partitions), - dtype=torch.float32, - device=out.device, - ) - max_logits = torch.empty_like(exp_sums) - - ops.paged_attention_v2( - out, - exp_sums, - max_logits, - tmp_output, - query, - key_cache, - value_cache, - kv_head_mapping, - softmax_scale, - block_tables, - input_lengths, - block_size, - max_s, - None, - "auto", - 1.0, - ) - return out - - -try: - is_ampere_or_newer = major >= 8 and minor >= 0 - if not is_ampere_or_newer: - raise ImportError("FlashAttention only supports Ampere GPUs or newer.") - - import flash_attn_2_cuda - - V2 = True -except ImportError: - try: - import flash_attn_cuda - - V2 = False - except ImportError as e: - if major >= 8: - architecture_suffix = f"-{SYSTEM}" - raise ImportError( - "Flash Attention V2 is not installed.\n" - "Use the official Docker image (ghcr.io/huggingface/text-generation-inference:latest) " - f"or install flash attention v2 with `cd server && make install install-flash-attention-v2{architecture_suffix}`" - ) - elif is_sm75: - raise ImportError( - "Flash Attention is not installed.\n" - "Use the official Docker image (ghcr.io/huggingface/text-generation-inference:latest) " - "or install flash attention with `cd server && make install install-flash-attention`" - ) from e - else: - raise ImportError( - f"GPU with CUDA capability {major} {minor} is not supported" - ) from e - - -SUPPORTS_WINDOWING = V2 - -if ATTENTION == "flashinfer": - - def attention( - q: torch.Tensor, - key_cache: torch.Tensor, - value_cache: torch.Tensor, - seqlen: Seqlen, - block_tables: torch.Tensor, - softmax_scale, - window_size_left=-1, - causal=True, - softcap=0.0, - ): - from 
text_generation_server.layers.attention.flashinfer import ( - prefill_with_paged_kv_state, - ) - - return prefill_with_paged_kv_state.get().forward( - q.contiguous(), - causal=causal, - paged_kv_cache=(key_cache, value_cache), - logits_soft_cap=softcap, - sm_scale=softmax_scale, - window_left=window_size_left, - ) - -elif V2: - - def attention( - q, - key_cache: torch.Tensor, - value_cache: torch.Tensor, - seqlen: Seqlen, - block_tables: torch.Tensor, - softmax_scale, - window_size_left=-1, - causal=True, - softcap=0.0, - ): - out = torch.empty_like(q) - if window_size_left <= 0 and window_size_left != -1: - raise ValueError("`window_size_left` must be > 0 or -1") - return flash_attn_2_cuda.varlen_fwd( - q, - key_cache, - value_cache, - out, - seqlen.cu_seqlen_q, - seqlen.cu_seqlen_k, - None, - None, - block_tables, - None, - seqlen.max_q, - seqlen.max_k, - 0.0, - softmax_scale, - False, - causal, - window_size_left, - 0, - softcap, - False, - None, - )[0] - -else: - - def attention( - q: torch.Tensor, - k: torch.Tensor, - v: torch.Tensor, - seqlen: Seqlen, - block_tables: torch.Tensor, - softmax_scale: float, - window_size_left: int = -1, - causal: bool = True, - softcap=None, - ): - if window_size_left != -1: - raise NotImplementedError( - "window_size_left is only available with flash attn v2" - ) - if softcap is not None: - raise NotImplementedError("softcap is only available with flash attn v2") - - # Flash attention v1 requires q, k and v to have the same number of heads - if k.shape[1] != q.shape[1]: - # MQA expand - if k.shape[1] == 1: - k = k.expand(-1, q.shape[1], -1) - # Grouped attention reshape - else: - original_shape = k.shape - k = ( - k.unsqueeze(2) - .expand(-1, -1, q.shape[1] // k.shape[1], -1) - .reshape(original_shape[0], -1, original_shape[2]) - ) - if v.shape[1] != q.shape[1]: - # MQA expand - if v.shape[1] == 1: - v = v.expand(-1, q.shape[1], -1) - # Grouped attention reshape - else: - original_shape = v.shape - v = ( - v.unsqueeze(2) - .expand(-1, -1, q.shape[1] // v.shape[1], -1) - .reshape(original_shape[0], -1, original_shape[2]) - ) - - out = torch.empty_like(q) - flash_attn_cuda.fwd( - q, - k, - v, - out, - seqlen.cu_seqlen_q, - seqlen.cu_seqlen_q, - seqlen.max_q, - seqlen.max_k, - 0.0, - softmax_scale, - False, - causal, - False, - 0, - None, - ) - return out - - -# Prefill in the cache with every kind of attention, unless we -# have a configuration that requires flash-attention v1, which -# does not support block tables. -PREFILL_IN_KV_CACHE = ATTENTION != "paged" or V2 diff --git a/backends/gaudi/server/text_generation_server/layers/attention/flash_attn_triton.py b/backends/gaudi/server/text_generation_server/layers/attention/flash_attn_triton.py deleted file mode 100644 index 3a6f9a730..000000000 --- a/backends/gaudi/server/text_generation_server/layers/attention/flash_attn_triton.py +++ /dev/null @@ -1,813 +0,0 @@ -#!/usr/bin/env python -""" -Fused Attention -=============== - -This is a Triton implementation of the Flash Attention v2 algorithm from Tri Dao -(https://tridao.me/publications/flash2/flash2.pdf) -Credits: OpenAI kernel team, AMD ML Frameworks Triton team - -Features supported: - -1) Fwd with causal masking -2) Any sequence lengths without padding (currently fwd kernel only) -3) Support for different sequence lengths for q and k -4) Nested tensor API currently does not support dropout or bias. 
- -Not currently supported: - -1) Non power of two head dims - -""" - -import torch -import triton -import triton.language as tl - -torch_dtype: tl.constexpr = torch.float16 - - -@triton.jit -def cdiv_fn(x, y): - return (x + y - 1) // y - - -@triton.jit -def max_fn(x, y): - return tl.math.max(x, y) - - -@triton.jit -def dropout_offsets(philox_seed, philox_offset, dropout_p, m, n, stride): - ms = tl.arange(0, m) - ns = tl.arange(0, n) - return philox_offset + ms[:, None] * stride + ns[None, :] - - -@triton.jit -def dropout_rng(philox_seed, philox_offset, dropout_p, m, n, stride): - rng_offsets = dropout_offsets( - philox_seed, philox_offset, dropout_p, m, n, stride - ).to(tl.uint32) - # TODO: use tl.randint for better performance - return tl.rand(philox_seed, rng_offsets) - - -@triton.jit -def dropout_mask(philox_seed, philox_offset, dropout_p, m, n, stride): - rng_output = dropout_rng(philox_seed, philox_offset, dropout_p, m, n, stride) - rng_keep = rng_output > dropout_p - return rng_keep - - -@triton.jit -def load_fn(block_ptr, first, second, pad): - if first and second: - tensor = tl.load(block_ptr, boundary_check=(0, 1), padding_option=pad) - elif first: - tensor = tl.load(block_ptr, boundary_check=(0,), padding_option=pad) - elif second: - tensor = tl.load(block_ptr, boundary_check=(1,), padding_option=pad) - else: - tensor = tl.load(block_ptr) - return tensor - - -@triton.jit -def _attn_fwd_inner( - acc, - l_i, - m_i, - q, - K_block_ptr, - V_block_ptr, - start_m, - actual_seqlen_k, - dropout_p, - philox_seed, - batch_philox_offset, - encoded_softmax_block_ptr, - block_min, - block_max, - offs_n_causal, - masked_blocks, - n_extra_tokens, - bias_ptr, - IS_CAUSAL: tl.constexpr, - BLOCK_M: tl.constexpr, - BLOCK_DMODEL: tl.constexpr, - BLOCK_N: tl.constexpr, - OFFS_M: tl.constexpr, - OFFS_N: tl.constexpr, - PRE_LOAD_V: tl.constexpr, - MASK_STEPS: tl.constexpr, - ENABLE_DROPOUT: tl.constexpr, - RETURN_ENCODED_SOFTMAX: tl.constexpr, - PADDED_HEAD: tl.constexpr, -): - # loop over k, v, and update accumulator - for start_n in range(block_min, block_max, BLOCK_N): - # For padded blocks, we will overrun the tensor size if - # we load all BLOCK_N. For others, the blocks are all within range. - k = load_fn( - K_block_ptr, - PADDED_HEAD, - MASK_STEPS and (n_extra_tokens != 0), - "zero", - ) - if PRE_LOAD_V: - v = load_fn( - V_block_ptr, - MASK_STEPS and (n_extra_tokens != 0), - PADDED_HEAD, - "zero", - ) - qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) - # We start from end of seqlen_k so only the first iteration would need - # to be checked for padding if it is not a multiple of block_n - # TODO: This can be optimized to only be true for the padded block. - if MASK_STEPS: # noqa: SIM102 - # If this is the last block / iteration, we want to - # mask if the sequence length is not a multiple of block size - # a solution is to always do BLOCK_M // BLOCK_N + 1 steps - # if not is_modulo_mn. last step might get wasted but that is okay. - # check if this masking works for that case. 
- if (start_n + BLOCK_N == block_max) and (n_extra_tokens != 0): - boundary_m = tl.full([BLOCK_M], actual_seqlen_k, dtype=tl.int32) - size_n = start_n + OFFS_N[None, :] - mask = size_n < boundary_m[:, None] - qk = tl.where(mask, qk, float("-inf")) - if IS_CAUSAL: - causal_boundary = start_n + offs_n_causal - causal_mask = OFFS_M[:, None] >= causal_boundary[None, :] - qk = tl.where(causal_mask, qk, float("-inf")) - # -- compute qk ---- - qk += tl.dot(q, k) - if bias_ptr is not None: - bias = load_fn( - bias_ptr, False, MASK_STEPS and (n_extra_tokens != 0), "zero" - ) - # While bias is added after multiplying qk with sm_scale, our - # optimization to use 2^x instead of e^x results in an additional - # scale factor of log2(e) which we must also multiply the bias with. - qk += bias * 1.44269504089 - m_ij = tl.maximum(m_i, tl.max(qk, 1)) - qk = qk - m_ij[:, None] - p = tl.math.exp2(qk) - - # CAVEAT: Must update l_ij before applying dropout - l_ij = tl.sum(p, 1) - if ENABLE_DROPOUT: - philox_offset = ( - batch_philox_offset - + start_m * BLOCK_M * actual_seqlen_k - + start_n - - BLOCK_N - ) - keep = dropout_mask( - philox_seed, - philox_offset, - dropout_p, - BLOCK_M, - BLOCK_N, - actual_seqlen_k, - ) - if RETURN_ENCODED_SOFTMAX: - tl.store( - encoded_softmax_block_ptr, - tl.where(keep, p, -p).to(encoded_softmax_block_ptr.type.element_ty), - ) - p = tl.where(keep, p, 0.0) - elif RETURN_ENCODED_SOFTMAX: - tl.store( - encoded_softmax_block_ptr, - p.to(encoded_softmax_block_ptr.type.element_ty), - ) - # -- update output accumulator -- - alpha = tl.math.exp2(m_i - m_ij) - acc = acc * alpha[:, None] - if not PRE_LOAD_V: - v = load_fn( - V_block_ptr, - MASK_STEPS and (n_extra_tokens != 0), - PADDED_HEAD, - "zero", - ) - # -- update m_i and l_i - l_i = l_i * alpha + l_ij - # update m_i and l_i - m_i = m_ij - acc += tl.dot(p.to(V_block_ptr.type.element_ty), v) - V_block_ptr = tl.advance(V_block_ptr, (BLOCK_N, 0)) - K_block_ptr = tl.advance(K_block_ptr, (0, BLOCK_N)) - if bias_ptr is not None: - bias_ptr = tl.advance(bias_ptr, (0, BLOCK_N)) - if RETURN_ENCODED_SOFTMAX: - encoded_softmax_block_ptr = tl.advance( - encoded_softmax_block_ptr, (0, BLOCK_N) - ) - return acc, l_i, m_i - - -@triton.autotune( - configs=[ - triton.Config( - { - "BLOCK_M": 256, - "BLOCK_N": 64, - "waves_per_eu": 2, - "PRE_LOAD_V": False, - }, - num_stages=1, - num_warps=8, - ), - triton.Config( - { - "BLOCK_M": 128, - "BLOCK_N": 128, - "waves_per_eu": 2, - "PRE_LOAD_V": False, - }, - num_stages=1, - num_warps=4, - ), - triton.Config( - { - "BLOCK_M": 256, - "BLOCK_N": 128, - "waves_per_eu": 2, - "PRE_LOAD_V": False, - }, - num_stages=1, - num_warps=8, - ), - triton.Config( - { - "BLOCK_M": 128, - "BLOCK_N": 64, - "waves_per_eu": 3, - "PRE_LOAD_V": True, - }, - num_stages=1, - num_warps=4, - ), - triton.Config( - { - "BLOCK_M": 128, - "BLOCK_N": 64, - "waves_per_eu": 3, - "PRE_LOAD_V": False, - }, - num_stages=1, - num_warps=4, - ), - triton.Config( - { - "BLOCK_M": 64, - "BLOCK_N": 64, - "waves_per_eu": 4, - "PRE_LOAD_V": False, - }, - num_stages=1, - num_warps=8, - ), - triton.Config( - { - "BLOCK_M": 32, - "BLOCK_N": 32, - "waves_per_eu": 4, - "PRE_LOAD_V": False, - }, - num_stages=1, - num_warps=8, - ), - # TODO: This config fails with head_size not pow2 with data mismatches. 
- # triton.Config({'BLOCK_M': 32, 'BLOCK_N': 16, 'waves_per_eu': 1, - # 'PRE_LOAD_V': False}, num_stages=1, num_warps=4), - triton.Config( - { - "BLOCK_M": 16, - "BLOCK_N": 16, - "waves_per_eu": 1, - "PRE_LOAD_V": False, - }, - num_stages=1, - num_warps=4, - ), - triton.Config( - { - "BLOCK_M": 128, - "BLOCK_N": 64, - "waves_per_eu": 1, - "PRE_LOAD_V": False, - }, - num_stages=1, - num_warps=4, - ), - ], - key=["IS_CAUSAL", "dropout_p", "BLOCK_DMODEL"], -) -@triton.jit -def attn_fwd( - Q, - K, - V, - bias, - sm_scale, - L, - Out, - stride_qz, - stride_qh, - stride_qm, - stride_qk, - stride_kz, - stride_kh, - stride_kn, - stride_kk, - stride_vz, - stride_vh, - stride_vk, - stride_vn, - stride_oz, - stride_oh, - stride_om, - stride_on, - stride_bz, - stride_bh, - stride_bm, - stride_bn, - cu_seqlens_q, - cu_seqlens_k, - dropout_p, - philox_seed, - philox_offset_base, - encoded_softmax, - HQ: tl.constexpr, - HK: tl.constexpr, - ACTUAL_BLOCK_DMODEL: tl.constexpr, - MAX_SEQLENS_Q: tl.constexpr, - MAX_SEQLENS_K: tl.constexpr, - VARLEN: tl.constexpr, - IS_CAUSAL: tl.constexpr, - BLOCK_M: tl.constexpr, - BLOCK_DMODEL: tl.constexpr, - BLOCK_N: tl.constexpr, - PRE_LOAD_V: tl.constexpr, - BIAS_TYPE: tl.constexpr, - ENABLE_DROPOUT: tl.constexpr, - RETURN_ENCODED_SOFTMAX: tl.constexpr, -): - start_m = tl.program_id(0) - off_h_q = tl.program_id(1) - off_z = tl.program_id(2) - offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M) - offs_n = tl.arange(0, BLOCK_N) - if VARLEN: - cu_seqlens_q_start = tl.load(cu_seqlens_q + off_z) - cu_seqlens_q_end = tl.load(cu_seqlens_q + off_z + 1) - seqlen_q = cu_seqlens_q_end - cu_seqlens_q_start - # We have a one-size-fits-all grid in id(0). Some seqlens might be too - # small for all start_m so for those we return early. - if start_m * BLOCK_M > seqlen_q: - return - cu_seqlens_k_start = tl.load(cu_seqlens_k + off_z) - cu_seqlens_k_end = tl.load(cu_seqlens_k + off_z + 1) - seqlen_k = cu_seqlens_k_end - cu_seqlens_k_start - else: - cu_seqlens_q_start = 0 - cu_seqlens_k_start = 0 - seqlen_q = MAX_SEQLENS_Q - seqlen_k = MAX_SEQLENS_K - - # Now we compute whether we need to exit early due to causal masking. - # This is because for seqlen_q > seqlen_k, M rows of the attn scores - # are completely masked, resulting in 0s written to the output, and - # inf written to LSE. We don't need to do any GEMMs in this case. - # This block of code determines what N is, and if this WG is operating - # on those M rows. - n_blocks = cdiv_fn(seqlen_k, BLOCK_N) - if IS_CAUSAL: - # If seqlen_q == seqlen_k, the attn scores are a square matrix. - # If seqlen_q != seqlen_k, attn scores are rectangular which means - # the causal mask boundary is bottom right aligned, and ends at either - # the top edge (seqlen_q < seqlen_k) or left edge. - # This captures the decrease in n_blocks if we have a rectangular attn - # matrix - n_blocks_seqlen = cdiv_fn( - (start_m + 1) * BLOCK_M + seqlen_k - seqlen_q, BLOCK_N - ) - # This is what adjusts the block_max for the current WG, only - # if IS_CAUSAL. Otherwise we want to always iterate through all n_blocks - n_blocks = min(n_blocks, n_blocks_seqlen) - # If we have no blocks after adjusting for seqlen deltas, this WG is - # part of the blocks that are all 0. We exit early. 
- if n_blocks <= 0: - o_offset = ( - off_z * stride_oz + cu_seqlens_q_start * stride_om + off_h_q * stride_oh - ) - O_block_ptr = tl.make_block_ptr( - base=Out + o_offset, - shape=(seqlen_q, BLOCK_DMODEL), - strides=(stride_om, stride_on), - offsets=(start_m * BLOCK_M, 0), - block_shape=(BLOCK_M, BLOCK_DMODEL), - order=(1, 0), - ) - acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=Out.type.element_ty) - # We still need to write 0s to the result - # tl.store(O_block_ptr, - # acc.to(Out.type.element_ty), boundary_check=(0,1)) - # l_ptrs = L + off_z * hq * MAX_SEQLENS_Q + off_h_q * MAX_SEQLENS_Q - # + offs_m - # We store inf to LSE, not -inf because in the bwd pass, - # we subtract this - # from qk which makes it -inf, such that exp(qk - inf) = 0 - # for these masked blocks. - # l = tl.full([BLOCK_M], value=float("inf"), dtype=tl.float32) - # tl.store(l_ptrs, l) - # TODO: Should dropout and return encoded softmax be handled here? - return - - # If MQA / GQA, set the K and V head offsets appropriately. - GROUP_SIZE: tl.constexpr = HQ // HK - if GROUP_SIZE != 1: - off_h_k = off_h_q // GROUP_SIZE - else: - off_h_k = off_h_q - - n_extra_tokens = 0 - if seqlen_k < BLOCK_N: - n_extra_tokens = BLOCK_N - seqlen_k - elif seqlen_k % BLOCK_N: - n_extra_tokens = seqlen_k % BLOCK_N - PADDED_HEAD: tl.constexpr = ACTUAL_BLOCK_DMODEL != BLOCK_DMODEL - - # Compute pointers for all the tensors used in this kernel. - q_offset = off_z * stride_qz + off_h_q * stride_qh + cu_seqlens_q_start * stride_qm - Q_block_ptr = tl.make_block_ptr( - base=Q + q_offset, - shape=(seqlen_q, ACTUAL_BLOCK_DMODEL), - strides=(stride_qm, stride_qk), - offsets=(start_m * BLOCK_M, 0), - block_shape=(BLOCK_M, BLOCK_DMODEL), - order=(1, 0), - ) - k_offset = off_z * stride_kz + off_h_k * stride_kh + cu_seqlens_k_start * stride_kn - K_block_ptr = tl.make_block_ptr( - base=K + k_offset, - shape=(ACTUAL_BLOCK_DMODEL, seqlen_k), - strides=(stride_kk, stride_kn), - offsets=(0, 0), - block_shape=(BLOCK_DMODEL, BLOCK_N), - order=(0, 1), - ) - v_offset = off_z * stride_vz + off_h_k * stride_vh + cu_seqlens_k_start * stride_vk - V_block_ptr = tl.make_block_ptr( - base=V + v_offset, - shape=(seqlen_k, ACTUAL_BLOCK_DMODEL), - strides=(stride_vk, stride_vn), - offsets=(0, 0), - block_shape=(BLOCK_N, BLOCK_DMODEL), - order=(1, 0), - ) - if BIAS_TYPE != 0: - bias_ptr = tl.make_block_ptr( - base=bias + off_h_q * stride_bh, - shape=(seqlen_q, seqlen_k), - strides=(stride_bm, stride_bn), - offsets=(start_m * BLOCK_M, 0), - block_shape=(BLOCK_M, BLOCK_N), - order=(1, 0), - ) - else: - bias_ptr = None - if ENABLE_DROPOUT: - batch_philox_offset = ( - philox_offset_base + (off_z * HQ + off_h_q) * seqlen_q * seqlen_k - ) - else: - batch_philox_offset = 0 - # We can ask to return the dropout mask without actually doing any dropout. - # In this case, we return an invalid pointer so indicate the mask is not i - # valid. - # TODO: Fix encoded softmax. It currently uses just h_q in the base offset. 
- if RETURN_ENCODED_SOFTMAX: - encoded_softmax_block_ptr = tl.make_block_ptr( - base=encoded_softmax + off_h_q * seqlen_q * seqlen_k, - shape=(seqlen_q, seqlen_k), - strides=(seqlen_k, 1), - offsets=(start_m * BLOCK_M, 0), - block_shape=(BLOCK_M, BLOCK_N), - order=(1, 0), - ) - else: - encoded_softmax_block_ptr = 0 - # initialize pointer to m and l - m_i = tl.full([BLOCK_M], float("-inf"), dtype=tl.float32) - l_i = tl.full([BLOCK_M], 1.0, dtype=tl.float32) - acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32) - # scale sm_scale by log_2(e) and use 2^x in the loop as we do not - # have native e^x support in HW. - qk_scale = sm_scale * 1.44269504089 - # Q is loaded once at the beginning and shared by all N blocks. - q = load_fn(Q_block_ptr, True, PADDED_HEAD, "zero") - q = (q * qk_scale).to(Q_block_ptr.type.element_ty) - - # Here we compute how many full and masked blocks we have. - padded_block_k = n_extra_tokens != 0 - is_modulo_mn = not padded_block_k and (seqlen_q % BLOCK_M == 0) - if IS_CAUSAL: - # There are always at least BLOCK_M // BLOCK_N masked blocks. - # Additionally there might be one more due to dissimilar seqlens. - masked_blocks = BLOCK_M // BLOCK_N + (not is_modulo_mn) - else: - # Padding on Q does not need to be masked in the FA loop. - masked_blocks = padded_block_k - # if IS_CAUSAL, not is_modulo_mn does not always result in an additional - # block. In this case we might exceed n_blocks so pick the min. - masked_blocks = min(masked_blocks, n_blocks) - n_full_blocks = n_blocks - masked_blocks - block_min = 0 - block_max = n_blocks * BLOCK_N - # Compute for full blocks. Here we set causal to false regardless of its - # value because there is no masking. Similarly we do not need padding. - if n_full_blocks > 0: - block_max = (n_blocks - masked_blocks) * BLOCK_N - acc, l_i, m_i = _attn_fwd_inner( - acc, - l_i, - m_i, - q, - K_block_ptr, - V_block_ptr, - start_m, - seqlen_k, - dropout_p, - philox_seed, - batch_philox_offset, - encoded_softmax_block_ptr, - # _, _, offs_n_causal, masked_blocks, n_extra_tokens, _ - block_min, - block_max, - 0, - 0, - 0, - bias_ptr, - # IS_CAUSAL, .... - False, - BLOCK_M, - BLOCK_DMODEL, - BLOCK_N, - offs_m, - offs_n, - # _, MASK_STEPS, ... - PRE_LOAD_V, - False, - ENABLE_DROPOUT, - RETURN_ENCODED_SOFTMAX, - PADDED_HEAD, - ) - block_min = block_max - block_max = n_blocks * BLOCK_N - - tl.debug_barrier() - # Remaining blocks, if any, are full / not masked. - if masked_blocks > 0: - offs_n_causal = offs_n + (seqlen_q - seqlen_k) if IS_CAUSAL else 0 - K_block_ptr = tl.advance(K_block_ptr, (0, n_full_blocks * BLOCK_N)) - V_block_ptr = tl.advance(V_block_ptr, (n_full_blocks * BLOCK_N, 0)) - if bias_ptr is not None: - bias_ptr = tl.advance(bias_ptr, (0, n_full_blocks * BLOCK_N)) - if RETURN_ENCODED_SOFTMAX: - encoded_softmax_block_ptr = tl.advance( - encoded_softmax_block_ptr, (0, n_full_blocks) - ) - acc, l_i, m_i = _attn_fwd_inner( - acc, - l_i, - m_i, - q, - K_block_ptr, - V_block_ptr, - start_m, - seqlen_k, - dropout_p, - philox_seed, - batch_philox_offset, - encoded_softmax_block_ptr, - block_min, - block_max, - offs_n_causal, - masked_blocks, - n_extra_tokens, - bias_ptr, - IS_CAUSAL, - BLOCK_M, - BLOCK_DMODEL, - BLOCK_N, - offs_m, - offs_n, - # _, MASK_STEPS, ... 
- PRE_LOAD_V, - True, - ENABLE_DROPOUT, - RETURN_ENCODED_SOFTMAX, - PADDED_HEAD, - ) - # epilogue - acc = acc / l_i[:, None] - if ENABLE_DROPOUT: - acc = acc / (1 - dropout_p) - # If seqlen_q > seqlen_k but the delta is not a multiple of BLOCK_M, - # then we have one block with a row of all NaNs which come from computing - # softmax over a row of all -infs (-inf - inf = NaN). We check for that here - # and store 0s where there are NaNs as these rows should've been zeroed out. - end_m_idx = (start_m + 1) * BLOCK_M - start_m_idx = start_m * BLOCK_M - causal_start_idx = seqlen_q - seqlen_k - acc = acc.to(Out.type.element_ty) - if IS_CAUSAL: # noqa: SIM102 - if causal_start_idx > start_m_idx and causal_start_idx < end_m_idx: - out_mask_boundary = tl.full( - (BLOCK_DMODEL,), causal_start_idx, dtype=tl.int32 - ) - mask_m_offsets = start_m_idx + tl.arange(0, BLOCK_M) - out_ptrs_mask = mask_m_offsets[:, None] >= out_mask_boundary[None, :] - z = 0.0 - acc = tl.where(out_ptrs_mask, acc, z.to(acc.type.element_ty)) - # write back LSE - # l_ptrs = L + off_z * hq * MAX_SEQLENS_Q + off_h_q * MAX_SEQLENS_Q + offs_m - # If seqlen_q not multiple of BLOCK_M, we need to mask out the last - # few rows. This is only true for the last M block. For others, - # overflow_size will be -ve - # overflow_size = end_m_idx - seqlen_q - # if overflow_size > 0: - # boundary = tl.full((BLOCK_M,), BLOCK_M - overflow_size, dtype=tl.int32) - # # This is a > check because mask being 0 blocks the store. - # l_ptrs_mask = boundary > tl.arange(0, BLOCK_M) - # tl.store(l_ptrs, m_i + tl.math.log2(l_i), mask=l_ptrs_mask) - # else: - # tl.store(l_ptrs, m_i + tl.math.log2(l_i)) - - # write back O - o_offset = off_z * stride_oz + cu_seqlens_q_start * stride_om + off_h_q * stride_oh - O_block_ptr = tl.make_block_ptr( - base=Out + o_offset, - shape=(seqlen_q, ACTUAL_BLOCK_DMODEL), - strides=(stride_om, stride_on), - offsets=(start_m * BLOCK_M, 0), - block_shape=(BLOCK_M, BLOCK_DMODEL), - order=(1, 0), - ) - # Need boundary check on this to make sure the padding from the - # Q and KV tensors in both dims are not part of what we store back. - # TODO: Do the boundary check optionally. 
- tl.store(O_block_ptr, acc, boundary_check=(0, 1)) - - -def check_args( - q, - k, - v, - o, - varlen=True, - max_seqlens=None, - cu_seqlens_q=None, - cu_seqlens_k=None, -): - assert q.dim() == k.dim() and q.dim() == v.dim() - if varlen: - assert q.dim() == 3 - total_q, nheads_q, head_size = q.shape - total_k, nheads_k, _ = k.shape - assert cu_seqlens_q is not None - assert cu_seqlens_k is not None - assert len(cu_seqlens_q) == len(cu_seqlens_k) - else: - assert q.dim() == 4 - batch, nheads_q, seqlen_q, head_size = q.shape - _, nheads_k, seqlen_k, _ = k.shape - assert max_seqlens > 0 - assert k.shape == v.shape - assert q.shape[-1] == k.shape[-1] and q.shape[-1] == v.shape[-1] - # TODO: Change assert if we support qkl f8 and v f16 - assert q.dtype == k.dtype and q.dtype == v.dtype - # TODO: Fix assert to check head size <=256 once supported - assert head_size <= 128 - assert o.shape == q.shape - assert (nheads_q % nheads_k) == 0 - - -class _attention(torch.autograd.Function): - - @staticmethod - def forward( - ctx, - q, - k, - v, - o, - cu_seqlens_q, - cu_seqlens_k, - max_seqlens_q, - max_seqlens_k, - causal=False, - sm_scale=1.0, - bias=None, - ): - if o is None: - o = torch.empty_like(q, dtype=v.dtype) - - check_args( - q, - k, - v, - o, - varlen=True, - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_k=cu_seqlens_k, - ) - if True: # varlen - total_q, nheads_q, head_size = q.shape - total_k, nheads_k, _ = k.shape - batch = len(cu_seqlens_q) - 1 - q_strides = (0, q.stride(1), q.stride(0), q.stride(2)) - k_strides = (0, k.stride(1), k.stride(0), k.stride(2)) - v_strides = (0, v.stride(1), v.stride(0), v.stride(2)) - o_strides = (0, o.stride(1), o.stride(0), o.stride(2)) - else: - batch, seqlen_q, nheads_q, head_size = q.shape - _, seqlen_k, nheads_k, _ = k.shape - q_strides = (q.stride(0), q.stride(2), q.stride(1), q.stride(3)) - k_strides = (k.stride(0), k.stride(2), k.stride(1), k.stride(3)) - v_strides = (v.stride(0), v.stride(2), v.stride(1), v.stride(3)) - o_strides = (o.stride(0), o.stride(2), o.stride(1), o.stride(3)) - - # Get closest power of 2 over or equal to 32. - padded_d_model = 1 << (head_size - 1).bit_length() - padded_d_model = max(padded_d_model, 16) - - def grid(META): - return triton.cdiv(max_seqlens_q, META["BLOCK_M"]), nheads_q, batch - - encoded_softmax = None - - # Seed the RNG so we get reproducible results for testing. 
- philox_seed = 0x1BF52 - philox_offset = 0x1D4B42 - - if bias is not None: - bias_strides = ( - bias.stride(0), - bias.stride(1), - bias.stride(2), - bias.stride(3), - ) - else: - bias_strides = (0, 0, 0, 0) - - attn_fwd[grid]( - q, - k, - v, - bias, - sm_scale, - None, - o, - *q_strides, - *k_strides, - *v_strides, - *o_strides, - *bias_strides, - cu_seqlens_q, - cu_seqlens_k, - dropout_p=0.0, - philox_seed=philox_seed, - philox_offset_base=philox_offset, - encoded_softmax=encoded_softmax, - HQ=nheads_q, - HK=nheads_k, - ACTUAL_BLOCK_DMODEL=head_size, - MAX_SEQLENS_Q=max_seqlens_q, - MAX_SEQLENS_K=max_seqlens_k, - IS_CAUSAL=causal, - VARLEN=True, - BLOCK_DMODEL=padded_d_model, - BIAS_TYPE=0 if bias is None else 1, - ENABLE_DROPOUT=False, - RETURN_ENCODED_SOFTMAX=False, - ) - - ctx.grid = grid - ctx.sm_scale = sm_scale - ctx.BLOCK_DMODEL = head_size - ctx.causal = causal - ctx.dropout_p = 0.0 - ctx.philox_seed = philox_seed - ctx.philox_offset = philox_offset - ctx.encoded_softmax = encoded_softmax - ctx.return_encoded_softmax = False - return o, encoded_softmax - - -triton_attention = _attention.apply diff --git a/backends/gaudi/server/text_generation_server/layers/attention/flashinfer.py b/backends/gaudi/server/text_generation_server/layers/attention/flashinfer.py deleted file mode 100644 index d603c6f5f..000000000 --- a/backends/gaudi/server/text_generation_server/layers/attention/flashinfer.py +++ /dev/null @@ -1,251 +0,0 @@ -from typing import Optional -from contextvars import ContextVar -from contextlib import contextmanager - -import flashinfer -import torch - -prefill_state: ContextVar[flashinfer.BatchPrefillWithRaggedKVCacheWrapper] = ContextVar( - "prefill_state" -) - -prefill_with_paged_kv_state: ContextVar[ - flashinfer.BatchPrefillWithPagedKVCacheWrapper -] = ContextVar("prefill_with_paged_kv_state") - -decode_state: ContextVar[flashinfer.BatchDecodeWithPagedKVCacheWrapper] = ContextVar( - "decode_state" -) - -workspace: Optional[torch.Tensor] = None - - -def get_workspace(device): - """Get shared flashinfer workspace.""" - global workspace - if workspace is None: - workspace = torch.empty(128 * 1024 * 1024, dtype=torch.uint8, device=device) - return workspace - - -def create_prefill_with_paged_kv_state( - *, - device: torch.device, -): - """Create a prefill state that uses the KV cache.""" - workspace_buffer = get_workspace(device) - return flashinfer.BatchPrefillWithPagedKVCacheWrapper( - workspace_buffer, kv_layout="NHD", use_cuda_graph=False - ) - - -@contextmanager -def use_prefill_with_paged_kv_state( - *, - state: flashinfer.BatchPrefillWithPagedKVCacheWrapper, - block_tables: torch.Tensor, - cu_seqlens: torch.Tensor, - input_lengths: torch.Tensor, - num_heads: int, - num_kv_heads: int, - head_size: int, - page_size: int, - dtype: torch.dtype, - window_left: int, -): - """ - Context manager to set the active flashinfer prefill state to the given - `state` and parameters. This state will be used by all calls to the - `attention` function while the context manager is active. - """ - - indptr = torch.zeros( - input_lengths.shape[0] + 1, device=input_lengths.device, dtype=torch.int32 - ) - # Round up to page size and then calculate the cumulative sum to get - # the indices into the block table. - torch.add(input_lengths, page_size - 1, out=indptr[1:]) - indptr[1:].div_(page_size, rounding_mode="floor") - indptr[1:].cumsum_(-1) - - # Get the lengths of the last page in a block. 
- if page_size == 1: - last_page_len = torch.ones( - input_lengths.shape[0], dtype=torch.int32, device=input_lengths.device - ) - else: - last_page_len = torch.empty( - input_lengths.shape[0], dtype=torch.int32, device=input_lengths.device - ) - torch.sub(input_lengths, 1, out=last_page_len) - last_page_len.remainder_(page_size) - last_page_len += 1 - - token = prefill_with_paged_kv_state.set(state) - try: - state.begin_forward( - qo_indptr=cu_seqlens, - paged_kv_indptr=indptr, - paged_kv_indices=block_tables, - paged_kv_last_page_len=last_page_len, - num_qo_heads=num_heads, - num_kv_heads=num_kv_heads, - head_dim=head_size, - q_data_type=dtype, - page_size=page_size, - window_left=window_left, - ) - yield - finally: - state.end_forward() - if token is not None: - prefill_with_paged_kv_state.reset(token) - - -def create_prefill_state( - *, - device: torch.device, -): - """Create a prefill state.""" - workspace_buffer = get_workspace(device) - return flashinfer.BatchPrefillWithRaggedKVCacheWrapper( - workspace_buffer, kv_layout="NHD", use_cuda_graph=False - ) - - -@contextmanager -def use_prefill_state( - *, - state: flashinfer.BatchPrefillWithRaggedKVCacheWrapper, - cu_seqlens: torch.Tensor, - num_heads: int, - num_kv_heads: int, - head_size: int, - dtype: torch.dtype, - window_left: int, -): - """ - Context manager to set the active flashinfer prefill state to the given - `state` and parameters. This state will be used by all calls to the - `attention` function while the context manager is active. - """ - - token = prefill_state.set(state) - try: - state.begin_forward( - qo_indptr=cu_seqlens, - kv_indptr=cu_seqlens, - num_qo_heads=num_heads, - num_kv_heads=num_kv_heads, - head_dim=head_size, - q_data_type=dtype, - window_left=window_left, - ) - yield - finally: - state.end_forward() - if token is not None: - prefill_state.reset(token) - - -def create_decode_state( - *, - device: torch.device, - num_heads: int, - num_kv_heads: int, -): - """Create a decode state.""" - workspace_buffer = get_workspace(device) - num_groups = num_heads // num_kv_heads - return flashinfer.BatchDecodeWithPagedKVCacheWrapper( - workspace_buffer, - kv_layout="NHD", - use_cuda_graph=False, - # Taken from https://github.com/flashinfer-ai/flashinfer/blob/33ef95700981ba70f4cab63b8931e562bc795b21/python/flashinfer/decode.py#L57-L60 - use_tensor_cores=num_groups not in [1, 2, 4, 8], - ) - - -def create_decode_state_cuda_graphs( - *, - device: torch.device, - block_tables: torch.Tensor, - block_tables_ptr: torch.Tensor, - last_page_len: torch.Tensor, - num_heads: int, - num_kv_heads: int, -): - """ - Create a decode state for use with CUDA Graphs. `block_tables`, - `block_tables_ptr`, and `last_page_len` are used in CUDA Graphs and are - therefore stored as part of the state. 
- """ - workspace_buffer = get_workspace(device) - num_groups = num_heads // num_kv_heads - return flashinfer.BatchDecodeWithPagedKVCacheWrapper( - workspace_buffer, - kv_layout="NHD", - use_cuda_graph=True, - paged_kv_indices_buffer=block_tables, - paged_kv_indptr_buffer=block_tables_ptr, - paged_kv_last_page_len_buffer=last_page_len, - # Taken from https://github.com/flashinfer-ai/flashinfer/blob/33ef95700981ba70f4cab63b8931e562bc795b21/python/flashinfer/decode.py#L57-L60 - use_tensor_cores=num_groups not in [1, 2, 4, 8], - ) - - -@contextmanager -def use_decode_state( - *, - state: flashinfer.BatchDecodeWithPagedKVCacheWrapper, - input_lengths: torch.Tensor, - block_tables: torch.Tensor, - num_heads: int, - num_kv_heads: int, - head_size: int, - page_size: int, - dtype: torch.dtype, - window_left: int, -): - """ - Context manager to set the active flashinfer decoding state to the given - `state` and parameters. This state will be used by all calls to the - `paged_attention` function while the context manager is active. - """ - indptr = torch.zeros( - input_lengths.shape[0] + 1, device=input_lengths.device, dtype=torch.int32 - ) - # Round up to page size and then calculate the cumulative sum to get - # the indices into the block table. - torch.add(input_lengths, page_size - 1, out=indptr[1:]) - indptr[1:].div_(page_size, rounding_mode="floor") - indptr[1:].cumsum_(-1) - - # Get the lengths of the last page in a block. - last_page_len = torch.empty( - input_lengths.shape[0], dtype=torch.int32, device=input_lengths.device - ) - torch.sub(input_lengths, 1, out=last_page_len) - last_page_len.remainder_(page_size) - last_page_len += 1 - - token = decode_state.set(state) - - try: - state.begin_forward( - indptr=indptr, - indices=block_tables, - last_page_len=last_page_len, - num_qo_heads=num_heads, - num_kv_heads=num_kv_heads, - head_dim=head_size, - page_size=page_size, - data_type=dtype, - q_data_type=dtype, - window_left=window_left, - ) - yield - finally: - state.end_forward() - if token is not None: - decode_state.reset(token) diff --git a/backends/gaudi/server/text_generation_server/layers/attention/hpu.py b/backends/gaudi/server/text_generation_server/layers/attention/hpu.py new file mode 100644 index 000000000..f34e93abc --- /dev/null +++ b/backends/gaudi/server/text_generation_server/layers/attention/hpu.py @@ -0,0 +1,95 @@ +import torch +from text_generation_server.layers.attention import Seqlen, HPUPagedAttentionMetadata +from typing import Optional +from text_generation_server.layers.attention.kv_cache import KVCache, KVScales +from vllm_hpu_extension import ops +from vllm_hpu_extension.utils import Matmul +from habana_frameworks.torch.hpex.kernels import FusedSDPA +from vllm_hpu_extension.utils import ModuleFusedSDPA +import os + +SUPPORTS_WINDOWING = False + + +def fetch_from_cache(cache, blocks): + if os.environ.get("VLLM_CONTIGUOUS_PA", "true").lower() == "true": + return cache[: blocks.size(0)] + else: + return cache.index_select(0, blocks) + + +def attention( + *, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + kv_cache: KVCache, + kv_scales: KVScales, + seqlen: Seqlen, + softmax_scale: float, + window_size_left: int = -1, + causal: bool = True, + softcap: Optional[float] = None, +): + fsdpa_op = ModuleFusedSDPA(FusedSDPA) + bs = seqlen.input_lengths.shape[0] + _, head_num, head_size = query.shape + _, kv_head_num, head_size = key.shape + query = query.view(bs, -1, head_num, head_size).transpose(1, 2) + key = key.view(bs, -1, kv_head_num, 
head_size).transpose(1, 2) + value = value.view(bs, -1, kv_head_num, head_size).transpose(1, 2) + attn_output = fsdpa_op( + query, + key, + value, + attn_mask=None, + dropout_p=0.0, + is_causal=causal, + scale=softmax_scale, + softmax_mode="None", + recompute_mode=None, + valid_sequence_lengths=seqlen.input_lengths, + padding_side="left", + ) + attn_output = attn_output.transpose(1, 2).squeeze(0).contiguous() + return attn_output + + +def paged_attention( + query: torch.Tensor, + kv_cache: KVCache, + kv_head_mapping: torch.Tensor, + softmax_scale: float, + seqlen: Seqlen, + *, + kv_scales: KVScales, + softcap: Optional[float] = None, + hpu_attention_meta: HPUPagedAttentionMetadata, +): + batch_size, head_num, head_size = query.shape + output = ops.flat_pa( + query=query.view(batch_size, 1, head_num * head_size), + key_cache=kv_cache.key, + value_cache=kv_cache.value, + block_list=hpu_attention_meta.block_list, + block_mapping=hpu_attention_meta.block_mapping, + block_bias=hpu_attention_meta.attn_bias, + block_scales=hpu_attention_meta.block_scales, + block_groups=hpu_attention_meta.block_groups, + scale=softmax_scale, + matmul_qk_op=Matmul(), + matmul_av_op=Matmul(), + batch2block_matmul_op=Matmul(), + block2batch_matmul_op=Matmul(), + keys_fetch_func=fetch_from_cache, + values_fetch_func=fetch_from_cache, + ) + # Reshape the output tensor. + return output.view(batch_size, head_num, head_size) + + +__all__ = [ + "SUPPORTS_WINDOWING", + "attention", + "paged_attention", +] diff --git a/backends/gaudi/server/text_generation_server/layers/attention/ipex.py b/backends/gaudi/server/text_generation_server/layers/attention/ipex.py deleted file mode 100644 index 657c90af4..000000000 --- a/backends/gaudi/server/text_generation_server/layers/attention/ipex.py +++ /dev/null @@ -1,82 +0,0 @@ -import intel_extension_for_pytorch as ipex -import torch -from text_generation_server.models.flash_causal_lm import BLOCK_SIZE -from text_generation_server.layers.attention import Seqlen -from typing import Optional - -SUPPORTS_WINDOWING = False -PREFILL_IN_KV_CACHE = False - - -def attention( - q: torch.Tensor, - key_cache: torch.Tensor, - value_cache: torch.Tensor, - seqlen: Seqlen, - block_tables: torch.Tensor, - softmax_scale, - window_size_left=-1, - causal=True, - softcap: Optional[float] = None, -): - out = torch.empty_like(q) - - # We do not need to check window_size_left (not supported) here, so it is already checked ahead of time at model load. 
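As a shape sanity check for the new HPU `attention` entry point above, this small sketch uses plain PyTorch (no Gaudi kernels are called, and the batch size, sequence length and head counts are made-up values) to show the flattened (tokens, heads, head_size) layout being reshaped into the (batch, heads, seq, head_size) layout that FusedSDPA consumes:

import torch

bs, seq, head_num, kv_head_num, head_size = 2, 4, 8, 2, 64

# Flattened layout handed over by the model code: (total tokens, heads, head_size).
query = torch.randn(bs * seq, head_num, head_size)
key = torch.randn(bs * seq, kv_head_num, head_size)

# Same view/transpose as in hpu.py's attention() before calling the fused kernel.
q = query.view(bs, -1, head_num, head_size).transpose(1, 2)
k = key.view(bs, -1, kv_head_num, head_size).transpose(1, 2)

print(q.shape)  # torch.Size([2, 8, 4, 64]) -> (batch, heads, seq, head_size)
print(k.shape)  # torch.Size([2, 2, 4, 64])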
- ipex.llm.functional.varlen_attention( - q.contiguous() if q.device.type == "xpu" else q, - key_cache.contiguous() if key_cache.device.type == "xpu" else key_cache, - value_cache.contiguous() if value_cache.device.type == "xpu" else value_cache, - out, - seqlen.cu_seqlen_q, - seqlen.cu_seqlen_q, - seqlen.max_q, - seqlen.max_q, - 0.0, - softmax_scale, - False, - causal, - False, - None, - ) - - return out - - -def reshape_and_cache( - key: torch.Tensor, - value: torch.Tensor, - key_cache: torch.Tensor, - value_cache: torch.Tensor, - slots: torch.Tensor, -): - ipex.llm.modules.PagedAttention.reshape_and_cache( - key, value, key_cache, value_cache, slots - ) - - -def paged_attention( - query: torch.Tensor, - key_cache: torch.Tensor, - value_cache: torch.Tensor, - kv_head_mapping: torch.Tensor, - softmax_scale: float, - block_tables: torch.Tensor, - seqlen: Seqlen, - max_s: int, - softcap: Optional[float] = None, -): - out = torch.empty_like(query) - ipex.llm.modules.PagedAttention.single_query_cached_kv_attention( - out, - query, - key_cache, - value_cache, - kv_head_mapping, - softmax_scale, - block_tables, - seqlen.input_lengths, - BLOCK_SIZE, - max_s, - None, - ) - return out diff --git a/backends/gaudi/server/text_generation_server/layers/attention/kv_cache.py b/backends/gaudi/server/text_generation_server/layers/attention/kv_cache.py new file mode 100644 index 000000000..d238cdb97 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/layers/attention/kv_cache.py @@ -0,0 +1,139 @@ +from typing import Tuple +from dataclasses import dataclass, field + +import torch + +from text_generation_server.models.globals import BLOCK_SIZE +from text_generation_server.utils.weights import Weights +from vllm_hpu_extension import cache_ops + + +@dataclass +class KVScales: + """ + Key-value scales for FP8 KV cache. + + This data class stores key and value scales both as a GPU tensor and + as a GPU float. This inconvenience is necessary because some functions + (e.g. scaling kernels) take scales as a GPU tensor, whereas others + (e.g. flashinfer) take scales as a CPU scalar. + """ + + key_scale: torch.Tensor + value_scale: torch.Tensor + key_scale_cpu: float = field(init=False) + value_scale_cpu: float = field(init=False) + + def __post_init__(self): + if self.key_scale.numel() != 1 or self.value_scale.numel() != 1: + raise ValueError("Key and value scales must be scalar tensors.") + + self.key_scale_cpu = self.key_scale.item() + self.value_scale_cpu = self.value_scale.item() + + +class KVCache: + """ + Key-value cache for attention layers. 
+ """ + + kv_cache: Tuple[torch.Tensor, torch.Tensor] + + def __init__( + self, + *, + num_blocks: int, + num_heads: int, + head_size: int, + dtype: torch.dtype, + device: torch.device, + ): + """Construct the key-value cache for a layer.""" + ## TODO FP8 kv cache support + + self.kv_cache = ( + torch.zeros( + (num_blocks, BLOCK_SIZE, num_heads, head_size), + dtype=dtype, + device=device, + ), + torch.zeros( + (num_blocks, BLOCK_SIZE, num_heads, head_size), + dtype=dtype, + device=device, + ), + ) + + @property + def dtype(self): + """Get the data type of the cache.""" + return self.kv_cache[0].dtype + + @property + def key(self): + """Get the key cache.""" + + return self.kv_cache[0] + + @property + def value(self): + """Get the value cache.""" + + return self.kv_cache[1] + + def store( + self, + *, + key: torch.Tensor, + value: torch.Tensor, + slots: torch.Tensor, + kv_scales: KVScales, + ): + """Store the key and value at the given slots.""" + ## TODO FP8 kv cache support + + key_cache = self.kv_cache[0] + value_cache = self.kv_cache[1] + + paged_reshape_and_cache( + key, + value, + key_cache, + value_cache, + slots, + kv_scales.key_scale_cpu, + kv_scales.value_scale_cpu, + ) + + +def paged_reshape_and_cache( + key: torch.Tensor, + value: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + slots: torch.Tensor, + k_scale: float = 1.0, + v_scale: float = 1.0, +): + block_idx = slots // BLOCK_SIZE + block_offset = slots % BLOCK_SIZE + cache_ops.insert_or_update_cache(key, key_cache, block_idx, block_offset) + cache_ops.insert_or_update_cache(value, value_cache, block_idx, block_offset) + + +def get_kv_scales(weights: Weights, prefix: str) -> KVScales: + """Load KV cache scales.""" + + key_scale = torch.tensor(1.0, dtype=torch.float32, device=weights.device) + value_scale = key_scale + if weights.has_tensor(f"{prefix}.k_scale") and weights.has_tensor( + f"{prefix}.v_scale" + ): + key_scale = weights.get_tensor(f"{prefix}.k_scale", to_dtype=False).float() + value_scale = weights.get_tensor(f"{prefix}.v_scale", to_dtype=False).float() + elif weights.has_tensor(f"{prefix}.kv_scale"): + # Fall back to older more coarse-grained scale when available. 
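+        # The single fused `kv_scale` tensor applies one scale to both keys and values.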
+ key_scale = weights.get_tensor(f"{prefix}.kv_scale").float() + value_scale = key_scale + + return KVScales(key_scale=key_scale, value_scale=value_scale) diff --git a/backends/gaudi/server/text_generation_server/layers/attention/rocm.py b/backends/gaudi/server/text_generation_server/layers/attention/rocm.py deleted file mode 100644 index 646a763d3..000000000 --- a/backends/gaudi/server/text_generation_server/layers/attention/rocm.py +++ /dev/null @@ -1,308 +0,0 @@ -import os -from typing import Optional -import torch -from text_generation_server.utils.import_utils import SYSTEM -from text_generation_server.models.globals import ATTENTION -from text_generation_server.layers.attention import Seqlen -from text_generation_server.utils.log import log_master -from loguru import logger - -major, minor = torch.cuda.get_device_capability() -is_sm75 = major == 7 and minor == 5 - -_PARTITION_SIZE_V1V2 = 512 -_PARTITION_SIZE_CUSTOM = 256 - -use_triton = os.getenv("ROCM_USE_FLASH_ATTN_V2_TRITON", "").lower() in {"true", "1"} -ENGINE = "triton" if use_triton else "ck" - -PREFILL_IN_KV_CACHE = False - -use_rocm_custom_paged_attn = os.getenv("ROCM_USE_CUSTOM_PAGED_ATTN", "1") != "0" -try: - if use_rocm_custom_paged_attn: - from vllm._custom_C import paged_attention_custom -except ImportError as e: - log_master( - logger.info, - f"Custom Paged Attention not available. Complete error: {e}", - ) - use_rocm_custom_paged_attn = False - -try: - import vllm._custom_ops as ops -except Exception as e: - raise ImportError( - f"Could not import vllm paged attention. Make sure your installation is correct. Complete error: {e}" - ) - - -def reshape_and_cache( - key: torch.Tensor, - value: torch.Tensor, - key_cache: torch.Tensor, - value_cache: torch.Tensor, - slots: torch.Tensor, -): - if ATTENTION == "flashdecoding": - shape = key_cache.shape - key_cache.view(-1, shape[-2], shape[-1])[slots] = key - value_cache.view(-1, shape[-2], shape[-1])[slots] = value - else: - ops.reshape_and_cache(key, value, key_cache, value_cache, slots, "auto", 1.0) - - -def paged_attention( - query: torch.Tensor, - key_cache: torch.Tensor, - value_cache: torch.Tensor, - kv_head_mapping: torch.Tensor, - softmax_scale: float, - block_tables: torch.Tensor, - seqlen: Seqlen, - max_s: int, - softcap: Optional[float] = None, -): - # Adapted from: https://github.com/vllm-project/vllm/blob/f8a1e39fae05ca610be8d5a78be9d40f5274e5fc/vllm/model_executor/layers/attention.py - # Copyright 2023 The vLLM team. All rights - # reserved. - # - # Licensed under the Apache License, Version 2.0 (the "License"); - # you may not use this file except in compliance with the License. - # You may obtain a copy of the License at - # - # http://www.apache.org/licenses/LICENSE-2.0 - # - # Unless required by applicable law or agreed to in writing, software - # distributed under the License is distributed on an "AS IS" BASIS, - # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - # See the License for the specific language governing permissions and - # limitations under the License. 
- # - - if softcap is not None: - raise RuntimeError("Paged attention doesn't support softcapping") - - # value_cache => [num_blocks, num_heads, head_size, block_size] - block_size = value_cache.shape[3] - num_seqs, num_heads, head_size = query.shape - - num_kv_heads = key_cache.shape[1] - gqa_ratio = num_heads // num_kv_heads - use_custom = ( - use_rocm_custom_paged_attn - and (query.dtype == torch.half or query.dtype == torch.bfloat16) - and (head_size == 128 or head_size == 64) - and (block_size == 16 or block_size == 32) - and (gqa_ratio >= 1 and gqa_ratio <= 16) - and max_s <= 32768 - ) - - if not use_custom: - _PARTITION_SIZE = _PARTITION_SIZE_V1V2 - else: - _PARTITION_SIZE = _PARTITION_SIZE_CUSTOM - - max_num_partitions = (max_s + _PARTITION_SIZE - 1) // _PARTITION_SIZE - input_lengths = seqlen.input_lengths - - out = torch.empty_like(query) - - # NOTE(woosuk): We use a simple heuristic to decide whether to use - # PagedAttention V1 or V2. If the number of partitions is 1, we use - # V1 to avoid the overhead of reduction. Also, if the number of - # sequences or heads is large, we use V1 since there is enough work - # to parallelize. - import vllm._custom_ops as ops - - use_v1 = ( - max_s <= 8192 - and (max_num_partitions == 1 or num_seqs * num_heads > 512) - and not use_custom - ) - if use_v1: - ops.paged_attention_v1( - out, - query, - key_cache, - value_cache, - kv_head_mapping, - softmax_scale, - block_tables, - input_lengths, - block_size, - max_s, - None, - "auto", - 1.0, - ) - else: - # Run PagedAttention V2. - assert _PARTITION_SIZE % block_size == 0 - tmp_output = torch.empty( - size=(num_seqs, num_heads, max_num_partitions, head_size), - dtype=out.dtype, - device=out.device, - ) - exp_sums = torch.empty( - size=(num_seqs, num_heads, max_num_partitions), - dtype=torch.float32, - device=out.device, - ) - max_logits = torch.empty_like(exp_sums) - - if not use_custom: - ops.paged_attention_v2( - out, - exp_sums, - max_logits, - tmp_output, - query, - key_cache, - value_cache, - kv_head_mapping, - softmax_scale, - block_tables, - input_lengths, - block_size, - max_s, - None, - "auto", - 1.0, - ) - else: - paged_attention_custom( - out, - exp_sums, - max_logits, - tmp_output, - query, - key_cache, - value_cache, - num_kv_heads, - softmax_scale, - block_tables, - input_lengths, - block_size, - max_s, - None, - "auto", - ) - - return out - - -if ENGINE != "triton": - try: - import flash_attn_2_cuda - - log_master( - logger.info, - "ROCm: using Flash Attention 2 Composable Kernel implementation.", - ) - except ImportError as e: - if major >= 8: - architecture_suffix = f"-{SYSTEM}" - raise ImportError( - "Flash Attention V2 is not installed.\n" - "Use the official Docker image (ghcr.io/huggingface/text-generation-inference:latest) " - f"or install flash attention v2 with `cd server && make install install-flash-attention-v2{architecture_suffix}`" - ) - elif is_sm75: - raise ImportError( - "Flash Attention is not installed.\n" - "Use the official Docker image (ghcr.io/huggingface/text-generation-inference:latest) " - "or install flash attention with `cd server && make install install-flash-attention`" - ) from e - else: - for idx in range(torch.cuda.device_count()): - name = torch.cuda.get_device_name(idx) - if "MI210" not in name and "MI250" not in name: - raise ImportError( - f"AMD GPU {torch.cuda.get_device_name(idx)} does not support flash-attention" - ) - raise ImportError( - f"AMD GPU with ROCm capability {major} {minor} is not supported" - ) from e - - -SUPPORTS_WINDOWING = 
False -if ENGINE == "ck": - - def attention( - q, - key_cache: torch.Tensor, - value_cache: torch.Tensor, - seqlen: Seqlen, - block_tables: torch.Tensor, - softmax_scale: float, - window_size_left: int = -1, - causal: bool = True, - softcap: float = 0.0, - ): - if window_size_left <= 0 and window_size_left != -1: - raise ValueError("`window_size_left` must be > 0 or -1") - - out = torch.empty_like(q) - - # We do not need to check window_size_left (not supported) here, so it is already checked ahead of time at model load. - return flash_attn_2_cuda.varlen_fwd( - q, - key_cache, - value_cache, - out, - seqlen.cu_seqlen_q, - seqlen.cu_seqlen_q, - None, - None, - None, - None, - seqlen.max_q, - seqlen.max_k, - 0.0, - softmax_scale, - False, - causal, - window_size_left, - 0, - softcap, - False, - None, - )[0] - -elif ENGINE == "triton": - from .flash_attn_triton import triton_attention - - def attention( - q, - key_cache: torch.Tensor, - value_cache: torch.Tensor, - seqlen: Seqlen, - block_tables: torch.Tensor, - softmax_scale: float, - window_size_left: int = -1, - causal: bool = True, - softcap: Optional[float] = None, - ): - if softcap is not None: - raise NotImplementedError("softcap is only available with CK flash attn") - - out = torch.empty_like(q) - - # We do not need to check window_size_left (not supported) here, so it is already checked ahead of time at model load. - output, _ = triton_attention( - q, - key_cache, - value_cache, - out, - seqlen.cu_seqlen_q, - seqlen.cu_seqlen_q, - seqlen.max_q, - seqlen.max_k, - causal, - softmax_scale, - ) - return output - -else: - raise RuntimeError(f"Unknown attention engine {ENGINE}") diff --git a/backends/gaudi/server/text_generation_server/layers/awq/quantize/__init__.py b/backends/gaudi/server/text_generation_server/layers/awq/quantize/__init__.py new file mode 100644 index 000000000..856d7c281 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/layers/awq/quantize/__init__.py @@ -0,0 +1,3 @@ +from .hpu import WQLinear + +__all__ = ["WQLinear"] diff --git a/backends/gaudi/server/text_generation_server/layers/awq/quantize/hpu.py b/backends/gaudi/server/text_generation_server/layers/awq/quantize/hpu.py new file mode 100644 index 000000000..3af0131b3 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/layers/awq/quantize/hpu.py @@ -0,0 +1,134 @@ +from typing import Optional +import torch +import torch.nn as nn + +try: + import habana_frameworks.torch.hpu # noqa: F401 + + convert_from_uint4 = torch.ops.hpu.convert_from_uint4 +except Exception as e: + hpu_import_exception = e + + def error_raiser_hpu(*args, **kwargs): + raise ValueError( + f"Trying to use HPU, but could not import the HPU framework with the following error: {hpu_import_exception}" + ) + + convert_from_uint4 = error_raiser_hpu + +AWQ_REVERSE_ORDER = [0, 4, 1, 5, 2, 6, 3, 7] + + +def unpack_awq(qweight: torch.Tensor, qzeros: torch.Tensor, bits: int): + shifts = torch.arange(0, 32, bits, device=qzeros.device) + + # unpacking columnwise + iweights = torch.bitwise_right_shift(qweight[:, :, None], shifts[None, None, :]).to( + torch.int8 # smallest dtype available + ) + iweights = iweights.view(iweights.shape[0], -1) + + # unpacking columnwise + if qzeros is not None: + izeros = torch.bitwise_right_shift( + qzeros[:, :, None], shifts[None, None, :] + ).to( + torch.int8 # smallest dtype available + ) + izeros = izeros.view(izeros.shape[0], -1) + else: + izeros = qzeros + + return iweights, izeros + + +def reverse_awq_order(iweights: torch.Tensor, izeros: 
torch.Tensor, bits: int): + reverse_order_tensor = torch.arange( + iweights.shape[-1], + dtype=torch.int32, + device=izeros.device, + ) + reverse_order_tensor = reverse_order_tensor.view(-1, 32 // bits) + reverse_order_tensor = reverse_order_tensor[:, AWQ_REVERSE_ORDER] + reverse_order_tensor = reverse_order_tensor.view(-1) + + if izeros is not None: + izeros = izeros[:, reverse_order_tensor] + iweights = iweights[:, reverse_order_tensor] + + return iweights, izeros + + +def unpack_weight_and_zeros(qweight, qzeros, bits): + # Unpack the qweight and qzeros tensors + iweight, izeros = unpack_awq(qweight, qzeros, bits) + # Reverse the order of the iweight and izeros tensors + iweight, izeros = reverse_awq_order(iweight, izeros, bits) + + # overflow checks + iweight = torch.bitwise_and(iweight, (2**bits) - 1) + izeros = torch.bitwise_and(izeros, (2**bits) - 1) + + return iweight, izeros + + +def pack_tensor(input, bits=4): + normal = input.to(torch.int32) + q = torch.zeros( + (normal.shape[0], normal.shape[1] // 32 * bits), + dtype=torch.int32, + device=input.device, + ) + i = 0 + col = 0 + while col < q.shape[1]: + for j in range(i, i + (32 // bits)): + q[:, col] |= normal[:, j] << (bits * (j - i)) + i += 32 // bits + col += 1 + q = q.to(torch.int32) + return q + + +class WQLinear(nn.Module): + def __init__( + self, w_bit, group_size, qweight, qzeros, scales, bias: Optional[torch.Tensor] + ): + super().__init__() + + if w_bit not in [4]: + raise NotImplementedError("Only 4-bit are supported for now.") + + self.in_features = qweight.shape[0] + self.out_features = qweight.shape[1] * 32 // w_bit + + self.w_bit = w_bit + self.group_size = group_size if group_size != -1 else self.in_features + # quick sanity check (make sure aligment) + assert self.in_features % self.group_size == 0 + assert self.out_features % (32 // self.w_bit) == 0 + + self.qweight = qweight + self.qzeros = qzeros + self.scales = scales + self.bias = bias + self._preprocessing() + + def _preprocessing(self): + device = self.qweight.device + weight, zeros = unpack_weight_and_zeros( + self.qweight.cpu(), self.qzeros.cpu(), self.w_bit + ) + self.qweight = pack_tensor(weight).to(device) + self.qzeros = pack_tensor(zeros).to(device) + + @torch.no_grad() + def forward(self, x): + out_shape = x.shape[:-1] + (self.out_features,) + x = x.reshape(-1, x.shape[-1]) + weights = convert_from_uint4(self.qweight, self.scales, self.qzeros, x.dtype) + outputs = torch.matmul(x, weights) + + outputs = outputs + self.bias if self.bias is not None else outputs + outputs = outputs.reshape(out_shape) + return outputs diff --git a/backends/gaudi/server/text_generation_server/layers/awq/quantize/qmodule.py b/backends/gaudi/server/text_generation_server/layers/awq/quantize/qmodule.py deleted file mode 100644 index 391371a55..000000000 --- a/backends/gaudi/server/text_generation_server/layers/awq/quantize/qmodule.py +++ /dev/null @@ -1,49 +0,0 @@ -# Copied logic from https://github.com/mit-han-lab/llm-awq/blob/f084f40bd996f3cf3a0633c1ad7d9d476c318aaa/awq/quantize/qmodule.py - -from typing import Optional -import torch -import torch.nn as nn -import awq_inference_engine # with CUDA kernels - - -# class ScaledActivation(nn.Module): -# def __init__(self, module, scales): -# super().__init__() -# self.act = module -# self.scales = nn.Parameter(scales.data) -# -# def forward(self, x): -# return self.act(x) / self.scales.view(1, 1, -1).to(x.device) - - -class WQLinear(nn.Module): - def __init__( - self, w_bit, group_size, qweight, qzeros, scales, bias: 
Optional[torch.Tensor] - ): - super().__init__() - - if w_bit not in [4]: - raise NotImplementedError("Only 4-bit are supported for now.") - - self.in_features = qweight.shape[0] - self.out_features = qweight.shape[1] * 32 // w_bit - - self.w_bit = w_bit - self.group_size = group_size if group_size != -1 else self.in_features - # quick sanity check (make sure aligment) - assert self.in_features % self.group_size == 0 - assert self.out_features % (32 // self.w_bit) == 0 - - self.qweight = qweight - self.qzeros = qzeros - self.scales = scales - self.bias = bias - - @torch.no_grad() - def forward(self, x): - out_shape = x.shape[:-1] + (self.out_features,) - out = awq_inference_engine.gemm_forward_cuda( - x.reshape(-1, x.shape[-1]), self.qweight, self.scales, self.qzeros, 8 - ) - out = out + self.bias if self.bias is not None else out - return out.reshape(out_shape) diff --git a/backends/gaudi/server/text_generation_server/layers/eetq.py b/backends/gaudi/server/text_generation_server/layers/eetq.py deleted file mode 100644 index b1e5235a0..000000000 --- a/backends/gaudi/server/text_generation_server/layers/eetq.py +++ /dev/null @@ -1,43 +0,0 @@ -from dataclasses import dataclass - -import torch -from EETQ import quant_weights, w8_a16_gemm -from text_generation_server.utils.weights import UnquantizedWeight - - -@dataclass -class EETQWeight(UnquantizedWeight): - weight: torch.Tensor - - def get_linear(self, bias: torch.Tensor): - try: - from text_generation_server.layers.eetq import EETQLinear - - return EETQLinear(self.weight, bias) - except ImportError: - raise ImportError( - "Please install EETQ from https://github.com/NetEase-FuXi/EETQ" - ) - - -class EETQLinear(torch.nn.Module): - def __init__( - self, - weight, - bias, - ) -> None: - super().__init__() - device = weight.device - if weight.dtype != torch.float16: - weight = weight.to(dtype=torch.float16) - weight = torch.t(weight).contiguous().cpu() - weight, scale = quant_weights(weight, torch.int8, False) - - self.weight = weight.cuda(device) - self.scale = scale.cuda(device) - self.bias = bias.cuda(device) if bias is not None else None - - def forward(self, input: torch.Tensor) -> torch.Tensor: - output = w8_a16_gemm(input, self.weight, self.scale) - output = output + self.bias if self.bias is not None else output - return output diff --git a/backends/gaudi/server/text_generation_server/layers/fp8.py b/backends/gaudi/server/text_generation_server/layers/fp8.py index 61dd51151..0dc5cdafd 100644 --- a/backends/gaudi/server/text_generation_server/layers/fp8.py +++ b/backends/gaudi/server/text_generation_server/layers/fp8.py @@ -1,100 +1,152 @@ +from dataclasses import dataclass +from typing import Optional, Tuple, Type, Union, List + import torch -from dataclasses import dataclass -from typing import Optional, Union, List -from loguru import logger - -from text_generation_server.utils.import_utils import SYSTEM from text_generation_server.utils.weights import ( Weight, WeightsLoader, UnquantizedWeight, Weights, ) -from text_generation_server.utils.log import log_master, log_once -import importlib.util + +from vllm_hpu_extension.ops import scaled_fp8_quant +from vllm_hpu_extension.scales import get_hpu_gaudi2_scale_factor, is_hpu_gaudi2 +import habana_frameworks.torch.utils.experimental as htexp + +w8a8_block_fp8_matmul = None +per_token_group_quant_fp8 = None +quant_dtype: torch.dtype = torch.float8_e4m3fn -FBGEMM_MM_AVAILABLE = False -FBGEMM_DYN_AVAILABLE = False - - -def is_fbgemm_gpu_available(): - try: - return 
importlib.util.find_spec("fbgemm_gpu.experimental.gen_ai") is not None - except ModuleNotFoundError: - return False - - -if is_fbgemm_gpu_available(): - if SYSTEM == "cuda": - major, _ = torch.cuda.get_device_capability() - FBGEMM_MM_AVAILABLE = major == 9 - FBGEMM_DYN_AVAILABLE = major >= 8 -else: - log_master(logger.warning, "FBGEMM fp8 kernels are not installed.") - - -def get_fp8_linear() -> torch.nn.Module: +def get_fp8_linear(force_w8a16: bool = False) -> Type[torch.nn.Module]: """ Return an FP8 linear `Module` that is compatible with the current system. """ - - if SYSTEM == "cuda": - major, _ = torch.cuda.get_device_capability() - if major == 8: - from text_generation_server.layers.marlin import GPTQMarlinFP8Linear - - return GPTQMarlinFP8Linear - # On other systems let Torch decide if the hardware supports FP8. return Fp8Linear -def fp8_quantize( - weight, scale_upper_bound=None, qdtype=torch.float8_e4m3fn, scalar=False -): - if FBGEMM_DYN_AVAILABLE and not scalar: - qweight, scale = torch.ops.fbgemm.quantize_fp8_per_row( - weight, bs=None, scale_ub=scale_upper_bound, output_dtype=qdtype - ) - return qweight, scale +def normalize_e4m3fn_to_native_float8( + weight: torch.Tensor, + weight_scale: torch.Tensor, + input_scale: Optional[torch.Tensor] = None, +) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: + return weight, weight_scale, input_scale - # weight, scale = quant_weights(weight, torch.int8, False) - finfo = torch.finfo(qdtype) - # Calculate the scale as dtype max divided by absmax - scale = finfo.max / weight.abs().max().clamp(min=1e-12, max=scale_upper_bound) - # scale and clamp the tensor to bring it to - # the representative range of float8 data type - # (as default cast is unsaturated) - qweight = (weight * scale).clamp(min=finfo.min, max=finfo.max) - # Return both float8 data and the inverse scale (as float), - # as both required as inputs to torch._scaled_mm - qweight = qweight.to(qdtype) - scale = scale.float().reciprocal() - return qweight, scale + +def per_tensor_dequantize( + tensor: torch.Tensor, + inv_scale: Union[float, torch.Tensor], + dtype: torch.dtype = torch.float16, +) -> torch.Tensor: + device = tensor.device + dtype = torch.bfloat16 + if htexp._get_device_type() == htexp.synDeviceType.synDeviceGaudi2: + # dequant on cpu to avoid nan on gaudi2 + tensor = tensor.to("cpu") + + fake_qweight = tensor.to(dtype).to(device) + dq_weight = fake_qweight * inv_scale + return dq_weight + + +def requantize_with_max_scale( + weight: torch.Tensor, + weight_scale: torch.Tensor, + logical_widths: int, + dtype: torch.dtype, +) -> Tuple[torch.Tensor, torch.Tensor]: + # Max scale to be used for requanitzation. + max_w_scale = weight_scale.max() + + if is_hpu_gaudi2(): + max_w_scale = max_w_scale * get_hpu_gaudi2_scale_factor() + + start = 0 + for idx, logical_width in enumerate(logical_widths): + end = start + logical_width + weight_dq = per_tensor_dequantize( + weight[start:end, :], weight_scale[idx], dtype + ) + weight[start:end, :], max_w_scale_normalized = fp8_quantize( + weight_dq, max_w_scale + ) + start = end + + return weight, max_w_scale_normalized + + +def fp8_quantize( + weight: torch.Tensor, + scale: Optional[torch.Tensor] = None, + scale_upper_bound: Optional[torch.Tensor] = None, + qdtype: torch.dtype = torch.float8_e4m3fn, + scalar: bool = False, +): + """ + This function returns a reciprocal of the scale, so that a tensor can be unscaled + by multiplying it with the returned scale. 
If a scale is given through the `scale` + argument, it must also be a reciprocal (so that scales from an FP8 checkpoint can + be used without modification). + """ + shape = weight.shape + qweight, scale = scaled_fp8_quant( + weight.reshape(-1, shape[-1]), + scale=scale, + scale_ub=scale_upper_bound, + # TODO: don't do this when we have to use the Torch kernel. + use_per_token_if_dynamic=not scalar, + ) + + return qweight.reshape(shape), scale class HybridFP8UnquantLoader(WeightsLoader): """Weight loader that loads FP8 and unquantized Torch tensors.""" - def __init__(self, activation_scale_ub: Optional[float], to_fp8: bool): + def __init__( + self, + activation_scale_ub: Optional[float], + to_fp8: bool, + weight_block_size: Optional[List[int]] = None, + ): self.activation_scale_ub = activation_scale_ub self.to_fp8 = to_fp8 + self.weight_block_size = weight_block_size def get_weights(self, weights: "Weights", prefix: str): w = weights.get_tensor(f"{prefix}.weight") if w.dtype == torch.float8_e4m3fn: + if self.weight_block_size is not None: + scale = weights.get_tensor(f"{prefix}.weight_scale_inv") + return Fp8Weight( + weight=w, + weight_scale=scale, + activation_scale_ub=self.activation_scale_ub, + dtype=weights.dtype, + weight_block_size=self.weight_block_size, + ) # FP8 branch - scale = ( - weights.get_tensor(f"{prefix}.weight_scale", to_dtype=False) - .reshape(-1) - .expand(w.shape[0]) + scale = weights.get_tensor(f"{prefix}.weight_scale", to_dtype=False) + + input_scale = None + if weights.has_tensor(f"{prefix}.input_scale"): + input_scale = ( + weights.get_tensor(f"{prefix}.input_scale", to_dtype=False) + .reshape(-1) + .max() + ) + logical_widths = [w.shape[0]] + w, scale = requantize_with_max_scale( + w, scale.unsqueeze(0), logical_widths, weights.dtype ) + return Fp8Weight( weight=w, weight_scale=scale, + input_scale=input_scale, activation_scale_ub=self.activation_scale_ub, dtype=weights.dtype, ) @@ -116,6 +168,7 @@ class HybridFP8UnquantLoader(WeightsLoader): if w.dtype == torch.float8_e4m3fn: # FP8 branch scale = weights.get_tensor(f"{prefix}.weight_scale", to_dtype=False) + if scale.numel() > 1: scale = weights.get_packed_sharded( f"{prefix}.weight_scale", @@ -123,11 +176,29 @@ class HybridFP8UnquantLoader(WeightsLoader): block_sizes=block_sizes, to_dtype=False, ) - scale = scale.reshape(-1).expand(w.shape[0]) + + input_scale = None + if weights.has_tensor(f"{prefix}.input_scale"): + input_scale = weights.get_tensor( + f"{prefix}.input_scale", to_dtype=False + ) + if input_scale.numel() > 1: + input_scale = weights.get_packed_sharded( + f"{prefix}.input_scale", + dim=0, + block_sizes=block_sizes, + to_dtype=False, + ) + input_scale = input_scale.reshape(-1).max() + logical_widths = [w.shape[0]] + w, scale = requantize_with_max_scale( + w, scale.unsqueeze(0), logical_widths, weights.dtype + ) return Fp8Weight( weight=w, weight_scale=scale, + input_scale=input_scale, activation_scale_ub=self.activation_scale_ub, dtype=weights.dtype, ) @@ -148,15 +219,48 @@ class HybridFP8UnquantLoader(WeightsLoader): # FP8 branch if w.dtype == torch.float8_e4m3fn: + if self.weight_block_size is not None: + scale = [ + weights.get_sharded(f"{p}.weight_scale_inv", dim=0, to_device=False) + for p in prefixes + ] + scale = torch.cat(scale, dim=dim) + scale = scale.to(weights.device) + return Fp8Weight( + weight=w, + weight_scale=scale, + activation_scale_ub=self.activation_scale_ub, + dtype=weights.dtype, + weight_block_size=self.weight_block_size, + ) + scale = [ _load_scalar_or_matrix_scale(weights, 
f"{p}.weight_scale", shape) for p, shape in zip(prefixes, shapes) ] scale = torch.cat(scale, dim=0).reshape(-1) + input_scale = [ + _load_scalar_or_matrix_scale(weights, f"{p}.input_scale", shape) + for p, shape in zip(prefixes, shapes) + if weights.has_tensor(f"{p}.input_scale") + ] + assert len(input_scale) == 0 or len(input_scale) == len(prefixes) + input_scale = ( + torch.cat(input_scale, dim=0).reshape(-1).max() + if len(input_scale) != 0 + else None + ) + + logical_widths = [x[0] for x in shapes] + w, scale = requantize_with_max_scale( + w, scale.to(weights.device), logical_widths, weights.dtype + ) + return Fp8Weight( weight=w, weight_scale=scale, + input_scale=input_scale, activation_scale_ub=self.activation_scale_ub, dtype=weights.dtype, ) @@ -169,14 +273,35 @@ class HybridFP8UnquantLoader(WeightsLoader): w = weights.get_sharded(f"{prefix}.weight", dim=1) # FP8 branch if w.dtype == torch.float8_e4m3fn: - scale = ( - weights.get_tensor(f"{prefix}.weight_scale", to_dtype=False) - .reshape(-1) - .expand(w.shape[0]) + if self.weight_block_size is not None: + # XXX: Yes the weights is named scale_inv, but corresponds to scale it seems. + scale = weights.get_sharded(f"{prefix}.weight_scale_inv", dim=1) + + return Fp8Weight( + weight=w, + weight_scale=scale, + activation_scale_ub=self.activation_scale_ub, + dtype=weights.dtype, + weight_block_size=self.weight_block_size, + ) + + scale = weights.get_tensor(f"{prefix}.weight_scale", to_dtype=False) + + input_scale = None + if weights.has_tensor(f"{prefix}.input_scale"): + input_scale = ( + weights.get_tensor(f"{prefix}.input_scale", to_dtype=False) + .reshape(-1) + .max() + ) + logical_widths = [w.shape[0]] + w, scale = requantize_with_max_scale( + w, scale.unsqueeze(0), logical_widths, weights.dtype ) return Fp8Weight( weight=w, weight_scale=scale, + input_scale=input_scale, activation_scale_ub=self.activation_scale_ub, dtype=weights.dtype, ) @@ -191,83 +316,126 @@ class Fp8Weight(Weight): weight: torch.Tensor dtype: torch.dtype weight_scale: Optional[torch.Tensor] = None + input_scale: Optional[torch.Tensor] = None activation_scale_ub: Optional[float] = None + force_w8a16: bool = False + weight_block_size: Optional[List[int]] = None def get_linear(self, bias: torch.Tensor): if self.weight_scale is None: - return get_fp8_linear().from_unquant(self.weight, bias, self.dtype) + return get_fp8_linear(force_w8a16=self.force_w8a16).from_unquant( + self.weight, bias, self.dtype + ) # This is not checked by the fbgemm kernels, but they require contiguous # memory. Can be non-contiguous when we e.g. expand from scalars. 
self.weight_scale = self.weight_scale.contiguous() - return get_fp8_linear().from_fp8( - self.weight, self.weight_scale, self.activation_scale_ub, bias, self.dtype + return get_fp8_linear(force_w8a16=self.force_w8a16).from_fp8( + weight=self.weight, + scale=self.weight_scale, + dtype=self.dtype, + bias=bias, + input_scale=self.input_scale, + scale_upper_bound=self.activation_scale_ub, + weight_block_size=self.weight_block_size, ) class Fp8Linear(torch.nn.Module): + _device_identity_cache = {} + def __init__( self, - qweight, - scale, - scale_upper_bound, - bias, - dtype, + qweight: torch.Tensor, + scale: torch.Tensor, + dtype: torch.dtype, + bias: Optional[torch.Tensor] = None, + input_scale: Optional[torch.Tensor] = None, + scale_upper_bound: Optional[float] = None, + weight_block_size: Optional[List[int]] = None, ) -> None: super().__init__() - if FBGEMM_MM_AVAILABLE: - log_once(logger.info, "Using FBGEMM fp8 optimized kernels") self.dtype = dtype self.qweight = qweight - self.scale = scale - self.scale_upper_bound = ( - torch.tensor( - [scale_upper_bound], dtype=torch.float32, device=qweight.device - ) - if scale_upper_bound is not None - else None - ) + self.scale = scale.float() + self.input_scale = input_scale.float() if input_scale is not None else None + self.weight_block_size = weight_block_size + self.scale_upper_bound = scale_upper_bound self.bias = bias if bias is not None else None @classmethod def from_unquant(cls, weight, bias, dtype): - qweight, scale = fp8_quantize(weight, scalar=not FBGEMM_MM_AVAILABLE) + qweight, scale = fp8_quantize(weight, scalar=True) return cls( - qweight=qweight, scale=scale, scale_upper_bound=None, bias=bias, dtype=dtype + qweight=qweight, + scale=scale, + dtype=dtype, + bias=bias, + input_scale=None, + scale_upper_bound=None, ) @classmethod - def from_fp8(cls, weight, scale, input_scale, bias, dtype): - if FBGEMM_DYN_AVAILABLE: - # fbgemm needs float32 scales. - scale = scale.float() + def from_fp8( + cls, + weight: torch.Tensor, + scale: torch.Tensor, + dtype: torch.dtype, + bias: Optional[torch.Tensor] = None, + **kwargs, + ) -> "Fp8Linear": + input_scale = kwargs.get("input_scale", None) + scale_upper_bound = kwargs.get("scale_upper_bound", None) + weight_block_size = kwargs.get("weight_block_size", None) + return cls( qweight=weight, scale=scale, - scale_upper_bound=input_scale, + input_scale=input_scale, + scale_upper_bound=scale_upper_bound, bias=bias, dtype=dtype, + weight_block_size=weight_block_size, ) - def forward(self, input: torch.Tensor) -> torch.Tensor: - if FBGEMM_MM_AVAILABLE: - qinput, scale = fp8_quantize( - input, scale_upper_bound=self.scale_upper_bound - ) + @classmethod + def get_shared_device_identity(cls, device): + # Input scaling factors are no longer optional in _scaled_mm starting + # from pytorch 2.5. Allocating a dummy tensor to pass as input_scale + if device not in cls._device_identity_cache: + cls._device_identity_cache[device] = torch.ones(1, device=device) + return cls._device_identity_cache[device] - y = torch.ops.fbgemm.f8f8bf16_rowwise( + def forward(self, input: torch.Tensor) -> torch.Tensor: + if self.weight_block_size is not None: + # https://arxiv.org/pdf/2412.19437 + # At a more granular level. As illustrated in Figure 7 (a), (1) for activations, we group and + # scale elements on a 1x128 tile basis (i.e., per token per 128 channels); and (2) for weights, we + # group and scale elements on a 128x128 block basis (i.e., per 128 input channels per 128 output + # channels). 
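+            # Illustrative shapes (assumed, not taken from a real checkpoint): with
+            # weight_block_size == [128, 128], activations are quantized per token in
+            # groups of weight_block_size[1] = 128 channels, and self.scale holds one
+            # scale per 128x128 block of self.qweight.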
+ qinput, scale = per_token_group_quant_fp8(input, self.weight_block_size[1]) + output = w8a8_block_fp8_matmul( qinput, self.qweight, scale, self.scale, - use_fast_accum=True, - bias=self.bias, + self.weight_block_size, + output_dtype=input.dtype, ) - return y.to(self.dtype) - qinput, scale = fp8_quantize(input, scalar=True) - output, _ = torch._scaled_mm( + if self.bias is not None: + output = output + self.bias + return output.to(dtype=input.dtype) + + qinput, scale = fp8_quantize( + input, + self.input_scale, + scale_upper_bound=self.scale_upper_bound, + scalar=True, + ) + + output = torch._scaled_mm( qinput, self.qweight.t(), out_dtype=self.dtype, @@ -275,11 +443,16 @@ class Fp8Linear(torch.nn.Module): scale_b=self.scale, bias=self.bias, ) + + if isinstance(output, tuple) and len(output) == 2: + output = output[0] + return output def _load_scalar_or_matrix_scale(weights: Weights, prefix: str, shape: torch.Size): scale = weights.get_tensor(prefix, to_dtype=False) + if scale.numel() > 1: scale = weights.get_sharded(prefix, dim=0, to_dtype=False) - return scale.reshape(-1).expand(shape[0]) + return scale.reshape(-1) diff --git a/backends/gaudi/server/text_generation_server/layers/gptq/__init__.py b/backends/gaudi/server/text_generation_server/layers/gptq/__init__.py index 505caa59a..90b8f6923 100644 --- a/backends/gaudi/server/text_generation_server/layers/gptq/__init__.py +++ b/backends/gaudi/server/text_generation_server/layers/gptq/__init__.py @@ -1,14 +1,15 @@ -import os from dataclasses import dataclass from typing import List, Optional, Union import torch from loguru import logger -from text_generation_server.utils.import_utils import SYSTEM from text_generation_server.utils.log import log_once from text_generation_server.utils.weights import Weight, Weights, WeightsLoader +from .hpu import QuantLinear + + @dataclass class GPTQWeight(Weight): qweight: torch.Tensor @@ -30,13 +31,8 @@ class GPTQWeight(Weight): def get_linear(self, bias: torch.Tensor): if self.use_awq_kernel: - if SYSTEM == "rocm": - raise NotImplementedError( - "AWQ GEMM kernel can't be used on ROCm systems, please use `--quantize gptq` instead " - "to use Exllama/GPTQ kernels for AWQ inference." - ) try: - from text_generation_server.layers.awq.quantize.qmodule import WQLinear + from text_generation_server.layers.awq.quantize import WQLinear return WQLinear( w_bit=self.bits, @@ -50,18 +46,7 @@ class GPTQWeight(Weight): raise NotImplementedError( "You do not seem to have awq installed, either install it (cd server && make install-awq), or try using GPTQ `---quantize gptq` a conversion AWQ->GPTQ will happen on the fly" ) - elif self.use_exllama: - try: - from text_generation_server.layers.gptq import ExllamaQuantLinear - except ImportError: - raise NotImplementedError( - "Exllama gptq kernels are not installed. 
Install them `cd server/exllama_kernels && python setup.py install && cd ../exllamav2_kernels && python setup.py install`" - ) - - return ExllamaQuantLinear(self, bias) else: - from text_generation_server.layers.gptq.quant_linear import QuantLinear - return QuantLinear( self.qweight, self.qzeros, @@ -118,23 +103,6 @@ class GPTQWeightsLoader(WeightsLoader): else: g_idx = None - from text_generation_server.layers.gptq import ( - HAS_EXLLAMA, - CAN_EXLLAMA, - GPTQWeight, - ) - - if use_exllama: - if not HAS_EXLLAMA: - if CAN_EXLLAMA: - log_once( - logger.warning, - "Exllama GPTQ cuda kernels (which are faster) could have been used, but are not currently installed, try using BUILD_EXTENSIONS=True", - ) - use_exllama = False - else: - log_once(logger.info, f"Using exllama kernels v{HAS_EXLLAMA}") - qzeros = weights.get_tensor(f"{prefix}.qzeros") scales = weights.get_tensor(f"{prefix}.scales") @@ -247,14 +215,7 @@ class GPTQWeightsLoader(WeightsLoader): [weights.get_sharded(f"{p}.qzeros", dim=1) for p in prefixes], dim=1 ) - from text_generation_server.layers.gptq import HAS_EXLLAMA - - use_exllama = ( - self.bits == 4 - and HAS_EXLLAMA - and self.quantize == "gptq" - and not self.desc_act - ) + use_exllama = self.bits == 4 and self.quantize == "gptq" and not self.desc_act if self.quantize == "gptq" and self.quant_method == "gptq": w = [weights.get_tensor(f"{p}.g_idx") for p in prefixes] @@ -298,6 +259,7 @@ class GPTQWeightsLoader(WeightsLoader): self._get_gptq_params(weights) use_exllama = True + desc_act = self.desc_act if self.bits != 4: use_exllama = False @@ -321,7 +283,8 @@ class GPTQWeightsLoader(WeightsLoader): if g_idx is not None: if ( not torch.equal( - g_idx.cpu(), + # Remove g_idx[0] to adapt the check with TP>1. + (g_idx - g_idx[0]).cpu(), torch.tensor( [i // self.groupsize for i in range(g_idx.shape[0])], dtype=torch.int32, @@ -332,34 +295,22 @@ class GPTQWeightsLoader(WeightsLoader): # Exllama implementation does not support row tensor parallelism with act-order, as # it would require to reorder input activations that are split unto several GPUs use_exllama = False + desc_act = True from text_generation_server.layers.gptq import ( - CAN_EXLLAMA, - HAS_EXLLAMA, GPTQWeight, ) - if use_exllama: - if not HAS_EXLLAMA: - if CAN_EXLLAMA: - log_once( - logger.warning, - "Exllama GPTQ cuda kernels (which are faster) could have been used, but are not currently installed, try using BUILD_EXTENSIONS=True", - ) - use_exllama = False - else: - log_once(logger.info, f"Using exllama kernels v{HAS_EXLLAMA}") - - if use_exllama and self.groupsize != -1: + if not desc_act and self.groupsize != -1: qzeros = weights.get_sharded(f"{prefix}.qzeros", dim=0) scales = weights.get_sharded(f"{prefix}.scales", dim=0) + if g_idx is not None: + # qzeros, scales sharded, and g_idx must be adjusted accordingly + g_idx = g_idx - g_idx[0] else: qzeros = weights.get_tensor(f"{prefix}.qzeros") scales = weights.get_tensor(f"{prefix}.scales") - if use_exllama and g_idx is not None: - g_idx = g_idx - g_idx[0] - if self.quantize == "gptq" and self.quant_method == "awq": log_once( logger.info, "Converting AWQ model to Exllama/GPTQ packing format." 
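# A minimal sketch of why the `(g_idx - g_idx[0])` normalization above keeps the
# trivial-ordering check valid for a tensor-parallel shard of g_idx. The values
# (groupsize=2, a shard covering rows 4..7) are assumed for illustration only:
import torch

groupsize = 2
g_idx_shard = torch.tensor([2, 2, 3, 3], dtype=torch.int32)  # rows 4..7 of the full g_idx
expected = torch.tensor([i // groupsize for i in range(4)], dtype=torch.int32)
assert torch.equal(g_idx_shard - g_idx_shard[0], expected)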
@@ -392,7 +343,7 @@ class GPTQWeightsLoader(WeightsLoader): ) def _get_gptq_params(self, weights: Weights): - if weights._has_tensor("gptq_bits") and weights._has_tensor("gptq_groupsize"): + if weights.has_tensor("gptq_bits") and weights.has_tensor("gptq_groupsize"): self.bits = weights.get_tensor("gptq_bits").item() self.groupsize = weights.get_tensor("gptq_groupsize").item() self.desc_act = False @@ -400,41 +351,7 @@ class GPTQWeightsLoader(WeightsLoader): # before the `gptq_sym` setting tensor was added. self.sym = ( weights.get_tensor("gptq_sym").item() - if weights._has_tensor("gptq_sym") + if weights.has_tensor("gptq_sym") else False ) self.quant_method = "gptq" - - -# Needs to be at the end because circular import. -try: - major, _minor = torch.cuda.get_device_capability() -except Exception: - major = 1 - -HAS_EXLLAMA = False -CAN_EXLLAMA = major >= 8 or SYSTEM == "rocm" -V2 = os.getenv("EXLLAMA_VERSION", "2") == "2" -if os.getenv("DISABLE_EXLLAMA") == "True": - HAS_EXLLAMA = False -elif CAN_EXLLAMA: - try: - if V2: - from text_generation_server.layers.gptq.exllamav2 import ( - QuantLinear as ExllamaQuantLinear, # noqa: F401 - create_exllama_buffers, # noqa: F401 - set_device, # noqa: F401 - ) - - HAS_EXLLAMA = "2" - else: - from text_generation_server.layers.gptq.exllama import ( - Ex4bitLinear as ExllamaQuantLinear, # noqa: F401 - create_exllama_buffers, # noqa: F401 - set_device, # noqa: F401 - ) - - HAS_EXLLAMA = "1" - - except ImportError: - pass diff --git a/backends/gaudi/server/text_generation_server/layers/gptq/custom_autotune.py b/backends/gaudi/server/text_generation_server/layers/gptq/custom_autotune.py deleted file mode 100644 index 0388ef20b..000000000 --- a/backends/gaudi/server/text_generation_server/layers/gptq/custom_autotune.py +++ /dev/null @@ -1,261 +0,0 @@ -# https://github.com/fpgaminer/GPTQ-triton -""" -Mostly the same as the autotuner in Triton, but with a few changes like using 40 runs instead of 100. -""" - -import builtins -import math -import time -from typing import Dict - -import triton - - -class Autotuner(triton.KernelInterface): - def __init__( - self, - fn, - arg_names, - configs, - key, - reset_to_zero, - prune_configs_by: Dict = None, - nearest_power_of_two: bool = False, - ): - """ - :param prune_configs_by: a dict of functions that are used to prune configs, fields: - 'perf_model': performance model used to predicate running time with different configs, returns running time - 'top_k': number of configs to bench - 'prune_num_stages_by'(optional): a function used to prune num_stages. It take configs:List[Config] as its input, and returns pruned configs. 
- 'nearest_power_of_two'(optional): whether to round key arguments to the nearest power of two when caching tuning results - """ - if not configs: - self.configs = [triton.Config({}, num_warps=4, num_stages=2)] - else: - self.configs = configs - self.key_idx = [arg_names.index(k) for k in key] - self.nearest_power_of_two = nearest_power_of_two - self.cache = {} - # hook to reset all required tensor to zeros before relaunching a kernel - self.hook = lambda args: 0 - if reset_to_zero is not None: - self.reset_idx = [arg_names.index(k) for k in reset_to_zero] - - def _hook(args): - for i in self.reset_idx: - args[i].zero_() - - self.hook = _hook - self.arg_names = arg_names - # prune configs - if prune_configs_by: - perf_model, top_k = ( - prune_configs_by["perf_model"], - prune_configs_by["top_k"], - ) - if "early_config_prune" in prune_configs_by: - early_config_prune = prune_configs_by["early_config_prune"] - else: - perf_model, top_k, early_config_prune = None, None, None - self.perf_model, self.configs_top_k = perf_model, top_k - self.early_config_prune = early_config_prune - self.fn = fn - - def _bench(self, *args, config, **meta): - # check for conflicts, i.e. meta-parameters both provided - # as kwargs and by the autotuner - conflicts = meta.keys() & config.kwargs.keys() - if conflicts: - raise ValueError( - f"Conflicting meta-parameters: {', '.join(conflicts)}." - " Make sure that you don't re-define auto-tuned symbols." - ) - # augment meta-parameters with tunable ones - current = dict(meta, **config.kwargs) - - def kernel_call(): - if config.pre_hook: - config.pre_hook(self.nargs) - self.hook(args) - self.fn.run( - *args, - num_warps=config.num_warps, - num_stages=config.num_stages, - **current, - ) - - try: - # In testings using only 40 reps seems to be close enough and it appears to be what PyTorch uses - # PyTorch also sets fast_flush to True, but I didn't see any speedup so I'll leave the default - return triton.testing.do_bench( - kernel_call, quantiles=(0.5, 0.2, 0.8), rep=40 - ) - except triton.OutOfResources: - return [float("inf"), float("inf"), float("inf")] - - def run(self, *args, **kwargs): - self.nargs = dict(zip(self.arg_names, args)) - if len(self.configs) > 1: - key = tuple(args[i] for i in self.key_idx) - - # This reduces the amount of autotuning by rounding the keys to the nearest power of two - # In my testing this gives decent results, and greatly reduces the amount of tuning required - if self.nearest_power_of_two: - key = tuple([2 ** int(math.log2(x) + 0.5) for x in key]) - - if key not in self.cache: - # prune configs - pruned_configs = self.prune_configs(kwargs) - bench_start = time.time() - timings = { - config: self._bench(*args, config=config, **kwargs) - for config in pruned_configs - } - bench_end = time.time() - self.bench_time = bench_end - bench_start - self.cache[key] = builtins.min(timings, key=timings.get) - self.hook(args) - self.configs_timings = timings - config = self.cache[key] - else: - config = self.configs[0] - self.best_config = config - if config.pre_hook is not None: - config.pre_hook(self.nargs) - return self.fn.run( - *args, - num_warps=config.num_warps, - num_stages=config.num_stages, - **kwargs, - **config.kwargs, - ) - - def prune_configs(self, kwargs): - pruned_configs = self.configs - if self.early_config_prune: - pruned_configs = self.early_config_prune(self.configs, self.nargs) - if self.perf_model: - top_k = self.configs_top_k - if isinstance(top_k, float) and top_k <= 1.0: - top_k = int(len(self.configs) * top_k) - if 
len(pruned_configs) > top_k: - est_timing = { - config: self.perf_model( - **self.nargs, - **kwargs, - **config.kwargs, - num_stages=config.num_stages, - num_warps=config.num_warps, - ) - for config in pruned_configs - } - pruned_configs = sorted(est_timing.keys(), key=lambda x: est_timing[x])[ - :top_k - ] - return pruned_configs - - def warmup(self, *args, **kwargs): - self.nargs = dict(zip(self.arg_names, args)) - for config in self.prune_configs(kwargs): - self.fn.warmup( - *args, - num_warps=config.num_warps, - num_stages=config.num_stages, - **kwargs, - **config.kwargs, - ) - self.nargs = None - - -def autotune( - configs, key, prune_configs_by=None, reset_to_zero=None, nearest_power_of_two=False -): - """ - Decorator for auto-tuning a :code:`triton.jit`'d function. - .. highlight:: python - .. code-block:: python - @triton.autotune(configs=[ - triton.Config(meta={'BLOCK_SIZE': 128}, num_warps=4), - triton.Config(meta={'BLOCK_SIZE': 1024}, num_warps=8), - ], - key=['x_size'] # the two above configs will be evaluated anytime - # the value of x_size changes - ) - @triton.jit - def kernel(x_ptr, x_size, **META): - BLOCK_SIZE = META['BLOCK_SIZE'] - :note: When all the configurations are evaluated, the kernel will run multiple time. - This means that whatever value the kernel updates will be updated multiple times. - To avoid this undesired behavior, you can use the `reset_to_zero` argument, which - reset the value of the provided tensor to `zero` before running any configuration. - :param configs: a list of :code:`triton.Config` objects - :type configs: list[triton.Config] - :param key: a list of argument names whose change in value will trigger the evaluation of all provided configs. - :type key: list[str] - :param prune_configs_by: a dict of functions that are used to prune configs, fields: - 'perf_model': performance model used to predicate running time with different configs, returns running time - 'top_k': number of configs to bench - 'early_config_prune'(optional): a function used to do early prune (eg, num_stages). It take configs:List[Config] as its input, and returns pruned configs. - :param reset_to_zero: a list of argument names whose value will be reset to zero before evaluating any configs. - :type reset_to_zero: list[str] - """ - - def decorator(fn): - return Autotuner( - fn, - fn.arg_names, - configs, - key, - reset_to_zero, - prune_configs_by, - nearest_power_of_two, - ) - - return decorator - - -def matmul248_kernel_config_pruner(configs, nargs): - """ - The main purpose of this function is to shrink BLOCK_SIZE_* when the corresponding dimension is smaller. 
- """ - m = max(2 ** int(math.ceil(math.log2(nargs["M"]))), 16) - n = max(2 ** int(math.ceil(math.log2(nargs["N"]))), 16) - k = max(2 ** int(math.ceil(math.log2(nargs["K"]))), 16) - - used = set() - for config in configs: - block_size_m = min(m, config.kwargs["BLOCK_SIZE_M"]) - block_size_n = min(n, config.kwargs["BLOCK_SIZE_N"]) - block_size_k = min(k, config.kwargs["BLOCK_SIZE_K"]) - group_size_m = config.kwargs["GROUP_SIZE_M"] - - if ( - block_size_m, - block_size_n, - block_size_k, - group_size_m, - config.num_stages, - config.num_warps, - ) in used: - continue - - used.add( - ( - block_size_m, - block_size_n, - block_size_k, - group_size_m, - config.num_stages, - config.num_warps, - ) - ) - yield triton.Config( - { - "BLOCK_SIZE_M": block_size_m, - "BLOCK_SIZE_N": block_size_n, - "BLOCK_SIZE_K": block_size_k, - "GROUP_SIZE_M": group_size_m, - }, - num_stages=config.num_stages, - num_warps=config.num_warps, - ) diff --git a/backends/gaudi/server/text_generation_server/layers/gptq/exllama.py b/backends/gaudi/server/text_generation_server/layers/gptq/exllama.py deleted file mode 100644 index f27666b77..000000000 --- a/backends/gaudi/server/text_generation_server/layers/gptq/exllama.py +++ /dev/null @@ -1,134 +0,0 @@ -from text_generation_server.layers.gptq import GPTQWeight -import torch -from exllama_kernels import make_q4, q4_matmul, prepare_buffers, set_tuning_params - -# Dummy tensor to pass instead of g_idx since there is no way to pass "None" to a C++ extension -none_tensor = torch.empty((1, 1), device="meta") - - -def ext_make_q4(qweight, qzeros, scales, g_idx, device): - """Construct Q4Matrix, return handle""" - return make_q4( - qweight, qzeros, scales, g_idx if g_idx is not None else none_tensor, device - ) - - -def ext_q4_matmul(x, q4, q4_width): - """Matrix multiplication, returns x @ q4""" - outshape = x.shape[:-1] + (q4_width,) - x = x.view(-1, x.shape[-1]) - output = torch.empty((x.shape[0], q4_width), dtype=torch.float16, device=x.device) - - q4_matmul(x, q4, output) - - return output.view(outshape) - - -MAX_DQ = 1 -MAX_INNER = 1 -ACT_ORDER = False -DEVICE = None - -TEMP_STATE = None -TEMP_DQ = None - - -def set_device(device): - global DEVICE - DEVICE = device - - -def create_exllama_buffers(max_total_tokens: int): - global MAX_DQ, MAX_INNER, ACT_ORDER, DEVICE, TEMP_STATE, TEMP_DQ - - assert DEVICE is not None, "call set_device first" - - if not ACT_ORDER: - max_total_tokens = 1 - - # This temp_state buffer is required to reorder X in the act-order case. - temp_state = torch.zeros( - (max_total_tokens, MAX_INNER), dtype=torch.float16, device=DEVICE - ) - temp_dq = torch.zeros((1, MAX_DQ), dtype=torch.float16, device=DEVICE) - - # This temp_dq buffer is required to dequantize weights when using cuBLAS, typically for the prefill. 
- prepare_buffers(DEVICE, temp_state, temp_dq) - - matmul_recons_thd = 8 - matmul_fused_remap = False - matmul_no_half2 = False - set_tuning_params(matmul_recons_thd, matmul_fused_remap, matmul_no_half2) - - TEMP_STATE, TEMP_DQ = temp_state, temp_dq - - -class Ex4bitLinear(torch.nn.Module): - """Linear layer implementation with per-group 4-bit quantization of the weights""" - - def __init__(self, weight: GPTQWeight, bias): - super().__init__() - global MAX_DQ, MAX_INNER, ACT_ORDER, DEVICE - assert weight.bits == 4 - - self.device = weight.qweight.device - self.qweight = weight.qweight - self.qzeros = weight.qzeros - self.scales = weight.scales - self.g_idx = weight.g_idx.cpu() if weight.g_idx is not None else None - self.bias = bias if bias is not None else None - - if self.g_idx is not None and ( - (self.g_idx == 0).all() - or torch.equal( - weight.g_idx.cpu(), - torch.tensor( - [i // weight.groupsize for i in range(weight.g_idx.shape[0])], - dtype=torch.int32, - ), - ) - ): - self.empty_g_idx = True - self.g_idx = None - - assert self.device.type == "cuda" - assert self.device.index is not None - - self.q4 = ext_make_q4( - self.qweight, self.qzeros, self.scales, self.g_idx, self.device.index - ) - - self.height = weight.qweight.shape[0] * 8 - self.width = weight.qweight.shape[1] - - # Infer groupsize from height of qzeros - self.groupsize = None - if self.qzeros.shape[0] > 1: - self.groupsize = (self.qweight.shape[0] * 8) // (self.qzeros.shape[0]) - - if self.groupsize is not None: - assert weight.groupsize == self.groupsize - - # Handle act-order matrix - if self.g_idx is not None: - if self.groupsize is None: - raise ValueError("Found group index but no groupsize. What do?") - self.act_order = True - else: - self.act_order = False - - DEVICE = self.qweight.device - - MAX_DQ = max(MAX_DQ, self.qweight.numel() * 8) - - if self.act_order: - MAX_INNER = max(MAX_INNER, self.height, self.width) - - ACT_ORDER = True - - def forward(self, x): - out = ext_q4_matmul(x, self.q4, self.width) - - if self.bias is not None: - out.add_(self.bias) - return out diff --git a/backends/gaudi/server/text_generation_server/layers/gptq/exllamav2.py b/backends/gaudi/server/text_generation_server/layers/gptq/exllamav2.py deleted file mode 100644 index 920a6adf4..000000000 --- a/backends/gaudi/server/text_generation_server/layers/gptq/exllamav2.py +++ /dev/null @@ -1,267 +0,0 @@ -# Adapted from turboderp exllama: https://github.com/turboderp/exllamav2 - -from dataclasses import dataclass -from typing import Optional -import torch -import torch.nn as nn - -from loguru import logger - -from text_generation_server.layers.exl2 import Exl2Weight -from text_generation_server.layers.gptq import GPTQWeight -from text_generation_server.utils.log import log_master - -try: - from exllamav2.ext import exllamav2_ext - - make_q_matrix = exllamav2_ext.make_q_matrix - gemm_half_q_half = exllamav2_ext.gemm_half_q_half -except ImportError: - log_master(logger.warning, "exllamav2_kernels not installed.") - raise - -# Dummy tensor to pass instead of g_idx since there is no way to pass "None" to a C++ extension -none_tensor = torch.empty((1, 1), device="meta") - - -@dataclass -class _ExtraTensors: - """Additional generated quantizer tensors.""" - - q_group_map: Optional[torch.Tensor] = None - q_invperm: Optional[torch.Tensor] = None - q_perm: Optional[torch.Tensor] = None - - -def ext_gemm_half_q_half(x, q_handle, q4_width, force_cuda): - """Matrix multiplication, returns x @ q4""" - output_shape = x.shape[:-1] + (q4_width,) - x = 
x.view(-1, x.shape[-1]) - output = torch.empty((x.shape[0], q4_width), dtype=torch.half, device=x.device) - gemm_half_q_half(x, q_handle, output, force_cuda) - return output.view(output_shape) - - -def make_group_map(q_groups: torch.Tensor, num_qrows: int): - gr = q_groups.tolist() - group_map = [] - num_groups = len(gr) // 2 - - for i in range(num_groups): - bits = gr[i * 2] - if i < num_groups - 1: - qrows = gr[i * 2 + 3] - gr[i * 2 + 1] - else: - qrows = num_qrows - gr[i * 2 + 1] - rows = qrows * 32 // bits - for j in range(rows): - group_map += [i] - group_map += [rows - j] - - return torch.tensor(group_map, dtype=torch.short, device=q_groups.device) - - -# Create Q matrix - - -def ext_make_q_matrix( - w: Exl2Weight | GPTQWeight, - extra: _ExtraTensors, - temp_dq, - key: Optional[str] = None, -): - """ - Create Q matrix - """ - # max_dq_size = 512*(1024**2) - # max_dq_rows = max_dq_size // out_features[0] - max_dq_rows = 0 - - # EXL2 - if isinstance(w, Exl2Weight): - extra.q_group_map = make_group_map(w.q_groups, w.q_weight.shape[0]) - extra.q_perm = torch.argsort(w.q_invperm).short() - - return make_q_matrix( - w.q_weight, - extra.q_perm, - w.q_invperm, - w.q_scale, - w.q_scale_max, - w.q_groups, - extra.q_group_map, - none_tensor, # zeros - none_tensor, # scales - none_tensor, # g_idx - none_tensor, # bias - temp_dq, - max_dq_rows, - ) - # GPTQ - elif isinstance(w, GPTQWeight): - if w.scales.dtype == torch.float: - w.scales = w.scales.half() - - # GPTQ with g_idx (act_order) - if w.g_idx is not None and not (w.g_idx == 0).all().item(): - extra.q_perm = torch.empty( - (w.qweight.shape[0] * 8,), - dtype=torch.short, - device=w.qweight.device, - ) - extra.q_invperm = torch.empty_like(extra.q_perm) - # make_q4 segfaults if g_idx is not on cpu in the act-order case. In the non act-order case, None needs to be passed for g_idx. - return make_q_matrix( - w.qweight, - extra.q_perm, - extra.q_invperm, - none_tensor, # q_scale - none_tensor, # q_scale_max - none_tensor, # q_groups - none_tensor, # q_group_map - w.qzeros, - w.scales, - w.g_idx.cpu(), - none_tensor, # bias - temp_dq, - max_dq_rows, - ) - # GPTQ without g_idx - else: - return make_q_matrix( - w.qweight, - none_tensor, # q_perm - none_tensor, # q_invperm - none_tensor, # q_scale - none_tensor, # q_scale_max - none_tensor, # q_groups - none_tensor, # q_group_map - w.qzeros, - w.scales, - none_tensor, # g_idx - none_tensor, # bias - temp_dq, - max_dq_rows, - ) - else: - RuntimeError("Cannot create handle") - - -DEVICE = None -LAYERS = [] - - -def set_device(device): - global DEVICE - DEVICE = device - - -def create_exllama_buffers(max_total_tokens: int): - global LAYERS, DEVICE - - # No need to initialize scratch space if there are no layers - # that use ExLLamav2. - if len(LAYERS) == 0: - return - - # Find the size of the scratch space. 
- scratch_bytes = max( - layer.scratch_space_fixed(max_input_len=max_total_tokens, max_batch_size=1) - for layer in LAYERS - ) - temp_dq = ExLlamaV2DeviceTensors(DEVICE, scratch_bytes) - - for layer in LAYERS: - layer.post_init(temp_dq) - - -class QuantLinear(nn.Module): - QUANT_TYPE = "exllamav2" - - """Linear layer implementation with per-group 4-bit quantization of the weights""" - - def __init__( - self, - weight: Exl2Weight | GPTQWeight, - bias: torch.Tensor, - ): - super().__init__() - - self.q_handle = None - self.q_tensors = weight - self.extra_tensors = _ExtraTensors() - - if isinstance(weight, Exl2Weight): - self.infeatures = weight.q_invperm.shape[0] - self.outfeatures = weight.q_weight.shape[1] - elif isinstance(weight, GPTQWeight): - if weight.bits != 4: - raise ValueError( - f"Exllamav2 kernel supports only bits=4, requested bits={weight.bits}. Something is wrong in the model initialization." - ) - - self.infeatures = weight.qweight.shape[0] // weight.bits * 32 - self.outfeatures = weight.qweight.shape[1] - - self.padding = -self.outfeatures % 32 - self.outfeatures = self.outfeatures + self.padding - - self.device = weight.device - self.bias = bias if bias is not None else None - - global LAYERS - LAYERS.append(self) - - def post_init(self, temp_dq): - device = self.q_tensors.device - assert device.type == "cuda" - assert device.index is not None - temp_dq = temp_dq.get_scratch_slice(self.temp_dq_size()) - - # We NEED to keep a pointer on Python side, otherwise the garbage collector will mess with us, - # and `Memory access fault by GPU node-2` will EAT you. - self.temp_dq = temp_dq - self.q_handle = ext_make_q_matrix(self.q_tensors, self.extra_tensors, temp_dq) - - def forward(self, x, force_cuda=False): - output = ext_gemm_half_q_half(x, self.q_handle, self.outfeatures, force_cuda) - - if self.bias is not None: - output.add_(self.bias) - return output - - def temp_dq_size(self): - return self.infeatures * self.outfeatures * 2 + 128 - - def temp_fwd_size(self, max_input_len, max_batch_size): - return self.outfeatures * max_input_len * max_batch_size * 4 + 128 - - def scratch_space_fixed(self, max_input_len, max_batch_size): - return self.temp_dq_size() + self.temp_fwd_size(max_input_len, max_batch_size) - - -class ExLlamaV2DeviceTensors: - - device_idx: int - scratch_bytes: int - scratch_idx: int - scratch: torch.tensor = None - - def __init__(self, device, scratch_bytes): - self.device = device - self.scratch_bytes = scratch_bytes - - def prepare(self): - self.scratch = torch.empty( - (self.scratch_bytes // 2,), dtype=torch.half, device=self.device - ) - - def get_scratch_slice(self, size_bytes): - - if self.scratch is None: - self.prepare() - - size_bytes = ((size_bytes + 127) // 128) * 128 - size_half = size_bytes // 2 - scratch_slice = self.scratch.narrow(0, 0, size_half) - return scratch_slice diff --git a/backends/gaudi/server/text_generation_server/layers/gptq/hpu.py b/backends/gaudi/server/text_generation_server/layers/gptq/hpu.py new file mode 100644 index 000000000..72944fa0e --- /dev/null +++ b/backends/gaudi/server/text_generation_server/layers/gptq/hpu.py @@ -0,0 +1,186 @@ +import math +import numpy as np +import torch +import torch.nn as nn + +try: + + convert_from_uint4 = torch.ops.hpu.convert_from_uint4 +except Exception as e: + hpu_import_exception = e + + def error_raiser_hpu(*args, **kwargs): + raise ValueError( + f"Trying to use HPU, but could not import the HPU framework with the following error: {hpu_import_exception}" + ) + + convert_from_uint4 = 
error_raiser_hpu + + +def pack_tensor(input, bits=4): + normal = input.to(torch.int32) + q = torch.zeros((normal.shape[0], normal.shape[1] // 32 * bits), dtype=torch.int32) + i = 0 + col = 0 + while col < q.shape[1]: + for j in range(i, i + (32 // bits)): + q[:, col] |= normal[:, j] << (bits * (j - i)) + i += 32 // bits + col += 1 + q = q.to(torch.int32) + return q + + +class QuantLinear(nn.Module): + def __init__(self, qweight, qzeros, scales, g_idx, bias, bits, groupsize): + super().__init__() + self.register_buffer("qweight", qweight) + self.register_buffer("qzeros", qzeros) + self.register_buffer("scales", scales) + self.register_buffer("g_idx", g_idx) + if bias is not None: + self.register_buffer("bias", bias) + else: + self.bias = None + if bits not in [4]: + raise NotImplementedError("Only 4 bits are supported.") + self.bits = bits + self.maxq = 2**self.bits - 1 + self.groupsize = groupsize + + self.outfeatures = qweight.shape[1] + self.infeatures = qweight.shape[0] * 32 // bits + self.wf = torch.tensor( + list(range(0, 32, self.bits)), dtype=torch.int32 + ).unsqueeze(0) + self._preprocessing() + + def unpack_zeros_from_cuda_old_format(self): + zeros = torch.bitwise_right_shift( + torch.unsqueeze(self.qzeros, 2).expand(-1, -1, 32 // self.bits), + self.wf.unsqueeze(0), + ).to(torch.int16 if self.bits == 8 else torch.int8) + + zeros = zeros + 1 + zeros = torch.bitwise_and(zeros, (2**self.bits) - 1).to( + self.scales.dtype + ) # NOTE: It appears that casting here after the `zeros = zeros + 1` is important. + zeros = zeros.reshape(-1, zeros.shape[1] * zeros.shape[2]) + return zeros + + def unpack_weight_from_cuda_old_format(self): + weight = torch.bitwise_right_shift( + torch.unsqueeze(self.qweight, 1).expand(-1, 32 // self.bits, -1), + self.wf.unsqueeze(-1), + ).to(torch.int16 if self.bits == 8 else torch.int8) + weight = torch.bitwise_and(weight, (2**self.bits) - 1) + weight = weight.reshape((weight.shape[0] * weight.shape[1], weight.shape[2])) + return weight + + def _preprocessing(self): + orig_device = self.qweight.device + self.qweight = self.qweight.cpu() + weight = self.unpack_weight_from_cuda_old_format() + new_qweight = pack_tensor(weight) + self.qweight = new_qweight.to(orig_device) + # TODO: Support group indexing and remove the check + columns = self.qweight.shape[0] + g_idx_trivial = [i // self.groupsize for i in range(columns)] + g_idx_trivial = torch.tensor( + g_idx_trivial, dtype=torch.int32, device=self.g_idx.device + ) + assert torch.equal( + self.g_idx, g_idx_trivial + ), "Non-trivial tensor g_idx is not supported" + self.qzeros = self.qzeros.cpu() + zeros = self.unpack_zeros_from_cuda_old_format() + new_qzeros = pack_tensor(zeros) + self.qzeros = new_qzeros.to(orig_device) + + @classmethod + def new(cls, bits, groupsize, infeatures, outfeatures, bias): + if bits not in [4]: + raise NotImplementedError("Only 4 bits are supported.") + + qweight = torch.zeros((infeatures // 32 * bits, outfeatures), dtype=torch.int32) + qzeros = torch.zeros( + (math.ceil(infeatures / groupsize), outfeatures // 32 * bits), + dtype=torch.int32, + ) + scales = torch.zeros( + (math.ceil(infeatures / groupsize), outfeatures), dtype=torch.float16 + ) + g_idx = torch.tensor( + [i // groupsize for i in range(infeatures)], dtype=torch.int32 + ) + if bias: + bias = torch.zeros((outfeatures), dtype=torch.float16) + else: + bias = None + return cls(qweight, qzeros, scales, g_idx, bias, bits, groupsize) + + def pack(self, linear, scales, zeros, g_idx=None): + self.g_idx = g_idx.clone() if g_idx is 
not None else self.g_idx + + scales = scales.t().contiguous() + zeros = zeros.t().contiguous() + scale_zeros = zeros * scales + self.scales = scales.clone().half() + if linear.bias is not None: + self.bias = linear.bias.clone().half() + + intweight = [] + for idx in range(self.infeatures): + intweight.append( + torch.round( + (linear.weight.data[:, idx] + scale_zeros[self.g_idx[idx]]) + / self.scales[self.g_idx[idx]] + ).to(torch.int)[:, None] + ) + intweight = torch.cat(intweight, dim=1) + intweight = intweight.t().contiguous() + intweight = intweight.numpy().astype(np.uint32) + qweight = np.zeros( + (intweight.shape[0] // 32 * self.bits, intweight.shape[1]), dtype=np.uint32 + ) + i = 0 + row = 0 + while row < qweight.shape[0]: + if self.bits in [4]: + for j in range(i, i + (32 // self.bits)): + qweight[row] |= intweight[j] << (self.bits * (j - i)) + i += 32 // self.bits + row += 1 + else: + raise NotImplementedError("Only 4 bits are supported.") + + qweight = qweight.astype(np.int32) + self.qweight = torch.from_numpy(qweight) + + zeros -= 1 + zeros = zeros.numpy().astype(np.uint32) + qzeros = np.zeros( + (zeros.shape[0], zeros.shape[1] // 32 * self.bits), dtype=np.uint32 + ) + i = 0 + col = 0 + while col < qzeros.shape[1]: + if self.bits in [4]: + for j in range(i, i + (32 // self.bits)): + qzeros[:, col] |= zeros[:, j] << (self.bits * (j - i)) + i += 32 // self.bits + col += 1 + else: + raise NotImplementedError("Only 4 bits are supported.") + + qzeros = qzeros.astype(np.int32) + self.qzeros = torch.from_numpy(qzeros) + + def forward(self, x): + out_shape = x.shape[:-1] + (self.outfeatures,) + x = x.reshape(-1, x.shape[-1]) + weight = convert_from_uint4(self.qweight, self.scales, self.qzeros, x.dtype) + out = torch.matmul(x, weight) + out = out.reshape(out_shape) + out = out + self.bias if self.bias is not None else out + return out diff --git a/backends/gaudi/server/text_generation_server/layers/gptq/quant_linear.py b/backends/gaudi/server/text_generation_server/layers/gptq/quant_linear.py deleted file mode 100644 index 736c357b0..000000000 --- a/backends/gaudi/server/text_generation_server/layers/gptq/quant_linear.py +++ /dev/null @@ -1,359 +0,0 @@ -import math -import numpy as np -import torch -import torch.nn as nn -from torch.cuda.amp import custom_fwd - -import triton -import triton.language as tl -from . 
import custom_autotune - - -# code based https://github.com/fpgaminer/GPTQ-triton -@custom_autotune.autotune( - configs=[ - triton.Config( - { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 256, - "BLOCK_SIZE_K": 32, - "GROUP_SIZE_M": 8, - }, - num_stages=4, - num_warps=4, - ), - triton.Config( - { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 32, - "GROUP_SIZE_M": 8, - }, - num_stages=4, - num_warps=4, - ), - triton.Config( - { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 32, - "GROUP_SIZE_M": 8, - }, - num_stages=4, - num_warps=4, - ), - triton.Config( - { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 32, - "GROUP_SIZE_M": 8, - }, - num_stages=4, - num_warps=4, - ), - triton.Config( - { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 32, - "GROUP_SIZE_M": 8, - }, - num_stages=4, - num_warps=4, - ), - triton.Config( - { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 32, - "GROUP_SIZE_M": 8, - }, - num_stages=2, - num_warps=8, - ), - triton.Config( - { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 8, - }, - num_stages=3, - num_warps=8, - ), - triton.Config( - { - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 8, - }, - num_stages=2, - num_warps=4, - ), - ], - key=["M", "N", "K"], - nearest_power_of_two=True, - prune_configs_by={ - "early_config_prune": custom_autotune.matmul248_kernel_config_pruner, - "perf_model": None, - "top_k": None, - }, -) -@triton.jit -def matmul_248_kernel( - a_ptr, - b_ptr, - c_ptr, - scales_ptr, - zeros_ptr, - g_ptr, - M, - N, - K, - bits, - maxq, - stride_am, - stride_ak, - stride_bk, - stride_bn, - stride_cm, - stride_cn, - stride_scales, - stride_zeros, - BLOCK_SIZE_M: tl.constexpr, - BLOCK_SIZE_N: tl.constexpr, - BLOCK_SIZE_K: tl.constexpr, - GROUP_SIZE_M: tl.constexpr, -): - """ - Compute the matrix multiplication C = A x B. 
- A is of shape (M, K) float16 - B is of shape (K//8, N) int32 - C is of shape (M, N) float16 - scales is of shape (G, N) float16 - zeros is of shape (G, N) float16 - g_ptr is of shape (K) int32 - """ - infearure_per_bits = 32 // bits - - pid = tl.program_id(axis=0) - num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) - num_pid_n = tl.cdiv(N, BLOCK_SIZE_N) - num_pid_k = tl.cdiv(K, BLOCK_SIZE_K) - num_pid_in_group = GROUP_SIZE_M * num_pid_n - group_id = pid // num_pid_in_group - first_pid_m = group_id * GROUP_SIZE_M - group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M) - pid_m = first_pid_m + (pid % group_size_m) - pid_n = (pid % num_pid_in_group) // group_size_m - - offs_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) - offs_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) - offs_k = tl.arange(0, BLOCK_SIZE_K) - a_ptrs = a_ptr + ( - offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak - ) # (BLOCK_SIZE_M, BLOCK_SIZE_K) - a_mask = offs_am[:, None] < M - # b_ptrs is set up such that it repeats elements along the K axis 8 times - b_ptrs = b_ptr + ( - (offs_k[:, None] // infearure_per_bits) * stride_bk - + offs_bn[None, :] * stride_bn - ) # (BLOCK_SIZE_K, BLOCK_SIZE_N) - g_ptrs = g_ptr + offs_k - # shifter is used to extract the N bits of each element in the 32-bit word from B - scales_ptrs = scales_ptr + offs_bn[None, :] - zeros_ptrs = zeros_ptr + (offs_bn[None, :] // infearure_per_bits) - - shifter = (offs_k % infearure_per_bits) * bits - zeros_shifter = (offs_bn % infearure_per_bits) * bits - accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) - - for k in range(0, num_pid_k): - g_idx = tl.load(g_ptrs) - - # Fetch scales and zeros; these are per-outfeature and thus reused in the inner loop - scales = tl.load( - scales_ptrs + g_idx[:, None] * stride_scales - ) # (BLOCK_SIZE_K, BLOCK_SIZE_N,) - zeros = tl.load( - zeros_ptrs + g_idx[:, None] * stride_zeros - ) # (BLOCK_SIZE_K, BLOCK_SIZE_N,) - - zeros = (zeros >> zeros_shifter[None, :]) & maxq - zeros = (zeros + 1) & maxq # eventually avoid overflow - - a = tl.load(a_ptrs, mask=a_mask, other=0.0) # (BLOCK_SIZE_M, BLOCK_SIZE_K) - b = tl.load(b_ptrs) # (BLOCK_SIZE_K, BLOCK_SIZE_N), but repeated - - # Now we need to unpack b (which is N-bit values) into 32-bit values - b = (b >> shifter[:, None]) & maxq # Extract the N-bit values - b = (b - zeros) * scales # Scale and shift - - accumulator += tl.dot(a, b) - a_ptrs += BLOCK_SIZE_K - b_ptrs += (BLOCK_SIZE_K // infearure_per_bits) * stride_bk - g_ptrs += BLOCK_SIZE_K - - c_ptrs = c_ptr + stride_cm * offs_am[:, None] + stride_cn * offs_bn[None, :] - c_mask = (offs_am[:, None] < M) & (offs_bn[None, :] < N) - tl.store(c_ptrs, accumulator, mask=c_mask) - - -def matmul248(input, qweight, scales, qzeros, g_idx, bits, maxq): - with torch.cuda.device(input.device): - output = torch.empty( - (input.shape[0], qweight.shape[1]), device=input.device, dtype=torch.float16 - ) - - def grid(META): - return ( - triton.cdiv(input.shape[0], META["BLOCK_SIZE_M"]) - * triton.cdiv(qweight.shape[1], META["BLOCK_SIZE_N"]), - ) - - matmul_248_kernel[grid]( - input, - qweight, - output, - scales, - qzeros, - g_idx, - input.shape[0], - qweight.shape[1], - input.shape[1], - bits, - maxq, - input.stride(0), - input.stride(1), - qweight.stride(0), - qweight.stride(1), - output.stride(0), - output.stride(1), - scales.stride(0), - qzeros.stride(0), - ) - return output - - -class QuantLinearFunction(torch.autograd.Function): - @staticmethod - @custom_fwd(cast_inputs=torch.float16) - def forward(ctx, 
input, qweight, scales, qzeros, g_idx, bits, maxq): - output = matmul248(input, qweight, scales, qzeros, g_idx, bits, maxq) - return output - - -class QuantLinear(nn.Module): - def __init__(self, qweight, qzeros, scales, g_idx, bias, bits, groupsize): - super().__init__() - self.register_buffer("qweight", qweight) - self.register_buffer("qzeros", qzeros) - self.register_buffer("scales", scales) - self.register_buffer("g_idx", g_idx) - if bias is not None: - self.register_buffer("bias", bias) - else: - self.bias = None - if bits not in [2, 4, 8]: - raise NotImplementedError("Only 2,4,8 bits are supported.") - self.bits = bits - self.maxq = 2**self.bits - 1 - self.groupsize = groupsize - - self.outfeatures = qweight.shape[1] - self.infeatures = qweight.shape[0] * 32 // bits - - @classmethod - def new(cls, bits, groupsize, infeatures, outfeatures, bias): - if bits not in [2, 4, 8]: - raise NotImplementedError("Only 2,4,8 bits are supported.") - - qweight = torch.zeros((infeatures // 32 * bits, outfeatures), dtype=torch.int32) - qzeros = torch.zeros( - (math.ceil(infeatures / groupsize), outfeatures // 32 * bits), - dtype=torch.int32, - ) - scales = torch.zeros( - (math.ceil(infeatures / groupsize), outfeatures), dtype=torch.float16 - ) - g_idx = torch.tensor( - [i // groupsize for i in range(infeatures)], dtype=torch.int32 - ) - if bias: - bias = torch.zeros((outfeatures), dtype=torch.float16) - else: - bias = None - return cls(qweight, qzeros, scales, g_idx, bias, bits, groupsize) - - def pack(self, linear, scales, zeros, g_idx=None): - self.g_idx = g_idx.clone() if g_idx is not None else self.g_idx - - scales = scales.t().contiguous() - zeros = zeros.t().contiguous() - scale_zeros = zeros * scales - self.scales = scales.clone().half() - if linear.bias is not None: - self.bias = linear.bias.clone().half() - - intweight = [] - for idx in range(self.infeatures): - intweight.append( - torch.round( - (linear.weight.data[:, idx] + scale_zeros[self.g_idx[idx]]) - / self.scales[self.g_idx[idx]] - ).to(torch.int)[:, None] - ) - intweight = torch.cat(intweight, dim=1) - intweight = intweight.t().contiguous() - intweight = intweight.numpy().astype(np.uint32) - qweight = np.zeros( - (intweight.shape[0] // 32 * self.bits, intweight.shape[1]), dtype=np.uint32 - ) - i = 0 - row = 0 - while row < qweight.shape[0]: - if self.bits in [2, 4, 8]: - for j in range(i, i + (32 // self.bits)): - qweight[row] |= intweight[j] << (self.bits * (j - i)) - i += 32 // self.bits - row += 1 - else: - raise NotImplementedError("Only 2,4,8 bits are supported.") - - qweight = qweight.astype(np.int32) - self.qweight = torch.from_numpy(qweight) - - zeros -= 1 - zeros = zeros.numpy().astype(np.uint32) - qzeros = np.zeros( - (zeros.shape[0], zeros.shape[1] // 32 * self.bits), dtype=np.uint32 - ) - i = 0 - col = 0 - while col < qzeros.shape[1]: - if self.bits in [2, 4, 8]: - for j in range(i, i + (32 // self.bits)): - qzeros[:, col] |= zeros[:, j] << (self.bits * (j - i)) - i += 32 // self.bits - col += 1 - else: - raise NotImplementedError("Only 2,4,8 bits are supported.") - - qzeros = qzeros.astype(np.int32) - self.qzeros = torch.from_numpy(qzeros) - - def forward(self, x): - out_shape = x.shape[:-1] + (self.outfeatures,) - out = QuantLinearFunction.apply( - x.reshape(-1, x.shape[-1]), - self.qweight, - self.scales, - self.qzeros, - self.g_idx, - self.bits, - self.maxq, - ) - out = out + self.bias if self.bias is not None else out - return out.reshape(out_shape) diff --git 
a/backends/gaudi/server/text_generation_server/layers/gptq/quantize.py b/backends/gaudi/server/text_generation_server/layers/gptq/quantize.py index b0086ea08..aa664ea60 100644 --- a/backends/gaudi/server/text_generation_server/layers/gptq/quantize.py +++ b/backends/gaudi/server/text_generation_server/layers/gptq/quantize.py @@ -12,7 +12,7 @@ from huggingface_hub import HfApi from accelerate import init_empty_weights from text_generation_server.utils import initialize_torch_distributed, Weights from text_generation_server.utils.hub import weight_files -from text_generation_server.layers.gptq.quant_linear import QuantLinear +from text_generation_server.layers.gptq import QuantLinear from loguru import logger from typing import Optional from text_generation_server.layers.gptq.utils import torch_snr_error @@ -956,15 +956,24 @@ def quantize( pack(model, quantizers, bits, groupsize) from safetensors.torch import save_file - from transformers.modeling_utils import shard_checkpoint + from huggingface_hub import split_torch_state_dict_into_shards state_dict = model.state_dict() state_dict = {k: v.cpu().contiguous() for k, v in state_dict.items()} max_shard_size = "10GB" - shards, index = shard_checkpoint( - state_dict, max_shard_size=max_shard_size, weights_name="model.safetensors" + state_dict_split = split_torch_state_dict_into_shards( + state_dict, + filename_pattern="model.safetensors", + max_shard_size=max_shard_size, ) + index = None + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + shards = state_dict_split.filename_to_tensors os.makedirs(output_dir, exist_ok=True) for shard_file, shard in shards.items(): save_file( diff --git a/backends/gaudi/server/text_generation_server/layers/layernorm.py b/backends/gaudi/server/text_generation_server/layers/layernorm.py index ce5289f93..848787910 100644 --- a/backends/gaudi/server/text_generation_server/layers/layernorm.py +++ b/backends/gaudi/server/text_generation_server/layers/layernorm.py @@ -1,9 +1,6 @@ import torch from torch import nn from accelerate import init_empty_weights -from text_generation_server.utils.import_utils import ( - SYSTEM, -) # Monkey patching @@ -33,69 +30,14 @@ def load_layer_norm_no_bias(cls, prefix, weights, eps): torch.nn.LayerNorm.load = load_layer_norm torch.nn.LayerNorm.load_no_bias = load_layer_norm_no_bias -if SYSTEM == "cuda": - import dropout_layer_norm - class FastLayerNorm(nn.LayerNorm): - def forward(self, hidden_states, residual=None): - if hidden_states.shape[-1] > 8192: - if residual is not None: - hidden_states += residual - residual = hidden_states +class FastLayerNorm(nn.LayerNorm): + def forward(self, hidden_states, residual=None): + if residual is not None: + hidden_states += residual + residual = hidden_states - return super(FastLayerNorm, self).forward(hidden_states), residual - else: - ( - normed_hidden_states, - residual, - *rest, - ) = dropout_layer_norm.dropout_add_ln_fwd( - hidden_states, - residual, - self.weight, - self.bias, - None, - None, - None, - None, - 0.0, - self.eps, - 1.0, - 0, - None, - False, - False, - ) - if residual is None: - residual = hidden_states - - return normed_hidden_states, residual - -elif SYSTEM == "rocm": - from vllm._C import ops - - class FastLayerNorm(nn.LayerNorm): - def forward(self, hidden_states, residual=None): - if residual is not None: - hidden_states += residual - residual = hidden_states - - return super().forward(hidden_states), residual - -elif SYSTEM == 
"ipex": - import intel_extension_for_pytorch as ipex - - class FastLayerNorm(nn.LayerNorm): - def forward(self, hidden_states, residual=None): - out = ipex.llm.functional.add_layer_norm( - residual, - hidden_states, - self.weight, - self.bias, - self.eps, - residual is not None, - ) - return out, residual if residual is not None else hidden_states + return super().forward(hidden_states), residual class FastRMSNorm(nn.Module): @@ -111,74 +53,15 @@ class FastRMSNorm(nn.Module): return cls(weight, eps) def forward(self, hidden_states, residual=None): - if SYSTEM == "ipex": - out = ipex.llm.functional.add_rms_norm( - residual, - hidden_states, - self.weight, - None, - self.variance_epsilon, - residual is not None, - ) - return out, residual if residual is not None else hidden_states - elif hidden_states.shape[-1] > 8192: - if residual is not None: - hidden_states += residual - residual = hidden_states + from vllm_hpu_extension.kernels import rms_norm - hidden_states = hidden_states.to(torch.float32) - variance = hidden_states.pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt( - variance + self.variance_epsilon - ) - - # convert into half-precision if necessary - if self.weight.dtype in [torch.float16, torch.bfloat16]: - hidden_states = hidden_states.to(self.weight.dtype) - - return self.weight * hidden_states, residual - elif SYSTEM == "cuda": - # faster post attention rms norm - ( - normed_hidden_states, - res, - *rest, - ) = dropout_layer_norm.dropout_add_ln_fwd( - hidden_states, - residual, - self.weight, - None, - None, - None, - None, - None, - 0.0, - self.variance_epsilon, - 1.0, - 0, - None, - False, - True, # Activate RMSNorm - ) - if res is None: - res = hidden_states - - return normed_hidden_states, res - elif SYSTEM == "rocm": - # We use VLLM RMSNorm kernel that can be compiled for RoCm, instead of Flash Attention ones that can not. - if residual is not None: - hidden_states += residual - residual = hidden_states - - out = torch.empty_like(hidden_states) - ops.rms_norm( - out, - hidden_states, - self.weight.data, - self.variance_epsilon, - ) - return out, residual + orig_shape = hidden_states.shape + if residual is not None: + residual += hidden_states.view(residual.shape) else: - raise ValueError( - "Your system seem to be not supported. Please check your install or open an issue at https://github.com/huggingface/text-generation-inference/issues with a clear reproduction." - ) + residual = hidden_states + # Note: HPUFusedRMSNorm requires 3D tensors as inputs + if len(orig_shape) == 2: + residual = residual.unsqueeze(0) + x = rms_norm().apply(residual, self.weight, self.variance_epsilon) + return x.view(orig_shape), residual.view(orig_shape) diff --git a/backends/gaudi/server/text_generation_server/layers/linear.py b/backends/gaudi/server/text_generation_server/layers/linear.py index 08306d579..cca80c44e 100644 --- a/backends/gaudi/server/text_generation_server/layers/linear.py +++ b/backends/gaudi/server/text_generation_server/layers/linear.py @@ -1,21 +1,5 @@ import torch -from text_generation_server.utils.import_utils import SYSTEM from torch.nn import functional as F -import os - -if SYSTEM == "rocm": - ROCM_USE_SKINNY_GEMM = os.getenv("ROCM_USE_SKINNY_GEMM", "True").lower() in ( - "true", - "1", - ) - - if ROCM_USE_SKINNY_GEMM: - try: - from vllm import _custom_C - except Exception as e: - raise ImportError( - f"Could not load `vllm._custom_C` for ROCm skinny gemm. 
Full error: {e}" - ) class FastLinear(torch.nn.Module): @@ -44,83 +28,11 @@ class FastLinear(torch.nn.Module): return F.linear(input, self.weight, self.bias) -class FastLinearROCm(torch.nn.Module): - def __init__( - self, - weight, - bias, - ) -> None: - super().__init__() - self.weight = torch.nn.Parameter(weight) - if bias is not None: - self.bias = torch.nn.Parameter(bias) - else: - self.bias = None - - self.cu_count = torch.cuda.get_device_properties( - device="cuda" - ).multi_processor_count - self.use_skinny_gemm = ( - ROCM_USE_SKINNY_GEMM - and "gfx1" not in torch.cuda.get_device_properties("cuda").gcnArchName - ) - - @classmethod - def load(cls, config, prefix: str, weights, bias: bool): - weight = weights.get_tensor(f"{prefix}.weight") - if bias: - bias = weights.get_tensor(f"{prefix}.bias") - else: - bias = None - return cls(weight, bias) - - def forward(self, inp: torch.Tensor) -> torch.Tensor: - weight = self.weight - bias = self.bias - - if ( - self.use_skinny_gemm - and inp.dtype == torch.float16 - and inp.shape[-1] % 8 == 0 - ): - batched = False - inp_shape = inp.shape - - if inp.dim() == 3: - inp = inp.view(-1, inp_shape[-1]) - batched = True - - m, n, k = weight.shape[0], inp_shape[0], inp_shape[1] - if m > 8 and n <= 4: - out = torch.empty( - inp_shape[0], weight.shape[0], dtype=inp.dtype, device=weight.device - ) - _custom_C.wvSpltK(weight, inp, out, n, self.cu_count) - elif m % 4 == 0 and n == 1 and k <= 8192: - out = torch.empty( - inp_shape[0], weight.shape[0], dtype=inp.dtype, device=weight.device - ) - _custom_C.LLMM1(weight, inp, out, 4) - else: - out = F.linear(inp, weight) - - if batched: - out.view(*inp_shape[:-1], out.shape[-1]) - - if bias is not None: - out = out + bias - return out - return F.linear(inp, self.weight, self.bias) - - def get_linear(weight, bias): # Weights that are loaded through methods that are not # quantization-aware are still bare tensors. We may want # to change this in the future. 
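# --- Editor's sketch (not part of the patch): the dispatch below, reduced to a
# standalone toy. After this change a bare tensor always becomes a plain
# FastLinear (F.linear); quantized weight wrappers still build their own module
# via weight.get_linear(bias). Names and shapes are illustrative only. ---
import torch
import torch.nn.functional as F

def toy_get_linear(weight, bias):
    if isinstance(weight, torch.Tensor):
        return lambda x: F.linear(x, weight, bias)   # FastLinear-style path for bare tensors
    return weight.get_linear(bias)                   # quantized weight wrapper path

layer = toy_get_linear(torch.randn(128, 256), None)
print(layer(torch.randn(4, 256)).shape)              # torch.Size([4, 128])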
if isinstance(weight, torch.Tensor): - if SYSTEM == "rocm": - return FastLinearROCm(weight, bias) - else: - return FastLinear(weight, bias) + return FastLinear(weight, bias) return weight.get_linear(bias) diff --git a/backends/gaudi/server/text_generation_server/layers/marlin/__init__.py b/backends/gaudi/server/text_generation_server/layers/marlin/__init__.py deleted file mode 100644 index 3ff3ed58f..000000000 --- a/backends/gaudi/server/text_generation_server/layers/marlin/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -from text_generation_server.layers.marlin.fp8 import GPTQMarlinFP8Linear -from text_generation_server.layers.marlin.gptq import ( - GPTQMarlinWeightsLoader, - can_use_gptq_marlin, - repack_gptq_for_marlin, -) -from text_generation_server.layers.marlin.marlin import MarlinWeightsLoader - -__all__ = [ - "GPTQMarlinFP8Linear", - "GPTQMarlinWeightsLoader", - "MarlinWeightsLoader", - "can_use_gptq_marlin", - "repack_gptq_for_marlin", -] diff --git a/backends/gaudi/server/text_generation_server/layers/marlin/fp8.py b/backends/gaudi/server/text_generation_server/layers/marlin/fp8.py deleted file mode 100644 index fe55a58a3..000000000 --- a/backends/gaudi/server/text_generation_server/layers/marlin/fp8.py +++ /dev/null @@ -1,140 +0,0 @@ -from typing import Optional - -import torch -import torch.nn as nn -from loguru import logger -from text_generation_server.layers.fp8 import fp8_quantize -from text_generation_server.layers.marlin.gptq import _check_valid_shape -from text_generation_server.layers.marlin.util import ( - _check_marlin_kernels, - permute_scales, -) -from text_generation_server.utils.log import log_once - -try: - import marlin_kernels -except ImportError: - marlin_kernels = None - - -MARLIN_TILE_SIZE = 16 - - -class GPTQMarlinFP8Linear(nn.Module): - """ - FP8 GPTQ-Marlin linear layer. 
- """ - - def __init__( - self, - qweight: torch.Tensor, - scales: torch.Tensor, - bias: Optional[torch.Tensor], - ) -> None: - super().__init__() - - _check_marlin_kernels() - assert marlin_kernels is not None - - log_once(logger.info, "GPU does not support FP8, using Marlin FP8 kernel") - - scales = scales.unsqueeze(0) - if scales.shape[1] == 1: - out_features, in_features = qweight.shape - scales = scales.repeat(1, out_features) - qweight, scales = repack_fp8_for_marlin(qweight, scales) - - in_features = qweight.shape[0] * MARLIN_TILE_SIZE - out_features = scales.shape[1] - _check_valid_shape(in_features=in_features, out_features=out_features) - - self.qweight = qweight - self.scales = scales - self.bias = bias if bias is not None else None - - self.workspace = torch.zeros( - out_features // 64 * 16, dtype=torch.int, device=qweight.device - ) - - @classmethod - def from_unquant(cls, weight, bias, dtype): - qweight, scales = fp8_quantize(weight) - return cls(qweight=qweight, scales=scales.to(dtype), bias=bias) - - @classmethod - def from_fp8(cls, weight, scale, _input_scale, bias, dtype): - return cls(qweight=weight, scales=scale.to(dtype), bias=bias) - - def forward(self, A: torch.Tensor) -> torch.Tensor: - assert marlin_kernels is not None - - A_flat = A.view(-1, A.shape[-1]) - C = marlin_kernels.fp8_marlin_gemm( - A_flat, - self.qweight, - self.scales, - self.workspace, - 8, - A_flat.shape[0], - self.scales.shape[1], - A_flat.shape[1], - ) - C = C.reshape(A.shape[:-1] + (self.scales.shape[1],)) - - if self.bias is not None: - C += self.bias - - return C - - -def pack_fp8_as_int32(fp8_tensor: torch.Tensor) -> torch.Tensor: - """ - Repack FP8 weights to gptq format (packed int32 elements). - """ - assert fp8_tensor.dtype == torch.float8_e4m3fn - - if fp8_tensor.shape[0] % 4 != 0: - raise ValueError( - f"Leading tensor dimension is not divisable by 4: {fp8_tensor.shape[0]}" - ) - - # Reshape to prepare for packing - reshaped = fp8_tensor.reshape(-1, 4, *fp8_tensor.shape[1:]) - - # Convert fp8 to uint8 (byte) representation - byte_tensor = reshaped.view(torch.uint8) - - # Pack 4 uint8 values into one int32 - packed = torch.zeros( - fp8_tensor.shape[0] // 4, - fp8_tensor.shape[1], - dtype=torch.int32, - device=fp8_tensor.device, - ) - - for i in range(4): - packed.bitwise_or_(byte_tensor[:, i].to(torch.int32) << i * 8) - - return packed - - -def repack_fp8_for_marlin(weight: torch.Tensor, scales: torch.Tensor): - """ - Repack FP8 tensor for GPTQ-Marlin. - """ - - out_features, in_features = weight.shape - - # Torch linear layers weights with shape [out_features, in_features], - # GPTQ-quantized weights use [in_feateres/pack_factor, in_features], - # so transpose before packing. 
- qweight = pack_fp8_as_int32(weight.t()) - - perm = torch.empty(0, dtype=torch.int, device=qweight.device) - repacked = marlin_kernels.gptq_marlin_repack( - qweight, perm, in_features, out_features, 8 - ) - - scales = permute_scales(scales) - - return repacked, scales diff --git a/backends/gaudi/server/text_generation_server/layers/marlin/gptq.py b/backends/gaudi/server/text_generation_server/layers/marlin/gptq.py deleted file mode 100644 index 0a785d944..000000000 --- a/backends/gaudi/server/text_generation_server/layers/marlin/gptq.py +++ /dev/null @@ -1,464 +0,0 @@ -from dataclasses import dataclass -from typing import List, Optional, Union - -import numpy -import torch -import torch.nn as nn -from loguru import logger -from text_generation_server.layers.marlin.util import ( - _check_marlin_kernels, - marlin_zero_points, - permute_scales, - unpack_cols, -) -from text_generation_server.utils.import_utils import SYSTEM -from text_generation_server.utils.log import log_once -from text_generation_server.utils.weights import Weight, Weights, WeightsLoader - -try: - import marlin_kernels -except ImportError: - marlin_kernels = None - -try: - major, _minor = torch.cuda.get_device_capability() - has_sm_8_0 = major >= 8 -except Exception: - has_sm_8_0 = False - - -GPTQ_MARLIN_BITS = [4, 8] -GPTQ_MARLIN_GROUP_SIZES = [-1, 32, 64, 128] -MARLIN_TILE_SIZE = 16 - - -def can_use_gptq_marlin( - *, bits: int, groupsize: int, quant_method: str, quantize: str, sym: bool -) -> bool: - return ( - SYSTEM == "cuda" - and marlin_kernels is not None - and has_sm_8_0 - and quantize in {"awq", "gptq"} - and quant_method in {"awq", "gptq"} - and bits in GPTQ_MARLIN_BITS - and groupsize in GPTQ_MARLIN_GROUP_SIZES - # We only suppord asymmetric quantization for AWQ. - and (sym or quant_method == "awq") - ) - - -class GPTQMarlinWeightsLoader(WeightsLoader): - """ - Loader for using GPTQ- and AWQ-quantized weights with Marlin kernels. - """ - - def __init__( - self, - *, - bits: int, - desc_act: bool, - groupsize: int, - quant_method: str, - quantize: str, - sym: bool, - ): - self.bits = bits - self.desc_act = desc_act - self.groupsize = groupsize - self.quant_method = quant_method - self.quantize = quantize - self.sym = sym - - def get_weights(self, weights: Weights, prefix: str): - log_once(logger.info, "Using GPTQ-Marlin kernels") - try: - qweight = weights.get_tensor(f"{prefix}.qweight") - except RuntimeError: - raise RuntimeError( - f"Cannot load `{self.quantize}` weight for GPTQ -> Marlin repacking, make sure the model is already quantized" - ) - - if not self.sym: - qzeros = weights.get_tensor(f"{prefix}.qzeros") - else: - qzeros = None - - if self.quant_method == "awq": - g_idx = None - else: - g_idx = weights.get_tensor(f"{prefix}.g_idx") - scales = weights.get_tensor(f"{prefix}.scales") - - return repack_gptq_for_marlin( - qweight=qweight, - scales=scales, - qzeros=qzeros, - g_idx=g_idx, - bits=self.bits, - desc_act=self.desc_act, - groupsize=self.groupsize, - quant_method=self.quant_method, - sym=self.sym, - sharded_infeatures=False, - ) - - def get_weights_col_packed( - self, - weights: Weights, - prefix: str, - block_sizes: Union[int, List[int]], - ): - try: - qweight = weights.get_packed_sharded( - f"{prefix}.qweight", dim=1, block_sizes=block_sizes - ) - except RuntimeError: - raise RuntimeError( - f"Cannot load `{self.quantize}` weight, make sure the model is already quantized." 
- ) - scales = weights.get_packed_sharded( - f"{prefix}.scales", dim=1, block_sizes=block_sizes - ) - scales = scales.to(dtype=weights.dtype) - - if not self.sym: - qzeros = weights.get_packed_sharded( - f"{prefix}.qzeros", dim=1, block_sizes=block_sizes - ) - else: - qzeros = None - - if self.quant_method == "awq": - g_idx = None - else: - g_idx = weights.get_tensor(f"{prefix}.g_idx") - return repack_gptq_for_marlin( - qweight=qweight, - scales=scales, - qzeros=qzeros, - g_idx=g_idx, - bits=self.bits, - desc_act=self.desc_act, - groupsize=self.groupsize, - quant_method=self.quant_method, - sym=self.sym, - sharded_infeatures=False, - ) - - def get_multi_weights_col(self, weights: Weights, prefixes: List[str], dim: int): - try: - qweight = torch.cat( - [weights.get_sharded(f"{p}.qweight", dim=1) for p in prefixes], dim=1 - ) - except RuntimeError: - raise RuntimeError( - f"Cannot load `{self.quantize}` weight, make sure the model is already quantized" - ) - - scales = torch.cat( - [weights.get_sharded(f"{p}.scales", dim=1) for p in prefixes], dim=1 - ) - - if not self.sym: - qzeros = torch.cat( - [weights.get_sharded(f"{p}.qzeros", dim=1) for p in prefixes], dim=1 - ) - else: - qzeros = None - - if self.quant_method == "awq": - g_idx = None - else: - w = [weights.get_tensor(f"{p}.g_idx") for p in prefixes] - for w2 in w[1:]: - torch.testing.assert_close(w2, w[0]) - g_idx = w[0] - - return repack_gptq_for_marlin( - qweight=qweight, - scales=scales, - qzeros=qzeros, - g_idx=g_idx, - bits=self.bits, - desc_act=self.desc_act, - groupsize=self.groupsize, - quant_method=self.quant_method, - sym=self.sym, - sharded_infeatures=False, - ) - - def get_weights_row(self, weights: Weights, prefix: str): - log_once(logger.info, "Using GPTQ-Marlin kernels") - try: - qweight = weights.get_sharded(f"{prefix}.qweight", dim=0) - except RuntimeError: - raise RuntimeError( - f"Cannot load `{self.quantize}` weight for GPTQ -> Marlin repacking, make sure the model is already quantized" - ) - - if not self.sym: - if self.desc_act or self.groupsize == -1: - qzeros = weights.get_tensor(f"{prefix}.qzeros") - else: - qzeros = weights.get_sharded(f"{prefix}.qzeros", dim=0) - else: - qzeros = None - - if self.quant_method == "awq": - g_idx = None - else: - g_idx = weights.get_sharded(f"{prefix}.g_idx", dim=0) - - if self.desc_act or self.groupsize == -1: - scales = weights.get_tensor(f"{prefix}.scales") - else: - scales = weights.get_sharded(f"{prefix}.scales", dim=0) - - sharded_in_features = weights.process_group.size() > 1 - - return repack_gptq_for_marlin( - qweight=qweight, - scales=scales, - qzeros=qzeros, - g_idx=g_idx, - bits=self.bits, - desc_act=self.desc_act, - groupsize=self.groupsize, - quant_method=self.quant_method, - sym=self.sym, - sharded_infeatures=sharded_in_features, - ) - - def _get_gptq_params(self, weights: Weights): - if weights._has_tensor("gptq_bits") and weights._has_tensor("gptq_groupsize"): - self.bits = weights.get_tensor("gptq_bits").item() - self.groupsize = weights.get_tensor("gptq_groupsize").item() - self.desc_act = False - # `server quantize` used asymmetric quantization unconditionally - # before the `gptq_sym` setting tensor was added. - self.sym = ( - weights.get_tensor("gptq_sym").item() - if weights._has_tensor("gptq_sym") - else False - ) - self.quant_method = "gptq" - - -@dataclass -class GPTQMarlinWeight(Weight): - """ - Repacked GPTQ Marlin weights. 
- """ - - qweight: torch.Tensor - qzeros: torch.Tensor - scales: torch.Tensor - g_idx: torch.Tensor - perm: torch.Tensor - bits: int - is_full_k: bool - - def __post_init__(self): - assert self.qweight.dtype == torch.int32 - assert self.scales.dtype == torch.float16 - assert self.g_idx.dtype == torch.int32 - assert self.perm.dtype == torch.int32 - - def get_linear(self, bias: torch.Tensor): - return GPTQMarlinLinear( - weight=self, - bias=bias, - ) - - -def repack_gptq_for_marlin( - *, - qweight: torch.Tensor, - qzeros: Optional[torch.Tensor], - scales: torch.Tensor, - g_idx: Optional[torch.Tensor], - bits: int, - desc_act: bool, - groupsize: int, - quant_method: str, - sym: bool, - sharded_infeatures: bool, -) -> GPTQMarlinWeight: - """Convert GPTQ weights to a layout that's compatible with GPTQ-Marlin kernels.""" - _check_marlin_kernels() - assert marlin_kernels is not None - - if bits not in GPTQ_MARLIN_BITS: - supported_bits = ", ".join(str(b) for b in GPTQ_MARLIN_BITS) - raise RuntimeError( - f"Repacking {bits}-bit GPTQ weights as Marlin is not supported, must be one of: {supported_bits}" - ) - - if groupsize not in GPTQ_MARLIN_GROUP_SIZES: - supported_sizes = ", ".join(str(b) for b in GPTQ_MARLIN_GROUP_SIZES) - raise RuntimeError( - f"Repacking GPTQ weights with group size {groupsize} as Marlin is not supported, must be one of: {supported_sizes}" - ) - if not (sym or quant_method == "awq"): - raise RuntimeError( - "Repacking GPTQ weights with asymmetric quantization as Marlin is not supported." - ) - - log_once(logger.info, f"Converting {quant_method} model to Marlin packing format.") - - weights_per_int = 32 // bits - in_features = qweight.shape[0] - out_features = qweight.shape[1] - - # AWQ uses column packing, GPTQ uses row packing - if quant_method == "awq": - out_features *= weights_per_int - else: - in_features *= weights_per_int - - if in_features % groupsize != 0: - raise ValueError( - f"Number of input features ({in_features}) not divisible by group size ({groupsize})" - ) - - if g_idx is not None and desc_act and groupsize != -1: - perm = torch.argsort(g_idx).to(torch.int) - g_idx = g_idx[perm] - else: - perm = torch.empty(0, dtype=torch.int, device=qweight.device) - g_idx = torch.empty(0, dtype=torch.int, device=qweight.device) - - if quant_method == "awq": - repacked = marlin_kernels.awq_marlin_repack( - qweight, in_features, out_features, bits - ) - if qzeros is not None: - qzeros = awq_to_marlin_zero_points( - qzeros, - in_features // groupsize, - out_features, - bits, - ) - - else: - repacked = marlin_kernels.gptq_marlin_repack( - qweight, perm, in_features, out_features, bits - ) - - if qzeros is None: - qzeros = torch.empty(0, dtype=torch.int, device=qweight.device) - - scales = permute_scales(scales) - - is_full_k = not (desc_act and groupsize != -1 and sharded_infeatures) - - return GPTQMarlinWeight( - qweight=repacked, - qzeros=qzeros, - scales=scales, - g_idx=g_idx, - perm=perm, - bits=bits, - is_full_k=is_full_k, - ) - - -class GPTQMarlinLinear(nn.Module): - """ - Linear layer for GPTQ weights that were converted for the GPTQ-Marlin - kernels. 
- """ - - def __init__( - self, - *, - weight: GPTQMarlinWeight, - bias: Optional[torch.Tensor], - ): - super().__init__() - - _check_marlin_kernels() - assert marlin_kernels is not None - - in_features = weight.qweight.shape[0] * MARLIN_TILE_SIZE - out_features = weight.scales.shape[1] - _check_valid_shape(in_features=in_features, out_features=out_features) - - self.bits = weight.bits - self.is_full_k = weight.is_full_k - - self.qweight = weight.qweight - self.qzeros = weight.qzeros - self.scales = weight.scales - self.g_idx = weight.g_idx - self.perm = weight.perm - if bias is not None: - self.bias = bias - else: - self.bias = None - - self.workspace = torch.zeros( - out_features // 64 * 16, dtype=torch.int, device=weight.qweight.device - ) - - def forward(self, A: torch.Tensor) -> torch.Tensor: - assert marlin_kernels is not None - - A_flat = A.view(-1, A.shape[-1]) - C = marlin_kernels.gptq_marlin_gemm( - A_flat, - self.qweight, - self.scales, - self.qzeros, - self.g_idx, - self.perm, - self.workspace, - self.bits, - A_flat.shape[0], - self.scales.shape[1], - A_flat.shape[1], - self.is_full_k, - self.qzeros.numel() > 0, - True, - ) - C = C.reshape(A.shape[:-1] + (self.scales.shape[1],)) - - if self.bias is not None: - C += self.bias - - return C - - -def awq_to_marlin_zero_points( - q_zp_packed: torch.Tensor, size_k: int, size_n: int, num_bits: int -) -> torch.Tensor: - # AWQ zero-points are quantized and packed on the column dim. - # In addition, the values are permuted based on dequantizer. - # Here we undo both of these, and then apply marlin permutation - # and pack it back. - q_zp = unpack_cols(q_zp_packed, num_bits, size_k, size_n) - - # Undo interleaving (use argsort(..) to get inverse perm) - if num_bits == 4: - undo_interleave = numpy.argsort(numpy.array([0, 2, 4, 6, 1, 3, 5, 7])) - elif num_bits == 8: - undo_interleave = numpy.argsort(numpy.array([0, 2, 1, 3])) - else: - raise Exception("num_bits must be 4 or 8, got {}".format(num_bits)) - - q_zp = q_zp.reshape((-1, len(undo_interleave)))[:, undo_interleave].ravel() - q_zp = q_zp.reshape((-1, size_n)).contiguous() - - marlin_zp = marlin_zero_points(q_zp, size_k, size_n, num_bits) - return marlin_zp - - -def _check_valid_shape(in_features: int, out_features: int): - if (in_features % 128 != 0 or out_features % 64 != 0) and ( - in_features % 64 != 0 or out_features % 128 != 0 - ): - raise ValueError( - f"The GPTQ Marlin kernel does not have a valid thread configuration for weight matrix with shape ({out_features}, {in_features})." - " The shape elements must be divisible by (128, 64) or (64, 128)." 
- ) diff --git a/backends/gaudi/server/text_generation_server/layers/marlin/marlin.py b/backends/gaudi/server/text_generation_server/layers/marlin/marlin.py deleted file mode 100644 index 89ebaca62..000000000 --- a/backends/gaudi/server/text_generation_server/layers/marlin/marlin.py +++ /dev/null @@ -1,346 +0,0 @@ -from dataclasses import dataclass -from typing import List, Optional, Union - -import torch -import torch.nn as nn -from text_generation_server.layers.marlin.util import _check_marlin_kernels -from text_generation_server.utils.weights import Weight, Weights, WeightsLoader - -try: - import marlin_kernels -except ImportError: - marlin_kernels = None - - -class MarlinWeightsLoader(WeightsLoader): - """Loader for Marlin-quantized weights.""" - - def __init__(self, *, bits: int, is_marlin_24: bool): - self.bits = bits - self.is_marlin_24 = is_marlin_24 - - def get_weights(self, weights: "Weights", prefix: str): - """ - Get weights at the given prefix and apply without tensor paralllism. - """ - is_marlin_24 = getattr(self, "gptq_checkpoint_format", None) == "marlin_24" - if is_marlin_24: - try: - B = weights.get_tensor(f"{prefix}.B_24") - except RuntimeError: - raise RuntimeError( - "Cannot load `marlin` 2:4 sparsity weight, make sure the model is already quantized." - ) - - B_meta = weights.get_tensor(f"{prefix}.B_meta") - s = weights.get_tensor(f"{prefix}.s") - weight = GPTQMarlin24Weight(B=B, B_meta=B_meta, s=s, bits=self.bits) - else: - try: - B = weights.get_tensor(f"{prefix}.B") - except RuntimeError: - raise RuntimeError( - "Cannot load `marlin` weight, make sure the model is already quantized." - ) - - s = weights.get_tensor(f"{prefix}.s") - weight = MarlinWeight(B=B, s=s) - - return weight - - def get_weights_col_packed( - self, - weights: Weights, - prefix: str, - block_sizes: Union[int, List[int]], - ): - if self.is_marlin_24: - B = weights.get_packed_sharded( - f"{prefix}.B_24", dim=1, block_sizes=block_sizes - ) - B_meta = weights.get_packed_sharded( - f"{prefix}.B_meta", dim=1, block_sizes=block_sizes - ) - s = weights.get_packed_sharded( - f"{prefix}.s", dim=1, block_sizes=block_sizes - ) - - weight = GPTQMarlin24Weight(B=B, B_meta=B_meta, s=s, bits=self.bits) - else: - B = weights.get_packed_sharded( - f"{prefix}.B", dim=1, block_sizes=block_sizes - ) - s = weights.get_packed_sharded( - f"{prefix}.s", dim=1, block_sizes=block_sizes - ) - weight = MarlinWeight(B=B, s=s) - - return weight - - def get_multi_weights_col(self, weights: Weights, prefixes: List[str], dim: int): - if self.is_marlin_24: - try: - B = torch.cat( - [weights.get_sharded(f"{p}.B_24", dim=1) for p in prefixes], dim=1 - ) - except RuntimeError: - raise RuntimeError( - "Cannot load `marlin` weight, make sure the model is already quantized" - ) - - B_meta = torch.cat( - [weights.get_sharded(f"{p}.B_meta", dim=1) for p in prefixes], dim=1 - ) - - s = torch.cat( - [weights.get_sharded(f"{p}.s", dim=1) for p in prefixes], dim=1 - ) - - weight = GPTQMarlin24Weight(B=B, B_meta=B_meta, s=s, bits=self.bits) - else: - try: - B = torch.cat( - [weights.get_sharded(f"{p}.B", dim=1) for p in prefixes], dim=1 - ) - except RuntimeError: - raise RuntimeError( - "Cannot load `marlin` weight, make sure the model is already quantized" - ) - s = torch.cat( - [weights.get_sharded(f"{p}.s", dim=1) for p in prefixes], dim=1 - ) - - weight = MarlinWeight(B=B, s=s) - - return weight - - def get_weights_row(self, weights: Weights, prefix: str): - if self.is_marlin_24: - try: - B = weights.get_sharded(f"{prefix}.B_24", dim=0) - 
except RuntimeError: - raise RuntimeError( - "Cannot load `marlin` 2:4 sparsity weight, make sure the model is already quantized." - ) - - B_meta = weights.get_sharded(f"{prefix}.B_meta", dim=0) - num_groups = weights._get_slice(f"{prefix}.s").get_shape()[0] - if num_groups == 1: - # The number of groups is 1 when groupsize == -1. share - # scales between all shards in this case. - s = weights.get_tensor(f"{prefix}.s") - else: - s = weights.get_sharded(f"{prefix}.s", dim=0) - - weight = GPTQMarlin24Weight(B=B, B_meta=B_meta, s=s, bits=self.bits) - else: - try: - B = weights.get_sharded(f"{prefix}.B", dim=0) - except RuntimeError: - raise RuntimeError( - "Cannot load `marlin` weight, make sure the model is already quantized." - ) - - num_groups = weights._get_slice(f"{prefix}.s").get_shape()[0] - if num_groups == 1: - # The number of groups is 1 when groupsize == -1. share - # scales between all shards in this case. - s = weights.get_tensor(f"{prefix}.s") - else: - s = weights.get_sharded(f"{prefix}.s", dim=0) - weight = MarlinWeight(B=B, s=s) - - return weight - - -@dataclass -class MarlinWeight(Weight): - """ - Marlin weights. - - Attributes: - B (torch.Tensor): int4-quantized weights packed into int32. - s (torch.Tensor): bfloat16/float16 scales. - """ - - B: torch.Tensor - s: torch.Tensor - - def __post_init__(self): - assert self.B.dtype == torch.int32 - assert self.s.dtype in [torch.float16, torch.bfloat16] - - def get_linear(self, bias: torch.Tensor): - return MarlinLinear(weight=self, bias=bias) - - -class MarlinLinear(nn.Module): - def __init__(self, *, weight: MarlinWeight, bias: Optional[torch.Tensor]): - super().__init__() - - _check_marlin_kernels() - assert marlin_kernels is not None - - in_features = weight.B.shape[0] * MARLIN_TILE_SIZE - out_features = weight.s.shape[1] - assert ( - in_features % 128 == 0 - ), f"Number of input features ({in_features}) not divisable by 128" - assert ( - out_features % 256 == 0 - ), f"Number of output features ({out_features}) not divisable by 256" - - groupsize = -1 if weight.s.shape[0] == 1 else in_features // weight.s.shape[0] - assert groupsize in { - -1, - 128, - }, f"Group size must be -1 or 128, was {groupsize}" - - self.B = weight.B - self.s = weight.s - if bias is not None: - self.bias = bias - else: - self.bias = None - - self.workspace = torch.zeros( - out_features // 64 * 16, dtype=torch.int, device=weight.B.device - ) - - def forward(self, A: torch.Tensor) -> torch.Tensor: - assert marlin_kernels is not None - - C = marlin_kernels.marlin_gemm( - A.view(-1, A.shape[-1]), - self.B, - self.s, - self.workspace, - A.shape[0], - self.s.shape[1], - A.shape[1], - ) - C = C.reshape(A.shape[:-1] + (self.s.shape[1],)) - - if self.bias is not None: - C += self.bias - - return C - - -GPTQ_MARLIN_24_MIN_THREAD_N = 128 -GPTQ_MARLIN_24_MIN_THREAD_K = 128 -GPTQ_MARLIN_24_MAX_PARALLEL = 64 -GPTQ_MARLIN_24_SUPPORTED_NUM_BITS = [4, 8] -GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES = [-1, 128] -MARLIN_TILE_SIZE = 16 - - -@dataclass -class GPTQMarlin24Weight: - """ - GPTQ-Marlin 2:4 weights. - - Attributes: - B (torch.Tensor): int4-quantized weights packed into int32. - B_meta (torch.Tensor): metadata for 2:4 sparsity. - s (torch.Tensor): float16 scales. - bits: quantized weight size. 
- """ - - B: torch.Tensor - B_meta: torch.Tensor - s: torch.Tensor - bits: int - - def __post_init__(self): - assert self.B.dtype == torch.int32 - assert self.B_meta.dtype == torch.int16 - assert self.s.dtype == torch.float16 - - def get_linear(self, bias: torch.Tensor): - return GPTQMarlin24Linear( - weight=self, - bias=bias, - ) - - -class GPTQMarlin24Linear(nn.Module): - def __init__(self, *, weight: GPTQMarlin24Weight, bias: Optional[torch.Tensor]): - super().__init__() - - _check_marlin_kernels() - assert marlin_kernels is not None - - if weight.bits not in GPTQ_MARLIN_24_SUPPORTED_NUM_BITS: - supported_bits = ", ".join( - str(b) for b in GPTQ_MARLIN_24_SUPPORTED_NUM_BITS - ) - raise RuntimeError( - f"{weight.bits}-bit GPTQ Sparse 2:4 Marlin is not supported, must be one of: {supported_bits}" - ) - - in_features = weight.B.shape[0] * MARLIN_TILE_SIZE * 2 - out_features = weight.s.shape[1] - groupsize = -1 if weight.s.shape[0] == 1 else in_features // weight.s.shape[0] - - if groupsize not in GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES: - supported_sizes = ", ".join( - str(b) for b in GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES - ) - raise RuntimeError( - f"Group size {groupsize} is not supported, must be one of: {supported_sizes}" - ) - - self.bits = weight.bits - weights_per_int32 = 32 // self.bits - - assert ( - out_features % GPTQ_MARLIN_24_MIN_THREAD_N == 0 - ), f"Number of output features ({out_features}) not divisable by {GPTQ_MARLIN_24_MIN_THREAD_N} threads" - assert ( - out_features % weights_per_int32 == 0 - ), f"Number of output features ({out_features}) not divisable by weights per int32 ({weights_per_int32})" - - assert ( - in_features % GPTQ_MARLIN_24_MIN_THREAD_K == 0 - ), f"Number of output features ({out_features}) not divisable by {GPTQ_MARLIN_24_MIN_THREAD_K} threads" - if groupsize != -1 and in_features % groupsize != 0: - raise ValueError( - f"Number of input features ({in_features}) not divisable by group size ({groupsize})" - ) - - self.B = weight.B - self.B_meta = weight.B_meta - self.s = weight.s - if bias is not None: - self.bias = bias - else: - self.bias = None - - self.workspace = torch.zeros( - (out_features // GPTQ_MARLIN_24_MIN_THREAD_N) * GPTQ_MARLIN_24_MAX_PARALLEL, - dtype=torch.int, - device=weight.B.device, - ) - - def forward(self, A: torch.Tensor) -> torch.Tensor: - assert marlin_kernels is not None - - C = marlin_kernels.gptq_marlin_24_gemm( - A.view(-1, A.shape[-1]), - self.B, - self.B_meta, - self.s, - self.workspace, - self.bits, - A.shape[0], - self.s.shape[1], - A.shape[1], - ) - - C = C.reshape(A.shape[:-1] + (self.s.shape[1],)) - - if self.bias is not None: - C += self.bias - - return C diff --git a/backends/gaudi/server/text_generation_server/layers/marlin/util.py b/backends/gaudi/server/text_generation_server/layers/marlin/util.py deleted file mode 100644 index 250d17141..000000000 --- a/backends/gaudi/server/text_generation_server/layers/marlin/util.py +++ /dev/null @@ -1,141 +0,0 @@ -import functools -from typing import List, Tuple - -import numpy -import torch -from text_generation_server.utils.import_utils import SYSTEM - -try: - import marlin_kernels -except ImportError: - marlin_kernels = None - -try: - major, _minor = torch.cuda.get_device_capability() - has_sm_8_0 = major >= 8 -except Exception: - has_sm_8_0 = False - - -def _check_marlin_kernels(): - if not (SYSTEM == "cuda" and has_sm_8_0): - raise NotImplementedError( - "Using quantized Marlin models requires a GPU with CUDA capability 8.0 or later." 
- ) - - if marlin_kernels is None: - raise NotImplementedError( - "marlin is not installed, install it with: pip install server/marlin" - ) - - -# https://github.com/IST-DASLab/marlin/blob/2f6d7c10e124b3c5fa29ff8d77d568bd7af3274c/marlin/__init__.py#L40C1-L68C54 -@functools.cache -def get_perms() -> Tuple[List[int], List[int]]: - scale_perm = [] - for i in range(8): - scale_perm.extend([i + 8 * j for j in range(8)]) - scale_perm_single = [] - for i in range(4): - scale_perm_single.extend([2 * i + j for j in [0, 1, 8, 9, 16, 17, 24, 25]]) - return scale_perm, scale_perm_single - - -def permute_scales(scales: torch.Tensor): - scale_perm, scale_perm_single = get_perms() - out_features = scales.shape[1] - if scales.shape[0] == 1: - scales = scales.reshape((-1, len(scale_perm_single)))[:, scale_perm_single] - else: - scales = scales.reshape((-1, len(scale_perm)))[:, scale_perm] - return scales.reshape((-1, out_features)).contiguous() - - -# Functions below are from vLLM - - -def get_pack_factor(bits: int) -> int: - if 32 % bits != 0: - raise ValueError(f"Cannot {bits} bit values into uint32") - return 32 // bits - - -def pack_cols( - q_w: torch.Tensor, - num_bits: int, - size_k: int, - size_n: int, -): - assert q_w.shape == (size_k, size_n) - - pack_factor = get_pack_factor(num_bits) - assert size_n % pack_factor == 0 - - orig_device = q_w.device - - q_w = q_w.cpu().numpy().astype(numpy.uint32) - - q_res = numpy.zeros((size_k, size_n // pack_factor), dtype=numpy.uint32) - - for i in range(pack_factor): - q_res |= q_w[:, i::pack_factor] << num_bits * i - - q_res = torch.from_numpy(q_res.astype(numpy.int32)).to(orig_device) - q_res = q_res.contiguous() - - return q_res - - -def unpack_cols( - packed_q_w: torch.Tensor, - num_bits: int, - size_k: int, - size_n: int, -): - pack_factor = get_pack_factor(num_bits) - assert size_n % pack_factor == 0 - assert packed_q_w.shape == ( - size_k, - size_n // pack_factor, - ), "packed_q_w.shape = {} size_k = {}, size_n = {} pack_Factor = {}".format( - packed_q_w.shape, size_k, size_n, pack_factor - ) - - orig_device = packed_q_w.device - - packed_q_w_cpu = packed_q_w.cpu().numpy().astype(numpy.uint32) - q_res = numpy.zeros((size_k, size_n), dtype=numpy.uint32) - - mask = (1 << num_bits) - 1 - for i in range(pack_factor): - vals = packed_q_w_cpu & mask - packed_q_w_cpu >>= num_bits - q_res[:, i::pack_factor] = vals - - q_res = torch.from_numpy(q_res.astype(numpy.int32)).to(orig_device) - q_res = q_res.contiguous() - - return q_res - - -def marlin_zero_points( - zp: torch.Tensor, size_k: int, size_n: int, num_bits: int -) -> torch.Tensor: - scale_perm, _ = get_perms() - # Permute zero-points in a similar way to scales, but do not use the - # "single" permutation, since zero-points are applied on every MMA - zp = zp.reshape((-1, len(scale_perm)))[:, scale_perm] - - # Interleave column dim (for the dequantize code) and pack it to int32 - if num_bits == 4: - interleave = numpy.array([0, 2, 4, 6, 1, 3, 5, 7]) - elif num_bits == 8: - interleave = numpy.array([0, 2, 1, 3]) - else: - raise Exception("num_bits must be 4 or 8, got {}".format(num_bits)) - - zp = zp.reshape((-1, len(interleave)))[:, interleave].ravel() - zp = zp.reshape((-1, size_n)).contiguous() - zp = pack_cols(zp, num_bits, size_k, size_n) - - return zp diff --git a/backends/gaudi/server/text_generation_server/layers/moe/__init__.py b/backends/gaudi/server/text_generation_server/layers/moe/__init__.py index 2c46ca02a..8b9d6fcb0 100644 --- 
a/backends/gaudi/server/text_generation_server/layers/moe/__init__.py +++ b/backends/gaudi/server/text_generation_server/layers/moe/__init__.py @@ -10,13 +10,8 @@ from text_generation_server.layers import ( TensorParallelRowLinear, ) from text_generation_server.layers.fp8 import HybridFP8UnquantLoader -from text_generation_server.layers.marlin import GPTQMarlinWeightsLoader -from text_generation_server.layers.moe.gptq_marlin import ( - GPTQMarlinSparseMoELayer, - can_use_marlin_moe_gemm, -) from text_generation_server.layers.moe.unquantized import UnquantizedSparseMoELayer -from text_generation_server.utils.import_utils import SYSTEM +from text_generation_server.layers.moe.fp8 import FP8SparseMoELayer from text_generation_server.utils.log import log_once from text_generation_server.utils.weights import ( DefaultWeightsLoader, @@ -24,12 +19,7 @@ from text_generation_server.utils.weights import ( UnquantizedWeight, ) -if SYSTEM == "rocm": - from .fused_moe_rocm import grouped_topk - from vllm.model_executor.layers.fused_moe import fused_topk -elif SYSTEM != "ipex": - from moe_kernels.fused_moe import fused_topk, grouped_topk - +from .fused_moe import fused_topk, grouped_topk # NOTE: we are using a protocol here, because multiple inherance is not nice. # We need `Module`, and `Module` -> some abstract class -> some concrete @@ -52,6 +42,8 @@ class MoELayer(Protocol): up_proj_name: str = "up_proj", down_proj_name: str = "down_proj", hidden_act: str = "silu", + scoring_func: Optional[str] = None, + e_score_correction_bias: Optional[float] = None, ): ... def forward( @@ -81,9 +73,14 @@ class DenseMoELayer(nn.Module): up_proj_name: str = "up_proj", down_proj_name: str = "down_proj", hidden_act: str = "silu", + scoring_func: Optional[str] = None, + e_score_correction_bias: Optional[float] = None, ): super().__init__() + assert scoring_func is None, "scoring func is not handled" + assert e_score_correction_bias is None, "scoring correction bias is not handled" + log_once( logger.info, "No fused layers are available for this model type, using (slower) dense MoE layer", @@ -199,22 +196,27 @@ class SparseMoELayer(nn.Module): topk: int, topk_group: Optional[int], weights: Weights, + scoring_func: Optional[str] = "softmax", + e_score_correction_bias: Optional[float] = None, gate_proj_name: str = "gate_proj", up_proj_name: str = "up_proj", down_proj_name: str = "down_proj", ): super().__init__() - if ( isinstance(weights.loader, DefaultWeightsLoader) and isinstance(weights.loader.weight_class, UnquantizedWeight) ) or isinstance(weights.loader, HybridFP8UnquantLoader): - cls = UnquantizedSparseMoELayer - elif isinstance(weights.loader, GPTQMarlinWeightsLoader) and weights.loader.sym: - cls = GPTQMarlinSparseMoELayer + if ( + isinstance(weights.loader, HybridFP8UnquantLoader) + and weights.loader.to_fp8 + ): + cls = FP8SparseMoELayer + else: + cls = UnquantizedSparseMoELayer else: raise ValueError( - f"Unsupported weights loader: {weights.loader}, sparse MoE is only supported for unquantized and GPTQ weights" + f"Unsupported weights loader: {type(weights.loader)}, sparse MoE is only supported for unquantized, AWQ, and GPTQ weights" ) log_once( @@ -230,6 +232,8 @@ class SparseMoELayer(nn.Module): topk=topk, topk_group=topk_group, weights=weights, + scoring_func=scoring_func, + e_score_correction_bias=e_score_correction_bias, gate_proj_name=gate_proj_name, up_proj_name=up_proj_name, down_proj_name=down_proj_name, @@ -241,17 +245,6 @@ class SparseMoELayer(nn.Module): @staticmethod def is_supported(weights: 
Weights) -> bool: return ( - ( - isinstance(weights.loader, DefaultWeightsLoader) - and isinstance(weights.loader.weight_class, UnquantizedWeight) - ) - or isinstance(weights.loader, HybridFP8UnquantLoader) - or ( - isinstance(weights.loader, GPTQMarlinWeightsLoader) - and can_use_marlin_moe_gemm( - quant_method=weights.loader.quant_method, - quantize=weights.loader.quantize, - sym=weights.loader.sym, - ) - ) - ) + isinstance(weights.loader, DefaultWeightsLoader) + and isinstance(weights.loader.weight_class, UnquantizedWeight) + ) or isinstance(weights.loader, HybridFP8UnquantLoader) diff --git a/backends/gaudi/server/text_generation_server/layers/moe/fp8.py b/backends/gaudi/server/text_generation_server/layers/moe/fp8.py new file mode 100644 index 000000000..071b2abee --- /dev/null +++ b/backends/gaudi/server/text_generation_server/layers/moe/fp8.py @@ -0,0 +1,173 @@ +from typing import Optional + +import torch +import torch.nn as nn + +from text_generation_server.utils.weights import Weights +from text_generation_server.layers.fp8 import ( + Fp8Weight, + fp8_quantize, + quant_dtype, + normalize_e4m3fn_to_native_float8, +) + +try: + from .unquantized import fused_moe +except Exception: + fused_moe = None + + +class FP8SparseMoELayer(nn.Module): + def __init__( + self, + *, + n_expert_group: Optional[int], + n_experts: int, + prefix: str, + renormalize: bool, + topk: int, + topk_group: Optional[int], + weights: Weights, + scoring_func: Optional[str] = "softmax", + e_score_correction_bias: Optional[float] = None, + gate_proj_name: str = "gate_proj", + up_proj_name: str = "up_proj", + down_proj_name: str = "down_proj", + ): + super().__init__() + + assert (n_expert_group is None) == ( + topk_group is None + ), "n_expert_group and topk_group must both be None or have some value" + + self.n_expert_group = n_expert_group + self.topk = topk + self.topk_group = topk_group + self.renormalize = renormalize + self.weight_block_size = weights.weights_loader.weight_block_size + self.scoring_func = scoring_func + self.e_score_correction_bias = e_score_correction_bias + + ( + self.gate_up_proj, + self.gate_up_proj_weight_scale, + self.gate_up_proj_input_scale, + ) = _load_expert_multi_weights_col( + prefix=prefix, + n_experts=n_experts, + gate_proj_name=gate_proj_name, + up_proj_name=up_proj_name, + weights=weights, + ) + + self.down_proj, self.down_proj_weight_scale, self.down_proj_input_scale = ( + _load_expert_weights_row( + prefix=prefix, + n_experts=n_experts, + name=down_proj_name, + weights=weights, + ) + ) + + def forward(self, x: torch.Tensor, *, gating_output: torch.Tensor) -> torch.Tensor: + return fused_moe( + x, + w1=self.gate_up_proj, + w2=self.down_proj, + gating_output=gating_output, + topk=self.topk, + renormalize=self.renormalize, + inplace=True, + use_grouped_topk=self.n_expert_group is not None, + num_expert_group=self.n_expert_group, + topk_group=self.topk_group, + scoring_func=self.scoring_func, + e_score_correction_bias=self.e_score_correction_bias, + use_fp8_w8a8=True, + w1_scale=self.gate_up_proj_weight_scale, + w2_scale=self.down_proj_weight_scale, + a1_scale=self.gate_up_proj_input_scale, + a2_scale=self.down_proj_input_scale, + ) + + +def _load_expert_weights( + get_weight_fn, + *, + prefix: str, + n_experts: int, + name: str, + weights: Weights, +) -> torch.Tensor: + all_weight = None + all_weight_scales = None + max_input_scale = None + + for i in range(n_experts): + weight = get_weight_fn(prefix, i, name, weights) + + assert isinstance(weight, Fp8Weight) + + if all_weight 
is None: + all_weight = torch.empty( + (n_experts,) + weight.weight.shape, + dtype=quant_dtype, + device=weight.weight.device, + ) + if all_weight_scales is None: + all_weight_scales = torch.empty( + (n_experts,) + weight.weight_scale.shape, + dtype=torch.float32, + device=weight.weight.device, + ) + + if weight.weight.dtype in {torch.float8_e4m3fn, torch.float8_e4m3fnuz}: + all_weight[i], all_weight_scales[i], current_input_scale = ( + normalize_e4m3fn_to_native_float8( + weight.weight, weight.weight_scale, weight.input_scale + ) + ) + if current_input_scale is not None: + if max_input_scale is None or current_input_scale > max_input_scale: + max_input_scale = current_input_scale + else: + all_weight[i], all_weight_scales[i] = fp8_quantize( + weight.weight, scalar=True + ) + + assert all_weight is not None + + return all_weight, all_weight_scales, max_input_scale + + +def _load_expert_multi_weights_col( + *, + prefix: str, + n_experts: int, + gate_proj_name: str, + up_proj_name: str, + weights: Weights, +) -> torch.Tensor: + def get_weight_fn(prefix, i, name, weights): + return weights.get_multi_weights_col( + [f"{prefix}.{i}.{gate_proj_name}", f"{prefix}.{i}.{up_proj_name}"], 0 + ) + + return _load_expert_weights( + get_weight_fn, prefix=prefix, n_experts=n_experts, name=None, weights=weights + ) + + +def _load_expert_weights_row( + *, + prefix: str, + n_experts: int, + name: str, + weights: Weights, +) -> torch.Tensor: + def get_weight_fn(prefix, i, name, weights): + return weights.get_weights_row(f"{prefix}.{i}.{name}") + + return _load_expert_weights( + get_weight_fn, prefix=prefix, n_experts=n_experts, name=name, weights=weights + ) diff --git a/backends/gaudi/server/text_generation_server/layers/moe/fused_moe_rocm.py b/backends/gaudi/server/text_generation_server/layers/moe/fused_moe.py similarity index 80% rename from backends/gaudi/server/text_generation_server/layers/moe/fused_moe_rocm.py rename to backends/gaudi/server/text_generation_server/layers/moe/fused_moe.py index 68accb990..e26ff8770 100644 --- a/backends/gaudi/server/text_generation_server/layers/moe/fused_moe_rocm.py +++ b/backends/gaudi/server/text_generation_server/layers/moe/fused_moe.py @@ -16,10 +16,8 @@ from typing import Tuple import torch -import torch.distributed -# TODO: Remove the functions once moe_kernel are built for ROCM def grouped_topk( hidden_states: torch.Tensor, gating_output: torch.Tensor, @@ -50,3 +48,18 @@ def grouped_topk( topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True) return topk_weights, topk_ids + + +def fused_topk( + hidden_states: torch.Tensor, + gating_output: torch.Tensor, + topk: int, + renormalize: bool, +) -> Tuple[torch.Tensor, torch.Tensor]: + topk_weights = torch.nn.functional.softmax( + gating_output, dim=1, dtype=torch.float32 + ) + topk_weights, topk_ids = torch.topk(topk_weights, topk, dim=-1) + if renormalize: + topk_weights /= topk_weights.sum(dim=-1, keepdim=True) + return topk_weights, topk_ids diff --git a/backends/gaudi/server/text_generation_server/layers/moe/gptq_marlin.py b/backends/gaudi/server/text_generation_server/layers/moe/gptq_marlin.py deleted file mode 100644 index 3217cdc22..000000000 --- a/backends/gaudi/server/text_generation_server/layers/moe/gptq_marlin.py +++ /dev/null @@ -1,215 +0,0 @@ -from dataclasses import dataclass -from typing import List, Optional - -import torch -import torch.nn as nn - -from text_generation_server.utils.import_utils import SYSTEM -from text_generation_server.utils.weights import Weights -from 
text_generation_server.layers.marlin.gptq import ( - GPTQMarlinWeight, - GPTQMarlinWeightsLoader, -) - -if SYSTEM == "cuda": - from moe_kernels.fused_marlin_moe import fused_marlin_moe -else: - fused_marlin_moe = None - - -try: - major, _minor = torch.cuda.get_device_capability() - has_sm_8_0 = major >= 8 -except Exception: - has_sm_8_0 = False - - -def can_use_marlin_moe_gemm( - *, - quant_method: str, - quantize: str, - sym: bool, -): - return ( - SYSTEM == "cuda" - and fused_marlin_moe is not None - and has_sm_8_0 - and quantize == "gptq" - and quant_method == "gptq" - and sym - ) - - -@dataclass -class GPTQMarlinMoEWeight: - qweight: torch.Tensor - qzeros: torch.Tensor - scales: torch.Tensor - g_idx: torch.Tensor - perm: torch.Tensor - is_full_k: bool - - -class GPTQMarlinSparseMoELayer(nn.Module): - """ - MoE layer that uses a fused GPTQ-Marlin kernel. - """ - - def __init__( - self, - *, - n_expert_group: Optional[int], - n_experts: int, - prefix: str, - renormalize: bool, - topk: int, - topk_group: Optional[int], - weights: Weights, - gate_proj_name: str = "gate_proj", - up_proj_name: str = "up_proj", - down_proj_name: str = "down_proj", - ): - super().__init__() - - if not ( - isinstance(weights.loader, GPTQMarlinWeightsLoader) and weights.loader.sym - ): - raise ValueError( - f"Unsupported weights loader: {weights.loader}, only GPTQMarlinWeightsLoader with symmetric quantization is supported" - ) - - assert (n_expert_group is None) == ( - topk_group is None - ), "n_expert_group and topk_group must both be None or have some value" - - self.n_expert_group = n_expert_group - self.topk = topk - self.topk_group = topk_group - self.renormalize = renormalize - - self.gate_up_proj = _load_expert_multi_weights_col( - prefix=prefix, - n_experts=n_experts, - names=[gate_proj_name, up_proj_name], - weights=weights, - ) - - self.down_proj = _load_expert_weights_row( - prefix=prefix, n_experts=n_experts, name=down_proj_name, weights=weights - ) - - self.bits = weights.loader.bits - - def forward(self, x: torch.Tensor, *, gating_output: torch.Tensor) -> torch.Tensor: - return fused_marlin_moe( - x, - w1=self.gate_up_proj.qweight, - w2=self.down_proj.qweight, - g_idx1=self.gate_up_proj.g_idx, - g_idx2=self.down_proj.g_idx, - perm1=self.gate_up_proj.perm, - perm2=self.down_proj.perm, - w1_scale=self.gate_up_proj.scales, - w2_scale=self.down_proj.scales, - is_full_k1=self.gate_up_proj.is_full_k, - is_full_k2=self.down_proj.is_full_k, - gating_output=gating_output, - topk=self.topk, - renormalize=self.renormalize, - use_grouped_topk=self.n_expert_group is not None, - num_expert_group=self.n_expert_group, - topk_group=self.topk_group, - num_bits=self.bits, - ) - - -def _load_expert_multi_weights_col( - *, - prefix: str, - n_experts: int, - names: List[str], - weights: Weights, -) -> GPTQMarlinMoEWeight: - moe_weight = None - for i in range(n_experts): - weight = weights.get_multi_weights_col( - [f"{prefix}.{i}.{name}" for name in names], 0 - ) - assert isinstance(weight, GPTQMarlinWeight) - moe_weight = _pack_weight( - n_experts=n_experts, expert=i, weight=weight, moe_weight=moe_weight - ) - assert moe_weight is not None - return moe_weight - - -def _load_expert_weights_row( - *, - prefix: str, - n_experts: int, - name: str, - weights: Weights, -) -> GPTQMarlinMoEWeight: - moe_weight = None - for i in range(n_experts): - weight = weights.get_weights_row( - f"{prefix}.{i}.{name}", - ) - assert isinstance(weight, GPTQMarlinWeight) - moe_weight = _pack_weight( - n_experts=n_experts, expert=i, 
weight=weight, moe_weight=moe_weight - ) - assert moe_weight is not None - return moe_weight - - -def _pack_weight( - *, - n_experts: int, - expert: int, - moe_weight: Optional[GPTQMarlinMoEWeight], - weight: GPTQMarlinWeight, -) -> GPTQMarlinMoEWeight: - if moe_weight is None: - qweight = torch.empty( - (n_experts,) + weight.qweight.shape, - dtype=weight.qweight.dtype, - device=weight.qweight.device, - ) - qzeros = torch.empty( - (n_experts,) + weight.qzeros.shape, - dtype=weight.qzeros.dtype, - device=weight.qzeros.device, - ) - scales = torch.empty( - (n_experts,) + weight.scales.shape, - dtype=weight.scales.dtype, - device=weight.scales.device, - ) - g_idx = torch.empty( - (n_experts,) + weight.g_idx.shape, - dtype=weight.g_idx.dtype, - device=weight.g_idx.device, - ) - perm = torch.empty( - (n_experts,) + weight.perm.shape, - dtype=weight.perm.dtype, - device=weight.perm.device, - ) - - moe_weight = GPTQMarlinMoEWeight( - qweight=qweight, - qzeros=qzeros, - scales=scales, - g_idx=g_idx, - perm=perm, - is_full_k=weight.is_full_k, - ) - - moe_weight.qweight[expert] = weight.qweight - moe_weight.qzeros[expert] = weight.qzeros - moe_weight.scales[expert] = weight.scales - moe_weight.g_idx[expert] = weight.g_idx - moe_weight.perm[expert] = weight.perm - - return moe_weight diff --git a/backends/gaudi/server/text_generation_server/layers/moe/unquantized.py b/backends/gaudi/server/text_generation_server/layers/moe/unquantized.py index d9d62c0ef..ec1583989 100644 --- a/backends/gaudi/server/text_generation_server/layers/moe/unquantized.py +++ b/backends/gaudi/server/text_generation_server/layers/moe/unquantized.py @@ -3,13 +3,8 @@ from typing import Optional import torch import torch.nn as nn -from text_generation_server.utils.import_utils import SYSTEM from text_generation_server.utils.weights import UnquantizedWeight, Weights - -if SYSTEM == "rocm": - from vllm.model_executor.layers.fused_moe import fused_moe -elif SYSTEM != "ipex": - from moe_kernels.fused_moe import fused_moe +from vllm_hpu_extension.ops import DynamicFusedMOE class UnquantizedSparseMoELayer(nn.Module): @@ -23,6 +18,8 @@ class UnquantizedSparseMoELayer(nn.Module): topk: int, topk_group: Optional[int], weights: Weights, + scoring_func: Optional[str] = "softmax", + e_score_correction_bias: Optional[float] = None, gate_proj_name: str = "gate_proj", up_proj_name: str = "up_proj", down_proj_name: str = "down_proj", @@ -37,6 +34,9 @@ class UnquantizedSparseMoELayer(nn.Module): self.topk = topk self.topk_group = topk_group self.renormalize = renormalize + self.weight_block_size = weights.weights_loader.weight_block_size + self.scoring_func = scoring_func + self.e_score_correction_bias = e_score_correction_bias self.gate_up_proj = _load_expert_multi_weights_col( prefix=prefix, @@ -53,30 +53,13 @@ class UnquantizedSparseMoELayer(nn.Module): weights=weights, ) - def forward(self, x: torch.Tensor, *, gating_output: torch.Tensor) -> torch.Tensor: - if SYSTEM == "rocm": - return fused_moe( - x, - self.gate_up_proj, - self.down_proj, - gating_output, - self.topk, - renormalize=self.renormalize, - inplace=True, - ) + self.hpu_fused_moe = DynamicFusedMOE(n_experts) + for i in range(n_experts): + self.hpu_fused_moe.MoeOp.w13_list[i].set_weight(self.gate_up_proj[i]) + self.hpu_fused_moe.MoeOp.w2_list[i].set_weight(self.down_proj[i]) - return fused_moe( - x, - w1=self.gate_up_proj, - w2=self.down_proj, - gating_output=gating_output, - topk=self.topk, - renormalize=self.renormalize, - inplace=True, - use_grouped_topk=self.n_expert_group is 
not None, - num_expert_group=self.n_expert_group, - topk_group=self.topk_group, - ) + def forward(self, x: torch.Tensor, *, gating_output: torch.Tensor) -> torch.Tensor: + return self.hpu_fused_moe(x, gating_output, self.topk) def _load_expert_multi_weights_col( diff --git a/backends/gaudi/server/text_generation_server/layers/rotary.py b/backends/gaudi/server/text_generation_server/layers/rotary.py index a2076bb20..6a83d6a57 100644 --- a/backends/gaudi/server/text_generation_server/layers/rotary.py +++ b/backends/gaudi/server/text_generation_server/layers/rotary.py @@ -2,14 +2,10 @@ import os import math import torch from torch import nn -from text_generation_server.utils.import_utils import SYSTEM - -if SYSTEM == "cuda": - import rotary_emb -elif SYSTEM == "rocm": - from vllm._C import ops -elif SYSTEM == "ipex": - import intel_extension_for_pytorch as ipex +from habana_frameworks.torch.hpex.kernels import ( + RotaryPosEmbeddingMode, + apply_rotary_pos_emb, +) def _create_inv_freq(dim, base, device): @@ -30,7 +26,7 @@ def _get_rope_config(config): class PositionRotaryEmbedding(nn.Module): - def __init__(self, inv_freq, scaling_factor): + def __init__(self, inv_freq, scaling_factor, max_position_embeddings): super().__init__() self.inv_freq = inv_freq self._seq_len_cached = 0 @@ -40,6 +36,9 @@ class PositionRotaryEmbedding(nn.Module): self._sin_k_cached = None self.scaling_factor = scaling_factor self.dynamic_args = None + self._update_cos_sin_cache( + torch.float32, inv_freq.device, max_position_embeddings + ) def forward( self, @@ -48,40 +47,41 @@ class PositionRotaryEmbedding(nn.Module): cos: torch.Tensor, sin: torch.Tensor, ): - # Such controlflows may add some overhead. - if SYSTEM == "cuda": - rotary_dim = cos.shape[-1] - q1 = query[..., :rotary_dim] - q2 = query[..., rotary_dim : 2 * rotary_dim] + num_tokens = query.shape[0] + head_size = query.shape[-1] + # HPU RoPE kernel requires hidden dimension for cos and sin to be equal + # to query hidden dimension, so the original tensors need to be + # expanded + # GPT-NeoX kernel requires position_ids = None, offset, mode = BLOCKWISE + # and expansion of cos/sin tensors via concatenation + rope_mode = RotaryPosEmbeddingMode.BLOCKWISE + cos = torch.cat((cos, cos), dim=-1) + sin = torch.cat((sin, sin), dim=-1) + rotary_dim = cos.shape[-1] + query_shape = query.shape + query = query.view(num_tokens, -1, head_size) + query_rot = query[..., :rotary_dim] + query_pass = query[..., rotary_dim:] + query_rot = apply_rotary_pos_emb(query_rot, cos, sin, None, 0, rope_mode) + query.copy_(torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape)) - rotary_emb.apply_rotary(q1, q2, cos, sin, q1, q2, False) - - k1 = key[..., :rotary_dim] - k2 = key[..., rotary_dim : 2 * rotary_dim] - - rotary_emb.apply_rotary(k1, k2, cos, sin, k1, k2, False) - elif SYSTEM == "rocm": - # NOTE: On RoCm systems, we use a ROPE implementatation adapted from VLLM which launches a single kernel for both query/key, contrary to flash-attn implementation used on NVIDIA systems. - # Compiling flash-attn rotary on RoCm, it appears hipcc is unable to unroll loops, resulting in an even slower inference compared to eager: https://github.com/pytorch/pytorch/issues/113773 - - head_size = query.shape[-1] - - # Inplace operation, updating query and key. 
- ops.rotary_embedding(query, key, head_size, cos, sin, True) - elif SYSTEM == "ipex": - ipex.llm.functional.rotary_embedding( - query, key, sin, cos, query.size(-1), True - ) - else: - raise ValueError( - "Your system seem to be not supported. Please check your install or open an issue at https://github.com/huggingface/text-generation-inference/issues with a clear reproduction." - ) + key_shape = key.shape + key = key.view(num_tokens, -1, head_size) + key_rot = key[..., :rotary_dim] + key_pass = key[..., rotary_dim:] + key_rot = apply_rotary_pos_emb(key_rot, cos, sin, None, 0, rope_mode) + key.copy_(torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape)) @classmethod def static(cls, config, dim, base, device): inv_freq = _create_inv_freq(dim, base, device) scaling_factor = None rope_scaling = _get_rope_config(config) + if not hasattr(config, "max_position_embeddings") and hasattr( + config, "max_seq_len" + ): + # handling for dbrx + config.max_position_embeddings = config.max_seq_len if rope_scaling is not None: # `rope_type` is now standard in transformers, but some existing models # have `type` instead. @@ -89,6 +89,17 @@ class PositionRotaryEmbedding(nn.Module): if rope_type == "linear": pass + elif rope_type == "default": + pass + elif rope_type == "mrope": + mrope_section = rope_scaling["mrope_section"] + if mrope_section is not None: + return RotaryPositionEmbeddingMultimodalSections( + inv_freq, + scaling_factor, + mrope_section, + config.max_position_embeddings, + ) elif rope_type == "dynamic": scaling_factor = rope_scaling["factor"] return DynamicPositionRotaryEmbedding( @@ -109,7 +120,7 @@ class PositionRotaryEmbedding(nn.Module): ], ) - return cls(inv_freq, scaling_factor) + return cls(inv_freq, scaling_factor, config.max_position_embeddings) elif rope_type == "yarn": scaling_factor = rope_scaling["factor"] @@ -185,12 +196,13 @@ class PositionRotaryEmbedding(nn.Module): long_inv_freq=long_inv_freq, scaling_factor=scaling_factor, original_max_position_embeddings=original_max_position_embeddings, + max_position_embeddings=config.max_position_embeddings, ) else: raise NotImplementedError( f"rope scaling type {rope_scaling['type']} is not implemented or invalid" ) - return cls(inv_freq, scaling_factor) + return cls(inv_freq, scaling_factor, config.max_position_embeddings) @classmethod def load(cls, config, prefix, weights): @@ -236,7 +248,7 @@ class PositionRotaryEmbedding(nn.Module): raise NotImplementedError( f"rope scaling type {rope_scaling['type']} is not implemented or invalid" ) - return cls(inv_freq, scaling_factor) + return cls(inv_freq, scaling_factor, config.max_position_embeddings) def _update_cos_sin_cache(self, dtype, device, seqlen): # Reset the tables if the sequence length has changed, @@ -257,17 +269,7 @@ class PositionRotaryEmbedding(nn.Module): self._cos_cached = torch.cos(freqs).to(dtype) self._sin_cached = torch.sin(freqs).to(dtype) - def get_cos_sin(self, position_ids: torch.Tensor, max_s: int, dtype: torch.dtype): - """ - Return cos and sin for the asked position ids - """ - if SYSTEM == "rocm": - # For RoCm, we always use float cos/sin to avoid a cast. 
- # For NVIDIA, for some reason, the flash-attn rotary kernel requires cos/sin and query/key to be of same dtype: https://github.com/Dao-AILab/flash-attention/blob/017716451d446e464dde9aca3a3c1ed2209caaa9/csrc/rotary/rotary.cpp#L26 - # But later on goes and cast cos/sin to float anyway: https://github.com/Dao-AILab/flash-attention/blob/017716451d446e464dde9aca3a3c1ed2209caaa9/csrc/rotary/rotary_cuda.cu#L29, which looks suboptimal. - dtype = torch.float32 - - self._update_cos_sin_cache(dtype, position_ids.device, max_s) + def get_cos_sin(self, position_ids: torch.Tensor): cos = torch.index_select(self._cos_cached, 0, position_ids) sin = torch.index_select(self._sin_cached, 0, position_ids) @@ -283,6 +285,7 @@ class SuRotaryEmbedding(PositionRotaryEmbedding): long_inv_freq, scaling_factor, original_max_position_embeddings, + max_position_embeddings, ): super(PositionRotaryEmbedding, self).__init__() self.short_inv_freq = short_inv_freq @@ -295,6 +298,9 @@ class SuRotaryEmbedding(PositionRotaryEmbedding): self._cos_k_cached = None self._sin_k_cached = None self.dynamic_args = None + self._update_cos_sin_cache( + torch.float32, short_inv_freq.device, max_position_embeddings + ) def _update_cos_sin_cache(self, dtype, device, seqlen): # Reset the tables if the sequence length has changed, @@ -348,6 +354,9 @@ class Phi3LongRoPEScaledRotaryEmbedding(PositionRotaryEmbedding): self._cos_k_cached = None self._sin_k_cached = None self.dynamic_args = None + self._update_cos_sin_cache( + torch.float32, short_inv_freq.device, max_position_embeddings + ) def _update_cos_sin_cache(self, dtype, device, seqlen): if ( @@ -383,7 +392,7 @@ class Phi3LongRoPEScaledRotaryEmbedding(PositionRotaryEmbedding): class DynamicPositionRotaryEmbedding(PositionRotaryEmbedding): def __init__(self, dim, max_position_embeddings, base, device, scaling_factor): inv_freq = _create_inv_freq(dim, base, device) - super().__init__(inv_freq, scaling_factor) + super().__init__(inv_freq, scaling_factor, max_position_embeddings) self.dim = dim self.max_position_embeddings = max_position_embeddings self.base = base @@ -461,7 +470,9 @@ class YarnPositionRotaryEmbedding(PositionRotaryEmbedding): mscale_all_dim: float, ): inv_freq = _create_inv_freq(dim, base, device) - super().__init__(inv_freq, scaling_factor) + super().__init__( + inv_freq, scaling_factor, max_position_embeddings * self.scaling_factor + ) self.dim = dim self.max_position_embeddings = max_position_embeddings self.base = base @@ -546,3 +557,50 @@ def apply_llama3_scaling( new_freqs.append((1 - smooth) * freq / scaling_factor + smooth * freq) return torch.tensor(new_freqs, dtype=freqs.dtype, device=freqs.device) + + +class RotaryPositionEmbeddingMultimodalSections(PositionRotaryEmbedding): + def __init__( + self, + inv_freq: torch.Tensor, + scaling_factor: float, + sections: list, + max_position_embeddings, + ): + self.sections = sections + self._cos_cached = None + self._sin_cached = None + self.section_indices = ( + torch.arange(len(self.sections)) + .repeat_interleave(torch.tensor(self.sections)) + .view(1, 1, -1) + .to(inv_freq.device) + ) + super().__init__(inv_freq, scaling_factor, max_position_embeddings) + + def _update_cos_sin_cache( + self, dtype: torch.dtype, device: torch.device, seqlen: int + ): + # always cache the cos/sin for the full sequence length to avoid + # recomputing if the sequence length is smaller than the cached one + if ( + seqlen > self._seq_len_cached + or self._cos_cached.device != device + or self._cos_cached.dtype != dtype + ): + 
self._seq_len_cached = seqlen + t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype) + freqs = torch.outer(t, self.inv_freq.to(device=t.device)) + self._cos_cached = torch.cos(freqs).to(dtype) + self._sin_cached = torch.sin(freqs).to(dtype) + self._sections = self.section_indices.expand(seqlen, -1, -1) + + def get_cos_sin( + self, + position_ids: torch.Tensor, + ): + slen = position_ids.shape[0] + + cos = self._cos_cached[position_ids].gather(1, self._sections[:slen]) + sin = self._sin_cached[position_ids].gather(1, self._sections[:slen]) + return cos, sin diff --git a/backends/gaudi/server/text_generation_server/layers/tensor_parallel.py b/backends/gaudi/server/text_generation_server/layers/tensor_parallel.py index 13f12ef1e..8f19174f8 100644 --- a/backends/gaudi/server/text_generation_server/layers/tensor_parallel.py +++ b/backends/gaudi/server/text_generation_server/layers/tensor_parallel.py @@ -2,10 +2,8 @@ import torch from torch.nn import functional as F from typing import Iterable, List from text_generation_server.layers.linear import get_linear, FastLinear -from text_generation_server.utils.import_utils import SYSTEM -if SYSTEM == "ipex": - import intel_extension_for_pytorch as ipex +import habana_frameworks.torch as htorch class LayerConcat(torch.nn.Module): @@ -90,14 +88,10 @@ class TensorParallelHead(SuperLayer): local_out = gather_input.T torch.mm(input, self.linear.weight.T, out=local_out) - if SYSTEM == "ipex": - ipex.distributed.all_gather_into_tensor( - world_out, gather_input, group=self.process_group - ) - else: - torch.distributed.all_gather_into_tensor( - world_out, gather_input, group=self.process_group - ) + htorch.core.mark_step() + torch.distributed.all_gather_into_tensor( + world_out, gather_input, group=self.process_group + ) if input.shape[0] == 1: return world_out @@ -107,10 +101,9 @@ class TensorParallelHead(SuperLayer): world_output = [ torch.empty_like(output) for _ in range(self.process_group.size()) ] - if SYSTEM == "ipex": - ipex.distributed.all_gather(world_output, output, group=self.process_group) - else: - torch.distributed.all_gather(world_output, output, group=self.process_group) + + htorch.core.mark_step() + torch.distributed.all_gather(world_output, output, group=self.process_group) world_output = torch.cat(world_output, dim=-1) return world_output @@ -202,10 +195,11 @@ class TensorParallelRowLinear(SuperLayer): def forward(self, input: torch.Tensor, reduce: bool = True) -> torch.Tensor: out = super().forward(input) if self.process_group.size() > 1 and reduce: - if SYSTEM == "ipex": - ipex.distributed.all_reduce(out, group=self.process_group) - else: - torch.distributed.all_reduce(out, group=self.process_group) + # FIXME(kzawora): this is a workaround for a bug in Habana PT bridge + # occurring when PT_HPU_ENABLE_LAZY_COLLECTIVES=true env var is used + # (which is required for tensor parallel HPUGraph inference) + htorch.core.mark_step() + torch.distributed.all_reduce(out, group=self.process_group) return out @@ -242,8 +236,9 @@ class TensorParallelEmbedding(torch.nn.Module): ) out = torch.nn.functional.embedding(input, self.weight) if self.reduce and self.process_group.size() > 1: - if SYSTEM == "ipex": - ipex.distributed.all_reduce(out, group=self.process_group) - else: - torch.distributed.all_reduce(out, group=self.process_group) + # FIXME(kzawora): this is a workaround for a bug in Habana PT bridge + # occurring when PT_HPU_ENABLE_LAZY_COLLECTIVES=true env var is used + # (which is required for tensor parallel HPUGraph inference) + 
htorch.core.mark_step() + torch.distributed.all_reduce(out, group=self.process_group) return out diff --git a/backends/gaudi/server/text_generation_server/models/__init__.py b/backends/gaudi/server/text_generation_server/models/__init__.py index 346016c21..778b14a1b 100644 --- a/backends/gaudi/server/text_generation_server/models/__init__.py +++ b/backends/gaudi/server/text_generation_server/models/__init__.py @@ -1,3 +1,5 @@ +# ruff: noqa: F821 +# the above line disables the `undefined-name` rule for the model type variables import torch import os @@ -8,6 +10,7 @@ from huggingface_hub import hf_hub_download, HfApi from typing import Optional from pathlib import Path from typing import List, Dict +import enum # Needed to properly setup habana_frameworks @@ -16,15 +19,10 @@ from text_generation_server.models.model import Model from text_generation_server.models.causal_lm import CausalLM from text_generation_server.models.bloom import BLOOM from text_generation_server.models.starcoder import StarCoder -from text_generation_server.models.vlm_causal_lm import VlmCausalLM -from text_generation_server.models.custom_modeling.mllama import ( - MllamaForConditionalGeneration, -) -from text_generation_server.models.custom_modeling.llava_next import ( - LlavaNextForConditionalGeneration, +from text_generation_server.models.custom_modeling.flash_phi_moe_modeling import ( + PhiMoEConfig, ) -# from text_generation_server.models.mllama_causal_lm import MllamaCausalLMBatch from text_generation_server.utils.adapter import ( AdapterParameters, build_layer_weight_lookup, @@ -33,9 +31,285 @@ from text_generation_server.utils.adapter import ( ) from text_generation_server.adapters.lora import LoraWeights - +from text_generation_server.utils.log import log_master from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi +__all__ = [ + "Model", + "CausalLM", + "Seq2SeqLM", + "get_model_with_lora_adapters", +] +from text_generation_server.models.globals import ATTENTION + +FLASH_ATT_ERROR_MESSAGE = "{} requires Flash Attention enabled models." 
+ +FLASH_ATTENTION = False +if ATTENTION == "paged": + FLASH_ATTENTION = True + +try: + from text_generation_server.models.flash_causal_lm import FlashCausalLM + from text_generation_server.models.flash_vlm_causal_lm import FlashVlmCausalLM + from text_generation_server.models.mllama_causal_lm import FlashMllamaCausalLM + from text_generation_server.models.custom_modeling.flash_deepseek_v2_modeling import ( + FlashDeepseekV2ForCausalLM, + DeepseekV2Config, + ) + from text_generation_server.models.custom_modeling.flash_deepseek_v3_modeling import ( + FlashDeepseekV3ForCausalLM, + DeepseekV3Config, + ) + from text_generation_server.models.custom_modeling.flash_llama_modeling import ( + FlashLlamaForCausalLM, + ) + from text_generation_server.models.custom_modeling.flash_cohere_modeling import ( + FlashCohereForCausalLM, + ) + from text_generation_server.models.custom_modeling.flash_gemma_modeling import ( + FlashGemmaForCausalLM, + ) + from text_generation_server.models.custom_modeling.flash_gemma2_modeling import ( + FlashGemma2ForCausalLM, + ) + from text_generation_server.models.custom_modeling.flash_dbrx_modeling import ( + FlashDbrxForCausalLM, + DbrxConfig, + ) + from text_generation_server.models.custom_modeling.flash_rw_modeling import ( + RWConfig, + FlashRWForCausalLM, + ) + from text_generation_server.models.custom_modeling.flash_neox_modeling import ( + FlashGPTNeoXForCausalLM, + ) + from text_generation_server.models.pali_gemma import ( + PaliGemmaBatch, + ) + from text_generation_server.models.custom_modeling.flash_pali_gemma_modeling import ( + PaliGemmaForConditionalGeneration, + ) + from text_generation_server.models.custom_modeling.flash_phi_modeling import ( + FlashPhiForCausalLM, + ) + from text_generation_server.models.mllama_causal_lm import FlashMllamaCausalLMBatch + from text_generation_server.models.custom_modeling.flash_mllama import ( + FlashMllamaForConditionalGeneration, + ) + from text_generation_server.models.custom_modeling.flash_llava_next import ( + FlashLlavaNextForConditionalGeneration, + ) + + from text_generation_server.models.custom_modeling.flash_santacoder_modeling import ( + FlashSantacoderForCausalLM, + ) + from text_generation_server.models.custom_modeling.flash_starcoder2_modeling import ( + FlashStarcoder2ForCausalLM, + ) + from text_generation_server.models.custom_modeling.flash_qwen2_modeling import ( + Qwen2ForCausalLM, + ) + from text_generation_server.models.custom_modeling.flash_mistral_modeling import ( + FlashMistralForCausalLM, + ) + from text_generation_server.models.custom_modeling.flash_mixtral_modeling import ( + FlashMixtralForCausalLM, + ) + from text_generation_server.models.custom_modeling.flash_gpt2_modeling import ( + FlashGPT2ForCausalLM, + ) + from text_generation_server.models.custom_modeling.flash_gptj_modeling import ( + FlashGPTJForCausalLM, + ) + from text_generation_server.models.custom_modeling.idefics2 import ( + Idefics2ForConditionalGeneration, + ) + from text_generation_server.models.custom_modeling.idefics3 import ( + Idefics3ForConditionalGeneration, + ) + from text_generation_server.models.custom_modeling.qwen2_vl import ( + Qwen2VLForConditionalGeneration, + ) + from text_generation_server.models.custom_modeling.qwen2_5_vl import ( + Qwen2_5VLForConditionalGeneration, + Qwen2_5_VLConfig, + Qwen2_5_VLProcessor, + ) + from text_generation_server.layers.attention import SUPPORTS_WINDOWING +except ImportError as e: + log_master(logger.warning, f"Could not import Flash Attention enabled models: {e}") + 
SUPPORTS_WINDOWING = False + FLASH_ATTENTION = False + +if FLASH_ATTENTION: + __all__.append(FlashCausalLM) + + +class ModelType(enum.Enum): + DEEPSEEK_V2 = { + "type": "deepseek_v2", + "name": "Deepseek V2", + "url": "https://huggingface.co/deepseek-ai/DeepSeek-V2", + } + DEEPSEEK_V3 = { + "type": "deepseek_v3", + "name": "Deepseek V3", + "url": "https://huggingface.co/deepseek-ai/DeepSeek-V3", + } + IDEFICS2 = { + "type": "idefics2", + "name": "Idefics 2", + "url": "https://huggingface.co/HuggingFaceM4/idefics2-8b", + "multimodal": True, + } + IDEFICS3 = { + "type": "idefics3", + "name": "Idefics 3", + "url": "https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3", + "multimodal": True, + } + LLAVA_NEXT = { + "type": "llava_next", + "name": "Llava Next (1.6)", + "url": "https://huggingface.co/llava-hf/llava-v1.6-vicuna-13b-hf", + "multimodal": True, + } + LLAMA = { + "type": "llama", + "name": "Llama", + "url": "https://huggingface.co/collections/meta-llama/llama-31-669fc079a0c406a149a5738f", + } + PHI3 = { + "type": "phi3", + "name": "Phi 3", + "url": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct", + } + GRANITE = { + "type": "granite", + "name": "Granite", + "url": "https://huggingface.co/ibm-granite/granite-3.0-8b-instruct", + } + GEMMA = { + "type": "gemma", + "name": "Gemma", + "url": "https://huggingface.co/google/gemma-7b", + } + PALIGEMMA = { + "type": "paligemma", + "name": "PaliGemma", + "url": "https://huggingface.co/google/paligemma-3b-pt-224", + } + GEMMA2 = { + "type": "gemma2", + "name": "Gemma2", + "url": "https://huggingface.co/collections/google/gemma-2-release-667d6600fd5220e7b967f315", + } + COHERE = { + "type": "cohere", + "name": "Cohere", + "url": "https://huggingface.co/CohereForAI/c4ai-command-r-plus", + } + DBRX = { + "type": "dbrx", + "name": "Dbrx", + "url": "https://huggingface.co/databricks/dbrx-instruct", + } + MAMBA = { + "type": "mamba", + "name": "Mamba", + "url": "https://huggingface.co/state-spaces/mamba-2.8b-slimpj", + } + MISTRAL = { + "type": "mistral", + "name": "Mistral", + "url": "https://huggingface.co/mistralai/Mistral-Nemo-Instruct-2407", + } + MIXTRAL = { + "type": "mixtral", + "name": "Mixtral", + "url": "https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1", + } + GPT_BIGCODE = { + "type": "gpt_bigcode", + "name": "Gpt Bigcode", + "url": "https://huggingface.co/bigcode/gpt_bigcode-santacoder", + } + PHI = { + "type": "phi", + "name": "Phi", + "url": "https://huggingface.co/microsoft/phi-1_5", + } + PHI_MOE = { + "type": "phimoe", + "name": "PhiMoe", + "url": "https://huggingface.co/microsoft/Phi-3.5-MoE-instruct", + } + BAICHUAN = { + "type": "baichuan", + "name": "Baichuan", + "url": "https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat", + } + FALCON = { + "type": "falcon", + "name": "Falcon", + "url": "https://huggingface.co/tiiuae/falcon-7b-instruct", + } + STARCODER2 = { + "type": "starcoder2", + "name": "StarCoder 2", + "url": "https://huggingface.co/bigcode/starcoder2-15b-instruct-v0.1", + } + QWEN2 = { + "type": "qwen2", + "name": "Qwen 2", + "url": "https://huggingface.co/collections/Qwen/qwen2-6659360b33528ced941e557f", + } + QWEN2_VL = { + "type": "qwen2_vl", + "name": "Qwen 2 VL", + "url": "https://huggingface.co/collections/Qwen/qwen2-vl-66cee7455501d7126940800d", + } + QWEN2_5_VL = { + "type": "qwen2_5_vl", + "name": "Qwen 2.5 VL", + "url": "https://huggingface.co/collections/Qwen/qwen25-66e81a666513e518adb90d9e", + } + GALACTICA = { + "type": "galactica", + "name": "Galactica", + "url": 
"https://huggingface.co/facebook/galactica-120b", + } + SANTACODER = { + "type": "santacoder", + "name": "SantaCoder", + "url": "https://huggingface.co/bigcode/santacoder", + } + GPT2 = { + "type": "gpt2", + "name": "Gpt2", + "url": "https://huggingface.co/openai-community/gpt2", + } + GPT_NEOX = { + "type": "gpt_neox", + "name": "Gpt Neox", + "url": "https://huggingface.co/EleutherAI/gpt-neox-20b", + } + GPTJ = { + "type": "gptj", + "name": "Gptj", + "url": "https://huggingface.co/EleutherAI/gpt-j-6b", + } + MLLAMA = { + "type": "mllama", + "name": "Mllama", + "url": "https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct", + "multimodal": True, + } + + +__GLOBALS = locals() +for data in ModelType: + __GLOBALS[data.name] = data.value["type"] SDP_ON_BF16 = int(os.environ.get("SDP_ON_BF16", 0)) # Disable gradients @@ -53,9 +327,7 @@ def get_model( trust_remote_code: bool, max_input_tokens: int, ) -> Model: - adapt_transformers_to_gaudi() - if SDP_ON_BF16 == 1: - torch._C._set_math_sdp_allow_fp16_bf16_reduction(True) + global FLASH_ATTENTION if speculate is not None: set_speculate(speculate) @@ -177,9 +449,393 @@ def get_model( model_type = config_dict["model_type"] + kv_cache_dtype = dtype + + if FLASH_ATTENTION: + if model_type == DEEPSEEK_V2: + head_size = max( + config_dict.get("qk_nope_dim", 128) + + config_dict.get("qk_rope_dim", 64), + config_dict.get("v_head_dim", 128), + ) + return FlashCausalLM( + model_id=model_id, + model_class=FlashDeepseekV2ForCausalLM, + revision=revision, + quantize=quantize, + speculator=speculator, + default_dtype=torch.bfloat16, + dtype=dtype, + kv_cache_dtype=kv_cache_dtype, + trust_remote_code=trust_remote_code, + lora_adapter_ids=lora_adapter_ids, + config_class=DeepseekV2Config, + head_size=head_size, + ) + elif model_type == DEEPSEEK_V3: + head_size = max( + config_dict.get("qk_nope_dim", 128) + + config_dict.get("qk_rope_dim", 64), + config_dict.get("v_head_dim", 128), + ) + return FlashCausalLM( + model_id=model_id, + model_class=FlashDeepseekV3ForCausalLM, + revision=revision, + quantize=quantize, + speculator=speculator, + default_dtype=torch.bfloat16, + dtype=dtype, + kv_cache_dtype=kv_cache_dtype, + trust_remote_code=trust_remote_code, + lora_adapter_ids=lora_adapter_ids, + config_class=DeepseekV3Config, + head_size=head_size, + ) + + elif ( + model_type == GPT_BIGCODE + or model_type == GPT2 + and model_id.startswith("bigcode/") + ): + return FlashCausalLM( + model_id=model_id, + model_class=FlashSantacoderForCausalLM, + revision=revision, + quantize=quantize, + speculator=speculator, + dtype=dtype, + kv_cache_dtype=kv_cache_dtype, + trust_remote_code=trust_remote_code, + lora_adapter_ids=lora_adapter_ids, + aliases={"transformer.wte.weight": ["lm_head.weight"]}, + num_kv_heads=1, + ) + elif model_type == GPT2: + return FlashCausalLM( + model_id=model_id, + model_class=FlashGPT2ForCausalLM, + revision=revision, + quantize=quantize, + speculator=speculator, + dtype=dtype, + kv_cache_dtype=kv_cache_dtype, + trust_remote_code=trust_remote_code, + lora_adapter_ids=lora_adapter_ids, + ) + elif model_type == GPTJ: + return FlashCausalLM( + model_id=model_id, + model_class=FlashGPTJForCausalLM, + revision=revision, + quantize=quantize, + speculator=speculator, + dtype=dtype, + kv_cache_dtype=kv_cache_dtype, + trust_remote_code=trust_remote_code, + lora_adapter_ids=lora_adapter_ids, + ) + elif model_type == GPT_NEOX: + from text_generation_server.models.custom_modeling.flash_neox_modeling import ( + GPTNeoXConfig, + ) + + return FlashCausalLM( 
+ model_id=model_id, + model_class=FlashGPTNeoXForCausalLM, + revision=revision, + quantize=quantize, + speculator=speculator, + dtype=dtype, + kv_cache_dtype=kv_cache_dtype, + trust_remote_code=trust_remote_code, + lora_adapter_ids=lora_adapter_ids, + config_class=GPTNeoXConfig, + ) + elif model_type == PHI: + return FlashCausalLM( + model_id=model_id, + model_class=FlashPhiForCausalLM, + revision=revision, + quantize=quantize, + speculator=speculator, + dtype=dtype, + kv_cache_dtype=kv_cache_dtype, + trust_remote_code=trust_remote_code, + lora_adapter_ids=lora_adapter_ids, + ) + elif model_type == PHI_MOE: + return FlashCausalLM( + model_id=model_id, + model_class=FlashLlamaForCausalLM, + config_class=PhiMoEConfig, + revision=revision, + quantize=quantize, + speculator=speculator, + dtype=dtype, + kv_cache_dtype=kv_cache_dtype, + trust_remote_code=trust_remote_code, + lora_adapter_ids=lora_adapter_ids, + ) + elif model_type == LLAMA or model_type == PHI3 or model_type == GRANITE: + return FlashCausalLM( + model_id=model_id, + model_class=FlashLlamaForCausalLM, + revision=revision, + quantize=quantize, + speculator=speculator, + dtype=dtype, + kv_cache_dtype=kv_cache_dtype, + trust_remote_code=trust_remote_code, + lora_adapter_ids=lora_adapter_ids, + ) + elif model_type == BAICHUAN: + return FlashCausalLM( + model_id=model_id, + model_class=FlashLlamaForCausalLM, + revision=revision, + quantize=quantize, + speculator=speculator, + dtype=dtype, + kv_cache_dtype=kv_cache_dtype, + trust_remote_code=trust_remote_code, + lora_adapter_ids=lora_adapter_ids, + ) + elif model_type == GEMMA: + return FlashCausalLM( + model_id=model_id, + model_class=FlashGemmaForCausalLM, + revision=revision, + quantize=quantize, + speculator=speculator, + dtype=dtype, + kv_cache_dtype=kv_cache_dtype, + # Works better for these models + default_dtype=torch.bfloat16, + trust_remote_code=trust_remote_code, + lora_adapter_ids=lora_adapter_ids, + ) + elif model_type == GEMMA2: + return FlashCausalLM( + model_id=model_id, + model_class=FlashGemma2ForCausalLM, + revision=revision, + quantize=quantize, + speculator=speculator, + dtype=dtype, + kv_cache_dtype=kv_cache_dtype, + # Works better for these models + default_dtype=torch.bfloat16, + trust_remote_code=trust_remote_code, + lora_adapter_ids=lora_adapter_ids, + ) + elif model_type == COHERE: + return FlashCausalLM( + model_id=model_id, + model_class=FlashCohereForCausalLM, + revision=revision, + quantize=quantize, + speculator=speculator, + dtype=dtype, + kv_cache_dtype=kv_cache_dtype, + trust_remote_code=trust_remote_code, + lora_adapter_ids=lora_adapter_ids, + ) + elif model_type == DBRX: + return FlashCausalLM( + model_id=model_id, + model_class=FlashDbrxForCausalLM, + revision=revision, + quantize=quantize, + speculator=speculator, + dtype=dtype, + kv_cache_dtype=kv_cache_dtype, + # Dbrx works better in bfloat16. 
+ default_dtype=torch.bfloat16, + trust_remote_code=trust_remote_code, + lora_adapter_ids=lora_adapter_ids, + config_class=DbrxConfig, + ) + elif ( + model_type in ["RefinedWeb", "RefinedWebModel", FALCON] + and not sharded + and not config_dict.get("alibi", False) + ): + return FlashCausalLM( + model_id=model_id, + model_class=FlashRWForCausalLM, + revision=revision, + quantize=quantize, + speculator=speculator, + dtype=dtype, + kv_cache_dtype=kv_cache_dtype, + aliases={ + "lm_head.weight": ["transformer.word_embeddings.weight"], + "transformer.word_embeddings.weight": ["lm_head.weight"], + }, + trust_remote_code=trust_remote_code, + lora_adapter_ids=lora_adapter_ids, + config_class=RWConfig, + ) + elif model_type == MISTRAL: + return FlashCausalLM( + model_id=model_id, + model_class=FlashMistralForCausalLM, + revision=revision, + quantize=quantize, + speculator=speculator, + dtype=dtype, + kv_cache_dtype=kv_cache_dtype, + trust_remote_code=trust_remote_code, + lora_adapter_ids=lora_adapter_ids, + ) + elif model_type == MIXTRAL: + return FlashCausalLM( + model_id=model_id, + model_class=FlashMixtralForCausalLM, + revision=revision, + quantize=quantize, + speculator=speculator, + dtype=dtype, + kv_cache_dtype=kv_cache_dtype, + trust_remote_code=trust_remote_code, + lora_adapter_ids=lora_adapter_ids, + ) + elif model_type == STARCODER2: + return FlashCausalLM( + model_id=model_id, + model_class=FlashStarcoder2ForCausalLM, + revision=revision, + quantize=quantize, + speculator=speculator, + dtype=dtype, + kv_cache_dtype=kv_cache_dtype, + trust_remote_code=trust_remote_code, + lora_adapter_ids=lora_adapter_ids, + ) + elif model_type == QWEN2: + return FlashCausalLM( + model_id=model_id, + model_class=Qwen2ForCausalLM, + revision=revision, + quantize=quantize, + speculator=speculator, + dtype=dtype, + kv_cache_dtype=kv_cache_dtype, + trust_remote_code=trust_remote_code, + lora_adapter_ids=lora_adapter_ids, + ) + elif model_type == QWEN2_VL: + return FlashVlmCausalLM( + model_id=model_id, + model_class=Qwen2VLForConditionalGeneration, + revision=revision, + quantize=quantize, + speculator=speculator, + dtype=dtype, + default_dtype=torch.bfloat16, + kv_cache_dtype=kv_cache_dtype, + trust_remote_code=trust_remote_code, + lora_adapter_ids=lora_adapter_ids, + ) + elif model_type == QWEN2_5_VL: + return FlashVlmCausalLM( + model_id=model_id, + model_class=Qwen2_5VLForConditionalGeneration, + revision=revision, + quantize=quantize, + speculator=speculator, + dtype=dtype, + default_dtype=torch.bfloat16, + kv_cache_dtype=kv_cache_dtype, + trust_remote_code=trust_remote_code, + lora_adapter_ids=lora_adapter_ids, + config_class=Qwen2_5_VLConfig, + processor_class=Qwen2_5_VLProcessor, + ) + elif model_type == MLLAMA: + return FlashMllamaCausalLM( + model_id=model_id, + model_class=FlashMllamaForConditionalGeneration, + batch_class=FlashMllamaCausalLMBatch, + revision=revision, + quantize=quantize, + speculator=speculator, + dtype=dtype, + default_dtype=torch.bfloat16, + trust_remote_code=trust_remote_code, + lora_adapter_ids=lora_adapter_ids, + ) + elif model_type == IDEFICS2: + return FlashVlmCausalLM( + model_id=model_id, + model_class=Idefics2ForConditionalGeneration, + revision=revision, + quantize=quantize, + speculator=speculator, + dtype=dtype, + kv_cache_dtype=kv_cache_dtype, + trust_remote_code=trust_remote_code, + lora_adapter_ids=lora_adapter_ids, + # XXX: Extremely important to cap resolution in order to limit + # VRAM usage. 
+ processor_kwargs={"size": {"longest_edge": 448, "shortest_edge": 378}}, + ) + elif model_type == IDEFICS3: + return FlashVlmCausalLM( + model_id=model_id, + model_class=Idefics3ForConditionalGeneration, + revision=revision, + quantize=quantize, + speculator=speculator, + dtype=dtype, + default_dtype=torch.bfloat16, + trust_remote_code=trust_remote_code, + lora_adapter_ids=lora_adapter_ids, + # XXX: Extremely important to cap resolution in order to limit + # VRAM usage. + processor_kwargs={"size": {"longest_edge": 1456}}, + ) + elif model_type == PALIGEMMA: + return FlashVlmCausalLM( + model_id=model_id, + model_class=PaliGemmaForConditionalGeneration, + revision=revision, + quantize=quantize, + speculator=speculator, + dtype=dtype, + kv_cache_dtype=kv_cache_dtype, + # Works better for these models + default_dtype=torch.bfloat16, + trust_remote_code=trust_remote_code, + lora_adapter_ids=lora_adapter_ids, + batch_class=PaliGemmaBatch, + ) + elif model_type == LLAVA_NEXT: + return FlashVlmCausalLM( + model_class=FlashLlavaNextForConditionalGeneration, + model_id=model_id, + revision=revision, + quantize=quantize, + speculator=speculator, + dtype=dtype, + kv_cache_dtype=kv_cache_dtype, + trust_remote_code=trust_remote_code, + ) + + from text_generation_server.models.vlm_causal_lm import VlmCausalLM + from text_generation_server.models.custom_modeling.mllama import ( + MllamaForConditionalGeneration, + ) + from text_generation_server.models.custom_modeling.llava_next import ( + LlavaNextForConditionalGeneration, + ) + + adapt_transformers_to_gaudi() + if SDP_ON_BF16 == 1: + torch._C._set_math_sdp_allow_fp16_bf16_reduction(True) if model_type == "gpt_bigcode": return StarCoder(model_id=model_id, revision=revision, dtype=dtype) - if model_type == "bloom": return BLOOM( model_id=model_id, diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/bloom_modeling.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/bloom_modeling.py index e2719fad2..84835ab89 100644 --- a/backends/gaudi/server/text_generation_server/models/custom_modeling/bloom_modeling.py +++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/bloom_modeling.py @@ -377,7 +377,7 @@ class BloomAttention(nn.Module): past_value.view(-1, *past_value.shape[-2:]), ) - if CUSTOM_KERNELS_ENABLED: + if CUSTOM_KERNELS_ENABLED and attention_mask.shape[-1] < 4096: assert self.training is False, "Only foward pass was implemented" assert ( attention_mask.shape[-1] < 4096 @@ -580,7 +580,7 @@ class BloomPreTrainedModel(PreTrainedModel): @staticmethod def _convert_to_bloom_cache( - past_key_value: Tuple[Tuple[torch.Tensor, torch.Tensor]] + past_key_value: Tuple[Tuple[torch.Tensor, torch.Tensor]], ) -> Tuple[Tuple[torch.Tensor, torch.Tensor]]: """ Converts the cache to the format expected by Bloom, i.e. 
to tuple(tuple([batch_size * num_heads, ...])) diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_cohere_modeling.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_cohere_modeling.py index 30656038b..3bcc689d2 100644 --- a/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_cohere_modeling.py +++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_cohere_modeling.py @@ -28,10 +28,10 @@ from typing import Optional, List, Tuple from text_generation_server.layers.attention import ( paged_attention, attention, - reshape_and_cache, Seqlen, + HPUPagedAttentionMetadata, ) -from text_generation_server.utils.import_utils import SYSTEM +from text_generation_server.layers.attention.kv_cache import get_kv_scales from text_generation_server.layers import ( TensorParallelRowLinear, TensorParallelColumnLinear, @@ -39,7 +39,6 @@ from text_generation_server.layers import ( SpeculativeHead, get_linear, ) -from text_generation_server.layers.attention import PREFILL_IN_KV_CACHE from text_generation_server.layers.layernorm import ( FastLayerNorm, ) @@ -47,11 +46,10 @@ from text_generation_server.layers.rotary import ( PositionRotaryEmbedding, ) from text_generation_server.utils.weights import UnquantizedWeight - -if SYSTEM == "cuda": - import dropout_layer_norm -else: - dropout_layer_norm = None +from habana_frameworks.torch.hpex.kernels import ( + RotaryPosEmbeddingMode, + apply_rotary_pos_emb, +) class CohereRotary(PositionRotaryEmbedding): @@ -63,38 +61,25 @@ class CohereRotary(PositionRotaryEmbedding): sin: torch.Tensor, ): # Such controlflows may add some overhead. - if SYSTEM == "cuda": - import rotary_emb + num_tokens = query.shape[0] + head_size = query.shape[-1] + rope_mode = RotaryPosEmbeddingMode.PAIRWISE + sin = torch.repeat_interleave(sin, 2, dim=-1) + cos = torch.repeat_interleave(cos, 2, dim=-1) + rotary_dim = cos.shape[-1] + query_shape = query.shape + query = query.view(num_tokens, -1, head_size) + query_rot = query[..., :rotary_dim] + query_pass = query[..., rotary_dim:] + query_rot = apply_rotary_pos_emb(query_rot, cos, sin, None, 0, rope_mode) + query.copy_(torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape)) - q1 = query[..., ::2] - q2 = query[..., 1::2] - - rotary_emb.apply_rotary(q1, q2, cos, sin, q1, q2, False) - - k1 = key[..., ::2] - k2 = key[..., 1::2] - - rotary_emb.apply_rotary(k1, k2, cos, sin, k1, k2, False) - elif SYSTEM == "rocm": - from vllm._C import ops - - # NOTE: On RoCm systems, we use a ROPE implementatation adapted from VLLM which launches a single kernel for both query/key, contrary to flash-attn implementation used on NVIDIA systems. - # Compiling flash-attn rotary on RoCm, it appears hipcc is unable to unroll loops, resulting in an even slower inference compared to eager: https://github.com/pytorch/pytorch/issues/113773 - - head_size = query.shape[-1] - - # Inplace operation, updating query and key. - ops.rotary_embedding(query, key, head_size, cos, sin, False) - elif SYSTEM == "ipex": - import intel_extension_for_pytorch as ipex - - ipex.llm.functional.rotary_embedding( - query, key, sin, cos, query.size(-1), False - ) - else: - raise ValueError( - "Your system seem to be not supported. Please check your install or open an issue at https://github.com/huggingface/text-generation-inference/issues with a clear reproduction." 
- ) + key_shape = key.shape + key = key.view(num_tokens, -1, head_size) + key_rot = key[..., :rotary_dim] + key_pass = key[..., rotary_dim:] + key_rot = apply_rotary_pos_emb(key_rot, cos, sin, None, 0, rope_mode) + key.copy_(torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape)) class CohereLayerNorm(nn.Module): @@ -107,49 +92,18 @@ class CohereLayerNorm(nn.Module): self.eps = eps def forward(self, hidden_states): - if hidden_states.shape[-1] > 8192 or SYSTEM != "cuda": - hidden_states = hidden_states.reshape( - -1, self.weight.shape[0], self.weight.shape[1] - ) - input_dtype = hidden_states.dtype - hidden_states = hidden_states.to(torch.float32) - mean = hidden_states.mean(-1, keepdim=True) - hidden_states_minus_mean = hidden_states - mean - variance = hidden_states_minus_mean.pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states_minus_mean * torch.rsqrt(variance + self.eps) - hidden_states = self.weight.to(torch.float32) * hidden_states - hidden_states = hidden_states.view(-1, self.weight.shape[1]) - return hidden_states.to(input_dtype) - - ( - hidden_states, - *rest, - ) = dropout_layer_norm.dropout_add_ln_fwd( - hidden_states, - None, - self.ones, - None, - None, - None, - None, - None, - 0.0, - self.eps, - 1.0, - 0, - None, - False, - False, - ) - - # Required to apply one weight matrix per head - hidden_states = hidden_states.view( + hidden_states = hidden_states.reshape( -1, self.weight.shape[0], self.weight.shape[1] ) - hidden_states = self.weight * hidden_states + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + mean = hidden_states.mean(-1, keepdim=True) + hidden_states_minus_mean = hidden_states - mean + variance = hidden_states_minus_mean.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states_minus_mean * torch.rsqrt(variance + self.eps) + hidden_states = self.weight.to(torch.float32) * hidden_states hidden_states = hidden_states.view(-1, self.weight.shape[1]) - - return hidden_states + return hidden_states.to(input_dtype) def load_attention(config, prefix, weights): @@ -229,6 +183,7 @@ class FlashCohereAttention(torch.nn.Module): ) self.query_key_value = load_attention(config, prefix, weights) + self.kv_scales = get_kv_scales(weights, f"{prefix}") self.use_qk_norm = config.use_qk_norm if self.use_qk_norm: @@ -264,10 +219,9 @@ class FlashCohereAttention(torch.nn.Module): sin, cu_seqlen_prefill, kv_cache, - block_tables, slots, seqlen, - max_s, + hpu_attention_meta, ): qkv = self.query_key_value(hidden_states) query, key, value = qkv.split( @@ -291,30 +245,35 @@ class FlashCohereAttention(torch.nn.Module): self.rotary_emb(query, key, cos, sin) - reshape_and_cache(key, value, kv_cache[0], kv_cache[1], slots) + kv_cache.store( + key=key, + value=value, + slots=slots, + kv_scales=self.kv_scales, + ) # Prefill if cu_seqlen_prefill is not None: - # flash attention + # sdpa attn_output = attention( - query, - kv_cache[0] if PREFILL_IN_KV_CACHE else key, - kv_cache[1] if PREFILL_IN_KV_CACHE else value, - seqlen, - block_tables, - self.softmax_scale, + query=query, + key=key, + value=value, + kv_cache=kv_cache, + kv_scales=self.kv_scales, + seqlen=seqlen, + softmax_scale=self.softmax_scale, ) # Decode else: attn_output = paged_attention( query, - kv_cache[0], - kv_cache[1], + kv_cache, self.kv_head_mapping, self.softmax_scale, - block_tables, seqlen, - max_s, + kv_scales=self.kv_scales, + hpu_attention_meta=hpu_attention_meta, ) return self.o_proj( @@ -386,10 +345,9 @@ class FlashCohereLayer(nn.Module): sin, cu_seqlen_prefill, 
kv_cache, - block_tables, slots, seqlen, - max_s, + hpu_attention_meta, ): normed_hidden_states, res = self.input_layernorm(hidden_states, residual) @@ -400,10 +358,9 @@ class FlashCohereLayer(nn.Module): sin, cu_seqlen_prefill, kv_cache, - block_tables, slots, seqlen, - max_s, + hpu_attention_meta, ) mlp_output = self.mlp(normed_hidden_states) @@ -452,18 +409,15 @@ class FlashCohereModel(torch.nn.Module): position_ids: torch.Tensor, cu_seqlen_prefill: Optional[torch.Tensor], kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], - block_tables: torch.Tensor, slots: torch.Tensor, seqlen: torch.Tensor, - max_s: int, + hpu_attention_meta: Optional[HPUPagedAttentionMetadata], ) -> torch.Tensor: hidden_states = self.embed_tokens(input_ids) # Get rotary cos and sin for this forward # Avoid to index in each layer - cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin( - position_ids, max_s, hidden_states.dtype - ) + cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin(position_ids) residual = None @@ -475,10 +429,9 @@ class FlashCohereModel(torch.nn.Module): sin, cu_seqlen_prefill, kv_cache[i], - block_tables, slots, seqlen, - max_s, + hpu_attention_meta, ) hidden_states, _ = self.norm(hidden_states, residual) @@ -516,11 +469,9 @@ class FlashCohereForCausalLM(torch.nn.Module): position_ids: torch.Tensor, cu_seqlen_prefill: Optional[torch.Tensor], kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], - block_tables: torch.Tensor, slots: torch.Tensor, seqlen: Seqlen, - max_s: int, - prefill_cache_indices: Optional[torch.Tensor], + hpu_attention_meta: Optional[HPUPagedAttentionMetadata], lm_head_indices: Optional[torch.Tensor] = None, adapter_data: Optional[torch.Tensor] = None, ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: @@ -529,10 +480,9 @@ class FlashCohereForCausalLM(torch.nn.Module): position_ids, cu_seqlen_prefill, kv_cache, - block_tables, slots, seqlen, - max_s, + hpu_attention_meta, ) if lm_head_indices is not None: hidden_states = hidden_states[lm_head_indices] diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_dbrx_modeling.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_dbrx_modeling.py index 1137a453f..15c243c97 100644 --- a/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_dbrx_modeling.py +++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_dbrx_modeling.py @@ -20,17 +20,14 @@ from torch import nn from transformers.activations import ACT2FN from transformers.configuration_utils import PretrainedConfig from typing import Optional, List, Tuple, Any -from text_generation_server.utils.import_utils import SYSTEM +from text_generation_server.layers.attention.kv_cache import get_kv_scales -if SYSTEM != "ipex": - from vllm.model_executor.layers.fused_moe import fused_moe from text_generation_server.layers.attention import ( paged_attention, attention, - reshape_and_cache, Seqlen, - PREFILL_IN_KV_CACHE, + HPUPagedAttentionMetadata, ) from text_generation_server.layers import ( FastLinear, @@ -46,6 +43,7 @@ from text_generation_server.layers.rotary import ( from text_generation_server.layers.layernorm import ( FastLayerNorm, ) +from vllm_hpu_extension.ops import DynamicFusedMOE class DbrxAttentionConfig(PretrainedConfig): @@ -290,6 +288,7 @@ class DbrxAttention(torch.nn.Module): ) self.query_key_value = load_attention(config, prefix, weights) + self.kv_scales = get_kv_scales(weights, f"{prefix}") self.o_proj = TensorParallelRowLinear.load( config, @@ -309,10 +308,9 
@@ class DbrxAttention(torch.nn.Module): sin, cu_seqlen_prefill, kv_cache, - block_tables, slots, seqlen, - max_s, + hpu_attention_meta, ): qkv = self.query_key_value(hidden_states) if self.clip_qkv is not None: @@ -330,30 +328,35 @@ class DbrxAttention(torch.nn.Module): self.rotary_emb(query, torch.select(kv, dim=1, index=0), cos, sin) - reshape_and_cache(kv[:, 0], kv[:, 1], kv_cache[0], kv_cache[1], slots) + kv_cache.store( + key=kv[:, 0], + value=kv[:, 1], + slots=slots, + kv_scales=self.kv_scales, + ) # Prefill if cu_seqlen_prefill is not None: - # flash attention + # sdpa attn_output = attention( - query, - kv_cache[0] if PREFILL_IN_KV_CACHE else kv[:, 0], - kv_cache[1] if PREFILL_IN_KV_CACHE else kv[:, 1], - seqlen, - block_tables, - self.softmax_scale, + query=query, + key=kv[:, 0], + value=kv[:, 1], + kv_cache=kv_cache, + kv_scales=self.kv_scales, + seqlen=seqlen, + softmax_scale=self.softmax_scale, ) # Decode else: attn_output = paged_attention( query, - kv_cache[0], - kv_cache[1], + kv_cache, self.kv_head_mapping, self.softmax_scale, - block_tables, seqlen, - max_s, + kv_scales=self.kv_scales, + hpu_attention_meta=hpu_attention_meta, ) return self.o_proj(attn_output.view(-1, self.num_heads * self.head_size)) @@ -387,10 +390,9 @@ class DbrxNormAttentionNorm(nn.Module): sin, cu_seqlen_prefill, kv_cache, - block_tables, slots, seqlen, - max_s, + hpu_attention_meta, ): normed_hidden_states, res = self.norm_1(hidden_states, residual) @@ -401,10 +403,9 @@ class DbrxNormAttentionNorm(nn.Module): sin, cu_seqlen_prefill, kv_cache, - block_tables, slots, seqlen, - max_s, + hpu_attention_meta, ) # faster post attention rms norm @@ -482,18 +483,15 @@ class BlockSparseMoE(nn.Module): self.process_group = weights.process_group + self.hpu_fused_moe = DynamicFusedMOE(self.num_experts) + for i in range(self.num_experts): + self.hpu_fused_moe.MoeOp.w13_list[i].set_weight(self.wv1[i]) + self.hpu_fused_moe.MoeOp.w2_list[i].set_weight(self.w2[i]) + def forward(self, x: torch.Tensor) -> torch.Tensor: # router_logits: (num_tokens, n_experts) router_logits = self.gate(x) - out = fused_moe( - x, - self.wv1, - self.w2, - router_logits, - self.top_k, - renormalize=self.moe_normalize_expert_weights, - inplace=True, - ) + out = self.hpu_fused_moe(x, router_logits, self.top_k) # Reduce sum if self.process_group.size() > 1: @@ -620,10 +618,9 @@ class DbrxLayer(nn.Module): sin, cu_seqlen_prefill, kv_cache, - block_tables, slots, seqlen, - max_s, + hpu_attention_meta, ): # Self Attention attn_output, attn_res = self.attn( @@ -633,10 +630,9 @@ class DbrxLayer(nn.Module): sin, cu_seqlen_prefill, kv_cache, - block_tables, slots, seqlen, - max_s, + hpu_attention_meta, ) moe_output = self.moe(attn_output) @@ -677,18 +673,15 @@ class DbrxModel(torch.nn.Module): position_ids: torch.Tensor, cu_seqlen_prefill: Optional[torch.Tensor], kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], - block_tables: torch.Tensor, slots: torch.Tensor, seqlen: Seqlen, - max_s: int, + hpu_attention_meta: Optional[HPUPagedAttentionMetadata], ) -> torch.Tensor: hidden_states = self.embed_tokens(input_ids) # Get rotary cos and sin for this forward # Avoid to index in each layer - cos, sin = self.layers[0].attn.self_attn.rotary_emb.get_cos_sin( - position_ids, max_s, hidden_states.dtype - ) + cos, sin = self.layers[0].attn.self_attn.rotary_emb.get_cos_sin(position_ids) residual = None for i, layer in enumerate(self.layers): @@ -699,10 +692,9 @@ class DbrxModel(torch.nn.Module): sin, cu_seqlen_prefill, kv_cache[i], - block_tables, slots, seqlen, 
- max_s, + hpu_attention_meta, ) hidden_states, _ = self.norm(hidden_states, residual) @@ -732,11 +724,9 @@ class FlashDbrxForCausalLM(torch.nn.Module): position_ids: torch.Tensor, cu_seqlen_prefill: Optional[torch.Tensor], kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], - block_tables: torch.Tensor, slots: torch.Tensor, seqlen: Seqlen, - max_s: int, - prefill_cache_indices: Optional[torch.Tensor], + hpu_attention_meta: Optional[HPUPagedAttentionMetadata], lm_head_indices: Optional[torch.Tensor] = None, adapter_data: Optional[torch.Tensor] = None, ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: @@ -745,10 +735,9 @@ class FlashDbrxForCausalLM(torch.nn.Module): position_ids, cu_seqlen_prefill, kv_cache, - block_tables, slots, seqlen, - max_s, + hpu_attention_meta, ) if lm_head_indices is not None: hidden_states = hidden_states[lm_head_indices] diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_deepseek_v2_modeling.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_deepseek_v2_modeling.py index 88c2cf803..9d61c6941 100644 --- a/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_deepseek_v2_modeling.py +++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_deepseek_v2_modeling.py @@ -33,21 +33,14 @@ from text_generation_server.layers.attention import ( Seqlen, attention, paged_attention, - reshape_and_cache, + HPUPagedAttentionMetadata, ) -from text_generation_server.layers.attention import PREFILL_IN_KV_CACHE +from text_generation_server.layers.attention.kv_cache import KVCache, get_kv_scales from text_generation_server.layers.layernorm import FastRMSNorm from text_generation_server.layers.moe import DenseMoELayer, MoELayer, SparseMoELayer from text_generation_server.layers.rotary import PositionRotaryEmbedding, get_mscale -from text_generation_server.utils.import_utils import SYSTEM from text_generation_server.utils.weights import Weights -if SYSTEM == "rocm": - try: - from vllm import _custom_C - except Exception as e: - raise ImportError(f"Could not load `vllm._custom_C`. 
Full error: {e}") - class DeepseekV2Config(PretrainedConfig): def __init__( @@ -232,6 +225,8 @@ class DeepseekV2Attention(torch.nn.Module): ), ) + self.kv_scales = get_kv_scales(weights, f"{prefix}") + self.kv_a_layernorm = FastRMSNorm.load( prefix=f"{prefix}.kv_a_layernorm", weights=weights, eps=config.rms_norm_eps ) @@ -260,11 +255,10 @@ class DeepseekV2Attention(torch.nn.Module): cos: torch.Tensor, sin: torch.Tensor, cu_seqlen_prefill: torch.Tensor, - kv_cache: Tuple[torch.Tensor, torch.Tensor], - block_tables: torch.Tensor, + kv_cache: KVCache, slots: torch.Tensor, seqlen: Seqlen, - max_s: int, + hpu_attention_meta: Optional[HPUPagedAttentionMetadata], ): if self.q_lora_rank is None: query = self.q_proj(hidden_states) @@ -321,30 +315,35 @@ class DeepseekV2Attention(torch.nn.Module): value, (0, self.head_pad_size - self.value_head_size), value=0 ) - reshape_and_cache(key, value, kv_cache[0], kv_cache[1], slots) + kv_cache.store( + key=key, + value=value, + slots=slots, + kv_scales=self.kv_scales, + ) # Prefill if cu_seqlen_prefill is not None: # flash attention attn_output = attention( - query, - kv_cache[0] if PREFILL_IN_KV_CACHE else key, - kv_cache[1] if PREFILL_IN_KV_CACHE else value, - seqlen, - block_tables, - self.softmax_scale, + query=query, + key=key, + value=value, + kv_cache=kv_cache, + kv_scales=self.kv_scales, + seqlen=seqlen, + softmax_scale=self.softmax_scale, ) # Decode else: attn_output = paged_attention( query, - kv_cache[0], - kv_cache[1], + kv_cache, self.kv_head_mapping, self.softmax_scale, - block_tables, seqlen, - max_s, + kv_scales=self.kv_scales, + hpu_attention_meta=hpu_attention_meta, ) # Remove padding. @@ -387,27 +386,11 @@ class DeepseekV2MLP(nn.Module): self.quantize = config.quantize def forward(self, hidden_states: torch.Tensor, reduce: bool = True): - if ( - SYSTEM == "rocm" - and self.hidden_act == "silu" - and hidden_states.dtype == torch.float16 - and hidden_states.shape[0] == 1 - and not self.quantize - ): - out = torch.empty( - hidden_states.shape[0], - self.intermediate_size, - dtype=hidden_states.dtype, - device="cuda", - ) - _custom_C.LLMM_Silu(self.gate_up_proj.linear.weight, hidden_states, out, 8) - return self.down_proj(out, reduce=reduce) - else: - gate_up_states = self.gate_up_proj(hidden_states) - gate_up_states = gate_up_states.view(-1, 2, self.intermediate_size) - return self.down_proj( - self.act(gate_up_states[:, 0]) * gate_up_states[:, 1], reduce=reduce - ) + gate_up_states = self.gate_up_proj(hidden_states) + gate_up_states = gate_up_states.view(-1, 2, self.intermediate_size) + return self.down_proj( + self.act(gate_up_states[:, 0]) * gate_up_states[:, 1], reduce=reduce + ) class DeepseekV2MoE(nn.Module): @@ -520,10 +503,9 @@ class DeepseekV2Layer(nn.Module): sin: torch.Tensor, cu_seqlen_prefill: torch.Tensor, kv_cache, - block_tables: torch.Tensor, slots: torch.Tensor, seqlen: Seqlen, - max_s: int, + hpu_attention_meta: Optional[HPUPagedAttentionMetadata], ): normed_hidden_states, residual = self.input_layernorm(hidden_states, residual) @@ -534,10 +516,9 @@ class DeepseekV2Layer(nn.Module): sin, cu_seqlen_prefill, kv_cache, - block_tables, slots, seqlen, - max_s, + hpu_attention_meta, ) # faster post attention rms norm @@ -583,18 +564,15 @@ class DeepseekV2Model(torch.nn.Module): position_ids: torch.Tensor, cu_seqlen_prefill: Optional[torch.Tensor], kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], - block_tables: torch.Tensor, slots: torch.Tensor, seqlen: Seqlen, - max_s: int, + hpu_attention_meta: 
Optional[HPUPagedAttentionMetadata], ) -> torch.Tensor: hidden_states = self.embed_tokens(input_ids) # Get rotary cos and sin for this forward # Avoid to index in each layer - cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin( - position_ids, max_s, hidden_states.dtype - ) + cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin(position_ids) residual = None for i, layer in enumerate(self.layers): @@ -605,10 +583,9 @@ class DeepseekV2Model(torch.nn.Module): sin, cu_seqlen_prefill, kv_cache[i], - block_tables, slots, seqlen, - max_s, + hpu_attention_meta, ) hidden_states, _ = self.norm(hidden_states, residual) @@ -635,11 +612,9 @@ class FlashDeepseekV2ForCausalLM(torch.nn.Module): position_ids: torch.Tensor, cu_seqlen_prefill: Optional[torch.Tensor], kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], - block_tables: torch.Tensor, slots: torch.Tensor, seqlen: Seqlen, - max_s: int, - prefill_cache_indices: Optional[torch.Tensor], + hpu_attention_meta: Optional[HPUPagedAttentionMetadata], lm_head_indices: Optional[torch.Tensor] = None, adapter_data: Optional[torch.Tensor] = None, ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: @@ -648,10 +623,9 @@ class FlashDeepseekV2ForCausalLM(torch.nn.Module): position_ids, cu_seqlen_prefill, kv_cache, - block_tables, slots, seqlen, - max_s, + hpu_attention_meta, ) if lm_head_indices is not None: hidden_states = hidden_states[lm_head_indices] diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_deepseek_v3_modeling.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_deepseek_v3_modeling.py new file mode 100644 index 000000000..1a7ce5cf5 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_deepseek_v3_modeling.py @@ -0,0 +1,642 @@ +# coding=utf-8 +# Copyright 2023, 2024 DeepSeek-AI and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
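+"""PyTorch DeepSeek V3 model for the Gaudi backend.
+
+Attention is multi-head latent attention (MLA): queries and keys carry a
+"nope" (content) part and a "rope" (positional) part, and key/value states
+are reconstructed from a low-rank compressed projection
+(`kv_a_proj_with_mqa` followed by `kv_b_proj`). Query, key and value heads
+are padded to a common head size before the attention kernels, and the
+padding is stripped from the attention output before `o_proj`.
+"""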
+ +from typing import List, Optional, Tuple, Type + +import torch +import torch.distributed +from torch import nn +from transformers.activations import ACT2FN +from transformers.configuration_utils import PretrainedConfig + +from text_generation_server.layers import ( + FastLinear, + SpeculativeHead, + TensorParallelColumnLinear, + TensorParallelEmbedding, + TensorParallelRowLinear, + get_linear, +) +from text_generation_server.layers.attention import ( + Seqlen, + attention, + paged_attention, + HPUPagedAttentionMetadata, +) +from text_generation_server.layers.attention.kv_cache import KVCache, get_kv_scales +from text_generation_server.layers.layernorm import FastRMSNorm +from text_generation_server.layers.moe import DenseMoELayer, MoELayer, SparseMoELayer +from text_generation_server.layers.rotary import PositionRotaryEmbedding, get_mscale +from text_generation_server.utils.weights import Weights + + +class DeepseekV3Config(PretrainedConfig): + def __init__( + self, + vocab_size=102400, + hidden_size=4096, + intermediate_size=11008, + moe_intermediate_size=1407, + num_hidden_layers=30, + num_attention_heads=32, + num_key_value_heads=32, + n_shared_experts=2, + n_routed_experts=160, + ep_size=1, + routed_scaling_factor=1.0, + kv_lora_rank=512, + q_lora_rank=1536, + qk_rope_head_dim=64, + v_head_dim=128, + qk_nope_head_dim=128, + topk_method="gready", + n_group=8, + topk_group=3, + num_experts_per_tok=6, + moe_layer_freq=1, + first_k_dense_replace=0, + norm_topk_prob=False, + scoring_func="softmax", + aux_loss_alpha=0.001, + seq_aux=True, + hidden_act="silu", + max_position_embeddings=2048, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + pad_token_id=None, + bos_token_id=100000, + eos_token_id=100001, + pretraining_tp=1, + tie_word_embeddings=False, + rope_theta=10000.0, + rope_scaling=None, + attention_bias=False, + attention_dropout=0.0, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.moe_intermediate_size = moe_intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.n_shared_experts = n_shared_experts + self.n_routed_experts = n_routed_experts + self.ep_size = ep_size + self.routed_scaling_factor = routed_scaling_factor + self.kv_lora_rank = kv_lora_rank + self.q_lora_rank = q_lora_rank + self.qk_rope_head_dim = qk_rope_head_dim + self.v_head_dim = v_head_dim + self.qk_nope_head_dim = qk_nope_head_dim + self.topk_method = topk_method + self.n_group = n_group + self.topk_group = topk_group + self.num_experts_per_tok = num_experts_per_tok + self.moe_layer_freq = moe_layer_freq + self.first_k_dense_replace = first_k_dense_replace + self.norm_topk_prob = norm_topk_prob + self.scoring_func = scoring_func + self.aux_loss_alpha = aux_loss_alpha + self.seq_aux = seq_aux + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.pretraining_tp = pretraining_tp + self.use_cache = use_cache + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + self.attention_bias = attention_bias + self.attention_dropout = attention_dropout + + tie_word_embeddings = kwargs.pop("tie_word_embeddings", False) + if tie_word_embeddings: + raise ValueError( + 
"tie_word_embeddings is not supported for Deepseek V2 models." + ) + + if ep_size != 1: + raise ValueError( + f"Currently only ep_size == 1 is supported for Deepseek V2 models, was {ep_size}" + ) + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + +class DeepseekV3Attention(torch.nn.Module): + def __init__( + self, + prefix: str, + config, + weights: Weights, + ): + super().__init__() + self.num_heads = config.num_attention_heads + self.hidden_size = config.hidden_size + self.kv_lora_rank = config.kv_lora_rank + self.q_lora_rank = config.q_lora_rank + self.qk_nope_head_dim = config.qk_nope_head_dim + self.qk_rope_head_dim = config.qk_rope_head_dim + self.head_size = config.qk_nope_head_dim + config.qk_rope_head_dim + self.value_head_size = config.v_head_dim + self.head_pad_size = max(self.head_size, self.value_head_size) + + self.rotary_emb = PositionRotaryEmbedding.static( + config=config, + dim=self.qk_rope_head_dim, + base=config.rope_theta, + device=weights.device, + ) + + mscale = get_mscale( + self.rotary_emb.scaling_factor, self.rotary_emb.mscale_all_dim + ) + self.softmax_scale = self.head_size**-0.5 * mscale * mscale + + if self.num_heads % weights.process_group.size() != 0: + raise ValueError( + f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} " + f"and `num_shards`: {weights.process_group.size()}" + ) + self.num_heads = self.num_heads // weights.process_group.size() + self.num_key_value_heads = ( + config.num_key_value_heads // weights.process_group.size() + ) + + if self.q_lora_rank is None: + self.q_proj = TensorParallelColumnLinear.load( + config, + prefix=f"{prefix}.q_proj", + weights=weights, + bias=config.attention_bias, + ) + else: + self.q_a_proj = get_linear( + weight=weights.get_weights(f"{prefix}.q_a_proj"), + bias=( + weights.get_tensor(f"{prefix}.q_a_proj.bias") + if config.attention_bias + else None + ), + ) + self.q_a_layernorm = FastRMSNorm.load( + prefix=f"{prefix}.q_a_layernorm", + weights=weights, + eps=config.rms_norm_eps, + ) + self.q_b_proj = TensorParallelColumnLinear.load( + config, + prefix=f"{prefix}.q_b_proj", + weights=weights, + bias=config.attention_bias, + ) + + self.kv_a_proj_with_mqa = get_linear( + weight=weights.get_weights(f"{prefix}.kv_a_proj_with_mqa"), + bias=( + weights.get_tensor(f"{prefix}.kv_a_proj_with_mqa.bias") + if config.attention_bias + else None + ), + ) + + self.kv_scales = get_kv_scales(weights, f"{prefix}") + + self.kv_a_layernorm = FastRMSNorm.load( + prefix=f"{prefix}.kv_a_layernorm", weights=weights, eps=config.rms_norm_eps + ) + + self.kv_b_proj = TensorParallelColumnLinear.load( + config, + prefix=f"{prefix}.kv_b_proj", + weights=weights, + bias=config.attention_bias, + ) + + self.o_proj = TensorParallelRowLinear.load( + config, + prefix=f"{prefix}.o_proj", + weights=weights, + bias=False, + ) + self.num_groups = self.num_heads // self.num_key_value_heads + self.kv_head_mapping = torch.arange( + 0, self.num_key_value_heads, dtype=torch.int32, device=weights.device + ).repeat_interleave(self.num_groups) + + def forward( + self, + hidden_states: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, + cu_seqlen_prefill: torch.Tensor, + kv_cache: KVCache, + slots: torch.Tensor, + seqlen: Seqlen, + hpu_attention_meta: Optional[HPUPagedAttentionMetadata], + ): + if self.q_lora_rank is None: + query = self.q_proj(hidden_states) + else: + query = 
self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states))[0]) + query = query.view(-1, self.num_heads, self.head_size) + + _, query_pe = torch.split( + query, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1 + ) + + compressed_kv = self.kv_a_proj_with_mqa(hidden_states) + compressed_kv, key_pe = torch.split( + compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1 + ) + + key_pe = key_pe.view(-1, 1, self.qk_rope_head_dim) + kv = self.kv_b_proj(self.kv_a_layernorm(compressed_kv.contiguous())[0]).view( + -1, self.num_key_value_heads, self.qk_nope_head_dim + self.value_head_size + ) + + key_nope, value = torch.split( + kv, [self.qk_nope_head_dim, self.value_head_size], dim=-1 + ) + + batch_size, heads, head_dim = query_pe.shape + query_pe = ( + query_pe.view(batch_size, heads, head_dim // 2, 2) + .transpose(2, 3) + .reshape(batch_size, heads, head_dim) + ) + batch_size, heads, head_dim = key_pe.shape + key_pe = ( + key_pe.view(batch_size, heads, head_dim // 2, 2) + .transpose(2, 3) + .reshape(batch_size, heads, head_dim) + ) + self.rotary_emb(query_pe, key_pe, cos, sin) + + query[..., self.qk_nope_head_dim :] = query_pe + key = torch.empty_like(query) + key[..., : self.qk_nope_head_dim] = key_nope + key[..., self.qk_nope_head_dim :] = key_pe + + # We need to pad the heads because Flash Attention does not support + # qk and v with different head sizes. + query = torch.nn.functional.pad( + query, (0, self.head_pad_size - self.head_size), value=0 + ) + key = torch.nn.functional.pad( + key, (0, self.head_pad_size - self.head_size), value=0 + ) + value = torch.nn.functional.pad( + value, (0, self.head_pad_size - self.value_head_size), value=0 + ) + + kv_cache.store( + key=key, + value=value, + slots=slots, + kv_scales=self.kv_scales, + ) + + # Prefill + if cu_seqlen_prefill is not None: + # flash attention + attn_output = attention( + query=query, + key=key, + value=value, + kv_cache=kv_cache, + kv_scales=self.kv_scales, + seqlen=seqlen, + softmax_scale=self.softmax_scale, + ) + # Decode + else: + attn_output = paged_attention( + query, + kv_cache, + self.kv_head_mapping, + self.softmax_scale, + seqlen, + kv_scales=self.kv_scales, + hpu_attention_meta=hpu_attention_meta, + ) + + # Remove padding. + attn_output = attn_output[..., : self.value_head_size] + + return self.o_proj( + attn_output.reshape(-1, self.num_heads * self.value_head_size) + ) + + +class DeepseekV3MLP(nn.Module): + def __init__(self, prefix: str, config, weights, intermediate_size: int): + super().__init__() + self.hidden_act = config.hidden_act + if self.hidden_act != "silu": + # Bail out because MoE only supports silu. + raise NotImplementedError( + "Currently only `silu` is supported as an activation for Deepseek V2." + ) + self.act = ACT2FN[self.hidden_act] + + self.gate_up_proj = TensorParallelColumnLinear.load_multi( + config, + prefixes=[f"{prefix}.gate_proj", f"{prefix}.up_proj"], + weights=weights, + dim=0, + bias=False, + ) + + self.down_proj = TensorParallelRowLinear.load( + config, + prefix=f"{prefix}.down_proj", + weights=weights, + bias=False, + ) + + self.intermediate_size = intermediate_size // weights.process_group.size() + + # TODO: This is a hotfix to be removed & properly refactored. 
+ self.quantize = config.quantize + + def forward(self, hidden_states: torch.Tensor, reduce: bool = True): + gate_up_states = self.gate_up_proj(hidden_states) + gate_up_states = gate_up_states.view(-1, 2, self.intermediate_size) + return self.down_proj( + self.act(gate_up_states[:, 0]) * gate_up_states[:, 1], reduce=reduce + ) + + +class DeepseekV3MoE(nn.Module): + def __init__( + self, + prefix, + config: DeepseekV3Config, + moe_layer_cls: Type[MoELayer], + weights, + ): + super().__init__() + + self.hidden_dim = config.hidden_size + self.moe_intermediate_size = ( + config.moe_intermediate_size // weights.process_group.size() + ) + self.routed_scaling_factor = config.routed_scaling_factor + + # Gating + self.gate = FastLinear.load(config, f"{prefix}.gate", weights, bias=False) + + if config.topk_method == "noaux_tc": + self.gate.e_score_correction_bias = torch.zeros( + config.n_routed_experts, device=weights.device + ) + else: + self.gate.e_score_correction_bias = None + + self.moe_layer = moe_layer_cls( + prefix=f"{prefix}.experts", + n_experts=config.n_routed_experts, + n_expert_group=config.n_group, + renormalize=config.norm_topk_prob, + topk=config.num_experts_per_tok, + topk_group=config.topk_group, + weights=weights, + scoring_func=config.scoring_func, + e_score_correction_bias=self.gate.e_score_correction_bias, + ) + assert isinstance(self.moe_layer, MoELayer) + + if config.n_shared_experts is not None: + self.shared_experts = DeepseekV3MLP( + prefix=f"{prefix}.shared_experts", + config=config, + weights=weights, + intermediate_size=config.moe_intermediate_size + * config.n_shared_experts, + ) + else: + self.shared_experts = None + + self.process_group = weights.process_group + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if self.shared_experts is not None: + shared_output = self.shared_experts(x, reduce=False) + else: + shared_output = None + + router_logits = self.gate(x) + + out = self.moe_layer(x, gating_output=router_logits) + + if shared_output is not None: + out = out + shared_output + + # Reduce sum + if self.process_group.size() > 1: + torch.distributed.all_reduce(out, group=self.process_group) + + return out.view(*x.shape) + + +class DeepseekV3Layer(nn.Module): + def __init__(self, prefix, layer_id, config, weights): + super().__init__() + prefix = f"{prefix}.layers.{layer_id}" + + self.self_attn = DeepseekV3Attention( + prefix=f"{prefix}.self_attn", + config=config, + weights=weights, + ) + + if ( + config.n_routed_experts is not None + and layer_id >= config.first_k_dense_replace + and layer_id % config.moe_layer_freq == 0 + ): + moe_layer_cls = ( + SparseMoELayer + if SparseMoELayer.is_supported(weights) + else DenseMoELayer + ) + self.mlp = DeepseekV3MoE(f"{prefix}.mlp", config, moe_layer_cls, weights) + else: + self.mlp = DeepseekV3MLP( + prefix=f"{prefix}.mlp", + config=config, + weights=weights, + intermediate_size=config.intermediate_size, + ) + + self.input_layernorm = FastRMSNorm.load( + prefix=f"{prefix}.input_layernorm", weights=weights, eps=config.rms_norm_eps + ) + self.post_attention_layernorm = FastRMSNorm.load( + prefix=f"{prefix}.post_attention_layernorm", + weights=weights, + eps=config.rms_norm_eps, + ) + + def forward( + self, + hidden_states: torch.Tensor, + residual: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, + cu_seqlen_prefill: torch.Tensor, + kv_cache, + slots: torch.Tensor, + seqlen: Seqlen, + hpu_attention_meta: Optional[HPUPagedAttentionMetadata], + ): + normed_hidden_states, residual = 
self.input_layernorm(hidden_states, residual) + + # Self Attention + attn_output = self.self_attn( + normed_hidden_states, + cos, + sin, + cu_seqlen_prefill, + kv_cache, + slots, + seqlen, + hpu_attention_meta, + ) + + # faster post attention rms norm + normed_attn_res_output, residual = self.post_attention_layernorm( + attn_output, residual + ) + + output = self.mlp(normed_attn_res_output) + + return output, residual + + +class DeepseekV3Model(torch.nn.Module): + def __init__(self, prefix: str, config, weights: Weights): + super().__init__() + + self.embed_tokens = TensorParallelEmbedding( + prefix=f"{prefix}.embed_tokens", weights=weights + ) + + self.layers = nn.ModuleList( + [ + DeepseekV3Layer( + prefix, + layer_id, + config, + weights, + ) + for layer_id in range(config.num_hidden_layers) + ] + ) + self.norm = FastRMSNorm.load( + prefix=f"{prefix}.norm", weights=weights, eps=config.rms_norm_eps + ) + + self.head_size = self.layers[0].self_attn.head_size + self.num_heads = self.layers[0].self_attn.num_heads + self.num_key_value_heads = self.layers[0].self_attn.num_key_value_heads + + def forward( + self, + input_ids: torch.Tensor, + position_ids: torch.Tensor, + cu_seqlen_prefill: Optional[torch.Tensor], + kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], + slots: torch.Tensor, + seqlen: Seqlen, + hpu_attention_meta: Optional[HPUPagedAttentionMetadata], + ) -> torch.Tensor: + hidden_states = self.embed_tokens(input_ids) + + # Get rotary cos and sin for this forward + # Avoid to index in each layer + cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin(position_ids) + + residual = None + for i, layer in enumerate(self.layers): + hidden_states, residual = layer( + hidden_states, + residual, + cos, + sin, + cu_seqlen_prefill, + kv_cache[i], + slots, + seqlen, + hpu_attention_meta, + ) + + hidden_states, _ = self.norm(hidden_states, residual) + + return hidden_states + + +class FlashDeepseekV3ForCausalLM(torch.nn.Module): + def __init__(self, prefix: str, config, weights: Weights): + super().__init__() + + self.model = DeepseekV3Model( + "model" if not prefix else f"{prefix}.model", config, weights + ) + self.lm_head = SpeculativeHead.load( + config, + prefix="lm_head" if not prefix else f"{prefix}.lm_head", + weights=weights, + ) + + def forward( + self, + input_ids: torch.Tensor, + position_ids: torch.Tensor, + cu_seqlen_prefill: Optional[torch.Tensor], + kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], + slots: torch.Tensor, + seqlen: Seqlen, + hpu_attention_meta: Optional[HPUPagedAttentionMetadata], + lm_head_indices: Optional[torch.Tensor] = None, + adapter_data: Optional[torch.Tensor] = None, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + hidden_states = self.model( + input_ids, + position_ids, + cu_seqlen_prefill, + kv_cache, + slots, + seqlen, + hpu_attention_meta, + ) + if lm_head_indices is not None: + hidden_states = hidden_states[lm_head_indices] + logits, speculative_logits = self.lm_head(hidden_states) + return logits, speculative_logits diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_gemma2_modeling.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_gemma2_modeling.py index 7a3d60c97..79f21b0f3 100644 --- a/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_gemma2_modeling.py +++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_gemma2_modeling.py @@ -28,8 +28,8 @@ from typing import Optional, List, Tuple from 
text_generation_server.layers.attention import ( paged_attention, attention, - reshape_and_cache, Seqlen, + HPUPagedAttentionMetadata, ) from text_generation_server.layers import ( TensorParallelRowLinear, @@ -40,7 +40,7 @@ from text_generation_server.layers import ( TensorParallelMultiAdapterLinear, TensorParallelAdapterRowLinear, ) -from text_generation_server.layers.attention import PREFILL_IN_KV_CACHE +from text_generation_server.layers.attention.kv_cache import get_kv_scales from text_generation_server.layers.rotary import PositionRotaryEmbedding from text_generation_server.layers.layernorm import ( FastRMSNorm, @@ -208,6 +208,7 @@ class FlashGemma2Attention(torch.nn.Module): ], process_group=weights.process_group, ) + self.kv_scales = get_kv_scales(weights, f"{prefix}") o_proj = TensorParallelRowLinear.load( config, @@ -234,11 +235,10 @@ class FlashGemma2Attention(torch.nn.Module): sin, cu_seqlen_prefill, kv_cache, - block_tables, slots, seqlen, - max_s, adapter_data, + hpu_attention_meta, ): qkv = self.query_key_value(hidden_states, adapter_data) query, kv = qkv.split( @@ -253,19 +253,24 @@ class FlashGemma2Attention(torch.nn.Module): self.rotary_emb(query, torch.select(kv, dim=1, index=0), cos, sin) - reshape_and_cache(kv[:, 0], kv[:, 1], kv_cache[0], kv_cache[1], slots) + kv_cache.store( + key=kv[:, 0], + value=kv[:, 1], + slots=slots, + kv_scales=self.kv_scales, + ) # Prefill if cu_seqlen_prefill is not None: - # flash attention + # sdpa attn_output = attention( - query, - kv_cache[0] if PREFILL_IN_KV_CACHE else kv[:, 0], - kv_cache[1] if PREFILL_IN_KV_CACHE else kv[:, 1], - seqlen, - block_tables, - self.softmax_scale, - causal=self.causal, + query=query, + key=kv[:, 0], + value=kv[:, 1], + kv_cache=kv_cache, + kv_scales=self.kv_scales, + seqlen=seqlen, + softmax_scale=self.softmax_scale, window_size_left=self.window_size, softcap=self.softcap, ) @@ -273,14 +278,13 @@ class FlashGemma2Attention(torch.nn.Module): else: attn_output = paged_attention( query, - kv_cache[0], - kv_cache[1], + kv_cache, self.kv_head_mapping, self.softmax_scale, - block_tables, seqlen, - max_s, softcap=self.softcap, + kv_scales=self.kv_scales, + hpu_attention_meta=hpu_attention_meta, ) return self.o_proj( @@ -390,11 +394,10 @@ class FlashGemma2Layer(nn.Module): sin, cu_seqlen_prefill, kv_cache, - block_tables, slots, seqlen, - max_s, adapter_data, + hpu_attention_meta, ): normed_hidden_states, res = self.input_layernorm(hidden_states, residual) @@ -405,11 +408,10 @@ class FlashGemma2Layer(nn.Module): sin, cu_seqlen_prefill, kv_cache, - block_tables, slots, seqlen, - max_s, adapter_data, + hpu_attention_meta, ) # faster post attention rms norm @@ -458,19 +460,16 @@ class FlashGemma2Model(torch.nn.Module): position_ids: torch.Tensor, cu_seqlen_prefill: Optional[torch.Tensor], kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], - block_tables: torch.Tensor, slots: torch.Tensor, seqlen: Seqlen, - max_s: int, - adapter_data: Optional[torch.Tensor] = None, + adapter_data: Optional[torch.Tensor], + hpu_attention_meta: Optional[HPUPagedAttentionMetadata], ) -> torch.Tensor: hidden_states = inputs_embeds # Get rotary cos and sin for this forward # Avoid to index in each layer - cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin( - position_ids, max_s, hidden_states.dtype - ) + cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin(position_ids) residual = None for i, layer in enumerate(self.layers): @@ -481,11 +480,10 @@ class FlashGemma2Model(torch.nn.Module): sin, cu_seqlen_prefill, kv_cache[i], - 
block_tables, slots, seqlen, - max_s, adapter_data, + hpu_attention_meta, ) hidden_states, _ = self.norm(hidden_states, residual) @@ -529,11 +527,9 @@ class FlashGemma2ForCausalLM(torch.nn.Module): position_ids: torch.Tensor, cu_seqlen_prefill: Optional[torch.Tensor], kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], - block_tables: torch.Tensor, slots: torch.Tensor, seqlen: Seqlen, - max_s: int, - prefill_cache_indices: Optional[torch.Tensor], + hpu_attention_meta: Optional[HPUPagedAttentionMetadata], lm_head_indices: Optional[torch.Tensor] = None, adapter_data: Optional[torch.Tensor] = None, ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: @@ -543,11 +539,10 @@ class FlashGemma2ForCausalLM(torch.nn.Module): position_ids, cu_seqlen_prefill, kv_cache, - block_tables, slots, seqlen, - max_s, adapter_data, + hpu_attention_meta, ) if lm_head_indices is not None: hidden_states = hidden_states[lm_head_indices] diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_gemma_modeling.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_gemma_modeling.py index 4c1be6f60..609f03acc 100644 --- a/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_gemma_modeling.py +++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_gemma_modeling.py @@ -28,9 +28,8 @@ from typing import Optional, List, Tuple from text_generation_server.layers.attention import ( paged_attention, attention, - reshape_and_cache, Seqlen, - PREFILL_IN_KV_CACHE, + HPUPagedAttentionMetadata, ) from text_generation_server.layers import ( TensorParallelRowLinear, @@ -39,6 +38,7 @@ from text_generation_server.layers import ( SpeculativeHead, get_linear, ) +from text_generation_server.layers.attention.kv_cache import get_kv_scales from text_generation_server.layers.rotary import PositionRotaryEmbedding from text_generation_server.layers.layernorm import ( FastRMSNorm, @@ -187,6 +187,7 @@ class FlashGemmaAttention(torch.nn.Module): ) self.query_key_value = load_attention(config, prefix, weights) + self.kv_scales = get_kv_scales(weights, f"{prefix}") self.o_proj = TensorParallelRowLinear.load( config, @@ -206,10 +207,9 @@ class FlashGemmaAttention(torch.nn.Module): sin, cu_seqlen_prefill, kv_cache, - block_tables, slots, seqlen, - max_s, + hpu_attention_meta, ): qkv = self.query_key_value(hidden_states) query, kv = qkv.split( @@ -224,31 +224,36 @@ class FlashGemmaAttention(torch.nn.Module): self.rotary_emb(query, torch.select(kv, dim=1, index=0), cos, sin) - reshape_and_cache(kv[:, 0], kv[:, 1], kv_cache[0], kv_cache[1], slots) + kv_cache.store( + key=kv[:, 0], + value=kv[:, 1], + slots=slots, + kv_scales=self.kv_scales, + ) # Prefill if cu_seqlen_prefill is not None: - # flash attention + # sdpa attn_output = attention( - query, - kv_cache[0] if PREFILL_IN_KV_CACHE else kv[:, 0], - kv_cache[1] if PREFILL_IN_KV_CACHE else kv[:, 1], - seqlen, - block_tables, - self.softmax_scale, + query=query, + key=kv[:, 0], + value=kv[:, 1], + kv_cache=kv_cache, + kv_scales=self.kv_scales, + seqlen=seqlen, + softmax_scale=self.softmax_scale, causal=self.causal, ) # Decode else: attn_output = paged_attention( query, - kv_cache[0], - kv_cache[1], + kv_cache, self.kv_head_mapping, self.softmax_scale, - block_tables, seqlen, - max_s, + kv_scales=self.kv_scales, + hpu_attention_meta=hpu_attention_meta, ) return self.o_proj(attn_output.view(-1, self.num_heads * self.head_size)) @@ -317,10 +322,9 @@ class FlashGemmaLayer(nn.Module): sin, cu_seqlen_prefill, 
kv_cache, - block_tables, slots, seqlen, - max_s, + hpu_attention_meta, ): normed_hidden_states, res = self.input_layernorm(hidden_states, residual) @@ -331,10 +335,9 @@ class FlashGemmaLayer(nn.Module): sin, cu_seqlen_prefill, kv_cache, - block_tables, slots, seqlen, - max_s, + hpu_attention_meta, ) # faster post attention rms norm @@ -379,18 +382,16 @@ class FlashGemmaModel(torch.nn.Module): position_ids: torch.Tensor, cu_seqlen_prefill: Optional[torch.Tensor], kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], - block_tables: torch.Tensor, slots: torch.Tensor, seqlen: Seqlen, - max_s: int, + adapter_data: Optional[torch.Tensor], + hpu_attention_meta: Optional[HPUPagedAttentionMetadata], ) -> torch.Tensor: hidden_states = inputs_embeds # Get rotary cos and sin for this forward # Avoid to index in each layer - cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin( - position_ids, max_s, hidden_states.dtype - ) + cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin(position_ids) residual = None for i, layer in enumerate(self.layers): @@ -401,10 +402,9 @@ class FlashGemmaModel(torch.nn.Module): sin, cu_seqlen_prefill, kv_cache[i], - block_tables, slots, seqlen, - max_s, + hpu_attention_meta, ) hidden_states, _ = self.norm(hidden_states, residual) @@ -446,11 +446,9 @@ class FlashGemmaForCausalLM(torch.nn.Module): position_ids: torch.Tensor, cu_seqlen_prefill: Optional[torch.Tensor], kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], - block_tables: torch.Tensor, slots: torch.Tensor, seqlen: Seqlen, - max_s: int, - prefill_cache_indices: Optional[torch.Tensor], + hpu_attention_meta: Optional[HPUPagedAttentionMetadata], lm_head_indices: Optional[torch.Tensor] = None, adapter_data: Optional[torch.Tensor] = None, ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: @@ -460,10 +458,10 @@ class FlashGemmaForCausalLM(torch.nn.Module): position_ids, cu_seqlen_prefill, kv_cache, - block_tables, slots, seqlen, - max_s, + adapter_data, + hpu_attention_meta, ) if lm_head_indices is not None: hidden_states = hidden_states[lm_head_indices] diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_gpt2_modeling.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_gpt2_modeling.py index 44c015cf4..10024a6de 100644 --- a/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_gpt2_modeling.py +++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_gpt2_modeling.py @@ -24,12 +24,11 @@ import torch.distributed from torch import nn from transformers.activations import ACT2FN from typing import Optional, List, Tuple -from text_generation_server.layers.attention import PREFILL_IN_KV_CACHE from text_generation_server.layers.attention import ( paged_attention, attention, - reshape_and_cache, Seqlen, + HPUPagedAttentionMetadata, ) from text_generation_server.layers import ( TensorParallelRowLinear, @@ -38,6 +37,7 @@ from text_generation_server.layers import ( SpeculativeHead, get_linear, ) +from text_generation_server.layers.attention.kv_cache import get_kv_scales def load_qkv(config, prefix: str, weights, head_size, num_heads): @@ -47,10 +47,6 @@ def load_qkv(config, prefix: str, weights, head_size, num_heads): prefix, weights, ) - elif config.quantize == "marlin": - raise RuntimeError( - "GPT-2 models with marlin quantization are not yet supported" - ) else: return _load_qkv(config, prefix, weights, head_size, num_heads) @@ -195,6 +191,7 @@ class FlashGPT2Attention(torch.nn.Module): head_size=self.head_size, 
num_heads=self.num_heads, ) + self.kv_scales = get_kv_scales(weights, f"{prefix}") self.o_proj = load_row( config, @@ -212,10 +209,9 @@ class FlashGPT2Attention(torch.nn.Module): hidden_states, cu_seqlen_prefill, kv_cache, - block_tables, slots, seqlen, - max_s, + hpu_attention_meta, ): query, key, value = self.query_key_value(hidden_states).split( self.head_size * self.num_heads, dim=1 @@ -224,30 +220,35 @@ class FlashGPT2Attention(torch.nn.Module): key = key.view(-1, self.num_heads, self.head_size) value = value.view(-1, self.num_heads, self.head_size) - reshape_and_cache(key, value, kv_cache[0], kv_cache[1], slots) + kv_cache.store( + key=key, + value=value, + slots=slots, + kv_scales=self.kv_scales, + ) # Prefill if cu_seqlen_prefill is not None: - # flash attention + # sdpa attn_output = attention( - query, - kv_cache[0] if PREFILL_IN_KV_CACHE else key, - kv_cache[1] if PREFILL_IN_KV_CACHE else value, - seqlen, - block_tables, - self.softmax_scale, + query=query, + key=key, + value=value, + kv_cache=kv_cache, + kv_scales=self.kv_scales, + seqlen=seqlen, + softmax_scale=self.softmax_scale, ) # Decode else: attn_output = paged_attention( query, - kv_cache[0], - kv_cache[1], + kv_cache, self.kv_head_mapping, self.softmax_scale, - block_tables, seqlen, - max_s, + kv_scales=self.kv_scales, + hpu_attention_meta=hpu_attention_meta, ) return self.o_proj(attn_output.view(-1, self.num_heads * self.head_size)) @@ -313,10 +314,9 @@ class FlashGPT2Layer(nn.Module): residual, cu_seqlen_prefill, kv_cache, - block_tables, slots, seqlen, - max_s, + hpu_attention_meta, ): residual = hidden_states hidden_states = self.input_layernorm(hidden_states) @@ -326,10 +326,9 @@ class FlashGPT2Layer(nn.Module): hidden_states, cu_seqlen_prefill, kv_cache, - block_tables, slots, seqlen, - max_s, + hpu_attention_meta, ) hidden_states = attn_output + residual @@ -379,12 +378,9 @@ class FlashGPT2Model(torch.nn.Module): position_ids: torch.Tensor, cu_seqlen_prefill: Optional[torch.Tensor], kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], - block_tables: torch.Tensor, slots: torch.Tensor, seqlen: Seqlen, - max_s: int, - true_max_s: int, - prefill_cache_indices: Optional[torch.Tensor], + hpu_attention_meta: Optional[HPUPagedAttentionMetadata], ) -> torch.Tensor: hidden_states = inputs_embeds @@ -395,10 +391,9 @@ class FlashGPT2Model(torch.nn.Module): residual, cu_seqlen_prefill, kv_cache[i], - block_tables, slots, seqlen, - max_s, + hpu_attention_meta, ) hidden_states = self.norm(hidden_states) @@ -432,11 +427,9 @@ class FlashGPT2ForCausalLM(torch.nn.Module): position_ids: torch.Tensor, cu_seqlen_prefill: Optional[torch.Tensor], kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], - block_tables: torch.Tensor, slots: torch.Tensor, seqlen: Seqlen, - max_s: int, - prefill_cache_indices: Optional[torch.Tensor] = None, + hpu_attention_meta: Optional[HPUPagedAttentionMetadata], lm_head_indices: Optional[torch.Tensor] = None, adapter_data: Optional[torch.Tensor] = None, ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: @@ -448,12 +441,9 @@ class FlashGPT2ForCausalLM(torch.nn.Module): position_ids, cu_seqlen_prefill, kv_cache, - block_tables, slots, seqlen, - max_s, - true_max_s=max_s, - prefill_cache_indices=prefill_cache_indices, + hpu_attention_meta=hpu_attention_meta, ) if lm_head_indices is not None: hidden_states = hidden_states[lm_head_indices] diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_gptj_modeling.py 
b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_gptj_modeling.py index aca970044..41eeab78c 100644 --- a/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_gptj_modeling.py +++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_gptj_modeling.py @@ -24,12 +24,12 @@ import torch.distributed from torch import nn from transformers.activations import ACT2FN from typing import Optional, List, Tuple -from text_generation_server.utils.import_utils import SYSTEM +from text_generation_server.layers.attention.kv_cache import get_kv_scales from text_generation_server.layers.attention import ( paged_attention, attention, - reshape_and_cache, Seqlen, + HPUPagedAttentionMetadata, ) from text_generation_server.layers import ( TensorParallelRowLinear, @@ -38,13 +38,16 @@ from text_generation_server.layers import ( SpeculativeHead, get_linear, ) -from text_generation_server.layers.attention import PREFILL_IN_KV_CACHE from text_generation_server.layers.rotary import ( PositionRotaryEmbedding, ) from text_generation_server.layers.layernorm import ( FastLayerNorm, ) +from habana_frameworks.torch.hpex.kernels import ( + RotaryPosEmbeddingMode, + apply_rotary_pos_emb, +) def load_attention(config, prefix: str, weights): @@ -78,39 +81,25 @@ class GPTJRotary(PositionRotaryEmbedding): cos: torch.Tensor, sin: torch.Tensor, ): - # Such controlflows may add some overhead. - if SYSTEM == "cuda": - import rotary_emb + num_tokens = query.shape[0] + head_size = query.shape[-1] + rope_mode = RotaryPosEmbeddingMode.PAIRWISE + sin = torch.repeat_interleave(sin, 2, dim=-1) + cos = torch.repeat_interleave(cos, 2, dim=-1) + rotary_dim = cos.shape[-1] + query_shape = query.shape + query = query.view(num_tokens, -1, head_size) + query_rot = query[..., :rotary_dim] + query_pass = query[..., rotary_dim:] + query_rot = apply_rotary_pos_emb(query_rot, cos, sin, None, 0, rope_mode) + query.copy_(torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape)) - q1 = query[..., ::2] - q2 = query[..., 1::2] - - rotary_emb.apply_rotary(q1, q2, cos, sin, q1, q2, False) - - k1 = key[..., ::2] - k2 = key[..., 1::2] - - rotary_emb.apply_rotary(k1, k2, cos, sin, k1, k2, False) - elif SYSTEM == "rocm": - from vllm._C import ops - - # NOTE: On RoCm systems, we use a ROPE implementatation adapted from VLLM which launches a single kernel for both query/key, contrary to flash-attn implementation used on NVIDIA systems. - # Compiling flash-attn rotary on RoCm, it appears hipcc is unable to unroll loops, resulting in an even slower inference compared to eager: https://github.com/pytorch/pytorch/issues/113773 - - head_size = query.shape[-1] - - # Inplace operation, updating query and key. - ops.rotary_embedding(query, key, head_size, cos, sin, False) - elif SYSTEM == "ipex": - import intel_extension_for_pytorch as ipex - - ipex.llm.functional.rotary_embedding( - query, key, sin, cos, query.size(-1), False - ) - else: - raise ValueError( - "Your system seem to be not supported. Please check your install or open an issue at https://github.com/huggingface/text-generation-inference/issues with a clear reproduction." 
- ) + key_shape = key.shape + key = key.view(num_tokens, -1, head_size) + key_rot = key[..., :rotary_dim] + key_pass = key[..., rotary_dim:] + key_rot = apply_rotary_pos_emb(key_rot, cos, sin, None, 0, rope_mode) + key.copy_(torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape)) class FlashGPTJAttention(torch.nn.Module): @@ -140,6 +129,7 @@ class FlashGPTJAttention(torch.nn.Module): prefix=prefix, weights=weights, ) + self.kv_scales = get_kv_scales(weights, f"{prefix}") self.o_proj = load_row( config, @@ -166,10 +156,9 @@ class FlashGPTJAttention(torch.nn.Module): sin, cu_seqlen_prefill, kv_cache, - block_tables, slots, seqlen, - max_s, + hpu_attention_meta, ): query, key, value = self.query_key_value(hidden_states).split( self.head_size * self.num_heads, dim=1 @@ -186,30 +175,35 @@ class FlashGPTJAttention(torch.nn.Module): else: self.rotary_emb(query, key, cos, sin) - reshape_and_cache(key, value, kv_cache[0], kv_cache[1], slots) + kv_cache.store( + key=key, + value=value, + slots=slots, + kv_scales=self.kv_scales, + ) # Prefill if cu_seqlen_prefill is not None: - # flash attention + # sdpa attn_output = attention( - query, - kv_cache[0] if PREFILL_IN_KV_CACHE else key, - kv_cache[1] if PREFILL_IN_KV_CACHE else value, - seqlen, - block_tables, - self.softmax_scale, + query=query, + key=key, + value=value, + kv_cache=kv_cache, + kv_scales=self.kv_scales, + seqlen=seqlen, + softmax_scale=self.softmax_scale, ) # Decode else: attn_output = paged_attention( query, - kv_cache[0], - kv_cache[1], + kv_cache, self.kv_head_mapping, self.softmax_scale, - block_tables, seqlen, - max_s, + kv_scales=self.kv_scales, + hpu_attention_meta=hpu_attention_meta, ) return self.o_proj(attn_output.view(-1, self.num_heads * self.head_size)) @@ -267,10 +261,9 @@ class FlashGPTJLayer(nn.Module): sin, cu_seqlen_prefill, kv_cache, - block_tables, slots, seqlen, - max_s, + hpu_attention_meta, ): hidden_states, residual = self.input_layernorm(hidden_states, residual) # Self Attention @@ -280,10 +273,9 @@ class FlashGPTJLayer(nn.Module): sin, cu_seqlen_prefill, kv_cache, - block_tables, slots, seqlen, - max_s, + hpu_attention_meta, ) feed_forward_hidden_states = self.mlp(hidden_states) @@ -327,19 +319,15 @@ class FlashGPTJModel(torch.nn.Module): position_ids: torch.Tensor, cu_seqlen_prefill: Optional[torch.Tensor], kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], - block_tables: torch.Tensor, slots: torch.Tensor, seqlen: Seqlen, - max_s: int, - prefill_cache_indices: Optional[torch.Tensor], + hpu_attention_meta: Optional[HPUPagedAttentionMetadata], ) -> torch.Tensor: hidden_states = self.wte(input_ids) # Get rotary cos and sin for this forward # Avoid to index in each layer - cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin( - position_ids, max_s, hidden_states.dtype - ) + cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin(position_ids) residual = None for i, layer in enumerate(self.layers): @@ -350,10 +338,9 @@ class FlashGPTJModel(torch.nn.Module): sin, cu_seqlen_prefill, kv_cache[i], - block_tables, slots, seqlen, - max_s, + hpu_attention_meta, ) hidden_states, _ = self.ln_f(hidden_states, residual) @@ -381,11 +368,9 @@ class FlashGPTJForCausalLM(torch.nn.Module): position_ids: torch.Tensor, cu_seqlen_prefill: Optional[torch.Tensor], kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], - block_tables: torch.Tensor, slots: torch.Tensor, seqlen: Seqlen, - max_s: int, - prefill_cache_indices: Optional[torch.Tensor] = None, + hpu_attention_meta: Optional[HPUPagedAttentionMetadata], lm_head_indices: 
Optional[torch.Tensor] = None, adapter_data: Optional[torch.Tensor] = None, ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: @@ -394,11 +379,9 @@ class FlashGPTJForCausalLM(torch.nn.Module): position_ids, cu_seqlen_prefill, kv_cache, - block_tables, slots, seqlen, - max_s, - prefill_cache_indices=prefill_cache_indices, + hpu_attention_meta=hpu_attention_meta, ) if lm_head_indices is not None: hidden_states = hidden_states[lm_head_indices] diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py index c9ec70cca..81af55603 100644 --- a/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py +++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py @@ -27,14 +27,16 @@ import torch.distributed from torch import nn from transformers.activations import ACT2FN -from text_generation_server.layers.attention import PREFILL_IN_KV_CACHE +from text_generation_server.layers.attention import ( + KVCache, + get_kv_scales, +) from text_generation_server.layers.moe import DenseMoELayer, MoELayer, SparseMoELayer -from text_generation_server.utils.import_utils import SYSTEM from text_generation_server.layers.attention import ( paged_attention, attention, - reshape_and_cache, Seqlen, + HPUPagedAttentionMetadata, ) from text_generation_server.layers import ( TensorParallelRowLinear, @@ -57,15 +59,6 @@ from text_generation_server.utils.weights import ( ) from text_generation_server.layers.fp8 import HybridFP8UnquantLoader -if SYSTEM != "ipex": - pass - -if SYSTEM == "rocm": - try: - from vllm import _custom_C - except Exception as e: - raise ImportError(f"Could not load `vllm._custom_C`. Full error: {e}") - def load_attention(config, prefix: str, weights, layer_id): # Only defined in granite. 
@@ -157,7 +150,10 @@ class FlashLlamaAttention(torch.nn.Module): device=weights.device, ) - self.softmax_scale = self.head_size**-0.5 + # `config.attention_multiplier` is used in Granite + self.softmax_scale = getattr( + config, "attention_multiplier", self.head_size**-0.5 + ) if self.num_heads % weights.process_group.size() != 0: raise ValueError( @@ -177,11 +173,13 @@ class FlashLlamaAttention(torch.nn.Module): self.query_key_value = load_attention(config, prefix, weights, index) self.index = index + self.kv_scales = get_kv_scales(weights, f"{prefix}") + o_proj = TensorParallelRowLinear.load( config, prefix=f"{prefix}.o_proj", weights=weights, - bias=False, + bias=getattr(config, "attention_bias", False), ) self.o_proj = TensorParallelAdapterRowLinear.load( @@ -202,12 +200,11 @@ class FlashLlamaAttention(torch.nn.Module): cos, sin, cu_seqlen_prefill, - kv_cache, - block_tables, + kv_cache: KVCache, slots, seqlen, - max_s, adapter_data, + hpu_attention_meta: Optional[HPUPagedAttentionMetadata], ): qkv = self.query_key_value(hidden_states, adapter_data) query, kv = qkv.split( @@ -222,30 +219,35 @@ class FlashLlamaAttention(torch.nn.Module): self.rotary_emb(query, torch.select(kv, dim=1, index=0), cos, sin) - reshape_and_cache(kv[:, 0], kv[:, 1], kv_cache[0], kv_cache[1], slots) + kv_cache.store( + key=kv[:, 0], + value=kv[:, 1], + slots=slots, + kv_scales=self.kv_scales, + ) # Prefill if cu_seqlen_prefill is not None: - # flash attention + # sdpa attn_output = attention( - query, - kv_cache[0] if PREFILL_IN_KV_CACHE else kv[:, 0], - kv_cache[1] if PREFILL_IN_KV_CACHE else kv[:, 1], - seqlen, - block_tables, - self.softmax_scale, + query=query, + key=kv[:, 0], + value=kv[:, 1], + kv_scales=self.kv_scales, + kv_cache=kv_cache, + seqlen=seqlen, + softmax_scale=self.softmax_scale, ) # Decode else: attn_output = paged_attention( query, - kv_cache[0], - kv_cache[1], + kv_cache, self.kv_head_mapping, self.softmax_scale, - block_tables, seqlen, - max_s, + kv_scales=self.kv_scales, + hpu_attention_meta=hpu_attention_meta, ) return self.o_proj( @@ -363,31 +365,11 @@ class LlamaMLP(nn.Module): self.hidden_size = config.hidden_size def forward(self, hidden_states, adapter_data): - if ( - SYSTEM == "rocm" - and self.hidden_act == "silu" - and hidden_states.dtype == torch.float16 - and hidden_states.shape[0] == 1 - and not self.quantize - and self.hidden_size - != 16384 # TODO: Temporary workaround for `LLMM_Silu` kernel not working with LLama3.1 405B; needs refactoring once fixed. 
- ): - out = torch.empty( - hidden_states.shape[0], - self.intermediate_size, - dtype=hidden_states.dtype, - device="cuda", - ) - _custom_C.LLMM_Silu( - self.gate_up_proj.base_layer.linear.weight, hidden_states, out, 8 - ) - return self.down_proj(out, adapter_data) - else: - gate_up_states = self.gate_up_proj(hidden_states, adapter_data) - gate_up_states = gate_up_states.view(-1, 2, self.intermediate_size) - return self.down_proj( - self.act(gate_up_states[:, 0]) * gate_up_states[:, 1], adapter_data - ) + gate_up_states = self.gate_up_proj(hidden_states, adapter_data) + gate_up_states = gate_up_states.view(-1, 2, self.intermediate_size) + return self.down_proj( + self.act(gate_up_states[:, 0]) * gate_up_states[:, 1], adapter_data + ) class FlashLlamaLayer(nn.Module): @@ -408,7 +390,7 @@ class FlashLlamaLayer(nn.Module): if SparseMoELayer.is_supported(weights) else DenseMoELayer ) - self.dense = Phi3MoE( + self.mlp = Phi3MoE( f"{prefix}.block_sparse_moe", config, moe_layer_cls, weights ) # with moe the layernorms are are not rmsnorms and they have bias @@ -423,7 +405,7 @@ class FlashLlamaLayer(nn.Module): eps=config.rms_norm_eps, ) else: - self.dense = LlamaMLP( + self.mlp = LlamaMLP( prefix=f"{prefix}.mlp", config=config, weights=weights, index=index ) self.input_layernorm = FastRMSNorm.load( @@ -437,6 +419,11 @@ class FlashLlamaLayer(nn.Module): eps=config.rms_norm_eps, ) + # Used in Granite + # This could eventually be baked into the weights like we do for the embeddings/lm_head + # but this would mean modifying the lora code + self.residual_multiplier = getattr(config, "residual_multiplier", None) + def forward( self, hidden_states, @@ -445,12 +432,11 @@ class FlashLlamaLayer(nn.Module): sin, cu_seqlen_prefill, kv_cache, - block_tables, slots, seqlen, - max_s, adapter_data, cross_attention_states, + hpu_attention_meta: Optional[HPUPagedAttentionMetadata], ): normed_hidden_states, res = self.input_layernorm(hidden_states, residual) @@ -461,19 +447,21 @@ class FlashLlamaLayer(nn.Module): sin, cu_seqlen_prefill, kv_cache, - block_tables, slots, seqlen, - max_s, adapter_data, + hpu_attention_meta=hpu_attention_meta, ) + if self.residual_multiplier is not None: + attn_output *= self.residual_multiplier - # faster post attention rms norm normed_attn_res_output, attn_res = self.post_attention_layernorm( attn_output, res ) - mlp_output = self.dense(normed_attn_res_output, adapter_data) + mlp_output = self.mlp(normed_attn_res_output, adapter_data) + if self.residual_multiplier is not None: + mlp_output *= self.residual_multiplier return mlp_output, attn_res @@ -493,9 +481,7 @@ class FlashLlamaModel(torch.nn.Module): self.layers.append( FlashLlamaLayer( index=0, - prefix=( - "model.layers.0" if not prefix else f"{prefix}.model.layers.0" - ), + prefix=f"{prefix}.layers.0", config=config, weights=weights, ) @@ -504,18 +490,14 @@ class FlashLlamaModel(torch.nn.Module): # Skip first and last layers for layer_id in range(1, config.num_hidden_layers - 1): if layer_id in self.cross_attention_layers: - from text_generation_server.models.custom_modeling.mllama import ( + from text_generation_server.models.custom_modeling.flash_mllama import ( FlashLlamaCrossLayer, ) self.layers.append( FlashLlamaCrossLayer( index=layer_id, - prefix=( - f"model.layers.{layer_id}" - if not prefix - else f"{prefix}.model.layers.{layer_id}" - ), + prefix=(f"{prefix}.layers.{layer_id}"), config=config, weights=weights, ) @@ -524,11 +506,7 @@ class FlashLlamaModel(torch.nn.Module): self.layers.append( FlashLlamaLayer( 
index=layer_id, - prefix=( - f"model.layers.{layer_id}" - if not prefix - else f"{prefix}.model.layers.{layer_id}" - ), + prefix=(f"{prefix}.layers.{layer_id}"), config=config, weights=weights, ) @@ -539,18 +517,14 @@ class FlashLlamaModel(torch.nn.Module): self.layers.append( FlashLlamaLayer( index=last_layer_id, - prefix=( - f"model.layers.{last_layer_id}" - if not prefix - else f"{prefix}.model.layers.{last_layer_id}" - ), + prefix=(f"{prefix}.layers.{last_layer_id}"), config=config, weights=weights, ) ) self.norm = FastRMSNorm.load( - prefix="model.norm" if not prefix else f"{prefix}.model.norm", + prefix=f"{prefix}.norm", weights=weights, eps=config.rms_norm_eps, ) @@ -567,22 +541,17 @@ class FlashLlamaModel(torch.nn.Module): position_ids: torch.Tensor, cu_seqlen_prefill: Optional[torch.Tensor], kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], - block_tables: torch.Tensor, slots: torch.Tensor, seqlen: Seqlen, - max_s: int, - true_max_s: int, - prefill_cache_indices: Optional[torch.Tensor], adapter_data, + hpu_attention_meta: Optional[HPUPagedAttentionMetadata], cross_attention_states=None, ) -> torch.Tensor: hidden_states = inputs_embeds # Get rotary cos and sin for this forward # Avoid to index in each layer - cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin( - position_ids, max_s, hidden_states.dtype - ) + cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin(position_ids) residual = None for i, layer in enumerate(self.layers): @@ -593,12 +562,11 @@ class FlashLlamaModel(torch.nn.Module): sin, cu_seqlen_prefill, kv_cache[i], - block_tables, slots, seqlen, - max_s, adapter_data, cross_attention_states, + hpu_attention_meta=hpu_attention_meta, ) hidden_states, _ = self.norm(hidden_states, residual) @@ -607,42 +575,60 @@ class FlashLlamaModel(torch.nn.Module): class FlashLlamaForCausalLM(torch.nn.Module): - def __init__(self, prefix: str, config, weights): + def __init__(self, prefix: str, config, weights, name=None): + if name is None: + name = "model" super().__init__() - with no_fp8(weights): self.embed_tokens = TensorParallelEmbedding( prefix=( - "model.embed_tokens" + f"{name}.embed_tokens" if not prefix - else f"{prefix}.model.embed_tokens" + else f"{prefix}.{name}.embed_tokens" ), weights=weights, ) - self.model = FlashLlamaModel(prefix, config, weights) + self.model = FlashLlamaModel( + prefix=name if not prefix else f"{prefix}.{name}", + config=config, + weights=weights, + ) if config.tie_word_embeddings: suffix = "model.embed_tokens" else: suffix = "lm_head" + # Used in Granite + embedding_multiplier = getattr(config, "embedding_multiplier", None) + if embedding_multiplier is not None: + self.embed_tokens.weight.data *= embedding_multiplier + prefix = suffix if not prefix or name != "model" else f"{prefix}.{suffix}" with no_fp8(weights): self.lm_head = SpeculativeHead.load( config, - prefix=suffix if not prefix else f"{prefix}.{suffix}", - weights=weights, + prefix, + weights, ) + # Used in Granite + self.logits_scaling = getattr(config, "logits_scaling", None) + if self.logits_scaling is not None and self.lm_head.head is not None: + try: + # Scale the weights directly + self.lm_head.head.linear.weight.data /= self.logits_scaling + self.logits_scaled = True + except Exception: + self.logits_scaled = False + def forward( self, input_ids: torch.Tensor, position_ids: torch.Tensor, cu_seqlen_prefill: Optional[torch.Tensor], kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], - block_tables: torch.Tensor, slots: torch.Tensor, seqlen: Seqlen, - max_s: int, - 
prefill_cache_indices: Optional[torch.Tensor] = None, + hpu_attention_meta: Optional[HPUPagedAttentionMetadata], lm_head_indices: Optional[torch.Tensor] = None, adapter_data: Optional[torch.Tensor] = None, cross_attention_states=None, @@ -653,16 +639,20 @@ class FlashLlamaForCausalLM(torch.nn.Module): position_ids, cu_seqlen_prefill, kv_cache, - block_tables, slots, seqlen, - max_s, - true_max_s=max_s, - prefill_cache_indices=prefill_cache_indices, adapter_data=adapter_data, cross_attention_states=cross_attention_states, + hpu_attention_meta=hpu_attention_meta, ) if lm_head_indices is not None: hidden_states = hidden_states[lm_head_indices] logits, speculative_logits = self.lm_head(hidden_states) + + # Used in Granite + if self.logits_scaling is not None and not self.logits_scaled: + logits /= self.logits_scaling + if speculative_logits is not None: + speculative_logits /= self.logits_scaling + return logits, speculative_logits diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_llava_next.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_llava_next.py new file mode 100644 index 000000000..88548042d --- /dev/null +++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_llava_next.py @@ -0,0 +1,285 @@ +# coding=utf-8 +# Copyright 2024 the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch Llava-NeXT model.""" + +from typing import List, Optional, Tuple + +import torch +import torch.utils.checkpoint +from torch import nn + +from transformers.activations import ACT2FN +from transformers.image_processing_utils import select_best_resolution + +from text_generation_server.layers.attention import Seqlen, HPUPagedAttentionMetadata +from text_generation_server.models.custom_modeling.vlm import ( + load_text_model, + load_vision_model, +) +from text_generation_server.layers import ( + TensorParallelColumnLinear, + TensorParallelRowLinear, +) + + +def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size): + """ + Calculate the shape of the image patch grid after the preprocessing for images of any resolution. + + Args: + image_size (`tuple`): + The size of the input image in the format (height, width). + grid_pinpoints (`List`): + A list containing possible resolutions. Each item in the list should be a tuple or list + of the form `(height, width)`. + patch_size (`int`): + The size of each image patch. + + Returns: + tuple: The shape of the image patch grid in the format (height, width). + """ + if not isinstance(grid_pinpoints, list): + raise ValueError("grid_pinpoints should be a list of tuples or lists") + + height, width = select_best_resolution(image_size, grid_pinpoints) + return height // patch_size, width // patch_size + + +def unpad_image(tensor, original_size): + """ + Unpads a PyTorch tensor of a padded and resized image. + + Args: + tensor (`torch.Tensor`): + The image tensor, assumed to be of shape (num_channels, height, width). 
+ original_size (`tuple`): + The original size of the image (height, width). + + Returns: + `torch.Tensor`: The unpadded image tensor. + """ + original_height, original_width = original_size + current_height, current_width = tensor.shape[1:] + + original_aspect_ratio = original_width / original_height + current_aspect_ratio = current_width / current_height + + if original_aspect_ratio > current_aspect_ratio: + scale_factor = current_width / original_width + new_height = int(original_height * scale_factor) + padding = (current_height - new_height) // 2 + unpadded_tensor = tensor[:, padding : current_height - padding, :] + else: + scale_factor = current_height / original_height + new_width = int(original_width * scale_factor) + padding = (current_width - new_width) // 2 + unpadded_tensor = tensor[:, :, padding : current_width - padding] + + return unpadded_tensor + + +# Copied from transformers.models.llava.modeling_llava.LlavaMultiModalProjector with Llava->LlavaNext +class LlavaNextMultiModalProjector(nn.Module): + def __init__(self, prefix, config, weights): + super().__init__() + + self.linear_1 = TensorParallelColumnLinear.load( + prefix=f"{prefix}.linear_1", config=config, weights=weights, bias=True + ) + self.act = ACT2FN[config.projector_hidden_act] + self.linear_2 = TensorParallelRowLinear.load( + prefix=f"{prefix}.linear_2", config=config, weights=weights, bias=True + ) + + def forward(self, image_features): + hidden_states = self.linear_1(image_features) + hidden_states = self.act(hidden_states) + hidden_states = self.linear_2(hidden_states) + return hidden_states + + +class FlashLlavaNextForConditionalGeneration(nn.Module): + def __init__(self, prefix, config, weights): + super().__init__() + config.vision_config.quantize = config.quantize + vision_config = config.vision_config + # Instead of selecting in hidden_states[-2]. + # Instead compute only the n -2 + 1 layers and don't pool + if config.vision_feature_layer < 0: + vision_config.num_hidden_layers += config.vision_feature_layer + 1 + else: + vision_config.num_hidden_layers = config.vision_feature_layer + 1 + self.vision_tower = load_vision_model( + prefix="vision_tower" if not prefix else f"{prefix}.vision_tower", + config=config.vision_config, + weights=weights, + ) + + self.multi_modal_projector = LlavaNextMultiModalProjector( + prefix="multi_modal_projector", config=config, weights=weights + ) + + self.image_newline = weights.get_tensor("image_newline") + + self.vocab_size = config.text_config.vocab_size + self.config = config + config.text_config.quantize = config.quantize + config.text_config.speculator = config.speculator + self.text_model = load_text_model( + prefix="language_model" if not prefix else f"{prefix}.language_model", + config=config.text_config, + weights=weights, + ) + self.pad_token_id = ( + config.pad_token_id if config.pad_token_id is not None else -1 + ) + + def _merge_input_ids_with_image_features( + self, + input_ids: torch.Tensor, + inputs_embeds: torch.Tensor, + image_features: torch.Tensor, + ): + """In place merges in vision_embeddings with inputs_embeds.""" + mask = torch.where(input_ids == self.config.image_token_index) + # Let's pray we have enabled enough slots ! + try: + inputs_embeds[mask] = image_features.view(-1, image_features.shape[-1]) + except Exception as e: + raise RuntimeError( + f"Cannot fill images right now. If error happens at warmup, make sure you have enough `--max-input-tokens` to handle images. 
If error happens at regular runtime, please fill in an issue: {e}" + ) + return inputs_embeds + + def forward( + self, + input_ids: torch.Tensor, + position_ids: torch.Tensor, + cu_seqlen_prefill: Optional[torch.Tensor], + kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], + slots: torch.Tensor, + seqlen: Seqlen, + hpu_attention_meta: Optional[HPUPagedAttentionMetadata], + lm_head_indices: Optional[torch.Tensor] = None, + pixel_values: torch.FloatTensor = None, + # Unused for this model + pixel_attention_mask=None, + image_sizes: Optional[torch.LongTensor] = None, + adapter_data: Optional[torch.Tensor] = None, + image_grid_thw: Optional[torch.LongTensor] = None, + ): + inputs_embeds = self.text_model.embed_tokens(input_ids) + if pixel_values is not None and len(pixel_values) > 0: + # num_special_image_tokens = (input_ids == self.config.image_token_index).sum() + # assert num_special_image_tokens == len(pixel_values), f"Received {num_special_image_tokens} for {len(pixel_values)} images, this is invalid" + # 1. Extract the input embeddings + + # 2. Merge text and images + num_images, num_patches, channels, height, width = pixel_values.shape + pixel_values = pixel_values.view( + num_images * num_patches, channels, height, width + ) + image_features = self.vision_tower(pixel_values) + + # selected_image_feature = image_features.hidden_states[self.config.vision_feature_layer] + # Already done within the clip model + selected_image_feature = image_features.last_hidden_state + + if self.config.vision_feature_select_strategy == "default": + selected_image_feature = selected_image_feature[:, 1:] + elif self.config.vision_feature_select_strategy == "full": + selected_image_feature = selected_image_feature + else: + raise RuntimeError( + f"Strategy `{self.config.vision_feature_select_strategy}` is not supported/valid." + ) + + image_features = self.multi_modal_projector(selected_image_feature) + + # split up image_features for each of the individual images + # hence we get a list of image_features, each of shape (5, num_patches, hidden_size) + # if we assume each image has 5 image features (base image + 4 patches) + split_sizes = [num_patches] * num_images + image_features = torch.split(image_features, split_sizes, dim=0) + + # NOTE we only support multimodal_patch_merge_type == "spatial_unpad" + height = width = ( + self.config.vision_config.image_size + // self.config.vision_config.patch_size + ) + + new_image_features = [] + for image_idx, image_feature in enumerate(image_features): + if image_feature.shape[0] > 1: + base_image_feature = image_feature[0] + image_feature = image_feature[1:] + + if height * width != base_image_feature.shape[0]: + raise ValueError( + "The number of patches is not consistent with the image size." 
+ ) + + # Dimensions are intentionally swapped to be bug-compatible with + # upstream: https://github.com/LLaVA-VL/LLaVA-NeXT/issues/59 + num_patch_width, num_patch_height = get_anyres_image_grid_shape( + image_sizes[image_idx], + self.config.image_grid_pinpoints, + self.config.vision_config.image_size, + ) + image_feature = image_feature.view( + num_patch_height, num_patch_width, height, width, -1 + ) + image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous() + image_feature = image_feature.flatten(1, 2).flatten(2, 3) + image_feature = unpad_image(image_feature, image_sizes[image_idx]) + image_feature = torch.cat( + ( + image_feature, + self.image_newline[:, None, None].expand( + *image_feature.shape[:-1], 1 + ), + ), + dim=-1, + ) + image_feature = image_feature.flatten(1, 2).transpose(0, 1) + image_feature = torch.cat( + (base_image_feature, image_feature), dim=0 + ) + else: + image_feature = image_feature[0] + image_feature = torch.cat( + (image_feature, self.image_newline[None]), dim=0 + ) + new_image_features.append(image_feature) + image_features = torch.stack(new_image_features, dim=0) + + inputs_embeds = self._merge_input_ids_with_image_features( + input_ids, inputs_embeds, image_features + ) + + hidden_states = self.text_model.model( + inputs_embeds=inputs_embeds, + position_ids=position_ids, + cu_seqlen_prefill=cu_seqlen_prefill, + kv_cache=kv_cache, + slots=slots, + seqlen=seqlen, + hpu_attention_meta=hpu_attention_meta, + adapter_data=adapter_data, + ) + if lm_head_indices is not None: + hidden_states = hidden_states[lm_head_indices] + logits, speculative_logits = self.text_model.lm_head(hidden_states) + return logits, speculative_logits diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py index 341a23524..d23d4f679 100644 --- a/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py +++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py @@ -26,12 +26,12 @@ from transformers.activations import ACT2FN from transformers.configuration_utils import PretrainedConfig from typing import Optional, List, Tuple -from text_generation_server.utils.import_utils import SYSTEM +from text_generation_server.layers.attention.kv_cache import get_kv_scales from text_generation_server.layers.attention import ( paged_attention, attention, - reshape_and_cache, Seqlen, + HPUPagedAttentionMetadata, ) from text_generation_server.layers import ( TensorParallelRowLinear, @@ -41,20 +41,12 @@ from text_generation_server.layers import ( TensorParallelMultiAdapterLinear, TensorParallelAdapterRowLinear, ) -from text_generation_server.layers.attention import PREFILL_IN_KV_CACHE from text_generation_server.layers.rotary import PositionRotaryEmbedding from text_generation_server.layers.layernorm import ( FastRMSNorm, ) -if SYSTEM == "rocm": - try: - from vllm import _custom_C - except Exception as e: - raise ImportError(f"Could not load `vllm._custom_C`. 
Full error: {e}") - - class MistralConfig(PretrainedConfig): model_type = "mistral" @@ -160,6 +152,7 @@ class MistralAttention(torch.nn.Module): ], process_group=weights.process_group, ) + self.kv_scales = get_kv_scales(weights, f"{prefix}") o_proj = TensorParallelRowLinear.load( config, @@ -185,12 +178,10 @@ class MistralAttention(torch.nn.Module): sin, cu_seqlen_prefill, kv_cache, - block_tables, slots, seqlen, - max_s, - prefill_cache_indices, adapter_data, + hpu_attention_meta, ): qkv = self.query_key_value(hidden_states, adapter_data) query, kv = qkv.split( @@ -205,38 +196,36 @@ class MistralAttention(torch.nn.Module): self.rotary_emb(query, torch.select(kv, dim=1, index=0), cos, sin) - if prefill_cache_indices is not None: - kv_to_cache = kv[prefill_cache_indices] - else: - kv_to_cache = kv - - reshape_and_cache( - kv_to_cache[:, 0], kv_to_cache[:, 1], kv_cache[0], kv_cache[1], slots + kv_cache.store( + key=kv[:, 0], + value=kv[:, 1], + slots=slots, + kv_scales=self.kv_scales, ) # Prefill if cu_seqlen_prefill is not None: - # flash attention + # sdpa attn_output = attention( - query, - kv_cache[0] if PREFILL_IN_KV_CACHE else kv_to_cache[:, 0], - kv_cache[1] if PREFILL_IN_KV_CACHE else kv_to_cache[:, 1], - seqlen, - block_tables, - self.softmax_scale, + query=query, + key=kv[:, 0], + value=kv[:, 1], + kv_cache=kv_cache, + kv_scales=self.kv_scales, + seqlen=seqlen, + softmax_scale=self.softmax_scale, window_size_left=self.max_past, ) # Decode else: attn_output = paged_attention( query, - kv_cache[0], - kv_cache[1], + kv_cache, self.kv_head_mapping, self.softmax_scale, - block_tables, seqlen, - max_s, + kv_scales=self.kv_scales, + hpu_attention_meta=hpu_attention_meta, ) return self.o_proj( @@ -300,29 +289,11 @@ class MistralMLP(nn.Module): self.quantize = config.quantize def forward(self, hidden_states, adapter_data): - if ( - SYSTEM == "rocm" - and self.hidden_act == "silu" - and hidden_states.dtype == torch.float16 - and hidden_states.shape[0] == 1 - and not self.quantize - ): - out = torch.empty( - hidden_states.shape[0], - self.intermediate_size, - dtype=hidden_states.dtype, - device="cuda", - ) - _custom_C.LLMM_Silu( - self.gate_up_proj.base_layer.linear.weight, hidden_states, out, 8 - ) - return self.down_proj(out, adapter_data) - else: - gate_up_states = self.gate_up_proj(hidden_states, adapter_data) - gate_up_states = gate_up_states.view(-1, 2, self.intermediate_size) - return self.down_proj( - self.act(gate_up_states[:, 0]) * gate_up_states[:, 1], adapter_data - ) + gate_up_states = self.gate_up_proj(hidden_states, adapter_data) + gate_up_states = gate_up_states.view(-1, 2, self.intermediate_size) + return self.down_proj( + self.act(gate_up_states[:, 0]) * gate_up_states[:, 1], adapter_data + ) class MistralLayer(nn.Module): @@ -355,12 +326,10 @@ class MistralLayer(nn.Module): sin, cu_seqlen_prefill, kv_cache, - block_tables, slots, seqlen, - max_s, - prefill_cache_indices, adapter_data, + hpu_attention_meta, ): normed_hidden_states, res = self.input_layernorm(hidden_states, residual) @@ -371,12 +340,10 @@ class MistralLayer(nn.Module): sin, cu_seqlen_prefill, kv_cache, - block_tables, slots, seqlen, - max_s, - prefill_cache_indices, adapter_data, + hpu_attention_meta, ) # faster post attention rms norm @@ -423,20 +390,15 @@ class MistralModel(torch.nn.Module): position_ids: torch.Tensor, cu_seqlen_prefill: Optional[torch.Tensor], kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], - block_tables: torch.Tensor, slots: torch.Tensor, seqlen: Seqlen, - max_s: int, - true_max_s: 
int, - prefill_cache_indices: Optional[torch.Tensor], + hpu_attention_meta: Optional[HPUPagedAttentionMetadata], adapter_data: Optional[torch.Tensor] = None, ): hidden_states = inputs_embeds # Get rotary cos and sin for this forward # Avoid to index in each layer - cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin( - position_ids, true_max_s, hidden_states.dtype - ) + cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin(position_ids) residual = None for i, layer in enumerate(self.layers): @@ -447,12 +409,10 @@ class MistralModel(torch.nn.Module): sin, cu_seqlen_prefill, kv_cache[i], - block_tables, slots, seqlen, - max_s, - prefill_cache_indices, adapter_data, + hpu_attention_meta, ) hidden_states, _ = self.norm(hidden_states, residual) @@ -498,35 +458,21 @@ class FlashMistralForCausalLM(torch.nn.Module): position_ids: torch.Tensor, cu_seqlen_prefill: Optional[torch.Tensor], kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], - block_tables: torch.Tensor, slots: torch.Tensor, seqlen: Seqlen, - max_s: int, - prefill_cache_indices: Optional[torch.Tensor], + hpu_attention_meta: Optional[HPUPagedAttentionMetadata], lm_head_indices: Optional[torch.Tensor] = None, adapter_data: Optional[torch.Tensor] = None, ) -> torch.Tensor: - true_max_s = max_s - if prefill_cache_indices is not None: - # Slots also need to be sliced as it has the same size as the whole kv tensor - slots = slots[prefill_cache_indices] - elif self.max_past is not None: - # Clamp in decode mode as paged attention requires clamped values whereas the flash attention - # kernel requires the true values - seqlen = seqlen.clamp(max=self.max_past_tensor) - inputs_embeds = self.embed_tokens(input_ids) hidden_states = self.model( inputs_embeds, position_ids, cu_seqlen_prefill, kv_cache, - block_tables, slots, seqlen, - max_s, - true_max_s, - prefill_cache_indices, + hpu_attention_meta, adapter_data, ) if lm_head_indices is not None: diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py index 5836d30af..1ef6be481 100644 --- a/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py +++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py @@ -37,9 +37,9 @@ from text_generation_server.layers.attention import ( Seqlen, attention, paged_attention, - reshape_and_cache, + HPUPagedAttentionMetadata, ) -from text_generation_server.layers.attention import PREFILL_IN_KV_CACHE +from text_generation_server.layers.attention.kv_cache import get_kv_scales from text_generation_server.layers.layernorm import FastRMSNorm from text_generation_server.layers.moe import DenseMoELayer, MoELayer, SparseMoELayer from text_generation_server.layers.rotary import PositionRotaryEmbedding @@ -215,6 +215,7 @@ class MixtralAttention(torch.nn.Module): ) self.query_key_value = load_attention(config, prefix, weights) + self.kv_scales = get_kv_scales(weights, f"{prefix}") self.o_proj = TensorParallelRowLinear.load( config, @@ -234,11 +235,9 @@ class MixtralAttention(torch.nn.Module): sin, cu_seqlen_prefill, kv_cache, - block_tables, slots, seqlen, - max_s, - prefill_cache_indices, + hpu_attention_meta, ): qkv = self.query_key_value(hidden_states) query, kv = qkv.split( @@ -253,38 +252,36 @@ class MixtralAttention(torch.nn.Module): self.rotary_emb(query, torch.select(kv, dim=1, index=0), cos, sin) - if prefill_cache_indices is not 
None: - kv_to_cache = kv[prefill_cache_indices] - else: - kv_to_cache = kv - - reshape_and_cache( - kv_to_cache[:, 0], kv_to_cache[:, 1], kv_cache[0], kv_cache[1], slots + kv_cache.store( + key=kv[:, 0], + value=kv[:, 1], + slots=slots, + kv_scales=self.kv_scales, ) # Prefill if cu_seqlen_prefill is not None: - # flash attention + # sdpa attn_output = attention( - query, - kv_cache[0] if PREFILL_IN_KV_CACHE else kv_to_cache[:, 0], - kv_cache[1] if PREFILL_IN_KV_CACHE else kv_to_cache[:, 1], - seqlen, - block_tables, - self.softmax_scale, + query=query, + key=kv[:, 0], + value=kv[:, 1], + kv_cache=kv_cache, + kv_scales=self.kv_scales, + seqlen=seqlen, + softmax_scale=self.softmax_scale, window_size_left=self.max_past, ) # Decode else: attn_output = paged_attention( query, - kv_cache[0], - kv_cache[1], + kv_cache, self.kv_head_mapping, self.softmax_scale, - block_tables, seqlen, - max_s, + kv_scales=self.kv_scales, + hpu_attention_meta=hpu_attention_meta, ) return self.o_proj(attn_output.view(-1, self.num_heads * self.head_size)) @@ -378,11 +375,9 @@ class MixtralLayer(nn.Module): sin, cu_seqlen_prefill, kv_cache, - block_tables, slots, seqlen, - max_s, - prefill_cache_indices, + hpu_attention_meta, ): normed_hidden_states, res = self.input_layernorm(hidden_states, residual) @@ -393,11 +388,9 @@ class MixtralLayer(nn.Module): sin, cu_seqlen_prefill, kv_cache, - block_tables, slots, seqlen, - max_s, - prefill_cache_indices, + hpu_attention_meta, ) # faster post attention rms norm @@ -448,20 +441,15 @@ class MixtralModel(torch.nn.Module): position_ids: torch.Tensor, cu_seqlen_prefill: Optional[torch.Tensor], kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], - block_tables: torch.Tensor, slots: torch.Tensor, seqlen: Seqlen, - max_s: int, - true_max_s: int, - prefill_cache_indices: Optional[torch.Tensor], + hpu_attention_meta: Optional[HPUPagedAttentionMetadata], ) -> torch.Tensor: hidden_states = self.embed_tokens(input_ids) # Get rotary cos and sin for this forward # Avoid to index in each layer - cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin( - position_ids, true_max_s, hidden_states.dtype - ) + cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin(position_ids) residual = None for i, layer in enumerate(self.layers): @@ -472,11 +460,9 @@ class MixtralModel(torch.nn.Module): sin, cu_seqlen_prefill, kv_cache[i], - block_tables, slots, seqlen, - max_s, - prefill_cache_indices, + hpu_attention_meta, ) hidden_states, _ = self.norm(hidden_states, residual) @@ -507,34 +493,21 @@ class FlashMixtralForCausalLM(torch.nn.Module): position_ids: torch.Tensor, cu_seqlen_prefill: Optional[torch.Tensor], kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], - block_tables: torch.Tensor, slots: torch.Tensor, seqlen: Seqlen, - max_s: int, - prefill_cache_indices: Optional[torch.Tensor], + hpu_attention_meta: Optional[HPUPagedAttentionMetadata], lm_head_indices: Optional[torch.Tensor] = None, adapter_data: Optional[torch.Tensor] = None, ) -> torch.Tensor: - true_max_s = max_s - if prefill_cache_indices is not None: - # Slots also need to be sliced as it has the same size as the whole kv tensor - slots = slots[prefill_cache_indices] - elif self.max_past is not None: - # Clamp in decode mode as paged attention requires clamped values whereas the flash attention - # kernel requires the true values - seqlen = seqlen.clamp(max=self.max_past_tensor) hidden_states = self.model( input_ids, position_ids, cu_seqlen_prefill, kv_cache, - block_tables, slots, seqlen, - max_s, - true_max_s, - 
prefill_cache_indices, + hpu_attention_meta, ) if lm_head_indices is not None: hidden_states = hidden_states[lm_head_indices] diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_mllama.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_mllama.py new file mode 100644 index 000000000..216642e08 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_mllama.py @@ -0,0 +1,986 @@ +# coding=utf-8 +# Copyright 2024 the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch Mllama model.""" + +from typing import Optional, Tuple, List + +import torch +import torch.utils.checkpoint +from torch import nn + +from transformers.activations import ACT2FN +import torch.nn.functional as F + +from text_generation_server.layers import ( + TensorParallelColumnLinear, + TensorParallelEmbedding, + TensorParallelRowLinear, + FastLinear, +) +from text_generation_server.layers.attention import ( + Seqlen, + HPUPagedAttentionMetadata, +) +from text_generation_server.models.custom_modeling.flash_llama_modeling import ( + FlashLlamaForCausalLM, +) +from habana_frameworks.torch.hpex.kernels import FusedSDPA +from vllm_hpu_extension.utils import ModuleFusedSDPA + + +def _prepare_aspect_ratio_attention_mask( + aspect_ratio_mask: torch.Tensor, + num_patches: int, + target_length: int, + dtype: torch.dtype, +) -> torch.Tensor: + # Expand aspect ratio mask to target_length + batch_size, max_num_tiles = aspect_ratio_mask.shape + attention_mask = aspect_ratio_mask.view(batch_size, max_num_tiles, 1, 1).to(dtype) + attention_mask = attention_mask.repeat(1, 1, target_length, 1) + + # Mask padding patches + pad_patches = target_length - num_patches + attention_mask[:, :, -pad_patches:] = 0 + + # Invert the mask (0 -> 1, 1 -> 0) + attention_mask = 1 - attention_mask + + # Reshape to 2D and create 4D attention mask + # (batch_size, 1, max_num_tiles * target_length, max_num_tiles * target_length) + attention_mask = attention_mask.reshape( + batch_size, max_num_tiles * target_length, 1 + ) + attention_mask = ( + attention_mask @ attention_mask.transpose(-1, -2) * torch.finfo(dtype).min + ) + attention_mask = attention_mask.unsqueeze(1) + + return attention_mask + + +# Copied from transformers.models.llama.modeling_llama._prepare_4d_causal_attention_mask_with_cache_position +def _prepare_4d_causal_attention_mask_with_cache_position( + attention_mask: torch.Tensor, + sequence_length: int, + target_length: int, + dtype: torch.dtype, + device: torch.device, + min_dtype: float, + cache_position: torch.Tensor, + batch_size: int, +): + """ + Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape + `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing. 
+ + Args: + attention_mask (`torch.Tensor`): + A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`. + sequence_length (`int`): + The sequence length being processed. + target_length (`int`): + The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet. + dtype (`torch.dtype`): + The dtype to use for the 4D attention mask. + device (`torch.device`): + The device to plcae the 4D attention mask on. + min_dtype (`float`): + The minimum value representable with the dtype `dtype`. + cache_position (`torch.Tensor`): + Indices depicting the position of the input sequence tokens in the sequence. + batch_size (`torch.Tensor`): + Batch size. + """ + if attention_mask is not None and attention_mask.dim() == 4: + # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing. + causal_mask = attention_mask + else: + causal_mask = torch.full( + (sequence_length, target_length), + fill_value=min_dtype, + dtype=dtype, + device=device, + ) + if sequence_length != 1: + causal_mask = torch.triu(causal_mask, diagonal=1) + causal_mask *= torch.arange( + target_length, device=device + ) > cache_position.reshape(-1, 1) + causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) + if attention_mask is not None: + causal_mask = ( + causal_mask.clone() + ) # copy to contiguous memory for in-place edit + mask_length = attention_mask.shape[-1] + padding_mask = ( + causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :] + ) + padding_mask = padding_mask == 0 + causal_mask[:, :, :, :mask_length] = causal_mask[ + :, :, :, :mask_length + ].masked_fill(padding_mask, min_dtype) + + return causal_mask + + +def _prepare_cross_attention_mask( + cross_attention_mask: torch.Tensor, + num_vision_tokens: int, + dtype: str, +) -> Tuple[torch.Tensor, torch.Tensor]: + # reshape so it can be used by attn module + batch_size, text_total_length, *_ = cross_attention_mask.shape + cross_attention_mask = cross_attention_mask.repeat_interleave( + num_vision_tokens, dim=3 + ) + cross_attention_mask = cross_attention_mask.view(batch_size, text_total_length, -1) + cross_attention_mask = cross_attention_mask.unsqueeze(1) + + # invert the mask + inverted_cross_attn_mask = (1.0 - cross_attention_mask).to(dtype) + cross_attention_mask = inverted_cross_attn_mask.masked_fill( + inverted_cross_attn_mask.to(torch.bool), torch.finfo(dtype).min + ) + + # apply full-row bias, which return 4D tensor of shape [B, H, S1, 1] where value is 0 if the a full row in cross attn mask's + # last dimension contains negative infinity values, otherwise it's 1 + negative_inf_value = torch.finfo(dtype).min + full_text_row_masked_out_mask = ( + (cross_attention_mask != negative_inf_value) + .any(dim=-1) + .type_as(cross_attention_mask)[..., None] + ) + cross_attention_mask *= full_text_row_masked_out_mask + + return cross_attention_mask, full_text_row_masked_out_mask + + +# Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->MllamaVision +class MllamaVisionMLP(nn.Module): + def __init__(self, *, prefix, config, weights): + super().__init__() + self.config = config + self.activation_fn = ACT2FN[config.hidden_act] + self.fc1 = TensorParallelColumnLinear.load( + prefix=f"{prefix}.fc1", weights=weights, config=config, bias=True + ) + self.fc2 = TensorParallelRowLinear.load( + 
prefix=f"{prefix}.fc2", weights=weights, config=config, bias=True + ) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states = self.fc2(hidden_states) + return hidden_states + + +class MllamaVisionSdpaAttention(nn.Module): + def __init__(self, *, prefix, config, weights): + super().__init__() + + self.embed_dim = config.hidden_size + self.head_dim = config.hidden_size // config.attention_heads + self.num_heads = config.attention_heads // weights.process_group.size() + + self.qkv_proj = TensorParallelColumnLinear.load_multi( + config, + prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"], + dim=0, + weights=weights, + bias=False, + ) + self.o_proj = TensorParallelRowLinear.load( + config, + prefix=f"{prefix}.o_proj", + weights=weights, + bias=False, + ) + + def forward( + self, + hidden_state: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + qkv = self.qkv_proj(hidden_state) + query, key, value = qkv.split( + [ + self.head_dim * self.num_heads, + self.head_dim * self.num_heads, + self.head_dim * self.num_heads, + ], + dim=2, + ) + + batch_size, q_seq_len, _ = query.shape + _, kv_seq_len, _ = key.shape + + query = query.view(batch_size, q_seq_len, self.num_heads, self.head_dim) + key = key.view(batch_size, kv_seq_len, self.num_heads, self.head_dim) + value = value.view(batch_size, kv_seq_len, self.num_heads, self.head_dim) + + query = query.transpose(1, 2) + key = key.transpose(1, 2) + value = value.transpose(1, 2) + + attn_output = F.scaled_dot_product_attention( + query, key, value, attn_mask=attention_mask + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(batch_size, q_seq_len, -1) + + output = self.o_proj(attn_output) + return output + + +class MllamaVisionEncoderLayer(nn.Module): + def __init__(self, *, prefix, config, weights, is_gated: bool): + super().__init__() + + self.hidden_size = config.hidden_size + self.num_attention_heads = config.attention_heads + self.is_gated = is_gated + self.intermediate_size = config.intermediate_size + + self.self_attn = MllamaVisionSdpaAttention( + prefix=f"{prefix}.self_attn", config=config, weights=weights + ) + self.mlp = MllamaVisionMLP( + prefix=f"{prefix}.mlp", config=config, weights=weights + ) + + self.input_layernorm = nn.LayerNorm.load( + prefix=f"{prefix}.input_layernorm", weights=weights, eps=1e-05 + ) + self.post_attention_layernorm = nn.LayerNorm.load( + prefix=f"{prefix}.post_attention_layernorm", weights=weights, eps=1e-05 + ) + + # there used to be an if else here, no code path + if is_gated: + self.gate_attn = nn.Parameter( + weights.get_tensor(f"{prefix}.gate_attn"), requires_grad=False + ) + self.gate_ffn = nn.Parameter( + weights.get_tensor(f"{prefix}.gate_ffn"), requires_grad=False + ) + + def forward( + self, + hidden_state: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + ): + # Self Attention + residual = hidden_state + hidden_state = self.input_layernorm(hidden_state) + hidden_state = self.self_attn(hidden_state, attention_mask=attention_mask) + gate_attn = 1 if not self.is_gated else self.gate_attn.tanh() + hidden_state = residual + gate_attn * hidden_state + + # Feed forward + residual = hidden_state + hidden_state = self.post_attention_layernorm(hidden_state) + hidden_state = self.mlp(hidden_state) + gate_ffn = 1 if not self.is_gated else self.gate_ffn.tanh() + hidden_state = residual 
+ gate_ffn * hidden_state + return hidden_state + + +class MllamaVisionEncoder(nn.Module): + def __init__(self, *, prefix, config, weights, is_gated: bool, num_layers: int): + super().__init__() + self.config = config + self.layers = [ + MllamaVisionEncoderLayer( + prefix=f"{prefix}.layers.{i}", + config=config, + weights=weights, + is_gated=is_gated, + ) + for i in range(num_layers) + ] + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + ): + encoder_states = [hidden_states] + for encoder_layer in self.layers: + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + ) + + hidden_states = layer_outputs + encoder_states.append(hidden_states) + + return hidden_states, encoder_states + + +class MllamaPrecomputedAspectRatioEmbedding(nn.Module): + def __init__(self, *, prefix, config, weights): + super().__init__() + self.max_num_tiles = config.max_num_tiles + self.hidden_size = config.hidden_size + self.max_aspect_ratio_id = config.max_aspect_ratio_id + + self.embedding = TensorParallelEmbedding( + prefix=f"{prefix}.embedding", weights=weights + ) + self.gate = nn.Parameter( + weights.get_tensor(f"{prefix}.gate"), requires_grad=False + ) + + def forward( + self, hidden_state: torch.Tensor, aspect_ratio_ids: torch.Tensor + ) -> torch.Tensor: + embeddings = self.embedding(aspect_ratio_ids) + embeddings = embeddings.reshape(-1, self.max_num_tiles, 1, self.hidden_size) + + # Always gated. + embeddings = embeddings * self.gate.tanh() + + hidden_state = hidden_state + embeddings + return hidden_state + + +class MllamaPrecomputedPositionEmbedding(nn.Module): + def __init__(self, *, prefix, config, weights): + super().__init__() + self.max_num_tiles = config.max_num_tiles + self.max_aspect_ratio_id = config.max_aspect_ratio_id + self.num_patches = (config.image_size // config.patch_size) ** 2 + 1 + self.hidden_size = config.hidden_size + self.scale = config.hidden_size**-0.5 + + self.gate = nn.Parameter( + weights.get_tensor(f"{prefix}.gate"), requires_grad=False + ) + + # position embedding + embedding = nn.Parameter( + weights.get_tensor(f"{prefix}.embedding"), requires_grad=False + ) + self.gated_position_embedding = (1 - self.gate.tanh()) * embedding + self.tile_embedding = TensorParallelEmbedding( + prefix=f"{prefix}.tile_embedding", weights=weights + ) + + def forward( + self, hidden_state: torch.Tensor, aspect_ratio_ids: torch.Tensor + ) -> torch.Tensor: + # position embeddings + hidden_state = hidden_state + self.gated_position_embedding.view( + 1, 1, self.num_patches, self.hidden_size + ) + + # precomputed tile position embeddings + tile_position_embedding = self.tile_embedding(aspect_ratio_ids) + batch_size = hidden_state.shape[0] + tile_position_embedding = tile_position_embedding.reshape( + batch_size, self.max_num_tiles, self.num_patches, self.hidden_size + ) + gated_tile_position_embedding = self.gate.tanh() * tile_position_embedding + hidden_state = hidden_state + gated_tile_position_embedding + + return hidden_state + + +class MllamaVisionModel(nn.Module): + def __init__(self, *, prefix, config, weights): + super().__init__() + self.image_size = config.image_size + self.patch_size = config.patch_size + self.max_num_tiles = config.max_num_tiles + self.hidden_size = config.hidden_size + self.num_channels = config.num_channels + self.intermediate_layers_indices = config.intermediate_layers_indices + + self.num_patches = (self.image_size // self.patch_size) ** 2 + 1 + self.scale = config.hidden_size**-0.5 + self.dtype = 
weights.dtype + + self.patch_embedding = nn.Conv2d( + in_channels=config.num_channels, + out_channels=self.hidden_size, + kernel_size=self.patch_size, + stride=self.patch_size, + padding="valid", + bias=False, + ) + self.patch_embedding.weight = nn.Parameter( + weights.get_tensor(f"{prefix}.patch_embedding.weight"), requires_grad=False + ) + + self.class_embedding = nn.Parameter( + weights.get_tensor(f"{prefix}.class_embedding"), requires_grad=False + ) + + self.gated_positional_embedding = MllamaPrecomputedPositionEmbedding( + prefix=f"{prefix}.gated_positional_embedding", + config=config, + weights=weights, + ) + + self.pre_tile_positional_embedding = MllamaPrecomputedAspectRatioEmbedding( + prefix=f"{prefix}.pre_tile_positional_embedding", + config=config, + weights=weights, + ) + self.post_tile_positional_embedding = MllamaPrecomputedAspectRatioEmbedding( + prefix=f"{prefix}.post_tile_positional_embedding", + config=config, + weights=weights, + ) + + ## layer norms + self.layernorm_pre = nn.LayerNorm.load( + prefix=f"{prefix}.layernorm_pre", + weights=weights, + # torch default + eps=1e-05, + ) + self.layernorm_post = nn.LayerNorm.load( + prefix=f"{prefix}.layernorm_post", + weights=weights, + # torch default + eps=1e-05, + ) + + ## encoders + self.transformer = MllamaVisionEncoder( + prefix=f"{prefix}.transformer", + config=config, + weights=weights, + is_gated=False, + num_layers=config.num_hidden_layers, + ) + self.global_transformer = MllamaVisionEncoder( + prefix=f"{prefix}.global_transformer", + config=config, + weights=weights, + is_gated=True, + num_layers=config.num_global_layers, + ) + + def apply_class_embedding(self, hidden_state: torch.Tensor) -> torch.Tensor: + batch_size, _, hidden_size = hidden_state.shape + class_embedding = self.class_embedding.expand(batch_size, 1, hidden_size) + hidden_state = torch.cat([class_embedding, hidden_state], dim=1) + return hidden_state + + def forward( + self, + pixel_values: torch.Tensor, + aspect_ratio_ids: torch.Tensor, + attention_mask: torch.Tensor, + ) -> torch.Tensor: + ( + batch_size, + num_concurrent_media, + num_tiles, + num_channels, + height, + width, + ) = pixel_values.shape + + pixel_values = pixel_values.reshape( + batch_size * num_concurrent_media * num_tiles, num_channels, height, width + ) + aspect_ratio_ids = aspect_ratio_ids.reshape( + batch_size * num_concurrent_media, -1 + ) + + # patch embedding + patch_embeds = self.patch_embedding(pixel_values) + hidden_state = patch_embeds.flatten(2).transpose(1, 2) + + # tile embeddings + _, num_patches, dim = hidden_state.shape + hidden_state = hidden_state.reshape( + batch_size * num_concurrent_media, num_tiles, -1, dim + ) + hidden_state = self.pre_tile_positional_embedding( + hidden_state, aspect_ratio_ids + ) + + # apply cls token + hidden_state = hidden_state.reshape( + batch_size * num_concurrent_media * num_tiles, num_patches, dim + ) + hidden_state = self.apply_class_embedding(hidden_state) + num_patches += 1 + + # apply position embeddings + hidden_state = hidden_state.reshape( + batch_size * num_concurrent_media, num_tiles, num_patches, dim + ) + hidden_state = self.gated_positional_embedding(hidden_state, aspect_ratio_ids) + + # apply encoder + hidden_state = self.layernorm_pre(hidden_state) + + # Compute the number of tokens to pad + num_padding_patches = (8 - (hidden_state.shape[-2] % 8)) % 8 + # Compute padding tuple for pad function + padding = ( + 0, + 0, + 0, + num_padding_patches, + ) # (pad_left, pad_right, pad_left for dim -2, pad_right for dim -2) + # Pad 
the tensor + hidden_state = F.pad(hidden_state, padding, mode="constant", value=0) + slice_index = -num_padding_patches if num_padding_patches > 0 else None + + if attention_mask is not None: + attention_mask = attention_mask.reshape( + batch_size * num_concurrent_media, -1 + ) + attention_mask = _prepare_aspect_ratio_attention_mask( + aspect_ratio_mask=attention_mask, + num_patches=self.num_patches, + target_length=hidden_state.shape[2], + dtype=self.dtype, + ) + + hidden_state = hidden_state.view(batch_size * num_concurrent_media, -1, dim) + hidden_state, all_intermediate_hidden_states = self.transformer( + hidden_state, + attention_mask=attention_mask, + ) + intermediate_hidden_states = [ + hidden_state + for idx, hidden_state in enumerate(all_intermediate_hidden_states) + if idx in self.intermediate_layers_indices + ] + intermediate_hidden_states = torch.stack(intermediate_hidden_states, dim=-1) + + # apply global encoder + hidden_state = self.layernorm_post(hidden_state) + hidden_state = hidden_state.reshape( + batch_size * num_concurrent_media, + num_tiles, + num_patches + num_padding_patches, + dim, + ) + hidden_state = self.post_tile_positional_embedding( + hidden_state, aspect_ratio_ids + ) + hidden_state = hidden_state.reshape( + batch_size * num_concurrent_media, + num_tiles * (num_patches + num_padding_patches), + dim, + ) + hidden_state, _ = self.global_transformer( + hidden_state, attention_mask=attention_mask + ) + hidden_state = hidden_state.reshape( + batch_size * num_concurrent_media, + num_tiles, + num_patches + num_padding_patches, + dim, + ) + hidden_state = hidden_state[:, :, :slice_index] + + # adding intermediate layer outputs + hidden_state = hidden_state.reshape( + batch_size, num_concurrent_media, num_tiles, num_patches, dim + ) + intermediate_hidden_states = intermediate_hidden_states.reshape( + batch_size * num_concurrent_media, + num_tiles, + num_patches + num_padding_patches, + -1, + ) + intermediate_hidden_states = intermediate_hidden_states[:, :, :slice_index] + intermediate_hidden_states = intermediate_hidden_states.reshape( + batch_size, num_concurrent_media, num_tiles, num_patches, -1 + ) + hidden_state = torch.cat([hidden_state, intermediate_hidden_states], dim=-1) + return hidden_state + + +class MllamaTextCrossAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, *, prefix, config, weights, layer_idx): + super().__init__() + self.config = config + self.num_heads = self.config.num_attention_heads + self.num_key_value_heads = self.config.num_key_value_heads + self.dropout = config.dropout + self.hidden_size = config.hidden_size + self.head_size = config.hidden_size // self.num_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.layer_idx = layer_idx + + self.num_heads = self.num_heads // weights.process_group.size() + self.num_key_value_heads = ( + self.num_key_value_heads // weights.process_group.size() + ) + + self.q_proj = TensorParallelColumnLinear.load( + config, + prefix=f"{prefix}.q_proj", + weights=weights, + bias=False, + ) + self.k_proj = TensorParallelColumnLinear.load( + config, + prefix=f"{prefix}.k_proj", + weights=weights, + bias=False, + ) + self.v_proj = TensorParallelColumnLinear.load( + config, + prefix=f"{prefix}.v_proj", + weights=weights, + bias=False, + ) + self.o_proj = TensorParallelRowLinear.load( + config, + prefix=f"{prefix}.o_proj", + weights=weights, + bias=False, + ) + + self.q_norm = MllamaTextRMSNorm.load( + 
prefix=f"{prefix}.q_norm", weights=weights, eps=config.rms_norm_eps + ) + self.k_norm = MllamaTextRMSNorm.load( + prefix=f"{prefix}.k_norm", weights=weights, eps=config.rms_norm_eps + ) + self.softmax_scale = self.head_size**-0.5 + + def forward( + self, + hidden_states: torch.Tensor, + cross_attention_states: Optional[torch.Tensor] = None, + # past_key_value=None, + # attention_mask: Optional[torch.Tensor] = None, + # cache_position: Optional[torch.LongTensor] = None, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + # hidden_states = hidden_states.unsqueeze(0) + # bsz, q_len, _ = hidden_states.size() + ( + cross_attention_states, + cu_seqlen_q, + cu_seqlen_k, + indices, + ) = cross_attention_states + bs = cu_seqlen_q.size(0) - 1 + query_states = self.q_proj(hidden_states) + query_states = query_states.view(bs, -1, self.num_heads, self.head_size) + query_states = self.q_norm(query_states) + + key_states = self.k_proj(cross_attention_states) + value_states = self.v_proj(cross_attention_states) + key_states = key_states.view(bs, -1, self.num_key_value_heads, self.head_size) + value_states = value_states.view( + bs, -1, self.num_key_value_heads, self.head_size + ) + key_states = self.k_norm(key_states) + + # key_states = key_states.repeat(1, self.num_key_value_groups, 1) + # value_states = value_states.repeat(1, self.num_key_value_groups, 1) + + causal = False + # logger.info( + # f"Q: {query_states.shape} -K {key_states.shape} - V{value_states.shape}" + # ) + # execute sdpa + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + fsdpa_op = ModuleFusedSDPA(FusedSDPA) + attn_output = fsdpa_op( + query_states, + key_states, + value_states, + attn_mask=None, + dropout_p=0.0, + is_causal=causal, + scale=None, + softmax_mode="None", + recompute_mode=None, + valid_sequence_lengths=None, + ) + attn_output = attn_output.transpose(1, 2).squeeze(0).contiguous() + attn_output = self.o_proj(attn_output.view(-1, self.num_heads * self.head_size)) + + return attn_output + + +# Copied from transformers.models.gemma2.modeling_gemma2.Gemma2MLP with Gemma2->MllamaText +class MllamaTextMLP(nn.Module): + def __init__(self, *, prefix, config, weights): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = ( + config.intermediate_size // weights.process_group.size() + ) + self.gate_up_proj = TensorParallelColumnLinear.load_multi( + config, + prefixes=[f"{prefix}.gate_proj", f"{prefix}.up_proj"], + weights=weights, + dim=0, + bias=False, + ) + self.down_proj = TensorParallelRowLinear.load( + config, + prefix=f"{prefix}.down_proj", + weights=weights, + bias=False, + ) + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + shape = x.shape + gate_up_states = self.gate_up_proj(x) + gate_up_states = gate_up_states.view(*shape[:-1], 2, self.intermediate_size) + result = self.down_proj( + self.act_fn(gate_up_states[:, 0]) * gate_up_states[:, 1] + ) + return result + + +class FlashLlamaCrossLayer(torch.nn.Module): + """Cross-attention transformer block with tanh-gated attention and feedforward.""" + + def __init__(self, *, prefix, config, weights, index) -> None: + layer_idx = index + super().__init__() + self.cross_attn = MllamaTextCrossAttention( + prefix=f"{prefix}.cross_attn", + config=config, + weights=weights, + layer_idx=layer_idx, + ) + + self.input_layernorm = MllamaTextRMSNorm.load( + 
prefix=f"{prefix}.input_layernorm", weights=weights, eps=config.rms_norm_eps + ) + self.cross_attn_attn_gate = torch.nn.Parameter( + weights.get_tensor(f"{prefix}.cross_attn_attn_gate"), requires_grad=False + ) + + self.mlp = MllamaTextMLP(prefix=f"{prefix}.mlp", config=config, weights=weights) + self.post_attention_layernorm = MllamaTextRMSNorm.load( + prefix=f"{prefix}.post_attention_layernorm", + weights=weights, + eps=config.rms_norm_eps, + ) + self.cross_attn_mlp_gate = torch.nn.Parameter( + weights.get_tensor(f"{prefix}.cross_attn_mlp_gate"), requires_grad=False + ) + self.layer_idx = layer_idx + + def forward( + self, + hidden_states, + residual, + cos, + sin, + cu_seqlen_prefill, + kv_cache, + slots, + seqlen, + adapter_data, + cross_attention_states, # [ IB, ...] + hpu_attention_meta, + ) -> Tuple[torch.Tensor, torch.Tensor]: + if cross_attention_states is None: + return hidden_states, residual + if residual is not None: + hidden_states += residual + + indices = cross_attention_states[-1] + out_hidden_states = hidden_states[:] + if len(indices) > 0: + assert max(indices) < hidden_states.shape[0] + hidden_states = hidden_states[indices] + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + + hidden_states = self.cross_attn( + hidden_states=hidden_states, + # attention_mask=cross_attention_mask, + cross_attention_states=cross_attention_states, + ) + hidden_states = residual + self.cross_attn_attn_gate.tanh() * hidden_states + + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + self.cross_attn_mlp_gate.tanh() * hidden_states + + out_hidden_states[indices] = hidden_states + hidden_states = out_hidden_states + + return hidden_states, None + + +# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->MllamaText +class MllamaTextRMSNorm(nn.Module): + def __init__(self, weight, eps): + super().__init__() + self.weight = weight + self.variance_epsilon = eps + + @classmethod + def load(cls, *, prefix, weights, eps): + weight = nn.Parameter( + weights.get_tensor(f"{prefix}.weight"), requires_grad=False + ) + return cls(weight=weight, eps=eps) + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + def extra_repr(self): + return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}" + + +class FlashMllamaForConditionalGeneration(nn.Module): + def __init__(self, prefix, config, weights): + super().__init__() + config.vision_config.quantize = None + config.vision_config.speculator = config.speculator + config.text_config.quantize = config.quantize + config.text_config.speculator = config.speculator + config.text_config._attn_implementation = "sdpa" + self.hidden_size = config.text_config.hidden_size + self.vision_model = MllamaVisionModel( + prefix="vision_model", config=config.vision_config, weights=weights + ) + self.multi_modal_projector = FastLinear.load( + prefix="multi_modal_projector", config=config, weights=weights, bias=True + ) + self.text_model = FlashLlamaForCausalLM( + prefix="language_model", config=config.text_config, weights=weights + ) + self.config = config + self.dtype = weights.dtype + self.device = weights.device + + def vision_forward(self, pixel_values, 
aspect_ratio_ids, aspect_ratio_mask): + if aspect_ratio_ids is None: + raise ValueError( + "`aspect_ratio_ids` must be provided if `pixel_values` is provided" + ) + # logger.info(f"PIxel values {pixel_values.shape}") + batch_size = pixel_values.shape[0] + vision_states = self.vision_model( + pixel_values, aspect_ratio_ids, aspect_ratio_mask + ) + cross_attention_states = self.multi_modal_projector(vision_states).reshape( + -1, vision_states.shape[-2], self.hidden_size + ) + _, _, h = cross_attention_states.shape + cross_attention_states = cross_attention_states.view(batch_size, -1, h) + # logger.info(f"cross {cross_attention_states.shape}") + return cross_attention_states + + def forward( + self, + input_ids: torch.Tensor, + position_ids: torch.Tensor, + cu_seqlen_prefill: Optional[torch.Tensor], + kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], + slots: torch.Tensor, + seqlen: Seqlen, + hpu_attention_meta: Optional[HPUPagedAttentionMetadata], + lm_head_indices: Optional[torch.Tensor], + adapter_data: Optional[torch.Tensor] = None, + # XXX: Putting these as optional so that the cuda warmup calls can go through. + cross_attention_states: Optional[torch.Tensor] = None, + image_indices=None, + ): + if cross_attention_states is not None: + seqlen_q = len(image_indices) + n_images = cross_attention_states.shape[0] + seqlen_k = cross_attention_states.shape[1] + device = cross_attention_states.device + if cu_seqlen_prefill is not None: + offset = 0 + cu_q = [] + indices = [] + for index in image_indices: + cu_q.append(offset) + length = seqlen.input_lengths[index].item() + assert index < seqlen.cu_seqlen_q.shape[0] + input_ids_offset = seqlen.cu_seqlen_q[index] + indices.extend(range(input_ids_offset, input_ids_offset + length)) + offset += length + cu_q.append(offset) + cu_seqlen_q = torch.Tensor(cu_q).to(device=device, dtype=torch.int32) + + assert max(indices) < input_ids.shape[0] + + cu_seqlen_k = ( + torch.arange( + n_images + 1, + device=device, + dtype=torch.int32, + ) + * seqlen_k + ) + else: + cu_seqlen_q = torch.arange( + seqlen_q + 1, device=device, dtype=torch.int32 + ) + seqlen_k = cross_attention_states.shape[1] + n_images = cross_attention_states.shape[0] + cu_seqlen_k = ( + torch.arange( + n_images + 1, + device=device, + dtype=torch.int32, + ) + * seqlen_k + ) + indices = image_indices[:] + + cross_attention_states = ( + cross_attention_states, + cu_seqlen_q, + cu_seqlen_k, + indices, + ) + + outputs = self.text_model( + input_ids=input_ids, + position_ids=position_ids, + cu_seqlen_prefill=cu_seqlen_prefill, + kv_cache=kv_cache, + slots=slots, + seqlen=seqlen, + hpu_attention_meta=hpu_attention_meta, + lm_head_indices=lm_head_indices, + adapter_data=adapter_data, + cross_attention_states=cross_attention_states, + ) + + return outputs diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_neox_modeling.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_neox_modeling.py index ad4e382fe..33f63333a 100644 --- a/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_neox_modeling.py +++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_neox_modeling.py @@ -29,8 +29,8 @@ from typing import Optional, List, Tuple from text_generation_server.layers.attention import ( paged_attention, attention, - reshape_and_cache, Seqlen, + HPUPagedAttentionMetadata, ) from text_generation_server.layers import ( TensorParallelRowLinear, @@ -39,7 +39,7 @@ from text_generation_server.layers import ( 
SpeculativeHead, get_linear, ) -from text_generation_server.layers.attention import PREFILL_IN_KV_CACHE +from text_generation_server.layers.attention.kv_cache import get_kv_scales from text_generation_server.layers.layernorm import ( FastLayerNorm, ) @@ -132,6 +132,7 @@ class FlashNeoxAttention(torch.nn.Module): head_size=self.head_size, hidden_size=self.hidden_size, ) + self.kv_scales = get_kv_scales(weights, f"{prefix}") self.dense = load_row( config, prefix=f"{prefix}.dense", weights=weights, bias=True ) @@ -146,10 +147,9 @@ class FlashNeoxAttention(torch.nn.Module): sin, cu_seqlen_prefill, kv_cache, - block_tables, slots, seqlen, - max_s, + hpu_attention_meta, ): qkv = self.query_key_value(hidden_states) qkv = qkv.view(-1, 3, self.num_heads, self.head_size) @@ -165,30 +165,35 @@ class FlashNeoxAttention(torch.nn.Module): qkv[:, 0] = torch.cat((query_rot, query_pass), dim=-1) qkv[:, 1] = torch.cat((key_rot, key_pass), dim=-1) - reshape_and_cache(qkv[:, 1], qkv[:, 2], kv_cache[0], kv_cache[1], slots) + kv_cache.store( + key=qkv[:, 1], + value=qkv[:, 2], + slots=slots, + kv_scales=self.kv_scales, + ) # Prefill if cu_seqlen_prefill is not None: - # flash attention + # sdpa attn_output = attention( - qkv[:, 0], - kv_cache[0] if PREFILL_IN_KV_CACHE else qkv[:, 1], - kv_cache[1] if PREFILL_IN_KV_CACHE else qkv[:, 2], - seqlen, - block_tables, - self.softmax_scale, + query=qkv[:, 0], + key=qkv[:, 1], + value=qkv[:, 2], + kv_cache=kv_cache, + kv_scales=self.kv_scales, + seqlen=seqlen, + softmax_scale=self.softmax_scale, ) # Decode else: attn_output = paged_attention( qkv[:, 0], - kv_cache[0], - kv_cache[1], + kv_cache, self.kv_head_mapping, self.softmax_scale, - block_tables, seqlen, - max_s, + kv_scales=self.kv_scales, + hpu_attention_meta=hpu_attention_meta, ) return self.dense(attn_output.view(-1, self.num_heads * self.head_size)) @@ -255,10 +260,9 @@ class FlashNeoXLayer(nn.Module): sin, cu_seqlen_prefill, kv_cache, - block_tables, slots, seqlen, - max_s, + hpu_attention_meta, ): if self.use_parallel_residual: ln1_hidden_states, _ = self.input_layernorm(hidden_states) @@ -269,10 +273,9 @@ class FlashNeoXLayer(nn.Module): sin, cu_seqlen_prefill, kv_cache, - block_tables, slots, seqlen, - max_s, + hpu_attention_meta, ) ln2_hidden_states, _ = self.post_attention_layernorm(hidden_states) @@ -293,10 +296,9 @@ class FlashNeoXLayer(nn.Module): sin, cu_seqlen_prefill, kv_cache, - block_tables, slots, seqlen, - max_s, + hpu_attention_meta, ) hidden_states, residual = self.post_attention_layernorm( @@ -347,18 +349,15 @@ class FlashGPTNeoXModel(FlashGPTNeoXPreTrainedModel): position_ids: torch.Tensor, cu_seqlen_prefill: Optional[torch.Tensor], kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], - block_tables: torch.Tensor, slots: torch.Tensor, seqlen: Seqlen, - max_s: int, + hpu_attention_meta: Optional[HPUPagedAttentionMetadata], ) -> torch.Tensor: hidden_states = self.embed_in(input_ids) # Get rotary cos and sin for this forward # Avoid to index in each layer - cos, sin = self.layers[0].attention.rotary_emb.get_cos_sin( - position_ids, max_s, hidden_states.dtype - ) + cos, sin = self.layers[0].attention.rotary_emb.get_cos_sin(position_ids) residual = None for i, layer in enumerate(self.layers): @@ -369,10 +368,9 @@ class FlashGPTNeoXModel(FlashGPTNeoXPreTrainedModel): sin, cu_seqlen_prefill, kv_cache[i], - block_tables, slots, seqlen, - max_s, + hpu_attention_meta, ) hidden_states, _ = self.final_layer_norm(hidden_states, residual) @@ -401,11 +399,9 @@ class 
FlashGPTNeoXForCausalLM(FlashGPTNeoXPreTrainedModel): position_ids: torch.Tensor, cu_seqlen_prefill: Optional[torch.Tensor], kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], - block_tables: torch.Tensor, slots: torch.Tensor, seqlen: Seqlen, - max_s: int, - prefill_cache_indices: Optional[torch.Tensor], + hpu_attention_meta: Optional[HPUPagedAttentionMetadata], lm_head_indices: Optional[torch.Tensor] = None, adapter_data: Optional[torch.Tensor] = None, ) -> torch.Tensor: @@ -414,10 +410,9 @@ class FlashGPTNeoXForCausalLM(FlashGPTNeoXPreTrainedModel): position_ids, cu_seqlen_prefill, kv_cache, - block_tables, slots, seqlen, - max_s, + hpu_attention_meta, ) if lm_head_indices is not None: hidden_states = hidden_states[lm_head_indices] diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_pali_gemma_modeling.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_pali_gemma_modeling.py index 0024f2bb9..4d31d5ddf 100644 --- a/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_pali_gemma_modeling.py +++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_pali_gemma_modeling.py @@ -19,7 +19,7 @@ from torch import nn from typing import Optional, List, Tuple from text_generation_server.layers.tensor_parallel import TensorParallelColumnLinear -from text_generation_server.layers.attention import Seqlen +from text_generation_server.layers.attention import Seqlen, HPUPagedAttentionMetadata from text_generation_server.models.custom_modeling.vlm import ( load_text_model, load_vision_model, @@ -69,22 +69,20 @@ class PaliGemmaForConditionalGeneration(nn.Module): position_ids: torch.Tensor, cu_seqlen_prefill: Optional[torch.Tensor], kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], - block_tables: torch.Tensor, slots: torch.Tensor, seqlen: Seqlen, - max_s: int, - prefill_cache_indices: Optional[torch.Tensor] = None, + hpu_attention_meta: Optional[HPUPagedAttentionMetadata], lm_head_indices: Optional[torch.Tensor] = None, pixel_values: torch.FloatTensor = None, # Unused here pixel_attention_mask: Optional[torch.BoolTensor] = None, image_sizes: Optional[torch.Tensor] = None, adapter_data: Optional[torch.Tensor] = None, + image_grid_thw: Optional[torch.LongTensor] = None, ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: inputs_embeds = self.text_model.embed_tokens(input_ids) # TODO This is odd but apparently pali gemma position ids start at 1. 
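# Note on the hunk just below: `max_s` is dropped from this code path entirely, so the
# old `max_s += 1` bump disappears, while the 1-based shift of `position_ids` stays.
# A minimal, self-contained sketch of what that shift does during prefill, assuming
# positions are built per sequence starting at 0 (values are illustrative only):
import torch

input_lengths = [3, 2]  # two prefill sequences in one batch
position_ids = torch.cat([torch.arange(n) for n in input_lengths])
assert position_ids.tolist() == [0, 1, 2, 0, 1]
position_ids += 1  # PaliGemma's text tower expects 1-based positions
assert position_ids.tolist() == [1, 2, 3, 1, 2]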
if cu_seqlen_prefill is not None: - max_s += 1 position_ids += 1 if pixel_values is not None: @@ -106,10 +104,10 @@ class PaliGemmaForConditionalGeneration(nn.Module): position_ids=position_ids, cu_seqlen_prefill=cu_seqlen_prefill, kv_cache=kv_cache, - block_tables=block_tables, slots=slots, seqlen=seqlen, - max_s=max_s, + hpu_attention_meta=hpu_attention_meta, + adapter_data=adapter_data, ) if lm_head_indices is not None: diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_phi_modeling.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_phi_modeling.py index 2a0dc6066..0c7779124 100644 --- a/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_phi_modeling.py +++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_phi_modeling.py @@ -9,8 +9,8 @@ from typing import Optional, List, Tuple from text_generation_server.layers.attention import ( paged_attention, attention, - reshape_and_cache, Seqlen, + HPUPagedAttentionMetadata, ) from text_generation_server.layers import ( TensorParallelRowLinear, @@ -19,7 +19,7 @@ from text_generation_server.layers import ( SpeculativeHead, get_linear, ) -from text_generation_server.layers.attention import PREFILL_IN_KV_CACHE +from text_generation_server.layers.attention.kv_cache import get_kv_scales from text_generation_server.layers.layernorm import ( FastLayerNorm, ) @@ -90,7 +90,7 @@ def _load_gqa(config, prefix: str, weights): dim=0, ) - if config.quantize not in ["gptq", "awq", "marlin"]: + if config.quantize not in ["gptq", "awq"]: weight = weight.to(dtype=weights.dtype).to(device=weights.device) head_size = config.hidden_size // config.num_attention_heads @@ -139,6 +139,7 @@ class FlashPhiAttention(torch.nn.Module): ) self.query_key_value = load_attention(config, prefix, weights) + self.kv_scales = get_kv_scales(weights, f"{prefix}") # in llama the dense layer is called "o_proj" and has bias=False self.dense = TensorParallelRowLinear.load( @@ -159,10 +160,9 @@ class FlashPhiAttention(torch.nn.Module): sin, cu_seqlen_prefill, kv_cache, - block_tables, slots, seqlen, - max_s, + hpu_attention_meta, ): # Compute query, key, value and split qkv = self.query_key_value(hidden_states) @@ -188,29 +188,34 @@ class FlashPhiAttention(torch.nn.Module): ) # Reshape key and value and cache - reshape_and_cache(kv[:, 0], kv[:, 1], kv_cache[0], kv_cache[1], slots) + kv_cache.store( + key=kv[:, 0], + value=kv[:, 1], + slots=slots, + kv_scales=self.kv_scales, + ) # Prefill if cu_seqlen_prefill is not None: attn_output = attention( - query, - kv_cache[0] if PREFILL_IN_KV_CACHE else kv[:, 0], - kv_cache[1] if PREFILL_IN_KV_CACHE else kv[:, 1], - seqlen, - block_tables, - self.softmax_scale, + query=query, + key=kv[:, 0], + value=kv[:, 1], + kv_scales=self.kv_scales, + kv_cache=kv_cache, + seqlen=seqlen, + softmax_scale=self.softmax_scale, ) # Decode else: attn_output = paged_attention( query, - kv_cache[0], - kv_cache[1], + kv_cache, self.kv_head_mapping, self.softmax_scale, - block_tables, seqlen, - max_s, + kv_scales=self.kv_scales, + hpu_attention_meta=hpu_attention_meta, ) return self.dense(attn_output.view(-1, self.num_heads * self.head_size)) @@ -274,10 +279,9 @@ class FlashPhiLayer(nn.Module): sin, cu_seqlen_prefill, kv_cache, - block_tables, slots, seqlen, - max_s, + hpu_attention_meta, ): hidden_states, res = self.input_layernorm(hidden_states, residual) # Self Attention @@ -287,10 +291,9 @@ class FlashPhiLayer(nn.Module): sin, cu_seqlen_prefill, kv_cache, - 
block_tables, slots, seqlen, - max_s, + hpu_attention_meta, ) hidden_states = self.resid_dropout(attn_output).add( @@ -339,18 +342,15 @@ class FlashPhiModel(torch.nn.Module): position_ids: torch.Tensor, cu_seqlen_prefill: Optional[torch.Tensor], kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], - block_tables: torch.Tensor, slots: torch.Tensor, seqlen: Seqlen, - max_s: int, + hpu_attention_meta: Optional[HPUPagedAttentionMetadata], ) -> torch.Tensor: hidden_states = self.embed_tokens(input_ids) # Get rotary cos and sin for this forward # Avoid to index in each layer - cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin( - position_ids, max_s, hidden_states.dtype - ) + cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin(position_ids) residual = None for i, layer in enumerate(self.layers): @@ -361,10 +361,9 @@ class FlashPhiModel(torch.nn.Module): sin, cu_seqlen_prefill, kv_cache[i], - block_tables, slots, seqlen, - max_s, + hpu_attention_meta, ) hidden_states, _ = self.norm(hidden_states, residual) @@ -394,11 +393,9 @@ class FlashPhiForCausalLM(torch.nn.Module): position_ids: torch.Tensor, cu_seqlen_prefill: Optional[torch.Tensor], kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], - block_tables: torch.Tensor, slots: torch.Tensor, seqlen: Seqlen, - max_s: int, - prefill_cache_indices: Optional[torch.Tensor], + hpu_attention_meta: Optional[HPUPagedAttentionMetadata], lm_head_indices: Optional[torch.Tensor] = None, adapter_data: Optional[torch.Tensor] = None, ) -> torch.Tensor: @@ -407,10 +404,9 @@ class FlashPhiForCausalLM(torch.nn.Module): position_ids, cu_seqlen_prefill, kv_cache, - block_tables, slots, seqlen, - max_s, + hpu_attention_meta, ) if lm_head_indices is not None: hidden_states = hidden_states[lm_head_indices] diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py index 02c788d3e..af4b404d0 100644 --- a/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py +++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py @@ -8,8 +8,8 @@ from typing import Optional, List, Tuple from text_generation_server.layers.attention import ( paged_attention, attention, - reshape_and_cache, Seqlen, + HPUPagedAttentionMetadata, ) from text_generation_server.layers import ( TensorParallelRowLinear, @@ -17,7 +17,7 @@ from text_generation_server.layers import ( TensorParallelEmbedding, SpeculativeHead, ) -from text_generation_server.layers.attention import PREFILL_IN_KV_CACHE +from text_generation_server.layers.attention.kv_cache import get_kv_scales from text_generation_server.layers.rotary import PositionRotaryEmbedding from text_generation_server.layers.layernorm import ( FastRMSNorm, @@ -86,6 +86,8 @@ class Qwen2Attention(torch.nn.Module): self.query_key_value = load_attention(config, prefix, weights) + self.kv_scales = get_kv_scales(weights, f"{prefix}") + self.o_proj = TensorParallelRowLinear.load( config, prefix=f"{prefix}.o_proj", @@ -104,11 +106,9 @@ class Qwen2Attention(torch.nn.Module): sin, cu_seqlen_prefill, kv_cache, - block_tables, slots, seqlen, - max_s, - prefill_cache_indices, + hpu_attention_meta, ): qkv = self.query_key_value(hidden_states) query, kv = qkv.split( @@ -123,38 +123,36 @@ class Qwen2Attention(torch.nn.Module): self.rotary_emb(query, torch.select(kv, dim=1, index=0), cos, sin) - if prefill_cache_indices is not None: - kv_to_cache = 
kv[prefill_cache_indices] - else: - kv_to_cache = kv - - reshape_and_cache( - kv_to_cache[:, 0], kv_to_cache[:, 1], kv_cache[0], kv_cache[1], slots + kv_cache.store( + key=kv[:, 0], + value=kv[:, 1], + slots=slots, + kv_scales=self.kv_scales, ) # Prefill if cu_seqlen_prefill is not None: - # flash attention + # sdpa attn_output = attention( - query, - kv_cache[0] if PREFILL_IN_KV_CACHE else kv_to_cache[:, 0], - kv_cache[1] if PREFILL_IN_KV_CACHE else kv_to_cache[:, 1], - seqlen, - block_tables, - self.softmax_scale, + query=query, + key=kv[:, 0], + value=kv[:, 1], + kv_cache=kv_cache, + kv_scales=self.kv_scales, + seqlen=seqlen, + softmax_scale=self.softmax_scale, window_size_left=self.max_past, ) # Decode else: attn_output = paged_attention( query, - kv_cache[0], - kv_cache[1], + kv_cache, self.kv_head_mapping, self.softmax_scale, - block_tables, seqlen, - max_s, + kv_scales=self.kv_scales, + hpu_attention_meta=hpu_attention_meta, ) return self.o_proj(attn_output.view(-1, self.num_heads * self.head_size)) @@ -223,13 +221,11 @@ class Qwen2Layer(nn.Module): sin, cu_seqlen_prefill, kv_cache, - block_tables, slots, seqlen, - max_s, - prefill_cache_indices, + hpu_attention_meta, ): - normed_hidden_states, res = self.input_layernorm(hidden_states, residual) + normed_hidden_states, residual = self.input_layernorm(hidden_states) # Self Attention attn_output = self.self_attn( @@ -238,21 +234,17 @@ class Qwen2Layer(nn.Module): sin, cu_seqlen_prefill, kv_cache, - block_tables, slots, seqlen, - max_s, - prefill_cache_indices, + hpu_attention_meta, ) + hidden_states = attn_output + residual # faster post attention rms norm - normed_attn_res_output, attn_res = self.post_attention_layernorm( - attn_output, res - ) - - mlp_output = self.mlp(normed_attn_res_output) - - return mlp_output, attn_res + hidden_states, residual = self.post_attention_layernorm(hidden_states) + mlp_output = self.mlp(hidden_states) + hidden_states = mlp_output + residual + return hidden_states class Qwen2Model(torch.nn.Module): @@ -264,9 +256,6 @@ class Qwen2Model(torch.nn.Module): process_group = weights.process_group self.tp_rank = process_group.rank() self.tp_world_size = process_group.size() - self.embed_tokens = TensorParallelEmbedding( - prefix=f"{prefix}.embed_tokens", weights=weights - ) self.layers = nn.ModuleList( [ Qwen2Layer( @@ -290,42 +279,35 @@ class Qwen2Model(torch.nn.Module): def forward( self, - input_ids: torch.Tensor, + inputs_embeds: torch.Tensor, position_ids: torch.Tensor, cu_seqlen_prefill: Optional[torch.Tensor], kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], - block_tables: torch.Tensor, slots: torch.Tensor, seqlen: Seqlen, - max_s: int, - true_max_s: int, - prefill_cache_indices: Optional[torch.Tensor], + hpu_attention_meta: Optional[HPUPagedAttentionMetadata], ) -> torch.Tensor: - hidden_states = self.embed_tokens(input_ids) + hidden_states = inputs_embeds - # Get rotary cos and sin for this forward - # Avoid to index in each layer cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin( - position_ids, true_max_s, hidden_states.dtype + position_ids, ) residual = None for i, layer in enumerate(self.layers): - hidden_states, residual = layer( + hidden_states = layer( hidden_states, residual, cos, sin, cu_seqlen_prefill, kv_cache[i], - block_tables, slots, seqlen, - max_s, - prefill_cache_indices, + hpu_attention_meta, ) - hidden_states, _ = self.norm(hidden_states, residual) + hidden_states, _ = self.norm(hidden_states) return hidden_states @@ -346,6 +328,12 @@ class 
Qwen2ForCausalLM(torch.nn.Module): prefix=f"{prefix}.{suffix}" if prefix else suffix, weights=weights, ) + + self.embed_tokens = TensorParallelEmbedding( + prefix=f"{prefix}.embed_tokens" if prefix else "model.embed_tokens", + weights=weights, + ) + self.max_past = config.sliding_window self.max_past_tensor = ( torch.tensor(config.sliding_window, device=weights.device) @@ -359,34 +347,23 @@ class Qwen2ForCausalLM(torch.nn.Module): position_ids: torch.Tensor, cu_seqlen_prefill: Optional[torch.Tensor], kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], - block_tables: torch.Tensor, slots: torch.Tensor, seqlen: Seqlen, - max_s: int, - prefill_cache_indices: Optional[torch.Tensor] = None, + hpu_attention_meta: Optional[HPUPagedAttentionMetadata], lm_head_indices: Optional[torch.Tensor] = None, adapter_data: Optional[torch.Tensor] = None, ) -> torch.Tensor: - true_max_s = max_s - if prefill_cache_indices is not None: - # Slots also need to be sliced as it has the same size as the whole kv tensor - slots = slots[prefill_cache_indices] - elif self.max_past is not None: - # Clamp in decode mode as paged attention requires clamped values whereas the flash attention - # kernel requires the true values - seqlen = seqlen.clamp(max=self.max_past_tensor) + + inputs_embeds = self.embed_tokens(input_ids) hidden_states = self.model( - input_ids, + inputs_embeds, position_ids, cu_seqlen_prefill, kv_cache, - block_tables, slots, seqlen, - max_s, - true_max_s, - prefill_cache_indices, + hpu_attention_meta, ) if lm_head_indices is not None: hidden_states = hidden_states[lm_head_indices] diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_rw_modeling.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_rw_modeling.py index 6671d85e2..141e13a63 100644 --- a/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_rw_modeling.py +++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_rw_modeling.py @@ -12,14 +12,14 @@ from text_generation_server.layers import ( TensorParallelRowLinear, get_linear, ) -from text_generation_server.layers.attention import PREFILL_IN_KV_CACHE +from text_generation_server.layers.attention.kv_cache import get_kv_scales from text_generation_server.layers.layernorm import FastLayerNorm from text_generation_server.layers.rotary import PositionRotaryEmbedding from text_generation_server.layers.attention import ( attention, paged_attention, - reshape_and_cache, Seqlen, + HPUPagedAttentionMetadata, ) @@ -79,6 +79,7 @@ class RWConfig(PretrainedConfig): self.alibi = False self.rotary = True self.rope_theta = rope_theta + self.max_position_embeddings = 2048 self.vocab_size = vocab_size # Backward compatibility with n_embed kwarg @@ -160,6 +161,7 @@ class FlashRWAttention(torch.nn.Module): weights=weights, bias=config.bias, ) + self.kv_scales = get_kv_scales(weights, f"{prefix}") self.dense = load_row( config, prefix=f"{prefix}.dense", weights=weights, bias=config.bias ) @@ -180,10 +182,9 @@ class FlashRWAttention(torch.nn.Module): sin, cu_seqlen_prefill, kv_cache, - block_tables, slots, seqlen, - max_s, + hpu_attention_meta, ): qkv = self.query_key_value(hidden_states) @@ -200,30 +201,35 @@ class FlashRWAttention(torch.nn.Module): # Inplace rotary self.rotary_emb(query, torch.select(kv, dim=1, index=0), cos, sin) - reshape_and_cache(kv[:, 0], kv[:, 1], kv_cache[0], kv_cache[1], slots) + kv_cache.store( + key=kv[:, 0], + value=kv[:, 1], + slots=slots, + kv_scales=self.kv_scales, + ) # Prefill if 
cu_seqlen_prefill is not None: - # flash attention + # sdpa attn_output = attention( - query, - kv_cache[0] if PREFILL_IN_KV_CACHE else kv[:, 0], - kv_cache[1] if PREFILL_IN_KV_CACHE else kv[:, 1], - seqlen, - block_tables, - self.softmax_scale, + query=query, + key=kv[:, 0], + value=kv[:, 1], + kv_cache=kv_cache, + kv_scales=self.kv_scales, + seqlen=seqlen, + softmax_scale=self.softmax_scale, ) # Decode else: attn_output = paged_attention( query, - kv_cache[0], - kv_cache[1], + kv_cache, self.kv_head_mapping, self.softmax_scale, - block_tables, seqlen, - max_s, + kv_scales=self.kv_scales, + hpu_attention_meta=hpu_attention_meta, ) return self.dense(attn_output.view(-1, self.num_heads * self.head_size)) @@ -278,6 +284,7 @@ class FlashRWLargeAttention(torch.nn.Module): weights=weights, bias=config.bias, ) + self.kv_scales = get_kv_scales(weights, f"{prefix}") self.dense = load_row( config, prefix=f"{prefix}.dense", weights=weights, bias=config.bias ) @@ -293,10 +300,9 @@ class FlashRWLargeAttention(torch.nn.Module): sin, cu_seqlen_prefill, kv_cache, - block_tables, slots, seqlen, - max_s, + hpu_attention_meta, ): qkv = self.query_key_value(hidden_states) qkv = qkv.view(-1, self.num_groups, self.num_heads + 2, self.head_size) @@ -312,36 +318,35 @@ class FlashRWLargeAttention(torch.nn.Module): # Inplace rotary self.rotary_emb(query, torch.select(kv, dim=2, index=0), cos, sin) - reshape_and_cache( - kv[:, :, 0].contiguous(), - kv[:, :, 1].contiguous(), - kv_cache[0], - kv_cache[1], - slots, + kv_cache.store( + key=kv[:, :, 0].contiguous(), + value=kv[:, :, 1].contiguous(), + slots=slots, + kv_scales=self.kv_scales, ) # Prefill if cu_seqlen_prefill is not None: # flash attention attn_output = attention( - query, - kv_cache[0] if PREFILL_IN_KV_CACHE else kv[:, :, 0].contiguous(), - kv_cache[1] if PREFILL_IN_KV_CACHE else kv[:, :, 1].contiguous(), - seqlen, - block_tables, - self.softmax_scale, + query=query, + key=kv[:, :, 0], + value=kv[:, :, 1], + kv_cache=kv_cache, + kv_scales=self.kv_scales, + seqlen=seqlen, + softmax_scale=self.softmax_scale, ) # Decode else: attn_output = paged_attention( query, - kv_cache[0], - kv_cache[1], + kv_cache, self.kv_head_mapping, self.softmax_scale, - block_tables, seqlen, - max_s, + kv_scales=self.kv_scales, + hpu_attention_meta=hpu_attention_meta, ) return self.dense( @@ -424,10 +429,9 @@ class FlashRWLayer(nn.Module): sin, cu_seqlen_prefill, kv_cache, - block_tables, slots, seqlen, - max_s, + hpu_attention_meta, ): if self.parallel_attn: ln_hidden_states, residual = self.input_layernorm(hidden_states, residual) @@ -438,10 +442,9 @@ class FlashRWLayer(nn.Module): sin, cu_seqlen_prefill, kv_cache, - block_tables, slots, seqlen, - max_s, + hpu_attention_meta, ) mlp_output = self.mlp(ln_hidden_states) @@ -460,10 +463,9 @@ class FlashRWLayer(nn.Module): sin, cu_seqlen_prefill, kv_cache, - block_tables, slots, seqlen, - max_s, + hpu_attention_meta, ) if self.post_attention_layernorm is not None: @@ -547,10 +549,9 @@ class FlashRWLargeLayer(nn.Module): sin, cu_seqlen_prefill, kv_cache, - block_tables, slots, seqlen, - max_s, + hpu_attention_meta, ): # Layer norm. ln_attn, ln_mlp, residual = self.ln_layer(hidden_states, residual) @@ -562,10 +563,9 @@ class FlashRWLargeLayer(nn.Module): sin, cu_seqlen_prefill, kv_cache, - block_tables, slots, seqlen, - max_s, + hpu_attention_meta, ) # MLP. 
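# The same rewrite is applied to every attention module in this patch: the raw
# (key_cache, value_cache) pair plus `reshape_and_cache`, `block_tables` and `max_s`
# is replaced by a KV-cache object with a `store()` method, per-layer `kv_scales`,
# and an optional `hpu_attention_meta` that drives paged decode. A condensed sketch
# of the new calling convention, assuming the same Gaudi-backend imports used in the
# surrounding hunks (the hunks themselves remain the authoritative reference):
from typing import Optional

from text_generation_server.layers.attention import (
    attention,
    paged_attention,
    Seqlen,
    HPUPagedAttentionMetadata,
)


def attend(
    query,
    key,
    value,
    kv_cache,
    kv_scales,
    kv_head_mapping,
    softmax_scale,
    slots,
    cu_seqlen_prefill,
    seqlen: Seqlen,
    hpu_attention_meta: Optional[HPUPagedAttentionMetadata],
):
    # Always write the freshly computed K/V into the cache (replaces reshape_and_cache).
    kv_cache.store(key=key, value=value, slots=slots, kv_scales=kv_scales)
    if cu_seqlen_prefill is not None:
        # Prefill: attend over the current keys/values directly.
        return attention(
            query=query,
            key=key,
            value=value,
            kv_cache=kv_cache,
            kv_scales=kv_scales,
            seqlen=seqlen,
            softmax_scale=softmax_scale,
        )
    # Decode: paged attention over the cache, driven by the HPU metadata
    # instead of block_tables / max_s.
    return paged_attention(
        query,
        kv_cache,
        kv_head_mapping,
        softmax_scale,
        seqlen,
        kv_scales=kv_scales,
        hpu_attention_meta=hpu_attention_meta,
    )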
@@ -623,18 +623,15 @@ class FlashRWModel(FlashRWPreTrainedModel): position_ids: torch.Tensor, cu_seqlen_prefill: Optional[torch.Tensor], kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], - block_tables: torch.Tensor, slots: torch.Tensor, seqlen: Seqlen, - max_s: int, + hpu_attention_meta: Optional[HPUPagedAttentionMetadata], ) -> torch.Tensor: hidden_states = self.word_embeddings(input_ids) # Get rotary cos and sin for this forward # Avoid to index in each layer - cos, sin = self.h[0].self_attention.rotary_emb.get_cos_sin( - position_ids, max_s, hidden_states.dtype - ) + cos, sin = self.h[0].self_attention.rotary_emb.get_cos_sin(position_ids) residual = None for i, layer in enumerate(self.h): @@ -645,10 +642,9 @@ class FlashRWModel(FlashRWPreTrainedModel): sin, cu_seqlen_prefill, kv_cache[i], - block_tables, slots, seqlen, - max_s, + hpu_attention_meta, ) hidden_states, _ = self.ln_f(hidden_states, residual) @@ -675,11 +671,9 @@ class FlashRWForCausalLM(FlashRWPreTrainedModel): position_ids: torch.Tensor, cu_seqlen_prefill: Optional[torch.Tensor], kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], - block_tables: torch.Tensor, slots: torch.Tensor, seqlen: Seqlen, - max_s: int, - prefill_cache_indices: Optional[torch.Tensor], + hpu_attention_meta: Optional[HPUPagedAttentionMetadata], lm_head_indices: Optional[torch.Tensor] = None, adapter_data: Optional[torch.Tensor] = None, ) -> torch.Tensor: @@ -688,10 +682,9 @@ class FlashRWForCausalLM(FlashRWPreTrainedModel): position_ids, cu_seqlen_prefill, kv_cache, - block_tables, slots, seqlen, - max_s, + hpu_attention_meta, ) if lm_head_indices is not None: hidden_states = hidden_states[lm_head_indices] diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py index 43eb9687f..b68f47840 100644 --- a/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py +++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py @@ -8,8 +8,8 @@ from typing import Optional, List, Tuple from text_generation_server.layers.attention import ( paged_attention, attention, - reshape_and_cache, Seqlen, + HPUPagedAttentionMetadata, ) from text_generation_server.layers import ( TensorParallelRowLinear, @@ -18,7 +18,7 @@ from text_generation_server.layers import ( TensorParallelEmbedding, get_linear, ) -from text_generation_server.layers.attention import PREFILL_IN_KV_CACHE +from text_generation_server.layers.attention.kv_cache import get_kv_scales from text_generation_server.layers.gptq import GPTQWeightsLoader from text_generation_server.layers.layernorm import ( FastLayerNorm, @@ -32,10 +32,6 @@ def load_multi_mqa( return _load_multi_mqa_gptq( config, prefix, weights, bias, head_size, num_heads, hidden_size ) - elif config.quantize == "marlin": - raise RuntimeError( - "santacoder models with marlin quantization are not yet supported" - ) else: return _load_multi_mqa( config, prefix, weights, bias, head_size, num_heads, hidden_size @@ -259,6 +255,7 @@ class FlashMQAttention(torch.nn.Module): self.c_proj = load_row( config, prefix=f"{prefix}.c_proj", weights=weights, bias=True ) + self.kv_scales = get_kv_scales(weights, f"{prefix}") self.kv_head_mapping = torch.zeros( self.num_heads, dtype=torch.int32, device=weights.device ) @@ -268,10 +265,9 @@ class FlashMQAttention(torch.nn.Module): hidden_states, cu_seqlen_prefill, kv_cache, - 
block_tables, slots, seqlen, - max_s, + hpu_attention_meta, ): qkv = self.c_attn(hidden_states) @@ -284,32 +280,35 @@ class FlashMQAttention(torch.nn.Module): query = query.view(-1, self.num_heads, self.head_size) key_value = key_value.view(-1, 2, 1, self.head_size) - reshape_and_cache( - key_value[:, 0], key_value[:, 1], kv_cache[0], kv_cache[1], slots + kv_cache.store( + key=key_value[:, 0], + value=key_value[:, 1], + slots=slots, + kv_scales=self.kv_scales, ) # Prefill if cu_seqlen_prefill is not None: - # flash attention + # sdpa attn_output = attention( - query, - kv_cache[0] if PREFILL_IN_KV_CACHE else key_value[:, 0], - kv_cache[1] if PREFILL_IN_KV_CACHE else key_value[:, 1], - seqlen, - block_tables, - self.softmax_scale, + query=query, + key=key_value[:, 0], + value=key_value[:, 1], + kv_cache=kv_cache, + kv_scales=self.kv_scales, + seqlen=seqlen, + softmax_scale=self.softmax_scale, ) # Decode else: attn_output = paged_attention( query, - kv_cache[0], - kv_cache[1], + kv_cache, self.kv_head_mapping, self.softmax_scale, - block_tables, seqlen, - max_s, + kv_scales=self.kv_scales, + hpu_attention_meta=hpu_attention_meta, ) return self.c_proj(attn_output.view(-1, self.num_heads * self.head_size)) @@ -371,20 +370,18 @@ class Block(nn.Module): residual, cu_seqlen_prefill, kv_cache, - block_tables, slots, seqlen, - max_s, + hpu_attention_meta, ): hidden_states, residual = self.ln_1(hidden_states, residual) hidden_states = self.self_attn( hidden_states, cu_seqlen_prefill, kv_cache, - block_tables, slots, seqlen, - max_s, + hpu_attention_meta, ) hidden_states, residual = self.ln_2(hidden_states, residual) @@ -435,10 +432,9 @@ class FlashSantacoderModel(nn.Module): position_ids: torch.Tensor, cu_seqlen_prefill: Optional[torch.Tensor], kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], - block_tables: torch.Tensor, slots: torch.Tensor, seqlen: Seqlen, - max_s: int, + hpu_attention_meta: Optional[HPUPagedAttentionMetadata], ) -> torch.Tensor: hidden_states = self.wte(input_ids) + self.wpe(position_ids) @@ -452,10 +448,9 @@ class FlashSantacoderModel(nn.Module): residual, cu_seqlen_prefill, kv_cache[i], - block_tables, slots, seqlen, - max_s, + hpu_attention_meta, ) hidden_states, _ = self.ln_f(hidden_states, residual) @@ -484,11 +479,9 @@ class FlashSantacoderForCausalLM(nn.Module): position_ids: torch.Tensor, cu_seqlen_prefill: Optional[torch.Tensor], kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], - block_tables: torch.Tensor, slots: torch.Tensor, seqlen: Seqlen, - max_s: int, - prefill_cache_indices: Optional[torch.Tensor], + hpu_attention_meta: Optional[HPUPagedAttentionMetadata], lm_head_indices: Optional[torch.Tensor] = None, adapter_data: Optional[torch.Tensor] = None, ) -> torch.Tensor: @@ -497,10 +490,9 @@ class FlashSantacoderForCausalLM(nn.Module): position_ids, cu_seqlen_prefill, kv_cache, - block_tables, slots, seqlen, - max_s, + hpu_attention_meta, ) if lm_head_indices is not None: hidden_states = hidden_states[lm_head_indices] diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_starcoder2_modeling.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_starcoder2_modeling.py index 4975cf225..76f6f473a 100644 --- a/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_starcoder2_modeling.py +++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/flash_starcoder2_modeling.py @@ -29,17 +29,19 @@ from typing import Optional, List, Tuple from text_generation_server.layers.attention 
import ( paged_attention, attention, - reshape_and_cache, Seqlen, + HPUPagedAttentionMetadata, ) from text_generation_server.layers import ( + TensorParallelMultiAdapterLinear, + TensorParallelAdapterRowLinear, TensorParallelRowLinear, TensorParallelColumnLinear, TensorParallelEmbedding, SpeculativeHead, get_linear, ) -from text_generation_server.layers.attention import PREFILL_IN_KV_CACHE +from text_generation_server.layers.attention.kv_cache import get_kv_scales from text_generation_server.layers.layernorm import ( FastLayerNorm, FastRMSNorm, @@ -110,17 +112,31 @@ class Starcoder2Config(PretrainedConfig): ) -def load_attention(config, prefix, weights): +def load_attention(config, prefix, weights, layer_id): + prefixes = [f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"] + head_size = config.hidden_size // config.num_attention_heads + sizes = [ + head_size * config.num_attention_heads, + head_size * config.num_key_value_heads, + head_size * config.num_key_value_heads, + ] if config.num_attention_heads != config.num_key_value_heads: - return _load_gqa(config, prefix, weights) + base_layer = _load_gqa(config, prefix, weights) else: - return TensorParallelColumnLinear.load_multi( + base_layer = TensorParallelColumnLinear.load_multi( config, - prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"], + prefixes=prefixes, dim=0, weights=weights, bias=config.use_bias, ) + return TensorParallelMultiAdapterLinear.load( + base_layer=base_layer, + layer_id=layer_id, + layer_names=prefixes, + sizes=sizes, + process_group=weights.process_group, + ) def _load_gqa(config, prefix: str, weights): @@ -158,6 +174,7 @@ def _load_gqa(config, prefix: str, weights): class Starcoder2Attention(torch.nn.Module): def __init__( self, + index: int, prefix: str, config, weights, @@ -189,14 +206,23 @@ class Starcoder2Attention(torch.nn.Module): config.num_key_value_heads // weights.process_group.size() ) - self.query_key_value = load_attention(config, prefix, weights) + self.query_key_value = load_attention(config, prefix, weights, index) + self.kv_scales = get_kv_scales(weights, f"{prefix}") - self.o_proj = TensorParallelRowLinear.load( + o_proj = TensorParallelRowLinear.load( config, prefix=f"{prefix}.o_proj", weights=weights, - bias=config.use_bias, + bias=getattr(config, "use_bias", False), ) + + self.o_proj = TensorParallelAdapterRowLinear.load( + o_proj, + index, + "o_proj", + process_group=weights.process_group, + ) + self.num_groups = self.num_heads // self.num_key_value_heads self.kv_head_mapping = torch.arange( 0, self.num_key_value_heads, dtype=torch.int32, device=weights.device @@ -209,13 +235,12 @@ class Starcoder2Attention(torch.nn.Module): sin, cu_seqlen_prefill, kv_cache, - block_tables, slots, seqlen, - max_s, - prefill_cache_indices, + adapter_data, + hpu_attention_meta, ): - qkv = self.query_key_value(hidden_states) + qkv = self.query_key_value(hidden_states, adapter_data) query, kv = qkv.split( [ self.head_size * self.num_heads, @@ -228,45 +253,45 @@ class Starcoder2Attention(torch.nn.Module): self.rotary_emb(query, torch.select(kv, dim=1, index=0), cos, sin) - if prefill_cache_indices is not None: - kv_to_cache = kv[prefill_cache_indices] - else: - kv_to_cache = kv - - reshape_and_cache( - kv_to_cache[:, 0], kv_to_cache[:, 1], kv_cache[0], kv_cache[1], slots + kv_cache.store( + key=kv[:, 0], + value=kv[:, 1], + slots=slots, + kv_scales=self.kv_scales, ) # Prefill if cu_seqlen_prefill is not None: - # flash attention + # sdpa attn_output = attention( - query, - kv_cache[0] if 
PREFILL_IN_KV_CACHE else kv_to_cache[:, 0], - kv_cache[1] if PREFILL_IN_KV_CACHE else kv_to_cache[:, 1], - seqlen, - block_tables, - self.softmax_scale, + query=query, + key=kv[:, 0], + value=kv[:, 1], + kv_cache=kv_cache, + kv_scales=self.kv_scales, + seqlen=seqlen, + softmax_scale=self.softmax_scale, window_size_left=self.max_past, ) # Decode else: attn_output = paged_attention( query, - kv_cache[0], - kv_cache[1], + kv_cache, self.kv_head_mapping, self.softmax_scale, - block_tables, seqlen, - max_s, + kv_scales=self.kv_scales, + hpu_attention_meta=hpu_attention_meta, ) - return self.o_proj(attn_output.view(-1, self.num_heads * self.head_size)) + return self.o_proj( + attn_output.view(-1, self.num_heads * self.head_size), adapter_data + ) class Starcoder2MLP(nn.Module): - def __init__(self, prefix, config, weights): + def __init__(self, prefix, config, weights, index): super().__init__() act = config.hidden_act self.act = ( @@ -280,27 +305,42 @@ class Starcoder2MLP(nn.Module): ) ) # Fuse gate and up proj - self.c_fc = TensorParallelColumnLinear.load( + c_fc = TensorParallelColumnLinear.load( config, prefix=f"{prefix}.c_fc", weights=weights, bias=config.use_bias, ) - self.c_proj = TensorParallelRowLinear.load( + c_proj = TensorParallelRowLinear.load( config, prefix=f"{prefix}.c_proj", weights=weights, bias=config.use_bias, ) - def forward(self, hidden_states): - hidden_states = self.c_fc(hidden_states) + self.c_fc = TensorParallelMultiAdapterLinear.load( + c_fc, + layer_id=index, + layer_names=[f"{prefix}.c_fc"], + sizes=[config.intermediate_size, config.intermediate_size], + process_group=weights.process_group, + ) + + self.c_proj = TensorParallelAdapterRowLinear.load( + c_proj, + index, + "c_proj", + process_group=weights.process_group, + ) + + def forward(self, hidden_states, adapter_data): + hidden_states = self.c_fc(hidden_states, adapter_data) hidden_states = self.act(hidden_states) - return self.c_proj(hidden_states) + return self.c_proj(hidden_states, adapter_data) class Starcoder2GatedMLP(nn.Module): - def __init__(self, prefix, config, weights): + def __init__(self, index, prefix, config, weights): super().__init__() act = config.hidden_act self.act = ( @@ -314,27 +354,47 @@ class Starcoder2GatedMLP(nn.Module): ) ) # Fuse gate and up proj - self.gate_up_proj = TensorParallelColumnLinear.load_multi( + prefixes = [f"{prefix}.gate_proj", f"{prefix}.up_proj"] + sizes = [ + config.intermediate_size, + config.intermediate_size, + ] + gate_up_proj = TensorParallelColumnLinear.load_multi( config, - prefixes=[f"{prefix}.gate_proj", f"{prefix}.up_proj"], + prefixes=prefixes, weights=weights, dim=0, bias=config.use_bias, ) - self.down_proj = TensorParallelRowLinear.load( + self.gate_up_proj = TensorParallelMultiAdapterLinear.load( + gate_up_proj, + index, + layer_names=prefixes, + sizes=sizes, + process_group=weights.process_group, + ) + down_proj = TensorParallelRowLinear.load( config, prefix=f"{prefix}.down_proj", weights=weights, bias=config.use_bias, ) + self.down_proj = TensorParallelAdapterRowLinear.load( + down_proj, + index, + "down_proj", + process_group=weights.process_group, + ) self.intermediate_size = ( config.intermediate_size // weights.process_group.size() ) - def forward(self, hidden_states): - gate_up_states = self.gate_up_proj(hidden_states) + def forward(self, hidden_states, adapter_data): + gate_up_states = self.gate_up_proj(hidden_states, adapter_data) gate_up_states = gate_up_states.view(-1, 2, self.intermediate_size) - return 
self.down_proj(self.act(gate_up_states[:, 0]) * gate_up_states[:, 1]) + return self.down_proj( + self.act(gate_up_states[:, 0]) * gate_up_states[:, 1], adapter_data + ) STARCODER2_NORMALIZATION_CLASSES = { @@ -353,11 +413,11 @@ class Starcoder2Layer(nn.Module): super().__init__() prefix = f"model.layers.{layer_id}" self.self_attn = Starcoder2Attention( - prefix=f"{prefix}.self_attn", config=config, weights=weights + prefix=f"{prefix}.self_attn", config=config, weights=weights, index=layer_id ) self.mlp = STARCODER2_MLP_CLASSES[config.mlp_type]( - prefix=f"{prefix}.mlp", config=config, weights=weights + prefix=f"{prefix}.mlp", config=config, weights=weights, index=layer_id ) self.input_layernorm = STARCODER2_NORMALIZATION_CLASSES[config.norm_type].load( @@ -379,11 +439,10 @@ class Starcoder2Layer(nn.Module): sin, cu_seqlen_prefill, kv_cache, - block_tables, slots, seqlen, - max_s, - prefill_cache_indices, + adapter_data, + hpu_attention_meta, ): normed_hidden_states, res = self.input_layernorm(hidden_states, residual) @@ -394,11 +453,10 @@ class Starcoder2Layer(nn.Module): sin, cu_seqlen_prefill, kv_cache, - block_tables, slots, seqlen, - max_s, - prefill_cache_indices, + adapter_data, + hpu_attention_meta, ) # faster post attention rms norm @@ -406,7 +464,7 @@ class Starcoder2Layer(nn.Module): attn_output, res ) - mlp_output = self.mlp(normed_attn_res_output) + mlp_output = self.mlp(normed_attn_res_output, adapter_data) return mlp_output, attn_res @@ -447,20 +505,16 @@ class Starcoder2Model(torch.nn.Module): position_ids: torch.Tensor, cu_seqlen_prefill: Optional[torch.Tensor], kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], - block_tables: torch.Tensor, slots: torch.Tensor, seqlen: Seqlen, - max_s: int, - true_max_s: int, - prefill_cache_indices: Optional[torch.Tensor], + adapter_data, + hpu_attention_meta: Optional[HPUPagedAttentionMetadata], ) -> torch.Tensor: hidden_states = self.embed_tokens(input_ids) # Get rotary cos and sin for this forward # Avoid to index in each layer - cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin( - position_ids, true_max_s, hidden_states.dtype - ) + cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin(position_ids) residual = None for i, layer in enumerate(self.layers): @@ -471,11 +525,10 @@ class Starcoder2Model(torch.nn.Module): sin, cu_seqlen_prefill, kv_cache[i], - block_tables, slots, seqlen, - max_s, - prefill_cache_indices, + adapter_data, + hpu_attention_meta, ) hidden_states, _ = self.norm(hidden_states, residual) @@ -519,34 +572,22 @@ class FlashStarcoder2ForCausalLM(torch.nn.Module): position_ids: torch.Tensor, cu_seqlen_prefill: Optional[torch.Tensor], kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], - block_tables: torch.Tensor, slots: torch.Tensor, seqlen: Seqlen, - max_s: int, - prefill_cache_indices: Optional[torch.Tensor], + hpu_attention_meta: Optional[HPUPagedAttentionMetadata], lm_head_indices: Optional[torch.Tensor] = None, adapter_data: Optional[torch.Tensor] = None, ) -> torch.Tensor: - true_max_s = max_s - if prefill_cache_indices is not None: - # Slots also need to be sliced as it has the same size as the whole kv tensor - slots = slots[prefill_cache_indices] - elif self.max_past is not None: - # Clamp in decode mode as paged attention requires clamped values whereas the flash attention - # kernel requires the true values - seqlen = seqlen.clamp(max=self.max_past_tensor) hidden_states = self.model( input_ids, position_ids, cu_seqlen_prefill, kv_cache, - block_tables, slots, seqlen, - max_s, - true_max_s, - 
prefill_cache_indices, + adapter_data, + hpu_attention_meta, ) if lm_head_indices is not None: hidden_states = hidden_states[lm_head_indices] diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/idefics2.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/idefics2.py index a829c3741..02806ac94 100644 --- a/backends/gaudi/server/text_generation_server/models/custom_modeling/idefics2.py +++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/idefics2.py @@ -25,7 +25,7 @@ from transformers.activations import ACT2FN from text_generation_server.models.custom_modeling.vlm import ( load_text_model, ) -from text_generation_server.layers.attention import Seqlen +from text_generation_server.layers.attention import Seqlen, HPUPagedAttentionMetadata from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask from text_generation_server.layers import ( @@ -728,7 +728,8 @@ class Idefics2ForConditionalGeneration(nn.Module): ): """In place merges in vision_embeddings with inputs_embeds.""" # mask = input_ids == self.config.image_token_index - mask = input_ids == self.config.image_token_id + # - replace `==` with torch.where to fix the issue in hpu graph + mask = torch.where(input_ids == self.config.image_token_id) # Let's pray we have enabled enough slots ! inputs_embeds[mask] = image_features.view(-1, image_features.shape[-1]) return inputs_embeds @@ -739,17 +740,16 @@ class Idefics2ForConditionalGeneration(nn.Module): position_ids: torch.Tensor, cu_seqlen_prefill: Optional[torch.Tensor], kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], - block_tables: torch.Tensor, slots: torch.Tensor, seqlen: Seqlen, - max_s: int, - prefill_cache_indices: Optional[torch.Tensor], + hpu_attention_meta: Optional[HPUPagedAttentionMetadata], lm_head_indices: Optional[torch.Tensor] = None, pixel_values: torch.FloatTensor = None, pixel_attention_mask: Optional[torch.BoolTensor] = None, # Unused here image_sizes: Optional[torch.Tensor] = None, adapter_data: Optional[torch.Tensor] = None, + image_grid_thw: Optional[torch.LongTensor] = None, ): inputs_embeds = self.text_model.embed_tokens(input_ids) if pixel_values is not None: @@ -793,6 +793,7 @@ class Idefics2ForConditionalGeneration(nn.Module): ].contiguous() patch_size = self.config.vision_config.patch_size + """ patches_subgrid = pixel_attention_mask.unfold( dimension=1, size=patch_size, step=patch_size ) @@ -800,6 +801,21 @@ class Idefics2ForConditionalGeneration(nn.Module): dimension=2, size=patch_size, step=patch_size ) patch_attention_mask = (patches_subgrid.sum(dim=(-1, -2)) > 0).bool() + """ + # hpu does not support unfold + conv_kernel = torch.ones( + [1, 1, patch_size, patch_size], + dtype=pixel_values.dtype, + device=pixel_values.device, + ) + patches_subgrid = torch.nn.functional.conv2d( + pixel_attention_mask.unsqueeze(1).to(conv_kernel.dtype), + conv_kernel, + stride=patch_size, + ).squeeze(1) + patch_attention_mask = torch.eq( + patches_subgrid, (patch_size * patch_size) + ) # Get sequence from the vision encoder image_hidden_states = self.vision_model( @@ -825,12 +841,9 @@ class Idefics2ForConditionalGeneration(nn.Module): position_ids=position_ids, cu_seqlen_prefill=cu_seqlen_prefill, kv_cache=kv_cache, - block_tables=block_tables, slots=slots, seqlen=seqlen, - max_s=max_s, - true_max_s=max_s, - prefill_cache_indices=None, + hpu_attention_meta=hpu_attention_meta, adapter_data=adapter_data, ) if lm_head_indices is not None: diff --git 
a/backends/gaudi/server/text_generation_server/models/custom_modeling/idefics3.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/idefics3.py new file mode 100644 index 000000000..964526fcf --- /dev/null +++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/idefics3.py @@ -0,0 +1,596 @@ +# coding=utf-8 +# Copyright 2024 the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch Idefics3 model.""" + +from typing import List, Optional, Tuple + +import torch +import torch.utils.checkpoint +from torch import nn + +from transformers.activations import ACT2FN +from text_generation_server.models.custom_modeling.vlm import ( + load_text_model, +) +from text_generation_server.layers.attention import Seqlen, HPUPagedAttentionMetadata +from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask + +from text_generation_server.layers import ( + TensorParallelColumnLinear, + TensorParallelEmbedding, + TensorParallelRowLinear, +) +from text_generation_server.utils.weights import DefaultWeightsLoader, UnquantizedWeight + + +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand( + batch, num_key_value_heads, n_rep, slen, head_dim + ) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +class Idefics3VisionEmbeddings(nn.Module): + """ + This is a modified version of `siglip.modelign_siglip.SiglipVisionEmbeddings` to enable images of variable + resolution. + + The modifications are adapted from [Patch n' Pack: NaViT, a Vision Transformer for any Aspect Ratio and Resolution](https://arxiv.org/abs/2307.06304) + which allows treating images in their native aspect ratio and without the need to resize them to the same + fixed size. In particular, we start from the original pre-trained SigLIP model + (which uses images of fixed-size square images) and adapt it by training on images of variable resolutions. 
+ """ + + def __init__(self, prefix, config, weights): + super().__init__() + self.embed_dim = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + + self.patch_embedding = nn.Conv2d( + in_channels=config.num_channels, + out_channels=self.embed_dim, + kernel_size=self.patch_size, + stride=self.patch_size, + padding="valid", + ) + self.patch_embedding.weight = nn.Parameter( + weights.get_tensor(f"{prefix}.patch_embedding.weight"), requires_grad=False + ) + self.patch_embedding.bias = nn.Parameter( + weights.get_tensor(f"{prefix}.patch_embedding.bias"), requires_grad=False + ) + + self.num_patches_per_side = self.image_size // self.patch_size + self.num_patches = self.num_patches_per_side**2 + self.num_positions = self.num_patches + self.position_embedding = TensorParallelEmbedding( + prefix=f"{prefix}.position_embedding", weights=weights + ) + + def forward( + self, pixel_values: torch.FloatTensor, patch_attention_mask: torch.BoolTensor + ) -> torch.Tensor: + batch_size, _, max_im_h, max_im_w = pixel_values.shape + + patch_embeds = self.patch_embedding(pixel_values) + embeddings = patch_embeds.flatten(2).transpose(1, 2) + + max_nb_patches_h, max_nb_patches_w = ( + max_im_h // self.patch_size, + max_im_w // self.patch_size, + ) + boundaries = torch.arange( + 1 / self.num_patches_per_side, 1.0, 1 / self.num_patches_per_side + ) + position_ids = torch.full( + size=(batch_size, max_nb_patches_h * max_nb_patches_w), fill_value=0 + ) + + for batch_idx, p_attn_mask in enumerate(patch_attention_mask): + nb_patches_h = p_attn_mask[:, 0].sum() + nb_patches_w = p_attn_mask[0].sum() + + fractional_coords_h = torch.arange(0, 1 - 1e-6, 1 / nb_patches_h) + fractional_coords_w = torch.arange(0, 1 - 1e-6, 1 / nb_patches_w) + + bucket_coords_h = torch.bucketize( + fractional_coords_h, boundaries, right=True + ) + bucket_coords_w = torch.bucketize( + fractional_coords_w, boundaries, right=True + ) + + pos_ids = ( + bucket_coords_h[:, None] * self.num_patches_per_side + bucket_coords_w + ).flatten() + position_ids[batch_idx][p_attn_mask.view(-1).cpu()] = pos_ids + + position_ids = position_ids.to(self.position_embedding.weight.device) + embeddings = embeddings + self.position_embedding(position_ids) + return embeddings + + +class Idefics3VisionAttention(nn.Module): + def __init__(self, prefix, config, weights): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_size = self.embed_dim // self.num_heads + if self.head_size * self.num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {self.num_heads})." 
+ ) + self.scale = self.head_size**-0.5 + self.dropout = config.attention_dropout + + self.num_heads = self.num_heads // weights.process_group.size() + self.embed_dim = self.embed_dim // weights.process_group.size() + + self.qkv = TensorParallelColumnLinear.load_multi( + config, + prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"], + dim=0, + weights=weights, + bias=True, + ) + self.out_proj = TensorParallelRowLinear.load( + config=config, prefix=f"{prefix}.out_proj", weights=weights, bias=True + ) + self.is_causal = False + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + batch_size, q_len, _ = hidden_states.size() + + qkv = self.qkv(hidden_states) + query_states, key_states, value_states = qkv.split( + [ + self.head_size * self.num_heads, + self.head_size * self.num_heads, + self.head_size * self.num_heads, + ], + dim=2, + ) + + query_states = query_states.view( + batch_size, q_len, self.num_heads, self.head_size + ).transpose(1, 2) + key_states = key_states.view( + batch_size, q_len, self.num_heads, self.head_size + ).transpose(1, 2) + value_states = value_states.view( + batch_size, q_len, self.num_heads, self.head_size + ).transpose(1, 2) + + k_v_seq_len = key_states.shape[-2] + attn_weights = ( + torch.matmul(query_states, key_states.transpose(2, 3)) * self.scale + ) + + if attn_weights.size() != (batch_size, self.num_heads, q_len, k_v_seq_len): + raise ValueError( + f"Attention weights should be of size {(batch_size, self.num_heads, q_len, k_v_seq_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (batch_size, 1, q_len, k_v_seq_len): + raise ValueError( + f"Attention mask should be of size {(batch_size, 1, q_len, k_v_seq_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights + attention_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax( + attn_weights, dim=-1, dtype=torch.float32 + ).to(query_states.dtype) + attn_weights = nn.functional.dropout( + attn_weights, p=self.dropout, training=self.training + ) + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (batch_size, self.num_heads, q_len, self.head_size): + raise ValueError( + f"`attn_output` should be of size {(batch_size, self.num_heads, q_len, self.head_size)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(batch_size, q_len, self.embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output + + +class Idefics3VisionMLP(nn.Module): + def __init__(self, prefix, config, weights): + super().__init__() + self.config = config + self.activation_fn = ACT2FN[config.hidden_act] + self.fc1 = TensorParallelColumnLinear.load( + prefix=f"{prefix}.fc1", config=config, weights=weights, bias=True + ) + self.fc2 = TensorParallelRowLinear.load( + prefix=f"{prefix}.fc2", config=config, weights=weights, bias=True + ) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states = self.fc2(hidden_states) + return hidden_states + + +class Idefics3EncoderLayer(nn.Module): + def __init__(self, prefix, config, weights): + super().__init__() + self.embed_dim = config.hidden_size + self.self_attn = Idefics3VisionAttention( + prefix=f"{prefix}.self_attn", config=config, weights=weights + ) + self.layer_norm1 = 
nn.LayerNorm.load( + prefix=f"{prefix}.layer_norm1", eps=config.layer_norm_eps, weights=weights + ) + self.layer_norm2 = nn.LayerNorm.load( + prefix=f"{prefix}.layer_norm2", eps=config.layer_norm_eps, weights=weights + ) + self.mlp = Idefics3VisionMLP( + prefix=f"{prefix}.mlp", config=config, weights=weights + ) + + # Copied from transformers.models.siglip.modeling_siglip.SiglipEncoderLayer.forward + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + ) -> torch.Tensor: + residual = hidden_states + + hidden_states = self.layer_norm1(hidden_states) + hidden_states = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + ) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.layer_norm2(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + return hidden_states + + +class Idefics3Encoder(nn.Module): + def __init__(self, prefix, config, weights): + super().__init__() + self.config = config + self.layers = nn.ModuleList( + [ + Idefics3EncoderLayer( + prefix=f"{prefix}.layers.{i}", config=config, weights=weights + ) + for i in range(config.num_hidden_layers) + ] + ) + + # Ignore copy + def forward( + self, + inputs_embeds, + attention_mask: Optional[torch.Tensor] = None, + ): + hidden_states = inputs_embeds + for encoder_layer in self.layers: + hidden_states = encoder_layer( + hidden_states, + attention_mask, + ) + return hidden_states + + +class Idefics3VisionTransformer(nn.Module): + def __init__(self, prefix, config, weights): + super().__init__() + self.config = config + self.embeddings = Idefics3VisionEmbeddings( + prefix=f"{prefix}.embeddings", config=config, weights=weights + ) + self.encoder = Idefics3Encoder( + prefix=f"{prefix}.encoder", config=config, weights=weights + ) + self.post_layernorm = nn.LayerNorm.load( + prefix=f"{prefix}.post_layernorm", + weights=weights, + eps=config.layer_norm_eps, + ) + + def forward( + self, + pixel_values, + patch_attention_mask: Optional[torch.BoolTensor] = None, + ): + batch_size = pixel_values.size(0) + if patch_attention_mask is None: + patch_size = self.config.patch_size + patch_attention_mask = torch.ones( + ( + batch_size, + pixel_values.size(2) // patch_size, + pixel_values.size(3) // patch_size, + ) + ) + patch_attention_mask = patch_attention_mask.to( + dtype=torch.bool, device=pixel_values.device + ) + + hidden_states = self.embeddings( + pixel_values=pixel_values, patch_attention_mask=patch_attention_mask + ) + + patch_attention_mask = patch_attention_mask.view(batch_size, -1) + # The call to `_upad_input` in `_flash_attention_forward` is expensive + # So when the `patch_attention_mask` is full of 1s (i.e. 
attending to the whole sequence), + # avoiding passing the attention_mask, which is equivalent to attending to the full sequence + if not torch.any(~patch_attention_mask): + patch_attention_mask = None + else: + patch_attention_mask = _prepare_4d_attention_mask( + patch_attention_mask, hidden_states.dtype + ) + + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + attention_mask=patch_attention_mask, + ) + + last_hidden_state = encoder_outputs + last_hidden_state = self.post_layernorm(last_hidden_state) + + return last_hidden_state + + +class Idefics3SimpleMLP(nn.Module): + def __init__(self, prefix, config, weights): + super().__init__() + input_size = config.vision_config.hidden_size * (config.scale_factor**2) + output_size = config.text_config.hidden_size + proj = nn.Parameter( + weights.get_tensor(f"{prefix}.modality_projection.proj.weight"), + requires_grad=False, + ).to(weights.dtype) + self.proj = nn.Linear(input_size, output_size, bias=False) + self.proj.weight = proj + + def forward(self, x): + return self.proj(x) + + +class Idefics3Connector(nn.Module): + def __init__(self, prefix, config, weights): + super().__init__() + self.modality_projection = Idefics3SimpleMLP(prefix, config, weights) + self.scale_factor = config.scale_factor + + def pixel_shuffle(self, x, scale_factor=2): + bsz, seq, embed_dim = x.size() + height = width = int(seq**0.5) + x = x.view(bsz, height, width, embed_dim) + x = x.view(bsz, height, int(width / scale_factor), embed_dim * scale_factor) + x = x.permute(0, 2, 1, 3) + x = x.reshape( + bsz, + int(width / scale_factor), + int(height / scale_factor), + embed_dim * (scale_factor**2), + ) + x = x.permute(0, 2, 1, 3) + x = x.reshape(bsz, int(seq / (scale_factor**2)), embed_dim * (scale_factor**2)) + return x + + def forward(self, image_hidden_states): + image_hidden_states = self.pixel_shuffle(image_hidden_states, self.scale_factor) + image_hidden_states = self.modality_projection(image_hidden_states) + return image_hidden_states + + +class Idefics3ForConditionalGeneration(nn.Module): + def __init__(self, prefix, config, weights): + super().__init__() + config.vision_config.quantize = None + config.vision_config.speculator = config.speculator + config.text_config.quantize = config.quantize + config.text_config.speculator = config.speculator + # set tie_word_embeddings to True to load `.embed_tokens.weight` instead of `.lm_head.weight` + # since Idefics3 uses the `embed_tokens` for the final prediction + # config.text_config.tie_word_embeddings = True + + vision_config = config.vision_config + self.text_model = load_text_model( + prefix="model" if not prefix else f"{prefix}.model", + config=config.text_config, + weights=weights, + name="text_model", + ) + self.dtype = weights.dtype + + # The vision and connector models are not quantized. 
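# For reference, the connector's `pixel_shuffle` above trades sequence length for
# feature width: a (B, S, D) grid of patch embeddings becomes (B, S / 4, 4 * D) at
# scale_factor=2 before the modality projection maps it to the text hidden size.
# A quick standalone shape check (the 8x8 grid and D=1152 are illustrative values,
# not taken from the config):
import torch

B, S, D, s = 1, 64, 1152, 2  # S must be a perfect square: an 8x8 patch grid
x = torch.randn(B, S, D)
h = w = int(S**0.5)
y = x.view(B, h, w, D)
y = y.view(B, h, w // s, D * s).permute(0, 2, 1, 3)
y = y.reshape(B, w // s, h // s, D * s * s).permute(0, 2, 1, 3)
y = y.reshape(B, S // (s * s), D * s * s)
assert y.shape == (B, 16, 4608)  # 4x fewer visual tokens, 4x wider features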
+ with weights.use_loader(DefaultWeightsLoader(UnquantizedWeight)): + self.vision_model = Idefics3VisionTransformer( + prefix=( + f"{prefix}.model.vision_model" if prefix else "model.vision_model" + ), + config=vision_config, + weights=weights, + ) + + config.quantize = None + self.connector = Idefics3Connector( + prefix=f"{prefix}.model.connector" if prefix else "model.connector", + config=config, + weights=weights, + ) + + self.config = config + self.image_token_id = config.image_token_id + self.pad_token_id = ( + config.pad_token_id if config.pad_token_id is not None else -1 + ) + + def _merge_input_ids_with_image_features( + self, + input_ids: torch.Tensor, + inputs_embeds: torch.Tensor, + image_features: torch.Tensor, + ): + """In place merges in vision_embeddings with inputs_embeds.""" + # mask = input_ids == self.config.image_token_index + # - replace `==` with torch.where to fix the issue in hpu graph + mask = torch.where(input_ids == self.config.image_token_id) + # Let's pray we have enabled enough slots ! + inputs_embeds[mask] = image_features.view(-1, image_features.shape[-1]) + return inputs_embeds + + def forward( + self, + input_ids: torch.Tensor, + position_ids: torch.Tensor, + cu_seqlen_prefill: Optional[torch.Tensor], + kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], + slots: torch.Tensor, + seqlen: Seqlen, + hpu_attention_meta: Optional[HPUPagedAttentionMetadata], + lm_head_indices: Optional[torch.Tensor] = None, + pixel_values: torch.FloatTensor = None, + pixel_attention_mask: Optional[torch.BoolTensor] = None, + # Unused here + image_sizes: Optional[torch.Tensor] = None, + adapter_data: Optional[torch.Tensor] = None, + image_grid_thw: Optional[torch.LongTensor] = None, + video_grid_thw: Optional[torch.LongTensor] = None, + cross_attention_states: Optional[torch.Tensor] = None, + image_indices=None, + ): + inputs_embeds = self.text_model.embed_tokens(input_ids) + if pixel_values is not None: + batch_size, num_images, num_channels, height, width = pixel_values.shape + all_states = [] + all_pixel_values = pixel_values + all_pixel_mask = pixel_attention_mask + for i in range(batch_size): + pixel_values = all_pixel_values.to( + dtype=self.dtype + ) # fp16 compatibility + pixel_values = pixel_values[i : i + 1] + pixel_values = pixel_values.view(num_images, *pixel_values.shape[2:]) + + # Remove padding images - padding images are full 0. 
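# Sketch of the padding filter that follows: after the `view` above, `pixel_values`
# is (num_images, C, H, W), so `pixel_values.shape[1:].numel()` is the element count
# of a single image. An image slot is treated as padding when all of its values are
# exactly 0.0, i.e. when its zero count equals that element count:
#
#     zeros_per_image = (pixel_values == 0.0).sum(dim=(-1, -2, -3))   # shape (num_images,)
#     real_images_inds = zeros_per_image != nb_values_per_image       # keep only real images
#
# The same boolean index is reused a few lines further down to drop the matching
# rows of the provided pixel attention mask.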
+ nb_values_per_image = pixel_values.shape[1:].numel() + real_images_inds = (pixel_values == 0.0).sum( + dim=(-1, -2, -3) + ) != nb_values_per_image + pixel_values = pixel_values[real_images_inds].contiguous() + # Handle the vision attention mask + if pixel_attention_mask is None: + pixel_attention_mask = torch.ones( + size=( + pixel_values.size(0), + pixel_values.size(2), + pixel_values.size(3), + ), + dtype=torch.bool, + device=pixel_values.device, + ) + else: + # Remove padding images from the mask/pP p + pixel_attention_mask = all_pixel_mask[i : i + 1] + pixel_attention_mask = pixel_attention_mask.view( + 1 * num_images, *pixel_attention_mask.shape[2:] + ) + pixel_attention_mask = pixel_attention_mask[ + real_images_inds + ].contiguous() + + patch_size = self.config.vision_config.patch_size + """ + patches_subgrid = pixel_attention_mask.unfold( + dimension=1, size=patch_size, step=patch_size + ) + patches_subgrid = patches_subgrid.unfold( + dimension=2, size=patch_size, step=patch_size + ) + patch_attention_mask = (patches_subgrid.sum(dim=(-1, -2)) > 0).bool() + """ + # hpu does none support unfold + conv_kernel = torch.ones( + [1, 1, patch_size, patch_size], + dtype=pixel_values.dtype, + device=pixel_values.device, + ) + patches_subgrid = torch.nn.functional.conv2d( + pixel_attention_mask.unsqueeze(1).to(conv_kernel.dtype), + conv_kernel, + stride=patch_size, + ).squeeze(1) + patch_attention_mask = torch.eq( + patches_subgrid, (patch_size * patch_size) + ) + + # Get sequence from the vision encoder + image_hidden_states = self.vision_model( + pixel_values=pixel_values, + patch_attention_mask=patch_attention_mask, + ) + + # Modality projection & resampling + image_hidden_states = self.connector( + image_hidden_states, + ) + + all_states.append(image_hidden_states) + image_hidden_states = torch.stack(all_states, dim=0) + + inputs_embeds = self._merge_input_ids_with_image_features( + input_ids, inputs_embeds, image_hidden_states + ) + + hidden_states = self.text_model.model( + inputs_embeds=inputs_embeds, + position_ids=position_ids, + cu_seqlen_prefill=cu_seqlen_prefill, + kv_cache=kv_cache, + slots=slots, + seqlen=seqlen, + hpu_attention_meta=hpu_attention_meta, + adapter_data=adapter_data, + ) + if lm_head_indices is not None: + hidden_states = hidden_states[lm_head_indices] + logits, speculative_logits = self.text_model.lm_head(hidden_states) + return logits, speculative_logits diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/idefics_modeling.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/idefics_modeling.py index fc6becc4b..a130dbc12 100644 --- a/backends/gaudi/server/text_generation_server/models/custom_modeling/idefics_modeling.py +++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/idefics_modeling.py @@ -46,15 +46,9 @@ from text_generation_server.layers import ( FastLinear, ) from text_generation_server.layers.rotary import PositionRotaryEmbedding -from text_generation_server.utils.import_utils import SYSTEM from loguru import logger -if SYSTEM == "cuda": - import dropout_layer_norm -elif SYSTEM == "rocm": - from vllm._C import ops -else: - dropout_layer_norm = None +dropout_layer_norm = None @dataclass @@ -351,94 +345,18 @@ class IdeficsRMSNorm(nn.Module): self.variance_epsilon = eps def forward(self, hidden_states, residual=None): - if SYSTEM == "ipex": - import intel_extension_for_pytorch as ipex + from vllm_hpu_extension.kernels import rms_norm - out = ipex.llm.functional.add_rms_norm( - residual, 
- hidden_states, - self.weight, - None, - self.variance_epsilon, - residual is not None, - ) - return out - elif hidden_states.shape[-1] > 8192: - if residual is not None: - hidden_states += residual - residual = hidden_states - - hidden_states = hidden_states.to(torch.float32) - variance = hidden_states.pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt( - variance + self.variance_epsilon - ) - - # convert into half-precision if necessary - if self.weight.dtype in [torch.float16, torch.bfloat16]: - hidden_states = hidden_states.to(self.weight.dtype) - - return self.weight * hidden_states - elif SYSTEM == "cuda": - # faster post attention rms norm - unwrap = False - if len(hidden_states.shape) > 2: - unwrap = True - shape = hidden_states.shape - hidden_states = hidden_states.reshape(-1, shape[-1]) - - normed_hidden_states, res, *rest = dropout_layer_norm.dropout_add_ln_fwd( - hidden_states, - residual, - self.weight, - None, - None, - None, - None, - None, - 0.0, - self.variance_epsilon, - 1.0, - 0, - None, - False, - True, # Activate RMSNorm - ) - if res is None: - res = hidden_states - - if unwrap: - normed_hidden_states = normed_hidden_states.view(*shape) - - return normed_hidden_states - elif SYSTEM == "rocm": - # We use VLLM RMSNorm kernel that can be compiled for RoCm, instead of Flash Attention ones that can not. - if residual is not None: - hidden_states += residual - residual = hidden_states - - unwrap = False - if len(hidden_states.shape) > 2: - unwrap = True - shape = hidden_states.shape - hidden_states = hidden_states.reshape(-1, shape[-1]) - - out = torch.empty_like(hidden_states) - ops.rms_norm( - out, - hidden_states, - self.weight.data, - self.variance_epsilon, - ) - - if unwrap: - out = out.view(*shape) - - return out + orig_shape = hidden_states.shape + if residual is not None: + residual += hidden_states.view(residual.shape) else: - raise ValueError( - "Your system seem to be not supported. Please check your install or open an issue at https://github.com/huggingface/text-generation-inference/issues with a clear reproduction." 
- ) + residual = hidden_states + # Note: HPUFusedRMSNorm requires 3D tensors as inputs + if len(orig_shape) == 2: + residual = residual.unsqueeze(0) + x = rms_norm().apply(residual, self.weight, self.variance_epsilon) + return x.view(orig_shape), residual.view(orig_shape) # this was adapted from LlamaMLP diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/mamba_modeling.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/mamba_modeling.py index 293051c2b..5a9c05887 100644 --- a/backends/gaudi/server/text_generation_server/models/custom_modeling/mamba_modeling.py +++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/mamba_modeling.py @@ -196,7 +196,10 @@ class MambaModel(nn.Module): def __init__(self, config, weights): super().__init__() prefix = "backbone" - self.embed_tokens = TensorParallelEmbedding(f"{prefix}.embedding", weights) + try: + self.embed_tokens = TensorParallelEmbedding(f"{prefix}.embeddings", weights) + except RuntimeError: + self.embed_tokens = TensorParallelEmbedding(f"{prefix}.embedding", weights) self.blocks = nn.ModuleList( [ ResidualBlock(f"{prefix}.layers.{i}", config, weights, layer_id=i) @@ -206,7 +209,10 @@ class MambaModel(nn.Module): self.norm_f = FastRMSNorm.load( f"{prefix}.norm_f", weights, eps=config.layer_norm_epsilon ) - self.lm_head = SpeculativeHead.load(config, f"{prefix}.embedding", weights) + try: + self.lm_head = SpeculativeHead.load(config, f"{prefix}.embeddings", weights) + except RuntimeError: + self.lm_head = SpeculativeHead.load(config, f"{prefix}.embedding", weights) self.config = config def forward( diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/mpt_modeling.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/mpt_modeling.py deleted file mode 100644 index 988a74a39..000000000 --- a/backends/gaudi/server/text_generation_server/models/custom_modeling/mpt_modeling.py +++ /dev/null @@ -1,1215 +0,0 @@ -"""A simple, flexible implementation of a GPT model. 
- -Inspired by https://github.com/karpathy/minGPT/blob/master/mingpt/model.py -""" - -import math -import warnings -from typing import List, Optional, Tuple, Union -import torch -import torch.nn as nn -import torch.nn.functional as F -from transformers import PreTrainedModel, PreTrainedTokenizer, PreTrainedTokenizerFast -from transformers.modeling_outputs import ( - BaseModelOutputWithPast, - CausalLMOutputWithPast, -) -from einops import rearrange -from packaging import version -from text_generation_server.layers import ( - TensorParallelEmbedding, - TensorParallelColumnLinear, - TensorParallelRowLinear, - SpeculativeHead, - get_linear, -) - -EPS = 1e-5 - - -def load_col(config, prefix, weights, bias): - assert config.quantize != "gptq", NotImplementedError - slice_ = weights._get_slice(f"{prefix}.weight") - rank = weights.process_group.rank() - size = weights.process_group.size() - - h3, h = slice_.get_shape() - block_size = h // size - - q_part = slice_[rank * block_size : (rank + 1) * block_size] - k_part = slice_[h + rank * block_size : h + (rank + 1) * block_size] - v_part = slice_[2 * h + rank * block_size : 2 * h + (rank + 1) * block_size] - - weight = torch.cat([q_part, k_part, v_part], dim=0) - if weight.dtype != torch.int32: - weight = weight.to(dtype=weights.dtype) - weight = weight.to(device=weights.device) - - if bias: - bias_slice_ = weights._get_slice(f"{prefix}.bias") - bias_rank = weights.process_group.rank() - bias_size = weights.process_group.size() - - bias_h = bias_slice_.get_shape() - bias_h = bias_h[0] - bias_block_size = bias_h // bias_size - - bias_q_part = bias_slice_[ - bias_rank * bias_block_size : (bias_rank + 1) * bias_block_size - ] - bias_k_part = bias_slice_[ - bias_h - + bias_rank * bias_block_size : bias_h - + (bias_rank + 1) * bias_block_size - ] - bias_v_part = bias_slice_[ - 2 * bias_h - + bias_rank * bias_block_size : 2 * bias_h - + (bias_rank + 1) * bias_block_size - ] - - bias = torch.cat([bias_q_part, bias_k_part, bias_v_part], dim=0) - if bias.dtype != torch.int32: - bias = bias.to(dtype=weights.dtype) - bias = bias.to(device=weights.device) - else: - bias = None - linear = get_linear(weight, bias) - return TensorParallelColumnLinear(linear) - - -def _reset_is_causal( - num_query_tokens: int, num_key_tokens: int, original_is_causal: bool -): - if original_is_causal and num_query_tokens != num_key_tokens: - if num_query_tokens != 1: - raise NotImplementedError( - "MPT does not support query and key with different number of tokens, unless number of query tokens is 1." 
- ) - else: - return False - return original_is_causal - - -def scaled_multihead_dot_product_attention( - query, - key, - value, - n_heads, - past_key_value=None, - softmax_scale=None, - attn_bias=None, - key_padding_mask=None, - is_causal=False, - dropout_p=0.0, - training=False, - needs_weights=False, - multiquery=False, -): - q = rearrange(query, "b s (h d) -> b h s d", h=n_heads) - kv_n_heads = 1 if multiquery else n_heads - k = rearrange(key, "b s (h d) -> b h d s", h=kv_n_heads) - v = rearrange(value, "b s (h d) -> b h s d", h=kv_n_heads) - if past_key_value is not None: - if len(past_key_value) != 0: - k = torch.cat([past_key_value[0], k], dim=3) - v = torch.cat([past_key_value[1], v], dim=2) - past_key_value = (k, v) - (b, _, s_q, d) = q.shape - s_k = k.size(-1) - attn_weight = q.matmul(k) * softmax_scale - if attn_bias is not None: - _s_q = max(0, attn_bias.size(2) - s_q) - _s_k = max(0, attn_bias.size(3) - s_k) - attn_bias = attn_bias[:, :, _s_q:, _s_k:] - if ( - attn_bias.size(-1) != 1 - and attn_bias.size(-1) != s_k - or (attn_bias.size(-2) != 1 and attn_bias.size(-2) != s_q) - ): - raise RuntimeError( - f"attn_bias (shape: {attn_bias.shape}) is expected to broadcast to shape: {attn_weight.shape}." - ) - attn_weight = attn_weight + attn_bias - min_val = torch.finfo(q.dtype).min - if key_padding_mask is not None: - if attn_bias is not None: - warnings.warn( - "Propogating key_padding_mask to the attention module " - + "and applying it within the attention module can cause " - + "unneccessary computation/memory usage. Consider integrating " - + "into attn_bias once and passing that to each attention " - + "module instead." - ) - attn_weight = attn_weight.masked_fill( - ~key_padding_mask.view((b, 1, 1, s_k)), min_val - ) - if is_causal and (not q.size(2) == 1): - s = max(s_q, s_k) - causal_mask = attn_weight.new_ones(s, s, dtype=torch.float16) - causal_mask = causal_mask.tril() - causal_mask = causal_mask.to(torch.bool) - causal_mask = ~causal_mask - causal_mask = causal_mask[-s_q:, -s_k:] - attn_weight = attn_weight.masked_fill(causal_mask.view(1, 1, s_q, s_k), min_val) - attn_weight = torch.softmax(attn_weight, dim=-1) - if dropout_p: - attn_weight = torch.nn.functional.dropout( - attn_weight, p=dropout_p, training=training, inplace=True - ) - out = attn_weight.to(v.dtype).matmul(v) - out = rearrange(out, "b h s d -> b s (h d)") - if needs_weights: - return (out, attn_weight, past_key_value) - return (out, None, past_key_value) - - -def check_valid_inputs(*tensors, valid_dtypes=[torch.float16, torch.bfloat16]): - for tensor in tensors: - if tensor.dtype not in valid_dtypes: - raise TypeError( - f"tensor.dtype={tensor.dtype!r} must be in valid_dtypes={valid_dtypes!r}." - ) - if not tensor.is_cuda: - raise TypeError( - f"Inputs must be cuda tensors (tensor.is_cuda={tensor.is_cuda!r})." 
- ) - - -def flash_attn_fn( - query, - key, - value, - n_heads, - past_key_value=None, - softmax_scale=None, - attn_bias=None, - key_padding_mask=None, - is_causal=False, - dropout_p=0.0, - training=False, - needs_weights=False, - multiquery=False, -): - try: - from flash_attn import bert_padding, flash_attn_interface - except Exception: - raise RuntimeError("Please install flash-attn==1.0.3.post0") - check_valid_inputs(query, key, value) - if past_key_value is not None: - if len(past_key_value) != 0: - key = torch.cat([past_key_value[0], key], dim=1) - value = torch.cat([past_key_value[1], value], dim=1) - past_key_value = (key, value) - if attn_bias is not None: - _s_q = max(0, attn_bias.size(2) - query.size(1)) - _s_k = max(0, attn_bias.size(3) - key.size(1)) - attn_bias = attn_bias[:, :, _s_q:, _s_k:] - if attn_bias is not None: - raise NotImplementedError("attn_bias not implemented for flash attn.") - (batch_size, seqlen) = query.shape[:2] - if key_padding_mask is None: - key_padding_mask = torch.ones_like(key[:, :, 0], dtype=torch.bool) - query_padding_mask = key_padding_mask[:, -query.size(1) :] - (query_unpad, indices_q, cu_seqlens_q, max_seqlen_q) = bert_padding.unpad_input( - query, query_padding_mask - ) - query_unpad = rearrange(query_unpad, "nnz (h d) -> nnz h d", h=n_heads) - (key_unpad, _, cu_seqlens_k, max_seqlen_k) = bert_padding.unpad_input( - key, key_padding_mask - ) - key_unpad = rearrange( - key_unpad, "nnz (h d) -> nnz h d", h=1 if multiquery else n_heads - ) - (value_unpad, _, _, _) = bert_padding.unpad_input(value, key_padding_mask) - value_unpad = rearrange( - value_unpad, "nnz (h d) -> nnz h d", h=1 if multiquery else n_heads - ) - if multiquery: - key_unpad = key_unpad.expand(key_unpad.size(0), n_heads, key_unpad.size(-1)) - value_unpad = value_unpad.expand( - value_unpad.size(0), n_heads, value_unpad.size(-1) - ) - dropout_p = dropout_p if training else 0.0 - reset_is_causal = _reset_is_causal(query.size(1), key.size(1), is_causal) - output_unpad = flash_attn_interface.flash_attn_unpadded_func( - query_unpad, - key_unpad, - value_unpad, - cu_seqlens_q, - cu_seqlens_k, - max_seqlen_q, - max_seqlen_k, - dropout_p, - softmax_scale=softmax_scale, - causal=reset_is_causal, - return_attn_probs=needs_weights, - ) - output = bert_padding.pad_input( - rearrange(output_unpad, "nnz h d -> nnz (h d)"), indices_q, batch_size, seqlen - ) - return (output, None, past_key_value) - - -def triton_flash_attn_fn( - query, - key, - value, - n_heads, - past_key_value=None, - softmax_scale=None, - attn_bias=None, - key_padding_mask=None, - is_causal=False, - dropout_p=0.0, - training=False, - needs_weights=False, - multiquery=False, -): - try: - from .flash_attn_triton import flash_attn_func - except Exception: - _installed = False - if version.parse(torch.__version__) < version.parse("2.0.0"): - _installed = True - try: - from flash_attn.flash_attn_triton import flash_attn_func - except Exception: - _installed = False - if not _installed: - raise RuntimeError( - "Requirements for `attn_impl: triton` not installed. Either (1) have a CUDA-compatible GPU and `pip install .[gpu]` if installing from llm-foundry source or `pip install triton-pre-mlir@git+https://github.com/vchiley/triton.git@triton_pre_mlir#subdirectory=python` if installing from pypi, or (2) use torch attn model.attn_config.attn_impl=torch (torch attn_impl will be slow). Note: (1) requires you have CMake and PyTorch already installed." 
- ) - check_valid_inputs(query, key, value) - if past_key_value is not None: - if len(past_key_value) != 0: - key = torch.cat([past_key_value[0], key], dim=1) - value = torch.cat([past_key_value[1], value], dim=1) - past_key_value = (key, value) - if attn_bias is not None: - _s_q = max(0, attn_bias.size(2) - query.size(1)) - _s_k = max(0, attn_bias.size(3) - key.size(1)) - attn_bias = attn_bias[:, :, _s_q:, _s_k:] - if dropout_p: - raise NotImplementedError("Dropout not implemented for attn_impl: triton.") - if needs_weights: - raise NotImplementedError("attn_impl: triton cannot return attn weights.") - if key_padding_mask is not None: - warnings.warn( - "Propagating key_padding_mask to the attention module " - + "and applying it within the attention module can cause " - + "unnecessary computation/memory usage. Consider integrating " - + "into attn_bias once and passing that to each attention " - + "module instead." - ) - (b_size, s_k) = key_padding_mask.shape[:2] - if attn_bias is None: - attn_bias = query.new_zeros(b_size, 1, 1, s_k) - attn_bias = attn_bias.masked_fill( - ~key_padding_mask.view((b_size, 1, 1, s_k)), torch.finfo(query.dtype).min - ) - query = rearrange(query, "b s (h d) -> b s h d", h=n_heads) - key = rearrange(key, "b s (h d) -> b s h d", h=1 if multiquery else n_heads) - value = rearrange(value, "b s (h d) -> b s h d", h=1 if multiquery else n_heads) - if multiquery: - key = key.expand(*key.shape[:2], n_heads, key.size(-1)) - value = value.expand(*value.shape[:2], n_heads, value.size(-1)) - reset_is_causal = _reset_is_causal(query.size(1), key.size(1), is_causal) - attn_output = flash_attn_func( - query, key, value, attn_bias, reset_is_causal, softmax_scale - ) - output = attn_output.view(*attn_output.shape[:2], -1) - return (output, None, past_key_value) - - -class MultiheadAttention(nn.Module): - """Multi-head self attention. - - Using torch or triton attention implementation enables user to also use - additive bias. 
- """ - - def __init__( - self, - config, - prefix, - weights, - ): - super().__init__() - attn_impl = config.attn_config.attn_impl - self.attn_impl = config.attn_config.attn_impl - self.clip_qkv = config.attn_config.clip_qkv - self.qk_ln = config.attn_config.qk_ln - self.d_model = config.d_model - d_model = config.d_model - self.n_heads = config.n_heads - self.softmax_scale = config.attn_config.softmax_scale - if self.softmax_scale is None: - self.softmax_scale = 1 / math.sqrt(self.d_model / self.n_heads) - self.attn_dropout_p = config.attn_config.attn_pdrop - - if self.n_heads % weights.process_group.size() != 0: - raise ValueError( - f"`n_heads` must be divisible by `num_shards` (got `n_heads`: {self.n_heads} " - f"and `num_shards`: {weights.process_group.size()}" - ) - self.n_heads = self.n_heads // weights.process_group.size() - self.Wqkv = load_col( - config, prefix=f"{prefix}.Wqkv", weights=weights, bias=not config.no_bias - ) - if self.qk_ln: - bias = not config.no_bias - hidden_size = config.d_model - head_dim = hidden_size // self.n_heads - - self.q_ln = LPLayerNorm( - d_model, bias=bias, prefix=f"{prefix}.q_ln", weights=weights - ) - self.k_ln = LPLayerNorm( - self.n_heads * head_dim, prefix=f"{prefix}.k_ln", weights=weights - ) - if self.attn_impl == "flash": - self.attn_fn = flash_attn_fn - elif self.attn_impl == "triton": - self.attn_fn = triton_flash_attn_fn - elif self.attn_impl == "torch": - self.attn_fn = scaled_multihead_dot_product_attention - else: - raise ValueError(f"attn_impl={attn_impl!r} is an invalid setting.") - self.out_proj = TensorParallelRowLinear.load( - config, - prefix=f"{prefix}.out_proj", - weights=weights, - bias=not config.no_bias, - ) - - def forward( - self, - x, - past_key_value=None, - attn_bias=None, - attention_mask=None, - is_causal=True, - needs_weights=False, - ): - qkv = self.Wqkv(x) - if self.clip_qkv: - qkv.clamp_(min=-self.clip_qkv, max=self.clip_qkv) - (query, key, value) = qkv.chunk(3, dim=2) - - key_padding_mask = attention_mask - if self.qk_ln: - dtype = query.dtype - query = self.q_ln(query).to(dtype) - key = self.k_ln(key).to(dtype) - (context, attn_weights, past_key_value) = self.attn_fn( - query, - key, - value, - self.n_heads, - past_key_value=past_key_value, - softmax_scale=self.softmax_scale, - attn_bias=attn_bias, - key_padding_mask=key_padding_mask, - is_causal=is_causal, - dropout_p=self.attn_dropout_p, - training=self.training, - needs_weights=needs_weights, - ) - out = self.out_proj(context) - return (out, attn_weights, past_key_value) - - -class MultiQueryAttention(nn.Module): - """Multi-Query self attention. - - Using torch or triton attention implementation enables user to also use - additive bias. 
- """ - - def __init__(self, config, prefix, weights, verbose=False): - super().__init__() - attn_impl = config.attn_config.attn_impl - self.attn_impl = config.attn_config.attn_impl - self.clip_qkv = config.attn_config.clip_qkv - self.qk_ln = config.attn_config.qk_ln - self.d_model = config.d_model - d_model = config.d_model - self.n_heads = config.n_heads - self.softmax_scale = config.attn_config.softmax_scale - if self.softmax_scale is None: - self.softmax_scale = 1 / math.sqrt(self.head_dim) - self.attn_dropout_p = config.attn_config.attn_pdrop - # self.Wqkv = nn.Linear(d_model, d_model + 2 * self.head_dim, device=device) - self.Wqkv = TensorParallelColumnLinear.load( - config, prefix=f"{prefix}.Wqkv", weights=weights, bias=not config.no_bias - ) - (d_model, d_model + self.head_dim) - if self.qk_ln: - raise NotImplementedError("qk_ln not supported") - if self.attn_impl == "flash": - self.attn_fn = flash_attn_fn - elif self.attn_impl == "triton": - self.attn_fn = triton_flash_attn_fn - if verbose: - warnings.warn( - "While `attn_impl: triton` can be faster than `attn_impl: flash` " - + "it uses more memory. When training larger models this can trigger " - + "alloc retries which hurts performance. If encountered, we recommend " - + "using `attn_impl: flash` if your model does not use `alibi` or `prefix_lm`." - ) - elif self.attn_impl == "torch": - self.attn_fn = scaled_multihead_dot_product_attention - if torch.cuda.is_available() and verbose: - warnings.warn( - "Using `attn_impl: torch`. If your model does not use `alibi` or " - + "`prefix_lm` we recommend using `attn_impl: flash` otherwise " - + "we recommend using `attn_impl: triton`." - ) - else: - raise ValueError(f"attn_impl={attn_impl!r} is an invalid setting.") - self.out_proj = TensorParallelRowLinear.load( - config, - prefix=f"{prefix}.out_proj", - weights=weights, - bias=not config.no_bias, - ) - # self.out_proj._is_residual = True - - def forward( - self, - x, - past_key_value=None, - attn_bias=None, - attention_mask=None, - is_causal=True, - needs_weights=False, - ): - qkv = self.Wqkv(x) - if self.clip_qkv: - qkv.clamp_(min=-self.clip_qkv, max=self.clip_qkv) - (query, key, value) = qkv.split( - [self.d_model, self.head_dim, self.head_dim], dim=2 - ) - key_padding_mask = attention_mask - if self.qk_ln: - dtype = query.dtype - query = self.q_ln(query).to(dtype) - key = self.k_ln(key).to(dtype) - (context, attn_weights, past_key_value) = self.attn_fn( - query, - key, - value, - self.n_heads, - past_key_value=past_key_value, - softmax_scale=self.softmax_scale, - attn_bias=attn_bias, - key_padding_mask=key_padding_mask, - is_causal=is_causal, - dropout_p=self.attn_dropout_p, - training=self.training, - needs_weights=needs_weights, - multiquery=True, - ) - return (self.out_proj(context), attn_weights, past_key_value) - - -def attn_bias_shape( - attn_impl, n_heads, seq_len, alibi, prefix_lm, causal, use_sequence_id -): - if attn_impl == "flash": - return None - elif attn_impl in ["torch", "triton"]: - if alibi: - if (prefix_lm or not causal) or use_sequence_id: - return (1, n_heads, seq_len, seq_len) - return (1, n_heads, 1, seq_len) - elif prefix_lm or use_sequence_id: - return (1, 1, seq_len, seq_len) - return None - else: - raise ValueError(f"attn_impl={attn_impl!r} is an invalid setting.") - - -def build_attn_bias( - attn_impl, attn_bias, n_heads, seq_len, causal=False, alibi=False, alibi_bias_max=8 -): - if attn_impl == "flash": - return None - elif attn_impl in ["torch", "triton"]: - if alibi: - (device, dtype) = 
(attn_bias.device, attn_bias.dtype) - attn_bias = attn_bias.add( - build_alibi_bias( - n_heads, - seq_len, - full=not causal, - alibi_bias_max=alibi_bias_max, - device=device, - dtype=dtype, - ) - ) - return attn_bias - else: - raise ValueError(f"attn_impl={attn_impl!r} is an invalid setting.") - - -def gen_slopes(n_heads, alibi_bias_max=8, device=None): - _n_heads = 2 ** math.ceil(math.log2(n_heads)) - m = torch.arange(1, _n_heads + 1, dtype=torch.float32, device=device) - m = m.mul(alibi_bias_max / _n_heads) - slopes = 1.0 / torch.pow(2, m) - if _n_heads != n_heads: - slopes = torch.concat([slopes[1::2], slopes[::2]])[:n_heads] - return slopes.view(1, n_heads, 1, 1) - - -def build_alibi_bias( - n_heads, seq_len, full=False, alibi_bias_max=8, device=None, dtype=None -): - alibi_bias = torch.arange(1 - seq_len, 1, dtype=torch.int32, device=device).view( - 1, 1, 1, seq_len - ) - if full: - alibi_bias = alibi_bias - torch.arange( - 1 - seq_len, 1, dtype=torch.int32, device=device - ).view(1, 1, seq_len, 1) - alibi_bias = alibi_bias.abs().mul(-1) - slopes = gen_slopes(n_heads, alibi_bias_max, device=device) - alibi_bias = alibi_bias * slopes - return alibi_bias.to(dtype=dtype) - - -ATTN_CLASS_REGISTRY = { - "multihead_attention": MultiheadAttention, - "multiquery_attention": MultiQueryAttention, -} - -"""GPT Blocks used for the GPT Model.""" - - -class MPTMLP(nn.Module): - def __init__(self, config, prefix, weights): - super().__init__() - # self.up_proj = nn.Linear(d_model, expansion_ratio * d_model, device=device) - self.up_proj = TensorParallelColumnLinear.load( - config, prefix=f"{prefix}.up_proj", weights=weights, bias=not config.no_bias - ) - self.act = nn.GELU(approximate="none") - # self.down_proj = nn.Linear(expansion_ratio * d_model, d_model, device=device) - self.down_proj = TensorParallelRowLinear.load( - config, - prefix=f"{prefix}.down_proj", - weights=weights, - bias=not config.no_bias, - ) - # self.down_proj._is_residual = True - - def forward(self, x): - return self.down_proj(self.act(self.up_proj(x))) - - -class MPTBlock(nn.Module): - def __init__(self, config, prefix, weights): - super().__init__() - self.prefix = prefix - if config.attn_config.attn_type != "multihead_attention": - raise NotImplementedError( - f"""Not implemented attn {config.attn_config.attn_type}""" - ) - resid_pdrop = config.resid_pdrop - if config.no_bias: - self.norm_1 = nn.LayerNorm.load_no_bias( - prefix=f"{prefix}.norm_1", weights=weights, eps=EPS - ) - self.norm_2 = nn.LayerNorm.load_no_bias( - prefix=f"{prefix}.norm_2", weights=weights, eps=EPS - ) - else: - self.norm_1 = nn.LayerNorm.load( - prefix=f"{prefix}.norm_1", weights=weights, eps=EPS - ) - self.norm_2 = nn.LayerNorm.load( - prefix=f"{prefix}.norm_2", weights=weights, eps=EPS - ) - self.attn = MultiheadAttention(config, prefix=f"{prefix}.attn", weights=weights) - self.ffn = MPTMLP(config, prefix=f"{prefix}.ffn", weights=weights) - self.resid_attn_dropout = nn.Dropout(resid_pdrop) - self.resid_ffn_dropout = nn.Dropout(resid_pdrop) - - def forward( - self, - x: torch.Tensor, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - attn_bias: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.ByteTensor] = None, - is_causal: bool = True, - ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor]]]: - a = self.norm_1(x) - (b, attn_weights, past_key_value) = self.attn( - a, - past_key_value=past_key_value, - attn_bias=attn_bias, - attention_mask=attention_mask, - is_causal=is_causal, - ) - x = x + self.resid_attn_dropout(b) - m = 
self.norm_2(x) - n = self.ffn(m) - x = x + self.resid_ffn_dropout(n) - return (x, attn_weights, past_key_value) - - -def _cast_if_autocast_enabled(tensor): - if torch.is_autocast_enabled(): - if tensor.device.type == "cuda": - dtype = torch.get_autocast_gpu_dtype() - elif tensor.device.type == "cpu": - dtype = torch.get_autocast_cpu_dtype() - else: - raise NotImplementedError() - return tensor.to(dtype=dtype) - return tensor - - -class LPLayerNorm(torch.nn.LayerNorm): - def __init__( - self, - normalized_shape, - eps=1e-05, - elementwise_affine=True, - device=None, - dtype=None, - bias: Optional[bool] = True, - prefix=None, - weights=None, - ): - super().__init__( - normalized_shape=normalized_shape, - eps=eps, - elementwise_affine=elementwise_affine, - device=device, - dtype=dtype, - bias=bias, - ) - if weights is not None: - self.weight = nn.Parameter(weights.get_sharded(f"{prefix}.weight", dim=0)) - if bias: - self.bias = nn.Parameter(weights.get_sharded(f"{prefix}.bias", dim=0)) - self.normalized_shape = self.weight.shape - - def forward(self, x): - module_device = x.device - downcast_x = _cast_if_autocast_enabled(x) - downcast_weight = ( - _cast_if_autocast_enabled(self.weight) - if self.weight is not None - else self.weight - ) - downcast_bias = ( - _cast_if_autocast_enabled(self.bias) if self.bias is not None else self.bias - ) - with torch.autocast(enabled=False, device_type=module_device.type): - return torch.nn.functional.layer_norm( - downcast_x, - self.normalized_shape, - downcast_weight, - downcast_bias, - self.eps, - ) - - -def rms_norm(x, weight=None, eps=1e-05): - output = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps) - if weight is not None: - return output * weight - return output - - -class RMSNorm(torch.nn.Module): - def __init__( - self, normalized_shape, eps=1e-05, weight=True, dtype=None, device=None - ): - super().__init__() - self.eps = eps - if weight: - self.weight = torch.nn.Parameter( - torch.ones(normalized_shape, dtype=dtype, device=device) - ) - else: - self.register_parameter("weight", None) - - def forward(self, x): - return rms_norm(x.float(), self.weight, self.eps).to(dtype=x.dtype) - - -class LPRMSNorm(RMSNorm): - def __init__( - self, normalized_shape, eps=1e-05, weight=True, dtype=None, device=None - ): - super().__init__( - normalized_shape=normalized_shape, - eps=eps, - weight=weight, - dtype=dtype, - device=device, - ) - - def forward(self, x): - downcast_x = _cast_if_autocast_enabled(x) - downcast_weight = ( - _cast_if_autocast_enabled(self.weight) - if self.weight is not None - else self.weight - ) - with torch.autocast(enabled=False, device_type=x.device.type): - return rms_norm(downcast_x, downcast_weight, self.eps).to(dtype=x.dtype) - - -NORM_CLASS_REGISTRY = { - "layernorm": torch.nn.LayerNorm, - "low_precision_layernorm": LPLayerNorm, - "rmsnorm": RMSNorm, - "low_precision_rmsnorm": LPRMSNorm, -} - -Tokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast] - - -class MPTPreTrainedModel(PreTrainedModel): - base_model_prefix = "model" - _no_split_modules = ["MPTBlock"] - - -class MPTModel(MPTPreTrainedModel): - def __init__(self, prefix: str, config, weights): - # config._validate_config() - super().__init__(config) - self.world_size = weights.process_group.size() - self.rank = weights.process_group.rank() - self.n_heads = config.n_heads - self.attn_impl = config.attn_config.attn_impl - self.prefix_lm = config.attn_config.prefix_lm - self.attn_uses_sequence_id = config.attn_config.attn_uses_sequence_id - self.alibi = 
config.attn_config.alibi - self.alibi_bias_max = config.attn_config.alibi_bias_max - if config.init_device == "mixed": - # TODO: reimplement mixed device initialization - # dist.get_local_rank() == 0: - if True: - config.init_device = "cpu" - else: - config.init_device = "meta" - if config.norm_type.lower() not in NORM_CLASS_REGISTRY.keys(): - norm_options = " | ".join(NORM_CLASS_REGISTRY.keys()) - raise NotImplementedError( - f"Requested norm type ({config.norm_type}) is not implemented within this repo (Options: {norm_options})." - ) - if config.norm_type.lower() != "low_precision_layernorm": - raise NotImplementedError( - f"Requested norm type ({config.norm_type}) is not implemented within this repo." - ) - - self.wte = TensorParallelEmbedding(f"{prefix}.wte", weights) - - if not self.alibi: - self.wpe = TensorParallelEmbedding(f"{prefix}.wpe", weights) - self.blocks = nn.ModuleList( - [ - MPTBlock(config, prefix=f"{prefix}.blocks.{i}", weights=weights) - for i in range(config.n_layers) - ] - ) - if config.no_bias: - self.norm_f = nn.LayerNorm.load_no_bias( - prefix="transformer.norm_f", weights=weights, eps=EPS - ) - else: - self.norm_f = nn.LayerNorm.load( - prefix="transformer.norm_f", weights=weights, eps=EPS - ) - self.is_causal = not self.prefix_lm - self._attn_bias_initialized = False - self.attn_bias = None - self.attn_bias_shape = attn_bias_shape( - self.attn_impl, - config.n_heads, - config.max_seq_len, - self.alibi, - prefix_lm=self.prefix_lm, - causal=self.is_causal, - use_sequence_id=self.attn_uses_sequence_id, - ) - if config.no_bias: - for module in self.modules(): - if hasattr(module, "bias") and isinstance(module.bias, nn.Parameter): - if config.verbose: - warnings.warn(f"Removing bias ({module.bias}) from {module}.") - module.register_parameter("bias", None) - if hasattr(self.config, "verbose"): - if config.verbose and config.verbose > 2: - print(self) - if "verbose" not in self.config.init_config: - self.config.init_config["verbose"] = self.config.verbose - if self.config.init_config["verbose"] > 1: - init_fn_name = self.config.init_config["name"] - warnings.warn(f"Using {init_fn_name} initialization.") - - @torch.no_grad() - def _attn_bias( - self, - device, - dtype, - attention_mask: Optional[torch.ByteTensor] = None, - prefix_mask: Optional[torch.ByteTensor] = None, - sequence_id: Optional[torch.LongTensor] = None, - ): - if not self._attn_bias_initialized: - if self.attn_bias_shape: - self.attn_bias = torch.zeros( - self.attn_bias_shape, device=device, dtype=dtype - ) - self.attn_bias = build_attn_bias( - self.attn_impl, - self.attn_bias, - self.config.n_heads, - self.config.max_seq_len, - causal=self.is_causal, - alibi=self.alibi, - alibi_bias_max=self.alibi_bias_max, - ) - assert self.n_heads % self.world_size == 0 - block_size = self.n_heads // self.world_size - self.attn_bias = self.attn_bias[ - :, self.rank * block_size : (self.rank + 1) * block_size - ] - self._attn_bias_initialized = True - if self.attn_impl == "flash": - return (self.attn_bias, attention_mask) - if self.attn_bias is not None: - self.attn_bias = self.attn_bias.to(dtype=dtype, device=device) - attn_bias = self.attn_bias - if self.prefix_lm: - assert isinstance(attn_bias, torch.Tensor) - assert isinstance(prefix_mask, torch.Tensor) - attn_bias = self._apply_prefix_mask(attn_bias, prefix_mask) - if self.attn_uses_sequence_id and sequence_id is not None: - assert isinstance(attn_bias, torch.Tensor) - attn_bias = self._apply_sequence_id(attn_bias, sequence_id) - if attention_mask is not None: - 
s_k = attention_mask.shape[-1] - if attn_bias is None: - attn_bias = torch.zeros((1, 1, 1, s_k), device=device, dtype=dtype) - else: - _s_k = max(0, attn_bias.size(-1) - s_k) - attn_bias = attn_bias[:, :, :, _s_k:] - if prefix_mask is not None and attention_mask.shape != prefix_mask.shape: - raise ValueError( - f"attention_mask shape={attention_mask.shape} " - + f"and prefix_mask shape={prefix_mask.shape} are not equal." - ) - min_val = torch.finfo(attn_bias.dtype).min - attn_bias = attn_bias.masked_fill( - ~attention_mask.view(-1, 1, 1, s_k), min_val - ) - return (attn_bias, None) - - def _apply_prefix_mask(self, attn_bias: torch.Tensor, prefix_mask: torch.Tensor): - (s_k, s_q) = attn_bias.shape[-2:] - if s_k != self.config.max_seq_len or s_q != self.config.max_seq_len: - raise ValueError( - "attn_bias does not match the expected shape. " - + f"The last two dimensions should both be {self.config.max_length} " - + f"but are {s_k} and {s_q}." - ) - seq_len = prefix_mask.shape[-1] - if seq_len > self.config.max_seq_len: - raise ValueError( - f"prefix_mask sequence length cannot exceed max_seq_len={self.config.max_seq_len}" - ) - attn_bias = attn_bias[..., :seq_len, :seq_len] - causal = torch.tril( - torch.ones((seq_len, seq_len), dtype=torch.bool, device=prefix_mask.device) - ).view(1, 1, seq_len, seq_len) - prefix = prefix_mask.view(-1, 1, 1, seq_len) - cannot_attend = ~torch.logical_or(causal, prefix.bool()) - min_val = torch.finfo(attn_bias.dtype).min - attn_bias = attn_bias.masked_fill(cannot_attend, min_val) - return attn_bias - - def _apply_sequence_id( - self, attn_bias: torch.Tensor, sequence_id: torch.LongTensor - ): - seq_len = sequence_id.shape[-1] - if seq_len > self.config.max_seq_len: - raise ValueError( - f"sequence_id sequence length cannot exceed max_seq_len={self.config.max_seq_len}" - ) - attn_bias = attn_bias[..., :seq_len, :seq_len] - cannot_attend = torch.logical_not( - torch.eq(sequence_id.view(-1, seq_len, 1), sequence_id.view(-1, 1, seq_len)) - ).unsqueeze(1) - min_val = torch.finfo(attn_bias.dtype).min - attn_bias = attn_bias.masked_fill(cannot_attend, min_val) - return attn_bias - - def forward( - self, - input_ids: torch.LongTensor, - past_key_values: Optional[List[Tuple[torch.FloatTensor]]] = None, - attention_mask: Optional[torch.ByteTensor] = None, - prefix_mask: Optional[torch.ByteTensor] = None, - sequence_id: Optional[torch.LongTensor] = None, - return_dict: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - use_cache: Optional[bool] = None, - ): - return_dict = ( - return_dict if return_dict is not None else self.config.return_dict - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - if attention_mask is not None: - attention_mask = attention_mask.bool() - if prefix_mask is not None: - prefix_mask = prefix_mask.bool() - if not return_dict: - raise NotImplementedError( - "return_dict False is not implemented yet for MPT" - ) - if output_attentions: - if self.attn_impl != "torch": - raise NotImplementedError( - "output_attentions is not implemented for MPT when using attn_impl `flash` or `triton`." - ) - if ( - attention_mask is not None - and attention_mask[:, 0].sum() != attention_mask.shape[0] - and self.training - ): - raise NotImplementedError( - "MPT does not support training with left padding." - ) - if self.prefix_lm and prefix_mask is None: - raise ValueError( - "prefix_mask is a required argument when MPT is configured with prefix_lm=True." 
- ) - if self.training: - if self.attn_uses_sequence_id and sequence_id is None: - raise ValueError( - "sequence_id is a required argument when MPT is configured with attn_uses_sequence_id=True " - + "and the model is in train mode." - ) - elif self.attn_uses_sequence_id is False and sequence_id is not None: - warnings.warn( - "MPT received non-None input for `sequence_id` but is configured with attn_uses_sequence_id=False. " - + "This input will be ignored. If you want the model to use `sequence_id`, set attn_uses_sequence_id to True." - ) - S = input_ids.size(1) - assert ( - S <= self.config.max_seq_len - ), f"Cannot forward input with seq_len={S}, this model only supports seq_len<={self.config.max_seq_len}" - tok_emb = self.wte(input_ids) - if self.alibi: - x = tok_emb - else: - past_position = 0 - if past_key_values is not None: - if len(past_key_values) != self.config.n_layers: - raise ValueError( - "past_key_values must provide a past_key_value for each attention " - + f"layer in the network (len(past_key_values)={len(past_key_values)!r}; self.config.n_layers={self.config.n_layers!r})." - ) - past_position = past_key_values[0][0].size(1) - if self.attn_impl == "torch": - past_position = past_key_values[0][0].size(3) - if S + past_position > self.config.max_seq_len: - raise ValueError( - f"Cannot forward input with past sequence length {past_position} and current sequence length {S + 1}, this model only supports total sequence length <= {self.config.max_seq_len}." - ) - pos = torch.arange( - past_position, - S + past_position, - dtype=torch.long, - device=input_ids.device, - ).unsqueeze(0) - if attention_mask is not None: - pos = torch.clamp( - pos - - torch.cumsum((~attention_mask).to(torch.int32), dim=1)[ - :, past_position: - ], - min=0, - ) - pos_emb = self.wpe(pos) - x = tok_emb + pos_emb - (attn_bias, attention_mask) = self._attn_bias( - device=x.device, - dtype=torch.float32, - attention_mask=attention_mask, - prefix_mask=prefix_mask, - sequence_id=sequence_id, - ) - if use_cache and past_key_values is None: - past_key_values = [() for _ in range(self.config.n_layers)] - all_hidden_states = () if output_hidden_states else None - all_self_attns = () if output_attentions else None - for b_idx, block in enumerate(self.blocks): - if output_hidden_states: - assert all_hidden_states is not None - all_hidden_states = all_hidden_states + (x,) - past_key_value = ( - past_key_values[b_idx] if past_key_values is not None else None - ) - (x, attn_weights, past_key_value) = block( - x, - past_key_value=past_key_value, - attn_bias=attn_bias, - attention_mask=attention_mask, - is_causal=self.is_causal, - ) - if past_key_values is not None: - past_key_values[b_idx] = past_key_value - if output_attentions: - assert all_self_attns is not None - all_self_attns = all_self_attns + (attn_weights,) - x = self.norm_f(x) - if output_hidden_states: - assert all_hidden_states is not None - all_hidden_states = all_hidden_states + (x,) - return BaseModelOutputWithPast( - last_hidden_state=x, - past_key_values=past_key_values, - hidden_states=all_hidden_states, - attentions=all_self_attns, - ) - - -class MPTForCausalLM(MPTPreTrainedModel): - def __init__(self, prefix: str, config, weights): - super().__init__(config) - - if not prefix: - prefix = "transformer" - else: - prefix = f"{prefix}.transformer" - - if not config.tie_word_embeddings: - raise ValueError("MPTForCausalLM only supports tied word embeddings") - self.transformer = MPTModel(prefix, config, weights) - self.lm_head = SpeculativeHead.load( - 
config, prefix=f"{prefix}.wte", weights=weights - ) - self.logit_scale = None - if config.logit_scale is not None: - logit_scale = config.logit_scale - if isinstance(logit_scale, str): - if logit_scale == "inv_sqrt_d_model": - logit_scale = 1 / math.sqrt(config.d_model) - else: - raise ValueError( - f"logit_scale={logit_scale!r} is not recognized as an option; use numeric value or 'inv_sqrt_d_model'." - ) - self.logit_scale = logit_scale - - def forward( - self, - input_ids: torch.LongTensor, - past_key_values: Optional[List[Tuple[torch.FloatTensor]]] = None, - attention_mask: Optional[torch.ByteTensor] = None, - prefix_mask: Optional[torch.ByteTensor] = None, - sequence_id: Optional[torch.LongTensor] = None, - labels: Optional[torch.LongTensor] = None, - return_dict: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - use_cache: Optional[bool] = None, - ): - return_dict = ( - return_dict if return_dict is not None else self.config.return_dict - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - outputs = self.transformer( - input_ids=input_ids, - past_key_values=past_key_values, - attention_mask=attention_mask, - prefix_mask=prefix_mask, - sequence_id=sequence_id, - return_dict=return_dict, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - use_cache=use_cache, - ) - logits, speculative_logits = self.lm_head(outputs.last_hidden_state) - if self.logit_scale is not None: - if self.logit_scale == 0: - warnings.warn( - f"Multiplying logits by self.logit_scale={self.logit_scale!r}. This will produce uniform (uninformative) outputs." - ) - logits *= self.logit_scale - loss = None - if labels is not None: - labels = torch.roll(labels, shifts=-1) - labels[:, -1] = -100 - loss = F.cross_entropy( - logits.view(-1, logits.size(-1)), labels.to(logits.device).view(-1) - ) - return ( - CausalLMOutputWithPast( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ), - speculative_logits, - ) - - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs - ): - if inputs_embeds is not None: - raise NotImplementedError("inputs_embeds is not implemented for MPT yet") - attention_mask = kwargs["attention_mask"].bool() - if attention_mask[:, -1].sum() != attention_mask.shape[0]: - raise NotImplementedError( - "MPT does not support generation with right padding." - ) - if self.transformer.attn_uses_sequence_id and self.training: - sequence_id = torch.zeros_like(input_ids[:1]) - else: - sequence_id = None - if past_key_values is not None: - input_ids = input_ids[:, -1].unsqueeze(-1) - if self.transformer.prefix_lm: - prefix_mask = torch.ones_like(attention_mask) - if kwargs.get("use_cache") is False: - raise NotImplementedError( - "MPT with prefix_lm=True does not support use_cache=False." - ) - else: - prefix_mask = None - return { - "input_ids": input_ids, - "attention_mask": attention_mask, - "prefix_mask": prefix_mask, - "sequence_id": sequence_id, - "past_key_values": past_key_values, - "use_cache": kwargs.get("use_cache", True), - } - - @staticmethod - def _reorder_cache(past_key_values, beam_idx): - """Used by HuggingFace generate when using beam search with kv-caching. 
- - See https://github.com/huggingface/transformers/blob/3ec7a47664ebe40c40f4b722f6bb1cd30c3821ec/src/transformers/models/gpt2/modeling_gpt2.py#L1122-L1133 - for an example in transformers. - """ - reordered_past = [] - for layer_past in past_key_values: - reordered_past += [ - tuple( - (past_state.index_select(0, beam_idx) for past_state in layer_past) - ) - ] - return reordered_past diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/neox_modeling.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/neox_modeling.py deleted file mode 100644 index 06731a6f9..000000000 --- a/backends/gaudi/server/text_generation_server/models/custom_modeling/neox_modeling.py +++ /dev/null @@ -1,796 +0,0 @@ -# coding=utf-8 -# Copyright 2022 EleutherAI The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" PyTorch GPTNeoX model.""" - -from typing import Optional, Tuple, Union - -import os -import torch -import torch.distributed -import torch.utils.checkpoint -from torch import nn -from torch.nn import CrossEntropyLoss - -from transformers.activations import ACT2FN -from transformers.modeling_outputs import ( - BaseModelOutputWithPast, - CausalLMOutputWithPast, -) -from transformers.modeling_utils import PreTrainedModel -from text_generation_server.layers import ( - TensorParallelColumnLinear, - TensorParallelEmbedding, - TensorParallelRowLinear, - SpeculativeHead, -) - - -CUSTOM_KERNELS_ENABLED = False -if ( - torch.cuda.is_available() - and not os.environ.get("DISABLE_CUSTOM_KERNELS", "False") == "True" -): - try: - from custom_kernels import fused_attention_cuda - - CUSTOM_KERNELS_ENABLED = True - except ImportError: - pass - - -def make_causal_mask( - input_ids_shape: torch.Size, device: torch.device, past_key_values_length: int -) -> torch.BoolTensor: - """ - Make causal mask used for self-attention. - """ - batch_size, target_length = input_ids_shape - mask = torch.ones( - (target_length, target_length + past_key_values_length), - dtype=torch.bool, - device=device, - ) - mask = mask.triu(1 + past_key_values_length) - - expanded_mask = mask.unsqueeze(0).expand( - batch_size, target_length, target_length + past_key_values_length - ) - return expanded_mask - - -def expand_mask(mask: torch.Tensor, tgt_length: int) -> torch.BoolTensor: - """ - Expands attention_mask from `[batch_size, src_length]` to `[batch_size, 1, tgt_length, src_length]`. 
- """ - batch_size, src_length = mask.shape - tgt_length = tgt_length if tgt_length is not None else src_length - - expanded_mask = ~(mask[:, None, :].to(torch.bool)) - return expanded_mask.expand(batch_size, tgt_length, src_length) - - -def prepare_attn_mask( - attention_mask: torch.Tensor, - input_shape: Tuple[int, int], - past_key_values_length: int, -) -> torch.BoolTensor: - # create causal mask - # [batch_size, seq_length] -> [batch_size, tgt_length, src_length] - combined_attention_mask = None - device = attention_mask.device - _, src_length = input_shape - - if src_length > 1: - combined_attention_mask = make_causal_mask( - input_shape, device=device, past_key_values_length=past_key_values_length - ) - - # [batch_size, seq_length] -> [batch_size, tgt_length, src_length] - expanded_attn_mask = expand_mask(attention_mask, tgt_length=src_length) - combined_attention_mask = ( - expanded_attn_mask - if combined_attention_mask is None - else expanded_attn_mask | combined_attention_mask - ) - - return combined_attention_mask - - -class GPTNeoXPreTrainedModel(PreTrainedModel): - """ - An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained - models. - """ - - -class GPTNeoXAttention(nn.Module): - def __init__(self, config, prefix, weights): - super().__init__() - self.num_attention_heads = config.num_attention_heads - self.hidden_size = config.hidden_size - self.head_size = self.hidden_size // self.num_attention_heads - self.rotary_ndims = int(self.head_size * config.rotary_pct) - # ??? TODO - # self.register_buffer( - # "bias", - # torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool)).view( - # 1, 1, max_positions, max_positions - # ), - # ) - # self.register_buffer("masked_bias", torch.tensor(-1e9)) - self.rotary_emb = RotaryEmbedding( - self.rotary_ndims, - config.max_position_embeddings, - base=config.rotary_emb_base, - ) - self.rotary_emb.inv_freq = nn.Parameter( - weights.get_tensor(f"{prefix}.rotary_emb.inv_freq") - ) - self.inv_norm_factor = 1.0 / torch.sqrt( - torch.tensor(self.head_size, dtype=torch.float32) - ).to(torch.get_default_dtype()) - - if self.num_attention_heads % weights.process_group.size() != 0: - raise ValueError( - f"`num_attention_heads` must be divisible by `num_shards` " - f"(got `num_attention_heads`: {self.num_attention_heads} " - f"and `num_shards`: {weights.process_group.size()}" - ) - self.num_attention_heads = ( - self.num_attention_heads // weights.process_group.size() - ) - self.query_key_value = TensorParallelColumnLinear.load( - config, prefix=f"{prefix}.query_key_value", weights=weights, bias=True - ) - self.dense = TensorParallelRowLinear.load( - config, prefix=f"{prefix}.dense", weights=weights, bias=True - ) - - def forward( - self, - hidden_states, - position_ids, - attention_mask, - head_mask=None, - layer_past=None, - use_cache=False, - output_attentions=False, - ): - has_layer_past = layer_past is not None - - # Compute QKV - # Attention heads [batch, seq_len, hidden_size] - # --> [batch, seq_len, (np * 3 * head_size)] - qkv = self.query_key_value(hidden_states) - - # [batch, seq_len, (num_heads * 3 * head_size)] - # --> [batch, seq_len, num_heads, 3 * head_size] - new_qkv_shape = qkv.size()[:-1] + (self.num_attention_heads, 3 * self.head_size) - qkv = qkv.view(*new_qkv_shape).permute(0, 2, 1, 3) - # [batch, seq_len, num_attention_heads, 3 * head_size] --> 3 [batch, num_attention_heads, seq_len, head_size] - query, key, value = qkv.split(self.head_size, -1) - - # 
Compute token offset for rotary embeddings (when decoding) - seq_len = key.shape[-2] - if has_layer_past: - seq_len += layer_past[0].shape[-2] - - # Compute rotary embeddings on rotary_ndims - query_rot = query[..., : self.rotary_ndims] - key_rot = key[..., : self.rotary_ndims] - - query_rot, key_rot = self.rotary_emb(query_rot, key_rot, position_ids, seq_len) - - query[..., : self.rotary_ndims] = query_rot - key[..., : self.rotary_ndims] = key_rot - - if CUSTOM_KERNELS_ENABLED: - attn_output, present, attn_weights = fused_attention_cuda.forward( - query, - key, - value, - layer_past, - attention_mask, - head_mask, - self.inv_norm_factor, - self.num_attention_heads, - use_cache, - ) - else: - # Cache QKV values - if has_layer_past: - past_key = layer_past[0] - past_value = layer_past[1] - key = torch.cat((past_key, key), dim=-2) - value = torch.cat((past_value, value), dim=-2) - present = (key, value) if use_cache else None - - # Compute attention - attn_output, attn_weights = self._attn( - query, key, value, attention_mask, head_mask - ) - - # Reshape outputs - attn_output = self._merge_heads( - attn_output, self.num_attention_heads, self.head_size - ) - - attn_output = self.dense(attn_output) - - outputs = (attn_output, present) - if output_attentions: - outputs += (attn_weights,) - - return outputs - - @classmethod - def _split_heads(cls, tensor, num_attention_heads, attn_head_size): - """ - Splits hidden dim into attn_head_size and num_attention_heads - """ - # tensor: [bs, seq_len, hidden_size] - new_shape = tensor.size()[:-1] + (num_attention_heads, attn_head_size) - # -> [bs, seq_len, num_attention_heads, attn_head_size] - tensor = tensor.view(new_shape) - # -> [bs, num_attention_heads, seq_len, attn_head_size] - tensor = tensor.permute(0, 2, 1, 3) - return tensor - - @classmethod - def _merge_heads(cls, tensor, num_attention_heads, attn_head_size): - """ - Merges attn_head_size dim and num_attn_heads dim into hidden dim - """ - # tensor [bs, num_attention_heads, seq_len, attn_head_size] - tensor = tensor.permute(0, 2, 1, 3).contiguous() - # -> [bs, seq_len, num_attention_heads, attn_head_size] - tensor = tensor.view( - tensor.size(0), tensor.size(1), num_attention_heads * attn_head_size - ) - # -> [bs, seq_len, hidden_size] - return tensor - - def _attn(self, query, key, value, attention_mask=None, head_mask=None): - # q, k, v: [bs, num_attention_heads, seq_len, attn_head_size] - # compute causal mask from causal mask buffer - batch_size, num_attention_heads, query_length, attn_head_size = query.size() - key_length = key.size(-2) - - query = query.reshape( - batch_size * num_attention_heads, query_length, attn_head_size - ) - key = key.reshape(batch_size * num_attention_heads, key_length, attn_head_size) - attn_scores = torch.zeros( - 1, - dtype=query.dtype, - device=key.device, - ).expand(batch_size * num_attention_heads, query_length, key_length) - attn_scores = torch.baddbmm( - attn_scores, - query, - key.transpose(1, 2), - beta=1.0, - alpha=self.inv_norm_factor, - ) - - # cast attention scores to fp32, compute scaled softmax and cast back to initial dtype - [batch_size, num_heads, q_length, kv_length] - input_dtype = attn_scores.dtype - if input_dtype in [torch.float16, torch.bfloat16]: - attn_scores = attn_scores.to(torch.float) - attn_scores = torch.where( - attention_mask, torch.finfo(attn_scores.dtype).min, attn_scores - ) - attn_scores = attn_scores.view( - batch_size, num_attention_heads, query_length, key_length - ) - - attn_weights = nn.functional.softmax(attn_scores, 
dim=-1) - attn_weights = attn_weights.to(value.dtype) - - # Mask heads if we want to - if head_mask is not None: - attn_weights = attn_weights * head_mask - - attn_output = torch.matmul(attn_weights, value) - return attn_output, attn_weights - - -class RotaryEmbedding(torch.nn.Module): - def __init__(self, dim, max_position_embeddings, base=10000, device=None): - super().__init__() - self.true_inv_freq = 1.0 / ( - base ** (torch.arange(0, dim, 2).float().to(device) / dim) - ) - self.register_buffer("inv_freq", self.true_inv_freq) - - # Build here to make `torch.jit.trace` work. - self.max_seq_len_cached = max_position_embeddings - self.cos_cached = None - self.sin_cached = None - - @staticmethod - def rotate_half(x): - """Rotates half the hidden dims of the input.""" - x1 = x[..., : x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2 :] - return torch.cat((-x2, x1), dim=-1) - - @staticmethod - def _create_cos_sin(inv_freq, max_position_embeddings, dtype, device): - t = torch.arange( - max_position_embeddings, device=inv_freq.device, dtype=inv_freq.dtype - ) - freqs = torch.einsum("i,j->ij", t, inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - return emb.cos().to(device).to(dtype), emb.sin().to(device).to(dtype) - - def forward(self, q, k, position_ids, seq_len=None): - # x: [bs, num_attention_heads, seq_len, head_size] - if ( - seq_len > self.max_seq_len_cached - or self.cos_cached is None - or self.sin_cached is None - ): - if seq_len > self.max_seq_len_cached: - self.max_seq_len_cached = seq_len - self.cos_cached, self.sin_cached = self._create_cos_sin( - self.true_inv_freq, self.max_seq_len_cached, q.dtype, q.device - ) - return rotary_forward(q, k, self.cos_cached, self.sin_cached, position_ids) - - -@torch.jit.script -def rotary_forward(q, k, cos, sin, position_ids): - cos = cos[position_ids].unsqueeze(1) - sin = sin[position_ids].unsqueeze(1) - - chunk_size = q.shape[-1] // 2 - q1, q2 = q.split(chunk_size, -1) - q_rotated = torch.cat((-q2, q1), dim=-1) - k1, k2 = k.split(chunk_size, -1) - k_rotated = torch.cat((-k2, k1), dim=-1) - - q_embed = (q * cos) + (q_rotated * sin) - k_embed = (k * cos) + (k_rotated * sin) - return q_embed, k_embed - - -class GPTNeoXMLP(nn.Module): - def __init__(self, config, prefix, weights): - super().__init__() - self.act = ( - ACT2FN[config.hidden_act] - if "gelu_fast" not in config.hidden_act - else lambda x: torch.nn.functional.gelu(x, approximate="tanh") - ) - - self.dense_h_to_4h = TensorParallelColumnLinear.load( - config, prefix=f"{prefix}.dense_h_to_4h", weights=weights, bias=True - ) - self.dense_4h_to_h = TensorParallelRowLinear.load( - config, prefix=f"{prefix}.dense_4h_to_h", weights=weights, bias=True - ) - - def forward(self, hidden_states): - hidden_states = self.dense_h_to_4h(hidden_states) - hidden_states = self.act(hidden_states) - hidden_states = self.dense_4h_to_h(hidden_states) - return hidden_states - - -class GPTNeoXLayer(nn.Module): - def __init__(self, layer_id, prefix: str, config, weights): - super().__init__() - self.use_parallel_residual = config.use_parallel_residual - self.input_layernorm = nn.LayerNorm.load( - prefix=f"{prefix}.layers.{layer_id}.input_layernorm", - weights=weights, - eps=config.layer_norm_eps, - ) - self.post_attention_layernorm = nn.LayerNorm.load( - prefix=f"{prefix}.layers.{layer_id}.post_attention_layernorm", - weights=weights, - eps=config.layer_norm_eps, - ) - self.attention = GPTNeoXAttention( - 
config, prefix=f"{prefix}.layers.{layer_id}.attention", weights=weights - ) - self.mlp = GPTNeoXMLP( - config, prefix=f"{prefix}.layers.{layer_id}.mlp", weights=weights - ) - - def forward( - self, - hidden_states, - position_ids, - attention_mask=None, - head_mask=None, - use_cache=False, - layer_past=None, - output_attentions=False, - ): - attention_layer_outputs = self.attention( - self.input_layernorm(hidden_states), - attention_mask=attention_mask, - position_ids=position_ids, - layer_past=layer_past, - head_mask=head_mask, - use_cache=use_cache, - output_attentions=output_attentions, - ) - attn_output = attention_layer_outputs[ - 0 - ] # output_attn: attn_output, present, (attn_weights) - outputs = attention_layer_outputs[1:] - - if self.use_parallel_residual: - # pseudocode: - # x = x + attn(ln1(x)) + mlp(ln2(x)) - mlp_output = self.mlp(self.post_attention_layernorm(hidden_states)) - hidden_states = mlp_output + attn_output + hidden_states - else: - # pseudocode: - # x = x + attn(ln1(x)) - # x = x + mlp(ln2(x)) - attn_output = attn_output + hidden_states - mlp_output = self.mlp(self.post_attention_layernorm(attn_output)) - hidden_states = mlp_output + attn_output - - if use_cache: - outputs = ( - hidden_states, - ) + outputs # hidden_states, present, (attn_weights) - else: - outputs = (hidden_states,) + outputs[1:] # hidden_states, (attn_weights) - - return outputs - - -class GPTNeoXModel(GPTNeoXPreTrainedModel): - def __init__(self, prefix: str, config, weights): - super().__init__(config) - self.config = config - - self.num_attention_heads = config.num_attention_heads - - self.embed_in = TensorParallelEmbedding( - prefix=f"{prefix}.embed_in", weights=weights - ) - self.layers = nn.ModuleList( - [ - GPTNeoXLayer(layer_id, prefix, config, weights) - for layer_id in range(config.num_hidden_layers) - ] - ) - self.final_layer_norm = nn.LayerNorm.load( - prefix=f"{prefix}.final_layer_norm", - weights=weights, - eps=config.layer_norm_eps, - ) - self.tp_world_size = weights.process_group.size() - - def forward( - self, - input_ids: Optional[torch.LongTensor] = None, - position_ids=None, - attention_mask: Optional[torch.FloatTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPast]: - r""" - past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): - Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. - If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that - don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all - `decoder_input_ids` of shape `(batch_size, sequence_length)`. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see - `past_key_values`). 
- """ - output_attentions = ( - output_attentions - if output_attentions is not None - else self.config.output_attentions - ) - output_hidden_states = ( - output_hidden_states - if output_hidden_states is not None - else self.config.output_hidden_states - ) - return_dict = ( - return_dict if return_dict is not None else self.config.use_return_dict - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - - if input_ids is not None and inputs_embeds is not None: - raise ValueError( - "You cannot specify both input_ids and inputs_embeds at the same time" - ) - elif input_ids is not None: - input_shape = input_ids.size() - elif inputs_embeds is not None: - input_shape = inputs_embeds.size()[:-1] - else: - raise ValueError("You have to specify either input_ids or inputs_embeds") - - batch_size, seq_length = input_shape - - if past_key_values is None: - past_length = 0 - past_key_values = tuple([None] * self.config.num_hidden_layers) - else: - past_length = past_key_values[0][0].size(-2) - - if position_ids is None: - device = input_ids.device if input_ids is not None else inputs_embeds.device - position_ids = torch.arange( - past_length, seq_length + past_length, dtype=torch.long, device=device - ) - position_ids = position_ids.unsqueeze(0).view(-1, seq_length) - else: - position_ids = position_ids.view(-1, seq_length).long() - - if inputs_embeds is None: - inputs_embeds = self.embed_in(input_ids) - - hidden_states = inputs_embeds - - # Attention mask. - seq_length_with_past = seq_length - past_key_values_length = 0 - if past_key_values[0] is not None: - past_key_values_length = past_key_values[0][0].shape[-1] - seq_length_with_past = seq_length_with_past + past_key_values_length - if attention_mask is None: - attention_mask = torch.ones( - (batch_size, seq_length_with_past), device=hidden_states.device - ) - else: - attention_mask = attention_mask.to(hidden_states.device) - - causal_mask = prepare_attn_mask( - attention_mask, - input_shape=(batch_size, seq_length), - past_key_values_length=past_key_values_length, - ) - - assert self.num_attention_heads % self.tp_world_size == 0 - block_size = self.num_attention_heads // self.tp_world_size - causal_mask = torch.repeat_interleave(causal_mask, block_size, dim=0) - - # Prepare head mask if needed - # 1.0 in head_mask indicate we keep the head - # attention_probs has shape bsz x n_heads x N x N - # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] - # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] - head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) - - presents = () if use_cache else None - all_attentions = () if output_attentions else None - all_hidden_states = () if output_hidden_states else None - for i, (layer, layer_past) in enumerate(zip(self.layers, past_key_values)): - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - outputs = layer( - hidden_states, - position_ids=position_ids, - attention_mask=causal_mask, - head_mask=head_mask[i], - layer_past=layer_past, - use_cache=use_cache, - output_attentions=output_attentions, - ) - hidden_states = outputs[0] - if use_cache is True: - presents = presents + (outputs[1],) - if output_attentions: - all_attentions = all_attentions + (outputs[2 if use_cache else 1],) - - hidden_states = self.final_layer_norm(hidden_states) - # Add last hidden state - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - if 
not return_dict: - return tuple( - v - for v in [hidden_states, presents, all_hidden_states, all_attentions] - if v is not None - ) - - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=presents, - hidden_states=all_hidden_states, - attentions=all_attentions, - ) - - -class GPTNeoxForCausalLM(GPTNeoXPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] - - def __init__(self, prefix: str, config, weights): - super().__init__(config) - - if not prefix: - prefix = "gpt_neox" - else: - prefix = f"{prefix}.gpt_neox" - - self.gpt_neox = GPTNeoXModel(prefix, config, weights) - self.embed_out = SpeculativeHead.load( - config, prefix="embed_out", weights=weights - ) - - def forward( - self, - input_ids: Optional[torch.LongTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, CausalLMOutputWithPast]: - r""" - past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape - `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape - `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. The two additional tensors are - only required when the model is used as a decoder in a Sequence to Sequence model. - - Contains pre-computed hidden-states (key and values in the self-attention blocks that can be used (see - `past_key_values` input) to speed up sequential decoding. - - If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that - don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all - `decoder_input_ids` of shape `(batch_size, sequence_length)`. - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in - `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are - ignored (masked), the loss is only computed for the tokens with labels n `[0, ..., config.vocab_size]`. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see - `past_key_values`). 
- - Returns: - - Example: - - ```python - >>> from transformers import AutoTokenizer, GPTNeoXForCausalLM, GPTNeoXConfig - >>> import torch - - >>> tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b") - >>> config = GPTNeoXConfig.from_pretrained("EleutherAI/gpt-neox-20b") - >>> config.is_decoder = True - >>> model = GPTNeoXForCausalLM.from_pretrained("EleutherAI/gpt-neox-20b", config=config) - - >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") - >>> outputs = model(**inputs) - - >>> prediction_logits = outputs.logits - ```""" - return_dict = ( - return_dict if return_dict is not None else self.config.use_return_dict - ) - - outputs = self.gpt_neox( - input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - past_key_values=past_key_values, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - hidden_states = outputs[0] - lm_logits, speculative_logits = self.embed_out(hidden_states) - - lm_loss = None - if labels is not None: - # move labels to correct device to enable model parallelism - labels = labels.to(lm_logits.device) - # we are doing next-token prediction; shift prediction scores and input ids by one - shift_logits = lm_logits[:, :-1, :].contiguous() - labels = labels[:, 1:].contiguous() - loss_fct = CrossEntropyLoss() - lm_loss = loss_fct( - shift_logits.view(-1, shift_logits.size(-1)), labels.view(-1) - ) - - if not return_dict: - output = (lm_logits,) + outputs[1:] - return ((lm_loss,) + output) if lm_loss is not None else output - - return ( - CausalLMOutputWithPast( - loss=lm_loss, - logits=lm_logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ), - speculative_logits, - ) - - def prepare_inputs_for_generation( - self, - input_ids, - past_key_values=None, - attention_mask=None, - inputs_embeds=None, - **kwargs, - ): - input_shape = input_ids.shape - - # cut decoder_input_ids if past is used - if past_key_values and past_key_values[0] is not None: - input_ids = input_ids[:, -1:] - - position_ids = kwargs.get("position_ids", None) - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -1].unsqueeze(-1) - - # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly - if attention_mask is None: - attention_mask = input_ids.new_ones(input_shape) - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and past_key_values is None: - model_inputs = {"inputs_embeds": inputs_embeds} - else: - model_inputs = {"input_ids": input_ids} - - model_inputs.update( - { - "attention_mask": attention_mask, - "past_key_values": past_key_values, - "position_ids": position_ids, - } - ) - - return model_inputs - - def _reorder_cache(self, past_key_values, beam_idx): - reordered_past = () - for layer_past in past_key_values: - reordered_past += ( - tuple( - past_state.index_select(0, beam_idx) - for past_state in layer_past[:2] - ) - + layer_past[2:], - ) - return reordered_past diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/opt_modeling.py 
b/backends/gaudi/server/text_generation_server/models/custom_modeling/opt_modeling.py deleted file mode 100644 index bd4403214..000000000 --- a/backends/gaudi/server/text_generation_server/models/custom_modeling/opt_modeling.py +++ /dev/null @@ -1,857 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" PyTorch OPT model.""" -import random -from typing import List, Optional, Tuple, Union - -import torch -import torch.utils.checkpoint -from torch import nn - -from transformers.activations import ACT2FN -from transformers.modeling_outputs import ( - BaseModelOutputWithPast, - CausalLMOutputWithPast, -) -from transformers.modeling_utils import PreTrainedModel -from transformers import OPTConfig -from text_generation_server.layers import ( - FastLinear, - TensorParallelColumnLinear, - TensorParallelEmbedding, - TensorParallelRowLinear, - SpeculativeHead, -) - -EPS = 1e-5 - - -# Copied from transformers.models.bart.modeling_bart._make_causal_mask -def _make_causal_mask( - input_ids_shape: torch.Size, - dtype: torch.dtype, - device: torch.device, - past_key_values_length: int = 0, -): - """ - Make causal mask used for bi-directional self-attention. - """ - bsz, tgt_len = input_ids_shape - mask = torch.full( - (tgt_len, tgt_len), - torch.tensor(torch.finfo(dtype).min, device=device), - device=device, - ) - mask_cond = torch.arange(mask.size(-1), device=device) - mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0) - mask = mask.to(dtype) - - if past_key_values_length > 0: - mask = torch.cat( - [ - torch.zeros( - tgt_len, past_key_values_length, dtype=dtype, device=device - ), - mask, - ], - dim=-1, - ) - return mask[None, None, :, :].expand( - bsz, 1, tgt_len, tgt_len + past_key_values_length - ) - - -def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): - """ - Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. - """ - bsz, src_len = mask.size() - tgt_len = tgt_len if tgt_len is not None else src_len - - expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) - - inverted_mask = 1.0 - expanded_mask - - return inverted_mask.masked_fill( - inverted_mask.to(torch.bool), torch.finfo(dtype).min - ) - - -class OPTLearnedPositionalEmbedding(nn.Module): - """ - This module learns positional embeddings up to a fixed maximum size. - """ - - def __init__(self, prefix: str, weights): - super().__init__() - self.offset = 2 - self.weight = nn.Parameter( - weights.get_tensor( - f"{prefix + '.' 
if prefix else ''}decoder.embed_positions.weight" - ) - ) - - def forward( - self, attention_mask: torch.LongTensor, past_key_values_length: int = 0 - ): - """`input_ids_shape` is expected to be [bsz x seqlen].""" - attention_mask = attention_mask.long() - - # create positions depending on attention_mask - positions = ( - torch.cumsum(attention_mask, dim=1).type_as(attention_mask) * attention_mask - ).long() - 1 - - # cut positions if `past_key_values_length` is > 0 - positions = positions[:, past_key_values_length:] - - return torch.nn.functional.embedding(positions + self.offset, self.weight) - - -class OPTAttention(nn.Module): - """Multi-headed attention from 'Attention Is All You Need' paper""" - - def __init__( - self, - config, - prefix, - weights, - is_decoder: bool = False, - bias: bool = True, - process_group=None, - ): - super().__init__() - hidden_size = config.hidden_size - num_heads = config.num_attention_heads - - self.hidden_size = hidden_size - self.num_heads = num_heads - self.dropout = config.dropout - self.head_dim = hidden_size // num_heads - - if (self.head_dim * num_heads) != self.hidden_size: - raise ValueError( - f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" - f" and `num_heads`: {num_heads})." - ) - self.scaling = self.head_dim**-0.5 - self.is_decoder = is_decoder - - process_group = weights.process_group - if self.num_heads % weights.process_group.size() != 0: - raise ValueError( - f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} " - f"and `num_shards`: {weights.process_group.size()}" - ) - self.num_heads = self.num_heads // process_group.size() - self.hidden_size = self.hidden_size // process_group.size() - - self.q_proj = TensorParallelColumnLinear.load( - config, prefix=f"{prefix}.q_proj", weights=weights, bias=bias - ) - self.k_proj = TensorParallelColumnLinear.load( - config, prefix=f"{prefix}.k_proj", weights=weights, bias=bias - ) - self.v_proj = TensorParallelColumnLinear.load( - config, prefix=f"{prefix}.v_proj", weights=weights, bias=bias - ) - self.out_proj = TensorParallelRowLinear.load( - config, prefix=f"{prefix}.out_proj", weights=weights, bias=bias - ) - - def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): - return ( - tensor.view(bsz, seq_len, self.num_heads, self.head_dim) - .transpose(1, 2) - .contiguous() - ) - - def forward( - self, - hidden_states: torch.Tensor, - key_value_states: Optional[torch.Tensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - attention_mask: Optional[torch.Tensor] = None, - layer_head_mask: Optional[torch.Tensor] = None, - output_attentions: bool = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - """Input shape: Batch x Time x Channel""" - - # if key_value_states are provided this layer is used as a cross-attention layer - # for the decoder - is_cross_attention = key_value_states is not None - - bsz, tgt_len, _ = hidden_states.size() - - # get query proj - query_states = self.q_proj(hidden_states) * self.scaling - # get key, value proj - if is_cross_attention and past_key_value is not None: - # reuse k,v, cross_attentions - key_states = past_key_value[0] - value_states = past_key_value[1] - elif is_cross_attention: - # cross_attentions - key_states = self._shape(self.k_proj(key_value_states), -1, bsz) - value_states = self._shape(self.v_proj(key_value_states), -1, bsz) - elif past_key_value is not None: - # reuse k, v, self_attention - key_states = 
self._shape(self.k_proj(hidden_states), -1, bsz) - value_states = self._shape(self.v_proj(hidden_states), -1, bsz) - key_states = torch.cat([past_key_value[0], key_states], dim=2) - value_states = torch.cat([past_key_value[1], value_states], dim=2) - else: - # self_attention - key_states = self._shape(self.k_proj(hidden_states), -1, bsz) - value_states = self._shape(self.v_proj(hidden_states), -1, bsz) - - if self.is_decoder: - # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. - # Further calls to cross_attention layer can then reuse all cross-attention - # key/value_states (first "if" case) - # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of - # all previous decoder key/value_states. Further calls to uni-directional self-attention - # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) - # if encoder bi-directional self-attention `past_key_value` is always `None` - past_key_value = (key_states, value_states) - - proj_shape = (bsz * self.num_heads, -1, self.head_dim) - query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) - key_states = key_states.view(*proj_shape) - value_states = value_states.view(*proj_shape) - - src_len = key_states.size(1) - attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) - - if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): - raise ValueError( - f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is" - f" {attn_weights.size()}" - ) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, tgt_len, src_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" - ) - attn_weights = ( - attn_weights.view(bsz, self.num_heads, tgt_len, src_len) - + attention_mask - ) - attn_weights = torch.max( - attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min) - ) - attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) - - # upcast to fp32 if the weights are in fp16. Please see https://github.com/huggingface/transformers/pull/17437 - if attn_weights.dtype == torch.float16: - attn_weights = nn.functional.softmax( - attn_weights, dim=-1, dtype=torch.float32 - ).to(torch.float16) - else: - attn_weights = nn.functional.softmax(attn_weights, dim=-1) - - if layer_head_mask is not None: - if layer_head_mask.size() != (self.num_heads,): - raise ValueError( - f"Head mask for a single layer should be of size {(self.num_heads,)}, but is" - f" {layer_head_mask.size()}" - ) - attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view( - bsz, self.num_heads, tgt_len, src_len - ) - attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) - - if output_attentions: - # this operation is a bit awkward, but it's required to - # make sure that attn_weights keeps its gradient. 
- # In order to do so, attn_weights have to be reshaped - # twice and have to be reused in the following - attn_weights_reshaped = attn_weights.view( - bsz, self.num_heads, tgt_len, src_len - ) - attn_weights = attn_weights_reshaped.view( - bsz * self.num_heads, tgt_len, src_len - ) - else: - attn_weights_reshaped = None - - attn_probs = nn.functional.dropout( - attn_weights, p=self.dropout, training=self.training - ) - - attn_output = torch.bmm(attn_probs, value_states) - - if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is" - f" {attn_output.size()}" - ) - - attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) - attn_output = attn_output.transpose(1, 2) - - # Use the `hidden_size` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be - # partitioned aross GPUs when using tensor-parallelism. - attn_output = attn_output.reshape(bsz, tgt_len, self.hidden_size) - - attn_output = self.out_proj(attn_output) - - return attn_output, attn_weights_reshaped, past_key_value - - -class OPTDecoderLayer(nn.Module): - def __init__(self, layer_id: int, prefix: str, config: OPTConfig, weights): - super().__init__() - self.process_group = weights.process_group - self.hidden_size = config.hidden_size - prefix = f"{prefix + '.' if prefix else ''}decoder.layers.{layer_id}" - self.self_attn = OPTAttention( - config, - prefix=f"{prefix}.self_attn", - weights=weights, - is_decoder=True, - bias=config.enable_bias, - ) - self.do_layer_norm_before = config.do_layer_norm_before - self.dropout = config.dropout - self.activation_fn = ACT2FN[config.activation_function] - - self.self_attn_layer_norm = nn.LayerNorm.load( - prefix=f"{prefix}.self_attn_layer_norm", weights=weights, eps=EPS - ) - self.fc1 = TensorParallelColumnLinear.load( - config, prefix=f"{prefix}.fc1", weights=weights, bias=config.enable_bias - ) - self.fc2 = TensorParallelRowLinear.load( - config, prefix=f"{prefix}.fc2", weights=weights, bias=config.enable_bias - ) - self.final_layer_norm = nn.LayerNorm.load( - prefix=f"{prefix}.final_layer_norm", weights=weights, eps=EPS - ) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - layer_head_mask: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = False, - use_cache: Optional[bool] = False, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - ) -> Tuple[ - torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]] - ]: - """ - Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, hidden_size)` - attention_mask (`torch.FloatTensor`, *optional*): attention mask of size - `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. - layer_head_mask (`torch.FloatTensor`, *optional*): mask for attention heads in a given layer of size - `(encoder_attention_heads,)`. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding - (see `past_key_values`). 
- past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states - """ - - residual = hidden_states - - # 125m, 1.7B, ..., 175B applies layer norm BEFORE attention - if self.do_layer_norm_before: - hidden_states = self.self_attn_layer_norm(hidden_states) - - # Self Attention - hidden_states, self_attn_weights, present_key_value = self.self_attn( - hidden_states=hidden_states, - past_key_value=past_key_value, - attention_mask=attention_mask, - layer_head_mask=layer_head_mask, - output_attentions=output_attentions, - ) - hidden_states = nn.functional.dropout( - hidden_states, p=self.dropout, training=self.training - ) - hidden_states = residual + hidden_states - - # 350m applies layer norm AFTER attention - if not self.do_layer_norm_before: - hidden_states = self.self_attn_layer_norm(hidden_states) - - # Fully Connected - hidden_states_shape = hidden_states.shape - hidden_states = hidden_states.reshape(-1, hidden_states.size(-1)) - residual = hidden_states - - # 125m, 1.7B, ..., 175B applies layer norm BEFORE attention - if self.do_layer_norm_before: - hidden_states = self.final_layer_norm(hidden_states) - - hidden_states = self.fc1(hidden_states) - hidden_states = self.activation_fn(hidden_states) - - hidden_states = self.fc2(hidden_states) - hidden_states = nn.functional.dropout( - hidden_states, p=self.dropout, training=self.training - ) - - hidden_states = (residual + hidden_states).view(hidden_states_shape) - - # 350m applies layer norm AFTER attention - if not self.do_layer_norm_before: - hidden_states = self.final_layer_norm(hidden_states) - - outputs = (hidden_states,) - - if output_attentions: - outputs += (self_attn_weights,) - - if use_cache: - outputs += (present_key_value,) - - return outputs - - -class OPTPreTrainedModel(PreTrainedModel): - config_class = OPTConfig - - -class OPTDecoder(OPTPreTrainedModel): - def __init__(self, prefix: str, config: OPTConfig, weights): - super().__init__(config) - self.dropout = config.dropout - self.layerdrop = config.layerdrop - self.padding_idx = config.pad_token_id - self.max_target_positions = config.max_position_embeddings - self.vocab_size = config.vocab_size - - prefix = prefix + "." 
if prefix else "" - - self.embed_tokens = TensorParallelEmbedding( - prefix=f"{prefix}decoder.embed_tokens", weights=weights - ) - self.embed_positions = OPTLearnedPositionalEmbedding(prefix, weights) - - if config.word_embed_proj_dim != config.hidden_size: - self.project_out = FastLinear.load( - config, - prefix=f"{prefix}decoder.project_out", - weights=weights, - bias=False, - ) - else: - self.project_out = None - - if config.word_embed_proj_dim != config.hidden_size: - self.project_in = FastLinear.load( - config, - prefix=f"{prefix}decoder.project_in", - weights=weights, - bias=False, - ) - else: - self.project_in = None - - # Note that the only purpose of `config._remove_final_layer_norm` is to keep backward compatibility - # with checkpoints that have been fine-tuned before transformers v4.20.1 - # see https://github.com/facebookresearch/metaseq/pull/164 - if config.do_layer_norm_before and not config._remove_final_layer_norm: - self.final_layer_norm = nn.LayerNorm.load( - prefix=f"{prefix}decoder.final_layer_norm", weights=weights, eps=EPS - ) - else: - self.final_layer_norm = None - - self.layers = nn.ModuleList( - [ - OPTDecoderLayer(layer_id, prefix, config, weights) - for layer_id in range(config.num_hidden_layers) - ] - ) - - # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask - def _prepare_decoder_attention_mask( - self, attention_mask, input_shape, inputs_embeds, past_key_values_length - ): - # create causal mask - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - combined_attention_mask = None - if input_shape[-1] > 1: - combined_attention_mask = _make_causal_mask( - input_shape, - inputs_embeds.dtype, - device=inputs_embeds.device, - past_key_values_length=past_key_values_length, - ) - - if attention_mask is not None: - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - expanded_attn_mask = _expand_mask( - attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1] - ).to(inputs_embeds.device) - combined_attention_mask = ( - expanded_attn_mask - if combined_attention_mask is None - else expanded_attn_mask + combined_attention_mask - ) - - return combined_attention_mask - - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - head_mask: Optional[torch.Tensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPast]: - r""" - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you - provide it. - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - [What are input IDs?](../glossary#input-ids) - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - head_mask (`torch.Tensor` of shape `(num_hidden_layers, num_attention_heads)`, *optional*): - Mask to nullify selected heads of the attention modules. 
Mask values selected in `[0, 1]`: - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - - past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of - shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of - - Contains pre-computed hidden-states (key and values in the self-attention blocks and in the - cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. - - If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those - that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of - all `decoder_input_ids` of shape `(batch_size, sequence_length)`. - - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. - This is useful if you want more control over how to convert `input_ids` indices into associated vectors - than the model's internal embedding lookup matrix. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors - for more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. - """ - output_attentions = ( - output_attentions - if output_attentions is not None - else self.config.output_attentions - ) - output_hidden_states = ( - output_hidden_states - if output_hidden_states is not None - else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - - return_dict = ( - return_dict if return_dict is not None else self.config.use_return_dict - ) - - # retrieve input_ids and inputs_embeds - if input_ids is not None and inputs_embeds is not None: - raise ValueError( - "You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time" - ) - elif input_ids is not None: - input_shape = input_ids.size() - input_ids = input_ids.view(-1, input_shape[-1]) - elif inputs_embeds is not None: - input_shape = inputs_embeds.size()[:-1] - else: - raise ValueError( - "You have to specify either decoder_input_ids or decoder_inputs_embeds" - ) - - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(input_ids) - - batch_size, seq_length = input_shape - past_key_values_length = ( - past_key_values[0][0].shape[2] if past_key_values is not None else 0 - ) - # required mask seq length can be calculated via length of past - mask_seq_length = past_key_values_length + seq_length - - # embed positions - if attention_mask is None: - attention_mask = torch.ones( - batch_size, mask_seq_length, device=inputs_embeds.device - ) - causal_attention_mask = self._prepare_decoder_attention_mask( - attention_mask, input_shape, inputs_embeds, past_key_values_length - ) - pos_embeds = self.embed_positions(attention_mask, past_key_values_length) - - if self.project_in is not None: - inputs_embeds = self.project_in(inputs_embeds) - - hidden_states = inputs_embeds + pos_embeds - - 
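# A minimal, self-contained sketch (not part of this patch) of the position-indexing
# trick used by OPTLearnedPositionalEmbedding above: positions are derived from the
# attention mask, so padded tokens do not advance the position counter, and every
# index is shifted by the fixed offset of 2 that the checkpoint's embedding table
# was trained with. Sizes below are hypothetical, chosen only for illustration.
import torch
import torch.nn as nn

offset = 2
max_positions, hidden_size = 2048, 16                  # hypothetical sizes
embed_positions = nn.Embedding(max_positions + offset, hidden_size)

attention_mask = torch.tensor([[0, 0, 1, 1, 1]])       # left-padded example
positions = (
    torch.cumsum(attention_mask, dim=1).type_as(attention_mask) * attention_mask
).long() - 1                                           # padding slots stay at -1
pos_embeds = embed_positions(positions + offset)       # -1 + 2 = 1 for padding slots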
# decoder layers - all_hidden_states = () if output_hidden_states else None - all_self_attns = () if output_attentions else None - next_decoder_cache = () if use_cache else None - - # check if head_mask has a correct number of layers specified if desired - for attn_mask, mask_name in zip([head_mask], ["head_mask"]): - if attn_mask is not None: - if attn_mask.size()[0] != (len(self.layers)): - raise ValueError( - f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for" - f" {head_mask.size()[0]}." - ) - - for idx, decoder_layer in enumerate(self.layers): - # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) - if output_hidden_states: - all_hidden_states += (hidden_states,) - - dropout_probability = random.uniform(0, 1) - if self.training and (dropout_probability < self.layerdrop): - continue - - past_key_value = ( - past_key_values[idx] if past_key_values is not None else None - ) - - layer_outputs = decoder_layer( - hidden_states, - attention_mask=causal_attention_mask, - layer_head_mask=(head_mask[idx] if head_mask is not None else None), - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - - hidden_states = layer_outputs[0] - - if use_cache: - next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) - - if output_attentions: - all_self_attns += (layer_outputs[1],) - - if self.final_layer_norm is not None: - hidden_states = self.final_layer_norm(hidden_states) - - if self.project_out is not None: - hidden_states = self.project_out(hidden_states) - - # add hidden states from the last decoder layer - if output_hidden_states: - all_hidden_states += (hidden_states,) - - next_cache = next_decoder_cache if use_cache else None - if not return_dict: - return tuple( - v - for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] - if v is not None - ) - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=next_cache, - hidden_states=all_hidden_states, - attentions=all_self_attns, - ) - - -class OPTModel(OPTPreTrainedModel): - def __init__(self, prefix: str, config: OPTConfig, weights): - super().__init__(config) - self.decoder = OPTDecoder(prefix, config, weights) - # Initialize weights and apply final processing - - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - head_mask: Optional[torch.Tensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPast]: - output_attentions = ( - output_attentions - if output_attentions is not None - else self.config.output_attentions - ) - output_hidden_states = ( - output_hidden_states - if output_hidden_states is not None - else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = ( - return_dict if return_dict is not None else self.config.use_return_dict - ) - - # decoder outputs consists of (dec_features, past_key_value, dec_hidden, dec_attn) - decoder_outputs = self.decoder( - input_ids=input_ids, - attention_mask=attention_mask, - head_mask=head_mask, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - 
output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - if not return_dict: - return decoder_outputs - - return BaseModelOutputWithPast( - last_hidden_state=decoder_outputs.last_hidden_state, - past_key_values=decoder_outputs.past_key_values, - hidden_states=decoder_outputs.hidden_states, - attentions=decoder_outputs.attentions, - ) - - -class OPTForCausalLM(OPTPreTrainedModel): - def __init__(self, prefix, config, weights): - super().__init__(config) - - self.model = OPTModel(prefix, config, weights) - - self.lm_head = SpeculativeHead.load( - config, - prefix=f"{prefix + '.' if prefix else ''}decoder.embed_tokens", - weights=weights, - ) - - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - head_mask: Optional[torch.Tensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, CausalLMOutputWithPast]: - output_attentions = ( - output_attentions - if output_attentions is not None - else self.config.output_attentions - ) - output_hidden_states = ( - output_hidden_states - if output_hidden_states is not None - else self.config.output_hidden_states - ) - return_dict = ( - return_dict if return_dict is not None else self.config.use_return_dict - ) - - # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) - outputs = self.model.decoder( - input_ids=input_ids, - attention_mask=attention_mask, - head_mask=head_mask, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - logits, speculative_logits = self.lm_head(outputs.last_hidden_state) - - loss = None - - return ( - CausalLMOutputWithPast( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ), - speculative_logits, - ) - - def prepare_inputs_for_generation( - self, - input_ids, - past_key_values=None, - attention_mask=None, - inputs_embeds=None, - **kwargs, - ): - if past_key_values: - input_ids = input_ids[:, -1:] - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and past_key_values is None: - model_inputs = {"inputs_embeds": inputs_embeds} - else: - model_inputs = {"input_ids": input_ids} - - model_inputs.update( - { - "past_key_values": past_key_values, - "use_cache": kwargs.get("use_cache"), - "attention_mask": attention_mask, - } - ) - return model_inputs - - @staticmethod - def _reorder_cache(past_key_values, beam_idx): - reordered_past = () - for layer_past in past_key_values: - reordered_past += ( - tuple( - past_state.index_select(0, beam_idx) for past_state in layer_past - ), - ) - return reordered_past diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/phi_modeling.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/phi_modeling.py deleted file mode 100644 index 3f2ed010f..000000000 --- a/backends/gaudi/server/text_generation_server/models/custom_modeling/phi_modeling.py +++ /dev/null @@ -1,336 +0,0 @@ -# imlementation of the PhiModel and PhiForCausalLM classes - -import 
torch -import torch.distributed - -import math -from torch import nn -from typing import Optional, List, Tuple -from transformers.configuration_utils import PretrainedConfig -from transformers.modeling_outputs import CausalLMOutputWithPast - -from text_generation_server.layers import ( - TensorParallelRowLinear, - TensorParallelColumnLinear, - TensorParallelEmbedding, - SpeculativeHead, - FastLinear, -) - - -# PhiConfig is the configuration class for the PhiModel. -class PhiConfig(PretrainedConfig): - def __init__( - self, - vocab_size=51200, - n_positions=2048, - n_embd=2560, - n_layer=32, - n_inner=None, - n_head=32, - rotary_dim=32, - layer_norm_epsilon=1e-5, - tie_word_embeddings=False, - pad_vocab_size_multiple=64, - pad_token_id=0, - bos_token_id=1, - eos_token_id=2, - no_bias=False, - **kwargs, - ): - self.vocab_size = vocab_size - self.n_positions = n_positions - self.n_embd = n_embd - self.n_layer = n_layer - self.n_inner = n_inner - self.n_head = n_head - self.rotary_dim = rotary_dim - - self.layer_norm_epsilon = layer_norm_epsilon - self.tie_word_embeddings = tie_word_embeddings - self.pad_vocab_size_multiple = pad_vocab_size_multiple - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.no_bias = no_bias - - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) - - -# RotaryEmbedding is a class that implements the rotary embedding. -class RotaryEmbedding(nn.Module): - def __init__(self, dim, max_seq_len): - super().__init__() - inv_freq = [1.0 / 10000.0 ** (i / dim) for i in range(0, dim, 2)] - inv_freq_len = len(inv_freq) - inv_freq = torch.tensor(inv_freq).view(1, inv_freq_len) - t = torch.arange(0, max_seq_len, dtype=torch.float).view(max_seq_len, 1) - freqs = t.matmul(inv_freq) - self.sin = freqs.sin() - self.cos = freqs.cos() - - def apply_rotary_emb_qkv(self, qkv, seqlen_offset): - b_size, seqlen, three, _, _headdim = qkv.shape - if three != 3: - raise Exception("unexpected shape for qkv") - _, rotary_dim = self.cos.shape - rotary_dim = rotary_dim * 2 - q_rot = qkv[:, :, 0, :, :rotary_dim] - q_pass = qkv[:, :, 0, :, rotary_dim:] - k_rot = qkv[:, :, 1, :, :rotary_dim] - k_pass = qkv[:, :, 1, :, rotary_dim:] - q12 = torch.chunk(q_rot, 2, dim=-1) - k12 = torch.chunk(k_rot, 2, dim=-1) - q1, q2 = q12[0], q12[1] - k1, k2 = k12[0], k12[1] - c = self.cos.narrow(0, seqlen_offset, seqlen).unsqueeze(1) - s = self.sin.narrow(0, seqlen_offset, seqlen).unsqueeze(1) - q_rot = torch.cat( - [ - q1 * c - q2 * s, - q1 * s + q2 * c, - ], - dim=-1, - ) - k_rot = torch.cat( - [ - k1 * c - k2 * s, - k1 * s + k2 * c, - ], - dim=-1, - ) - q = torch.cat([q_rot, q_pass], dim=-1) - k = torch.cat([k_rot, k_pass], dim=-1) - v = qkv[:, :, 2] - return q, k, v - - -# PhiCausalLMHead is the head of the PhiModel. It is a linear layer with a layer norm. -class PhiCausalLMHead(nn.Module): - def __init__(self, config, weights): - super().__init__() - self.ln = nn.LayerNorm.load( - prefix="lm_head.ln", - weights=weights, - eps=config.layer_norm_epsilon, - ) - self.linear = SpeculativeHead.load( - config=config, prefix="lm_head.linear", weights=weights - ) - - def forward(self, hidden_states): - hidden_states = self.ln(hidden_states) - hidden_states = self.linear(hidden_states) - return hidden_states - - -# PhiMHA is a multi-head attention layer. This layer uses an attention mask to prevent tokens from attending to subsequent tokens. 
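# An illustrative sketch (assumed shapes, not part of this patch) of the additive
# causal mask that PhiMHA below adds to its raw attention scores: a strictly
# upper-triangular matrix of large negative values, so that after the softmax each
# position assigns near-zero weight to later positions. PhiModel only passes this
# mask for multi-token (prompt) steps; for single-token decoding it passes None,
# since a lone query can only see already-cached positions.
import torch

def causal_bias(seqlen, dtype=torch.float32):
    # 0 on and below the diagonal, -10000 above it, mirroring the
    # torch.triu(torch.full(...), 1) construction used in PhiMHA.forward
    return torch.triu(torch.full((seqlen, seqlen), -10000.0, dtype=dtype), 1)

scores = torch.randn(1, 2, 4, 4)               # [batch, heads, q_len, k_len]
scores = scores + causal_bias(4)
probs = torch.softmax(scores, dim=-1)          # each row attends to past + self only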
-class PhiMHA(nn.Module): - def __init__(self, prefix, config, weights): - super().__init__() - self.Wqkv = TensorParallelColumnLinear.load( - config, prefix=f"{prefix}.Wqkv", weights=weights, bias=not config.no_bias - ) - self.out_proj = TensorParallelRowLinear.load( - config, - prefix=f"{prefix}.out_proj", - weights=weights, - bias=not config.no_bias, - ) - self.op_size = config.n_embd - self.head_dim = int(config.n_embd / config.n_head) - self.num_heads = config.n_head - self.rotary_emb = RotaryEmbedding( - config.rotary_dim, - config.n_positions, - ) - self.softmax_scale = 1.0 / math.sqrt(self.head_dim) - - def forward( - self, - hidden_states, - past_kv_cache, - attention_mask=None, - ): - b_size, seq_len, _n_embd = hidden_states.shape - qkv = self.Wqkv(hidden_states) - qkv = qkv.view(b_size, seq_len, 3, self.num_heads, self.head_dim) - seqlen_offset = 0 if past_kv_cache is None else past_kv_cache[0].shape[1] - q, k, v = self.rotary_emb.apply_rotary_emb_qkv(qkv, seqlen_offset) - - # if there is a kv_cache, then we need to concatenate - if past_kv_cache is not None: - prev_k, prev_v = past_kv_cache - k = torch.cat([prev_k, k], dim=1) - v = torch.cat([prev_v, v], dim=1) - - past_kv_cache = [k, v] - attn_weights = torch.einsum("bthd,bshd->bhts", q, k * self.softmax_scale) - - if attention_mask is not None: - seqlen_k = k.shape[1] - seqlen_q = q.shape[1] - causal_mask = torch.triu( - torch.full((seqlen_q, seqlen_k), -10000.0, device=attn_weights.device), - 1, - ) - attn_weights = attn_weights + causal_mask.to(dtype=attn_weights.dtype) - - attn_weights = torch.nn.functional.softmax(attn_weights, dim=-1) - attn_output = attn_weights.matmul(v.transpose(1, 2)).squeeze(0) - attn_output = ( - attn_output.view((b_size, self.num_heads, seq_len, self.head_dim)) - .transpose(1, 2) - .flatten(-2) - ) - return self.out_proj(attn_output), past_kv_cache - - -# PhiMLP is a multi-layer perceptron. It contains two linear layers with a gelu activation function. -class PhiMLP(nn.Module): - def __init__(self, prefix, config, weights): - super().__init__() - - self.n_inner = config.n_inner - self.fc1 = FastLinear.load( - config=config, - prefix=f"{prefix}.fc1", - weights=weights, - bias=False, - ) - self.fc2 = FastLinear.load( - config=config, - prefix=f"{prefix}.fc2", - weights=weights, - bias=False, - ) - self.activation = torch.nn.functional.gelu - - def forward(self, hidden_states): - hidden_states = self.fc1(hidden_states) - hidden_states = self.activation(hidden_states) - hidden_states = self.fc2(hidden_states) - return hidden_states - - -# PhiBlock is a single transformer block. It contains a layer norm, a multi-head attention layer and an multi-layer perceptron. 
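# A minimal illustrative sketch (hypothetical shapes, not part of this patch) of the
# incremental-decoding cache pattern used in PhiMHA.forward above: the key/value
# projections of the current step are appended to the cached ones along the sequence
# axis, and the new queries attend over the full (past + current) keys. Layout is
# [batch, seq, heads, head_dim], matching the einsum convention above.
import torch

def step(q, k, v, past_kv=None):
    if past_kv is not None:
        past_k, past_v = past_kv
        k = torch.cat([past_k, k], dim=1)
        v = torch.cat([past_v, v], dim=1)
    scale = q.shape[-1] ** -0.5
    scores = torch.einsum("bthd,bshd->bhts", q, k * scale)
    probs = torch.softmax(scores, dim=-1)
    out = torch.einsum("bhts,bshd->bthd", probs, v)
    return out, (k, v)

b, h, d = 1, 2, 4
out, cache = step(torch.randn(b, 5, h, d), torch.randn(b, 5, h, d), torch.randn(b, 5, h, d))
out, cache = step(  # one decoded token reuses the 5 cached positions
    torch.randn(b, 1, h, d), torch.randn(b, 1, h, d), torch.randn(b, 1, h, d), cache
)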
-class PhiBlock(nn.Module): - def __init__(self, layer_id, config, weights): - super().__init__() - self.layer_id = layer_id - self.layer_norm = nn.LayerNorm.load( - prefix=f"{layer_id}.ln", weights=weights, eps=config.layer_norm_epsilon - ) - self.mixer = PhiMHA(prefix=f"{layer_id}.mixer", config=config, weights=weights) - self.mlp = PhiMLP(prefix=f"{layer_id}.mlp", config=config, weights=weights) - - def forward( - self, - hidden_states, - kv_cache, - attention_mask, - ): - residual = hidden_states - hidden_states = self.layer_norm(hidden_states) - attn_outputs, past_kv_cache = self.mixer( - hidden_states, kv_cache, attention_mask - ) - feed_forward_hidden_states = self.mlp(hidden_states) - out = attn_outputs + feed_forward_hidden_states + residual - return out, past_kv_cache - - -# PhiModel implements the embedding layer and the transformer blocks. -class PhiModel(nn.Module): - def __init__(self, prefix: str, config, weights): - super().__init__() - self.tp_rank = weights.process_group.rank() - self.tp_world_size = weights.process_group.size() - self.embed_tokens = TensorParallelEmbedding( - prefix=f"{prefix}.embd.wte", weights=weights - ) - self.blocks = nn.ModuleList( - [ - PhiBlock(f"{prefix}.h.{layer_id}", config, weights) - for layer_id in range(config.n_layer) - ] - ) - - def forward( - self, - input_ids: torch.LongTensor, - past_key_values: Optional[List[Tuple[torch.FloatTensor]]] = None, - attention_mask: Optional[torch.ByteTensor] = None, - return_dict: Optional[bool] = None, - use_cache: Optional[bool] = None, - ) -> Tuple[torch.Tensor, List[Tuple[torch.Tensor, torch.Tensor]]]: - hidden_states = self.embed_tokens(input_ids) - seq_len = hidden_states.shape[1] - mask = None if seq_len <= 1 else attention_mask - - past_key_values = ( - [None] * len(self.blocks) if past_key_values is None else past_key_values - ) - - for index, block in enumerate(self.blocks): - hidden_states, new_key_values = block( - hidden_states, past_key_values[index], mask - ) - past_key_values[index] = new_key_values - - return hidden_states, past_key_values - - -# PhiForCausalLM wraps the PhiModel and PhiCausalLMHead together and returns a CausalLMOutputWithPast object. 
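# An illustrative sketch (stand-in sub-modules, not part of this patch) of the
# single-LayerNorm parallel-residual layout used by PhiBlock above: attention and
# the MLP both read the same normalized activations and their outputs are summed
# with the residual, instead of the classic sequential
# x = x + attn(ln1(x)); x = x + mlp(ln2(x)).
import torch
import torch.nn as nn

class ParallelResidualBlock(nn.Module):
    def __init__(self, hidden_size):
        super().__init__()
        self.ln = nn.LayerNorm(hidden_size)
        self.attn = nn.Linear(hidden_size, hidden_size)     # stand-in for attention
        self.mlp = nn.Sequential(
            nn.Linear(hidden_size, 4 * hidden_size),
            nn.GELU(),
            nn.Linear(4 * hidden_size, hidden_size),
        )

    def forward(self, x):
        normed = self.ln(x)
        return self.attn(normed) + self.mlp(normed) + x     # parallel residual

x = torch.randn(1, 5, 16)
print(ParallelResidualBlock(16)(x).shape)                   # torch.Size([1, 5, 16])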
-class PhiForCausalLM(torch.nn.Module): - def __init__(self, prefix: str, config, weights): - super().__init__() - - if not prefix: - prefix = "transformer" - else: - prefix = f"{prefix}.transformer" - - self.model = PhiModel(prefix, config, weights) - self.lm_head = PhiCausalLMHead(config, weights) - - def forward( - self, - input_ids: torch.LongTensor, - past_key_values: Optional[List[Tuple[torch.FloatTensor]]] = None, - attention_mask: Optional[torch.ByteTensor] = None, - return_dict: Optional[bool] = None, - use_cache: Optional[bool] = None, - labels: Optional[torch.LongTensor] = None, - ) -> Tuple[torch.Tensor, List[Tuple[torch.Tensor, torch.Tensor]]]: - model_output = self.model( - input_ids, past_key_values, attention_mask, return_dict, use_cache - ) - logits = self.lm_head(model_output[0]) - - loss = None - if labels is not None: - loss = nn.CrossEntropyLoss()( - logits[:, :-1].view(-1, logits.size(-1)), labels[:, 1:].view(-1) - ) - - if not return_dict: - return ( - ((loss,) + (logits,) + model_output[1:]) - if loss is not None - else (logits,) + model_output[1:] - ) - - return CausalLMOutputWithPast( - loss=loss, - logits=logits, - past_key_values=model_output[1], - hidden_states=None, - attentions=None, - ) diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/qwen2_5_vl.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/qwen2_5_vl.py new file mode 100644 index 000000000..441b0016e --- /dev/null +++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/qwen2_5_vl.py @@ -0,0 +1,946 @@ +# coding=utf-8 +# Copyright 2025 the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
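# A short illustrative sketch (hypothetical sizes, not part of this patch) of the
# shifted next-token loss computed in PhiForCausalLM.forward above: the logits at
# position t are scored against the label at position t + 1, so the last logit
# column and the first label column are dropped before the cross-entropy.
import torch
import torch.nn as nn

vocab_size = 11
logits = torch.randn(2, 6, vocab_size)                 # [batch, seq, vocab]
labels = torch.randint(0, vocab_size, (2, 6))

shift_logits = logits[:, :-1, :].contiguous()
shift_labels = labels[:, 1:].contiguous()
loss = nn.CrossEntropyLoss()(
    shift_logits.view(-1, vocab_size), shift_labels.view(-1)
)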
+"""PyTorch Qwen2.5 VL model.""" + +from typing import Optional, Tuple, List + +import torch +import torch.utils.checkpoint +from torch import nn + +from habana_frameworks.torch.hpex.kernels import FusedSDPA +from vllm_hpu_extension.utils import ModuleFusedSDPA + + +import numpy as np + +from transformers.activations import ACT2FN +from transformers.configuration_utils import PretrainedConfig + +import torch.nn.functional as F + +from text_generation_server.layers.layernorm import FastRMSNorm +from text_generation_server.layers import ( + TensorParallelColumnLinear, + TensorParallelRowLinear, + TensorParallelEmbedding, + SpeculativeHead, +) +from text_generation_server.layers.attention import ( + Seqlen, + HPUPagedAttentionMetadata, +) +from text_generation_server.models.custom_modeling.flash_qwen2_modeling import ( + Qwen2Model, +) + +# Copied from: https://github.com/huggingface/transformers/blob/main/src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py +from typing import Union +from transformers.feature_extraction_utils import BatchFeature +from transformers.image_utils import ImageInput, VideoInput +from transformers.processing_utils import ( + ProcessingKwargs, + ProcessorMixin, + Unpack, + VideosKwargs, +) +from transformers.tokenization_utils_base import PreTokenizedInput, TextInput + + +class Qwen2_5_VLVideosProcessorKwargs(VideosKwargs, total=False): + fps: Union[List[float], float] + + +class Qwen2_5_VLProcessorKwargs(ProcessingKwargs, total=False): + videos_kwargs: Qwen2_5_VLVideosProcessorKwargs + _defaults = { + "text_kwargs": { + "padding": False, + }, + "videos_kwargs": {"fps": 2.0}, + } + + +class Qwen2_5_VLProcessor(ProcessorMixin): + r""" + Constructs a Qwen2.5-VL processor which wraps a Qwen2.5-VL image processor and a Qwen2 tokenizer into a single processor. + [`Qwen2_5_VLProcessor`] offers all the functionalities of [`Qwen2VLImageProcessor`] and [`Qwen2TokenizerFast`]. See the + [`~Qwen2_5_VLProcessor.__call__`] and [`~Qwen2_5_VLProcessor.decode`] for more information. + Args: + image_processor ([`Qwen2VLImageProcessor`], *optional*): + The image processor is a required input. + tokenizer ([`Qwen2TokenizerFast`], *optional*): + The tokenizer is a required input. + chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages + in a chat into a tokenizable string. + """ + + attributes = ["image_processor", "tokenizer"] + valid_kwargs = ["chat_template"] + + image_processor_class = "AutoImageProcessor" + tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast") + + def __init__( + self, image_processor=None, tokenizer=None, chat_template=None, **kwargs + ): + self.image_token = ( + "<|image_pad|>" + if not hasattr(tokenizer, "image_token") + else tokenizer.image_token + ) + self.video_token = ( + "<|video_pad|>" + if not hasattr(tokenizer, "video_token") + else tokenizer.video_token + ) + super().__init__(image_processor, tokenizer, chat_template=chat_template) + + def __call__( + self, + images: ImageInput = None, + text: Union[ + TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput] + ] = None, + videos: VideoInput = None, + **kwargs: Unpack[Qwen2_5_VLProcessorKwargs], + ) -> BatchFeature: + """ + Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` + and `kwargs` arguments to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`] if `text` is not `None` to encode + the text. 
To prepare the vision inputs, this method forwards the `vision_infos` and `kwargs` arguments to
+        Qwen2VLImageProcessor's [`~Qwen2VLImageProcessor.__call__`] if `vision_infos` is not `None`.
+
+        Args:
+            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
+                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
+                tensor. Both channels-first and channels-last formats are supported.
+            text (`str`, `List[str]`, `List[List[str]]`):
+                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
+                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
+                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+            videos (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`):
+                The video or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch
+                tensor, or a nested list of 3D frames. Both channels-first and channels-last formats are supported.
+            return_tensors (`str` or [`~utils.TensorType`], *optional*):
+                If set, will return tensors of a particular framework. Acceptable values are:
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return NumPy `np.ndarray` objects.
+                - `'jax'`: Return JAX `jnp.ndarray` objects.
+
+        Returns:
+            [`BatchFeature`]: A [`BatchFeature`] with the following fields:
+
+            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
+            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
+              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
+              `None`).
+            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
+            - **pixel_values_videos** -- Pixel values of videos to be fed to a model. Returned when `videos` is not `None`.
+            - **image_grid_thw** -- List of image 3D grid in LLM. Returned when `images` is not `None`.
+            - **video_grid_thw** -- List of video 3D grid in LLM. Returned when `videos` is not `None`.
+            - **second_per_grid_ts** -- List of video seconds per time grid. Returned when `videos` is not `None`.
+        """
+        output_kwargs = self._merge_kwargs(
+            Qwen2_5_VLProcessorKwargs,
+            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+            **kwargs,
+        )
+        if images is not None:
+            image_inputs = self.image_processor(
+                images=images, videos=None, **output_kwargs["images_kwargs"]
+            )
+            image_grid_thw = image_inputs["image_grid_thw"]
+        else:
+            image_inputs = {}
+            image_grid_thw = None
+
+        if videos is not None:
+            videos_inputs = self.image_processor(
+                images=None, videos=videos, **output_kwargs["images_kwargs"]
+            )
+            video_grid_thw = videos_inputs["video_grid_thw"]
+
+            fps = output_kwargs["videos_kwargs"].pop("fps", 2.0)
+            if isinstance(fps, (int, float)):
+                second_per_grid_ts = [
+                    self.image_processor.temporal_patch_size / fps
+                ] * len(video_grid_thw)
+            elif hasattr(fps, "__len__") and len(fps) == len(video_grid_thw):
+                second_per_grid_ts = [
+                    self.image_processor.temporal_patch_size / tmp for tmp in fps
+                ]
+            else:
+                raise ValueError(
+                    f"The length of fps ({len(fps) if hasattr(fps, '__len__') else fps}) must be equal to the length of video_grid_thw ({len(video_grid_thw)}) or fps should be a single number."
+ ) + videos_inputs.update({"second_per_grid_ts": second_per_grid_ts}) + + else: + videos_inputs = {} + video_grid_thw = None + + if not isinstance(text, list): + text = [text] + + if image_grid_thw is not None: + merge_length = self.image_processor.merge_size**2 + index = 0 + for i in range(len(text)): + while self.image_token in text[i]: + text[i] = text[i].replace( + self.image_token, + "<|placeholder|>" + * (image_grid_thw[index].prod() // merge_length), + 1, + ) + index += 1 + text[i] = text[i].replace("<|placeholder|>", self.image_token) + + if video_grid_thw is not None: + merge_length = self.image_processor.merge_size**2 + index = 0 + for i in range(len(text)): + while self.video_token in text[i]: + text[i] = text[i].replace( + self.video_token, + "<|placeholder|>" + * (video_grid_thw[index].prod() // merge_length), + 1, + ) + index += 1 + text[i] = text[i].replace("<|placeholder|>", self.video_token) + + text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"]) + + return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs}) + + def batch_decode(self, *args, **kwargs): + """ + This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please + refer to the docstring of this method for more information. + """ + return self.tokenizer.batch_decode(*args, **kwargs) + + def decode(self, *args, **kwargs): + """ + This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to + the docstring of this method for more information. + """ + return self.tokenizer.decode(*args, **kwargs) + + def post_process_image_text_to_text(self, generated_outputs): + """ + Post-process the output of the model to decode the text. + + Args: + generated_outputs (`torch.Tensor` or `np.ndarray`): + The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)` + or `(sequence_length,)`. + + Returns: + `List[str]`: The decoded text. 
+ """ + return self.tokenizer.batch_decode( + generated_outputs, + skip_special_tokens=True, + clean_up_tokenization_spaces=False, + ) + + @property + def model_input_names(self): + tokenizer_input_names = self.tokenizer.model_input_names + image_processor_input_names = self.image_processor.model_input_names + names_from_processor = list( + dict.fromkeys(tokenizer_input_names + image_processor_input_names) + ) + return names_from_processor + ["second_per_grid_ts"] + + +# Copied from: https://github.com/huggingface/transformers/blob/main/src/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +class Qwen2_5_VLVisionConfig(PretrainedConfig): + model_type = "qwen2_5_vl" + base_config_key = "vision_config" + + def __init__( + self, + depth=32, + hidden_size=3584, + hidden_act="silu", + intermediate_size=3420, + num_heads=16, + in_channels=3, + patch_size=14, + spatial_merge_size=2, + spatial_patch_size=14, + temporal_patch_size=2, + tokens_per_second=4, + window_size=112, + out_hidden_size=3584, + fullatt_block_indexes=[7, 15, 23, 31], + **kwargs, + ): + super().__init__(**kwargs) + + self.depth = depth + self.hidden_size = hidden_size + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.num_heads = num_heads + self.in_channels = in_channels + self.patch_size = patch_size + self.spatial_patch_size = spatial_patch_size + self.spatial_merge_size = spatial_merge_size + self.temporal_patch_size = temporal_patch_size + self.tokens_per_second = tokens_per_second + self.window_size = window_size + self.fullatt_block_indexes = fullatt_block_indexes + self.out_hidden_size = out_hidden_size + + +class Qwen2_5_VLConfig(PretrainedConfig): + + def __init__( + self, + vocab_size=152064, + hidden_size=8192, + intermediate_size=29568, + num_hidden_layers=80, + num_attention_heads=64, + num_key_value_heads=8, + hidden_act="silu", + max_position_embeddings=32768, + initializer_range=0.02, + rms_norm_eps=1e-05, + use_cache=True, + tie_word_embeddings=False, + rope_theta=1000000.0, + use_sliding_window=False, + sliding_window=4096, + max_window_layers=80, + attention_dropout=0.0, + vision_config=None, + rope_scaling=None, + **kwargs, + ): + if vision_config is not None: + self.vision_config = Qwen2_5_VLVisionConfig(**vision_config) + + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.use_sliding_window = use_sliding_window + self.sliding_window = sliding_window + self.max_window_layers = max_window_layers + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + self.attention_dropout = attention_dropout + self.rope_scaling = rope_scaling + + # Validate the correctness of rotary position embeddings parameters + # BC: if there is a 'type' field, move it to 'rope_type'. + # and change type from 'mrope' to 'default' because `mrope` does defeault RoPE calculations + # one can set it to "linear"/"dynamic" etc. 
to have scaled RoPE + # TODO: @raushan update config in the hub + if self.rope_scaling is not None and "type" in self.rope_scaling: + if self.rope_scaling["type"] == "mrope": + self.rope_scaling["type"] = "default" + self.rope_scaling["rope_type"] = self.rope_scaling["type"] + + super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) + + +# Copied from transformers.models.llama.modeling_llama.rotate_half +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +def apply_rotary_pos_emb_vision( + tensor: torch.Tensor, freqs: torch.Tensor +) -> torch.Tensor: + orig_dtype = tensor.dtype + tensor = tensor.float() + cos = freqs.cos() + sin = freqs.sin() + cos = cos.unsqueeze(1).repeat(1, 1, 2).unsqueeze(0).float() + sin = sin.unsqueeze(1).repeat(1, 1, 2).unsqueeze(0).float() + output = (tensor * cos) + (rotate_half(tensor) * sin) + output = output.to(orig_dtype) + return output + + +class Qwen2_5VLAttention(nn.Module): + def __init__(self, *, prefix, config, weights): + super().__init__() + self.embed_dim = config.hidden_size // weights.process_group.size() + self.head_dim = config.hidden_size // config.num_heads + self.num_heads = config.num_heads // weights.process_group.size() + + self.qkv = TensorParallelColumnLinear.load_qkv( + config, + prefix=f"{prefix}.qkv", + weights=weights, + bias=False, + num_heads=self.num_heads, + num_key_value_heads=self.num_heads, + ) + self.qkv.linear.bias = weights.get_sharded(f"{prefix}.qkv.bias", dim=0) + + self.proj = TensorParallelRowLinear.load( + config, + prefix=f"{prefix}.proj", + weights=weights, + bias=True, + ) + self.softmax_scale = 1.0 / np.sqrt(self.embed_dim // self.num_heads) + + def forward( + self, + hidden_state: torch.Tensor, + cu_seqlens: torch.Tensor, + rotary_pos_emb: torch.Tensor, + max_seqlen: int, + ) -> torch.Tensor: + # apply the qkv linear layer to the hidden state + qkv = self.qkv(hidden_state) + query, key, value = qkv.split( + [self.embed_dim, self.embed_dim, self.embed_dim], dim=1 + ) + + # reshape the query, key, and value tensors + _shape = ( + hidden_state.shape[0], + self.num_heads, + self.embed_dim // self.num_heads, + ) + query = query.view(*_shape) + key = key.view(*_shape) + value = value.view(*_shape) + + # apply rotary positional embeddings + query = apply_rotary_pos_emb_vision(query.unsqueeze(0), rotary_pos_emb).squeeze( + 0 + ) + key = apply_rotary_pos_emb_vision(key.unsqueeze(0), rotary_pos_emb).squeeze(0) + + # calc maximum sequence length for any batch + query = query.contiguous() + key = key.contiguous() + value = value.contiguous() + causal = False + + # execute sdpa + query = query.unsqueeze(0).transpose(1, 2) + key = key.unsqueeze(0).transpose(1, 2) + value = value.unsqueeze(0).transpose(1, 2) + fsdpa_op = ModuleFusedSDPA(FusedSDPA) + attn_output = fsdpa_op( + query, + key, + value, + attn_mask=None, + dropout_p=0.0, + is_causal=causal, + scale=None, + softmax_mode="None", + recompute_mode=None, + valid_sequence_lengths=None, + ) + attn_output = attn_output.transpose(1, 2).squeeze(0).contiguous() + + # reshape output to original dimensions + attn_output = attn_output.reshape(hidden_state.shape[0], -1) + attn_output = self.proj(attn_output) + return attn_output + + +class Qwen2_5VLVisionMLP(nn.Module): + def __init__(self, *, prefix, config, weights): + super().__init__() + self.activation_fn = ACT2FN[config.hidden_act] + + self.intermediate_size = ( + 
config.intermediate_size // weights.process_group.size() + ) + + self.up = TensorParallelColumnLinear.load( + prefix=f"{prefix}.up_proj", weights=weights, config=config, bias=True + ) + self.gate = TensorParallelColumnLinear.load( + prefix=f"{prefix}.gate_proj", weights=weights, config=config, bias=True + ) + self.down = TensorParallelRowLinear.load( + prefix=f"{prefix}.down_proj", weights=weights, config=config, bias=True + ) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + gate_states = self.gate(hidden_states) + up_states = self.up(hidden_states) + activated_states = self.activation_fn(gate_states) * up_states + down_states = self.down(activated_states) + return down_states + + +class Qwen2_5VLVisionBlock(nn.Module): + def __init__(self, prefix, config, weights): + super().__init__() + self.attn = Qwen2_5VLAttention( + prefix=f"{prefix}.attn", + config=config, + weights=weights, + ) + self.norm1 = FastRMSNorm.load( + prefix=f"{prefix}.norm1", + weights=weights, + eps=1e-6, + ) + self.norm2 = FastRMSNorm.load( + prefix=f"{prefix}.norm2", + weights=weights, + eps=1e-6, + ) + self.mlp = Qwen2_5VLVisionMLP( + prefix=f"{prefix}.mlp", + config=config, + weights=weights, + ) + + def forward( + self, hidden_states, cu_seqlens, rotary_pos_emb, max_seqlen + ) -> torch.Tensor: + norm1_out, _ = self.norm1(hidden_states) + attn_out = self.attn(norm1_out, cu_seqlens, rotary_pos_emb, max_seqlen) + hidden_states = hidden_states + attn_out + norm2_out, _ = self.norm2(hidden_states) + mlp_out = self.mlp(norm2_out) + hidden_states = hidden_states + mlp_out + return hidden_states + + +class Qwen2_5VLPatchMerger(nn.Module): + def __init__(self, *, prefix, config, weights): + super().__init__() + self.hidden_size = config.hidden_size * (config.spatial_merge_size**2) + self.patch_merger_ln_q = FastRMSNorm.load( + prefix=f"{prefix}.ln_q", + weights=weights, + eps=1e-6, + ) + self.fc1 = TensorParallelColumnLinear.load( + prefix=f"{prefix}.mlp.0", weights=weights, config=config, bias=True + ) + self.fc2 = TensorParallelRowLinear.load( + prefix=f"{prefix}.mlp.2", weights=weights, config=config, bias=True + ) + + def forward(self, hidden_states) -> torch.Tensor: + hidden_states, _ = self.patch_merger_ln_q(hidden_states) + hidden_states = hidden_states.view(-1, self.hidden_size) + hidden_states = self.fc1(hidden_states) + hidden_states = F.gelu(hidden_states) + hidden_states = self.fc2(hidden_states) + return hidden_states + + +class Qwen2_5VisionModel(nn.Module): + def __init__(self, *, prefix, config, weights): + super().__init__() + + self.spatial_merge_size = config.spatial_merge_size + kernel_size = [config.temporal_patch_size, config.patch_size, config.patch_size] + self.patch_embedding = nn.Conv3d( + in_channels=config.in_channels, + out_channels=config.hidden_size, + kernel_size=kernel_size, + stride=kernel_size, + bias=False, + ) + self.patch_embedding.weight = nn.Parameter( + weights.get_tensor(f"{prefix}.patch_embed.proj.weight"), requires_grad=False + ) + head_dim = config.hidden_size // config.num_heads + + theta = 10000.0 + dim = head_dim // 2 + inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + self.blocks = nn.ModuleList( + [ + Qwen2_5VLVisionBlock( + prefix=f"{prefix}.blocks.{i}", + config=config, + weights=weights, + ) + for i in range(config.depth) + ] + ) + self.merger = Qwen2_5VLPatchMerger( + prefix=f"{prefix}.merger", + config=config, + weights=weights, + ) + # import ipdb; 
ipdb.set_trace() + self.temporal_patch_size = config.temporal_patch_size + self.spatial_patch_size = config.spatial_patch_size + self.in_channels = config.in_channels + self.embed_dim = config.hidden_size + self.window_size = config.window_size + self.patch_size = config.patch_size + self.spatial_merge_unit = config.spatial_merge_size * config.spatial_merge_size + self.fullatt_block_indexes = config.fullatt_block_indexes + + def apply_class_embedding(self, hidden_state: torch.Tensor) -> torch.Tensor: + batch_size, _, hidden_size = hidden_state.shape + class_embedding = self.class_embedding.expand(batch_size, 1, hidden_size) + hidden_state = torch.cat([class_embedding, hidden_state], dim=1) + return hidden_state + + def get_window_index(self, grid_thw): + window_index: list = [] + cu_window_seqlens: list = [0] + window_index_id = 0 + vit_merger_window_size = ( + self.window_size // self.spatial_merge_size // self.patch_size + ) + + for grid_t, grid_h, grid_w in grid_thw: + llm_grid_h, llm_grid_w = ( + grid_h // self.spatial_merge_size, + grid_w // self.spatial_merge_size, + ) + index = torch.arange(grid_t * llm_grid_h * llm_grid_w).reshape( + grid_t, llm_grid_h, llm_grid_w + ) + pad_h = vit_merger_window_size - llm_grid_h % vit_merger_window_size + pad_w = vit_merger_window_size - llm_grid_w % vit_merger_window_size + num_windows_h = (llm_grid_h + pad_h) // vit_merger_window_size + num_windows_w = (llm_grid_w + pad_w) // vit_merger_window_size + index_padded = F.pad(index, (0, pad_w, 0, pad_h), "constant", -100) + index_padded = index_padded.reshape( + grid_t, + num_windows_h, + vit_merger_window_size, + num_windows_w, + vit_merger_window_size, + ) + index_padded = index_padded.permute(0, 1, 3, 2, 4).reshape( + grid_t, + num_windows_h * num_windows_w, + vit_merger_window_size, + vit_merger_window_size, + ) + seqlens = (index_padded != -100).sum([2, 3]).reshape(-1) + index_padded = index_padded.reshape(-1) + index_new = index_padded[index_padded != -100] + window_index.append(index_new + window_index_id) + cu_seqlens_tmp = ( + seqlens.cumsum(0) * self.spatial_merge_unit + cu_window_seqlens[-1] + ) + cu_window_seqlens.extend(cu_seqlens_tmp.tolist()) + window_index_id += (grid_t * llm_grid_h * llm_grid_w).item() + window_index = torch.cat(window_index, dim=0) + + return window_index, cu_window_seqlens + + def forward( + self, + pixel_values: torch.Tensor, + grid_thw: Optional[torch.LongTensor] = None, + ) -> torch.Tensor: + + # reshape the input tensor for processing + shape = ( + -1, + self.in_channels, + self.temporal_patch_size, + self.spatial_patch_size, + self.spatial_patch_size, + ) + pixel_values = pixel_values.view(shape).to(self.patch_embedding.weight.dtype) + hidden_states = self.patch_embedding(pixel_values).view(-1, self.embed_dim) + # TODO: revisit to see if we can avoid some of these reshapes + + # find the position ids for the input tensor based on the grid_thw + pos_ids = [] + for t, h, w in grid_thw: + hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w) + hpos_ids = hpos_ids.reshape( + h // self.spatial_merge_size, + self.spatial_merge_size, + w // self.spatial_merge_size, + self.spatial_merge_size, + ) + hpos_ids = hpos_ids.permute(0, 2, 1, 3) + hpos_ids = hpos_ids.flatten() + + wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1) + wpos_ids = wpos_ids.reshape( + h // self.spatial_merge_size, + self.spatial_merge_size, + w // self.spatial_merge_size, + self.spatial_merge_size, + ) + wpos_ids = wpos_ids.permute(0, 2, 1, 3) + wpos_ids = wpos_ids.flatten() + 
pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1)) + + pos_ids = torch.cat(pos_ids, dim=0) + + max_grid_size = grid_thw[:, 1:].max() + + # apply the positional embeddings to the position ids + seq = torch.arange( + max_grid_size, device=self.inv_freq.device, dtype=self.inv_freq.dtype + ) + rotary_pos_emb_full = torch.outer(seq, self.inv_freq) + rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1) + window_index, cu_window_seqlens = self.get_window_index(grid_thw) + seq_len = hidden_states.shape[0] + patch_shape = (seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1) + og_shape = (seq_len, -1) + + hidden_states = hidden_states.view(patch_shape)[window_index, :, :].view( + og_shape + ) + rotary_pos_emb = rotary_pos_emb.view(patch_shape)[window_index, :, :].view( + og_shape + ) + + rotary_pos_emb = rotary_pos_emb.to(device=hidden_states.device) + + cu_window_seqlens = torch.tensor( + cu_window_seqlens, + device="cpu", + dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32, + ) + cu_window_seqlens = torch.unique_consecutive(cu_window_seqlens).to( + hidden_states.device + ) + + # create a cu_seqlens tensor to be used in the attention mask + cu_seqlens = torch.repeat_interleave( + grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0] + ).cumsum(dim=0, dtype=torch.int32) + cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0) + max_seqlen = torch.max(cu_seqlens[1:] - cu_seqlens[:-1]) + + # iterately apply the blocks to the hidden states + for layer_num, block in enumerate(self.blocks): + # NOTE: qwen2_5_vl.py has a concept of full attention blocks + # that are applied at specific layers. + if layer_num in self.fullatt_block_indexes: + cu_seqlens_now = cu_seqlens + else: + cu_seqlens_now = cu_window_seqlens + + hidden_states = block( + hidden_states, cu_seqlens_now, rotary_pos_emb, max_seqlen + ) + + # apply the final patch merger to the hidden states + hidden_states = self.merger(hidden_states) + reverse_indices = torch.argsort(window_index) + hidden_states = hidden_states[reverse_indices, :] + return hidden_states + + +class Qwen2_5VLForConditionalGeneration(nn.Module): + def __init__(self, prefix, config, weights): + super().__init__() + self.config = config + config.vision_config.quantize = None + config.vision_config.speculator = config.speculator + # set rope_scaling.type == "mrope" since AutoConfig.from_pretrained incorrectly + # returns rope_scaling.type == "default" for Qwen2_5-VL model at the moment + if ( + hasattr(config, "rope_scaling") + and config.rope_scaling is not None + and config.rope_scaling.get("type", None) == "default" + ): + config.rope_scaling.update({"rope_type": "mrope"}) + self.hidden_size = config.hidden_size + self.vision_start_token_id = config.vision_start_token_id + self.vision_end_token_id = config.vision_end_token_id + self.image_token_id = config.image_token_id + self.video_token_id = config.video_token_id + self.spatial_merge_size = config.vision_config.spatial_merge_size + self.embed_tokens = TensorParallelEmbedding( + prefix="model.embed_tokens", weights=weights + ) + self.visual = Qwen2_5VisionModel( + prefix="visual", config=config.vision_config, weights=weights + ) + self.text_model = Qwen2Model(prefix=None, config=config, weights=weights) + if config.tie_word_embeddings: + suffix = "model.embed_tokens" + else: + suffix = "lm_head" + + self.lm_head = SpeculativeHead.load( + config, + prefix=suffix if not prefix else f"{prefix}.{suffix}", + weights=weights, + ) + self.device = weights.device + + # based on 
https://github.com/huggingface/transformers/blob/e284c7e954abe12c34b50461c17f8115a0afe115/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py#L1391
+    # modified to first find segments then initialize position ids for each segment
+    # Steps:
+    #  locate all vision and text segments
+    #  calculate `vision_segment_lengths` for each vision segment to be used as offset
+    #  calculate `text_segment_lengths` for each text segment to be used as offset
+    #  create position ids for each vision segment based on the image grid
+    #  create position ids for each text segment
+    #  combine all the position ids
+    #  the final segment is the difference between the last vision segment and the end of the input
+    #  combine all the position ids and reshape to (3, input_ids_len) then swap dimensions to (input_ids_len, 3)
+    def get_position_ids(
+        self,
+        input_ids: torch.Tensor,
+        image_grid_thw: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        if image_grid_thw is None:
+            return (
+                torch.arange(input_ids.shape[0], device=input_ids.device)
+                .unsqueeze(1)
+                .repeat(1, 3)
+            )
+
+        spatial_merge_size = self.spatial_merge_size
+        vision_start_token_id = self.vision_start_token_id
+        vision_end_token_id = self.vision_end_token_id
+        device = input_ids.device
+        dtype = input_ids.dtype
+        input_ids_len = input_ids.shape[0]
+
+        vision_starts = torch.where(input_ids == vision_start_token_id)[0]
+        vision_ends = torch.where(input_ids == vision_end_token_id)[0]
+        vision_segments = torch.stack((vision_starts, vision_ends), dim=1)
+        prev_vision_end = torch.cat(
+            [torch.zeros(1, device=vision_ends.device, dtype=dtype), vision_ends[:-1]]
+        )
+        text_lengths_between_vision = vision_segments[:, 0] - prev_vision_end + 1
+        vision_widths_max = torch.cat(
+            [
+                torch.zeros(1, device=image_grid_thw.device, dtype=dtype),
+                image_grid_thw[:-1, 2] // spatial_merge_size,
+            ]
+        )
+        vision_segment_lengths = vision_widths_max + text_lengths_between_vision
+        vision_segment_lengths = vision_segment_lengths.cumsum(dim=0)
+        text_segment_lengths = vision_segment_lengths - text_lengths_between_vision
+
+        # create position ids for each vision segment based on the image grid
+        llm_pos_ids_list = []
+        for i, _ in enumerate(vision_segments):
+            t, h, w = (
+                image_grid_thw[i][0],
+                image_grid_thw[i][1] // spatial_merge_size,
+                image_grid_thw[i][2] // spatial_merge_size,
+            )
+            t_indices = torch.arange(t, device=device).repeat_interleave(h * w)
+            h_indices = torch.arange(h, device=device).repeat_interleave(w).repeat(t)
+            w_indices = torch.arange(w, device=device).repeat(t * h)
+            image_position_ids = torch.stack([t_indices, h_indices, w_indices], dim=0)
+
+            # offset by the position of the last vision segment
+            im = image_position_ids + vision_segment_lengths[i]
+            llm_pos_ids_list.append(im)
+
+        # create position ids for each text segment
+        text_ranges = [
+            torch.arange(seq_len, device=device).view(1, -1).expand(3, -1)
+            + text_segment_lengths[i]
+            for i, seq_len in enumerate(text_lengths_between_vision)
+        ]
+
+        full_llm_pos_ids_list = [
+            item for sublist in zip(text_ranges, llm_pos_ids_list) for item in sublist
+        ]
+        max_s = full_llm_pos_ids_list[-1].max() + 1
+        final_text_len = input_ids_len - vision_ends[-1]
+        if final_text_len > 0:
+            m = torch.arange(final_text_len, device=device).view(1, -1).expand(3, -1)
+            full_llm_pos_ids_list.append(m + max_s)
+
+        position_ids = (
+            torch.cat(full_llm_pos_ids_list, dim=1).reshape(3, -1).transpose(0, 1)
+        )
+        return position_ids
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+ position_ids: torch.Tensor, + cu_seqlen_prefill: Optional[torch.Tensor], + kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], + slots: torch.Tensor, + seqlen: Seqlen, + hpu_attention_meta: Optional[HPUPagedAttentionMetadata], + lm_head_indices: Optional[torch.Tensor], + pixel_values: torch.FloatTensor = None, + image_grid_thw: Optional[torch.LongTensor] = None, + # Unused in this model + video_grid_thw: Optional[torch.LongTensor] = None, + pixel_attention_mask=None, + image_sizes: Optional[torch.LongTensor] = None, + adapter_data: Optional[torch.Tensor] = None, + cross_attention_states: Optional[torch.Tensor] = None, + image_indices=None, + ): + inputs_embeds = self.embed_tokens(input_ids) + + # apply the visual model to the pixel values if they are provided + if pixel_values is not None and len(pixel_values) > 0: + if pixel_values is not None: + image_embeds = self.visual( + pixel_values, grid_thw=image_grid_thw + ).squeeze(0) + mask = torch.where(input_ids == self.image_token_id) + inputs_embeds[mask] = image_embeds + + hidden_states = self.text_model( + inputs_embeds=inputs_embeds, + position_ids=position_ids, + cu_seqlen_prefill=cu_seqlen_prefill, + kv_cache=kv_cache, + slots=slots, + seqlen=seqlen, + hpu_attention_meta=hpu_attention_meta, + ) + if lm_head_indices is not None: + hidden_states = hidden_states[lm_head_indices] + logits, speculative_logits = self.lm_head(hidden_states) + return logits, speculative_logits diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/qwen2_vl.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/qwen2_vl.py new file mode 100644 index 000000000..47ae2ac94 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/qwen2_vl.py @@ -0,0 +1,519 @@ +# coding=utf-8 +# Copyright 2024 the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
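+# Overview: this module provides the Qwen2-VL vision tower (Qwen2VisionModel) and the
+# Qwen2VLForConditionalGeneration wrapper, which reuses Qwen2Model as the text backbone.
+# Illustrative note (example numbers only, not taken from a real checkpoint): per-image
+# patch boundaries are tracked as
+#     cu_seqlens = pad(repeat_interleave(grid_h * grid_w, grid_t).cumsum(0), (1, 0))
+# so two images with grid_thw = (1, 16, 16) and (1, 8, 8) yield cu_seqlens = [0, 256, 320]
+# (256 patches for the first image, 64 for the second).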
+"""PyTorch Qwen2 VL model.""" + +from typing import Optional, Tuple, List + +import torch +import torch.utils.checkpoint +from torch import nn + + +from habana_frameworks.torch.hpex.kernels import FusedSDPA +from vllm_hpu_extension.utils import ModuleFusedSDPA + + +import numpy as np + +from transformers.activations import ACT2FN +import torch.nn.functional as F + +from text_generation_server.layers.layernorm import FastLayerNorm, FastRMSNorm +from text_generation_server.layers import ( + TensorParallelColumnLinear, + TensorParallelRowLinear, + TensorParallelEmbedding, + SpeculativeHead, +) +from text_generation_server.layers.attention import ( + Seqlen, + HPUPagedAttentionMetadata, +) +from text_generation_server.models.custom_modeling.flash_qwen2_modeling import ( + Qwen2Model, +) + + +# Copied from transformers.models.llama.modeling_llama.rotate_half +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +def apply_rotary_pos_emb_vision( + tensor: torch.Tensor, freqs: torch.Tensor +) -> torch.Tensor: + orig_dtype = tensor.dtype + tensor = tensor.float() + cos = freqs.cos() + sin = freqs.sin() + cos = cos.unsqueeze(1).repeat(1, 1, 2).unsqueeze(0).float() + sin = sin.unsqueeze(1).repeat(1, 1, 2).unsqueeze(0).float() + output = (tensor * cos) + (rotate_half(tensor) * sin) + output = output.to(orig_dtype) + return output + + +class Qwen2VLAttention(nn.Module): + def __init__(self, *, prefix, config, weights): + super().__init__() + self.embed_dim = config.embed_dim // weights.process_group.size() + self.head_dim = config.hidden_size // config.num_heads + self.num_heads = config.num_heads // weights.process_group.size() + + self.qkv = TensorParallelColumnLinear.load_qkv( + config, + prefix=f"{prefix}.qkv", + weights=weights, + bias=False, + num_heads=self.num_heads, + num_key_value_heads=self.num_heads, + ) + self.qkv.linear.bias = weights.get_sharded(f"{prefix}.qkv.bias", dim=0) + self.proj = TensorParallelRowLinear.load( + config, + prefix=f"{prefix}.proj", + weights=weights, + bias=True, + ) + self.softmax_scale = 1.0 / np.sqrt(self.embed_dim // self.num_heads) + + def forward( + self, + hidden_state: torch.Tensor, + cu_seqlens: torch.Tensor, + rotary_pos_emb: torch.Tensor, + max_seqlen: int, + ) -> torch.Tensor: + # apply the qkv linear layer to the hidden state + qkv = self.qkv(hidden_state) + query, key, value = qkv.split( + [self.embed_dim, self.embed_dim, self.embed_dim], dim=1 + ) + + # reshape the query, key, and value tensors + _shape = ( + hidden_state.shape[0], + self.num_heads, + self.embed_dim // self.num_heads, + ) + query = query.view(*_shape) + key = key.view(*_shape) + value = value.view(*_shape) + + # apply rotary positional embeddings + query = apply_rotary_pos_emb_vision(query.unsqueeze(0), rotary_pos_emb).squeeze( + 0 + ) + key = apply_rotary_pos_emb_vision(key.unsqueeze(0), rotary_pos_emb).squeeze(0) + + # calc maximum sequence length for any batch + query = query.contiguous() + key = key.contiguous() + value = value.contiguous() + causal = False + + # execute sdpa + query = query.unsqueeze(0).transpose(1, 2) + key = key.unsqueeze(0).transpose(1, 2) + value = value.unsqueeze(0).transpose(1, 2) + fsdpa_op = ModuleFusedSDPA(FusedSDPA) + attn_output = fsdpa_op( + query, + key, + value, + attn_mask=None, + dropout_p=0.0, + is_causal=causal, + scale=None, + softmax_mode="None", + recompute_mode=None, + valid_sequence_lengths=None, + ) + 
attn_output = attn_output.transpose(1, 2).squeeze(0).contiguous() + # reshape output to original dimensions + attn_output = attn_output.reshape(hidden_state.shape[0], -1) + attn_output = self.proj(attn_output) + return attn_output + + +class Qwen2VLVisionMLP(nn.Module): + def __init__(self, *, prefix, config, weights): + super().__init__() + self.activation_fn = ACT2FN[config.hidden_act] + self.fc1 = TensorParallelColumnLinear.load( + prefix=f"{prefix}.fc1", weights=weights, config=config, bias=True + ) + self.fc2 = TensorParallelRowLinear.load( + prefix=f"{prefix}.fc2", weights=weights, config=config, bias=True + ) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states = self.fc2(hidden_states) + return hidden_states + + +class Qwen2VLVisionBlock(nn.Module): + def __init__(self, prefix, config, weights): + super().__init__() + self.attn = Qwen2VLAttention( + prefix=f"{prefix}.attn", + config=config, + weights=weights, + ) + self.norm1 = FastLayerNorm.load( + prefix=f"{prefix}.norm1", + weights=weights, + eps=1e-6, + ) + self.norm2 = FastLayerNorm.load( + prefix=f"{prefix}.norm2", + weights=weights, + eps=1e-6, + ) + self.mlp = Qwen2VLVisionMLP( + prefix=f"{prefix}.mlp", + config=config, + weights=weights, + ) + + def forward( + self, hidden_states, cu_seqlens, rotary_pos_emb, max_seqlen + ) -> torch.Tensor: + norm1_out, residual = self.norm1(hidden_states) + attn_out = self.attn(norm1_out, cu_seqlens, rotary_pos_emb, max_seqlen) + hidden_states = attn_out + residual + norm2_out, residual = self.norm2(hidden_states) + hidden_states = hidden_states + self.mlp(norm2_out) + return hidden_states + + +class Qwen2VLPatchMerger(nn.Module): + def __init__(self, *, prefix, config, weights): + super().__init__() + self.hidden_size = config.embed_dim * (config.spatial_merge_size**2) + self.patch_merger_ln_q = FastLayerNorm.load( + prefix=f"{prefix}.ln_q", + weights=weights, + eps=1e-6, + ) + self.fc1 = TensorParallelColumnLinear.load( + prefix=f"{prefix}.mlp.0", weights=weights, config=config, bias=True + ) + self.fc2 = TensorParallelRowLinear.load( + prefix=f"{prefix}.mlp.2", weights=weights, config=config, bias=True + ) + + def forward(self, hidden_states) -> torch.Tensor: + hidden_states, _ = self.patch_merger_ln_q(hidden_states) + hidden_states = hidden_states.view(-1, self.hidden_size) + hidden_states = self.fc1(hidden_states) + hidden_states = F.gelu(hidden_states) + hidden_states = self.fc2(hidden_states) + return hidden_states + + +class Qwen2VisionModel(nn.Module): + def __init__(self, *, prefix, config, weights): + super().__init__() + self.spatial_merge_size = config.spatial_merge_size + kernel_size = [config.temporal_patch_size, config.patch_size, config.patch_size] + self.patch_embedding = nn.Conv3d( + in_channels=config.in_chans, + out_channels=config.embed_dim, + kernel_size=kernel_size, + stride=kernel_size, + bias=False, + ) + self.patch_embedding.weight = nn.Parameter( + weights.get_tensor(f"{prefix}.patch_embed.proj.weight"), requires_grad=False + ) + head_dim = config.embed_dim // config.num_heads + # TODO: replace with static positional embeddings once implemented + theta = 10000.0 + dim = head_dim // 2 + inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + self.blocks = nn.ModuleList( + [ + Qwen2VLVisionBlock( + prefix=f"{prefix}.blocks.{i}", + config=config, + 
weights=weights, + ) + for i in range(config.depth) + ] + ) + self.merger = Qwen2VLPatchMerger( + prefix=f"{prefix}.merger", + config=config, + weights=weights, + ) + + self.temporal_patch_size = config.temporal_patch_size + self.spatial_patch_size = config.spatial_patch_size + self.in_channels = config.in_channels + self.embed_dim = config.embed_dim + + def apply_class_embedding(self, hidden_state: torch.Tensor) -> torch.Tensor: + batch_size, _, hidden_size = hidden_state.shape + class_embedding = self.class_embedding.expand(batch_size, 1, hidden_size) + hidden_state = torch.cat([class_embedding, hidden_state], dim=1) + return hidden_state + + def forward( + self, + pixel_values: torch.Tensor, + grid_thw: Optional[torch.LongTensor] = None, + ) -> torch.Tensor: + # reshape the input tensor for processing + shape = ( + -1, + self.in_channels, + self.temporal_patch_size, + self.spatial_patch_size, + self.spatial_patch_size, + ) + pixel_values = pixel_values.view(shape).to(self.patch_embedding.weight.dtype) + hidden_states = self.patch_embedding(pixel_values).view(-1, self.embed_dim) + # TODO: revisit to see if we can avoid some of these reshapes + + # find the position ids for the input tensor based on the grid_thw + pos_ids = [] + for t, h, w in grid_thw: + hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w) + hpos_ids = hpos_ids.reshape( + h // self.spatial_merge_size, + self.spatial_merge_size, + w // self.spatial_merge_size, + self.spatial_merge_size, + ) + hpos_ids = hpos_ids.permute(0, 2, 1, 3) + hpos_ids = hpos_ids.flatten() + + wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1) + wpos_ids = wpos_ids.reshape( + h // self.spatial_merge_size, + self.spatial_merge_size, + w // self.spatial_merge_size, + self.spatial_merge_size, + ) + wpos_ids = wpos_ids.permute(0, 2, 1, 3) + wpos_ids = wpos_ids.flatten() + pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1)) + + pos_ids = torch.cat(pos_ids, dim=0) + max_grid_size = grid_thw[:, 1:].max() + + # apply the positional embeddings to the position ids + seq = torch.arange( + max_grid_size, device=self.inv_freq.device, dtype=self.inv_freq.dtype + ) + rotary_pos_emb_full = torch.outer(seq, self.inv_freq) + rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1) + rotary_pos_emb = rotary_pos_emb.to(hidden_states.device, hidden_states.dtype) + + # create a cu_seqlens tensor to be used in the attention mask + cu_seqlens = torch.repeat_interleave( + grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0] + ).cumsum(dim=0, dtype=torch.int32) + cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0) + max_seqlen = torch.max(cu_seqlens[1:] - cu_seqlens[:-1]) + # iterately apply the blocks to the hidden states + for block in self.blocks: + hidden_states = block(hidden_states, cu_seqlens, rotary_pos_emb, max_seqlen) + + # apply the final patch merger to the hidden states + hidden_states = self.merger(hidden_states) + return hidden_states + + +class Qwen2VLForConditionalGeneration(nn.Module): + def __init__(self, prefix, config, weights): + super().__init__() + self.config = config + config.vision_config.quantize = None + config.vision_config.speculator = config.speculator + # set rope_scaling.type == "mrope" since AutoConfig.from_pretrained incorrectly + # returns rope_scaling.type == "default" for Qwen2-VL model at the moment + if ( + hasattr(config, "rope_scaling") + and config.rope_scaling is not None + and config.rope_scaling.get("type", None) == "default" + ): + config.rope_scaling.update({"rope_type": "mrope"}) + self.hidden_size = 
config.hidden_size + self.vision_start_token_id = config.vision_start_token_id + self.vision_end_token_id = config.vision_end_token_id + self.image_token_id = config.image_token_id + self.video_token_id = config.video_token_id + self.spatial_merge_size = config.vision_config.spatial_merge_size + self.embed_tokens = TensorParallelEmbedding( + prefix="model.embed_tokens", weights=weights + ) + self.visual = Qwen2VisionModel( + prefix="visual", config=config.vision_config, weights=weights + ) + self.text_model = Qwen2Model(prefix=None, config=config, weights=weights) + if config.tie_word_embeddings: + suffix = "model.embed_tokens" + else: + suffix = "lm_head" + + self.lm_head = SpeculativeHead.load( + config, + prefix=suffix if not prefix else f"{prefix}.{suffix}", + weights=weights, + ) + self.norm = FastRMSNorm.load( + prefix="model.norm", + weights=weights, + eps=config.rms_norm_eps, + ) + self.device = weights.device + + # based on https://github.com/huggingface/transformers/blob/e284c7e954abe12c34b50461c17f8115a0afe115/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py#L1391 + # modified to first find segments then initialize position ids for each segment + # Steps: + # locate all vision and text segments + # calculate `vision_segment_lengths` for each vision segment to be use as offset + # calculate `text_segment_lengths` for each text segment to be used as offset + # create position ids for each vision segment based on the image grid + # create position ids for each text segment + # combine all the position ids + # the final segment is the difference between the last vision segment and the end of the input + # combine all the position ids and reshape to (3, input_ids_len) then swap dimensions to (input_ids_len, 3) + def get_position_ids( + self, + input_ids: torch.Tensor, + image_grid_thw: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + if image_grid_thw is None: + return ( + torch.arange(input_ids.shape[0], device=input_ids.device) + .unsqueeze(1) + .repeat(1, 3) + ) + + spatial_merge_size = self.spatial_merge_size + vision_start_token_id = self.vision_start_token_id + vision_end_token_id = self.vision_end_token_id + device = input_ids.device + dtype = input_ids.dtype + input_ids_len = input_ids.shape[0] + + vision_starts = torch.where(input_ids == vision_start_token_id)[0] + vision_ends = torch.where(input_ids == vision_end_token_id)[0] + vision_segments = torch.stack((vision_starts, vision_ends), dim=1) + prev_vision_end = torch.cat( + [torch.zeros(1, device=vision_ends.device, dtype=dtype), vision_ends[:-1]] + ) + text_lengths_between_vision = vision_segments[:, 0] - prev_vision_end + 1 + vision_widths_max = torch.cat( + [ + torch.zeros(1, device=image_grid_thw.device, dtype=dtype), + image_grid_thw[:-1, 2] // spatial_merge_size, + ] + ) + vision_segment_lengths = vision_widths_max + text_lengths_between_vision + vision_segment_lengths = vision_segment_lengths.cumsum(dim=0) + text_segment_lengths = vision_segment_lengths - text_lengths_between_vision + + # create position ids for each vision segment based on the image grid + llm_pos_ids_list = [] + for i, _ in enumerate(vision_segments): + t, h, w = ( + image_grid_thw[i][0], + image_grid_thw[i][1] // spatial_merge_size, + image_grid_thw[i][2] // spatial_merge_size, + ) + t_indices = torch.arange(t, device=device).repeat_interleave(h * w) + h_indices = torch.arange(h, device=device).repeat_interleave(w).repeat(t) + w_indices = torch.arange(w, device=device).repeat(t * h) + image_position_ids = torch.stack([t_indices, 
h_indices, w_indices], dim=0) + + # offset by the position of the last vision segment + im = image_position_ids + vision_segment_lengths[i] + llm_pos_ids_list.append(im) + + # create position ids for each text segment + text_ranges = [ + torch.arange(seq_len, device=device).view(1, -1).expand(3, -1) + + text_segment_lengths[i] + for i, seq_len in enumerate(text_lengths_between_vision) + ] + + full_llm_pos_ids_list = [ + item for sublist in zip(text_ranges, llm_pos_ids_list) for item in sublist + ] + max_s = full_llm_pos_ids_list[-1].max() + 1 + final_text_len = input_ids_len - vision_ends[-1] + if final_text_len > 0: + m = torch.arange(final_text_len, device=device).view(1, -1).expand(3, -1) + full_llm_pos_ids_list.append(m + max_s) + + position_ids = ( + torch.cat(full_llm_pos_ids_list, dim=1).reshape(3, -1).transpose(0, 1) + ) + return position_ids + + def forward( + self, + input_ids: torch.Tensor, + position_ids: torch.Tensor, + cu_seqlen_prefill: Optional[torch.Tensor], + kv_cache: List[Tuple[torch.Tensor, torch.Tensor]], + slots: torch.Tensor, + seqlen: Seqlen, + hpu_attention_meta: Optional[HPUPagedAttentionMetadata], + lm_head_indices: Optional[torch.Tensor], + pixel_values: torch.FloatTensor = None, + image_grid_thw: Optional[torch.LongTensor] = None, + video_grid_thw: Optional[torch.LongTensor] = None, + pixel_attention_mask=None, + image_sizes: Optional[torch.LongTensor] = None, + adapter_data: Optional[torch.Tensor] = None, + cross_attention_states: Optional[torch.Tensor] = None, + image_indices=None, + ): + inputs_embeds = self.embed_tokens(input_ids) + + # apply the visual model to the pixel values if they are provided + if pixel_values is not None and len(pixel_values) > 0: + if pixel_values is not None: + image_embeds = self.visual( + pixel_values, grid_thw=image_grid_thw + ).squeeze(0) + mask = torch.where(input_ids == self.image_token_id) + inputs_embeds[mask] = image_embeds + + hidden_states = self.text_model( + inputs_embeds=inputs_embeds, + position_ids=position_ids, + cu_seqlen_prefill=cu_seqlen_prefill, + kv_cache=kv_cache, + slots=slots, + seqlen=seqlen, + hpu_attention_meta=hpu_attention_meta, + ) + if lm_head_indices is not None: + hidden_states = hidden_states[lm_head_indices] + logits, speculative_logits = self.lm_head(hidden_states) + return logits, speculative_logits diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/t5_modeling.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/t5_modeling.py deleted file mode 100644 index e6666acd3..000000000 --- a/backends/gaudi/server/text_generation_server/models/custom_modeling/t5_modeling.py +++ /dev/null @@ -1,1227 +0,0 @@ -# coding=utf-8 -# Copyright 2018 Mesh TensorFlow authors, T5 Authors and HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-""" PyTorch T5 model.""" - -import copy -import math -import warnings -from typing import Optional, Tuple, Union - -from loguru import logger - -import torch -import torch.distributed -from torch import nn -from torch.nn import CrossEntropyLoss - -from transformers.activations import ACT2FN -from transformers.modeling_outputs import ( - BaseModelOutput, - BaseModelOutputWithPastAndCrossAttentions, - Seq2SeqLMOutput, -) -from transformers.modeling_utils import PreTrainedModel -from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS -from transformers.utils import ( - is_torch_fx_proxy, -) -from transformers import T5Config -from text_generation_server.layers import ( - TensorParallelColumnLinear, - TensorParallelEmbedding, - TensorParallelRowLinear, - SpeculativeHead, -) - -# copied from https://github.com/huggingface/transformers/blob/cd4584e3c809bb9e1392ccd3fe38b40daba5519a/src/transformers/models/t5/modeling_t5.py#L1316 -# Warning message for FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask -__HEAD_MASK_WARNING_MSG = """ -The input argument `head_mask` was split into two arguments `head_mask` and `decoder_head_mask`. Currently, -`decoder_head_mask` is set to copy `head_mask`, but this feature is deprecated and will be removed in future versions. -If you do not want to use any `decoder_head_mask` now, please set `decoder_head_mask = torch.ones(num_layers, -num_heads)`. -""" - - -class PartialTPEmbedding(nn.Module): - def __init__(self, prefix: str, weights): - super().__init__() - weight = weights.get_sharded(f"{prefix}.weight", dim=1) - self.weight = nn.Parameter(weight) - - def forward(self, input: torch.Tensor) -> torch.Tensor: - return torch.nn.functional.embedding(input, self.weight) - - -@torch.jit.script -def layer_norm(hidden_states, weight, epsilon): - # T5 uses a layer_norm which only scales and doesn't shift, which is also known as Root Mean - # Square Layer Normalization https://arxiv.org/abs/1910.07467 thus varience is calculated - # w/o mean and there is no bias. Additionally we want to make sure that the accumulation for - # half-precision inputs is done in fp32 - - variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + epsilon) - - # convert into half-precision if necessary - if weight.dtype in [torch.float16, torch.bfloat16]: - hidden_states = hidden_states.to(weight.dtype) - - return weight * hidden_states - - -class T5LayerNorm(nn.Module): - def __init__(self, prefix, weights, eps=1e-6): - """ - Construct a layernorm module in the T5 style. No bias and no subtraction of mean. 
- """ - super().__init__() - weight = weights.get_tensor(f"{prefix}.weight") - self.weight = nn.Parameter(weight) - self.variance_epsilon = torch.tensor(eps) - - def forward(self, hidden_states): - return layer_norm(hidden_states, self.weight, self.variance_epsilon) - - -try: - from apex.normalization import FusedRMSNorm - - T5LayerNorm = FusedRMSNorm # noqa - - logger.info( - "Discovered apex.normalization.FusedRMSNorm - will use it instead of T5LayerNorm" - ) -except ImportError: - # using the normal T5LayerNorm - pass -except Exception: - logger.warning("discovered apex but it failed to load, falling back to T5LayerNorm") - pass - -ALL_LAYERNORM_LAYERS.append(T5LayerNorm) - - -class T5DenseActDense(nn.Module): - def __init__(self, config: T5Config, prefix, weights): - super().__init__() - self.wi = TensorParallelColumnLinear.load( - config, prefix=f"{prefix}.wi", weights=weights, bias=False - ) - - ### XXX: T5 models do not handle well both f16 and quantization. - ### Overidding specifically this layer for that reason. - ### https://github.com/huggingface/transformers/blob/main/src/transformers/models/t5/modeling_t5.py#L316 - ### https://github.com/huggingface/transformers/issues/20287 - _q = config.quantize - _dtype = weights.dtype - weights.dtype = torch.float32 - config.quantize = None - self.wo_cast = (torch.float32, _dtype) - self.wo = TensorParallelRowLinear.load( - config, prefix=f"{prefix}.wo", weights=weights, bias=False - ) - weights.dtype = _dtype - config.quantize = _q - - self.dropout = nn.Dropout(config.dropout_rate) - self.act = ( - ACT2FN[config.dense_act_fn] - if "gelu" not in config.dense_act_fn - else lambda x: torch.nn.functional.gelu(x, approximate="tanh") - ) - - def forward(self, hidden_states): - hidden_states = self.wi(hidden_states) - hidden_states = self.act(hidden_states) - hidden_states = self.dropout(hidden_states) - - hidden_states = hidden_states.to(dtype=self.wo_cast[0]) - hidden_states = self.wo(hidden_states) - # XXX: Recasting is already done within the layer norm. - # Casting back to float16 here modifies results - # hidden_states = hidden_states.to(dtype=self.wo_cast[1]) - return hidden_states - - -class T5DenseGatedActDense(nn.Module): - def __init__(self, config: T5Config, prefix, weights): - super().__init__() - self.wi_0 = TensorParallelColumnLinear.load( - config, prefix=f"{prefix}.wi_0", weights=weights, bias=False - ) - self.wi_1 = TensorParallelColumnLinear.load( - config, prefix=f"{prefix}.wi_1", weights=weights, bias=False - ) - ### XXX: T5 models do not handle well both f16 and quantization. - ### Overidding specifically this layer for that reason. 
- ### https://github.com/huggingface/transformers/blob/main/src/transformers/models/t5/modeling_t5.py#L316 - ### https://github.com/huggingface/transformers/issues/20287 - _q = config.quantize - _dtype = weights.dtype - weights.dtype = torch.float32 - config.quantize = None - self.wo_cast = (torch.float32, _dtype) - self.wo = TensorParallelRowLinear.load( - config, prefix=f"{prefix}.wo", weights=weights, bias=False - ) - weights.dtype = _dtype - config.quantize = _q - - self.dropout = nn.Dropout(config.dropout_rate) - self.act = ( - ACT2FN[config.dense_act_fn] - if "gelu" not in config.dense_act_fn - else lambda x: torch.nn.functional.gelu(x, approximate="tanh") - ) - - def forward(self, hidden_states): - hidden_gelu = self.act(self.wi_0(hidden_states)) - hidden_linear = self.wi_1(hidden_states) - hidden_states = hidden_gelu * hidden_linear - hidden_states = self.dropout(hidden_states) - - hidden_states = hidden_states.to(dtype=self.wo_cast[0]) - hidden_states = self.wo(hidden_states) - # XXX: Recasting is already done within the layer norm. - # Casting back to float16 here modifies results - # hidden_states = hidden_states.to(dtype=self.wo_cast[1]) - return hidden_states - - -class T5LayerFF(nn.Module): - def __init__(self, config: T5Config, prefix, weights): - super().__init__() - if config.is_gated_act: - self.DenseReluDense = T5DenseGatedActDense( - config, prefix=f"{prefix}.DenseReluDense", weights=weights - ) - else: - self.DenseReluDense = T5DenseActDense( - config, prefix=f"{prefix}.DenseReluDense", weights=weights - ) - - self.layer_norm = T5LayerNorm( - prefix=f"{prefix}.layer_norm", - weights=weights, - eps=config.layer_norm_epsilon, - ) - self.dropout = nn.Dropout(config.dropout_rate) - - def forward(self, hidden_states): - forwarded_states = self.layer_norm(hidden_states) - forwarded_states = self.DenseReluDense(forwarded_states) - hidden_states = hidden_states + self.dropout(forwarded_states) - return hidden_states - - -class T5Attention(nn.Module): - def __init__( - self, config: T5Config, prefix, weights, has_relative_attention_bias=False - ): - super().__init__() - self.is_decoder = config.is_decoder - self.has_relative_attention_bias = has_relative_attention_bias - self.relative_attention_num_buckets = config.relative_attention_num_buckets - self.relative_attention_max_distance = config.relative_attention_max_distance - self.d_model = config.d_model - self.key_value_proj_dim = config.d_kv - self.n_heads = config.num_heads - self.dropout = config.dropout_rate - self.inner_dim = self.n_heads * self.key_value_proj_dim - - process_group = weights.process_group - # Mesh TensorFlow initialization to avoid scaling before softmax - assert self.n_heads % process_group.size() == 0 - self.q = TensorParallelColumnLinear.load( - config, prefix=f"{prefix}.q", weights=weights, bias=False - ) - self.k = TensorParallelColumnLinear.load( - config, prefix=f"{prefix}.k", weights=weights, bias=False - ) - self.v = TensorParallelColumnLinear.load( - config, prefix=f"{prefix}.v", weights=weights, bias=False - ) - self.o = TensorParallelRowLinear.load( - config, prefix=f"{prefix}.o", weights=weights, bias=False - ) - if self.n_heads % weights.process_group.size() != 0: - raise ValueError( - f"`n_heads` must be divisible by `num_shards` (got `n_heads`: {self.n_heads} " - f"and `num_shards`: {weights.process_group.size()}" - ) - self.n_heads = self.n_heads // process_group.size() - self.inner_dim = self.inner_dim // process_group.size() - - if self.has_relative_attention_bias: - 
self.relative_attention_bias = PartialTPEmbedding( - prefix=f"{prefix}.relative_attention_bias", weights=weights - ) - - @staticmethod - def _relative_position_bucket( - relative_position, bidirectional=True, num_buckets=32, max_distance=128 - ): - """ - Adapted from Mesh Tensorflow: - https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593 - - Translate relative position to a bucket number for relative attention. The relative position is defined as - memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to - position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for - small absolute relative_position and larger buckets for larger absolute relative_positions. All relative - positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket. - This should allow for more graceful generalization to longer sequences than the model has been trained on - - Args: - relative_position: an int32 Tensor - bidirectional: a boolean - whether the attention is bidirectional - num_buckets: an integer - max_distance: an integer - - Returns: - a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets) - """ - relative_buckets = 0 - if bidirectional: - num_buckets //= 2 - relative_buckets += (relative_position > 0).to(torch.long) * num_buckets - relative_position = torch.abs(relative_position) - else: - relative_position = -torch.min( - relative_position, torch.zeros_like(relative_position) - ) - # now relative_position is in the range [0, inf) - - # half of the buckets are for exact increments in positions - max_exact = num_buckets // 2 - is_small = relative_position < max_exact - - # The other half of the buckets are for logarithmically bigger bins in positions up to max_distance - relative_position_if_large = max_exact + ( - torch.log(relative_position.float() / max_exact) - / math.log(max_distance / max_exact) - * (num_buckets - max_exact) - ).to(torch.long) - relative_position_if_large = torch.min( - relative_position_if_large, - torch.full_like(relative_position_if_large, num_buckets - 1), - ) - - relative_buckets += torch.where( - is_small, relative_position, relative_position_if_large - ) - return relative_buckets - - def compute_bias(self, query_length, key_length, device=None): - """Compute binned relative position bias""" - if device is None: - device = self.relative_attention_bias.weight.device - context_position = torch.arange(query_length, dtype=torch.long, device=device)[ - :, None - ] - memory_position = torch.arange(key_length, dtype=torch.long, device=device)[ - None, : - ] - relative_position = ( - memory_position - context_position - ) # shape (query_length, key_length) - relative_position_bucket = self._relative_position_bucket( - relative_position, # shape (query_length, key_length) - bidirectional=(not self.is_decoder), - num_buckets=self.relative_attention_num_buckets, - max_distance=self.relative_attention_max_distance, - ) - values = self.relative_attention_bias( - relative_position_bucket - ) # shape (query_length, key_length, num_heads) - values = values.permute([2, 0, 1]).unsqueeze( - 0 - ) # shape (1, num_heads, query_length, key_length) - return values - - def forward( - self, - hidden_states, - mask=None, - key_value_states=None, - position_bias=None, - past_key_value=None, - layer_head_mask=None, - 
query_length=None, - use_cache=False, - output_attentions=False, - ): - """ - Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states). - """ - # Input is (batch_size, seq_length, dim) - # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length) - # past_key_value[0] is (batch_size, n_heads, q_len - 1, dim_per_head) - - batch_size, seq_length = hidden_states.shape[:2] - - real_seq_length = seq_length - - if past_key_value is not None: - assert ( - len(past_key_value) == 2 - ), f"past_key_value should have 2 past states: keys and values. Got {len(past_key_value)} past states" - real_seq_length += ( - past_key_value[0].shape[2] if query_length is None else query_length - ) - - key_length = ( - real_seq_length if key_value_states is None else key_value_states.shape[1] - ) - - def shape(states): - """projection""" - return states.view( - batch_size, -1, self.n_heads, self.key_value_proj_dim - ).transpose(1, 2) - - def unshape(states): - """reshape""" - return ( - states.transpose(1, 2).contiguous().view(batch_size, -1, self.inner_dim) - ) - - def project(hidden_states, proj_layer, key_value_states, past_key_value): - """projects hidden states correctly to key/query states""" - if key_value_states is None: - # self-attn - # (batch_size, n_heads, seq_length, dim_per_head) - hidden_states = shape(proj_layer(hidden_states)) - elif past_key_value is None: - # cross-attn - # (batch_size, n_heads, seq_length, dim_per_head) - hidden_states = shape(proj_layer(key_value_states)) - - if past_key_value is not None: - if key_value_states is None: - # self-attn - # (batch_size, n_heads, key_length, dim_per_head) - hidden_states = torch.cat([past_key_value, hidden_states], dim=2) - elif past_key_value.shape[2] != key_value_states.shape[1]: - # checking that the `sequence_length` of the `past_key_value` is the same as - # the provided `key_value_states` to support prefix tuning - # cross-attn - # (batch_size, n_heads, seq_length, dim_per_head) - hidden_states = shape(proj_layer(key_value_states)) - else: - # cross-attn - hidden_states = past_key_value - return hidden_states - - # get query states - query_states = shape( - self.q(hidden_states) - ) # (batch_size, n_heads, seq_length, dim_per_head) - - # get key/value states - key_states = project( - hidden_states, - self.k, - key_value_states, - past_key_value[0] if past_key_value is not None else None, - ) - value_states = project( - hidden_states, - self.v, - key_value_states, - past_key_value[1] if past_key_value is not None else None, - ) - - # compute scores - scores = torch.matmul( - query_states, key_states.transpose(3, 2) - ) # equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9 - - if position_bias is None: - if not self.has_relative_attention_bias: - position_bias = torch.zeros( - (1, self.n_heads, real_seq_length, key_length), - device=scores.device, - dtype=scores.dtype, - ) - else: - position_bias = self.compute_bias( - real_seq_length, key_length, device=scores.device - ) - - # if key and values are already calculated - # we want only the last query position bias - if past_key_value is not None: - position_bias = position_bias[:, :, -hidden_states.size(1) :, :] - - if mask is not None: - position_bias = ( - position_bias + mask - ) # (batch_size, n_heads, seq_length, key_length) - - position_bias_masked = position_bias - - scores += position_bias_masked - attn_weights = nn.functional.softmax(scores.float(), 
dim=-1).type_as( - scores - ) # (batch_size, n_heads, seq_length, key_length) - attn_weights = nn.functional.dropout( - attn_weights, p=self.dropout, training=self.training - ) # (batch_size, n_heads, seq_length, key_length) - - # Mask heads if we want to - if layer_head_mask is not None: - attn_weights = attn_weights * layer_head_mask - - attn_output = unshape( - torch.matmul(attn_weights, value_states) - ) # (batch_size, seq_length, dim) - attn_output = self.o(attn_output) - - present_key_value_state = ( - (key_states, value_states) if (self.is_decoder and use_cache) else None - ) - outputs = (attn_output,) + (present_key_value_state,) + (position_bias,) - - if output_attentions: - outputs = outputs + (attn_weights,) - return outputs - - -class T5LayerSelfAttention(nn.Module): - def __init__(self, config, prefix, weights, has_relative_attention_bias=False): - super().__init__() - self.SelfAttention = T5Attention( - config, - prefix=f"{prefix}.SelfAttention", - weights=weights, - has_relative_attention_bias=has_relative_attention_bias, - ) - self.layer_norm = T5LayerNorm( - prefix=f"{prefix}.layer_norm", - weights=weights, - eps=config.layer_norm_epsilon, - ) - self.dropout = nn.Dropout(config.dropout_rate) - - def forward( - self, - hidden_states, - attention_mask=None, - position_bias=None, - layer_head_mask=None, - past_key_value=None, - use_cache=False, - output_attentions=False, - ): - normed_hidden_states = self.layer_norm(hidden_states) - attention_output = self.SelfAttention( - normed_hidden_states, - mask=attention_mask, - position_bias=position_bias, - layer_head_mask=layer_head_mask, - past_key_value=past_key_value, - use_cache=use_cache, - output_attentions=output_attentions, - ) - hidden_states = hidden_states + self.dropout(attention_output[0]) - outputs = (hidden_states,) + attention_output[ - 1: - ] # add attentions if we output them - return outputs - - -class T5LayerCrossAttention(nn.Module): - def __init__(self, config, prefix, weights): - super().__init__() - self.EncDecAttention = T5Attention( - config, - prefix=f"{prefix}.EncDecAttention", - weights=weights, - has_relative_attention_bias=False, - ) - self.layer_norm = T5LayerNorm( - prefix=f"{prefix}.layer_norm", - weights=weights, - eps=config.layer_norm_epsilon, - ) - self.dropout = nn.Dropout(config.dropout_rate) - - def forward( - self, - hidden_states, - key_value_states, - attention_mask=None, - position_bias=None, - layer_head_mask=None, - past_key_value=None, - use_cache=False, - query_length=None, - output_attentions=False, - ): - normed_hidden_states = self.layer_norm(hidden_states) - attention_output = self.EncDecAttention( - normed_hidden_states, - mask=attention_mask, - key_value_states=key_value_states, - position_bias=position_bias, - layer_head_mask=layer_head_mask, - past_key_value=past_key_value, - use_cache=use_cache, - query_length=query_length, - output_attentions=output_attentions, - ) - layer_output = hidden_states + self.dropout(attention_output[0]) - outputs = (layer_output,) + attention_output[ - 1: - ] # add attentions if we output them - return outputs - - -class T5Block(nn.Module): - def __init__(self, config, prefix, weights, has_relative_attention_bias: bool): - super().__init__() - self.is_decoder = config.is_decoder - self.layer = nn.ModuleList() - self.layer.append( - T5LayerSelfAttention( - config, - prefix=f"{prefix}.layer.0", - weights=weights, - has_relative_attention_bias=has_relative_attention_bias, - ) - ) - if self.is_decoder: - i = 2 - self.layer.append( - 
T5LayerCrossAttention( - config, prefix=f"{prefix}.layer.1", weights=weights - ) - ) - else: - i = 1 - - self.layer.append( - T5LayerFF(config, prefix=f"{prefix}.layer.{i}", weights=weights) - ) - - def forward( - self, - hidden_states, - attention_mask=None, - position_bias=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - encoder_decoder_position_bias=None, - layer_head_mask=None, - cross_attn_layer_head_mask=None, - past_key_value=None, - use_cache=False, - output_attentions=False, - return_dict=True, - ): - if past_key_value is not None: - if not self.is_decoder: - logger.warning( - "`past_key_values` is passed to the encoder. Please make sure this is intended." - ) - expected_num_past_key_values = 2 if encoder_hidden_states is None else 4 - - if len(past_key_value) != expected_num_past_key_values: - raise ValueError( - f"There should be {expected_num_past_key_values} past states. " - f"{'2 (past / key) for cross attention. ' if expected_num_past_key_values == 4 else ''}" - f"Got {len(past_key_value)} past key / value states" - ) - - self_attn_past_key_value = past_key_value[:2] - cross_attn_past_key_value = past_key_value[2:] - else: - self_attn_past_key_value, cross_attn_past_key_value = None, None - - self_attention_outputs = self.layer[0]( - hidden_states, - attention_mask=attention_mask, - position_bias=position_bias, - layer_head_mask=layer_head_mask, - past_key_value=self_attn_past_key_value, - use_cache=use_cache, - output_attentions=output_attentions, - ) - hidden_states, present_key_value_state = self_attention_outputs[:2] - attention_outputs = self_attention_outputs[ - 2: - ] # Keep self-attention outputs and relative position weights - - # clamp inf values to enable fp16 training - if hidden_states.dtype == torch.float16: - clamp_value = torch.where( - torch.isinf(hidden_states).any(), - torch.finfo(hidden_states.dtype).max - 1000, - torch.finfo(hidden_states.dtype).max, - ) - hidden_states = torch.clamp( - hidden_states, min=-clamp_value, max=clamp_value - ) - - do_cross_attention = self.is_decoder and encoder_hidden_states is not None - if do_cross_attention: - # the actual query length is unknown for cross attention - # if using past key value states. 
Need to inject it here - if present_key_value_state is not None: - query_length = present_key_value_state[0].shape[2] - else: - query_length = None - - cross_attention_outputs = self.layer[1]( - hidden_states, - key_value_states=encoder_hidden_states, - attention_mask=encoder_attention_mask, - position_bias=encoder_decoder_position_bias, - layer_head_mask=cross_attn_layer_head_mask, - past_key_value=cross_attn_past_key_value, - query_length=query_length, - use_cache=use_cache, - output_attentions=output_attentions, - ) - hidden_states = cross_attention_outputs[0] - - # clamp inf values to enable fp16 training - if hidden_states.dtype == torch.float16: - clamp_value = torch.where( - torch.isinf(hidden_states).any(), - torch.finfo(hidden_states.dtype).max - 1000, - torch.finfo(hidden_states.dtype).max, - ) - hidden_states = torch.clamp( - hidden_states, min=-clamp_value, max=clamp_value - ) - - # Combine self attn and cross attn key value states - if present_key_value_state is not None: - present_key_value_state = ( - present_key_value_state + cross_attention_outputs[1] - ) - - # Keep cross-attention outputs and relative position weights - attention_outputs = attention_outputs + cross_attention_outputs[2:] - - # Apply Feed Forward layer - hidden_states = self.layer[-1](hidden_states) - - # clamp inf values to enable fp16 training - if hidden_states.dtype == torch.float16: - clamp_value = torch.where( - torch.isinf(hidden_states).any(), - torch.finfo(hidden_states.dtype).max - 1000, - torch.finfo(hidden_states.dtype).max, - ) - hidden_states = torch.clamp( - hidden_states, min=-clamp_value, max=clamp_value - ) - - outputs = (hidden_states,) - - if use_cache: - outputs = outputs + (present_key_value_state,) + attention_outputs - else: - outputs = outputs + attention_outputs - - return outputs # hidden-states, present_key_value_states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) - - -class T5PreTrainedModel(PreTrainedModel): - """ - An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained - models. - """ - - config_class = T5Config - - def _shift_right(self, input_ids): - decoder_start_token_id = self.config.decoder_start_token_id - pad_token_id = self.config.pad_token_id - - assert decoder_start_token_id is not None, ( - "self.model.config.decoder_start_token_id has to be defined. In T5 it is usually set to the pad_token_id." - " See T5 docs for more information" - ) - - # shift inputs to the right - if is_torch_fx_proxy(input_ids): - # Item assignment is not supported natively for proxies. - shifted_input_ids = torch.full( - input_ids.shape[:-1] + (1,), decoder_start_token_id - ) - shifted_input_ids = torch.cat( - [shifted_input_ids, input_ids[..., :-1]], dim=-1 - ) - else: - shifted_input_ids = input_ids.new_zeros(input_ids.shape) - shifted_input_ids[..., 1:] = input_ids[..., :-1].clone() - shifted_input_ids[..., 0] = decoder_start_token_id - - assert ( - pad_token_id is not None - ), "self.model.config.pad_token_id has to be defined." 
- # replace possible -100 values in labels by `pad_token_id` - shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id) - - return shifted_input_ids - - -class T5Stack(T5PreTrainedModel): - def __init__(self, config, prefix, weights, embed_tokens): - super().__init__(config) - - self.is_decoder = config.is_decoder - - self.embed_tokens = embed_tokens - self.block = nn.ModuleList( - [ - T5Block( - config, - prefix=f"{prefix}.block.{layer_id}", - weights=weights, - has_relative_attention_bias=(layer_id == 0), - ) - for layer_id in range(config.num_layers) - ] - ) - self.final_layer_norm = T5LayerNorm( - prefix=f"{prefix}.final_layer_norm", - weights=weights, - eps=config.layer_norm_epsilon, - ) - self.dropout = nn.Dropout(config.dropout_rate) - - def forward( - self, - input_ids=None, - attention_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - inputs_embeds=None, - head_mask=None, - cross_attn_head_mask=None, - past_key_values=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - # Model parallel - use_cache = use_cache if use_cache is not None else self.config.use_cache - output_attentions = ( - output_attentions - if output_attentions is not None - else self.config.output_attentions - ) - output_hidden_states = ( - output_hidden_states - if output_hidden_states is not None - else self.config.output_hidden_states - ) - return_dict = ( - return_dict if return_dict is not None else self.config.use_return_dict - ) - - if input_ids is not None and inputs_embeds is not None: - err_msg_prefix = "decoder_" if self.is_decoder else "" - raise ValueError( - f"You cannot specify both {err_msg_prefix}input_ids and {err_msg_prefix}inputs_embeds at the same time" - ) - elif input_ids is not None: - input_shape = input_ids.size() - input_ids = input_ids.view(-1, input_shape[-1]) - elif inputs_embeds is not None: - input_shape = inputs_embeds.size()[:-1] - else: - err_msg_prefix = "decoder_" if self.is_decoder else "" - raise ValueError( - f"You have to specify either {err_msg_prefix}input_ids or {err_msg_prefix}inputs_embeds" - ) - - if inputs_embeds is None: - assert ( - self.embed_tokens is not None - ), "You have to initialize the model with valid token embeddings" - inputs_embeds = self.embed_tokens(input_ids) - - batch_size, seq_length = input_shape - - # required mask seq length can be calculated via length of past - mask_seq_length = ( - past_key_values[0][0].shape[2] + seq_length - if past_key_values is not None - else seq_length - ) - - if use_cache is True: - assert ( - self.is_decoder - ), f"`use_cache` can only be set to `True` if {self} is used as a decoder" - - if attention_mask is None: - attention_mask = torch.ones( - batch_size, mask_seq_length, device=inputs_embeds.device - ) - if ( - self.is_decoder - and encoder_attention_mask is None - and encoder_hidden_states is not None - ): - encoder_seq_length = encoder_hidden_states.shape[1] - encoder_attention_mask = torch.ones( - batch_size, - encoder_seq_length, - device=inputs_embeds.device, - dtype=torch.long, - ) - - # initialize past_key_values with `None` if past does not exist - if past_key_values is None: - past_key_values = [None] * len(self.block) - - # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] - # ourselves in which case we just need to make it broadcastable to all heads. 
- extended_attention_mask = self.get_extended_attention_mask( - attention_mask, input_shape - ) - - # If a 2D or 3D attention mask is provided for the cross-attention - # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] - if self.is_decoder and encoder_hidden_states is not None: - ( - encoder_batch_size, - encoder_sequence_length, - _, - ) = encoder_hidden_states.size() - encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) - if encoder_attention_mask is None: - encoder_attention_mask = torch.ones( - encoder_hidden_shape, device=inputs_embeds.device - ) - encoder_extended_attention_mask = self.invert_attention_mask( - encoder_attention_mask - ) - else: - encoder_extended_attention_mask = None - - # Prepare head mask if needed - head_mask = self.get_head_mask(head_mask, self.config.num_layers) - cross_attn_head_mask = self.get_head_mask( - cross_attn_head_mask, self.config.num_layers - ) - present_key_value_states = () if use_cache else None - all_hidden_states = () if output_hidden_states else None - all_attentions = () if output_attentions else None - all_cross_attentions = () if (output_attentions and self.is_decoder) else None - position_bias = None - encoder_decoder_position_bias = None - - hidden_states = self.dropout(inputs_embeds) - - for i, (layer_module, past_key_value) in enumerate( - zip(self.block, past_key_values) - ): - layer_head_mask = head_mask[i] - cross_attn_layer_head_mask = cross_attn_head_mask[i] - # Model parallel - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - layer_outputs = layer_module( - hidden_states, - attention_mask=extended_attention_mask, - position_bias=position_bias, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_extended_attention_mask, - encoder_decoder_position_bias=encoder_decoder_position_bias, - layer_head_mask=layer_head_mask, - cross_attn_layer_head_mask=cross_attn_layer_head_mask, - past_key_value=past_key_value, - use_cache=use_cache, - output_attentions=output_attentions, - ) - - # layer_outputs is a tuple with: - # hidden-states, key-value-states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) - if use_cache is False: - layer_outputs = layer_outputs[:1] + (None,) + layer_outputs[1:] - - hidden_states, present_key_value_state = layer_outputs[:2] - - # We share the position biases between the layers - the first layer store them - # layer_outputs = hidden-states, key-value-states (self-attention position bias), (self-attention weights), - # (cross-attention position bias), (cross-attention weights) - position_bias = layer_outputs[2] - if self.is_decoder and encoder_hidden_states is not None: - encoder_decoder_position_bias = layer_outputs[ - 4 if output_attentions else 3 - ] - # append next layer key value states - if use_cache: - present_key_value_states = present_key_value_states + ( - present_key_value_state, - ) - - if output_attentions: - all_attentions = all_attentions + (layer_outputs[3],) - if self.is_decoder: - all_cross_attentions = all_cross_attentions + (layer_outputs[5],) - - hidden_states = self.final_layer_norm(hidden_states) - hidden_states = self.dropout(hidden_states) - - # Add last layer - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - if not return_dict: - return tuple( - v - for v in [ - hidden_states, - present_key_value_states, - all_hidden_states, - all_attentions, - all_cross_attentions, - ] 
- if v is not None - ) - return BaseModelOutputWithPastAndCrossAttentions( - last_hidden_state=hidden_states, - past_key_values=present_key_value_states, - hidden_states=all_hidden_states, - attentions=all_attentions, - cross_attentions=all_cross_attentions, - ) - - -class T5ForConditionalGeneration(T5PreTrainedModel): - def __init__(self, config: T5Config, weights): - super().__init__(config) - self.model_dim = config.d_model - - self.shared = TensorParallelEmbedding(prefix="shared", weights=weights) - - encoder_config = copy.deepcopy(config) - encoder_config.is_decoder = False - encoder_config.use_cache = False - encoder_config.is_encoder_decoder = False - self.encoder = T5Stack( - config=encoder_config, - prefix="encoder", - weights=weights, - embed_tokens=self.shared, - ) - - decoder_config = copy.deepcopy(config) - decoder_config.is_decoder = True - decoder_config.is_encoder_decoder = False - decoder_config.num_layers = config.num_decoder_layers - self.decoder = T5Stack( - config=decoder_config, - prefix="decoder", - weights=weights, - embed_tokens=self.shared, - ) - - try: - self.lm_head = SpeculativeHead.load( - config, prefix="lm_head", weights=weights - ) - except RuntimeError: - # Some models like t5-small were saved with shared weights unlike flan - # Since they are declared as the same arch we have no choice but hope - # that this is OK instead of using a proper flag. - self.lm_head = SpeculativeHead.load( - config, prefix="shared", weights=weights - ) - - def forward( - self, - input_ids: Optional[torch.LongTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - decoder_input_ids: Optional[torch.LongTensor] = None, - decoder_attention_mask: Optional[torch.BoolTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - decoder_head_mask: Optional[torch.FloatTensor] = None, - cross_attn_head_mask: Optional[torch.Tensor] = None, - encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None, - past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - decoder_inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]: - use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = ( - return_dict if return_dict is not None else self.config.use_return_dict - ) - - # FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask - if head_mask is not None and decoder_head_mask is None: - if self.config.num_layers == self.config.num_decoder_layers: - warnings.warn(__HEAD_MASK_WARNING_MSG, FutureWarning) - decoder_head_mask = head_mask - - # Encode if needed (training, first prediction pass) - if encoder_outputs is None: - # Convert encoder inputs in embeddings if needed - encoder_outputs = self.encoder( - input_ids=input_ids, - attention_mask=attention_mask, - inputs_embeds=inputs_embeds, - head_mask=head_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): - encoder_outputs = BaseModelOutput( - last_hidden_state=encoder_outputs[0], - hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, - attentions=encoder_outputs[2] if 
len(encoder_outputs) > 2 else None, - ) - - hidden_states = encoder_outputs[0] - - if ( - labels is not None - and decoder_input_ids is None - and decoder_inputs_embeds is None - ): - # get decoder inputs from shifting lm labels to the right - decoder_input_ids = self._shift_right(labels) - - # Decode - decoder_outputs = self.decoder( - input_ids=decoder_input_ids, - attention_mask=decoder_attention_mask, - inputs_embeds=decoder_inputs_embeds, - past_key_values=past_key_values, - encoder_hidden_states=hidden_states, - encoder_attention_mask=attention_mask, - head_mask=decoder_head_mask, - cross_attn_head_mask=cross_attn_head_mask, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - sequence_output = decoder_outputs[0] - - if self.config.tie_word_embeddings: - # Rescale output before projecting on vocab - # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586 - sequence_output = sequence_output * (self.model_dim**-0.5) - - logits, speculative_logits = self.lm_head(sequence_output) - - loss = None - if labels is not None: - loss_fct = CrossEntropyLoss(ignore_index=-100) - # move labels to correct device to enable PP - labels = labels.to(logits.device) - loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1)) - # TODO(thom): Add z_loss https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666 - - if not return_dict: - output = (logits,) + decoder_outputs[1:] + encoder_outputs - return ((loss,) + output) if loss is not None else output - - return ( - Seq2SeqLMOutput( - loss=loss, - logits=logits, - past_key_values=decoder_outputs.past_key_values, - decoder_hidden_states=decoder_outputs.hidden_states, - decoder_attentions=decoder_outputs.attentions, - cross_attentions=decoder_outputs.cross_attentions, - encoder_last_hidden_state=encoder_outputs.last_hidden_state, - encoder_hidden_states=encoder_outputs.hidden_states, - encoder_attentions=encoder_outputs.attentions, - ), - speculative_logits, - ) - - def prepare_inputs_for_generation( - self, - input_ids, - past_key_values=None, - attention_mask=None, - head_mask=None, - decoder_head_mask=None, - decoder_attention_mask=None, - cross_attn_head_mask=None, - use_cache=None, - encoder_outputs=None, - **kwargs, - ): - # cut decoder_input_ids if past is used - if past_key_values is not None: - input_ids = input_ids[:, -1:] - - return { - "decoder_input_ids": input_ids, - "past_key_values": past_key_values, - "encoder_outputs": encoder_outputs, - "attention_mask": attention_mask, - "head_mask": head_mask, - "decoder_head_mask": decoder_head_mask, - "decoder_attention_mask": decoder_attention_mask, - "cross_attn_head_mask": cross_attn_head_mask, - "use_cache": use_cache, - } - - def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): - return self._shift_right(labels) - - def _reorder_cache(self, past_key_values, beam_idx): - # if decoder past is not included in output - # speedy decoding is disabled and no need to reorder - if past_key_values is None: - logger.warning( - "You might want to consider setting `use_cache=True` to speed up decoding" - ) - return past_key_values - - reordered_decoder_past = () - for layer_past_states in past_key_values: - # get the correct batch idx from layer past batch dim - # batch dim of `past` is at 2nd position - reordered_layer_past_states = () - for layer_past_state in 
layer_past_states: - # need to set correct `past` for each of the four key / value states - reordered_layer_past_states = reordered_layer_past_states + ( - layer_past_state.index_select( - 0, beam_idx.to(layer_past_state.device) - ), - ) - - assert reordered_layer_past_states[0].shape == layer_past_states[0].shape - assert len(reordered_layer_past_states) == len(layer_past_states) - - reordered_decoder_past = reordered_decoder_past + ( - reordered_layer_past_states, - ) - return reordered_decoder_past diff --git a/backends/gaudi/server/text_generation_server/models/custom_modeling/vlm.py b/backends/gaudi/server/text_generation_server/models/custom_modeling/vlm.py index e5c44045a..ae704af31 100644 --- a/backends/gaudi/server/text_generation_server/models/custom_modeling/vlm.py +++ b/backends/gaudi/server/text_generation_server/models/custom_modeling/vlm.py @@ -4,7 +4,7 @@ def load_text_model(prefix, config, weights, name=None): FlashLlamaForCausalLM, ) - return FlashLlamaForCausalLM(prefix, config, weights) + return FlashLlamaForCausalLM(prefix, config, weights, name=name) elif config.model_type == "mistral": from text_generation_server.models.custom_modeling.flash_mistral_modeling import ( FlashMistralForCausalLM, @@ -16,7 +16,13 @@ def load_text_model(prefix, config, weights, name=None): FlashGemmaForCausalLM, ) - return FlashGemmaForCausalLM(prefix, config, weights, causal=False) + return FlashGemmaForCausalLM(prefix, config, weights) + elif config.model_type == "gemma2": + from text_generation_server.models.custom_modeling.flash_gemma2_modeling import ( + FlashGemma2ForCausalLM, + ) + + return FlashGemma2ForCausalLM(prefix, config, weights) elif config.model_type == "paligemma": from text_generation_server.models.custom_modeling.flash_gemma_modeling import ( FlashGemmaForCausalLM, diff --git a/backends/gaudi/server/text_generation_server/models/flash_causal_lm.py b/backends/gaudi/server/text_generation_server/models/flash_causal_lm.py index bc9d44a0b..a4d58596b 100644 --- a/backends/gaudi/server/text_generation_server/models/flash_causal_lm.py +++ b/backends/gaudi/server/text_generation_server/models/flash_causal_lm.py @@ -1,4 +1,3 @@ -from contextlib import nullcontext import math import os import time @@ -16,12 +15,19 @@ from transformers import ( AutoTokenizer, GenerationConfig, ) -from typing import Any, ContextManager, Iterable, Optional, Tuple, List, Type, Dict - +from typing import ( + Any, + Iterable, + Optional, + Tuple, + List, + Type, + Dict, + Union, +) +import torch.nn.functional as F from text_generation_server.adapters import AdapterBatchData, AdapterBatchMetadata -from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE from text_generation_server.utils.chunks import concat_text_chunks -from text_generation_server.utils.import_utils import SYSTEM from text_generation_server.models import Model from text_generation_server.utils.log import log_master from text_generation_server.utils.tokens import batch_top_tokens @@ -39,27 +45,34 @@ from text_generation_server.models.types import ( ) from text_generation_server.pb import generate_pb2 from text_generation_server.models.globals import ( - MEM_POOL, - ATTENTION, BLOCK_SIZE, - CUDA_GRAPHS, + REQUEST_LOGPROBS, TGI_WIGGLE_ROOM, get_adapter_to_index, ) -from text_generation_server.layers.attention import Seqlen +from text_generation_server.layers.attention import ( + KVCache, + Seqlen, + HPUPagedAttentionMetadata, + trim_attn_metadata, + trim_seqlen_metadata, +) from text_generation_server.utils import StoppingCriteria, 
HeterogeneousNextTokenChooser from text_generation_server.utils.dist import MEMORY_FRACTION from text_generation_server.utils.quantization import get_loader from text_generation_server.utils.segments import SegmentConcatBuilder, find_segments - from text_generation_server.utils.import_utils import ( empty_cache, synchronize, get_free_memory, ) -tracer = trace.get_tracer(__name__) +import vllm_hpu_extension.environment as environment +import habana_frameworks.torch as htorch +import itertools +from vllm_hpu_extension.ops import batch2block, block2batch +tracer = trace.get_tracer(__name__) # Will be set in init SLIDING_WINDOW: Optional[int] = None @@ -75,38 +88,75 @@ def get_sliding_windows() -> int: return SLIDING_WINDOW -def init_cpu_threads_env(rank_id: int, world_size: int): - import importlib.util +def prepare_for_decode( + dtype, use_contiguous_pa, device, slot, block_tables, batch_size +): + # Prepare values if we need to continue decoding + # need for HPUPagedAttentionMetadata preparation + def flatten(in_list): + return list(itertools.chain(*in_list)) - if importlib.util.find_spec("numa") is not None: - import numa - import psutil + def gather_list(input, indices, v): + return [input[i] if i is not None else v for i in indices] - nodes = numa.info.get_max_node() + 1 - rank_per_node = math.ceil(world_size / nodes) - num_cpus_per_nodes = int(psutil.cpu_count(logical=False) / nodes) - node_id = int(rank_id / rank_per_node) - rank_offset_per_node = rank_id % rank_per_node - if os.getenv("OMP_NUM_THREADS") is None: - num_cpus_per_rank = max(int(num_cpus_per_nodes / rank_per_node), 1) - else: - num_cpus_per_rank = int(os.getenv("OMP_NUM_THREADS")) - if len(numa.memory.get_membind_nodes()) == nodes: - numa.memory.set_membind_nodes((node_id)) - torch.set_num_threads(num_cpus_per_rank) - if len(numa.schedule.get_affinitive_cpus(0)) == psutil.cpu_count(logical=True): - cpu_start = num_cpus_per_rank * rank_offset_per_node - numa.schedule.run_on_cpus( - 0, - *( - numa.info.node_to_cpus(node_id)[ - cpu_start : cpu_start + num_cpus_per_rank - ] - ), - ) - logger.info( - f"affinity={numa.schedule.get_affinitive_cpus(0)}, membind = {numa.memory.get_membind_nodes()}" + def pad_list(input, k, v): + input_len = len(input) + target_len = (input_len + k - 1) // k * k + padding = target_len - input_len + return input + [v] * padding + + last_block_usage = slot % BLOCK_SIZE + 1 + block_groups = [[i] * len(bt) for i, bt in enumerate(block_tables)] + block_usage = [ + [BLOCK_SIZE] * (len(bt) - 1) + [lbu] + for bt, lbu in zip(block_tables, last_block_usage) + if bt + ] + + block_list = flatten(block_tables) + block_groups = flatten(block_groups) + block_usage = flatten(block_usage) + assert len(block_list) == len(block_groups) + assert len(block_list) == len(block_usage) + if use_contiguous_pa: + block_bucket_size = max(max(block_list) + 1, len(block_list)) + # block_bucket_size = self.bucketing_ctx.get_padded_decode_num_blocks( + # block_bucket_size) + indices: List[Any] + indices = [None] * block_bucket_size + for i, bid in enumerate(block_list): + indices[bid] = i + block_list = gather_list(block_list, indices, 0) + block_groups = gather_list(block_groups, indices, -1) + block_usage = gather_list(block_usage, indices, 1) + else: + block_bucket_size = len(block_list) + block_list = pad_list(block_list, block_bucket_size, 0) + block_groups = pad_list(block_groups, block_bucket_size, -1) + block_usage = pad_list(block_usage, block_bucket_size, 1) + + block_list = torch.tensor(block_list, dtype=torch.int, 
device=device) + block_groups = torch.tensor(block_groups, dtype=torch.int, device=device) + block_usage = torch.tensor(block_usage, dtype=dtype, device=device) + block_mapping = torch.nn.functional.one_hot(block_groups, num_classes=batch_size) + mask = torch.arange(0, BLOCK_SIZE, device=device, dtype=torch.int32).unsqueeze(0) + mask = mask >= block_usage.unsqueeze(-1) + attn_bias = torch.zeros_like(mask, dtype=dtype).masked_fill_(mask, -math.inf) + ones = torch.ones( + (block_mapping.size(0),), device=device, dtype=block_mapping.dtype + ) + sums = batch2block(block2batch(ones, block_mapping), block_mapping) + block_scales = torch.reciprocal(torch.maximum(ones, sums)) + return trim_attn_metadata( + HPUPagedAttentionMetadata( + block_list=block_list, + block_groups=block_groups, + block_usage=block_usage, + block_mapping=block_mapping.to(dtype), + attn_bias=attn_bias, + block_scales=block_scales, ) + ) @dataclass @@ -117,25 +167,17 @@ class FlashCausalLMBatch(Batch): requests_idx_mapping: Dict[int, int] # Decoder values - input_ids: torch.Tensor - position_ids: torch.Tensor + # Can be a list for easy filtering + # If `input_ids` is a list, it needs to be materialized to a tensor first + input_ids: Union[torch.Tensor, List[List[int]]] + # Will be set by `generate_token` and reset after each prefill forward before staying set in decode + position_ids: Optional[torch.Tensor] speculative_ids: Optional[torch.Tensor] - # Flash Attention values - - # tensor of length b containing the cumulative sequence lengths of the sequences in the batch, only used in prefill - cu_seqlen_prefill: Optional[torch.Tensor] - # Prefill cache indices is used to slice into the kv tensor before caching it into the paged attention buffers - # as we only keep SLIDING_WINDOW values instead of the whole tensor - prefill_cache_indices: Optional[torch.Tensor] - - # Paged Attention values - # Set when creating the batch - # CPU tensor of length b indicating the start of each sequence in slots - start_slots: torch.Tensor # tensor of indices of the currently used slots, length = \sum_{i=0}^{b} s_i in prefill, length = b in decode - slot_indices: torch.Tensor + # Will be set by `generate_token` and reset after each prefill forward before staying set in decode + slot_indices: Optional[torch.Tensor] # list of length b of list of length s_i // block_size block_tables: List[List[int]] @@ -143,19 +185,32 @@ class FlashCausalLMBatch(Batch): block_tables_tensor: torch.Tensor # tensor of length \sum_{i=0}^{b} max_s_i holding the paged attention slots for all sequences slots: torch.Tensor - # size [b], containing the number of blocks that can be retrieved from the cache - prefix_lens: List[int] - prefix_lens_tensor: torch.Tensor + # list of length b + 1 containing the cumulative sequence slot lengths of the sequences in the batch + # used for filtering + cu_slots: torch.Tensor - max_seqlen: int + max_input_length: int + max_current_length: int + + # Whether this batch contains at least one request that is prefilling + prefilling: bool + # Whether each request is prefilling + prefilling_mask: List[bool] # Prefill metadata tensors to efficiently compute logprobs + # tensor of length b + 1 containing the cumulative sequence lengths of the sequences in the batch, only used in prefill + cu_seqlen_prefill: Optional[torch.Tensor] + # Prefill cache indices is used to slice into the kv tensor before caching it into the paged attention buffers + # as we only keep SLIDING_WINDOW values instead of the whole tensor + prefill_cache_indices: 
Optional[torch.Tensor] + # Will be set by `generate_token` and reset after each prefill forward prefill_head_indices: Optional[torch.Tensor] + # Will be set by `generate_token` and reset after each prefill forward prefill_next_token_indices: Optional[torch.tensor] + # Will be set by `generate_token` and reset after each prefill forward prefill_cu_outlens: Optional[List[int]] - - # Prefixes - prefix_ids: List[List[int]] + # Will be set by `generate_token` and reset after each prefill forward + prefill_logprob_tokens: List[Optional[Tokens]] # All tokens all_input_ids: List[List[int]] @@ -163,7 +218,14 @@ class FlashCausalLMBatch(Batch): # Lengths of all generations present in the batch input_lengths: List[int] - input_lengths_tensor: torch.Tensor + # size [b], containing the number of blocks that can be retrieved from the cache + cache_lengths: List[int] + prompt_lengths: List[int] + # Will be set by `generate_token` and reset after each prefill forward before staying set in decode + input_lengths_tensor: Optional[torch.Tensor] + cache_lengths_tensor: Optional[torch.Tensor] + prompt_lengths_tensor: torch.Tensor + prefix_offsets: List[Optional[int]] read_offsets: List[Optional[int]] @@ -174,19 +236,27 @@ class FlashCausalLMBatch(Batch): top_n_tokens_tensor: torch.Tensor # Adapter metadata for each request - adapter_meta: AdapterBatchMetadata + # Will be set by `generate_token` and reset after each prefill forward before staying set in decode + adapter_meta: Optional[AdapterBatchMetadata] # Number of blocks in this batch num_blocks: int # Maximum number of blocks max_blocks: int + hpu_attn_meta: Optional[HPUPagedAttentionMetadata] + def to_pb(self) -> generate_pb2.CachedBatch: return generate_pb2.CachedBatch( id=self.batch_id, request_ids=[r.id for r in self.requests], size=len(self), max_tokens=self.num_blocks * BLOCK_SIZE, + current_tokens=( + sum([len(i) for i in self.input_ids]) + if isinstance(self.input_ids, list) + else len(self.input_ids) + ), ) @classmethod @@ -218,86 +288,67 @@ class FlashCausalLMBatch(Batch): dtype: torch.dtype, device: torch.device, ) -> "FlashCausalLMBatch": - sliding_window = get_sliding_windows() - position_ids = [] - cu_seqlen_prefill = [0] - start_slots = [] - slot_indices = [] - prefill_cache_indices = [] - + cache_lengths = [] input_lengths = [] + prompt_lengths = [] prefix_offsets = [] read_offsets = [] all_input_ids = [] - prefix_ids = [] + all_postfix_ids = [] requests_idx_mapping = {} - - all_prefill_logprobs = True - no_prefill_logprobs = True - prefill_head_indices = [] - prefill_next_token_indices = [] - prefill_cu_outlens = [0] + slots = [] + cu_slots = [0] next_token_chooser_parameters = [] stopping_criterias = [] top_n_tokens = [] - adapter_indices_list = [] - adapter_set = set() - - # Cumulative length - cumulative_length = 0 - cumulative_slot_tokens = 0 - prefill_out_cumulative_length = 0 - num_blocks = 0 - max_seqlen = 0 + max_input_length = 0 + max_current_length = 0 max_length = 0 max_blocks = 0 + cu_blocks = [0] block_tables = [] - slots = [] - prefix_lens = [] + block_tables_ragged = [] # Parse batch for i, (r, tokenized_input) in enumerate( zip(pb.requests, batch_tokenized_inputs) ): + ### XXX: This consumes so much memory on long requests + ### Deactivating it by default seems like the best course. 
+ if not REQUEST_LOGPROBS: + r.prefill_logprobs = False # request id -> idx in list mapping requests_idx_mapping[r.id] = i - orig_input_length = len(tokenized_input) + prompt_length = len(tokenized_input) + prompt_lengths.append(prompt_length) + + cache_length = r.cache_len - prefix_len = r.prefix_len assert ( - prefix_len <= orig_input_length - ), f"Prefix {prefix_len} vs input {orig_input_length}" - if prefix_len == orig_input_length: - assert prefix_len > 0 - prefix_len -= 1 + cache_length <= prompt_length + ), f"Prefix {cache_length} vs input {prompt_length}" + if cache_length == prompt_length: + assert False, "unreachable" - # Commented as it's costly. - # log_master(logger.debug, "Tokenized input ids {tokenized_input}") - prefix_ids.append(tokenized_input[:prefix_len]) - tokenized_input = tokenized_input[prefix_len:] + # `chunk_len` is an optional field in the protobuf + # It is only set if the model support chunking + # Use all the remaining ids + postfix_ids = tokenized_input[cache_length:] + input_length = len(postfix_ids) - input_length = len(tokenized_input) input_lengths.append(input_length) - prefix_offsets.append(input_length - 5) - read_offsets.append(input_length) + prefix_offsets.append(prompt_length - 5) + read_offsets.append(prompt_length) + all_postfix_ids.append(postfix_ids) all_input_ids.append(tokenized_input) - # Position ids - request_position_ids = torch.arange( - prefix_len, orig_input_length, dtype=torch.int32 - ) - position_ids.append(request_position_ids) - - # Add cumulative lengths of all previous inputs - cu_seqlen_prefill.append(cumulative_length + input_length) - next_token_chooser_parameters.append(r.parameters) stopping_criteria = StoppingCriteria.from_pb( @@ -307,22 +358,13 @@ class FlashCausalLMBatch(Batch): stopping_criterias.append(stopping_criteria) top_n_tokens.append(r.top_n_tokens) - ADAPTER_TO_INDEX = get_adapter_to_index() - adapter_index = ADAPTER_TO_INDEX.get(r.adapter_id, 0) - adapter_indices_list.append(torch.full((input_length,), adapter_index)) - adapter_set.add(adapter_index) - # Paged attention # Remove one as the first token des not have a past speculative_length = get_speculate() speculative_length = 0 if speculative_length is None else speculative_length # Tokens that need to be mapped to blocks. - block_tokens = orig_input_length + max_new_tokens - 1 + speculative_length - - # Tokens that need to be mapped to slots. We don't need slots for the - # cached prefix (if present). 
- slot_tokens = input_length + max_new_tokens - 1 + speculative_length + block_tokens = prompt_length + max_new_tokens - 1 + speculative_length # blocks and slots can be empty (for example in warmup) if not r.blocks: @@ -337,70 +379,30 @@ class FlashCausalLMBatch(Batch): ] else: request_blocks = r.blocks - request_slots = r.slots[ - prefix_len: #: orig_input_length + max_new_tokens + speculative_length - ] + request_slots = r.slots block_tables.append(request_blocks) + block_tables_ragged.extend(request_blocks) + cu_blocks.append(len(block_tables_ragged)) slots.extend(request_slots) - prefix_lens.append(prefix_len) + cu_slots.append(len(slots)) + + cache_lengths.append(cache_length) num_blocks += len(request_blocks) - start_slots.append(cumulative_slot_tokens) - - request_slot_indices = torch.arange( - cumulative_slot_tokens, - cumulative_slot_tokens + input_length, - dtype=torch.int64, - ) - slot_indices.append(request_slot_indices) - - # Create tensor to slice into the kv tensor in prefill - if sliding_window is not None: - request_prefill_cache_indices = torch.arange( - cumulative_length + max(0, input_length - sliding_window), - cumulative_length + input_length, - dtype=torch.int64, - ) - prefill_cache_indices.append(request_prefill_cache_indices) - - all_prefill_logprobs = all_prefill_logprobs and r.prefill_logprobs - no_prefill_logprobs = no_prefill_logprobs and not r.prefill_logprobs - - if r.prefill_logprobs: - prefill_head_indices.append(request_position_ids + cumulative_length) - prefill_next_token_indices.append( - prefill_out_cumulative_length + input_length - 1 - ) - prefill_cu_outlens.append(prefill_out_cumulative_length + input_length) - prefill_out_cumulative_length += input_length - else: - prefill_head_indices.append( - torch.tensor( - [cumulative_length + input_length - 1], dtype=torch.int32 - ) - ) - prefill_next_token_indices.append(prefill_out_cumulative_length) - prefill_cu_outlens.append(prefill_out_cumulative_length + 1) - prefill_out_cumulative_length += 1 # Update - cumulative_length += input_length - cumulative_slot_tokens += slot_tokens - max_seqlen = max(max_seqlen, input_length) max_blocks = max(max_blocks, len(request_blocks)) + max_input_length = max(max_input_length, input_length) + max_current_length = max(max_current_length, cache_length + input_length) max_length = max( - max_length, input_length + max_new_tokens + speculative_length + max_length, + prompt_length + max_new_tokens + speculative_length, ) - adapter_indices = torch.cat(adapter_indices_list).to( - dtype=torch.int64, device=device - ) - next_token_chooser = HeterogeneousNextTokenChooser.from_pb( next_token_chooser_parameters, dtype, device, tokenizer ) - start_slots = torch.tensor(start_slots, dtype=torch.int64) # Padded all_input_ids_tensor all_input_ids_tensor = np.zeros( @@ -414,103 +416,71 @@ class FlashCausalLMBatch(Batch): all_input_ids_tensor, dtype=torch.int64, device=device ) - if len(pb.requests) > 1: - input_ids = np.concatenate(all_input_ids, dtype=np.int64) - position_ids = torch.cat(position_ids) - slot_indices = torch.cat(slot_indices) - if sliding_window is not None: - prefill_cache_indices = torch.cat(prefill_cache_indices) - else: - input_ids = all_input_ids[0] - position_ids = position_ids[0] - slot_indices = slot_indices[0] - if sliding_window is not None: - prefill_cache_indices = prefill_cache_indices[0] - - cu_seqlen_prefill = torch.tensor( - cu_seqlen_prefill, device=device, dtype=torch.int32 - ) - position_ids = position_ids.to(device) - slot_indices = 
slot_indices.to(device) - prefill_cache_indices = ( - prefill_cache_indices.to(device) if sliding_window is not None else None - ) - input_ids = torch.tensor(input_ids, dtype=torch.int64, device=device) - input_lengths_tensor = torch.tensor( - input_lengths, dtype=torch.int32, device=device - ) - - adapter_segments, adapter_segment_indices = find_segments(adapter_indices) - adapter_segments = torch.tensor( - adapter_segments, dtype=torch.int32, device=device - ) - - if all_prefill_logprobs: - prefill_head_indices = None - prefill_next_token_indices = cu_seqlen_prefill[1:] - 1 - elif no_prefill_logprobs: - prefill_head_indices = cu_seqlen_prefill[1:] - 1 - prefill_next_token_indices = None - else: - prefill_head_indices = torch.tensor( - torch.cat(prefill_head_indices), dtype=torch.int64, device=device - ) - prefill_next_token_indices = torch.tensor( - prefill_next_token_indices, dtype=torch.int64, device=device - ) top_n_tokens_tensor = torch.tensor( top_n_tokens, device=device, dtype=torch.int64 ) - slots = torch.tensor(slots, dtype=torch.int64, device=device) - - block_tables_tensor = torch.zeros( - (len(block_tables), max_blocks), dtype=torch.int32, device="cpu" + block_tables_ragged = torch.tensor( + block_tables_ragged, device=device, dtype=torch.int32 ) + cu_blocks = torch.tensor(cu_blocks, device=device, dtype=torch.int64) + block_tables_tensor = torch.empty( + (len(block_tables), max_blocks), + device=device, + dtype=torch.int32, + ) + for i, request_blocks in enumerate(block_tables): block_tables_tensor[i, : len(request_blocks)] = torch.tensor(request_blocks) - block_tables_tensor = block_tables_tensor.to(device) - prefix_lens_tensor = torch.tensor(prefix_lens, dtype=torch.int32, device=device) + + prompt_lengths_tensor = torch.tensor( + prompt_lengths, dtype=torch.int32, device=device + ) + + slots = torch.tensor(slots, dtype=torch.int64, device=device) + cu_slots = torch.tensor(cu_slots, dtype=torch.int64) return cls( batch_id=pb.id, requests=pb.requests, requests_idx_mapping=requests_idx_mapping, - input_ids=input_ids, - position_ids=position_ids, - cu_seqlen_prefill=cu_seqlen_prefill, - prefill_cache_indices=prefill_cache_indices, - start_slots=start_slots, - slot_indices=slot_indices, + input_ids=all_postfix_ids, block_tables=block_tables, block_tables_tensor=block_tables_tensor, - slots=slots, - prefix_lens=prefix_lens, - prefix_lens_tensor=prefix_lens_tensor, - max_seqlen=max_seqlen, - prefill_head_indices=prefill_head_indices, - prefill_next_token_indices=prefill_next_token_indices, - prefill_cu_outlens=prefill_cu_outlens, + cache_lengths=cache_lengths, + max_input_length=max_input_length, + max_current_length=max_current_length, + prefilling=True, + prefilling_mask=[True] * len(pb.requests), + prefill_logprob_tokens=[None] * len(pb.requests), input_lengths=input_lengths, - input_lengths_tensor=input_lengths_tensor, + prompt_lengths=prompt_lengths, prefix_offsets=prefix_offsets, read_offsets=read_offsets, all_input_ids=all_input_ids, all_input_ids_tensor=all_input_ids_tensor, - prefix_ids=prefix_ids, next_token_chooser=next_token_chooser, stopping_criterias=stopping_criterias, top_n_tokens=top_n_tokens, top_n_tokens_tensor=top_n_tokens_tensor, num_blocks=num_blocks, max_blocks=max_blocks, - adapter_meta=AdapterBatchMetadata( - adapter_indices=adapter_indices, - adapter_set=adapter_set, - adapter_segments=adapter_segments, - segment_indices=adapter_segment_indices, - ), speculative_ids=None, + prompt_lengths_tensor=prompt_lengths_tensor, + # These values will be set by 
`FlashCausalLMBatch.prepare_for_prefill` + position_ids=None, + cu_seqlen_prefill=None, + prefill_cache_indices=None, + slot_indices=None, + slots=slots, + cu_slots=cu_slots, + prefill_head_indices=None, + prefill_next_token_indices=None, + prefill_cu_outlens=None, + cache_lengths_tensor=None, + input_lengths_tensor=None, + adapter_meta=None, + hpu_attn_meta=None, ) @classmethod @@ -533,7 +503,7 @@ class FlashCausalLMBatch(Batch): if len(request_ids) == len(self): return self - device = self.input_ids.device + device = self.block_tables_tensor.device # New values after filtering requests_idx_mapping = {} @@ -548,18 +518,23 @@ class FlashCausalLMBatch(Batch): # Create on CPU to only move to GPU once instead of at every copy slot_indices = torch.empty(len(request_ids), dtype=torch.int64) - max_seqlen = 0 + max_input_length = 0 + max_current_length = 0 requests = [] - start_slots = [] block_tables = [] all_input_ids = [] - prefix_ids = [] + input_ids = [] + prompt_lengths = [] input_lengths = [] - prefix_lens = [] + cache_lengths = [] prefix_offsets = [] read_offsets = [] + cu_slots = [0] + + prefilling_mask = [] + prefill_logprob_tokens = [] stopping_criterias = [] top_n_tokens = [] @@ -567,8 +542,8 @@ class FlashCausalLMBatch(Batch): num_blocks = 0 max_blocks = 0 - # Cumulative length - cumulative_max_length = 0 + max_slots = 0 + cumulative_slot_tokens = 0 for i, request_id in enumerate(request_ids): idx = self.requests_idx_mapping[request_id] @@ -577,16 +552,23 @@ class FlashCausalLMBatch(Batch): requests.append(self.requests[idx]) + # Prefilling + request_prefilling = self.prefilling_mask[idx] + prefilling_mask.append(request_prefilling) + # Get length request_input_length = self.input_lengths[idx] - prefix_len = self.prefix_lens[idx] - max_seqlen = max(max_seqlen, request_input_length) + request_cache_length = self.cache_lengths[idx] + max_input_length = max(max_input_length, request_input_length) + max_current_length = max( + max_current_length, request_cache_length + request_input_length + ) all_input_ids.append(self.all_input_ids[idx]) - prefix_ids.append(self.prefix_ids[idx]) + prompt_lengths.append(self.prompt_lengths[idx]) input_lengths.append(request_input_length) - prefix_lens.append(prefix_len) + cache_lengths.append(request_cache_length) prefix_offsets.append(self.prefix_offsets[idx]) read_offsets.append(self.read_offsets[idx]) @@ -594,60 +576,78 @@ class FlashCausalLMBatch(Batch): stopping_criterias.append(stopping_criteria) top_n_tokens.append(self.top_n_tokens[idx]) + prefill_logprob_tokens.append(self.prefill_logprob_tokens[idx]) ADAPTER_TO_INDEX = get_adapter_to_index() adapter_index = ADAPTER_TO_INDEX.get(self.requests[idx].adapter_id, 0) adapter_set.add(adapter_index) - remaining_tokens = ( - stopping_criteria.max_new_tokens - stopping_criteria.current_tokens - ) - request_block_table = self.block_tables[idx] num_blocks += len(request_block_table) block_tables.append(request_block_table) - start_slots.append(cumulative_max_length) - # Copy to tensor (CPU) - slot_indices[i] = cumulative_max_length + request_input_length - 1 + start_slot = self.cu_slots[idx] + end_slot = self.cu_slots[idx + 1] + slot_length = end_slot - start_slot # Set slice - slot_filtering_indices[ - self.start_slots[idx] : self.start_slots[idx] - + request_input_length - + remaining_tokens - - 1 - ] = True + slot_filtering_indices[start_slot:end_slot] = True - cumulative_max_length += request_input_length + remaining_tokens - 1 + cu_slots.append(cumulative_slot_tokens + slot_length) + # Input ids if 
the request was part of a prefilling batch + # If the batch was decoding we can index into the tensor directly later + if self.prefilling: + input_ids.append(self.input_ids[idx]) + else: + # Copy to tensor (CPU) + slot_indices[i] = cumulative_slot_tokens + request_cache_length + + cumulative_slot_tokens += slot_length max_blocks = max(max_blocks, len(request_block_table)) + max_slots = max(max_slots, slot_length) - # Index into tensors - input_ids = self.input_ids[indices] - position_ids = self.position_ids[indices] - adapter_indices = self.adapter_meta.adapter_indices[indices] all_input_ids_tensor = self.all_input_ids_tensor[indices] block_tables_tensor = self.block_tables_tensor[indices] - input_lengths_tensor = self.input_lengths_tensor[indices] - slots = self.slots[slot_filtering_indices] - prefix_lens_tensor = self.prefix_lens_tensor[indices] next_token_chooser = self.next_token_chooser.filter(indices) top_n_tokens_tensor = self.top_n_tokens_tensor[indices] speculative_ids = ( self.speculative_ids[indices] if self.speculative_ids is not None else None ) + prompt_lengths_tensor = self.prompt_lengths_tensor[indices] - start_slots = torch.tensor(start_slots, dtype=torch.int64) + cu_slots = torch.tensor(cu_slots, dtype=torch.int64) - # Move to GPU now that we have the whole tensor - slot_indices = slot_indices.to(device) + slots = self.slots[slot_filtering_indices] - adapter_segments, adapter_segment_indices = find_segments(adapter_indices) - adapter_segments = torch.tensor( - adapter_segments, dtype=torch.int32, device=device - ) - # assert sum(len(b) for b in block_tables) == (block_tables_tensor != 0).sum() + if self.prefilling: + # These values will be set by `FlashCausalLMBatch.prepare_for_prefill` + position_ids = None + slot_indices = None + cache_lengths_tensor = None + input_lengths_tensor = None + adapter_meta = None + else: + # Index into tensors + input_ids = self.input_ids[indices] + position_ids = self.position_ids[indices] + adapter_indices = self.adapter_meta.adapter_indices[indices] + input_lengths_tensor = self.input_lengths_tensor[indices] + cache_lengths_tensor = self.cache_lengths_tensor[indices] + + # Move to GPU now that we have the whole tensor + slot_indices = slot_indices.to(device) + + adapter_segments, adapter_segment_indices = find_segments(adapter_indices) + adapter_segments = torch.tensor( + adapter_segments, dtype=torch.int32, device=device + ) + adapter_meta = AdapterBatchMetadata( + adapter_indices=adapter_indices, + adapter_set=adapter_set, + adapter_segments=adapter_segments, + segment_indices=adapter_segment_indices, + ) return type(self)( batch_id=self.batch_id, @@ -657,24 +657,29 @@ class FlashCausalLMBatch(Batch): position_ids=position_ids, cu_seqlen_prefill=None, prefill_cache_indices=None, - start_slots=start_slots, slot_indices=slot_indices, block_tables=block_tables, block_tables_tensor=block_tables_tensor, slots=slots, - max_seqlen=max_seqlen, + cu_slots=cu_slots, + max_input_length=max_input_length, + max_current_length=max_current_length, + prefilling=self.prefilling, + prefilling_mask=prefilling_mask, prefill_head_indices=None, prefill_next_token_indices=None, prefill_cu_outlens=None, + prefill_logprob_tokens=prefill_logprob_tokens, + prompt_lengths=prompt_lengths, + prompt_lengths_tensor=prompt_lengths_tensor, input_lengths=input_lengths, input_lengths_tensor=input_lengths_tensor, - prefix_lens=prefix_lens, - prefix_lens_tensor=prefix_lens_tensor, + cache_lengths=cache_lengths, + cache_lengths_tensor=cache_lengths_tensor, 
prefix_offsets=prefix_offsets, read_offsets=read_offsets, all_input_ids=all_input_ids, all_input_ids_tensor=all_input_ids_tensor, - prefix_ids=prefix_ids, next_token_chooser=next_token_chooser, stopping_criterias=stopping_criterias, top_n_tokens=top_n_tokens, @@ -682,12 +687,8 @@ class FlashCausalLMBatch(Batch): num_blocks=num_blocks, max_blocks=max_blocks, speculative_ids=speculative_ids, - adapter_meta=AdapterBatchMetadata( - adapter_indices=adapter_indices, - adapter_set=adapter_set, - adapter_segments=adapter_segments, - segment_indices=adapter_segment_indices, - ), + adapter_meta=adapter_meta, + hpu_attn_meta=None, ) @classmethod @@ -697,74 +698,105 @@ class FlashCausalLMBatch(Batch): requests = [] requests_idx_mapping = {} + prefilling = False num_blocks = 0 total_batch_size = 0 total_slots = 0 max_blocks = 0 max_length = 0 - max_seqlen = 0 + max_input_length = 0 + max_current_length = 0 for b in batches: total_batch_size += len(b) + max_blocks = max(max_blocks, b.max_blocks) total_slots += len(b.slots) num_blocks += b.num_blocks speculative_length = ( b.speculative_ids.shape[1] if b.speculative_ids is not None else 0 ) - max_blocks = max(max_blocks, b.max_blocks) - max_seqlen = max(max_seqlen, b.max_seqlen) + max_input_length = max(max_input_length, b.max_input_length) + max_current_length = max(max_current_length, b.max_current_length) max_length = max( max_length, max( - input_length + prompt_length + stopping_criteria.max_new_tokens + speculative_length - - stopping_criteria.current_tokens - for input_length, stopping_criteria in zip( - b.input_lengths, b.stopping_criterias + for prompt_length, stopping_criteria in zip( + b.prompt_lengths, b.stopping_criterias ) ), ) + prefilling = prefilling or b.prefilling - input_ids = batches[0].input_ids.new_empty(total_batch_size) - position_ids = batches[0].position_ids.new_empty(total_batch_size) slots = batches[0].slots.new_empty(total_slots) - slot_indices = batches[0].slot_indices.new_empty(total_batch_size) - input_lengths_tensor = batches[0].input_lengths_tensor.new_empty( + cu_slots = torch.zeros(total_batch_size + 1, dtype=torch.int64) + if prefilling: + input_ids = [] + # These values will be set by `FlashCausalLMBatch.prepare_for_prefill` + position_ids = None + slot_indices = None + cache_lengths_tensor = None + input_lengths_tensor = None + adapter_meta = None + adapter_segment_builder = None + else: + input_ids = batches[0].input_ids.new_empty(total_batch_size) + if ( + batches[0].position_ids is not None + and batches[0].position_ids.dim() == 2 + ): + # Qwen2_vl case: + position_ids = batches[0].position_ids.new_empty( + (total_batch_size, batches[0].position_ids.shape[-1]) + ) + else: + position_ids = batches[0].position_ids.new_empty(total_batch_size) + slot_indices = batches[0].slot_indices.new_empty(total_batch_size) + input_lengths_tensor = batches[0].input_lengths_tensor.new_empty( + total_batch_size + ) + cache_lengths_tensor = batches[0].cache_lengths_tensor.new_empty( + total_batch_size + ) + total_indices_size = sum( + b.adapter_meta.adapter_indices.shape[0] for b in batches + ) + adapter_indices = batches[0].adapter_meta.adapter_indices.new_empty( + total_indices_size + ) + adapter_segment_builder = SegmentConcatBuilder() + adapter_set = set() + + prompt_lengths_tensor = batches[0].prompt_lengths_tensor.new_empty( total_batch_size ) block_tables_tensor = batches[0].block_tables_tensor.new_zeros( (total_batch_size, max_blocks) ) - prefix_lens_tensor = batches[0].prefix_lens_tensor.new_empty(total_batch_size) 
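For readers following the `concatenate` hunk: the merged batch preallocates its tensors up front, then the copy loop below writes every source batch into a slice, shifting cumulative offsets such as `cu_slots` by the number of slots already consumed (the `batch.cu_slots[1:] + cumulative_slots` line). A minimal standalone sketch of that offset-shifting idea, with invented batch contents and a hypothetical `merge_cu_slots` helper, not part of the patch:

import torch

def merge_cu_slots(per_batch_cu_slots):
    # Each input is a cumulative tensor such as [0, 3, 7]: request 0 owns 3
    # slots, request 1 owns 4, for 7 slots total in that batch.
    total_requests = sum(len(cu) - 1 for cu in per_batch_cu_slots)
    merged = torch.zeros(total_requests + 1, dtype=torch.int64)
    cumulative_slots = 0
    start = 0
    for cu in per_batch_cu_slots:
        n = len(cu) - 1
        # Shift this batch's offsets by the slots consumed by earlier batches,
        # mirroring `batch.cu_slots[1:] + cumulative_slots` in the hunk.
        merged[start + 1 : start + n + 1] = cu[1:] + cumulative_slots
        cumulative_slots += int(cu[-1])
        start += n
    return merged

# Example: two batches with 7 and 5 slots respectively.
print(merge_cu_slots([torch.tensor([0, 3, 7]), torch.tensor([0, 5])]))
# tensor([ 0,  3,  7, 12])
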
all_input_ids_tensor = batches[0].all_input_ids_tensor.new_zeros( (total_batch_size, max_length) ) top_n_tokens_tensor = batches[0].top_n_tokens_tensor.new_zeros( total_batch_size, ) - total_indices_size = sum( - b.adapter_meta.adapter_indices.shape[0] for b in batches - ) - adapter_indices = batches[0].adapter_meta.adapter_indices.new_empty( - total_indices_size - ) - adapter_set = set() - adapter_segment_builder = SegmentConcatBuilder() - start_slots = [] block_tables = [] - prefix_lens = [] + cache_lengths = [] all_input_ids = [] - prefix_ids = [] + prompt_lengths = [] input_lengths = [] prefix_offsets = [] read_offsets = [] + prefill_logprob_tokens = [] + next_token_chooser_parameters = [] fsm_grammar_states = [] stopping_criterias = [] top_n_tokens = [] + prefilling_mask = [] # Cumulative length cumulative_batch_size = 0 @@ -783,32 +815,9 @@ class FlashCausalLMBatch(Batch): start_index = cumulative_batch_size end_index = cumulative_batch_size + len(batch) - slots_start_index = cumulative_slots - slots_end_index = cumulative_slots + len(batch.slots) # Copy tensors (GPU) - input_ids[start_index:end_index] = batch.input_ids - position_ids[start_index:end_index] = batch.position_ids - slot_indices[start_index:end_index] = batch.slot_indices + cumulative_slots - input_lengths_tensor[start_index:end_index] = batch.input_lengths_tensor top_n_tokens_tensor[start_index:end_index] = batch.top_n_tokens_tensor - slots[slots_start_index:slots_end_index] = batch.slots - - # Copy over adapter indices - adapter_start_index = cumulative_adapter_indices_size - adapter_end_index = ( - cumulative_adapter_indices_size - + batch.adapter_meta.adapter_indices.shape[0] - ) - adapter_indices[adapter_start_index:adapter_end_index] = ( - batch.adapter_meta.adapter_indices - ) - cumulative_adapter_indices_size = adapter_end_index - adapter_set.update(batch.adapter_meta.adapter_set) - adapter_segment_builder.concat( - batch.adapter_meta.adapter_segments, batch.adapter_meta.segment_indices - ) - all_input_ids_tensor[ start_index:end_index, : batch.all_input_ids_tensor.shape[1] ] = batch.all_input_ids_tensor[:, :max_length] @@ -816,20 +825,56 @@ class FlashCausalLMBatch(Batch): block_tables_tensor[ start_index:end_index, : batch.block_tables_tensor.shape[1] ] = batch.block_tables_tensor[:, :max_blocks] + prompt_lengths_tensor[start_index:end_index] = batch.prompt_lengths_tensor - prefix_lens_tensor[start_index:end_index] = batch.prefix_lens_tensor + slots_start_index = cumulative_slots + slots_end_index = cumulative_slots + len(batch.slots) + slots[slots_start_index:slots_end_index] = batch.slots + cu_slots[start_index + 1 : end_index + 1] = ( + batch.cu_slots[1:] + cumulative_slots + ) - start_slots.append(batch.start_slots + cumulative_slots) + if not prefilling: + input_ids[start_index:end_index] = batch.input_ids + position_ids[start_index:end_index] = batch.position_ids + slot_indices[start_index:end_index] = ( + batch.slot_indices + cumulative_slots + ) + input_lengths_tensor[start_index:end_index] = batch.input_lengths_tensor + cache_lengths_tensor[start_index:end_index] = batch.cache_lengths_tensor + # Copy over adapter indices + adapter_start_index = cumulative_adapter_indices_size + adapter_end_index = ( + cumulative_adapter_indices_size + + batch.adapter_meta.adapter_indices.shape[0] + ) + adapter_indices[adapter_start_index:adapter_end_index] = ( + batch.adapter_meta.adapter_indices + ) + cumulative_adapter_indices_size = adapter_end_index + adapter_set.update(batch.adapter_meta.adapter_set) + 
adapter_segment_builder.concat( + batch.adapter_meta.adapter_segments, + batch.adapter_meta.segment_indices, + ) + else: + if isinstance(batch.input_ids, torch.Tensor): + batch.input_ids = batch.input_ids.view(-1, 1).tolist() + input_ids.extend(batch.input_ids) + + prefilling_mask.extend(batch.prefilling_mask) block_tables.extend(batch.block_tables) - prefix_lens.extend(batch.prefix_lens) + cache_lengths.extend(batch.cache_lengths) all_input_ids.extend(batch.all_input_ids) - prefix_ids.extend(batch.prefix_ids) + prompt_lengths.extend(batch.prompt_lengths) input_lengths.extend(batch.input_lengths) prefix_offsets.extend(batch.prefix_offsets) read_offsets.extend(batch.read_offsets) + prefill_logprob_tokens.extend(batch.prefill_logprob_tokens) + next_token_chooser_parameters.extend([r.parameters for r in batch.requests]) fsm_grammar_states.extend(batch.next_token_chooser.fsm_grammar_states) stopping_criterias.extend(batch.stopping_criterias) @@ -837,12 +882,8 @@ class FlashCausalLMBatch(Batch): top_n_tokens.extend(batch.top_n_tokens) # Update - cumulative_batch_size += len(batch) cumulative_slots += len(batch.slots) - - start_slots = torch.concat(start_slots) - - # assert sum(len(b) for b in block_tables) == (block_tables_tensor != 0).sum() + cumulative_batch_size += len(batch) next_token_chooser = HeterogeneousNextTokenChooser.from_pb( next_token_chooser_parameters, @@ -852,13 +893,21 @@ class FlashCausalLMBatch(Batch): fsm_grammar_states=fsm_grammar_states, ) - speculative_ids = ( - torch.cat([b.speculative_ids for b in batches], dim=0) - if batches[0].speculative_ids is not None - else None - ) + # We skip computing the speculative_ids when the batch size is too large, so + # we must check that all batches have them, otherwise they must be discarded + if get_speculate() > 0 and all(b.speculative_ids is not None for b in batches): + speculative_ids = torch.cat([b.speculative_ids for b in batches], dim=0) + else: + speculative_ids = None - adapter_segments, adapter_segment_indices = adapter_segment_builder.build() + if adapter_segment_builder is not None: + adapter_segments, adapter_segment_indices = adapter_segment_builder.build() + adapter_meta = AdapterBatchMetadata( + adapter_indices=adapter_indices, + adapter_set=adapter_set, + adapter_segments=adapter_segments, + segment_indices=adapter_segment_indices, + ) return cls( batch_id=batches[0].batch_id, @@ -868,24 +917,29 @@ class FlashCausalLMBatch(Batch): position_ids=position_ids, cu_seqlen_prefill=None, prefill_cache_indices=None, - start_slots=start_slots, slot_indices=slot_indices, block_tables=block_tables, block_tables_tensor=block_tables_tensor, - prefix_lens=prefix_lens, - prefix_lens_tensor=prefix_lens_tensor, + cache_lengths=cache_lengths, + cache_lengths_tensor=cache_lengths_tensor, slots=slots, - max_seqlen=max_seqlen, + cu_slots=cu_slots, + max_input_length=max_input_length, + max_current_length=max_current_length, + prefilling=prefilling, + prefilling_mask=prefilling_mask, prefill_head_indices=None, prefill_next_token_indices=None, prefill_cu_outlens=None, + prefill_logprob_tokens=prefill_logprob_tokens, + prompt_lengths=prompt_lengths, + prompt_lengths_tensor=prompt_lengths_tensor, input_lengths=input_lengths, input_lengths_tensor=input_lengths_tensor, prefix_offsets=prefix_offsets, read_offsets=read_offsets, all_input_ids=all_input_ids, all_input_ids_tensor=all_input_ids_tensor, - prefix_ids=prefix_ids, next_token_chooser=next_token_chooser, stopping_criterias=stopping_criterias, top_n_tokens=top_n_tokens, @@ -893,12 
+947,286 @@ class FlashCausalLMBatch(Batch): num_blocks=num_blocks, max_blocks=max_blocks, speculative_ids=speculative_ids, - adapter_meta=AdapterBatchMetadata( - adapter_indices=adapter_indices, - adapter_set=adapter_set, - adapter_segments=adapter_segments, - segment_indices=adapter_segment_indices, - ), + adapter_meta=adapter_meta, + hpu_attn_meta=None, + ) + + def prepare_for_decode(self, dtype, use_contiguous_pa): + block_num = self.cache_lengths_tensor // BLOCK_SIZE + 1 + block_tables = [] + for i, bt in enumerate(self.block_tables): + block_tables.append(bt[0 : block_num[i]]) + + self.hpu_attn_meta = prepare_for_decode( + dtype, + use_contiguous_pa, + self.block_tables_tensor.device, + self.slots[self.slot_indices], + block_tables, + self.input_ids.size(0), + ) + + def prepare_for_prefill(self): + # Prepare values if we need to continue prefilling + # Speculation must be ignored while we prefill even with chunking + # it simplifies everything + assert self.speculative_ids is None + + device = self.block_tables_tensor.device + + # hpu does not support varlen for prefill, use sdpa instead. so need to pad input_tensor, position + # padding to left to work with sliding window + # use prefill_cache_indices to indicate the valid kv slot, update prefill_next_token_indices to indicate + # the right logit position + input_ids_padded_length = [] + # need extra pad to match warmup seq + extra_pad = 0 + if isinstance(self.input_ids, list) and len(self) > 1: + input_ids_padded_length = [] + input_ids = [] + for input_id in self.input_ids: + padded = self.max_input_length - len(input_id) + extra_pad + if padded > 0: + input_id = [0] * padded + input_id + input_ids.append(input_id) + input_ids_padded_length.append(padded) + input_ids = np.concatenate(input_ids, dtype=np.int64) + self.input_ids = torch.tensor(input_ids, dtype=torch.int64, device=device) + elif isinstance(self.input_ids, list): + input_ids = self.input_ids[0] + input_ids_padded_length.append(extra_pad) + input_ids = [0] * extra_pad + input_ids + self.input_ids = torch.tensor(input_ids, dtype=torch.int64, device=device) + else: + self.input_ids = F.pad(self.input_ids, (extra_pad, 0), value=0) + input_ids_padded_length.append(extra_pad) + + self.input_lengths_tensor = torch.tensor( + self.input_lengths, dtype=torch.int32, device=device + ) + cu_seqlen_prefill = self.input_lengths_tensor.new_zeros(len(self) + 1) + torch.cumsum(self.input_lengths_tensor, out=cu_seqlen_prefill[1:], dim=0) + self.cu_seqlen_prefill = cu_seqlen_prefill.to(torch.int32) + self.cache_lengths_tensor = torch.tensor( + self.cache_lengths, dtype=torch.int32, device=device + ) + + sliding_window = get_sliding_windows() + position_ids = [] + slot_indices = [] + prefill_cache_indices = [] + all_prefill_logprobs = True + no_prefill_logprobs = True + prefill_cu_outlens = [0] + + # Cumulative length + cumulative_length = 0 + cumulative_slot_tokens = 0 + prefill_out_cumulative_length = 0 + + adapter_indices_list = [] + adapter_set = set() + + for i, ( + r, + cache_length, + input_length, + prompt_length, + request_prefilling, + blocks, + ) in enumerate( + zip( + self.requests, + self.cache_lengths, + self.input_lengths, + self.prompt_lengths, + self.prefilling_mask, + self.block_tables, + ) + ): + next_chunk_length = input_length + + # Position ids + request_position_ids = torch.arange( + cache_length, cache_length + input_length, dtype=torch.int32 + ) + request_position_ids = F.pad( + request_position_ids, (input_ids_padded_length[i], 0), value=1 + ) + 
position_ids.append(request_position_ids) + + if not r.slots: + request_slots = [ + s + for b in blocks + for s in range(b * BLOCK_SIZE, (b + 1) * BLOCK_SIZE) + ] + else: + request_slots = r.slots + + request_slot_indices = torch.arange( + cache_length + cumulative_slot_tokens, + cache_length + cumulative_slot_tokens + input_length, + dtype=torch.int64, + ) + + slot_indices.append(request_slot_indices) + + # Update + cumulative_slot_tokens += len(request_slots) + + # Create tensor to slice into the kv tensor in prefill + # hpu need request_prefill_cache_indices to skip padding in kv cache + sliding_window = get_sliding_windows() + if sliding_window is None: + sliding_window = input_length + cumulative_length += input_ids_padded_length[i] + if sliding_window is not None: + request_prefill_cache_indices = torch.arange( + cumulative_length + max(0, input_length - sliding_window), + cumulative_length + input_length, + dtype=torch.int64, + ) + + # Prefill logprobs is ignored if the request is done prefilling + prefill_logprobs = r.prefill_logprobs and request_prefilling + + all_prefill_logprobs = all_prefill_logprobs and prefill_logprobs + no_prefill_logprobs = no_prefill_logprobs and not prefill_logprobs + + if prefill_logprobs: + prefill_cu_outlens.append(prefill_out_cumulative_length + input_length) + prefill_out_cumulative_length += input_length + else: + prefill_cu_outlens.append(prefill_out_cumulative_length + 1) + prefill_out_cumulative_length += 1 + + prefill_cache_indices.append(request_prefill_cache_indices) + + ADAPTER_TO_INDEX = get_adapter_to_index() + if ADAPTER_TO_INDEX: + adapter_index = ADAPTER_TO_INDEX.get(r.adapter_id, 0) + adapter_indices_list.append( + torch.full((next_chunk_length,), adapter_index) + ) + adapter_set.add(adapter_index) + + # Update + cumulative_length += next_chunk_length + + if not all_prefill_logprobs and not no_prefill_logprobs: + prefill_head_indices = [] + prefill_next_token_indices = [] + + # Cumulative length + cumulative_length = 0 + prefill_out_cumulative_length = 0 + + for i, ( + r, + input_length, + request_prefilling, + ) in enumerate( + zip( + self.requests, + self.input_lengths, + self.prefilling_mask, + ) + ): + # Prefill logprobs is ignored if the request is done prefilling + prefill_logprobs = r.prefill_logprobs and request_prefilling + + if prefill_logprobs: + prefill_head_indices.append( + torch.arange( + cumulative_length, + cumulative_length + input_length, + dtype=torch.int64, + ) + ) + prefill_next_token_indices.append( + prefill_out_cumulative_length + input_length - 1 + ) + prefill_out_cumulative_length += input_length + else: + prefill_head_indices.append( + torch.tensor( + [cumulative_length + input_length - 1], + dtype=torch.int64, + ) + ) + prefill_next_token_indices.append(prefill_out_cumulative_length) + prefill_out_cumulative_length += 1 + + # Update + cumulative_length += input_length + + if len(self) > 1: + if position_ids: + position_ids = torch.cat(position_ids) + if slot_indices: + slot_indices = torch.cat(slot_indices) + prefill_cache_indices = torch.cat(prefill_cache_indices) + else: + if position_ids: + position_ids = position_ids[0] + if slot_indices: + slot_indices = slot_indices[0] + prefill_cache_indices = prefill_cache_indices[0] + + self.position_ids = position_ids.to(device) + self.slot_indices = slot_indices.to(device) + + self.prefill_cu_outlens = prefill_cu_outlens + self.prefill_cache_indices = torch.zeros_like(self.input_ids, dtype=torch.bool) + self.prefill_cache_indices[prefill_cache_indices.to(device)] 
= True + + if all_prefill_logprobs: + prefill_head_indices = None + prefill_next_token_indices = self.cu_seqlen_prefill[1:] - 1 + elif no_prefill_logprobs: + prefill_head_indices = self.cu_seqlen_prefill[1:] - 1 + prefill_next_token_indices = None + else: + prefill_head_indices = torch.cat(prefill_head_indices).to(device) + prefill_next_token_indices = torch.tensor( + prefill_next_token_indices, dtype=torch.int64, device=device + ) + + self.prefill_head_indices = prefill_head_indices + self.prefill_next_token_indices = prefill_next_token_indices + input_ids_padded_length_tensor = torch.cumsum( + torch.tensor(input_ids_padded_length, dtype=torch.int64, device=device), + dim=-1, + ) + if self.prefill_head_indices is not None: + self.prefill_head_indices = ( + self.prefill_head_indices + input_ids_padded_length_tensor + ) + + if self.prefill_next_token_indices is not None: + self.prefill_next_token_indices = ( + self.prefill_next_token_indices + input_ids_padded_length_tensor + ) + + if adapter_set: + adapter_indices = torch.cat(adapter_indices_list).to( + dtype=torch.int64, device=device + ) + adapter_segments, adapter_segment_indices = find_segments(adapter_indices) + else: + adapter_indices = torch.zeros_like(self.input_ids) + adapter_segments = [0, len(adapter_indices)] + adapter_segment_indices = [len(adapter_indices) - 1] + + adapter_segments = torch.tensor( + adapter_segments, dtype=torch.int32, device=device + ) + self.adapter_meta = AdapterBatchMetadata( + adapter_indices=adapter_indices, + adapter_set=adapter_set, + adapter_segments=adapter_segments, + segment_indices=adapter_segment_indices, ) def __len__(self): @@ -937,23 +1265,14 @@ class FlashCausalLM(Model): # Deepseek V2 uses different QK and V dims. head_size: Optional[int] = None, skip_special_tokens: bool = True, + kv_cache_dtype: Optional[torch.dtype] = None, + support_chunking: bool = True, ): self.quantize = quantize self.process_group, rank, world_size = initialize_torch_distributed() - if torch.cuda.is_available(): - device = torch.device(f"cuda:{rank}") - dtype = default_dtype if dtype is None else dtype - elif SYSTEM == "ipex": - if hasattr(torch, "xpu") and torch.xpu.is_available(): - device = torch.device(f"xpu:{rank}") - dtype = default_dtype if dtype is None else dtype - else: - device = torch.device("cpu") - # Float16 doesn't exist on target. 
- dtype = torch.bfloat16 if dtype is None else dtype - init_cpu_threads_env(rank_id=rank, world_size=world_size) - else: - raise NotImplementedError(f"{model_class} is only available on GPU") + + device = torch.device("hpu") + dtype = torch.bfloat16 if dtype is None else dtype tokenizer = tokenizer_class.from_pretrained( model_id, @@ -991,7 +1310,7 @@ class FlashCausalLM(Model): weights_loader=weights_loader, ) - prefix = "" + prefix = None model = model_class(prefix, config, weights) torch.distributed.barrier(group=self.process_group) @@ -1007,6 +1326,7 @@ class FlashCausalLM(Model): self.num_layers = config.num_hidden_layers self.num_heads = config.num_attention_heads // self.process_group.size() + self.config = config # Validation is done in the model itself if num_kv_heads is None: num_kv_heads = getattr(config, "num_key_value_heads", None) @@ -1034,25 +1354,14 @@ class FlashCausalLM(Model): self.cuda_graphs = {} self.kv_cache = [] + self.kv_cache_dtype = dtype if kv_cache_dtype is None else kv_cache_dtype - if ATTENTION == "flashinfer": - from text_generation_server.layers.attention.flashinfer import ( - create_prefill_state, - create_decode_state, - create_prefill_with_paged_kv_state, - ) - - self.prefill_state = create_prefill_state(device=device) - self.prefill_with_paged_kv_state = create_prefill_with_paged_kv_state( - device=device - ) - - self.decode_state = create_decode_state( - device=device, - num_heads=self.num_heads, - num_kv_heads=self.num_kv_heads, - ) - + if htorch.utils.internal.is_lazy(): + htorch.hpu.wrap_in_hpu_graph(model, disable_tensor_cache=False) + environment.set_model_config(self.config) + self.use_contiguous_pa = ( + os.environ.get("VLLM_CONTIGUOUS_PA", "true").lower() == "true" + ) super().__init__( model_id=model_id, model=model, @@ -1063,6 +1372,7 @@ class FlashCausalLM(Model): rank=rank, world_size=world_size, sliding_window=config.sliding_window, + support_chunking=support_chunking, ) @property @@ -1083,317 +1393,126 @@ class FlashCausalLM(Model): ): self.kv_cache = [] empty_cache() - - element_size = torch.tensor([], dtype=dtype).element_size() - if SYSTEM == "ipex" and device.type == "xpu": - x = 1 - else: - x = BLOCK_SIZE // element_size - - if ATTENTION in {"flashdecoding", "flashinfer"}: - self.kv_cache = [ - ( - torch.empty( - (num_blocks, BLOCK_SIZE, num_heads, head_size), - dtype=dtype, - device=device, - ), - torch.empty( - (num_blocks, BLOCK_SIZE, num_heads, head_size), - dtype=dtype, - device=device, - ), - ) - for _ in range(num_layers) - ] - elif SYSTEM == "ipex" and device == torch.device("cpu"): - self.kv_cache = [ - ( - torch.empty( - (num_blocks, num_heads, BLOCK_SIZE, head_size), - dtype=dtype, - device=device, - ), - torch.empty( - (num_blocks, num_heads, BLOCK_SIZE, head_size), - dtype=dtype, - device=device, - ), - ) - for _ in range(num_layers) - ] - else: - self.kv_cache = [ - ( - torch.zeros( - (num_blocks, num_heads, head_size // x, BLOCK_SIZE, x), - dtype=dtype, - device=device, - ), - torch.zeros( - (num_blocks, num_heads, head_size, BLOCK_SIZE), - dtype=dtype, - device=device, - ), - ) - for _ in range(num_layers) - ] - - def cuda_graph_warmup(self, bs: int, max_s: int, max_bt: int): - input_ids = torch.zeros(bs, dtype=torch.int64, device=self.device) - position_ids = torch.zeros(bs, dtype=torch.int32, device=self.device) - slots = torch.arange(bs, dtype=torch.int64, device=self.device) - input_lengths = [max_s] * bs - prefix_lengths = [0] * bs - input_lengths_tensor = ( - torch.ones(bs, dtype=torch.int32, device=self.device) 
* max_s - ) - prefix_lengths_tensor = torch.zeros(bs, dtype=torch.int32, device=self.device) - block_tables = torch.arange( - max_bt, dtype=torch.int32, device=self.device - ).repeat(bs) - block_tables = block_tables.reshape((bs, max_bt)) - - if ATTENTION == "flashinfer": - block_tables = block_tables_to_ragged( - block_tables=block_tables, - input_lengths=input_lengths, - prefix_lens=prefix_lengths, - ) - from text_generation_server.layers.attention.flashinfer import ( - create_decode_state_cuda_graphs, + self.kv_cache = [ + KVCache( + num_blocks=num_blocks, + num_heads=num_heads, + head_size=head_size, + dtype=dtype, + device=device, ) + for _ in range(num_layers) + ] - block_tables_ptr = torch.zeros( - bs + 1, dtype=torch.int32, device=self.device - ) - last_page_len = torch.ones(bs, dtype=torch.int32, device=self.device) - state = create_decode_state_cuda_graphs( - device=input_ids.device, - block_tables=block_tables, - block_tables_ptr=block_tables_ptr, - last_page_len=last_page_len, - num_heads=self.num_heads, - num_kv_heads=self.num_kv_heads, - ) - else: - state = None - - graph = torch.cuda.CUDAGraph() - self.cuda_graphs[bs] = { - "input_ids": input_ids, - "position_ids": position_ids, - "kv_cache": self.kv_cache, - "block_tables": block_tables, - "slots": slots, - "input_lengths": input_lengths_tensor, - "prefix_lengths": prefix_lengths_tensor, - "state": state, - "graph": graph, - } - - torch.cuda.synchronize() - # Run once outside to warmup - with self._forward_context( - block_tables=block_tables, - cu_seqlen_prefill=None, - input_lengths_tensor=input_lengths_tensor, - state=state, - prefix_lens_tensor=prefix_lengths_tensor, - ): - seqlen = Seqlen( - input_lengths=input_lengths_tensor, - prefix_lengths=prefix_lengths_tensor, - cu_seqlen_q=None, - max_q=1, - max_k=max_s, - ) - self.model.forward( - input_ids=input_ids, - position_ids=position_ids, - cu_seqlen_prefill=None, - kv_cache=self.kv_cache, - block_tables=block_tables, - slots=slots, - seqlen=seqlen, - max_s=max_s, - prefill_cache_indices=None, - lm_head_indices=None, - ) - del seqlen - - torch.cuda.synchronize() - - with torch.cuda.graph(graph, pool=MEM_POOL): - seqlen = Seqlen( - input_lengths=input_lengths_tensor, - prefix_lengths=prefix_lengths_tensor, - cu_seqlen_q=None, - max_q=1, - max_k=max_s, - ) - logits, speculative_logits = self.model.forward( - input_ids=input_ids, - position_ids=position_ids, - cu_seqlen_prefill=None, - kv_cache=self.kv_cache, - block_tables=block_tables, - slots=slots, - seqlen=seqlen, - max_s=max_s, - prefill_cache_indices=None, - lm_head_indices=None, - ) - self.cuda_graphs[bs]["logits"] = logits - self.cuda_graphs[bs]["speculative_logits"] = speculative_logits - torch.cuda.synchronize() - - def warmup(self, batch: FlashCausalLMBatch): + def warmup( + self, + batch: FlashCausalLMBatch, + max_input_tokens: Optional[int], + max_total_tokens: Optional[int], + ): # The warmup batch is the biggest batch we could ever receive + self.kv_cache = [] empty_cache() + # Inspired by the original implementation in [vllm](https://github.com/vllm-project/vllm) + # Calculate the number of blocks that can be allocated with the free memory + dtype_size = torch.tensor([], dtype=self.kv_cache_dtype).element_size() + cache_block_size = BLOCK_SIZE * self.num_kv_heads * self.head_size + total_cache_size = self.num_layers * cache_block_size * 2 * dtype_size + try: self.init_kv_cache( batch.num_blocks, self.num_layers, self.num_kv_heads, self.head_size, - self.dtype, + self.kv_cache_dtype, self.device, ) - max_bt 
= batch.max_blocks - max_s = max_bt * BLOCK_SIZE - if SYSTEM == "rocm" and os.environ.get("PYTORCH_TUNABLEOP_ENABLED", False): - torch.cuda.tunable.tuning_enable(False) - _, batch, _ = self.generate_token(batch) - except torch.cuda.OutOfMemoryError as e: + batch_num_blocks = batch.num_blocks + + num_tokens = batch.to_pb().current_tokens + synchronize(self.device) + free_memory = get_free_memory( + self.device, MEMORY_FRACTION * TGI_WIGGLE_ROOM + ) + real_free_memory = get_free_memory(self.device, MEMORY_FRACTION) + log_master( + logger.debug, + f"Free memory {free_memory / 1e9:.2f}GB , (real: {real_free_memory / 1e9:.2f}GB", + ) + + _, _batch, _ = self.generate_token([batch]) + except Exception: raise RuntimeError( - f"Not enough memory to handle {len(batch.input_ids)} prefill tokens. " + f"Not enough memory to handle {num_tokens} prefill tokens. " f"You need to decrease `--max-batch-prefill-tokens`" - ) from e + ) synchronize(self.device) - - # Inspired by the original implementation in [vllm](https://github.com/vllm-project/vllm) - # Calculate the number of blocks that can be allocated with the free memory - dtype_size = torch.tensor([], dtype=self.dtype).element_size() - cache_block_size = BLOCK_SIZE * self.num_kv_heads * self.head_size - total_cache_size = self.num_layers * cache_block_size * 2 * dtype_size - - free_memory = get_free_memory(self.device, MEMORY_FRACTION) - batch_num_blocks = batch.num_blocks if batch is not None else 0 - + free_memory = get_free_memory(self.device, MEMORY_FRACTION * TGI_WIGGLE_ROOM) + kv_memory = free_memory num_blocks = ( # Leave 5% for some wiggle room - int((free_memory * TGI_WIGGLE_ROOM) // total_cache_size) + int(kv_memory // total_cache_size) # Add batch.num_blocks as we allocated it above, so it is included in the peak memory. 
+ batch_num_blocks ) - del batch + log_master(logger.info, f"KV-cache blocks: {num_blocks}, size: {BLOCK_SIZE}") + if max_total_tokens is None: + max_total_tokens = sum(batch.cache_lengths) + + if max_input_tokens is None: + max_input_tokens = max_total_tokens - 1 + + del _batch, batch + self.kv_cache = [] + empty_cache() self.init_kv_cache( num_blocks, self.num_layers, self.num_kv_heads, self.head_size, - self.dtype, + self.kv_cache_dtype, self.device, ) + return int(num_blocks * BLOCK_SIZE), max_input_tokens, max_total_tokens - if SYSTEM == "rocm": - if ( - os.environ.get("PYTORCH_TUNABLEOP_ENABLED") is None - or os.environ.get("PYTORCH_TUNABLEOP_ENABLED") == "1" - ): - torch.cuda.tunable.enable() + def warmup_prefill(self, prompt_len: int, bs: int): + input_ids = torch.zeros( + prompt_len, dtype=torch.int64, device=self.device + ).repeat(bs) + position_ids = torch.arange( + prompt_len, dtype=torch.int32, device=self.device + ).repeat(bs) + max_bt = (prompt_len // BLOCK_SIZE + 1) * bs + block_tables = torch.arange( + max_bt, dtype=torch.int32, device=self.device + ).reshape(bs, -1) + slot_acc = [] + for i in range(bs): + slots = [] + for b in block_tables[i]: + slots.extend(range(b * BLOCK_SIZE, (b + 1) * BLOCK_SIZE)) + slot_acc.extend(slots[:prompt_len]) + slots = torch.tensor(slot_acc, dtype=torch.int64, device=self.device) - if os.environ.get("PYTORCH_TUNABLEOP_TUNING") != "0": - torch.cuda.tunable.tuning_enable(True) - - if os.environ.get("PYTORCH_TUNABLEOP_SEQLENS") is not None: - tuning_sequences = [ - int(val) - for val in os.environ["PYTORCH_TUNABLEOP_SEQLENS"].split(",") - ] - elif CUDA_GRAPHS is not None: - tuning_sequences = CUDA_GRAPHS - else: - tuning_sequences = [1, 2, 3, 4, 5, 6, 7] - - tunableop_filepath = os.path.join( - HUGGINGFACE_HUB_CACHE, - f"tunableop_{self.model_id.replace('/', '-')}_tp{self.world_size}_rank{self.rank}.csv", - ) - - log_master( - logger.info, - f"PyTorch TunableOp is enabled. The warmup may take several minutes, picking the ROCm optimal matrix multiplication kernel for the target lengths {', '.join([str(seqlen) for seqlen in tuning_sequences])}, with typical 5-8% latency improvement for small sequence lengths. The picked GEMMs are saved in the file {tunableop_filepath}. To disable TunableOp, please launch TGI with `PYTORCH_TUNABLEOP_ENABLED=0`.", - ) - - torch.cuda.tunable.set_filename( - tunableop_filepath, insert_device_ordinal=False - ) - - if os.path.isfile(tunableop_filepath): - log_master( - logger.info, - f"The file {tunableop_filepath} already exists and will be reused.", - ) - torch.cuda.tunable.read_file(tunableop_filepath) - - os.makedirs(HUGGINGFACE_HUB_CACHE, exist_ok=True) - - for seqlen in tuning_sequences: - log_master(logger.info, f"Warming up TunableOp for seqlen={seqlen}") - self.tunableop_warmup(seqlen) - torch.cuda.tunable.write_file(tunableop_filepath) - if os.environ.get("PYTORCH_TUNABLEOP_TUNING_AFTER_WARMUP") != "1": - torch.cuda.tunable.tuning_enable(False) - else: - log_master( - logger.info, - "PyTorch ROCm TunableOp (https://github.com/pytorch/pytorch/tree/main/aten/src/ATen/cuda/tunable) is disabled. TunableOp brings an additional 5-8% latency improvement for small sequence lengths but requires a warmup. 
If necessary, please use the environment variable PYTORCH_TUNABLEOP_ENABLED=1 to enable TunableOp.", - ) - - if CUDA_GRAPHS: - try: - log_master( - logger.info, f"Cuda Graphs are enabled for sizes {CUDA_GRAPHS}" - ) - # Warmup cuda graphs - for bs in CUDA_GRAPHS: - if self.speculate is None or self.speculate + 1 <= bs: - self.cuda_graph_warmup(bs, max_s, max_bt) - except torch.cuda.OutOfMemoryError: - logger.exception("Decode cuda graph warmup failed") - else: - log_master( - logger.info, f"Cuda Graphs are disabled (CUDA_GRAPHS={CUDA_GRAPHS})." - ) - - return int(num_blocks * BLOCK_SIZE) - - def tunableop_warmup(self, seqlen: int): - input_ids = torch.zeros(seqlen, dtype=torch.int64, device=self.device) - position_ids = torch.zeros(seqlen, dtype=torch.int32, device=self.device) - slots = torch.arange(seqlen, dtype=torch.int64, device=self.device) - - # Dummy value, some models (starcoder2) don't accept `None`. - input_lengths = torch.ones(seqlen, dtype=torch.int32, device=self.device) - prefix_lens_tensor = torch.zeros(seqlen, dtype=torch.int32, device=self.device) - cu_seqlen_prefill = torch.tensor( - [0, seqlen], device=self.device, dtype=torch.int32 + input_lengths = ( + torch.ones(bs, dtype=torch.int32, device=self.device) * prompt_len ) - max_s = seqlen + cache_lengths_tensor = torch.zeros(bs, dtype=torch.int32, device=self.device) + cu_seqlen_prefill = torch.zeros(bs + 1, device=self.device, dtype=torch.int32) + torch.cumsum(input_lengths, -1, out=cu_seqlen_prefill[1:]) + seqlen = Seqlen( input_lengths=input_lengths, - prefix_lengths=prefix_lens_tensor, + cache_lengths=cache_lengths_tensor, cu_seqlen_q=cu_seqlen_prefill, - max_q=1, - max_k=seqlen, ) + lm_head_indices = input_lengths - 1 # We pass a `cu_seqlen_prefill` in order not to have to deal with paged attention cache allocation/deallocation. 
self.model.forward( @@ -1401,12 +1520,64 @@ class FlashCausalLM(Model): position_ids=position_ids, cu_seqlen_prefill=cu_seqlen_prefill, kv_cache=self.kv_cache, - block_tables=None, - seqlen=seqlen, slots=slots, - max_s=max_s, + seqlen=trim_seqlen_metadata(seqlen), + lm_head_indices=lm_head_indices, + adapter_data=None, + hpu_attention_meta=None, + ) + + def warmup_decode(self, bs: int, block_num: int): + input_ids = torch.zeros(bs, dtype=torch.int64, device=self.device) + position_ids = torch.arange(bs, dtype=torch.int32, device=self.device) + block_tables = torch.arange( + start=1, end=block_num + 1, dtype=torch.int32, device=self.device + ).reshape(bs, -1) + slots = [] + past_len = ( + len(block_tables[0]) * BLOCK_SIZE - 1 + ) # for decode, we only need to pass the past token + # fetch the last blocked to warmup block num + for i in range(bs): + slots.append(BLOCK_SIZE * block_tables[i][-1] + BLOCK_SIZE - 1) + slots = torch.tensor(slots, dtype=torch.int64, device=self.device) + input_lengths = torch.ones(bs, dtype=torch.int32, device=self.device) + cache_lengths_tensor = ( + torch.ones(bs, dtype=torch.int32, device=self.device) * past_len + ) + cu_seqlen_prefill = torch.zeros(bs + 1, device=self.device, dtype=torch.int32) + torch.cumsum(input_lengths, -1, out=cu_seqlen_prefill[1:]) + + seqlen = Seqlen( + input_lengths=input_lengths, + cache_lengths=cache_lengths_tensor, + cu_seqlen_q=cu_seqlen_prefill, + ) + block_num = cache_lengths_tensor // BLOCK_SIZE + 1 + block_tables_valid = [] + for i, bt in enumerate(block_tables.tolist()): + block_tables_valid.append(bt[0 : block_num[i]]) + + hpu_attention_meta = prepare_for_decode( + self.dtype, + self.use_contiguous_pa, + self.device, + slots, + block_tables_valid, + bs, + ) + + # We pass a `cu_seqlen_prefill` in order not to have to deal with paged attention cache allocation/deallocation. 
+ self.model.forward( + input_ids=input_ids, + position_ids=position_ids, + cu_seqlen_prefill=None, + kv_cache=self.kv_cache, + slots=slots, + seqlen=trim_seqlen_metadata(seqlen), lm_head_indices=None, - prefill_cache_indices=None, + adapter_data=None, + hpu_attention_meta=hpu_attention_meta, ) def forward( @@ -1421,7 +1592,7 @@ class FlashCausalLM(Model): block_tables = batch.block_tables_tensor slots = batch.slots[batch.slot_indices] input_lengths = batch.input_lengths_tensor - max_s = batch.max_seqlen + max_s = batch.max_current_length lm_head_indices = batch.prefill_head_indices speculative_ids = batch.speculative_ids @@ -1436,12 +1607,20 @@ class FlashCausalLM(Model): new_position_ids = ( position_ids.unsqueeze(-1).expand(B, new_length) + arange ).view(-1) - slots = (slots.unsqueeze(-1).expand(B, new_length) + arange_int).view(-1) + + # Slots can be discontiguous when prefix caching is enabled, so we need to expand the slot_indices, + # then update the slots with the additional indices to ensure we're grabbing the ones that have been + # allocated + slot_indices = ( + batch.slot_indices.unsqueeze(-1).expand(B, new_length) + arange_int + ).view(-1) + slots = batch.slots[slot_indices] + input_lengths = ( input_lengths.unsqueeze(-1).expand(B, new_length) + arange_int ).view(-1) - prefix_lens_tensor = ( - batch.prefix_lens_tensor.unsqueeze(-1).expand(B, new_length) + cache_lengths_tensor = ( + batch.cache_lengths_tensor.unsqueeze(-1).expand(B, new_length) ).reshape(-1) # Add Copy the block tables for all members @@ -1463,8 +1642,8 @@ class FlashCausalLM(Model): block_tables = batch.block_tables_tensor slots = batch.slots[batch.slot_indices] input_lengths = batch.input_lengths_tensor - prefix_lens_tensor = batch.prefix_lens_tensor - max_s = batch.max_seqlen + cache_lengths_tensor = batch.cache_lengths_tensor + max_s = batch.max_current_length lm_head_indices = batch.prefill_head_indices if cu_seqlen_prefill is None and self.max_past() is not None: @@ -1473,105 +1652,48 @@ class FlashCausalLM(Model): # This makes sure the max_s for the decode pass is correct. 
max_s = min(self.max_past(), max_s) - bs = input_ids.shape[0] - sorted_padded_bs = sorted([k for k in self.cuda_graphs.keys() if k >= bs]) - if sorted_padded_bs: - # Get associated cuda graph - cuda_graph = self.cuda_graphs[sorted_padded_bs[0]] - else: - cuda_graph = None - - if cu_seqlen_prefill is not None or cuda_graph is None: - if ATTENTION == "flashinfer": - block_tables = block_tables_to_ragged( - block_tables=block_tables, - input_lengths=batch.input_lengths, - prefix_lens=batch.prefix_lens, - ) - with self._forward_context( - block_tables=block_tables, - cu_seqlen_prefill=cu_seqlen_prefill, - input_lengths_tensor=input_lengths, - prefix_lens_tensor=prefix_lens_tensor, - ): - max_k = (input_lengths + prefix_lens_tensor).max().item() - seqlen = Seqlen( - input_lengths=input_lengths, - prefix_lengths=prefix_lens_tensor, - cu_seqlen_q=cu_seqlen_prefill, - max_q=max_s, - max_k=max_k, - ) - logits, speculative_logits = self.model.forward( - input_ids=input_ids, - position_ids=position_ids, - cu_seqlen_prefill=cu_seqlen_prefill, - kv_cache=kv_cache, - block_tables=block_tables, - slots=slots, - seqlen=seqlen, - max_s=max_s, - prefill_cache_indices=batch.prefill_cache_indices, - lm_head_indices=lm_head_indices, - adapter_data=adapter_data, - ) - if batch.prefill_cache_indices is not None: - batch.prefill_cache_indices = None - return logits, speculative_logits - - # Copy inputs to the static inputs of the cuda graph - # Static inputs are potentially padded - cuda_graph["input_ids"][: input_ids.shape[0]] = input_ids - cuda_graph["position_ids"][: position_ids.shape[0]] = position_ids - if ATTENTION == "flashinfer": - block_tables = block_tables_to_ragged( - block_tables=block_tables, - input_lengths=batch.input_lengths, - prefix_lens=batch.prefix_lens, - ) - # assert block_tables.shape[0] >= slots.shape[0] - cuda_graph["block_tables"][: block_tables.shape[0]] = block_tables - else: - cuda_graph["block_tables"][ - : block_tables.shape[0], : block_tables.shape[1] - ] = block_tables - - # XXX: This is working only because block 0 is reserved for the healthcheck - # so it doesn't matter if we override it with bogus values. 
- cuda_graph["slots"].fill_(0) - cuda_graph["slots"][: slots.shape[0]] = slots - cuda_graph["input_lengths"].zero_() - cuda_graph["input_lengths"][: input_lengths.shape[0]] = input_lengths - cuda_graph["prefix_lengths"].zero_() - cuda_graph["prefix_lengths"][: prefix_lens_tensor.shape[0]] = prefix_lens_tensor - - with self._forward_context( - block_tables=cuda_graph["block_tables"], - cu_seqlen_prefill=None, - input_lengths_tensor=cuda_graph["input_lengths"], - prefix_lens_tensor=cuda_graph["prefix_lengths"], - state=cuda_graph["state"], - ): - # Replay the graph - cuda_graph["graph"].replay() - - # Slice output to the correct shape - speculative_logits = ( - cuda_graph["speculative_logits"][:bs] - if cuda_graph["speculative_logits"] is not None - else None + seqlen = Seqlen( + input_lengths=input_lengths, + cache_lengths=cache_lengths_tensor, + cu_seqlen_q=cu_seqlen_prefill, + ) + kwargs = {} + if htorch.utils.internal.is_lazy(): + kwargs["bypass_hpu_graphs"] = False + if batch.prefill_cache_indices is not None: + slots_pad = torch.zeros_like(input_ids) + slots_pad[batch.prefill_cache_indices] = slots + slots = slots_pad + logits, speculative_logits = self.model.forward( + input_ids=input_ids, + position_ids=position_ids, + cu_seqlen_prefill=cu_seqlen_prefill, + kv_cache=kv_cache, + slots=slots, + seqlen=trim_seqlen_metadata(seqlen), + lm_head_indices=lm_head_indices, + # TODO not support adapter now, need the add in the future + adapter_data=None, + hpu_attention_meta=batch.hpu_attn_meta, + **kwargs, ) - logits = cuda_graph["logits"][:bs] return logits, speculative_logits @tracer.start_as_current_span("generate_token") def generate_token( - self, batch: FlashCausalLMBatch + self, batches: List[FlashCausalLMBatch] ) -> Tuple[List[Generation], Optional[FlashCausalLMBatch], Tuple[int, int]]: + if len(batches) > 1: + batch = self.batch_type.concatenate(batches) + else: + batch = batches[0] start = time.time_ns() - prefill = batch.cu_seqlen_prefill is not None + prefill = batch.prefilling + if prefill: + batch.prepare_for_prefill() + else: + batch.prepare_for_decode(self.dtype, self.use_contiguous_pa) prefill_logprobs = batch.prefill_next_token_indices is not None - # Update adapter indices for speculative tokens (if present) adapter_meta = batch.adapter_meta if batch.speculative_ids is not None: @@ -1611,13 +1733,23 @@ class FlashCausalLM(Model): if prefill_logprobs else speculative_logits ) - next_adapter_indices = batch.adapter_meta.adapter_indices.new_empty( - len(batch) - ) - + if len(batch) > 1 and prefill_logprobs: + # We create the prefill_tokens_indices tensor that will be used to gather prefill logprobs + # When batch == 1, we will just use the batch.input_ids values directly + prefill_tokens_indices = batch.input_ids.new_zeros(len(out)) else: + prefill_logprobs = None next_token_logits = out - next_adapter_indices = batch.adapter_meta.adapter_indices + + finished_prefilling = True + next_chunk_lengths = [] + current_prefilling_mask = batch.prefilling_mask + if prefill: + finished_prefilling = True + next_prefilling_mask = [False] * len(batch) + + batch.prefilling = not finished_prefilling + batch.prefilling_mask = next_prefilling_mask speculate = get_speculate() ( @@ -1627,7 +1759,7 @@ class FlashCausalLM(Model): accepted_ids, speculative_ids, ) = batch.next_token_chooser( - batch.all_input_ids_tensor[:, : batch.max_seqlen], + batch.all_input_ids_tensor[:, : batch.max_current_length], next_token_logits, speculate, batch.speculative_ids, @@ -1638,85 +1770,110 @@ class 
FlashCausalLM(Model): batch.top_n_tokens, batch.top_n_tokens_tensor, logprobs, accepted_ids ) - if prefill: - if len(batch) > 1 and prefill_logprobs: - # We create the prefill_tokens_indices tensor that will be used to gather prefill logprobs - # When batch == 1, we will just use the batch.input_ids values directly - prefill_tokens_indices = batch.input_ids.new_zeros(len(out)) + # Since we are done prefilling, all the tensors that were concatenating values for all the requests + # instantly become of shape [BATCH_SIZE] + if prefill and finished_prefilling: + indices = batch.cu_seqlen_prefill[1:] - 1 + # pad in left + if batch.prefill_cache_indices is not None: + batch.position_ids = batch.position_ids[batch.prefill_cache_indices][ + indices + ] + else: + batch.position_ids = batch.position_ids[indices] - next_position_ids = batch.position_ids.new_empty(len(batch)) - batch.slot_indices = batch.slot_indices[batch.cu_seqlen_prefill[1:] - 1] - # We do not need cu_seqlen_prefill anymore - batch.cu_seqlen_prefill = None - else: - prefill_logprobs = None - next_position_ids = batch.position_ids - - # Cumulative length - cumulative_length = 0 - - # Results - generations: List[Generation] = [] - stopped = True + batch.slot_indices = batch.slot_indices[indices] + batch.adapter_meta.adapter_indices = batch.adapter_meta.adapter_indices[ + indices + ] # Zipped iterator - iterator = zip(batch.input_lengths, batch.all_input_ids, accepted_ids) + iterator = zip( + batch.requests, + batch.prompt_lengths, + batch.cache_lengths, + batch.input_lengths, + batch.all_input_ids, + accepted_ids, + current_prefilling_mask, + batch.prefilling_mask, + ) # We do two for loops as the first one can run completely asynchronously from the GPU while for the second - # one, we need to first do a GPU <-> CPU sync + # one, we need to first do a HPU <-> CPU sync # It is faster if we delay this sync for the maximum amount of time # For each member of the batch - index = 0 - for i, (input_length, all_input_ids, n_accepted_ids) in enumerate(iterator): - # Indexing metadata - start_index = cumulative_length - end_index = cumulative_length + input_length - - if prefill: + # Cumulative length + cu_accepted_ids = accepted_ids.new_zeros(accepted_ids.shape[0] + 1) + torch.cumsum(accepted_ids, dim=0, out=cu_accepted_ids[1:]) + cumulative_length = 0 + for i, ( + request, + prompt_length, + cache_length, + input_length, + all_input_ids, + n_accepted_ids, + request_was_prefilling, + request_is_prefilling, + ) in enumerate(iterator): + # Used to gather prefill logprobs + # Copy batch.all_input_ids_tensor to prefill_token_indices + if request.prefill_logprobs and request_was_prefilling: # Indexing metadata out_start_index = batch.prefill_cu_outlens[i] out_end_index = batch.prefill_cu_outlens[i + 1] - out_length = out_end_index - out_start_index - # Initialize position_ids - # In decode, we do not need this as we can just increment position ids - next_position_ids[i] = batch.position_ids[end_index - 1] - - # Initialize adapter indices - # In decode, we only have one token per row in the batch, so grab last index - next_adapter_indices[i] = batch.adapter_meta.adapter_indices[ - end_index - 1 + # Logprobs generated by the model are for the next token + # So we need to translate the id tensor by 1 + ids = batch.all_input_ids_tensor[ + i, cache_length + 1 : cache_length + input_length + 1 ] + if len(batch) > 1: + prefill_tokens_indices[out_start_index:out_end_index] = ids + else: + # Set prefill_tokens_indices to the correct slice + 
prefill_tokens_indices = ids - # Used to gather prefill logprobs - # Copy batch.input_ids to prefill_token_indices - if prefill_logprobs: - if len(batch) > 1: - prefill_tokens_indices[out_start_index : out_end_index - 1] = ( - batch.input_ids[start_index + 1 : start_index + out_length] - ) - else: - # Set prefill_tokens_indices to the correct slice - prefill_tokens_indices = batch.input_ids[ - start_index + 1 : start_index + out_length - ] - - for j in range(n_accepted_ids): - batch.all_input_ids_tensor[i, input_length + j] = next_input_ids[index] - index += 1 - + # If the device does not support triton, we copy one by one + if not request_is_prefilling: + # Only save tokens if we are done prefilling for this request + batch.all_input_ids_tensor[ + i, + batch.cache_lengths_tensor[i] + + batch.input_lengths[i] : batch.cache_lengths_tensor[i] + + batch.input_lengths[i] + + accepted_ids[i], + ] = next_input_ids[cu_accepted_ids[i] : cu_accepted_ids[i + 1]] cumulative_length += input_length # Update values - batch.input_ids = next_input_ids[accepted_ids.cumsum(dim=-1) - 1] - batch.speculative_ids = speculative_ids - batch.position_ids = next_position_ids + accepted_ids - batch.input_lengths_tensor += accepted_ids - batch.slot_indices += accepted_ids - batch.adapter_meta.adapter_indices = next_adapter_indices + # These values can be updated without a HPU -> CPU sync + if not prefill or (prefill and finished_prefilling): + batch.input_ids = next_input_ids[cu_accepted_ids[1:] - 1] + batch.speculative_ids = speculative_ids + if batch.position_ids.dim() == 2: + # Qwen2_vl case: + batch.position_ids += accepted_ids.unsqueeze(-1) + else: + batch.position_ids += accepted_ids + batch.cache_lengths_tensor += batch.input_lengths_tensor + accepted_ids - 1 + batch.input_lengths_tensor = torch.ones_like(batch.input_lengths_tensor) + batch.slot_indices += accepted_ids - if prefill: + if prefill and prefill_logprobs: + # Get prefill logprobs with inplace softmax (avoid copying the `out` tensor (max_batch_prefill_tokens * vocab_size)) + torch.log_softmax(out, -1, out=out) + prefill_logprobs_tensor = out + prefill_logprobs = torch.gather( + prefill_logprobs_tensor, 1, prefill_tokens_indices.view(-1, 1) + ) + # HPU <-> CPU sync + prefill_logprobs = prefill_logprobs.view(-1).tolist() + + # Does a HPU <-> CPU sync internally + if prefill and finished_prefilling: # adjust segment lengths to account for all request lengths being 1 during decoding adapter_segments, _ = find_segments(batch.adapter_meta.adapter_indices) batch.adapter_meta.adapter_segments = torch.tensor( @@ -1725,192 +1882,282 @@ class FlashCausalLM(Model): device=batch.adapter_meta.adapter_segments.device, ) - if prefill and prefill_logprobs: - # Get prefill logprobs - prefill_logprobs_tensor = torch.log_softmax(out, -1) - prefill_logprobs = torch.gather( - prefill_logprobs_tensor, 1, prefill_tokens_indices.view(-1, 1) - ) - # GPU <-> CPU sync - prefill_logprobs = prefill_logprobs.view(-1).tolist() - - # GPU <-> CPU sync + # HPU <-> CPU sync next_token_logprobs = next_token_logprobs.tolist() next_token_ids = next_input_ids.tolist() accepted_ids = accepted_ids.tolist() + + # Update values if we need to continue prefilling + # This represents the `else` case of the `Update values` if above + # but since this require the `next_token_ids` to be on CPU, it is better to do it here + if prefill and not finished_prefilling: + # Speculation must be ignored while we prefill even with chunking + # it simplifies everything + assert batch.speculative_ids is None + 
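The loop that follows rebuilds `input_ids` for the next step of chunked prefill: a request that is still prefilling receives the next slice of its prompt, while a request that has finished prefilling only receives the token chosen by sampling. A minimal sketch of that selection, with invented lengths and a hypothetical `next_inputs` helper, not part of the patch:

from typing import List

def next_inputs(
    all_input_ids: List[int],
    cache_length: int,
    input_length: int,
    still_prefilling: bool,
    next_chunk_length: int,
    sampled_token: int,
) -> List[int]:
    if still_prefilling:
        # Skip what is already in the KV cache plus what was just processed,
        # then take the next chunk of the prompt.
        start = cache_length + input_length
        return all_input_ids[start : start + next_chunk_length]
    # Done prefilling: the only new input is the freshly sampled token.
    return [sampled_token]

prompt = list(range(100, 112))                    # 12 prompt tokens (invented)
print(next_inputs(prompt, 0, 4, True, 4, 999))    # next prompt chunk [104..107]
print(next_inputs(prompt, 8, 4, False, 0, 999))   # [999], the sampled token
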
+ all_postfix_ids = [] + for i, ( + request_prefilling, + next_token_id, + all_input_ids, + cache_length, + input_length, + next_chunk_length, + ) in enumerate( + zip( + batch.prefilling_mask, + next_token_ids, + batch.all_input_ids, + batch.cache_lengths, + batch.input_lengths, + next_chunk_lengths, + ) + ): + if request_prefilling: + next_cache_length = cache_length + input_length + # Get new prompt IDs to prefill + postfix_ids = all_input_ids[ + next_cache_length : next_cache_length + next_chunk_length + ] + else: + # This request is done prefilling, the new id is the one selected the sampling method + postfix_ids = [next_token_id] + + all_postfix_ids.append(postfix_ids) + + batch.input_ids = all_postfix_ids + start_decode = time.time_ns() + # Results + generations: List[Generation] = [] + stopped = True + # Zipped iterator iterator = zip( batch.requests, + batch.prompt_lengths, + batch.cache_lengths, batch.input_lengths, batch.prefix_offsets, batch.read_offsets, batch.stopping_criterias, batch.all_input_ids, - batch.prefix_ids, batch.next_token_chooser.do_sample, batch.next_token_chooser.seeds, batch.top_n_tokens, + current_prefilling_mask, + batch.prefilling_mask, accepted_ids, batch_top_token_ids, batch_top_token_logprobs, ) + # Reset max_input_length + batch.max_input_length = 0 # For each member of the batch index = 0 for i, ( request, + prompt_length, + cache_length, input_length, prefix_offset, read_offset, stopping_criteria, all_input_ids, - prefix_ids, do_sample, seed, top_n_tokens, + request_was_prefilling, + request_is_prefilling, n_accepted_ids, top_token_ids, top_token_logprobs, ) in enumerate(iterator): - # Append next token to all tokens - next_token_texts = [] - left = 0 - - if n_accepted_ids > 1: - log_master(logger.debug, f"speculated ids {n_accepted_ids - 1}") - - current_stopped = False - for j in range(index, index + n_accepted_ids): - # Generated token - next_token_id = next_token_ids[j] - all_input_ids.append(next_token_id) - next_token_text, prefix_offset, read_offset = self.decode_token( - all_input_ids, - prefix_offset, - read_offset, - ) - next_token_texts.append(next_token_text) - - stop, reason = stopping_criteria( - next_token_id, - next_token_text, - ) - - if stop: - left = index + n_accepted_ids - j - 1 - current_stopped = True - break - else: - current_stopped = False - stopped = stopped and current_stopped - - _next_token_ids = next_token_ids[index : index + n_accepted_ids - left] - _next_token_logprobs = next_token_logprobs[ - index : index + n_accepted_ids - left - ] - index += n_accepted_ids - - # Shard generations - # All generations will be appended in the rust sharded client - if i % self.world_size == self.rank: - if stop: - # Decode generated tokens - output_text, _, _ = self.decode_token( - all_input_ids, - prefix_offset=len(all_input_ids) - - stopping_criteria.current_tokens - - 1, - read_offset=len(all_input_ids) - - stopping_criteria.current_tokens, - skip_special_tokens=True, - ) - generated_text = GeneratedText( - output_text, - stopping_criteria.current_tokens, - reason, - seed if do_sample else None, - ) - else: - generated_text = None - + # Compute logprobs first as, even though we might skip the token, + # it can still be required to compute the logprobs + # modulo on request.id as it is robust to batch.filter whereas the index in the batch is not and we need + # this state to be stable + if request.id % self.world_size == self.rank: # Prefill - if prefill and request.prefill_logprobs: + if request_was_prefilling and 
request.prefill_logprobs: out_start_index = batch.prefill_cu_outlens[i] out_end_index = batch.prefill_cu_outlens[i + 1] + if not request_is_prefilling: + # The request is dones prefilling, meaning that we started generating new tokens + # The last logprob is a logprob for a generated token that was not part of the prompt + # We need to remove it + out_end_index -= 1 + + request_prefill_logprobs = prefill_logprobs[ + out_start_index:out_end_index + ] + # Logprobs generated by the model are for the next token + # So we need to translate the id tensor by 1 + prefill_token_ids = all_input_ids[ + cache_length + 1 : cache_length + input_length + 1 + ] + + past_prefill_logprob_tokens = batch.prefill_logprob_tokens[i] + + if past_prefill_logprob_tokens is None: + # add nan for cached prompt tokens/first token + request_prefill_logprobs = [float("nan")] * ( + cache_length + 1 + ) + request_prefill_logprobs + prefill_token_ids = ( + all_input_ids[: cache_length + 1] + prefill_token_ids + ) - # Remove generated token to only have prefill and add nan for first prompt token - request_prefill_logprobs = ( - [float("nan")] * (len(prefix_ids) + 1) - ) + prefill_logprobs[out_start_index : out_end_index - 1] - prefill_token_ids = all_input_ids[:-1] prefill_texts = self.tokenizer.batch_decode( - prefix_ids + prefill_token_ids, + prefill_token_ids, clean_up_tokenization_spaces=False, skip_special_tokens=False, ) - prefill_tokens = Tokens( - prefix_ids + prefill_token_ids, + prefill_logprob_tokens = Tokens( + prefill_token_ids, request_prefill_logprobs, prefill_texts, is_special=[], ) - else: - prefill_tokens = None - - if top_n_tokens > 0: - all_top_tokens = [] - for top_token_ids, top_token_logprobs in zip( - top_token_ids, top_token_logprobs - ): - toptoken_texts = self.tokenizer.batch_decode( - top_token_ids, - clean_up_tokenization_spaces=False, - skip_special_tokens=False, + if past_prefill_logprob_tokens is not None: + prefill_logprob_tokens = ( + past_prefill_logprob_tokens + prefill_logprob_tokens ) - special_toptokens = [ - token_id in self.all_special_ids - for token_id in top_token_ids - ] - top_tokens = Tokens( - top_token_ids, - top_token_logprobs, - toptoken_texts, - special_toptokens, - ) - all_top_tokens.append(top_tokens) - top_tokens = all_top_tokens + + batch.prefill_logprob_tokens[i] = prefill_logprob_tokens else: - top_tokens = None + batch.prefill_logprob_tokens[i] = None - generation = Generation( - request.id, - prefill_tokens, - Tokens( - _next_token_ids, - _next_token_logprobs, - next_token_texts, - [nid in self.all_special_ids for nid in _next_token_ids], - ), - generated_text, - top_tokens, - ) + # If it is, the tokens we decoded should be ignored + if request_is_prefilling: + # Make sure that we do not stop as even though this request did not create a token, it is still + # processing + stopped = False + new_input_length = next_chunk_lengths[i] + new_cache_length = cache_length + input_length + else: + new_input_length = 1 + new_cache_length = cache_length + input_length + n_accepted_ids - 1 + # Append next token to all tokens + next_token_texts = [] + left = 0 - generations.append(generation) + if n_accepted_ids > 1: + log_master(logger.debug, f"speculated ids {n_accepted_ids - 1}") - # accept each new token for this specific request since we may - # have more than one new token per request with speculative decoding - for next_token_id in _next_token_ids: - batch.next_token_chooser = ( - batch.next_token_chooser.advance_grammar_single(i, next_token_id) - ) + current_stopped = 
False + for j in range(index, index + n_accepted_ids): + # Generated token + next_token_id = next_token_ids[j] + all_input_ids.append(next_token_id) + next_token_text, prefix_offset, read_offset = self.decode_token( + all_input_ids, + prefix_offset, + read_offset, + ) + next_token_texts.append(next_token_text) + + stop, reason = stopping_criteria( + next_token_id, + next_token_text, + ) + + if stop: + left = index + n_accepted_ids - j - 1 + current_stopped = True + break + else: + current_stopped = False + stopped = stopped and current_stopped + + _next_token_ids = next_token_ids[index : index + n_accepted_ids - left] + _next_token_logprobs = next_token_logprobs[ + index : index + n_accepted_ids - left + ] + + # Shard generations + # All generations will be appended in the rust sharded client + if request.id % self.world_size == self.rank: + if stop: + # Decode generated tokens + output_text, _, _ = self.decode_token( + all_input_ids, + prefix_offset=len(all_input_ids) + - stopping_criteria.current_tokens + - 1, + read_offset=len(all_input_ids) + - stopping_criteria.current_tokens, + skip_special_tokens=True, + ) + generated_text = GeneratedText( + output_text, + stopping_criteria.current_tokens, + reason, + seed if do_sample else None, + ) + else: + generated_text = None + + if top_n_tokens > 0: + all_top_tokens = [] + for top_token_ids, top_token_logprobs in zip( + top_token_ids, top_token_logprobs + ): + toptoken_texts = self.tokenizer.batch_decode( + top_token_ids, + clean_up_tokenization_spaces=False, + skip_special_tokens=False, + ) + special_toptokens = [ + token_id in self.all_special_ids + for token_id in top_token_ids + ] + top_tokens = Tokens( + top_token_ids, + top_token_logprobs, + toptoken_texts, + special_toptokens, + ) + all_top_tokens.append(top_tokens) + top_tokens = all_top_tokens + else: + top_tokens = None + + generation = Generation( + request.id, + batch.prefill_logprob_tokens[i], + Tokens( + _next_token_ids, + _next_token_logprobs, + next_token_texts, + [nid in self.all_special_ids for nid in _next_token_ids], + ), + generated_text, + top_tokens, + ) + + generations.append(generation) + + # accept each new token for this specific request since we may + # have more than one new token per request with speculative decoding + for next_token_id in _next_token_ids: + batch.next_token_chooser = ( + batch.next_token_chooser.advance_grammar_single( + i, next_token_id + ) + ) # Update values - batch.input_lengths[i] = input_length + n_accepted_ids - if batch.input_lengths[i] > batch.max_seqlen: - batch.max_seqlen = batch.input_lengths[i] + index += n_accepted_ids + batch.cache_lengths[i] = new_cache_length + batch.max_input_length = max(batch.max_input_length, new_input_length) + batch.input_lengths[i] = new_input_length + current_length = new_cache_length + new_input_length + batch.max_current_length = max(batch.max_current_length, current_length) + batch.prefix_offsets[i] = prefix_offset batch.read_offsets[i] = read_offset batch.all_input_ids[i] = all_input_ids @@ -1921,83 +2168,14 @@ class FlashCausalLM(Model): decode_ns = time.time_ns() - start_decode return generations, None, (forward_ns, decode_ns) - batch.prefill_cu_outlens = None - batch.prefill_head_indices = None - batch.prefill_next_token_indices = None + if prefill and finished_prefilling: + # We do not need prefill tensors anymore + batch.cu_seqlen_prefill = None + batch.prefill_cache_indices = None + batch.prefill_cu_outlens = None + batch.prefill_head_indices = None + batch.prefill_next_token_indices = None 
         forward_ns = start_decode - start
         decode_ns = time.time_ns() - start_decode
         return generations, batch, (forward_ns, decode_ns)
-
-    def _forward_context(
-        self,
-        *,
-        block_tables: torch.Tensor,
-        cu_seqlen_prefill: Optional[torch.Tensor],
-        input_lengths_tensor: torch.Tensor,
-        prefix_lens_tensor: torch.Tensor,
-        state: Optional[Any] = None,
-    ) -> ContextManager:
-        if ATTENTION != "flashinfer":
-            return nullcontext()
-
-        from text_generation_server.layers.attention.flashinfer import (
-            use_decode_state,
-            use_prefill_with_paged_kv_state,
-        )
-
-        # has_prefix_lens = any(prefix_len > 0 for prefix_len in prefix_lens)
-
-        if cu_seqlen_prefill is not None:
-            return use_prefill_with_paged_kv_state(
-                state=(
-                    state if state is not None else self.prefill_with_paged_kv_state
-                ),
-                # block_tables=block_tables_to_ragged(
-                #     block_tables=block_tables,
-                #     input_lengths=input_lengths,
-                #     prefix_lens=prefix_lens,
-                # ),
-                block_tables=block_tables,
-                cu_seqlens=cu_seqlen_prefill,
-                input_lengths=input_lengths_tensor + prefix_lens_tensor,
-                num_heads=self.num_heads,
-                num_kv_heads=self.num_kv_heads,
-                head_size=self.head_size,
-                page_size=BLOCK_SIZE,
-                dtype=self.dtype,
-                window_left=self.sliding_window,
-            )
-        else:
-            assert input_lengths_tensor is not None
-            return use_decode_state(
-                state=state if state is not None else self.decode_state,
-                input_lengths=input_lengths_tensor + prefix_lens_tensor,
-                block_tables=block_tables,
-                num_heads=self.num_heads,
-                num_kv_heads=self.num_kv_heads,
-                head_size=self.head_size,
-                page_size=BLOCK_SIZE,
-                dtype=self.dtype,
-                window_left=self.sliding_window,
-            )
-
-
-def block_tables_to_ragged(
-    *, block_tables: torch.Tensor, input_lengths: List[int], prefix_lens: List[int]
-) -> torch.Tensor:
-    """Convert block table to ragged format compatible with FlashInfer."""
-    assert len(input_lengths) == len(prefix_lens)
-
-    total_len = sum(input_lengths) + sum(prefix_lens)
-    block_tables_ragged = torch.empty(
-        total_len, dtype=torch.int32, device=block_tables.device
-    )
-
-    offset = 0
-    for i, (input_length, prefix_len) in enumerate(zip(input_lengths, prefix_lens)):
-        seq_len = prefix_len + input_length
-        block_tables_ragged[offset : offset + seq_len] = block_tables[i][:seq_len]
-        offset += seq_len
-
-    return block_tables_ragged
diff --git a/backends/gaudi/server/text_generation_server/models/flash_vlm_causal_lm.py b/backends/gaudi/server/text_generation_server/models/flash_vlm_causal_lm.py
new file mode 100644
index 000000000..208ab3582
--- /dev/null
+++ b/backends/gaudi/server/text_generation_server/models/flash_vlm_causal_lm.py
@@ -0,0 +1,489 @@
+import torch
+from PIL import Image
+from io import BytesIO
+
+from opentelemetry import trace
+from typing import Iterable, Optional, Tuple, List, Type, Dict
+
+from transformers import PreTrainedTokenizerBase
+from transformers.image_processing_utils import select_best_resolution
+from text_generation_server.pb import generate_pb2
+from text_generation_server.models.flash_causal_lm import (
+    FlashCausalLMBatch,
+    FlashCausalLM,
+)
+from text_generation_server.models.globals import PREFIX_CACHING
+from loguru import logger
+from text_generation_server.utils.log import log_master
+from transformers import AutoProcessor
+from text_generation_server.layers.attention import Seqlen, trim_seqlen_metadata
+import habana_frameworks.torch as htorch
+
+tracer = trace.get_tracer(__name__)
+
+IDEFICS2_FAKE_TOKEN = "<fake_token_around_image>"
+IDEFICS2_IMAGE_TOKEN = "<image>"
+
+IDEFICS3_IMAGE_TOKEN = "<image>"
+IDEFICS3_FAKE_IMAGE_TOKEN = "<fake_token_around_image>"
+IDEFICS3_GLOBAL_IMG_TOKEN = "<global-img>"
+
+
+# copied from: https://github.com/huggingface/transformers/blob/02ed609285c2448b3b54c31e362f2c389fa952ab/src/transformers/models/idefics3/processing_idefics3.py#L44-L60
+def _prompt_split_image(
+    *,
+    image_seq_len: int,
+    image_rows: int,
+    image_cols: int,
+    fake_token_around_image: str,
+    image_token: str,
+    global_img_token: str,
+):
+    """Prompt with expanded image tokens for when the image is split into patches."""
+    text_split_images = ""
+    for n_h in range(image_rows):
+        for n_w in range(image_cols):
+            text_split_images += (
+                f"{fake_token_around_image}"
+                + f"<row_{n_h + 1}_col_{n_w + 1}>"
+                + f"{image_token}" * image_seq_len
+            )
+        text_split_images += "\n"
+
+    text_split_images += (
+        f"\n{fake_token_around_image}"
+        + f"{global_img_token}"
+        + f"{image_token}" * image_seq_len
+        + f"{fake_token_around_image}"
+    )
+    return text_split_images
+
+
+def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size):
+    """
+    Calculate the shape of the image patch grid after the preprocessing for images of any resolution.
+
+    Args:
+        image_size (`tuple`):
+            The size of the input image in the format (height, width).
+        grid_pinpoints (`List`):
+            A list containing possible resolutions. Each item in the list should be a tuple or list
+            of the form `(height, width)`.
+        patch_size (`int`):
+            The size of each image patch.
+
+    Returns:
+        tuple: The shape of the image patch grid in the format (width, height).
+    """
+    if not isinstance(grid_pinpoints, list):
+        raise ValueError("grid_pinpoints should be a list of tuples or lists")
+
+    height, width = select_best_resolution(image_size, grid_pinpoints)
+    return height // patch_size, width // patch_size
+
+
+def image_text_replacement(processor, image_input, config, image_id: int) -> str:
+    if config.model_type == "idefics2":
+        image_seq_len = 64
+        image_str = f"{IDEFICS2_FAKE_TOKEN}{IDEFICS2_IMAGE_TOKEN * image_seq_len}{IDEFICS2_FAKE_TOKEN}"
+        if processor.image_processor.do_image_splitting:
+            image_str *= 5
+        return image_str
+    if config.model_type == "idefics3":
+        # TODO: implement this in a more general way
+        n_rows = image_input["rows"][0][image_id]
+        n_cols = image_input["cols"][0][image_id]
+        image_seq_len = int(
+            ((config.vision_config.image_size // config.vision_config.patch_size) ** 2)
+            / (config.scale_factor**2)
+        )
+        image_str = _prompt_split_image(
+            image_seq_len=image_seq_len,
+            image_rows=n_rows,
+            image_cols=n_cols,
+            fake_token_around_image=IDEFICS3_FAKE_IMAGE_TOKEN,
+            image_token=IDEFICS3_IMAGE_TOKEN,
+            global_img_token=IDEFICS3_GLOBAL_IMG_TOKEN,
+        )
+        return image_str
+    elif config.model_type == "llava_next":
+        height, width = image_input["image_sizes"][image_id]
+        num_features = get_number_of_features(height, width, config)
+
+        log_master(
+            logger.info,
+            f"Found {num_features} features in image of resolution {height}x{width}",
+        )
+        return "<image>" * num_features
+
+    elif config.model_type == "paligemma":
+        return "<image>" * config.text_config.num_image_tokens
+    elif config.model_type == "qwen2_vl":
+        grid_t, grid_h, grid_w = image_input["image_grid_thw"][image_id]
+        num_pads = grid_t * grid_h * grid_w // 4
+        padding = "<|image_pad|>" * num_pads
+        return f"<|vision_start|>{padding}<|vision_end|>"
+    elif config.model_type == "qwen2_5_vl":
+        grid_t, grid_h, grid_w = image_input["image_grid_thw"][image_id]
+        num_pads = grid_t * grid_h * grid_w // 4
+        padding = "<|image_pad|>" * num_pads
+        return f"<|vision_start|>{padding}<|vision_end|>"
+    elif config.model_type == "gemma3":
+        # TODO: get correct number of features via reviewing the Gemma3 architecture
+        # and calculating the number of image tokens
+        num_pads = 256
+        padding = "<image_soft_token>" * num_pads
+        return f"\n\n<start_of_image>{padding}<end_of_image>\n\n"
+    else:
+        raise RuntimeError(f"Unknown config {config.model_type} for multimodal")
+
+
+def image_text_replacement_fixup(config, text: str) -> str:
+    if config.model_type == "idefics2":
+        return text.replace(
+            f"{IDEFICS2_FAKE_TOKEN}{IDEFICS2_FAKE_TOKEN}", IDEFICS2_FAKE_TOKEN
+        )
+    return text
+
+
+def get_unpadded_features(
+    original_height: int,
+    original_width: int,
+    npatches: int,
+    num_patch_height: int,
+    num_patch_width: int,
+) -> Tuple[int, int]:
+    current_height = npatches * num_patch_height
+    current_width = npatches * num_patch_width
+
+    aspect_ratio: float = original_width / original_height
+    current_aspect_ratio: float = current_width / current_height
+
+    if aspect_ratio > current_aspect_ratio:
+        new_height = (original_height * current_width) // original_width
+        padding = (current_height - new_height) // 2
+        current_height = current_height - (2 * padding)
+    else:
+        new_width = (original_width * current_height) // original_height
+        padding = (current_width - new_width) // 2
+        current_width = current_width - (2 * padding)
+
+    unpadded_features = current_height * current_width
+    newline_features = current_height
+    return (unpadded_features, newline_features)
+
+
+def get_number_of_features(height: int, width: int, config) -> int:
+    # From config
+    # Hardcoded for CLIP for now
+    # image_grid_pinpoints = [[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]]
+    image_grid_pinpoints = config.image_grid_pinpoints
+    image_size = config.vision_config.image_size
+    patch_size = config.vision_config.patch_size
+
+    assert image_size % patch_size == 0
+
+    npatches = image_size // patch_size
+
+    # Dimensions are intentionally swapped to be bug-compatible with
+    # upstream: https://github.com/LLaVA-VL/LLaVA-NeXT/issues/59
+    num_patch_width, num_patch_height = get_anyres_image_grid_shape(
+        [height, width],
+        image_grid_pinpoints,
+        image_size,
+    )
+    unpadded_features, newline_features = get_unpadded_features(
+        height, width, npatches, num_patch_height, num_patch_width
+    )
+    # The base patch covers the entire image
+    base_features = npatches**2
+    return unpadded_features + newline_features + base_features
+
+
+class FlashVlmCausalLMBatch(FlashCausalLMBatch):
+    pixel_values: Optional[List[torch.Tensor]]
+    pixel_attention_mask: Optional[List[torch.Tensor]]
+    image_sizes: Optional[List[Tuple[int, int]]]
+    image_grid_thw: Optional[torch.Tensor]
+
+    @classmethod
+    @tracer.start_as_current_span("concatenate")
+    def concatenate(cls, batches):
+        batch = super(FlashVlmCausalLMBatch, cls).concatenate(batches)
+        batch.pixel_values = None
+        batch.pixel_attention_mask = None
+        batch.image_sizes = None
+        batch.image_grid_thw = None
+        return batch
+
+    @tracer.start_as_current_span("filter")
+    def filter(self, request_ids: List[int]):
+        batch = super().filter(request_ids)
+        batch.pixel_values = None
+        batch.pixel_attention_mask = None
+        batch.image_sizes = None
+        batch.image_grid_thw = None
+        return batch
+
+    @classmethod
+    def batch_tokenized_inputs(
+        cls, requests: Iterable[generate_pb2.Request], tokenizer, processor, config
+    ):
+        # Process images first. We need all of them so that the processor
+        # can make the image splits the same size. And we need the final
+        # sizes to insert correct number of image tokens.
+ images = [] + for r in requests: + for chunk in r.input_chunks.chunks: + chunk_type = chunk.WhichOneof("chunk") + if chunk_type == "text": + pass + elif chunk_type == "image": + image = Image.open(BytesIO(chunk.image.data)) + # qwen2_vl expects images to be greater than 20 pixels, this is for warmup since the + # default warmup image is 20x20 + if config.model_type in {"qwen2_vl", "qwen2_5_vl"}: + if image.width <= 20: + w = image.width * 2 + h = image.height * 2 + image = image.resize((w, h)) + + if config.model_type == "llava_next": + images.append(image) + elif config.model_type == "gemma3": + images.append(image) + else: + images.append([image]) + else: + raise RuntimeError(f"Invalid chunk type {chunk_type}") + + if images: + kwargs = {} + if ( + hasattr(processor, "image_processor_class") + and processor.image_processor_class == "Idefics3ImageProcessor" + ): + kwargs["return_row_col_info"] = True + + image_inputs = processor.image_processor( + images, return_tensors="pt", **kwargs + ) + else: + image_inputs = None + + batch_tokenized_inputs = [] + max_length = 0 + image_id = 0 + for r in requests: + full_text = "" + for chunk in r.input_chunks.chunks: + chunk_type = chunk.WhichOneof("chunk") + if chunk_type == "text": + full_text += chunk.text + elif chunk_type == "image": + full_text += image_text_replacement( + processor, image_inputs, config, image_id + ) + image_id += 1 + + full_text = image_text_replacement_fixup(config, full_text) + input_ids = tokenizer( + full_text, + truncation=True, + max_length=r.truncate, + add_special_tokens=r.add_special_tokens, + )["input_ids"] + max_length = max(max_length, len(input_ids)) + batch_tokenized_inputs.append(input_ids) + + return batch_tokenized_inputs, image_inputs + + @classmethod + def from_pb_processor( + cls, + pb: generate_pb2.Batch, + tokenizer: PreTrainedTokenizerBase, + processor, + config, + dtype: torch.dtype, + device: torch.device, + ) -> "FlashVlmCausalLMBatch": + batch_tokenized_inputs, image_inputs = cls.batch_tokenized_inputs( + pb.requests, tokenizer, processor, config + ) + batch = cls.from_tokenized(pb, tokenizer, batch_tokenized_inputs, dtype, device) + if image_inputs is not None: + batch.pixel_values = image_inputs["pixel_values"].to(device=device) + if "pixel_attention_mask" in image_inputs: + batch.pixel_attention_mask = image_inputs["pixel_attention_mask"].to( + device=device + ) + else: + batch.pixel_attention_mask = None + if "image_sizes" in image_inputs: + batch.image_sizes = image_inputs["image_sizes"].to(device=device) + else: + batch.image_sizes = None + if "image_grid_thw" in image_inputs: + batch.image_grid_thw = image_inputs["image_grid_thw"].to(device=device) + else: + batch.image_grid_thw = None + else: + batch.pixel_values = None + batch.pixel_attention_mask = None + batch.image_sizes = None + batch.image_grid_thw = None + return batch + + +class FlashVlmCausalLM(FlashCausalLM): + def __init__( + self, + model_id: str, + *, + processor_class=AutoProcessor, + processor_kwargs=None, + batch_class=FlashVlmCausalLMBatch, + revision, + trust_remote_code: bool, + **kwargs, + ): + if PREFIX_CACHING: + raise NotImplementedError("Vlm do not work with prefix caching yet") + if processor_kwargs is None: + processor_kwargs = {} + self.processor = processor_class.from_pretrained( + model_id, + revision=revision, + trust_remote_code=trust_remote_code, + **processor_kwargs, + ) + self.batch_class = batch_class + super().__init__( + model_id=model_id, + revision=revision, + trust_remote_code=trust_remote_code, + # 
FIXME: VLM do not work with context chunking yet + support_chunking=False, + **kwargs, + ) + + @property + def batch_type(self) -> Type[FlashVlmCausalLMBatch]: + return self.batch_class + + def max_past(self) -> Optional[int]: + return getattr(self.model.text_model, "max_past", None) + + def forward( + self, + batch: FlashVlmCausalLMBatch, + adapter_data: Optional[Dict[str, torch.Tensor]] = None, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + # Model Forward + if batch.speculative_ids is not None: + input_ids = batch.input_ids + position_ids = batch.position_ids + cu_seqlen_prefill = batch.cu_seqlen_prefill + kv_cache = self.kv_cache + block_tables = batch.block_tables_tensor + slots = batch.slots[batch.slot_indices] + input_lengths = batch.input_lengths_tensor + max_s = batch.max_current_length + lm_head_indices = batch.prefill_head_indices + + speculative_ids = batch.speculative_ids + + B, speculative_length = speculative_ids.shape + new_length = speculative_length + 1 + new_input_ids = torch.cat( + [input_ids.unsqueeze(-1), speculative_ids], dim=1 + ).reshape(-1) + arange = torch.arange(new_length, device=position_ids.device).unsqueeze(0) + arange_int = arange.to(dtype=torch.int32) + new_position_ids = ( + position_ids.unsqueeze(-1).expand(B, new_length) + arange + ).view(-1) + slots = (slots.unsqueeze(-1).expand(B, new_length) + arange_int).view(-1) + input_lengths = ( + input_lengths.unsqueeze(-1).expand(B, new_length) + arange_int + ).view(-1) + cache_lengths_tensor = ( + batch.cache_lengths_tensor.unsqueeze(-1).expand(B, new_length) + ).reshape(-1) + + # Add Copy the block tables for all members + block_tables = ( + block_tables.unsqueeze(1) + .expand(B, new_length, -1) + .reshape(B * new_length, -1) + .contiguous() + ) + max_s = max_s + speculative_length + + input_ids = new_input_ids + position_ids = new_position_ids + else: + input_ids = batch.input_ids + position_ids = batch.position_ids + cu_seqlen_prefill = batch.cu_seqlen_prefill + kv_cache = self.kv_cache + block_tables = batch.block_tables_tensor + slots = batch.slots[batch.slot_indices] + input_lengths = batch.input_lengths_tensor + cache_lengths_tensor = batch.cache_lengths_tensor + max_s = batch.max_current_length + lm_head_indices = batch.prefill_head_indices + + if self.model.config.model_type in {"qwen2_vl", "qwen2_5_vl"}: + if position_ids.dim() == 1 and batch.prefilling: + position_ids = self.model.get_position_ids( + input_ids, batch.image_grid_thw + ) + batch.position_ids = position_ids + + if cu_seqlen_prefill is None and self.max_past() is not None: + # In decode, not prefill, we're actually overwriting the KV-cache + # in a circular buffer mode. + # This makes sure the max_s for the decode pass is correct. 
+ max_s = min(self.max_past(), max_s) + + kwargs = {} + if htorch.utils.internal.is_lazy(): + kwargs["bypass_hpu_graphs"] = False + + seqlen = Seqlen( + input_lengths=input_lengths, + cache_lengths=cache_lengths_tensor, + cu_seqlen_q=cu_seqlen_prefill, + ) + if batch.prefill_cache_indices is not None: + slots_pad = torch.zeros_like(input_ids) + slots_pad[batch.prefill_cache_indices] = slots + slots = slots_pad + logits, speculative_logits = self.model.forward( + input_ids=input_ids, + position_ids=position_ids, + cu_seqlen_prefill=cu_seqlen_prefill, + kv_cache=kv_cache, + slots=slots, + seqlen=trim_seqlen_metadata(seqlen), + hpu_attention_meta=batch.hpu_attn_meta, + lm_head_indices=lm_head_indices, + pixel_values=batch.pixel_values, + pixel_attention_mask=batch.pixel_attention_mask, + image_sizes=batch.image_sizes, + image_grid_thw=batch.image_grid_thw, + **kwargs, + ) + if batch.prefill_cache_indices is not None: + batch.prefill_cache_indices = None + if batch.pixel_values is not None: + batch.pixel_values = None + if batch.pixel_attention_mask is not None: + batch.pixel_attention_mask = None + if batch.image_sizes is not None: + batch.image_sizes = None + if batch.image_grid_thw is not None: + batch.image_grid_thw = None + return logits, speculative_logits diff --git a/backends/gaudi/server/text_generation_server/models/globals.py b/backends/gaudi/server/text_generation_server/models/globals.py index 30a5d3da4..cd221e148 100644 --- a/backends/gaudi/server/text_generation_server/models/globals.py +++ b/backends/gaudi/server/text_generation_server/models/globals.py @@ -1,53 +1,31 @@ -import torch import os from typing import Dict, Optional from loguru import logger from text_generation_server.utils.log import log_master +REQUEST_LOGPROBS = os.getenv("REQUEST_LOGPROBS", "0").lower() in {"1", "true"} ATTENTION = os.getenv("ATTENTION", "default") # default_prefix_caching = "1" if ATTENTION in {"flashinfer", "flashdecoding"} else "0" PREFIX_CACHING = os.getenv("PREFIX_CACHING", "0").lower() in { "1", "true", } -PREFILL_CHUNKING = os.getenv("PREFILL_CHUNKING", "0").lower() in {"1", "true"} log_master(logger.info, f"Using prefix caching = {PREFIX_CACHING}") -_expected = {"paged", "flashdecoding", "flashinfer", "default"} +_expected = {"paged", "default"} assert ( ATTENTION in _expected ), f"Attention is not valid {ATTENTION}, expected {_expected}" log_master(logger.info, f"Using Attention = {ATTENTION}") -if PREFIX_CACHING and ATTENTION not in {"flashinfer", "flashdecoding"}: - raise RuntimeError("Prefix caching is only supported with flashinfer") - -MEM_POOL = torch.cuda.graph_pool_handle() if torch.cuda.is_available() else None TGI_WIGGLE_ROOM = float(os.getenv("TGI_WIGGLE_ROOM", "0.90")) assert TGI_WIGGLE_ROOM > 0 assert TGI_WIGGLE_ROOM < 1 # This is overridden by the cli BLOCK_SIZE: int -if ATTENTION == "flashdecoding": - BLOCK_SIZE = 256 -elif ATTENTION == "flashinfer": - BLOCK_SIZE = 1 -else: - BLOCK_SIZE = 16 -# This is overridden by the cli -cuda_graphs = os.getenv("CUDA_GRAPHS") -if cuda_graphs is not None: - try: - cuda_graphs = [int(item) for item in cuda_graphs.split(",")] - except Exception as e: - raise RuntimeError( - f"Could not parse cuda graphs {cuda_graphs}, expected comma separated list for batch sizes to run on: {e}" - ) -else: - cuda_graphs = None +BLOCK_SIZE = 128 -CUDA_GRAPHS = cuda_graphs # This is overridden at model loading. 
global MODEL_ID diff --git a/backends/gaudi/server/text_generation_server/models/idefics_causal_lm.py b/backends/gaudi/server/text_generation_server/models/idefics_causal_lm.py index 9a7a6fe15..98d7352a8 100644 --- a/backends/gaudi/server/text_generation_server/models/idefics_causal_lm.py +++ b/backends/gaudi/server/text_generation_server/models/idefics_causal_lm.py @@ -34,9 +34,6 @@ from text_generation_server.utils import ( ) from text_generation_server.utils.quantization import get_loader -from text_generation_server.utils.import_utils import SYSTEM - - tracer = trace.get_tracer(__name__) @@ -596,22 +593,8 @@ class IdeficsCausalLM(Model): ): self.quantize = quantize self.process_group, rank, world_size = initialize_torch_distributed() - if torch.cuda.is_available(): - device = torch.device(f"cuda:{rank}") - # 9b seems to work correctly enough in float16, but 80b seems - # to be really saturating for f16. - dtype = torch.float16 if dtype is None else dtype - elif SYSTEM == "ipex": - if hasattr(torch, "xpu") and torch.xpu.is_available(): - device = torch.device(f"xpu:{rank}") - dtype = torch.float16 if dtype is None else dtype - else: - device = torch.device("cpu") - # Float16 doesn't exist on target. - dtype = torch.bfloat16 if dtype is None else dtype - else: - device = torch.device("cpu") - dtype = torch.float32 if dtype is None else dtype + device = torch.device("hpu") + dtype = torch.bfloat16 if dtype is None else dtype self.device, self.dtype = device, dtype config = AutoConfig.from_pretrained( diff --git a/backends/gaudi/server/text_generation_server/models/mllama_causal_lm.py b/backends/gaudi/server/text_generation_server/models/mllama_causal_lm.py index 9e19e1715..e034ed492 100644 --- a/backends/gaudi/server/text_generation_server/models/mllama_causal_lm.py +++ b/backends/gaudi/server/text_generation_server/models/mllama_causal_lm.py @@ -1,28 +1,30 @@ -from io import BytesIO -from PIL import Image import torch + +import numpy as np + from typing import Iterable, Optional, Tuple, List, Dict from text_generation_server.pb.generate_pb2 import Request - +from io import BytesIO +from PIL import Image from dataclasses import dataclass from opentelemetry import trace from transformers import ( PreTrainedTokenizerBase, ) -from text_generation_server.models.vlm_causal_lm import VlmCausalLMBatch, VlmCausalLM -from text_generation_server.pb import generate_pb2 -from text_generation_server.models.flash_causal_lm import ( - block_tables_to_ragged, -) -from text_generation_server.models.globals import PREFIX_CACHING, ATTENTION -from text_generation_server.layers.attention import Seqlen +from text_generation_server.models.flash_vlm_causal_lm import ( + FlashVlmCausalLMBatch, + FlashVlmCausalLM, +) +from text_generation_server.pb import generate_pb2 +from text_generation_server.layers.attention import Seqlen, trim_seqlen_metadata +import habana_frameworks.torch as htorch tracer = trace.get_tracer(__name__) @dataclass -class MllamaCausalLMBatch(VlmCausalLMBatch): +class FlashMllamaCausalLMBatch(FlashVlmCausalLMBatch): image_indices: List[int] = 42 aspect_ratio_ids: Optional[torch.Tensor] = None aspect_ratio_mask: Optional[torch.Tensor] = None @@ -158,7 +160,7 @@ class MllamaCausalLMBatch(VlmCausalLMBatch): config, dtype: torch.dtype, device: torch.device, - ) -> "VlmCausalLMBatch": + ) -> "FlashVlmCausalLMBatch": batch_tokenized_inputs, image_inputs = cls.batch_tokenized_inputs( pb.requests, tokenizer, processor, config ) @@ -167,6 +169,13 @@ class MllamaCausalLMBatch(VlmCausalLMBatch): 
batch.all_input_ids_tensor = batch.all_input_ids_tensor.clamp( max=config.text_config.vocab_size - 1 ) + if isinstance(batch.input_ids, list): + if len(batch) > 1: + input_ids = np.concatenate(batch.input_ids, dtype=np.int64) + else: + input_ids = batch.input_ids[0] + batch.input_ids = torch.tensor(input_ids, dtype=torch.int64, device=device) + batch.input_ids = batch.input_ids.clamp(max=config.text_config.vocab_size - 1) if image_inputs is not None: @@ -187,10 +196,10 @@ class MllamaCausalLMBatch(VlmCausalLMBatch): return batch -class MllamaCausalLM(VlmCausalLM): +class FlashMllamaCausalLM(FlashVlmCausalLM): def forward( self, - batch: VlmCausalLMBatch, + batch: FlashMllamaCausalLMBatch, adapter_data: Optional[Dict[str, torch.Tensor]] = None, ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: # Model Forward @@ -202,7 +211,7 @@ class MllamaCausalLM(VlmCausalLM): block_tables = batch.block_tables_tensor slots = batch.slots[batch.slot_indices] input_lengths = batch.input_lengths_tensor - max_s = batch.max_seqlen + max_s = batch.max_current_length lm_head_indices = batch.prefill_head_indices speculative_ids = batch.speculative_ids @@ -221,8 +230,8 @@ class MllamaCausalLM(VlmCausalLM): input_lengths = ( input_lengths.unsqueeze(-1).expand(B, new_length) + arange_int ).view(-1) - prefix_lens_tensor = ( - batch.prefix_lens_tensor.unsqueeze(-1).expand(B, new_length) + cache_lengths_tensor = ( + batch.cache_lengths_tensor.unsqueeze(-1).expand(B, new_length) ).reshape(-1) # Add Copy the block tables for all members @@ -244,8 +253,8 @@ class MllamaCausalLM(VlmCausalLM): block_tables = batch.block_tables_tensor slots = batch.slots[batch.slot_indices] input_lengths = batch.input_lengths_tensor - prefix_lens_tensor = batch.prefix_lens_tensor - max_s = batch.max_seqlen + cache_lengths_tensor = batch.cache_lengths_tensor + max_s = batch.max_current_length lm_head_indices = batch.prefill_head_indices if cu_seqlen_prefill is None and self.max_past() is not None: @@ -254,104 +263,46 @@ class MllamaCausalLM(VlmCausalLM): # This makes sure the max_s for the decode pass is correct. max_s = min(self.max_past(), max_s) - bs = input_ids.shape[0] - # Try to find an associated cuda graph - bs = input_ids.shape[0] - sorted_padded_bs = sorted([k for k in self.cuda_graphs.keys() if k >= bs]) - if sorted_padded_bs: - # Get associated cuda graph - cuda_graph = self.cuda_graphs[sorted_padded_bs[0]] - else: - cuda_graph = None - if ( - cu_seqlen_prefill is not None - or cuda_graph is None - # Only run cuda graphs when there's no images. 
- or batch.cross_attention_states is not None - ): - input_lengths = input_lengths + prefix_lens_tensor - if PREFIX_CACHING: - block_tables = block_tables_to_ragged( - block_tables=block_tables, - input_lengths=batch.input_lengths, - prefix_lens=batch.prefix_lens, - ) - with self._forward_context( - block_tables=block_tables, - cu_seqlen_prefill=cu_seqlen_prefill, - input_lengths_tensor=input_lengths, - prefix_lens_tensor=prefix_lens_tensor, - ): - max_k = (input_lengths + prefix_lens_tensor).max().item() - seqlen = Seqlen( - input_lengths=input_lengths, - prefix_lengths=prefix_lens_tensor, - cu_seqlen_q=cu_seqlen_prefill, - max_q=max_s, - max_k=max_k, - ) + seqlen = Seqlen( + input_lengths=input_lengths, + cache_lengths=cache_lengths_tensor, + cu_seqlen_q=cu_seqlen_prefill, + ) - if batch.pixel_values is not None: - cross_attention_states = self.model.vision_forward( - pixel_values=batch.pixel_values, - aspect_ratio_ids=batch.aspect_ratio_ids, - aspect_ratio_mask=batch.aspect_ratio_mask, - ) - batch.cross_attention_states = cross_attention_states - - cross_attention_states = batch.cross_attention_states - - logits, speculative_logits = self.model.forward( - input_ids=input_ids, - position_ids=position_ids, - cu_seqlen_prefill=cu_seqlen_prefill, - kv_cache=kv_cache, - block_tables=block_tables, - slots=slots, - seqlen=seqlen, - max_s=max_s, - prefill_cache_indices=batch.prefill_cache_indices, - lm_head_indices=lm_head_indices, - cross_attention_states=cross_attention_states, - adapter_data=adapter_data, - image_indices=batch.image_indices[:], - ) - if batch.prefill_cache_indices is not None: - batch.prefill_cache_indices = None - if batch.pixel_values is not None: - batch.pixel_values = None - return logits, speculative_logits - - # Copy inputs to the static inputs of the cuda graph - # Static inputs are potentially padded - cuda_graph["input_ids"][: input_ids.shape[0]] = input_ids - cuda_graph["position_ids"][: position_ids.shape[0]] = position_ids - if ATTENTION == "flashinfer": - block_tables = block_tables_to_ragged( - block_tables=block_tables, - input_lengths=batch.input_lengths, - prefix_lens=batch.prefix_lens, + if batch.pixel_values is not None: + cross_attention_states = self.model.vision_forward( + pixel_values=batch.pixel_values, + aspect_ratio_ids=batch.aspect_ratio_ids, + aspect_ratio_mask=batch.aspect_ratio_mask, ) - cuda_graph["block_tables"][: block_tables.shape[0]] = block_tables - else: - cuda_graph["block_tables"][ - : block_tables.shape[0], : block_tables.shape[1] - ] = block_tables - cuda_graph["slots"].fill_(0) - cuda_graph["slots"][: slots.shape[0]] = slots - cuda_graph["input_lengths"].zero_() - cuda_graph["input_lengths"][: input_lengths.shape[0]] = ( - input_lengths + prefix_lens_tensor - ) + batch.cross_attention_states = cross_attention_states - # Replay the graph - cuda_graph["graph"].replay() + cross_attention_states = batch.cross_attention_states - # Slice output to the correct shape - speculative_logits = ( - cuda_graph["speculative_logits"][:bs] - if cuda_graph["speculative_logits"] is not None - else None + kwargs = {} + if htorch.utils.internal.is_lazy(): + kwargs["bypass_hpu_graphs"] = False + if batch.prefill_cache_indices is not None: + slots_pad = torch.zeros_like(input_ids) + slots_pad[batch.prefill_cache_indices] = slots + slots = slots_pad + logits, speculative_logits = self.model.forward( + input_ids=input_ids, + position_ids=position_ids, + cu_seqlen_prefill=cu_seqlen_prefill, + kv_cache=kv_cache, + slots=slots, + 
seqlen=trim_seqlen_metadata(seqlen), + hpu_attention_meta=batch.hpu_attn_meta, + lm_head_indices=lm_head_indices, + cross_attention_states=cross_attention_states, + # TODO list + adapter_data=None, + image_indices=batch.image_indices[:], + **kwargs, ) - logits = cuda_graph["logits"][:bs] + if batch.prefill_cache_indices is not None: + batch.prefill_cache_indices = None + if batch.pixel_values is not None: + batch.pixel_values = None return logits, speculative_logits diff --git a/backends/gaudi/server/text_generation_server/models/model.py b/backends/gaudi/server/text_generation_server/models/model.py index 4fda22713..66c69bc1f 100644 --- a/backends/gaudi/server/text_generation_server/models/model.py +++ b/backends/gaudi/server/text_generation_server/models/model.py @@ -33,6 +33,7 @@ class Model(ABC): sliding_window: Optional[int] = None, speculate: Optional[int] = None, adapter_id: str = BASE_MODEL_ADAPTER_ID, + support_chunking: bool = False, ): self.model_id = model_id self.model = model.eval() diff --git a/backends/gaudi/server/text_generation_server/models/pali_gemma.py b/backends/gaudi/server/text_generation_server/models/pali_gemma.py index fe75570ea..e91aaed99 100644 --- a/backends/gaudi/server/text_generation_server/models/pali_gemma.py +++ b/backends/gaudi/server/text_generation_server/models/pali_gemma.py @@ -4,8 +4,8 @@ import torch import torch.distributed from opentelemetry import trace from typing import Iterable -from text_generation_server.models.vlm_causal_lm import ( - VlmCausalLMBatch, +from text_generation_server.models.flash_vlm_causal_lm import ( + FlashVlmCausalLMBatch, image_text_replacement, ) @@ -14,7 +14,7 @@ from text_generation_server.pb.generate_pb2 import Request tracer = trace.get_tracer(__name__) -class PaliGemmaBatch(VlmCausalLMBatch): +class PaliGemmaBatch(FlashVlmCausalLMBatch): @classmethod def batch_tokenized_inputs( cls, requests: Iterable[Request], tokenizer, processor, config diff --git a/backends/gaudi/server/text_generation_server/models/seq2seq_lm.py b/backends/gaudi/server/text_generation_server/models/seq2seq_lm.py index 04d4c28ba..0ee6ed167 100644 --- a/backends/gaudi/server/text_generation_server/models/seq2seq_lm.py +++ b/backends/gaudi/server/text_generation_server/models/seq2seq_lm.py @@ -10,7 +10,6 @@ from transformers import ( AutoConfig, ) from typing import Optional, Tuple, List, Type, Dict -from text_generation_server.utils.import_utils import SYSTEM from text_generation_server.utils import ( initialize_torch_distributed, weight_files, @@ -555,20 +554,9 @@ class Seq2SeqLM(Model): ): self.quantize = quantize self.process_group, rank, world_size = initialize_torch_distributed() - if torch.cuda.is_available(): - device = torch.device(f"cuda:{rank}") - dtype = default_dtype if dtype is None else dtype - elif SYSTEM == "ipex": - if hasattr(torch, "xpu") and torch.xpu.is_available(): - device = torch.device(f"xpu:{rank}") - dtype = default_dtype if dtype is None else dtype - else: - device = torch.device("cpu") - # Float16 doesn't exist on target. 
- dtype = torch.bfloat16 if dtype is None else dtype - else: - device = torch.device("cpu") - dtype = torch.float32 if dtype is None else dtype + + device = torch.device("hpu") + dtype = torch.bfloat16 if dtype is None else dtype config = config_class.from_pretrained( model_id, @@ -600,7 +588,7 @@ class Seq2SeqLM(Model): aliases=aliases, weights_loader=weights_loader, ) - if config.quantize in ["awq", "exl2", "gptq", "marlin"]: + if config.quantize in ["awq", "gptq"]: weights._set_gptq_params(model_id, revision) model = model_class(config, weights) diff --git a/backends/gaudi/server/text_generation_server/models/vlm_causal_lm.py b/backends/gaudi/server/text_generation_server/models/vlm_causal_lm.py index 543b07e8e..709437d93 100644 --- a/backends/gaudi/server/text_generation_server/models/vlm_causal_lm.py +++ b/backends/gaudi/server/text_generation_server/models/vlm_causal_lm.py @@ -69,11 +69,7 @@ MAX_TOTAL_TOKENS = int(os.environ.get("MAX_TOTAL_TOKENS", 8192)) PAD_SEQUENCE_TO_MULTIPLE_OF = int(os.environ.get("PAD_SEQUENCE_TO_MULTIPLE_OF", 128)) CHUNK_SIZES = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048] LAZY_MODE = int(os.environ.get("PT_HPU_LAZY_MODE", 1)) -max_batch_size_str = os.environ.get("MAX_BATCH_SIZE") -if max_batch_size_str is not None: - MAX_BATCH_SIZE = int(max_batch_size_str) -else: - raise ValueError("MAX_BATCH_SIZE is not set") + PREFILL_WARMUP_BATCH_SIZE_LIST = [] PREFILL_WARMUP_SEQLEN_LIST = [] @@ -1467,6 +1463,12 @@ class VlmCausalLM(Model): batch = self.batch_from_pb(request.batch, is_warmup=True) max_input_tokens = request.max_input_tokens max_prefill_batch_size = batch.input_ids.shape[0] + max_batch_size_str = os.environ.get("MAX_BATCH_SIZE") + if max_batch_size_str is not None: + MAX_BATCH_SIZE = int(max_batch_size_str) + else: + raise ValueError("MAX_BATCH_SIZE is not set") + try: # max prefill batch size warmup _, prefill_batch, _ = self.generate_token([batch], is_warmup=True) diff --git a/backends/gaudi/server/text_generation_server/server.py b/backends/gaudi/server/text_generation_server/server.py index 674a8aed1..5a7d21175 100644 --- a/backends/gaudi/server/text_generation_server/server.py +++ b/backends/gaudi/server/text_generation_server/server.py @@ -18,22 +18,27 @@ from text_generation_server.interceptor import ExceptionInterceptor from text_generation_server.models import Model, get_model_with_lora_adapters from text_generation_server.pb import generate_pb2_grpc, generate_pb2 from text_generation_server.tracing import UDSOpenTelemetryAioServerInterceptor -from text_generation_server.models.globals import set_model_id +from text_generation_server.models.globals import set_model_id, ATTENTION from text_generation_server.models.globals import set_adapter_to_index from text_generation_server.utils.adapter import AdapterInfo from text_generation_server.utils.tokens import make_tokenizer_optional +from text_generation_server.utils.prefill_chunking import set_max_prefill_tokens try: from text_generation_server.models.pali_gemma import PaliGemmaBatch + from text_generation_server.models.mllama_causal_lm import FlashMllamaCausalLMBatch from text_generation_server.models.vlm_causal_lm import ( VlmCausalLMBatch, ) - from text_generation_server.models.idefics_causal_lm import IdeficsCausalLMBatch + from text_generation_server.models.flash_vlm_causal_lm import ( + FlashVlmCausalLMBatch, + ) VLM_BATCH_TYPES = { PaliGemmaBatch, VlmCausalLMBatch, - IdeficsCausalLMBatch, + FlashVlmCausalLMBatch, + FlashMllamaCausalLMBatch, } except (ImportError, NotImplementedError): 
# These imports can fail on CPU/Non flash. @@ -103,14 +108,50 @@ class TextGenerationService(generate_pb2_grpc.TextGenerationServiceServicer): return generate_pb2.FilterBatchResponse(batch=filtered_batch.to_pb()) async def Warmup(self, request, context): - max_supported_total_tokens, max_input_tokens, max_total_tokens = ( - self.model.warmup(request) - ) + if ATTENTION == "paged": + set_max_prefill_tokens(request.max_prefill_tokens) + if ( + self.model.batch_type in VLM_BATCH_TYPES + ): # Hack, i would rather use kwargs in the `from_pb` call + batch = self.model.batch_type.from_pb_processor( + request.batch, + self.model.tokenizer, + self.model.processor, + self.model.model.config, + self.model.dtype, + self.model.device, + ) + else: + batch = self.model.batch_type.from_pb( + request.batch, + self.model.tokenizer, + self.model.dtype, + self.model.device, + ) - # W/A for the skip tokenizer path - # We need to call make_tokenizer_optional after the warmup, - # because router is not aware of that feature - make_tokenizer_optional(self.model.tokenizer) + # Override default values with None for clearer semantics. + max_input_tokens = ( + request.max_input_tokens + if request.HasField("max_input_tokens") + else None + ) + max_total_tokens = ( + request.max_total_tokens + if request.HasField("max_total_tokens") + else None + ) + max_supported_total_tokens, max_input_tokens, max_total_tokens = ( + self.model.warmup(batch, max_input_tokens, max_total_tokens) + ) + else: + max_supported_total_tokens, max_input_tokens, max_total_tokens = ( + self.model.warmup(request) + ) + + # W/A for the skip tokenizer path + # We need to call make_tokenizer_optional after the warmup, + # because router is not aware of that feature + make_tokenizer_optional(self.model.tokenizer) return generate_pb2.WarmupResponse( max_supported_total_tokens=max_supported_total_tokens, diff --git a/backends/gaudi/server/text_generation_server/utils/dist.py b/backends/gaudi/server/text_generation_server/utils/dist.py index 0e9b97fb2..1c45713e8 100644 --- a/backends/gaudi/server/text_generation_server/utils/dist.py +++ b/backends/gaudi/server/text_generation_server/utils/dist.py @@ -1,15 +1,13 @@ import os import torch - +from torch.distributed import ProcessGroup from datetime import timedelta from loguru import logger # Tensor Parallelism settings RANK = int(os.getenv("RANK", "0")) WORLD_SIZE = int(os.getenv("WORLD_SIZE", "1")) - -# CUDA memory fraction -MEMORY_FRACTION = float(os.getenv("CUDA_MEMORY_FRACTION", "1.0")) +MEMORY_FRACTION = float(os.getenv("HPU_MEMORY_FRACTION", "0.8")) class FakeBarrier: @@ -17,10 +15,11 @@ class FakeBarrier: pass -class FakeGroup: +class FakeGroup(ProcessGroup): def __init__(self, rank, size): self._rank = rank self._size = size + super().__init__(rank, size) def allreduce(self, *args, **kwargs): return FakeBarrier() @@ -42,42 +41,11 @@ class FakeGroup: def rank(self): return self._rank + def _get_backend_name(self): + return "fake" + def initialize_torch_distributed(): - - world_size = int(os.getenv("WORLD_SIZE", "1")) - - options = None - if torch.cuda.is_available(): - from torch.distributed import ProcessGroupNCCL - - # Set the device id. 
- assert WORLD_SIZE <= torch.cuda.device_count(), "Each process is one gpu" - device = RANK % torch.cuda.device_count() - torch.cuda.set_device(device) - torch.cuda.set_per_process_memory_fraction(MEMORY_FRACTION, device) - backend = "nccl" - options = ProcessGroupNCCL.Options() - options.is_high_priority_stream = True - options._timeout = timedelta(seconds=60) - elif torch.hpu.is_available(): - backend = "hccl" - n_hpus = torch.hpu.device_count() - if world_size > n_hpus: - raise ValueError( - f"WORLD_SIZE ({world_size}) is higher than the number of available HPUs ({n_hpus})." - ) - else: - try: - import oneccl_bindings_for_pytorch # noqa: F401 - - backend = "ccl" - if os.getenv("CCL_WORKER_COUNT", None) is None: - os.environ["CCL_WORKER_COUNT"] = str(1) - except ImportError: - backend = "gloo" - options = None - if WORLD_SIZE == 1: return FakeGroup(RANK, WORLD_SIZE), RANK, WORLD_SIZE else: @@ -87,11 +55,10 @@ def initialize_torch_distributed(): if not torch.distributed.is_initialized(): # Call the init process. torch.distributed.init_process_group( - backend=backend, + backend="hccl", world_size=WORLD_SIZE, rank=RANK, - timeout=timedelta(seconds=60), - pg_options=options, + timeout=timedelta(seconds=120), ) else: logger.warning("torch.distributed is already initialized.") diff --git a/backends/gaudi/server/text_generation_server/utils/import_utils.py b/backends/gaudi/server/text_generation_server/utils/import_utils.py index 782b4f15b..22560dd7a 100644 --- a/backends/gaudi/server/text_generation_server/utils/import_utils.py +++ b/backends/gaudi/server/text_generation_server/utils/import_utils.py @@ -1,75 +1,28 @@ import torch from loguru import logger -import os -import importlib.util +def get_hpu_free_memory(device, memory_fraction): + from habana_frameworks.torch.hpu import memory_stats - -def is_ipex_available(): - return importlib.util.find_spec("intel_extension_for_pytorch") is not None - - -def get_cuda_free_memory(device, memory_fraction): - total_free_memory, _ = torch.cuda.mem_get_info(device) - total_gpu_memory = torch.cuda.get_device_properties(device).total_memory - free_memory = max(0, total_free_memory - (1 - memory_fraction) * total_gpu_memory) - return free_memory - - -def get_xpu_free_memory(device, memory_fraction): - total_memory = torch.xpu.get_device_properties(device).total_memory device_id = device.index - memory_fraction = float(os.getenv("XPU_MEMORY_FRACTION", "1.0")) + mem_stats = memory_stats(device_id) + logger.info(f"mem_stats: {mem_stats}") + total_free_memory = mem_stats["Limit"] - mem_stats["MaxInUse"] free_memory = max( - 0, - int( - total_memory * 0.9 * memory_fraction - torch.xpu.memory_reserved(device_id) - ), + 0, int(total_free_memory - (1 - memory_fraction) * mem_stats["Limit"]) ) return free_memory -def get_cpu_free_memory(device, memory_fraction): - import psutil - from text_generation_server.utils.dist import WORLD_SIZE - - mem = psutil.virtual_memory() - free_memory = int(mem.available * 0.95 / WORLD_SIZE) - return free_memory +def synchronize_hpu(device): + torch.hpu.synchronize() def noop(*args, **kwargs): pass -SYSTEM = None -if torch.version.hip is not None: - SYSTEM = "rocm" - empty_cache = torch.cuda.empty_cache - synchronize = torch.cuda.synchronize - get_free_memory = get_cuda_free_memory -elif torch.version.cuda is not None and torch.cuda.is_available(): - SYSTEM = "cuda" - empty_cache = torch.cuda.empty_cache - synchronize = torch.cuda.synchronize - get_free_memory = get_cuda_free_memory -elif is_ipex_available(): - SYSTEM = "ipex" - 
import intel_extension_for_pytorch # noqa: F401 - - if hasattr(torch, "xpu") and torch.xpu.is_available(): - empty_cache = torch.xpu.empty_cache - synchronize = torch.xpu.synchronize - get_free_memory = get_xpu_free_memory - else: - empty_cache = noop - synchronize = noop - get_free_memory = get_cpu_free_memory -else: - SYSTEM = "cpu" - - empty_cache = noop - synchronize = noop - get_free_memory = get_cpu_free_memory -logger.info(f"Detected system {SYSTEM}") +empty_cache = noop +synchronize = synchronize_hpu +get_free_memory = get_hpu_free_memory diff --git a/backends/gaudi/server/text_generation_server/utils/kernels.py b/backends/gaudi/server/text_generation_server/utils/kernels.py new file mode 100644 index 000000000..42745c716 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/utils/kernels.py @@ -0,0 +1,22 @@ +import importlib + +from loguru import logger +from hf_kernels import load_kernel as hf_load_kernel + +from text_generation_server.utils.log import log_once + + +def load_kernel(*, module: str, repo_id: str): + """ + Load a kernel. First try to load it as the given module (e.g. for + local development), falling back to a locked Hub kernel. + """ + try: + m = importlib.import_module(module) + log_once(logger.info, f"Using local module for `{module}`") + return m + except ModuleNotFoundError: + return hf_load_kernel(repo_id=repo_id) + + +__all__ = ["load_kernel"] diff --git a/backends/gaudi/server/text_generation_server/utils/prefill_chunking.py b/backends/gaudi/server/text_generation_server/utils/prefill_chunking.py new file mode 100644 index 000000000..c227d30f5 --- /dev/null +++ b/backends/gaudi/server/text_generation_server/utils/prefill_chunking.py @@ -0,0 +1,24 @@ +from typing import Optional + +SUPPORT_CHUNKING: Optional[bool] = None +MAX_PREFILL_TOKENS: Optional[int] = None + + +def set_support_chunking(support_chunking: bool): + global SUPPORT_CHUNKING + SUPPORT_CHUNKING = support_chunking + + +def get_support_chunking() -> bool: + global SUPPORT_CHUNKING + return SUPPORT_CHUNKING + + +def set_max_prefill_tokens(max_prefill_tokens: int): + global MAX_PREFILL_TOKENS + MAX_PREFILL_TOKENS = max_prefill_tokens + + +def get_max_prefill_tokens() -> int: + global MAX_PREFILL_TOKENS + return MAX_PREFILL_TOKENS diff --git a/backends/gaudi/server/text_generation_server/utils/quantization.py b/backends/gaudi/server/text_generation_server/utils/quantization.py index ee561acc4..a8faf4a59 100644 --- a/backends/gaudi/server/text_generation_server/utils/quantization.py +++ b/backends/gaudi/server/text_generation_server/utils/quantization.py @@ -4,9 +4,7 @@ from dataclasses import dataclass from typing import Optional from huggingface_hub import hf_hub_download -from text_generation_server.layers.marlin.gptq import can_use_gptq_marlin from text_generation_server.utils.weights import ( - DefaultWeightsLoader, WeightsLoader, ) @@ -129,64 +127,13 @@ def get_loader( f"Quantize is set to `{quantize}` but received a `{quantizer_config.__class__.__name__}` config." 
) - if can_use_gptq_marlin( + return GPTQWeightsLoader( bits=quantizer_config.bits, + desc_act=quantizer_config.desc_act, groupsize=quantizer_config.groupsize, quant_method=quantizer_config.quant_method, quantize=quantize, sym=quantizer_config.sym, - ): - from text_generation_server.layers.marlin import GPTQMarlinWeightsLoader - - return GPTQMarlinWeightsLoader( - bits=quantizer_config.bits, - desc_act=quantizer_config.desc_act, - groupsize=quantizer_config.groupsize, - quant_method=quantizer_config.quant_method, - quantize=quantize, - sym=quantizer_config.sym, - ) - else: - return GPTQWeightsLoader( - bits=quantizer_config.bits, - desc_act=quantizer_config.desc_act, - groupsize=quantizer_config.groupsize, - quant_method=quantizer_config.quant_method, - quantize=quantize, - sym=quantizer_config.sym, - ) - elif quantize == "bitsandbytes": - from text_generation_server.layers.bnb import BNBWeight - - return DefaultWeightsLoader(BNBWeight) - elif quantize == "bitsandbytes-fp4": - from text_generation_server.layers.bnb import BNBFP4Weight - - return DefaultWeightsLoader(BNBFP4Weight) - elif quantize == "bitsandbytes-nf4": - from text_generation_server.layers.bnb import BNBNF4Weight - - return DefaultWeightsLoader(BNBNF4Weight) - elif quantize == "eetq": - from text_generation_server.layers.eetq import EETQWeight - - return DefaultWeightsLoader(EETQWeight) - elif quantize == "exl2": - from text_generation_server.layers.exl2 import Exl2WeightsLoader - - return Exl2WeightsLoader() - elif quantize == "marlin": - from text_generation_server.layers.marlin import MarlinWeightsLoader - - # TODO: improve check once we have one config type per quantize value - if not isinstance(quantizer_config, _QuantizerConfig): - raise ValueError( - f"Quantize is set to `{quantize}` but received a `{quantizer_config.__class__.__name__}` config." 
- ) - - return MarlinWeightsLoader( - bits=quantizer_config.bits, - is_marlin_24=quantizer_config.checkpoint_format == "marlin_24", ) elif quantize == "fp8" or quantize is None: from text_generation_server.layers.fp8 import HybridFP8UnquantLoader diff --git a/backends/gaudi/server/text_generation_server/utils/weights.py b/backends/gaudi/server/text_generation_server/utils/weights.py index 75e01f7ce..acd598d7a 100644 --- a/backends/gaudi/server/text_generation_server/utils/weights.py +++ b/backends/gaudi/server/text_generation_server/utils/weights.py @@ -7,8 +7,6 @@ from typing import Dict, List, Optional, Union, Type from safetensors import safe_open from dataclasses import dataclass -from text_generation_server.utils.import_utils import SYSTEM - class WeightsLoader(ABC): """ @@ -88,12 +86,9 @@ class UnquantizedWeight(Weight): weight: torch.Tensor def get_linear(self, bias: torch.Tensor): - from text_generation_server.layers.linear import FastLinear, FastLinearROCm + from text_generation_server.layers.linear import FastLinear - if SYSTEM == "rocm": - return FastLinearROCm(self.weight, bias) - else: - return FastLinear(self.weight, bias) + return FastLinear(self.weight, bias) class DefaultWeightsLoader(WeightsLoader): @@ -197,7 +192,7 @@ class Weights: slice_ = f.get_slice(tensor_name) return slice_ - def _has_tensor(self, tensor_name: str): + def has_tensor(self, tensor_name: str): try: self.get_filename(tensor_name) except Exception: @@ -207,7 +202,9 @@ class Weights: def get_shape(self, tensor_name: str): return self._get_slice(tensor_name).get_shape() - def get_tensor(self, tensor_name: str, to_device=True, to_dtype=True): + def get_tensor( + self, tensor_name: str, to_device: bool = True, to_dtype: bool = True + ) -> torch.Tensor: filename, tensor_name = self.get_filename(tensor_name) f = self._get_handle(filename) tensor = f.get_tensor(tensor_name) @@ -218,6 +215,7 @@ class Weights: tensor.dtype not in [ torch.float8_e4m3fn, + torch.int8, torch.int16, torch.int32, torch.int64, @@ -253,7 +251,8 @@ class Weights: # u4 which are disguised as int32. exl2 uses int16. # FP8 uses torch.float8_e4m3fn. 
if ( - tensor.dtype not in (torch.float8_e4m3fn, torch.int16, torch.int32) + tensor.dtype + not in (torch.float8_e4m3fn, torch.int8, torch.int16, torch.int32) and to_dtype ): tensor = tensor.to(dtype=self.dtype) @@ -329,6 +328,7 @@ class Weights: tensor.dtype not in [ torch.float8_e4m3fn, + torch.int8, torch.int16, torch.int32, torch.int64, diff --git a/backends/v3/src/block_allocator.rs b/backends/v3/src/block_allocator.rs index e7f3d85a9..6da2b51da 100644 --- a/backends/v3/src/block_allocator.rs +++ b/backends/v3/src/block_allocator.rs @@ -2,7 +2,7 @@ use std::sync::Arc; use tokio::sync::{mpsc, oneshot}; use crate::radix::RadixAllocator; - +use text_generation_router::usage_stats::Env; #[derive(Debug, Clone)] pub struct BlockAllocation { pub allocation_id: u64, @@ -141,6 +141,7 @@ pub struct SimpleAllocator { free_blocks: Vec<u32>, block_size: u32, window_size: Option<u32>, + is_hpu_device: bool, } impl SimpleAllocator { @@ -150,6 +151,7 @@ impl SimpleAllocator { // Block 0 is reserved for health checks free_blocks: (1..blocks).collect(), window_size, + is_hpu_device: Env::new().is_hpu_device(), } } } @@ -179,9 +181,15 @@ impl Allocator for SimpleAllocator { if required_blocks > self.free_blocks.len() as u32 { None } else { - let blocks = self + if self.is_hpu_device { + self.free_blocks.sort_by(|a, b| b.cmp(a)); + } + let mut blocks = self .free_blocks .split_off(self.free_blocks.len() - required_blocks as usize); + if self.is_hpu_device { + blocks.sort(); + } let mut slots = Vec::with_capacity((required_blocks * self.block_size * repeats as u32) as usize); diff --git a/launcher/src/env_runtime.rs b/launcher/src/env_runtime.rs index d7ae11d54..d9056e413 100644 --- a/launcher/src/env_runtime.rs +++ b/launcher/src/env_runtime.rs @@ -28,8 +28,8 @@ impl Env { } } - pub fn is_hpu_device(&self) -> bool { - self.hpu_env != "N/A" + pub fn should_start_a_single_hpu_shard(&self) -> bool { + self.hpu_env != "N/A" && std::env::var("ATTENTION").as_deref() != Ok("paged") } } diff --git a/launcher/src/main.rs b/launcher/src/main.rs index acff85730..c169a78ce 100644 --- a/launcher/src/main.rs +++ b/launcher/src/main.rs @@ -1559,7 +1559,7 @@ fn spawn_shards( ) -> Result<(), LauncherError> { // Start shard processes for rank in 0..num_shard { - if rank != 0 && env_runtime::Env::new().is_hpu_device() { + if rank != 0 && env_runtime::Env::new().should_start_a_single_hpu_shard() { tracing::info!("Running on HPU, the launcher will not do any sharding as actual sharding is done in the server"); break; } @@ -1639,7 +1639,7 @@ fn spawn_shards( if shard_ready == num_shard { break; } - if env_runtime::Env::new().is_hpu_device() { + if env_runtime::Env::new().should_start_a_single_hpu_shard() { tracing::info!("HPU detected, shard is ready"); break; } diff --git a/router/src/usage_stats.rs b/router/src/usage_stats.rs index 353e9e378..a17aade9c 100644 --- a/router/src/usage_stats.rs +++ b/router/src/usage_stats.rs @@ -157,6 +157,7 @@ pub struct Env { docker_label: &'static str, nvidia_info: Option<Vec<NvidiaSmiInfo>>, xpu_info: Option<Vec<XpuSmiInfo>>, + hpu_info: Option<Vec<HpuSmiInfo>>, system_env: SystemInfo, } @@ -289,6 +290,60 @@ impl XpuSmiInfo { } } +#[derive(Debug, Serialize, Clone)] +struct HpuSmiInfo { + name: String, + pci_bus_id: String, + driver_version: String, + temperature: String, + utilization: String, + memory_total: String, + memory_free: String, + memory_used: String, + power_draw_instant: String, +} + +impl HpuSmiInfo { + fn new() -> Option<Vec<HpuSmiInfo>> { + let output = Command::new("hl-smi") + .args([ +
"--query-aip=name,bus_id,driver_version,temperature.aip,utilization.aip,memory.total,memory.free,memory.used,power.draw", + "--format=csv" + ]) + .output() + .ok()?; + + if !output.status.success() { + return None; + } + + let stdout = String::from_utf8(output.stdout).ok()?; + + let mut rdr = ReaderBuilder::new() + .has_headers(true) + .from_reader(stdout.as_bytes()); + + let mut infos = Vec::new(); + + for result in rdr.records() { + let record = result.ok()?; + infos.push(HpuSmiInfo { + name: record[0].to_string(), + pci_bus_id: record[1].to_string(), + driver_version: record[2].to_string(), + temperature: record[3].to_string(), + utilization: record[4].to_string(), + memory_total: record[5].to_string(), + memory_free: record[6].to_string(), + memory_used: record[7].to_string(), + power_draw_instant: record[8].to_string(), + }); + } + + Some(infos) + } +} + #[derive(Serialize, Debug, Clone)] pub struct SystemInfo { cpu_count: usize, @@ -335,10 +390,14 @@ impl Env { system_env: SystemInfo::new(), nvidia_info: NvidiaSmiInfo::new(), xpu_info: XpuSmiInfo::new(), + hpu_info: HpuSmiInfo::new(), git_sha: option_env!("VERGEN_GIT_SHA").unwrap_or("N/A"), docker_label: option_env!("DOCKER_LABEL").unwrap_or("N/A"), } } + pub fn is_hpu_device(&self) -> bool { + self.hpu_info.is_some() + } } pub fn is_container() -> io::Result { From fe56f760df3230dd9a24c4ed25741bb576904621 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Mon, 14 Apr 2025 17:18:43 +0200 Subject: [PATCH 176/183] Upgrading the python client deps (still deprecated, but used for integration-tests) --- clients/python/poetry.lock | 1872 ++++++++++++++++++--------------- clients/python/pyproject.toml | 12 +- 2 files changed, 1047 insertions(+), 837 deletions(-) diff --git a/clients/python/poetry.lock b/clients/python/poetry.lock index 148d99065..36e82f2a0 100644 --- a/clients/python/poetry.lock +++ b/clients/python/poetry.lock @@ -1,124 +1,131 @@ -# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.0.0 and should not be changed by hand. 
+ +[[package]] +name = "aiohappyeyeballs" +version = "2.6.1" +description = "Happy Eyeballs for asyncio" +optional = false +python-versions = ">=3.9" +groups = ["main"] +files = [ + {file = "aiohappyeyeballs-2.6.1-py3-none-any.whl", hash = "sha256:f349ba8f4b75cb25c99c5c2d84e997e485204d2902a9597802b0371f09331fb8"}, + {file = "aiohappyeyeballs-2.6.1.tar.gz", hash = "sha256:c3f9d0113123803ccadfdf3f0faa505bc78e6a72d1cc4806cbd719826e943558"}, +] [[package]] name = "aiohttp" -version = "3.8.5" +version = "3.11.16" description = "Async http client/server framework (asyncio)" optional = false -python-versions = ">=3.6" +python-versions = ">=3.9" +groups = ["main"] files = [ - {file = "aiohttp-3.8.5-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:a94159871304770da4dd371f4291b20cac04e8c94f11bdea1c3478e557fbe0d8"}, - {file = "aiohttp-3.8.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:13bf85afc99ce6f9ee3567b04501f18f9f8dbbb2ea11ed1a2e079670403a7c84"}, - {file = "aiohttp-3.8.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2ce2ac5708501afc4847221a521f7e4b245abf5178cf5ddae9d5b3856ddb2f3a"}, - {file = "aiohttp-3.8.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:96943e5dcc37a6529d18766597c491798b7eb7a61d48878611298afc1fca946c"}, - {file = "aiohttp-3.8.5-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2ad5c3c4590bb3cc28b4382f031f3783f25ec223557124c68754a2231d989e2b"}, - {file = "aiohttp-3.8.5-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0c413c633d0512df4dc7fd2373ec06cc6a815b7b6d6c2f208ada7e9e93a5061d"}, - {file = "aiohttp-3.8.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:df72ac063b97837a80d80dec8d54c241af059cc9bb42c4de68bd5b61ceb37caa"}, - {file = "aiohttp-3.8.5-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c48c5c0271149cfe467c0ff8eb941279fd6e3f65c9a388c984e0e6cf57538e14"}, - {file = "aiohttp-3.8.5-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:368a42363c4d70ab52c2c6420a57f190ed3dfaca6a1b19afda8165ee16416a82"}, - {file = "aiohttp-3.8.5-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:7607ec3ce4993464368505888af5beb446845a014bc676d349efec0e05085905"}, - {file = "aiohttp-3.8.5-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:0d21c684808288a98914e5aaf2a7c6a3179d4df11d249799c32d1808e79503b5"}, - {file = "aiohttp-3.8.5-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:312fcfbacc7880a8da0ae8b6abc6cc7d752e9caa0051a53d217a650b25e9a691"}, - {file = "aiohttp-3.8.5-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:ad093e823df03bb3fd37e7dec9d4670c34f9e24aeace76808fc20a507cace825"}, - {file = "aiohttp-3.8.5-cp310-cp310-win32.whl", hash = "sha256:33279701c04351a2914e1100b62b2a7fdb9a25995c4a104259f9a5ead7ed4802"}, - {file = "aiohttp-3.8.5-cp310-cp310-win_amd64.whl", hash = "sha256:6e4a280e4b975a2e7745573e3fc9c9ba0d1194a3738ce1cbaa80626cc9b4f4df"}, - {file = "aiohttp-3.8.5-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:ae871a964e1987a943d83d6709d20ec6103ca1eaf52f7e0d36ee1b5bebb8b9b9"}, - {file = "aiohttp-3.8.5-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:461908b2578955045efde733719d62f2b649c404189a09a632d245b445c9c975"}, - {file = "aiohttp-3.8.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:72a860c215e26192379f57cae5ab12b168b75db8271f111019509a1196dfc780"}, - {file = "aiohttp-3.8.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:cc14be025665dba6202b6a71cfcdb53210cc498e50068bc088076624471f8bb9"}, - {file = "aiohttp-3.8.5-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8af740fc2711ad85f1a5c034a435782fbd5b5f8314c9a3ef071424a8158d7f6b"}, - {file = "aiohttp-3.8.5-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:841cd8233cbd2111a0ef0a522ce016357c5e3aff8a8ce92bcfa14cef890d698f"}, - {file = "aiohttp-3.8.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5ed1c46fb119f1b59304b5ec89f834f07124cd23ae5b74288e364477641060ff"}, - {file = "aiohttp-3.8.5-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:84f8ae3e09a34f35c18fa57f015cc394bd1389bce02503fb30c394d04ee6b938"}, - {file = "aiohttp-3.8.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:62360cb771707cb70a6fd114b9871d20d7dd2163a0feafe43fd115cfe4fe845e"}, - {file = "aiohttp-3.8.5-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:23fb25a9f0a1ca1f24c0a371523546366bb642397c94ab45ad3aedf2941cec6a"}, - {file = "aiohttp-3.8.5-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:b0ba0d15164eae3d878260d4c4df859bbdc6466e9e6689c344a13334f988bb53"}, - {file = "aiohttp-3.8.5-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:5d20003b635fc6ae3f96d7260281dfaf1894fc3aa24d1888a9b2628e97c241e5"}, - {file = "aiohttp-3.8.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:0175d745d9e85c40dcc51c8f88c74bfbaef9e7afeeeb9d03c37977270303064c"}, - {file = "aiohttp-3.8.5-cp311-cp311-win32.whl", hash = "sha256:2e1b1e51b0774408f091d268648e3d57f7260c1682e7d3a63cb00d22d71bb945"}, - {file = "aiohttp-3.8.5-cp311-cp311-win_amd64.whl", hash = "sha256:043d2299f6dfdc92f0ac5e995dfc56668e1587cea7f9aa9d8a78a1b6554e5755"}, - {file = "aiohttp-3.8.5-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:cae533195e8122584ec87531d6df000ad07737eaa3c81209e85c928854d2195c"}, - {file = "aiohttp-3.8.5-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4f21e83f355643c345177a5d1d8079f9f28b5133bcd154193b799d380331d5d3"}, - {file = "aiohttp-3.8.5-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a7a75ef35f2df54ad55dbf4b73fe1da96f370e51b10c91f08b19603c64004acc"}, - {file = "aiohttp-3.8.5-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2e2e9839e14dd5308ee773c97115f1e0a1cb1d75cbeeee9f33824fa5144c7634"}, - {file = "aiohttp-3.8.5-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c44e65da1de4403d0576473e2344828ef9c4c6244d65cf4b75549bb46d40b8dd"}, - {file = "aiohttp-3.8.5-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:78d847e4cde6ecc19125ccbc9bfac4a7ab37c234dd88fbb3c5c524e8e14da543"}, - {file = "aiohttp-3.8.5-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:c7a815258e5895d8900aec4454f38dca9aed71085f227537208057853f9d13f2"}, - {file = "aiohttp-3.8.5-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:8b929b9bd7cd7c3939f8bcfffa92fae7480bd1aa425279d51a89327d600c704d"}, - {file = "aiohttp-3.8.5-cp36-cp36m-musllinux_1_1_ppc64le.whl", hash = "sha256:5db3a5b833764280ed7618393832e0853e40f3d3e9aa128ac0ba0f8278d08649"}, - {file = "aiohttp-3.8.5-cp36-cp36m-musllinux_1_1_s390x.whl", hash = "sha256:a0215ce6041d501f3155dc219712bc41252d0ab76474615b9700d63d4d9292af"}, - {file = "aiohttp-3.8.5-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:fd1ed388ea7fbed22c4968dd64bab0198de60750a25fe8c0c9d4bef5abe13824"}, - {file = 
"aiohttp-3.8.5-cp36-cp36m-win32.whl", hash = "sha256:6e6783bcc45f397fdebc118d772103d751b54cddf5b60fbcc958382d7dd64f3e"}, - {file = "aiohttp-3.8.5-cp36-cp36m-win_amd64.whl", hash = "sha256:b5411d82cddd212644cf9360879eb5080f0d5f7d809d03262c50dad02f01421a"}, - {file = "aiohttp-3.8.5-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:01d4c0c874aa4ddfb8098e85d10b5e875a70adc63db91f1ae65a4b04d3344cda"}, - {file = "aiohttp-3.8.5-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e5980a746d547a6ba173fd5ee85ce9077e72d118758db05d229044b469d9029a"}, - {file = "aiohttp-3.8.5-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2a482e6da906d5e6e653be079b29bc173a48e381600161c9932d89dfae5942ef"}, - {file = "aiohttp-3.8.5-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:80bd372b8d0715c66c974cf57fe363621a02f359f1ec81cba97366948c7fc873"}, - {file = "aiohttp-3.8.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c1161b345c0a444ebcf46bf0a740ba5dcf50612fd3d0528883fdc0eff578006a"}, - {file = "aiohttp-3.8.5-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cd56db019015b6acfaaf92e1ac40eb8434847d9bf88b4be4efe5bfd260aee692"}, - {file = "aiohttp-3.8.5-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:153c2549f6c004d2754cc60603d4668899c9895b8a89397444a9c4efa282aaf4"}, - {file = "aiohttp-3.8.5-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:4a01951fabc4ce26ab791da5f3f24dca6d9a6f24121746eb19756416ff2d881b"}, - {file = "aiohttp-3.8.5-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:bfb9162dcf01f615462b995a516ba03e769de0789de1cadc0f916265c257e5d8"}, - {file = "aiohttp-3.8.5-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:7dde0009408969a43b04c16cbbe252c4f5ef4574ac226bc8815cd7342d2028b6"}, - {file = "aiohttp-3.8.5-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:4149d34c32f9638f38f544b3977a4c24052042affa895352d3636fa8bffd030a"}, - {file = "aiohttp-3.8.5-cp37-cp37m-win32.whl", hash = "sha256:68c5a82c8779bdfc6367c967a4a1b2aa52cd3595388bf5961a62158ee8a59e22"}, - {file = "aiohttp-3.8.5-cp37-cp37m-win_amd64.whl", hash = "sha256:2cf57fb50be5f52bda004b8893e63b48530ed9f0d6c96c84620dc92fe3cd9b9d"}, - {file = "aiohttp-3.8.5-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:eca4bf3734c541dc4f374ad6010a68ff6c6748f00451707f39857f429ca36ced"}, - {file = "aiohttp-3.8.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1274477e4c71ce8cfe6c1ec2f806d57c015ebf84d83373676036e256bc55d690"}, - {file = "aiohttp-3.8.5-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:28c543e54710d6158fc6f439296c7865b29e0b616629767e685a7185fab4a6b9"}, - {file = "aiohttp-3.8.5-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:910bec0c49637d213f5d9877105d26e0c4a4de2f8b1b29405ff37e9fc0ad52b8"}, - {file = "aiohttp-3.8.5-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5443910d662db951b2e58eb70b0fbe6b6e2ae613477129a5805d0b66c54b6cb7"}, - {file = "aiohttp-3.8.5-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2e460be6978fc24e3df83193dc0cc4de46c9909ed92dd47d349a452ef49325b7"}, - {file = "aiohttp-3.8.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fb1558def481d84f03b45888473fc5a1f35747b5f334ef4e7a571bc0dfcb11f8"}, - {file = "aiohttp-3.8.5-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:34dd0c107799dcbbf7d48b53be761a013c0adf5571bf50c4ecad5643fe9cfcd0"}, - {file = "aiohttp-3.8.5-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:aa1990247f02a54185dc0dff92a6904521172a22664c863a03ff64c42f9b5410"}, - {file = "aiohttp-3.8.5-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:0e584a10f204a617d71d359fe383406305a4b595b333721fa50b867b4a0a1548"}, - {file = "aiohttp-3.8.5-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:a3cf433f127efa43fee6b90ea4c6edf6c4a17109d1d037d1a52abec84d8f2e42"}, - {file = "aiohttp-3.8.5-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:c11f5b099adafb18e65c2c997d57108b5bbeaa9eeee64a84302c0978b1ec948b"}, - {file = "aiohttp-3.8.5-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:84de26ddf621d7ac4c975dbea4c945860e08cccde492269db4e1538a6a6f3c35"}, - {file = "aiohttp-3.8.5-cp38-cp38-win32.whl", hash = "sha256:ab88bafedc57dd0aab55fa728ea10c1911f7e4d8b43e1d838a1739f33712921c"}, - {file = "aiohttp-3.8.5-cp38-cp38-win_amd64.whl", hash = "sha256:5798a9aad1879f626589f3df0f8b79b3608a92e9beab10e5fda02c8a2c60db2e"}, - {file = "aiohttp-3.8.5-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:a6ce61195c6a19c785df04e71a4537e29eaa2c50fe745b732aa937c0c77169f3"}, - {file = "aiohttp-3.8.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:773dd01706d4db536335fcfae6ea2440a70ceb03dd3e7378f3e815b03c97ab51"}, - {file = "aiohttp-3.8.5-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:f83a552443a526ea38d064588613aca983d0ee0038801bc93c0c916428310c28"}, - {file = "aiohttp-3.8.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1f7372f7341fcc16f57b2caded43e81ddd18df53320b6f9f042acad41f8e049a"}, - {file = "aiohttp-3.8.5-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ea353162f249c8097ea63c2169dd1aa55de1e8fecbe63412a9bc50816e87b761"}, - {file = "aiohttp-3.8.5-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e5d47ae48db0b2dcf70bc8a3bc72b3de86e2a590fc299fdbbb15af320d2659de"}, - {file = "aiohttp-3.8.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d827176898a2b0b09694fbd1088c7a31836d1a505c243811c87ae53a3f6273c1"}, - {file = "aiohttp-3.8.5-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3562b06567c06439d8b447037bb655ef69786c590b1de86c7ab81efe1c9c15d8"}, - {file = "aiohttp-3.8.5-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:4e874cbf8caf8959d2adf572a78bba17cb0e9d7e51bb83d86a3697b686a0ab4d"}, - {file = "aiohttp-3.8.5-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:6809a00deaf3810e38c628e9a33271892f815b853605a936e2e9e5129762356c"}, - {file = "aiohttp-3.8.5-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:33776e945d89b29251b33a7e7d006ce86447b2cfd66db5e5ded4e5cd0340585c"}, - {file = "aiohttp-3.8.5-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:eaeed7abfb5d64c539e2db173f63631455f1196c37d9d8d873fc316470dfbacd"}, - {file = "aiohttp-3.8.5-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:e91d635961bec2d8f19dfeb41a539eb94bd073f075ca6dae6c8dc0ee89ad6f91"}, - {file = "aiohttp-3.8.5-cp39-cp39-win32.whl", hash = "sha256:00ad4b6f185ec67f3e6562e8a1d2b69660be43070bd0ef6fcec5211154c7df67"}, - {file = "aiohttp-3.8.5-cp39-cp39-win_amd64.whl", hash = "sha256:c0a9034379a37ae42dea7ac1e048352d96286626251862e448933c0f59cbd79c"}, - {file = "aiohttp-3.8.5.tar.gz", hash = "sha256:b9552ec52cc147dbf1944ac7ac98af7602e51ea2dcd076ed194ca3c0d1c7d0bc"}, + {file = "aiohttp-3.11.16-cp310-cp310-macosx_10_9_universal2.whl", hash = 
"sha256:fb46bb0f24813e6cede6cc07b1961d4b04f331f7112a23b5e21f567da4ee50aa"}, + {file = "aiohttp-3.11.16-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:54eb3aead72a5c19fad07219acd882c1643a1027fbcdefac9b502c267242f955"}, + {file = "aiohttp-3.11.16-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:38bea84ee4fe24ebcc8edeb7b54bf20f06fd53ce4d2cc8b74344c5b9620597fd"}, + {file = "aiohttp-3.11.16-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d0666afbe984f6933fe72cd1f1c3560d8c55880a0bdd728ad774006eb4241ecd"}, + {file = "aiohttp-3.11.16-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7ba92a2d9ace559a0a14b03d87f47e021e4fa7681dc6970ebbc7b447c7d4b7cd"}, + {file = "aiohttp-3.11.16-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3ad1d59fd7114e6a08c4814983bb498f391c699f3c78712770077518cae63ff7"}, + {file = "aiohttp-3.11.16-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:98b88a2bf26965f2015a771381624dd4b0839034b70d406dc74fd8be4cc053e3"}, + {file = "aiohttp-3.11.16-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:576f5ca28d1b3276026f7df3ec841ae460e0fc3aac2a47cbf72eabcfc0f102e1"}, + {file = "aiohttp-3.11.16-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:a2a450bcce4931b295fc0848f384834c3f9b00edfc2150baafb4488c27953de6"}, + {file = "aiohttp-3.11.16-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:37dcee4906454ae377be5937ab2a66a9a88377b11dd7c072df7a7c142b63c37c"}, + {file = "aiohttp-3.11.16-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:4d0c970c0d602b1017e2067ff3b7dac41c98fef4f7472ec2ea26fd8a4e8c2149"}, + {file = "aiohttp-3.11.16-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:004511d3413737700835e949433536a2fe95a7d0297edd911a1e9705c5b5ea43"}, + {file = "aiohttp-3.11.16-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:c15b2271c44da77ee9d822552201180779e5e942f3a71fb74e026bf6172ff287"}, + {file = "aiohttp-3.11.16-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:ad9509ffb2396483ceacb1eee9134724443ee45b92141105a4645857244aecc8"}, + {file = "aiohttp-3.11.16-cp310-cp310-win32.whl", hash = "sha256:634d96869be6c4dc232fc503e03e40c42d32cfaa51712aee181e922e61d74814"}, + {file = "aiohttp-3.11.16-cp310-cp310-win_amd64.whl", hash = "sha256:938f756c2b9374bbcc262a37eea521d8a0e6458162f2a9c26329cc87fdf06534"}, + {file = "aiohttp-3.11.16-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:8cb0688a8d81c63d716e867d59a9ccc389e97ac7037ebef904c2b89334407180"}, + {file = "aiohttp-3.11.16-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:0ad1fb47da60ae1ddfb316f0ff16d1f3b8e844d1a1e154641928ea0583d486ed"}, + {file = "aiohttp-3.11.16-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:df7db76400bf46ec6a0a73192b14c8295bdb9812053f4fe53f4e789f3ea66bbb"}, + {file = "aiohttp-3.11.16-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cc3a145479a76ad0ed646434d09216d33d08eef0d8c9a11f5ae5cdc37caa3540"}, + {file = "aiohttp-3.11.16-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d007aa39a52d62373bd23428ba4a2546eed0e7643d7bf2e41ddcefd54519842c"}, + {file = "aiohttp-3.11.16-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f6ddd90d9fb4b501c97a4458f1c1720e42432c26cb76d28177c5b5ad4e332601"}, + {file = "aiohttp-3.11.16-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0a2f451849e6b39e5c226803dcacfa9c7133e9825dcefd2f4e837a2ec5a3bb98"}, + 
{file = "aiohttp-3.11.16-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8df6612df74409080575dca38a5237282865408016e65636a76a2eb9348c2567"}, + {file = "aiohttp-3.11.16-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:78e6e23b954644737e385befa0deb20233e2dfddf95dd11e9db752bdd2a294d3"}, + {file = "aiohttp-3.11.16-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:696ef00e8a1f0cec5e30640e64eca75d8e777933d1438f4facc9c0cdf288a810"}, + {file = "aiohttp-3.11.16-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:e3538bc9fe1b902bef51372462e3d7c96fce2b566642512138a480b7adc9d508"}, + {file = "aiohttp-3.11.16-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:3ab3367bb7f61ad18793fea2ef71f2d181c528c87948638366bf1de26e239183"}, + {file = "aiohttp-3.11.16-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:56a3443aca82abda0e07be2e1ecb76a050714faf2be84256dae291182ba59049"}, + {file = "aiohttp-3.11.16-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:61c721764e41af907c9d16b6daa05a458f066015abd35923051be8705108ed17"}, + {file = "aiohttp-3.11.16-cp311-cp311-win32.whl", hash = "sha256:3e061b09f6fa42997cf627307f220315e313ece74907d35776ec4373ed718b86"}, + {file = "aiohttp-3.11.16-cp311-cp311-win_amd64.whl", hash = "sha256:745f1ed5e2c687baefc3c5e7b4304e91bf3e2f32834d07baaee243e349624b24"}, + {file = "aiohttp-3.11.16-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:911a6e91d08bb2c72938bc17f0a2d97864c531536b7832abee6429d5296e5b27"}, + {file = "aiohttp-3.11.16-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:6ac13b71761e49d5f9e4d05d33683bbafef753e876e8e5a7ef26e937dd766713"}, + {file = "aiohttp-3.11.16-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:fd36c119c5d6551bce374fcb5c19269638f8d09862445f85a5a48596fd59f4bb"}, + {file = "aiohttp-3.11.16-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d489d9778522fbd0f8d6a5c6e48e3514f11be81cb0a5954bdda06f7e1594b321"}, + {file = "aiohttp-3.11.16-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:69a2cbd61788d26f8f1e626e188044834f37f6ae3f937bd9f08b65fc9d7e514e"}, + {file = "aiohttp-3.11.16-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cd464ba806e27ee24a91362ba3621bfc39dbbb8b79f2e1340201615197370f7c"}, + {file = "aiohttp-3.11.16-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1ce63ae04719513dd2651202352a2beb9f67f55cb8490c40f056cea3c5c355ce"}, + {file = "aiohttp-3.11.16-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:09b00dd520d88eac9d1768439a59ab3d145065c91a8fab97f900d1b5f802895e"}, + {file = "aiohttp-3.11.16-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:7f6428fee52d2bcf96a8aa7b62095b190ee341ab0e6b1bcf50c615d7966fd45b"}, + {file = "aiohttp-3.11.16-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:13ceac2c5cdcc3f64b9015710221ddf81c900c5febc505dbd8f810e770011540"}, + {file = "aiohttp-3.11.16-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:fadbb8f1d4140825069db3fedbbb843290fd5f5bc0a5dbd7eaf81d91bf1b003b"}, + {file = "aiohttp-3.11.16-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:6a792ce34b999fbe04a7a71a90c74f10c57ae4c51f65461a411faa70e154154e"}, + {file = "aiohttp-3.11.16-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:f4065145bf69de124accdd17ea5f4dc770da0a6a6e440c53f6e0a8c27b3e635c"}, + {file = "aiohttp-3.11.16-cp312-cp312-musllinux_1_2_x86_64.whl", hash = 
"sha256:fa73e8c2656a3653ae6c307b3f4e878a21f87859a9afab228280ddccd7369d71"}, + {file = "aiohttp-3.11.16-cp312-cp312-win32.whl", hash = "sha256:f244b8e541f414664889e2c87cac11a07b918cb4b540c36f7ada7bfa76571ea2"}, + {file = "aiohttp-3.11.16-cp312-cp312-win_amd64.whl", hash = "sha256:23a15727fbfccab973343b6d1b7181bfb0b4aa7ae280f36fd2f90f5476805682"}, + {file = "aiohttp-3.11.16-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:a3814760a1a700f3cfd2f977249f1032301d0a12c92aba74605cfa6ce9f78489"}, + {file = "aiohttp-3.11.16-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:9b751a6306f330801665ae69270a8a3993654a85569b3469662efaad6cf5cc50"}, + {file = "aiohttp-3.11.16-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:ad497f38a0d6c329cb621774788583ee12321863cd4bd9feee1effd60f2ad133"}, + {file = "aiohttp-3.11.16-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ca37057625693d097543bd88076ceebeb248291df9d6ca8481349efc0b05dcd0"}, + {file = "aiohttp-3.11.16-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a5abcbba9f4b463a45c8ca8b7720891200658f6f46894f79517e6cd11f3405ca"}, + {file = "aiohttp-3.11.16-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f420bfe862fb357a6d76f2065447ef6f484bc489292ac91e29bc65d2d7a2c84d"}, + {file = "aiohttp-3.11.16-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:58ede86453a6cf2d6ce40ef0ca15481677a66950e73b0a788917916f7e35a0bb"}, + {file = "aiohttp-3.11.16-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6fdec0213244c39973674ca2a7f5435bf74369e7d4e104d6c7473c81c9bcc8c4"}, + {file = "aiohttp-3.11.16-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:72b1b03fb4655c1960403c131740755ec19c5898c82abd3961c364c2afd59fe7"}, + {file = "aiohttp-3.11.16-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:780df0d837276276226a1ff803f8d0fa5f8996c479aeef52eb040179f3156cbd"}, + {file = "aiohttp-3.11.16-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:ecdb8173e6c7aa09eee342ac62e193e6904923bd232e76b4157ac0bfa670609f"}, + {file = "aiohttp-3.11.16-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:a6db7458ab89c7d80bc1f4e930cc9df6edee2200127cfa6f6e080cf619eddfbd"}, + {file = "aiohttp-3.11.16-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:2540ddc83cc724b13d1838026f6a5ad178510953302a49e6d647f6e1de82bc34"}, + {file = "aiohttp-3.11.16-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:3b4e6db8dc4879015b9955778cfb9881897339c8fab7b3676f8433f849425913"}, + {file = "aiohttp-3.11.16-cp313-cp313-win32.whl", hash = "sha256:493910ceb2764f792db4dc6e8e4b375dae1b08f72e18e8f10f18b34ca17d0979"}, + {file = "aiohttp-3.11.16-cp313-cp313-win_amd64.whl", hash = "sha256:42864e70a248f5f6a49fdaf417d9bc62d6e4d8ee9695b24c5916cb4bb666c802"}, + {file = "aiohttp-3.11.16-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:bbcba75fe879ad6fd2e0d6a8d937f34a571f116a0e4db37df8079e738ea95c71"}, + {file = "aiohttp-3.11.16-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:87a6e922b2b2401e0b0cf6b976b97f11ec7f136bfed445e16384fbf6fd5e8602"}, + {file = "aiohttp-3.11.16-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ccf10f16ab498d20e28bc2b5c1306e9c1512f2840f7b6a67000a517a4b37d5ee"}, + {file = "aiohttp-3.11.16-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fb3d0cc5cdb926090748ea60172fa8a213cec728bd6c54eae18b96040fcd6227"}, + {file = "aiohttp-3.11.16-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:d07502cc14ecd64f52b2a74ebbc106893d9a9717120057ea9ea1fd6568a747e7"}, + {file = "aiohttp-3.11.16-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:776c8e959a01e5e8321f1dec77964cb6101020a69d5a94cd3d34db6d555e01f7"}, + {file = "aiohttp-3.11.16-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0902e887b0e1d50424112f200eb9ae3dfed6c0d0a19fc60f633ae5a57c809656"}, + {file = "aiohttp-3.11.16-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e87fd812899aa78252866ae03a048e77bd11b80fb4878ce27c23cade239b42b2"}, + {file = "aiohttp-3.11.16-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:0a950c2eb8ff17361abd8c85987fd6076d9f47d040ebffce67dce4993285e973"}, + {file = "aiohttp-3.11.16-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:c10d85e81d0b9ef87970ecbdbfaeec14a361a7fa947118817fcea8e45335fa46"}, + {file = "aiohttp-3.11.16-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:7951decace76a9271a1ef181b04aa77d3cc309a02a51d73826039003210bdc86"}, + {file = "aiohttp-3.11.16-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:14461157d8426bcb40bd94deb0450a6fa16f05129f7da546090cebf8f3123b0f"}, + {file = "aiohttp-3.11.16-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:9756d9b9d4547e091f99d554fbba0d2a920aab98caa82a8fb3d3d9bee3c9ae85"}, + {file = "aiohttp-3.11.16-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:87944bd16b7fe6160607f6a17808abd25f17f61ae1e26c47a491b970fb66d8cb"}, + {file = "aiohttp-3.11.16-cp39-cp39-win32.whl", hash = "sha256:92b7ee222e2b903e0a4b329a9943d432b3767f2d5029dbe4ca59fb75223bbe2e"}, + {file = "aiohttp-3.11.16-cp39-cp39-win_amd64.whl", hash = "sha256:17ae4664031aadfbcb34fd40ffd90976671fa0c0286e6c4113989f78bebab37a"}, + {file = "aiohttp-3.11.16.tar.gz", hash = "sha256:16f8a2c9538c14a557b4d309ed4d0a7c60f0253e8ed7b6c9a2859a7582f8b1b8"}, ] [package.dependencies] +aiohappyeyeballs = ">=2.3.0" aiosignal = ">=1.1.2" -async-timeout = ">=4.0.0a3,<5.0" -asynctest = {version = "0.13.0", markers = "python_version < \"3.8\""} +async-timeout = {version = ">=4.0,<6.0", markers = "python_version < \"3.11\""} attrs = ">=17.3.0" -charset-normalizer = ">=2.0,<4.0" frozenlist = ">=1.1.1" multidict = ">=4.5,<7.0" -typing-extensions = {version = ">=3.7.4", markers = "python_version < \"3.8\""} -yarl = ">=1.0,<2.0" +propcache = ">=0.2.0" +yarl = ">=1.17.0,<2.0" [package.extras] -speedups = ["Brotli", "aiodns", "cchardet"] +speedups = ["Brotli", "aiodns (>=3.2.0)", "brotlicffi"] [[package]] name = "aiosignal" -version = "1.3.1" +version = "1.3.2" description = "aiosignal: a list of registered asynchronous callbacks" optional = false -python-versions = ">=3.7" +python-versions = ">=3.9" +groups = ["main"] files = [ - {file = "aiosignal-1.3.1-py3-none-any.whl", hash = "sha256:f8376fb07dd1e86a584e4fcdec80b36b7f81aac666ebc724e2c090300dd83b17"}, - {file = "aiosignal-1.3.1.tar.gz", hash = "sha256:54cd96e15e1649b75d6c87526a6ff0b6c1b0dd3459f43d9ca11d48c339b68cfc"}, + {file = "aiosignal-1.3.2-py2.py3-none-any.whl", hash = "sha256:45cde58e409a301715980c2b01d0c28bdde3770d8290b5eb2173759d9acb31a5"}, + {file = "aiosignal-1.3.2.tar.gz", hash = "sha256:a8c255c66fafb1e499c9351d0bf32ff2d8a0321595ebac3b93713656d2436f54"}, ] [package.dependencies] @@ -126,167 +133,161 @@ frozenlist = ">=1.1.0" [[package]] name = "annotated-types" -version = "0.5.0" +version = "0.7.0" description = "Reusable constraint types to use with typing.Annotated" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" +groups = 
["main"] files = [ - {file = "annotated_types-0.5.0-py3-none-any.whl", hash = "sha256:58da39888f92c276ad970249761ebea80ba544b77acddaa1a4d6cf78287d45fd"}, - {file = "annotated_types-0.5.0.tar.gz", hash = "sha256:47cdc3490d9ac1506ce92c7aaa76c579dc3509ff11e098fc867e5130ab7be802"}, + {file = "annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53"}, + {file = "annotated_types-0.7.0.tar.gz", hash = "sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89"}, ] -[package.dependencies] -typing-extensions = {version = ">=4.0.0", markers = "python_version < \"3.9\""} - [[package]] name = "async-timeout" -version = "4.0.3" +version = "5.0.1" description = "Timeout context manager for asyncio programs" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" +groups = ["main"] +markers = "python_version < \"3.11\"" files = [ - {file = "async-timeout-4.0.3.tar.gz", hash = "sha256:4640d96be84d82d02ed59ea2b7105a0f7b33abe8703703cd0ab0bf87c427522f"}, - {file = "async_timeout-4.0.3-py3-none-any.whl", hash = "sha256:7405140ff1230c310e51dc27b3145b9092d659ce68ff733fb0cefe3ee42be028"}, -] - -[package.dependencies] -typing-extensions = {version = ">=3.6.5", markers = "python_version < \"3.8\""} - -[[package]] -name = "asynctest" -version = "0.13.0" -description = "Enhance the standard unittest package with features for testing asyncio libraries" -optional = false -python-versions = ">=3.5" -files = [ - {file = "asynctest-0.13.0-py3-none-any.whl", hash = "sha256:5da6118a7e6d6b54d83a8f7197769d046922a44d2a99c21382f0a6e4fadae676"}, - {file = "asynctest-0.13.0.tar.gz", hash = "sha256:c27862842d15d83e6a34eb0b2866c323880eb3a75e4485b079ea11748fd77fac"}, -] - -[[package]] -name = "atomicwrites" -version = "1.4.1" -description = "Atomic file writes." 
-optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" -files = [ - {file = "atomicwrites-1.4.1.tar.gz", hash = "sha256:81b2c9071a49367a7f770170e5eec8cb66567cfbbc8c73d20ce5ca4a8d71cf11"}, + {file = "async_timeout-5.0.1-py3-none-any.whl", hash = "sha256:39e3809566ff85354557ec2398b55e096c8364bacac9405a7a1fa429e77fe76c"}, + {file = "async_timeout-5.0.1.tar.gz", hash = "sha256:d9321a7a3d5a6a5e187e824d2fa0793ce379a202935782d555d6e9d2735677d3"}, ] [[package]] name = "attrs" -version = "23.1.0" +version = "25.3.0" description = "Classes Without Boilerplate" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" +groups = ["main"] files = [ - {file = "attrs-23.1.0-py3-none-any.whl", hash = "sha256:1f28b4522cdc2fb4256ac1a020c78acf9cba2c6b461ccd2c126f3aa8e8335d04"}, - {file = "attrs-23.1.0.tar.gz", hash = "sha256:6279836d581513a26f1bf235f9acd333bc9115683f14f7e8fae46c98fc50e015"}, + {file = "attrs-25.3.0-py3-none-any.whl", hash = "sha256:427318ce031701fea540783410126f03899a97ffc6f61596ad581ac2e40e3bc3"}, + {file = "attrs-25.3.0.tar.gz", hash = "sha256:75d7cefc7fb576747b2c81b4442d4d4a1ce0900973527c011d1030fd3bf4af1b"}, ] -[package.dependencies] -importlib-metadata = {version = "*", markers = "python_version < \"3.8\""} - [package.extras] -cov = ["attrs[tests]", "coverage[toml] (>=5.3)"] -dev = ["attrs[docs,tests]", "pre-commit"] -docs = ["furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib-towncrier", "towncrier", "zope-interface"] -tests = ["attrs[tests-no-zope]", "zope-interface"] -tests-no-zope = ["cloudpickle", "hypothesis", "mypy (>=1.1.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] +benchmark = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-codspeed", "pytest-mypy-plugins", "pytest-xdist[psutil]"] +cov = ["cloudpickle", "coverage[toml] (>=5.3)", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] +dev = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pre-commit-uv", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] +docs = ["cogapp", "furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib-towncrier", "towncrier"] +tests = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] +tests-mypy = ["mypy (>=1.11.1)", "pytest-mypy-plugins"] [[package]] name = "certifi" -version = "2023.7.22" +version = "2025.1.31" description = "Python package for providing Mozilla's CA Bundle." optional = false python-versions = ">=3.6" +groups = ["main"] files = [ - {file = "certifi-2023.7.22-py3-none-any.whl", hash = "sha256:92d6037539857d8206b8f6ae472e8b77db8058fec5937a1ef3f54304089edbb9"}, - {file = "certifi-2023.7.22.tar.gz", hash = "sha256:539cc1d13202e33ca466e88b2807e29f4c13049d6d87031a3c110744495cb082"}, + {file = "certifi-2025.1.31-py3-none-any.whl", hash = "sha256:ca78db4565a652026a4db2bcdf68f2fb589ea80d0be70e03929ed730746b84fe"}, + {file = "certifi-2025.1.31.tar.gz", hash = "sha256:3d5da6925056f6f18f119200434a4780a94263f10d1c21d032a6f6b2baa20651"}, ] [[package]] name = "charset-normalizer" -version = "3.2.0" +version = "3.4.1" description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." 
optional = false -python-versions = ">=3.7.0" +python-versions = ">=3.7" +groups = ["main"] files = [ - {file = "charset-normalizer-3.2.0.tar.gz", hash = "sha256:3bb3d25a8e6c0aedd251753a79ae98a093c7e7b471faa3aa9a93a81431987ace"}, - {file = "charset_normalizer-3.2.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:0b87549028f680ca955556e3bd57013ab47474c3124dc069faa0b6545b6c9710"}, - {file = "charset_normalizer-3.2.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:7c70087bfee18a42b4040bb9ec1ca15a08242cf5867c58726530bdf3945672ed"}, - {file = "charset_normalizer-3.2.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:a103b3a7069b62f5d4890ae1b8f0597618f628b286b03d4bc9195230b154bfa9"}, - {file = "charset_normalizer-3.2.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:94aea8eff76ee6d1cdacb07dd2123a68283cb5569e0250feab1240058f53b623"}, - {file = "charset_normalizer-3.2.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:db901e2ac34c931d73054d9797383d0f8009991e723dab15109740a63e7f902a"}, - {file = "charset_normalizer-3.2.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b0dac0ff919ba34d4df1b6131f59ce95b08b9065233446be7e459f95554c0dc8"}, - {file = "charset_normalizer-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:193cbc708ea3aca45e7221ae58f0fd63f933753a9bfb498a3b474878f12caaad"}, - {file = "charset_normalizer-3.2.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:09393e1b2a9461950b1c9a45d5fd251dc7c6f228acab64da1c9c0165d9c7765c"}, - {file = "charset_normalizer-3.2.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:baacc6aee0b2ef6f3d308e197b5d7a81c0e70b06beae1f1fcacffdbd124fe0e3"}, - {file = "charset_normalizer-3.2.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:bf420121d4c8dce6b889f0e8e4ec0ca34b7f40186203f06a946fa0276ba54029"}, - {file = "charset_normalizer-3.2.0-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:c04a46716adde8d927adb9457bbe39cf473e1e2c2f5d0a16ceb837e5d841ad4f"}, - {file = "charset_normalizer-3.2.0-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:aaf63899c94de41fe3cf934601b0f7ccb6b428c6e4eeb80da72c58eab077b19a"}, - {file = "charset_normalizer-3.2.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:d62e51710986674142526ab9f78663ca2b0726066ae26b78b22e0f5e571238dd"}, - {file = "charset_normalizer-3.2.0-cp310-cp310-win32.whl", hash = "sha256:04e57ab9fbf9607b77f7d057974694b4f6b142da9ed4a199859d9d4d5c63fe96"}, - {file = "charset_normalizer-3.2.0-cp310-cp310-win_amd64.whl", hash = "sha256:48021783bdf96e3d6de03a6e39a1171ed5bd7e8bb93fc84cc649d11490f87cea"}, - {file = "charset_normalizer-3.2.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:4957669ef390f0e6719db3613ab3a7631e68424604a7b448f079bee145da6e09"}, - {file = "charset_normalizer-3.2.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:46fb8c61d794b78ec7134a715a3e564aafc8f6b5e338417cb19fe9f57a5a9bf2"}, - {file = "charset_normalizer-3.2.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f779d3ad205f108d14e99bb3859aa7dd8e9c68874617c72354d7ecaec2a054ac"}, - {file = "charset_normalizer-3.2.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f25c229a6ba38a35ae6e25ca1264621cc25d4d38dca2942a7fce0b67a4efe918"}, - {file = "charset_normalizer-3.2.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2efb1bd13885392adfda4614c33d3b68dee4921fd0ac1d3988f8cbb7d589e72a"}, - 
{file = "charset_normalizer-3.2.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1f30b48dd7fa1474554b0b0f3fdfdd4c13b5c737a3c6284d3cdc424ec0ffff3a"}, - {file = "charset_normalizer-3.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:246de67b99b6851627d945db38147d1b209a899311b1305dd84916f2b88526c6"}, - {file = "charset_normalizer-3.2.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9bd9b3b31adcb054116447ea22caa61a285d92e94d710aa5ec97992ff5eb7cf3"}, - {file = "charset_normalizer-3.2.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:8c2f5e83493748286002f9369f3e6607c565a6a90425a3a1fef5ae32a36d749d"}, - {file = "charset_normalizer-3.2.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:3170c9399da12c9dc66366e9d14da8bf7147e1e9d9ea566067bbce7bb74bd9c2"}, - {file = "charset_normalizer-3.2.0-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:7a4826ad2bd6b07ca615c74ab91f32f6c96d08f6fcc3902ceeedaec8cdc3bcd6"}, - {file = "charset_normalizer-3.2.0-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:3b1613dd5aee995ec6d4c69f00378bbd07614702a315a2cf6c1d21461fe17c23"}, - {file = "charset_normalizer-3.2.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:9e608aafdb55eb9f255034709e20d5a83b6d60c054df0802fa9c9883d0a937aa"}, - {file = "charset_normalizer-3.2.0-cp311-cp311-win32.whl", hash = "sha256:f2a1d0fd4242bd8643ce6f98927cf9c04540af6efa92323e9d3124f57727bfc1"}, - {file = "charset_normalizer-3.2.0-cp311-cp311-win_amd64.whl", hash = "sha256:681eb3d7e02e3c3655d1b16059fbfb605ac464c834a0c629048a30fad2b27489"}, - {file = "charset_normalizer-3.2.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:c57921cda3a80d0f2b8aec7e25c8aa14479ea92b5b51b6876d975d925a2ea346"}, - {file = "charset_normalizer-3.2.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:41b25eaa7d15909cf3ac4c96088c1f266a9a93ec44f87f1d13d4a0e86c81b982"}, - {file = "charset_normalizer-3.2.0-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f058f6963fd82eb143c692cecdc89e075fa0828db2e5b291070485390b2f1c9c"}, - {file = "charset_normalizer-3.2.0-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a7647ebdfb9682b7bb97e2a5e7cb6ae735b1c25008a70b906aecca294ee96cf4"}, - {file = "charset_normalizer-3.2.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eef9df1eefada2c09a5e7a40991b9fc6ac6ef20b1372abd48d2794a316dc0449"}, - {file = "charset_normalizer-3.2.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e03b8895a6990c9ab2cdcd0f2fe44088ca1c65ae592b8f795c3294af00a461c3"}, - {file = "charset_normalizer-3.2.0-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:ee4006268ed33370957f55bf2e6f4d263eaf4dc3cfc473d1d90baff6ed36ce4a"}, - {file = "charset_normalizer-3.2.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:c4983bf937209c57240cff65906b18bb35e64ae872da6a0db937d7b4af845dd7"}, - {file = "charset_normalizer-3.2.0-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:3bb7fda7260735efe66d5107fb7e6af6a7c04c7fce9b2514e04b7a74b06bf5dd"}, - {file = "charset_normalizer-3.2.0-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:72814c01533f51d68702802d74f77ea026b5ec52793c791e2da806a3844a46c3"}, - {file = "charset_normalizer-3.2.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:70c610f6cbe4b9fce272c407dd9d07e33e6bf7b4aa1b7ffb6f6ded8e634e3592"}, - {file = 
"charset_normalizer-3.2.0-cp37-cp37m-win32.whl", hash = "sha256:a401b4598e5d3f4a9a811f3daf42ee2291790c7f9d74b18d75d6e21dda98a1a1"}, - {file = "charset_normalizer-3.2.0-cp37-cp37m-win_amd64.whl", hash = "sha256:c0b21078a4b56965e2b12f247467b234734491897e99c1d51cee628da9786959"}, - {file = "charset_normalizer-3.2.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:95eb302ff792e12aba9a8b8f8474ab229a83c103d74a750ec0bd1c1eea32e669"}, - {file = "charset_normalizer-3.2.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1a100c6d595a7f316f1b6f01d20815d916e75ff98c27a01ae817439ea7726329"}, - {file = "charset_normalizer-3.2.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:6339d047dab2780cc6220f46306628e04d9750f02f983ddb37439ca47ced7149"}, - {file = "charset_normalizer-3.2.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e4b749b9cc6ee664a3300bb3a273c1ca8068c46be705b6c31cf5d276f8628a94"}, - {file = "charset_normalizer-3.2.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a38856a971c602f98472050165cea2cdc97709240373041b69030be15047691f"}, - {file = "charset_normalizer-3.2.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f87f746ee241d30d6ed93969de31e5ffd09a2961a051e60ae6bddde9ec3583aa"}, - {file = "charset_normalizer-3.2.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:89f1b185a01fe560bc8ae5f619e924407efca2191b56ce749ec84982fc59a32a"}, - {file = "charset_normalizer-3.2.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e1c8a2f4c69e08e89632defbfabec2feb8a8d99edc9f89ce33c4b9e36ab63037"}, - {file = "charset_normalizer-3.2.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:2f4ac36d8e2b4cc1aa71df3dd84ff8efbe3bfb97ac41242fbcfc053c67434f46"}, - {file = "charset_normalizer-3.2.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:a386ebe437176aab38c041de1260cd3ea459c6ce5263594399880bbc398225b2"}, - {file = "charset_normalizer-3.2.0-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:ccd16eb18a849fd8dcb23e23380e2f0a354e8daa0c984b8a732d9cfaba3a776d"}, - {file = "charset_normalizer-3.2.0-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:e6a5bf2cba5ae1bb80b154ed68a3cfa2fa00fde979a7f50d6598d3e17d9ac20c"}, - {file = "charset_normalizer-3.2.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:45de3f87179c1823e6d9e32156fb14c1927fcc9aba21433f088fdfb555b77c10"}, - {file = "charset_normalizer-3.2.0-cp38-cp38-win32.whl", hash = "sha256:1000fba1057b92a65daec275aec30586c3de2401ccdcd41f8a5c1e2c87078706"}, - {file = "charset_normalizer-3.2.0-cp38-cp38-win_amd64.whl", hash = "sha256:8b2c760cfc7042b27ebdb4a43a4453bd829a5742503599144d54a032c5dc7e9e"}, - {file = "charset_normalizer-3.2.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:855eafa5d5a2034b4621c74925d89c5efef61418570e5ef9b37717d9c796419c"}, - {file = "charset_normalizer-3.2.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:203f0c8871d5a7987be20c72442488a0b8cfd0f43b7973771640fc593f56321f"}, - {file = "charset_normalizer-3.2.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:e857a2232ba53ae940d3456f7533ce6ca98b81917d47adc3c7fd55dad8fab858"}, - {file = "charset_normalizer-3.2.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5e86d77b090dbddbe78867a0275cb4df08ea195e660f1f7f13435a4649e954e5"}, - {file = "charset_normalizer-3.2.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:c4fb39a81950ec280984b3a44f5bd12819953dc5fa3a7e6fa7a80db5ee853952"}, - {file = "charset_normalizer-3.2.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2dee8e57f052ef5353cf608e0b4c871aee320dd1b87d351c28764fc0ca55f9f4"}, - {file = "charset_normalizer-3.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8700f06d0ce6f128de3ccdbc1acaea1ee264d2caa9ca05daaf492fde7c2a7200"}, - {file = "charset_normalizer-3.2.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1920d4ff15ce893210c1f0c0e9d19bfbecb7983c76b33f046c13a8ffbd570252"}, - {file = "charset_normalizer-3.2.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:c1c76a1743432b4b60ab3358c937a3fe1341c828ae6194108a94c69028247f22"}, - {file = "charset_normalizer-3.2.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:f7560358a6811e52e9c4d142d497f1a6e10103d3a6881f18d04dbce3729c0e2c"}, - {file = "charset_normalizer-3.2.0-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:c8063cf17b19661471ecbdb3df1c84f24ad2e389e326ccaf89e3fb2484d8dd7e"}, - {file = "charset_normalizer-3.2.0-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:cd6dbe0238f7743d0efe563ab46294f54f9bc8f4b9bcf57c3c666cc5bc9d1299"}, - {file = "charset_normalizer-3.2.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:1249cbbf3d3b04902ff081ffbb33ce3377fa6e4c7356f759f3cd076cc138d020"}, - {file = "charset_normalizer-3.2.0-cp39-cp39-win32.whl", hash = "sha256:6c409c0deba34f147f77efaa67b8e4bb83d2f11c8806405f76397ae5b8c0d1c9"}, - {file = "charset_normalizer-3.2.0-cp39-cp39-win_amd64.whl", hash = "sha256:7095f6fbfaa55defb6b733cfeb14efaae7a29f0b59d8cf213be4e7ca0b857b80"}, - {file = "charset_normalizer-3.2.0-py3-none-any.whl", hash = "sha256:8e098148dd37b4ce3baca71fb394c81dc5d9c7728c95df695d2dca218edf40e6"}, + {file = "charset_normalizer-3.4.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:91b36a978b5ae0ee86c394f5a54d6ef44db1de0815eb43de826d41d21e4af3de"}, + {file = "charset_normalizer-3.4.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7461baadb4dc00fd9e0acbe254e3d7d2112e7f92ced2adc96e54ef6501c5f176"}, + {file = "charset_normalizer-3.4.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e218488cd232553829be0664c2292d3af2eeeb94b32bea483cf79ac6a694e037"}, + {file = "charset_normalizer-3.4.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:80ed5e856eb7f30115aaf94e4a08114ccc8813e6ed1b5efa74f9f82e8509858f"}, + {file = "charset_normalizer-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b010a7a4fd316c3c484d482922d13044979e78d1861f0e0650423144c616a46a"}, + {file = "charset_normalizer-3.4.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4532bff1b8421fd0a320463030c7520f56a79c9024a4e88f01c537316019005a"}, + {file = "charset_normalizer-3.4.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:d973f03c0cb71c5ed99037b870f2be986c3c05e63622c017ea9816881d2dd247"}, + {file = "charset_normalizer-3.4.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:3a3bd0dcd373514dcec91c411ddb9632c0d7d92aed7093b8c3bbb6d69ca74408"}, + {file = "charset_normalizer-3.4.1-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:d9c3cdf5390dcd29aa8056d13e8e99526cda0305acc038b96b30352aff5ff2bb"}, + {file = "charset_normalizer-3.4.1-cp310-cp310-musllinux_1_2_s390x.whl", hash = 
"sha256:2bdfe3ac2e1bbe5b59a1a63721eb3b95fc9b6817ae4a46debbb4e11f6232428d"}, + {file = "charset_normalizer-3.4.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:eab677309cdb30d047996b36d34caeda1dc91149e4fdca0b1a039b3f79d9a807"}, + {file = "charset_normalizer-3.4.1-cp310-cp310-win32.whl", hash = "sha256:c0429126cf75e16c4f0ad00ee0eae4242dc652290f940152ca8c75c3a4b6ee8f"}, + {file = "charset_normalizer-3.4.1-cp310-cp310-win_amd64.whl", hash = "sha256:9f0b8b1c6d84c8034a44893aba5e767bf9c7a211e313a9605d9c617d7083829f"}, + {file = "charset_normalizer-3.4.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:8bfa33f4f2672964266e940dd22a195989ba31669bd84629f05fab3ef4e2d125"}, + {file = "charset_normalizer-3.4.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:28bf57629c75e810b6ae989f03c0828d64d6b26a5e205535585f96093e405ed1"}, + {file = "charset_normalizer-3.4.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f08ff5e948271dc7e18a35641d2f11a4cd8dfd5634f55228b691e62b37125eb3"}, + {file = "charset_normalizer-3.4.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:234ac59ea147c59ee4da87a0c0f098e9c8d169f4dc2a159ef720f1a61bbe27cd"}, + {file = "charset_normalizer-3.4.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fd4ec41f914fa74ad1b8304bbc634b3de73d2a0889bd32076342a573e0779e00"}, + {file = "charset_normalizer-3.4.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:eea6ee1db730b3483adf394ea72f808b6e18cf3cb6454b4d86e04fa8c4327a12"}, + {file = "charset_normalizer-3.4.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c96836c97b1238e9c9e3fe90844c947d5afbf4f4c92762679acfe19927d81d77"}, + {file = "charset_normalizer-3.4.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:4d86f7aff21ee58f26dcf5ae81a9addbd914115cdebcbb2217e4f0ed8982e146"}, + {file = "charset_normalizer-3.4.1-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:09b5e6733cbd160dcc09589227187e242a30a49ca5cefa5a7edd3f9d19ed53fd"}, + {file = "charset_normalizer-3.4.1-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:5777ee0881f9499ed0f71cc82cf873d9a0ca8af166dfa0af8ec4e675b7df48e6"}, + {file = "charset_normalizer-3.4.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:237bdbe6159cff53b4f24f397d43c6336c6b0b42affbe857970cefbb620911c8"}, + {file = "charset_normalizer-3.4.1-cp311-cp311-win32.whl", hash = "sha256:8417cb1f36cc0bc7eaba8ccb0e04d55f0ee52df06df3ad55259b9a323555fc8b"}, + {file = "charset_normalizer-3.4.1-cp311-cp311-win_amd64.whl", hash = "sha256:d7f50a1f8c450f3925cb367d011448c39239bb3eb4117c36a6d354794de4ce76"}, + {file = "charset_normalizer-3.4.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:73d94b58ec7fecbc7366247d3b0b10a21681004153238750bb67bd9012414545"}, + {file = "charset_normalizer-3.4.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dad3e487649f498dd991eeb901125411559b22e8d7ab25d3aeb1af367df5efd7"}, + {file = "charset_normalizer-3.4.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c30197aa96e8eed02200a83fba2657b4c3acd0f0aa4bdc9f6c1af8e8962e0757"}, + {file = "charset_normalizer-3.4.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2369eea1ee4a7610a860d88f268eb39b95cb588acd7235e02fd5a5601773d4fa"}, + {file = "charset_normalizer-3.4.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:bc2722592d8998c870fa4e290c2eec2c1569b87fe58618e67d38b4665dfa680d"}, + {file = "charset_normalizer-3.4.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ffc9202a29ab3920fa812879e95a9e78b2465fd10be7fcbd042899695d75e616"}, + {file = "charset_normalizer-3.4.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:804a4d582ba6e5b747c625bf1255e6b1507465494a40a2130978bda7b932c90b"}, + {file = "charset_normalizer-3.4.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:0f55e69f030f7163dffe9fd0752b32f070566451afe180f99dbeeb81f511ad8d"}, + {file = "charset_normalizer-3.4.1-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:c4c3e6da02df6fa1410a7680bd3f63d4f710232d3139089536310d027950696a"}, + {file = "charset_normalizer-3.4.1-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:5df196eb874dae23dcfb968c83d4f8fdccb333330fe1fc278ac5ceeb101003a9"}, + {file = "charset_normalizer-3.4.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:e358e64305fe12299a08e08978f51fc21fac060dcfcddd95453eabe5b93ed0e1"}, + {file = "charset_normalizer-3.4.1-cp312-cp312-win32.whl", hash = "sha256:9b23ca7ef998bc739bf6ffc077c2116917eabcc901f88da1b9856b210ef63f35"}, + {file = "charset_normalizer-3.4.1-cp312-cp312-win_amd64.whl", hash = "sha256:6ff8a4a60c227ad87030d76e99cd1698345d4491638dfa6673027c48b3cd395f"}, + {file = "charset_normalizer-3.4.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:aabfa34badd18f1da5ec1bc2715cadc8dca465868a4e73a0173466b688f29dda"}, + {file = "charset_normalizer-3.4.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:22e14b5d70560b8dd51ec22863f370d1e595ac3d024cb8ad7d308b4cd95f8313"}, + {file = "charset_normalizer-3.4.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8436c508b408b82d87dc5f62496973a1805cd46727c34440b0d29d8a2f50a6c9"}, + {file = "charset_normalizer-3.4.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2d074908e1aecee37a7635990b2c6d504cd4766c7bc9fc86d63f9c09af3fa11b"}, + {file = "charset_normalizer-3.4.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:955f8851919303c92343d2f66165294848d57e9bba6cf6e3625485a70a038d11"}, + {file = "charset_normalizer-3.4.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:44ecbf16649486d4aebafeaa7ec4c9fed8b88101f4dd612dcaf65d5e815f837f"}, + {file = "charset_normalizer-3.4.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:0924e81d3d5e70f8126529951dac65c1010cdf117bb75eb02dd12339b57749dd"}, + {file = "charset_normalizer-3.4.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:2967f74ad52c3b98de4c3b32e1a44e32975e008a9cd2a8cc8966d6a5218c5cb2"}, + {file = "charset_normalizer-3.4.1-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:c75cb2a3e389853835e84a2d8fb2b81a10645b503eca9bcb98df6b5a43eb8886"}, + {file = "charset_normalizer-3.4.1-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:09b26ae6b1abf0d27570633b2b078a2a20419c99d66fb2823173d73f188ce601"}, + {file = "charset_normalizer-3.4.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:fa88b843d6e211393a37219e6a1c1df99d35e8fd90446f1118f4216e307e48cd"}, + {file = "charset_normalizer-3.4.1-cp313-cp313-win32.whl", hash = "sha256:eb8178fe3dba6450a3e024e95ac49ed3400e506fd4e9e5c32d30adda88cbd407"}, + {file = "charset_normalizer-3.4.1-cp313-cp313-win_amd64.whl", hash = "sha256:b1ac5992a838106edb89654e0aebfc24f5848ae2547d22c2c3f66454daa11971"}, + {file = 
"charset_normalizer-3.4.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f30bf9fd9be89ecb2360c7d94a711f00c09b976258846efe40db3d05828e8089"}, + {file = "charset_normalizer-3.4.1-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:97f68b8d6831127e4787ad15e6757232e14e12060bec17091b85eb1486b91d8d"}, + {file = "charset_normalizer-3.4.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7974a0b5ecd505609e3b19742b60cee7aa2aa2fb3151bc917e6e2646d7667dcf"}, + {file = "charset_normalizer-3.4.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fc54db6c8593ef7d4b2a331b58653356cf04f67c960f584edb7c3d8c97e8f39e"}, + {file = "charset_normalizer-3.4.1-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:311f30128d7d333eebd7896965bfcfbd0065f1716ec92bd5638d7748eb6f936a"}, + {file = "charset_normalizer-3.4.1-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:7d053096f67cd1241601111b698f5cad775f97ab25d81567d3f59219b5f1adbd"}, + {file = "charset_normalizer-3.4.1-cp37-cp37m-musllinux_1_2_i686.whl", hash = "sha256:807f52c1f798eef6cf26beb819eeb8819b1622ddfeef9d0977a8502d4db6d534"}, + {file = "charset_normalizer-3.4.1-cp37-cp37m-musllinux_1_2_ppc64le.whl", hash = "sha256:dccbe65bd2f7f7ec22c4ff99ed56faa1e9f785482b9bbd7c717e26fd723a1d1e"}, + {file = "charset_normalizer-3.4.1-cp37-cp37m-musllinux_1_2_s390x.whl", hash = "sha256:2fb9bd477fdea8684f78791a6de97a953c51831ee2981f8e4f583ff3b9d9687e"}, + {file = "charset_normalizer-3.4.1-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:01732659ba9b5b873fc117534143e4feefecf3b2078b0a6a2e925271bb6f4cfa"}, + {file = "charset_normalizer-3.4.1-cp37-cp37m-win32.whl", hash = "sha256:7a4f97a081603d2050bfaffdefa5b02a9ec823f8348a572e39032caa8404a487"}, + {file = "charset_normalizer-3.4.1-cp37-cp37m-win_amd64.whl", hash = "sha256:7b1bef6280950ee6c177b326508f86cad7ad4dff12454483b51d8b7d673a2c5d"}, + {file = "charset_normalizer-3.4.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:ecddf25bee22fe4fe3737a399d0d177d72bc22be6913acfab364b40bce1ba83c"}, + {file = "charset_normalizer-3.4.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8c60ca7339acd497a55b0ea5d506b2a2612afb2826560416f6894e8b5770d4a9"}, + {file = "charset_normalizer-3.4.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b7b2d86dd06bfc2ade3312a83a5c364c7ec2e3498f8734282c6c3d4b07b346b8"}, + {file = "charset_normalizer-3.4.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dd78cfcda14a1ef52584dbb008f7ac81c1328c0f58184bf9a84c49c605002da6"}, + {file = "charset_normalizer-3.4.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6e27f48bcd0957c6d4cb9d6fa6b61d192d0b13d5ef563e5f2ae35feafc0d179c"}, + {file = "charset_normalizer-3.4.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:01ad647cdd609225c5350561d084b42ddf732f4eeefe6e678765636791e78b9a"}, + {file = "charset_normalizer-3.4.1-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:619a609aa74ae43d90ed2e89bdd784765de0a25ca761b93e196d938b8fd1dbbd"}, + {file = "charset_normalizer-3.4.1-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:89149166622f4db9b4b6a449256291dc87a99ee53151c74cbd82a53c8c2f6ccd"}, + {file = "charset_normalizer-3.4.1-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:7709f51f5f7c853f0fb938bcd3bc59cdfdc5203635ffd18bf354f6967ea0f824"}, + {file = 
"charset_normalizer-3.4.1-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:345b0426edd4e18138d6528aed636de7a9ed169b4aaf9d61a8c19e39d26838ca"}, + {file = "charset_normalizer-3.4.1-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:0907f11d019260cdc3f94fbdb23ff9125f6b5d1039b76003b5b0ac9d6a6c9d5b"}, + {file = "charset_normalizer-3.4.1-cp38-cp38-win32.whl", hash = "sha256:ea0d8d539afa5eb2728aa1932a988a9a7af94f18582ffae4bc10b3fbdad0626e"}, + {file = "charset_normalizer-3.4.1-cp38-cp38-win_amd64.whl", hash = "sha256:329ce159e82018d646c7ac45b01a430369d526569ec08516081727a20e9e4af4"}, + {file = "charset_normalizer-3.4.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:b97e690a2118911e39b4042088092771b4ae3fc3aa86518f84b8cf6888dbdb41"}, + {file = "charset_normalizer-3.4.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:78baa6d91634dfb69ec52a463534bc0df05dbd546209b79a3880a34487f4b84f"}, + {file = "charset_normalizer-3.4.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1a2bc9f351a75ef49d664206d51f8e5ede9da246602dc2d2726837620ea034b2"}, + {file = "charset_normalizer-3.4.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:75832c08354f595c760a804588b9357d34ec00ba1c940c15e31e96d902093770"}, + {file = "charset_normalizer-3.4.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0af291f4fe114be0280cdd29d533696a77b5b49cfde5467176ecab32353395c4"}, + {file = "charset_normalizer-3.4.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0167ddc8ab6508fe81860a57dd472b2ef4060e8d378f0cc555707126830f2537"}, + {file = "charset_normalizer-3.4.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:2a75d49014d118e4198bcee5ee0a6f25856b29b12dbf7cd012791f8a6cc5c496"}, + {file = "charset_normalizer-3.4.1-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:363e2f92b0f0174b2f8238240a1a30142e3db7b957a5dd5689b0e75fb717cc78"}, + {file = "charset_normalizer-3.4.1-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:ab36c8eb7e454e34e60eb55ca5d241a5d18b2c6244f6827a30e451c42410b5f7"}, + {file = "charset_normalizer-3.4.1-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:4c0907b1928a36d5a998d72d64d8eaa7244989f7aaaf947500d3a800c83a3fd6"}, + {file = "charset_normalizer-3.4.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:04432ad9479fa40ec0f387795ddad4437a2b50417c69fa275e212933519ff294"}, + {file = "charset_normalizer-3.4.1-cp39-cp39-win32.whl", hash = "sha256:3bed14e9c89dcb10e8f3a29f9ccac4955aebe93c71ae803af79265c9ca5644c5"}, + {file = "charset_normalizer-3.4.1-cp39-cp39-win_amd64.whl", hash = "sha256:49402233c892a461407c512a19435d1ce275543138294f7ef013f0b63d5d3765"}, + {file = "charset_normalizer-3.4.1-py3-none-any.whl", hash = "sha256:d98b1668f06378c6dbefec3b92299716b931cd4e6061f3c875a71ced1780ab85"}, + {file = "charset_normalizer-3.4.1.tar.gz", hash = "sha256:44251f18cd68a75b56585dd00dae26183e102cd5e0f9f1466e6df5da2ed64ea3"}, ] [[package]] @@ -295,78 +296,84 @@ version = "0.4.6" description = "Cross-platform colored terminal text." 
optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" +groups = ["main", "dev"] files = [ {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, ] +markers = {main = "platform_system == \"Windows\"", dev = "sys_platform == \"win32\""} [[package]] name = "coverage" -version = "7.2.7" +version = "7.8.0" description = "Code coverage measurement for Python" optional = false -python-versions = ">=3.7" +python-versions = ">=3.9" +groups = ["dev"] files = [ - {file = "coverage-7.2.7-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d39b5b4f2a66ccae8b7263ac3c8170994b65266797fb96cbbfd3fb5b23921db8"}, - {file = "coverage-7.2.7-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:6d040ef7c9859bb11dfeb056ff5b3872436e3b5e401817d87a31e1750b9ae2fb"}, - {file = "coverage-7.2.7-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ba90a9563ba44a72fda2e85302c3abc71c5589cea608ca16c22b9804262aaeb6"}, - {file = "coverage-7.2.7-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e7d9405291c6928619403db1d10bd07888888ec1abcbd9748fdaa971d7d661b2"}, - {file = "coverage-7.2.7-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:31563e97dae5598556600466ad9beea39fb04e0229e61c12eaa206e0aa202063"}, - {file = "coverage-7.2.7-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:ebba1cd308ef115925421d3e6a586e655ca5a77b5bf41e02eb0e4562a111f2d1"}, - {file = "coverage-7.2.7-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:cb017fd1b2603ef59e374ba2063f593abe0fc45f2ad9abdde5b4d83bd922a353"}, - {file = "coverage-7.2.7-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:d62a5c7dad11015c66fbb9d881bc4caa5b12f16292f857842d9d1871595f4495"}, - {file = "coverage-7.2.7-cp310-cp310-win32.whl", hash = "sha256:ee57190f24fba796e36bb6d3aa8a8783c643d8fa9760c89f7a98ab5455fbf818"}, - {file = "coverage-7.2.7-cp310-cp310-win_amd64.whl", hash = "sha256:f75f7168ab25dd93110c8a8117a22450c19976afbc44234cbf71481094c1b850"}, - {file = "coverage-7.2.7-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:06a9a2be0b5b576c3f18f1a241f0473575c4a26021b52b2a85263a00f034d51f"}, - {file = "coverage-7.2.7-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:5baa06420f837184130752b7c5ea0808762083bf3487b5038d68b012e5937dbe"}, - {file = "coverage-7.2.7-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fdec9e8cbf13a5bf63290fc6013d216a4c7232efb51548594ca3631a7f13c3a3"}, - {file = "coverage-7.2.7-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:52edc1a60c0d34afa421c9c37078817b2e67a392cab17d97283b64c5833f427f"}, - {file = "coverage-7.2.7-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:63426706118b7f5cf6bb6c895dc215d8a418d5952544042c8a2d9fe87fcf09cb"}, - {file = "coverage-7.2.7-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:afb17f84d56068a7c29f5fa37bfd38d5aba69e3304af08ee94da8ed5b0865833"}, - {file = "coverage-7.2.7-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:48c19d2159d433ccc99e729ceae7d5293fbffa0bdb94952d3579983d1c8c9d97"}, - {file = "coverage-7.2.7-cp311-cp311-musllinux_1_1_x86_64.whl", hash = 
"sha256:0e1f928eaf5469c11e886fe0885ad2bf1ec606434e79842a879277895a50942a"}, - {file = "coverage-7.2.7-cp311-cp311-win32.whl", hash = "sha256:33d6d3ea29d5b3a1a632b3c4e4f4ecae24ef170b0b9ee493883f2df10039959a"}, - {file = "coverage-7.2.7-cp311-cp311-win_amd64.whl", hash = "sha256:5b7540161790b2f28143191f5f8ec02fb132660ff175b7747b95dcb77ac26562"}, - {file = "coverage-7.2.7-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:f2f67fe12b22cd130d34d0ef79206061bfb5eda52feb6ce0dba0644e20a03cf4"}, - {file = "coverage-7.2.7-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a342242fe22407f3c17f4b499276a02b01e80f861f1682ad1d95b04018e0c0d4"}, - {file = "coverage-7.2.7-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:171717c7cb6b453aebac9a2ef603699da237f341b38eebfee9be75d27dc38e01"}, - {file = "coverage-7.2.7-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:49969a9f7ffa086d973d91cec8d2e31080436ef0fb4a359cae927e742abfaaa6"}, - {file = "coverage-7.2.7-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:b46517c02ccd08092f4fa99f24c3b83d8f92f739b4657b0f146246a0ca6a831d"}, - {file = "coverage-7.2.7-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:a3d33a6b3eae87ceaefa91ffdc130b5e8536182cd6dfdbfc1aa56b46ff8c86de"}, - {file = "coverage-7.2.7-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:976b9c42fb2a43ebf304fa7d4a310e5f16cc99992f33eced91ef6f908bd8f33d"}, - {file = "coverage-7.2.7-cp312-cp312-win32.whl", hash = "sha256:8de8bb0e5ad103888d65abef8bca41ab93721647590a3f740100cd65c3b00511"}, - {file = "coverage-7.2.7-cp312-cp312-win_amd64.whl", hash = "sha256:9e31cb64d7de6b6f09702bb27c02d1904b3aebfca610c12772452c4e6c21a0d3"}, - {file = "coverage-7.2.7-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:58c2ccc2f00ecb51253cbe5d8d7122a34590fac9646a960d1430d5b15321d95f"}, - {file = "coverage-7.2.7-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d22656368f0e6189e24722214ed8d66b8022db19d182927b9a248a2a8a2f67eb"}, - {file = "coverage-7.2.7-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a895fcc7b15c3fc72beb43cdcbdf0ddb7d2ebc959edac9cef390b0d14f39f8a9"}, - {file = "coverage-7.2.7-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e84606b74eb7de6ff581a7915e2dab7a28a0517fbe1c9239eb227e1354064dcd"}, - {file = "coverage-7.2.7-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:0a5f9e1dbd7fbe30196578ca36f3fba75376fb99888c395c5880b355e2875f8a"}, - {file = "coverage-7.2.7-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:419bfd2caae268623dd469eff96d510a920c90928b60f2073d79f8fe2bbc5959"}, - {file = "coverage-7.2.7-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:2aee274c46590717f38ae5e4650988d1af340fe06167546cc32fe2f58ed05b02"}, - {file = "coverage-7.2.7-cp37-cp37m-win32.whl", hash = "sha256:61b9a528fb348373c433e8966535074b802c7a5d7f23c4f421e6c6e2f1697a6f"}, - {file = "coverage-7.2.7-cp37-cp37m-win_amd64.whl", hash = "sha256:b1c546aca0ca4d028901d825015dc8e4d56aac4b541877690eb76490f1dc8ed0"}, - {file = "coverage-7.2.7-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:54b896376ab563bd38453cecb813c295cf347cf5906e8b41d340b0321a5433e5"}, - {file = "coverage-7.2.7-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:3d376df58cc111dc8e21e3b6e24606b5bb5dee6024f46a5abca99124b2229ef5"}, - {file = 
"coverage-7.2.7-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5e330fc79bd7207e46c7d7fd2bb4af2963f5f635703925543a70b99574b0fea9"}, - {file = "coverage-7.2.7-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1e9d683426464e4a252bf70c3498756055016f99ddaec3774bf368e76bbe02b6"}, - {file = "coverage-7.2.7-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8d13c64ee2d33eccf7437961b6ea7ad8673e2be040b4f7fd4fd4d4d28d9ccb1e"}, - {file = "coverage-7.2.7-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:b7aa5f8a41217360e600da646004f878250a0d6738bcdc11a0a39928d7dc2050"}, - {file = "coverage-7.2.7-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:8fa03bce9bfbeeef9f3b160a8bed39a221d82308b4152b27d82d8daa7041fee5"}, - {file = "coverage-7.2.7-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:245167dd26180ab4c91d5e1496a30be4cd721a5cf2abf52974f965f10f11419f"}, - {file = "coverage-7.2.7-cp38-cp38-win32.whl", hash = "sha256:d2c2db7fd82e9b72937969bceac4d6ca89660db0a0967614ce2481e81a0b771e"}, - {file = "coverage-7.2.7-cp38-cp38-win_amd64.whl", hash = "sha256:2e07b54284e381531c87f785f613b833569c14ecacdcb85d56b25c4622c16c3c"}, - {file = "coverage-7.2.7-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:537891ae8ce59ef63d0123f7ac9e2ae0fc8b72c7ccbe5296fec45fd68967b6c9"}, - {file = "coverage-7.2.7-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:06fb182e69f33f6cd1d39a6c597294cff3143554b64b9825d1dc69d18cc2fff2"}, - {file = "coverage-7.2.7-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:201e7389591af40950a6480bd9edfa8ed04346ff80002cec1a66cac4549c1ad7"}, - {file = "coverage-7.2.7-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f6951407391b639504e3b3be51b7ba5f3528adbf1a8ac3302b687ecababf929e"}, - {file = "coverage-7.2.7-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6f48351d66575f535669306aa7d6d6f71bc43372473b54a832222803eb956fd1"}, - {file = "coverage-7.2.7-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:b29019c76039dc3c0fd815c41392a044ce555d9bcdd38b0fb60fb4cd8e475ba9"}, - {file = "coverage-7.2.7-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:81c13a1fc7468c40f13420732805a4c38a105d89848b7c10af65a90beff25250"}, - {file = "coverage-7.2.7-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:975d70ab7e3c80a3fe86001d8751f6778905ec723f5b110aed1e450da9d4b7f2"}, - {file = "coverage-7.2.7-cp39-cp39-win32.whl", hash = "sha256:7ee7d9d4822c8acc74a5e26c50604dff824710bc8de424904c0982e25c39c6cb"}, - {file = "coverage-7.2.7-cp39-cp39-win_amd64.whl", hash = "sha256:eb393e5ebc85245347950143969b241d08b52b88a3dc39479822e073a1a8eb27"}, - {file = "coverage-7.2.7-pp37.pp38.pp39-none-any.whl", hash = "sha256:b7b4c971f05e6ae490fef852c218b0e79d4e52f79ef0c8475566584a8fb3e01d"}, - {file = "coverage-7.2.7.tar.gz", hash = "sha256:924d94291ca674905fe9481f12294eb11f2d3d3fd1adb20314ba89e94f44ed59"}, + {file = "coverage-7.8.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:2931f66991175369859b5fd58529cd4b73582461877ecfd859b6549869287ffe"}, + {file = "coverage-7.8.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:52a523153c568d2c0ef8826f6cc23031dc86cffb8c6aeab92c4ff776e7951b28"}, + {file = "coverage-7.8.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5c8a5c139aae4c35cbd7cadca1df02ea8cf28a911534fc1b0456acb0b14234f3"}, 
+ {file = "coverage-7.8.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5a26c0c795c3e0b63ec7da6efded5f0bc856d7c0b24b2ac84b4d1d7bc578d676"}, + {file = "coverage-7.8.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:821f7bcbaa84318287115d54becb1915eece6918136c6f91045bb84e2f88739d"}, + {file = "coverage-7.8.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:a321c61477ff8ee705b8a5fed370b5710c56b3a52d17b983d9215861e37b642a"}, + {file = "coverage-7.8.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:ed2144b8a78f9d94d9515963ed273d620e07846acd5d4b0a642d4849e8d91a0c"}, + {file = "coverage-7.8.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:042e7841a26498fff7a37d6fda770d17519982f5b7d8bf5278d140b67b61095f"}, + {file = "coverage-7.8.0-cp310-cp310-win32.whl", hash = "sha256:f9983d01d7705b2d1f7a95e10bbe4091fabc03a46881a256c2787637b087003f"}, + {file = "coverage-7.8.0-cp310-cp310-win_amd64.whl", hash = "sha256:5a570cd9bd20b85d1a0d7b009aaf6c110b52b5755c17be6962f8ccd65d1dbd23"}, + {file = "coverage-7.8.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:e7ac22a0bb2c7c49f441f7a6d46c9c80d96e56f5a8bc6972529ed43c8b694e27"}, + {file = "coverage-7.8.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:bf13d564d310c156d1c8e53877baf2993fb3073b2fc9f69790ca6a732eb4bfea"}, + {file = "coverage-7.8.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a5761c70c017c1b0d21b0815a920ffb94a670c8d5d409d9b38857874c21f70d7"}, + {file = "coverage-7.8.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e5ff52d790c7e1628241ffbcaeb33e07d14b007b6eb00a19320c7b8a7024c040"}, + {file = "coverage-7.8.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d39fc4817fd67b3915256af5dda75fd4ee10621a3d484524487e33416c6f3543"}, + {file = "coverage-7.8.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:b44674870709017e4b4036e3d0d6c17f06a0e6d4436422e0ad29b882c40697d2"}, + {file = "coverage-7.8.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:8f99eb72bf27cbb167b636eb1726f590c00e1ad375002230607a844d9e9a2318"}, + {file = "coverage-7.8.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:b571bf5341ba8c6bc02e0baeaf3b061ab993bf372d982ae509807e7f112554e9"}, + {file = "coverage-7.8.0-cp311-cp311-win32.whl", hash = "sha256:e75a2ad7b647fd8046d58c3132d7eaf31b12d8a53c0e4b21fa9c4d23d6ee6d3c"}, + {file = "coverage-7.8.0-cp311-cp311-win_amd64.whl", hash = "sha256:3043ba1c88b2139126fc72cb48574b90e2e0546d4c78b5299317f61b7f718b78"}, + {file = "coverage-7.8.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:bbb5cc845a0292e0c520656d19d7ce40e18d0e19b22cb3e0409135a575bf79fc"}, + {file = "coverage-7.8.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:4dfd9a93db9e78666d178d4f08a5408aa3f2474ad4d0e0378ed5f2ef71640cb6"}, + {file = "coverage-7.8.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f017a61399f13aa6d1039f75cd467be388d157cd81f1a119b9d9a68ba6f2830d"}, + {file = "coverage-7.8.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0915742f4c82208ebf47a2b154a5334155ed9ef9fe6190674b8a46c2fb89cb05"}, + {file = "coverage-7.8.0-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:8a40fcf208e021eb14b0fac6bdb045c0e0cab53105f93ba0d03fd934c956143a"}, + {file = "coverage-7.8.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:a1f406a8e0995d654b2ad87c62caf6befa767885301f3b8f6f73e6f3c31ec3a6"}, + {file = "coverage-7.8.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:77af0f6447a582fdc7de5e06fa3757a3ef87769fbb0fdbdeba78c23049140a47"}, + {file = "coverage-7.8.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:f2d32f95922927186c6dbc8bc60df0d186b6edb828d299ab10898ef3f40052fe"}, + {file = "coverage-7.8.0-cp312-cp312-win32.whl", hash = "sha256:769773614e676f9d8e8a0980dd7740f09a6ea386d0f383db6821df07d0f08545"}, + {file = "coverage-7.8.0-cp312-cp312-win_amd64.whl", hash = "sha256:e5d2b9be5b0693cf21eb4ce0ec8d211efb43966f6657807f6859aab3814f946b"}, + {file = "coverage-7.8.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:5ac46d0c2dd5820ce93943a501ac5f6548ea81594777ca585bf002aa8854cacd"}, + {file = "coverage-7.8.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:771eb7587a0563ca5bb6f622b9ed7f9d07bd08900f7589b4febff05f469bea00"}, + {file = "coverage-7.8.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42421e04069fb2cbcbca5a696c4050b84a43b05392679d4068acbe65449b5c64"}, + {file = "coverage-7.8.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:554fec1199d93ab30adaa751db68acec2b41c5602ac944bb19187cb9a41a8067"}, + {file = "coverage-7.8.0-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5aaeb00761f985007b38cf463b1d160a14a22c34eb3f6a39d9ad6fc27cb73008"}, + {file = "coverage-7.8.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:581a40c7b94921fffd6457ffe532259813fc68eb2bdda60fa8cc343414ce3733"}, + {file = "coverage-7.8.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:f319bae0321bc838e205bf9e5bc28f0a3165f30c203b610f17ab5552cff90323"}, + {file = "coverage-7.8.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:04bfec25a8ef1c5f41f5e7e5c842f6b615599ca8ba8391ec33a9290d9d2db3a3"}, + {file = "coverage-7.8.0-cp313-cp313-win32.whl", hash = "sha256:dd19608788b50eed889e13a5d71d832edc34fc9dfce606f66e8f9f917eef910d"}, + {file = "coverage-7.8.0-cp313-cp313-win_amd64.whl", hash = "sha256:a9abbccd778d98e9c7e85038e35e91e67f5b520776781d9a1e2ee9d400869487"}, + {file = "coverage-7.8.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:18c5ae6d061ad5b3e7eef4363fb27a0576012a7447af48be6c75b88494c6cf25"}, + {file = "coverage-7.8.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:95aa6ae391a22bbbce1b77ddac846c98c5473de0372ba5c463480043a07bff42"}, + {file = "coverage-7.8.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e013b07ba1c748dacc2a80e69a46286ff145935f260eb8c72df7185bf048f502"}, + {file = "coverage-7.8.0-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d766a4f0e5aa1ba056ec3496243150698dc0481902e2b8559314368717be82b1"}, + {file = "coverage-7.8.0-cp313-cp313t-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ad80e6b4a0c3cb6f10f29ae4c60e991f424e6b14219d46f1e7d442b938ee68a4"}, + {file = "coverage-7.8.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:b87eb6fc9e1bb8f98892a2458781348fa37e6925f35bb6ceb9d4afd54ba36c73"}, + {file = "coverage-7.8.0-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:d1ba00ae33be84066cfbe7361d4e04dec78445b2b88bdb734d0d1cbab916025a"}, + 
{file = "coverage-7.8.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:f3c38e4e5ccbdc9198aecc766cedbb134b2d89bf64533973678dfcf07effd883"}, + {file = "coverage-7.8.0-cp313-cp313t-win32.whl", hash = "sha256:379fe315e206b14e21db5240f89dc0774bdd3e25c3c58c2c733c99eca96f1ada"}, + {file = "coverage-7.8.0-cp313-cp313t-win_amd64.whl", hash = "sha256:2e4b6b87bb0c846a9315e3ab4be2d52fac905100565f4b92f02c445c8799e257"}, + {file = "coverage-7.8.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:fa260de59dfb143af06dcf30c2be0b200bed2a73737a8a59248fcb9fa601ef0f"}, + {file = "coverage-7.8.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:96121edfa4c2dfdda409877ea8608dd01de816a4dc4a0523356067b305e4e17a"}, + {file = "coverage-7.8.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6b8af63b9afa1031c0ef05b217faa598f3069148eeee6bb24b79da9012423b82"}, + {file = "coverage-7.8.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:89b1f4af0d4afe495cd4787a68e00f30f1d15939f550e869de90a86efa7e0814"}, + {file = "coverage-7.8.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:94ec0be97723ae72d63d3aa41961a0b9a6f5a53ff599813c324548d18e3b9e8c"}, + {file = "coverage-7.8.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:8a1d96e780bdb2d0cbb297325711701f7c0b6f89199a57f2049e90064c29f6bd"}, + {file = "coverage-7.8.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:f1d8a2a57b47142b10374902777e798784abf400a004b14f1b0b9eaf1e528ba4"}, + {file = "coverage-7.8.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:cf60dd2696b457b710dd40bf17ad269d5f5457b96442f7f85722bdb16fa6c899"}, + {file = "coverage-7.8.0-cp39-cp39-win32.whl", hash = "sha256:be945402e03de47ba1872cd5236395e0f4ad635526185a930735f66710e1bd3f"}, + {file = "coverage-7.8.0-cp39-cp39-win_amd64.whl", hash = "sha256:90e7fbc6216ecaffa5a880cdc9c77b7418c1dcb166166b78dbc630d07f278cc3"}, + {file = "coverage-7.8.0-pp39.pp310.pp311-none-any.whl", hash = "sha256:b8194fb8e50d556d5849753de991d390c5a1edeeba50f68e3a9253fbd8bf8ccd"}, + {file = "coverage-7.8.0-py3-none-any.whl", hash = "sha256:dbf364b4c5e7bae9250528167dfe40219b62e2d573c854d74be213e1e52069f7"}, + {file = "coverage-7.8.0.tar.gz", hash = "sha256:7a3d62b3b03b4b6fd41a085f3574874cf946cb4604d2b4d3e8dca8cd570ca501"}, ] [package.dependencies] @@ -376,112 +383,150 @@ tomli = {version = "*", optional = true, markers = "python_full_version <= \"3.1 toml = ["tomli"] [[package]] -name = "filelock" -version = "3.12.2" -description = "A platform independent file lock." 
+name = "exceptiongroup" +version = "1.2.2" +description = "Backport of PEP 654 (exception groups)" optional = false python-versions = ">=3.7" +groups = ["dev"] +markers = "python_version < \"3.11\"" files = [ - {file = "filelock-3.12.2-py3-none-any.whl", hash = "sha256:cbb791cdea2a72f23da6ac5b5269ab0a0d161e9ef0100e653b69049a7706d1ec"}, - {file = "filelock-3.12.2.tar.gz", hash = "sha256:002740518d8aa59a26b0c76e10fb8c6e15eae825d34b6fdf670333fd7b938d81"}, + {file = "exceptiongroup-1.2.2-py3-none-any.whl", hash = "sha256:3111b9d131c238bec2f8f516e123e14ba243563fb135d3fe885990585aa7795b"}, + {file = "exceptiongroup-1.2.2.tar.gz", hash = "sha256:47c2edf7c6738fafb49fd34290706d1a1a2f4d1c6df275526b62cbb4aa5393cc"}, ] [package.extras] -docs = ["furo (>=2023.5.20)", "sphinx (>=7.0.1)", "sphinx-autodoc-typehints (>=1.23,!=1.23.4)"] -testing = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "diff-cover (>=7.5)", "pytest (>=7.3.1)", "pytest-cov (>=4.1)", "pytest-mock (>=3.10)", "pytest-timeout (>=2.1)"] +test = ["pytest (>=6)"] + +[[package]] +name = "filelock" +version = "3.18.0" +description = "A platform independent file lock." +optional = false +python-versions = ">=3.9" +groups = ["main"] +files = [ + {file = "filelock-3.18.0-py3-none-any.whl", hash = "sha256:c401f4f8377c4464e6db25fff06205fd89bdd83b65eb0488ed1b160f780e21de"}, + {file = "filelock-3.18.0.tar.gz", hash = "sha256:adbc88eabb99d2fec8c9c1b229b171f18afa655400173ddc653d5d01501fb9f2"}, +] + +[package.extras] +docs = ["furo (>=2024.8.6)", "sphinx (>=8.1.3)", "sphinx-autodoc-typehints (>=3)"] +testing = ["covdefaults (>=2.3)", "coverage (>=7.6.10)", "diff-cover (>=9.2.1)", "pytest (>=8.3.4)", "pytest-asyncio (>=0.25.2)", "pytest-cov (>=6)", "pytest-mock (>=3.14)", "pytest-timeout (>=2.3.1)", "virtualenv (>=20.28.1)"] +typing = ["typing-extensions (>=4.12.2)"] [[package]] name = "frozenlist" -version = "1.3.3" +version = "1.5.0" description = "A list-like structure which implements collections.abc.MutableSequence" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" +groups = ["main"] files = [ - {file = "frozenlist-1.3.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:ff8bf625fe85e119553b5383ba0fb6aa3d0ec2ae980295aaefa552374926b3f4"}, - {file = "frozenlist-1.3.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:dfbac4c2dfcc082fcf8d942d1e49b6aa0766c19d3358bd86e2000bf0fa4a9cf0"}, - {file = "frozenlist-1.3.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:b1c63e8d377d039ac769cd0926558bb7068a1f7abb0f003e3717ee003ad85530"}, - {file = "frozenlist-1.3.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7fdfc24dcfce5b48109867c13b4cb15e4660e7bd7661741a391f821f23dfdca7"}, - {file = "frozenlist-1.3.3-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2c926450857408e42f0bbc295e84395722ce74bae69a3b2aa2a65fe22cb14b99"}, - {file = "frozenlist-1.3.3-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1841e200fdafc3d51f974d9d377c079a0694a8f06de2e67b48150328d66d5483"}, - {file = "frozenlist-1.3.3-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f470c92737afa7d4c3aacc001e335062d582053d4dbe73cda126f2d7031068dd"}, - {file = "frozenlist-1.3.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:783263a4eaad7c49983fe4b2e7b53fa9770c136c270d2d4bbb6d2192bf4d9caf"}, - {file = "frozenlist-1.3.3-cp310-cp310-musllinux_1_1_aarch64.whl", hash = 
"sha256:924620eef691990dfb56dc4709f280f40baee568c794b5c1885800c3ecc69816"}, - {file = "frozenlist-1.3.3-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:ae4dc05c465a08a866b7a1baf360747078b362e6a6dbeb0c57f234db0ef88ae0"}, - {file = "frozenlist-1.3.3-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:bed331fe18f58d844d39ceb398b77d6ac0b010d571cba8267c2e7165806b00ce"}, - {file = "frozenlist-1.3.3-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:02c9ac843e3390826a265e331105efeab489ffaf4dd86384595ee8ce6d35ae7f"}, - {file = "frozenlist-1.3.3-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:9545a33965d0d377b0bc823dcabf26980e77f1b6a7caa368a365a9497fb09420"}, - {file = "frozenlist-1.3.3-cp310-cp310-win32.whl", hash = "sha256:d5cd3ab21acbdb414bb6c31958d7b06b85eeb40f66463c264a9b343a4e238642"}, - {file = "frozenlist-1.3.3-cp310-cp310-win_amd64.whl", hash = "sha256:b756072364347cb6aa5b60f9bc18e94b2f79632de3b0190253ad770c5df17db1"}, - {file = "frozenlist-1.3.3-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:b4395e2f8d83fbe0c627b2b696acce67868793d7d9750e90e39592b3626691b7"}, - {file = "frozenlist-1.3.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:14143ae966a6229350021384870458e4777d1eae4c28d1a7aa47f24d030e6678"}, - {file = "frozenlist-1.3.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:5d8860749e813a6f65bad8285a0520607c9500caa23fea6ee407e63debcdbef6"}, - {file = "frozenlist-1.3.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:23d16d9f477bb55b6154654e0e74557040575d9d19fe78a161bd33d7d76808e8"}, - {file = "frozenlist-1.3.3-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:eb82dbba47a8318e75f679690190c10a5e1f447fbf9df41cbc4c3afd726d88cb"}, - {file = "frozenlist-1.3.3-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9309869032abb23d196cb4e4db574232abe8b8be1339026f489eeb34a4acfd91"}, - {file = "frozenlist-1.3.3-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a97b4fe50b5890d36300820abd305694cb865ddb7885049587a5678215782a6b"}, - {file = "frozenlist-1.3.3-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c188512b43542b1e91cadc3c6c915a82a5eb95929134faf7fd109f14f9892ce4"}, - {file = "frozenlist-1.3.3-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:303e04d422e9b911a09ad499b0368dc551e8c3cd15293c99160c7f1f07b59a48"}, - {file = "frozenlist-1.3.3-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:0771aed7f596c7d73444c847a1c16288937ef988dc04fb9f7be4b2aa91db609d"}, - {file = "frozenlist-1.3.3-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:66080ec69883597e4d026f2f71a231a1ee9887835902dbe6b6467d5a89216cf6"}, - {file = "frozenlist-1.3.3-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:41fe21dc74ad3a779c3d73a2786bdf622ea81234bdd4faf90b8b03cad0c2c0b4"}, - {file = "frozenlist-1.3.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:f20380df709d91525e4bee04746ba612a4df0972c1b8f8e1e8af997e678c7b81"}, - {file = "frozenlist-1.3.3-cp311-cp311-win32.whl", hash = "sha256:f30f1928162e189091cf4d9da2eac617bfe78ef907a761614ff577ef4edfb3c8"}, - {file = "frozenlist-1.3.3-cp311-cp311-win_amd64.whl", hash = "sha256:a6394d7dadd3cfe3f4b3b186e54d5d8504d44f2d58dcc89d693698e8b7132b32"}, - {file = "frozenlist-1.3.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:8df3de3a9ab8325f94f646609a66cbeeede263910c5c0de0101079ad541af332"}, - {file = 
"frozenlist-1.3.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0693c609e9742c66ba4870bcee1ad5ff35462d5ffec18710b4ac89337ff16e27"}, - {file = "frozenlist-1.3.3-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:cd4210baef299717db0a600d7a3cac81d46ef0e007f88c9335db79f8979c0d3d"}, - {file = "frozenlist-1.3.3-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:394c9c242113bfb4b9aa36e2b80a05ffa163a30691c7b5a29eba82e937895d5e"}, - {file = "frozenlist-1.3.3-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6327eb8e419f7d9c38f333cde41b9ae348bec26d840927332f17e887a8dcb70d"}, - {file = "frozenlist-1.3.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2e24900aa13212e75e5b366cb9065e78bbf3893d4baab6052d1aca10d46d944c"}, - {file = "frozenlist-1.3.3-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:3843f84a6c465a36559161e6c59dce2f2ac10943040c2fd021cfb70d58c4ad56"}, - {file = "frozenlist-1.3.3-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:84610c1502b2461255b4c9b7d5e9c48052601a8957cd0aea6ec7a7a1e1fb9420"}, - {file = "frozenlist-1.3.3-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:c21b9aa40e08e4f63a2f92ff3748e6b6c84d717d033c7b3438dd3123ee18f70e"}, - {file = "frozenlist-1.3.3-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:efce6ae830831ab6a22b9b4091d411698145cb9b8fc869e1397ccf4b4b6455cb"}, - {file = "frozenlist-1.3.3-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:40de71985e9042ca00b7953c4f41eabc3dc514a2d1ff534027f091bc74416401"}, - {file = "frozenlist-1.3.3-cp37-cp37m-win32.whl", hash = "sha256:180c00c66bde6146a860cbb81b54ee0df350d2daf13ca85b275123bbf85de18a"}, - {file = "frozenlist-1.3.3-cp37-cp37m-win_amd64.whl", hash = "sha256:9bbbcedd75acdfecf2159663b87f1bb5cfc80e7cd99f7ddd9d66eb98b14a8411"}, - {file = "frozenlist-1.3.3-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:034a5c08d36649591be1cbb10e09da9f531034acfe29275fc5454a3b101ce41a"}, - {file = "frozenlist-1.3.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:ba64dc2b3b7b158c6660d49cdb1d872d1d0bf4e42043ad8d5006099479a194e5"}, - {file = "frozenlist-1.3.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:47df36a9fe24054b950bbc2db630d508cca3aa27ed0566c0baf661225e52c18e"}, - {file = "frozenlist-1.3.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:008a054b75d77c995ea26629ab3a0c0d7281341f2fa7e1e85fa6153ae29ae99c"}, - {file = "frozenlist-1.3.3-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:841ea19b43d438a80b4de62ac6ab21cfe6827bb8a9dc62b896acc88eaf9cecba"}, - {file = "frozenlist-1.3.3-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e235688f42b36be2b6b06fc37ac2126a73b75fb8d6bc66dd632aa35286238703"}, - {file = "frozenlist-1.3.3-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ca713d4af15bae6e5d79b15c10c8522859a9a89d3b361a50b817c98c2fb402a2"}, - {file = "frozenlist-1.3.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9ac5995f2b408017b0be26d4a1d7c61bce106ff3d9e3324374d66b5964325448"}, - {file = "frozenlist-1.3.3-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:a4ae8135b11652b08a8baf07631d3ebfe65a4c87909dbef5fa0cdde440444ee4"}, - {file = "frozenlist-1.3.3-cp38-cp38-musllinux_1_1_i686.whl", hash = 
"sha256:4ea42116ceb6bb16dbb7d526e242cb6747b08b7710d9782aa3d6732bd8d27649"}, - {file = "frozenlist-1.3.3-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:810860bb4bdce7557bc0febb84bbd88198b9dbc2022d8eebe5b3590b2ad6c842"}, - {file = "frozenlist-1.3.3-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:ee78feb9d293c323b59a6f2dd441b63339a30edf35abcb51187d2fc26e696d13"}, - {file = "frozenlist-1.3.3-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:0af2e7c87d35b38732e810befb9d797a99279cbb85374d42ea61c1e9d23094b3"}, - {file = "frozenlist-1.3.3-cp38-cp38-win32.whl", hash = "sha256:899c5e1928eec13fd6f6d8dc51be23f0d09c5281e40d9cf4273d188d9feeaf9b"}, - {file = "frozenlist-1.3.3-cp38-cp38-win_amd64.whl", hash = "sha256:7f44e24fa70f6fbc74aeec3e971f60a14dde85da364aa87f15d1be94ae75aeef"}, - {file = "frozenlist-1.3.3-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:2b07ae0c1edaa0a36339ec6cce700f51b14a3fc6545fdd32930d2c83917332cf"}, - {file = "frozenlist-1.3.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:ebb86518203e12e96af765ee89034a1dbb0c3c65052d1b0c19bbbd6af8a145e1"}, - {file = "frozenlist-1.3.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:5cf820485f1b4c91e0417ea0afd41ce5cf5965011b3c22c400f6d144296ccbc0"}, - {file = "frozenlist-1.3.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5c11e43016b9024240212d2a65043b70ed8dfd3b52678a1271972702d990ac6d"}, - {file = "frozenlist-1.3.3-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8fa3c6e3305aa1146b59a09b32b2e04074945ffcfb2f0931836d103a2c38f936"}, - {file = "frozenlist-1.3.3-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:352bd4c8c72d508778cf05ab491f6ef36149f4d0cb3c56b1b4302852255d05d5"}, - {file = "frozenlist-1.3.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:65a5e4d3aa679610ac6e3569e865425b23b372277f89b5ef06cf2cdaf1ebf22b"}, - {file = "frozenlist-1.3.3-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b1e2c1185858d7e10ff045c496bbf90ae752c28b365fef2c09cf0fa309291669"}, - {file = "frozenlist-1.3.3-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:f163d2fd041c630fed01bc48d28c3ed4a3b003c00acd396900e11ee5316b56bb"}, - {file = "frozenlist-1.3.3-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:05cdb16d09a0832eedf770cb7bd1fe57d8cf4eaf5aced29c4e41e3f20b30a784"}, - {file = "frozenlist-1.3.3-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:8bae29d60768bfa8fb92244b74502b18fae55a80eac13c88eb0b496d4268fd2d"}, - {file = "frozenlist-1.3.3-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:eedab4c310c0299961ac285591acd53dc6723a1ebd90a57207c71f6e0c2153ab"}, - {file = "frozenlist-1.3.3-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:3bbdf44855ed8f0fbcd102ef05ec3012d6a4fd7c7562403f76ce6a52aeffb2b1"}, - {file = "frozenlist-1.3.3-cp39-cp39-win32.whl", hash = "sha256:efa568b885bca461f7c7b9e032655c0c143d305bf01c30caf6db2854a4532b38"}, - {file = "frozenlist-1.3.3-cp39-cp39-win_amd64.whl", hash = "sha256:cfe33efc9cb900a4c46f91a5ceba26d6df370ffddd9ca386eb1d4f0ad97b9ea9"}, - {file = "frozenlist-1.3.3.tar.gz", hash = "sha256:58bcc55721e8a90b88332d6cd441261ebb22342e238296bb330968952fbb3a6a"}, + {file = "frozenlist-1.5.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:5b6a66c18b5b9dd261ca98dffcb826a525334b2f29e7caa54e182255c5f6a65a"}, + {file = "frozenlist-1.5.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = 
"sha256:d1b3eb7b05ea246510b43a7e53ed1653e55c2121019a97e60cad7efb881a97bb"}, + {file = "frozenlist-1.5.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:15538c0cbf0e4fa11d1e3a71f823524b0c46299aed6e10ebb4c2089abd8c3bec"}, + {file = "frozenlist-1.5.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e79225373c317ff1e35f210dd5f1344ff31066ba8067c307ab60254cd3a78ad5"}, + {file = "frozenlist-1.5.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9272fa73ca71266702c4c3e2d4a28553ea03418e591e377a03b8e3659d94fa76"}, + {file = "frozenlist-1.5.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:498524025a5b8ba81695761d78c8dd7382ac0b052f34e66939c42df860b8ff17"}, + {file = "frozenlist-1.5.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:92b5278ed9d50fe610185ecd23c55d8b307d75ca18e94c0e7de328089ac5dcba"}, + {file = "frozenlist-1.5.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7f3c8c1dacd037df16e85227bac13cca58c30da836c6f936ba1df0c05d046d8d"}, + {file = "frozenlist-1.5.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:f2ac49a9bedb996086057b75bf93538240538c6d9b38e57c82d51f75a73409d2"}, + {file = "frozenlist-1.5.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:e66cc454f97053b79c2ab09c17fbe3c825ea6b4de20baf1be28919460dd7877f"}, + {file = "frozenlist-1.5.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:5a3ba5f9a0dfed20337d3e966dc359784c9f96503674c2faf015f7fe8e96798c"}, + {file = "frozenlist-1.5.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:6321899477db90bdeb9299ac3627a6a53c7399c8cd58d25da094007402b039ab"}, + {file = "frozenlist-1.5.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:76e4753701248476e6286f2ef492af900ea67d9706a0155335a40ea21bf3b2f5"}, + {file = "frozenlist-1.5.0-cp310-cp310-win32.whl", hash = "sha256:977701c081c0241d0955c9586ffdd9ce44f7a7795df39b9151cd9a6fd0ce4cfb"}, + {file = "frozenlist-1.5.0-cp310-cp310-win_amd64.whl", hash = "sha256:189f03b53e64144f90990d29a27ec4f7997d91ed3d01b51fa39d2dbe77540fd4"}, + {file = "frozenlist-1.5.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:fd74520371c3c4175142d02a976aee0b4cb4a7cc912a60586ffd8d5929979b30"}, + {file = "frozenlist-1.5.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:2f3f7a0fbc219fb4455264cae4d9f01ad41ae6ee8524500f381de64ffaa077d5"}, + {file = "frozenlist-1.5.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f47c9c9028f55a04ac254346e92977bf0f166c483c74b4232bee19a6697e4778"}, + {file = "frozenlist-1.5.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0996c66760924da6e88922756d99b47512a71cfd45215f3570bf1e0b694c206a"}, + {file = "frozenlist-1.5.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a2fe128eb4edeabe11896cb6af88fca5346059f6c8d807e3b910069f39157869"}, + {file = "frozenlist-1.5.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1a8ea951bbb6cacd492e3948b8da8c502a3f814f5d20935aae74b5df2b19cf3d"}, + {file = "frozenlist-1.5.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:de537c11e4aa01d37db0d403b57bd6f0546e71a82347a97c6a9f0dcc532b3a45"}, + {file = "frozenlist-1.5.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:9c2623347b933fcb9095841f1cc5d4ff0b278addd743e0e966cb3d460278840d"}, + {file = "frozenlist-1.5.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:cee6798eaf8b1416ef6909b06f7dc04b60755206bddc599f52232606e18179d3"}, + {file = "frozenlist-1.5.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:f5f9da7f5dbc00a604fe74aa02ae7c98bcede8a3b8b9666f9f86fc13993bc71a"}, + {file = "frozenlist-1.5.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:90646abbc7a5d5c7c19461d2e3eeb76eb0b204919e6ece342feb6032c9325ae9"}, + {file = "frozenlist-1.5.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:bdac3c7d9b705d253b2ce370fde941836a5f8b3c5c2b8fd70940a3ea3af7f4f2"}, + {file = "frozenlist-1.5.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:03d33c2ddbc1816237a67f66336616416e2bbb6beb306e5f890f2eb22b959cdf"}, + {file = "frozenlist-1.5.0-cp311-cp311-win32.whl", hash = "sha256:237f6b23ee0f44066219dae14c70ae38a63f0440ce6750f868ee08775073f942"}, + {file = "frozenlist-1.5.0-cp311-cp311-win_amd64.whl", hash = "sha256:0cc974cc93d32c42e7b0f6cf242a6bd941c57c61b618e78b6c0a96cb72788c1d"}, + {file = "frozenlist-1.5.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:31115ba75889723431aa9a4e77d5f398f5cf976eea3bdf61749731f62d4a4a21"}, + {file = "frozenlist-1.5.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:7437601c4d89d070eac8323f121fcf25f88674627505334654fd027b091db09d"}, + {file = "frozenlist-1.5.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:7948140d9f8ece1745be806f2bfdf390127cf1a763b925c4a805c603df5e697e"}, + {file = "frozenlist-1.5.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:feeb64bc9bcc6b45c6311c9e9b99406660a9c05ca8a5b30d14a78555088b0b3a"}, + {file = "frozenlist-1.5.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:683173d371daad49cffb8309779e886e59c2f369430ad28fe715f66d08d4ab1a"}, + {file = "frozenlist-1.5.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7d57d8f702221405a9d9b40f9da8ac2e4a1a8b5285aac6100f3393675f0a85ee"}, + {file = "frozenlist-1.5.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:30c72000fbcc35b129cb09956836c7d7abf78ab5416595e4857d1cae8d6251a6"}, + {file = "frozenlist-1.5.0-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:000a77d6034fbad9b6bb880f7ec073027908f1b40254b5d6f26210d2dab1240e"}, + {file = "frozenlist-1.5.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:5d7f5a50342475962eb18b740f3beecc685a15b52c91f7d975257e13e029eca9"}, + {file = "frozenlist-1.5.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:87f724d055eb4785d9be84e9ebf0f24e392ddfad00b3fe036e43f489fafc9039"}, + {file = "frozenlist-1.5.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:6e9080bb2fb195a046e5177f10d9d82b8a204c0736a97a153c2466127de87784"}, + {file = "frozenlist-1.5.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:9b93d7aaa36c966fa42efcaf716e6b3900438632a626fb09c049f6a2f09fc631"}, + {file = "frozenlist-1.5.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:52ef692a4bc60a6dd57f507429636c2af8b6046db8b31b18dac02cbc8f507f7f"}, + {file = "frozenlist-1.5.0-cp312-cp312-win32.whl", hash = "sha256:29d94c256679247b33a3dc96cce0f93cbc69c23bf75ff715919332fdbb6a32b8"}, + {file = "frozenlist-1.5.0-cp312-cp312-win_amd64.whl", hash = "sha256:8969190d709e7c48ea386db202d708eb94bdb29207a1f269bab1196ce0dcca1f"}, + {file = 
"frozenlist-1.5.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:7a1a048f9215c90973402e26c01d1cff8a209e1f1b53f72b95c13db61b00f953"}, + {file = "frozenlist-1.5.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:dd47a5181ce5fcb463b5d9e17ecfdb02b678cca31280639255ce9d0e5aa67af0"}, + {file = "frozenlist-1.5.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:1431d60b36d15cda188ea222033eec8e0eab488f39a272461f2e6d9e1a8e63c2"}, + {file = "frozenlist-1.5.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6482a5851f5d72767fbd0e507e80737f9c8646ae7fd303def99bfe813f76cf7f"}, + {file = "frozenlist-1.5.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:44c49271a937625619e862baacbd037a7ef86dd1ee215afc298a417ff3270608"}, + {file = "frozenlist-1.5.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:12f78f98c2f1c2429d42e6a485f433722b0061d5c0b0139efa64f396efb5886b"}, + {file = "frozenlist-1.5.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ce3aa154c452d2467487765e3adc730a8c153af77ad84096bc19ce19a2400840"}, + {file = "frozenlist-1.5.0-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9b7dc0c4338e6b8b091e8faf0db3168a37101943e687f373dce00959583f7439"}, + {file = "frozenlist-1.5.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:45e0896250900b5aa25180f9aec243e84e92ac84bd4a74d9ad4138ef3f5c97de"}, + {file = "frozenlist-1.5.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:561eb1c9579d495fddb6da8959fd2a1fca2c6d060d4113f5844b433fc02f2641"}, + {file = "frozenlist-1.5.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:df6e2f325bfee1f49f81aaac97d2aa757c7646534a06f8f577ce184afe2f0a9e"}, + {file = "frozenlist-1.5.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:140228863501b44b809fb39ec56b5d4071f4d0aa6d216c19cbb08b8c5a7eadb9"}, + {file = "frozenlist-1.5.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:7707a25d6a77f5d27ea7dc7d1fc608aa0a478193823f88511ef5e6b8a48f9d03"}, + {file = "frozenlist-1.5.0-cp313-cp313-win32.whl", hash = "sha256:31a9ac2b38ab9b5a8933b693db4939764ad3f299fcaa931a3e605bc3460e693c"}, + {file = "frozenlist-1.5.0-cp313-cp313-win_amd64.whl", hash = "sha256:11aabdd62b8b9c4b84081a3c246506d1cddd2dd93ff0ad53ede5defec7886b28"}, + {file = "frozenlist-1.5.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:dd94994fc91a6177bfaafd7d9fd951bc8689b0a98168aa26b5f543868548d3ca"}, + {file = "frozenlist-1.5.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:2d0da8bbec082bf6bf18345b180958775363588678f64998c2b7609e34719b10"}, + {file = "frozenlist-1.5.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:73f2e31ea8dd7df61a359b731716018c2be196e5bb3b74ddba107f694fbd7604"}, + {file = "frozenlist-1.5.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:828afae9f17e6de596825cf4228ff28fbdf6065974e5ac1410cecc22f699d2b3"}, + {file = "frozenlist-1.5.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f1577515d35ed5649d52ab4319db757bb881ce3b2b796d7283e6634d99ace307"}, + {file = "frozenlist-1.5.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2150cc6305a2c2ab33299453e2968611dacb970d2283a14955923062c8d00b10"}, + {file = "frozenlist-1.5.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a72b7a6e3cd2725eff67cd64c8f13335ee18fc3c7befc05aed043d24c7b9ccb9"}, + 
{file = "frozenlist-1.5.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c16d2fa63e0800723139137d667e1056bee1a1cf7965153d2d104b62855e9b99"}, + {file = "frozenlist-1.5.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:17dcc32fc7bda7ce5875435003220a457bcfa34ab7924a49a1c19f55b6ee185c"}, + {file = "frozenlist-1.5.0-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:97160e245ea33d8609cd2b8fd997c850b56db147a304a262abc2b3be021a9171"}, + {file = "frozenlist-1.5.0-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:f1e6540b7fa044eee0bb5111ada694cf3dc15f2b0347ca125ee9ca984d5e9e6e"}, + {file = "frozenlist-1.5.0-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:91d6c171862df0a6c61479d9724f22efb6109111017c87567cfeb7b5d1449fdf"}, + {file = "frozenlist-1.5.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:c1fac3e2ace2eb1052e9f7c7db480818371134410e1f5c55d65e8f3ac6d1407e"}, + {file = "frozenlist-1.5.0-cp38-cp38-win32.whl", hash = "sha256:b97f7b575ab4a8af9b7bc1d2ef7f29d3afee2226bd03ca3875c16451ad5a7723"}, + {file = "frozenlist-1.5.0-cp38-cp38-win_amd64.whl", hash = "sha256:374ca2dabdccad8e2a76d40b1d037f5bd16824933bf7bcea3e59c891fd4a0923"}, + {file = "frozenlist-1.5.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:9bbcdfaf4af7ce002694a4e10a0159d5a8d20056a12b05b45cea944a4953f972"}, + {file = "frozenlist-1.5.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:1893f948bf6681733aaccf36c5232c231e3b5166d607c5fa77773611df6dc336"}, + {file = "frozenlist-1.5.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:2b5e23253bb709ef57a8e95e6ae48daa9ac5f265637529e4ce6b003a37b2621f"}, + {file = "frozenlist-1.5.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0f253985bb515ecd89629db13cb58d702035ecd8cfbca7d7a7e29a0e6d39af5f"}, + {file = "frozenlist-1.5.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:04a5c6babd5e8fb7d3c871dc8b321166b80e41b637c31a995ed844a6139942b6"}, + {file = "frozenlist-1.5.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a9fe0f1c29ba24ba6ff6abf688cb0b7cf1efab6b6aa6adc55441773c252f7411"}, + {file = "frozenlist-1.5.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:226d72559fa19babe2ccd920273e767c96a49b9d3d38badd7c91a0fdeda8ea08"}, + {file = "frozenlist-1.5.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:15b731db116ab3aedec558573c1a5eec78822b32292fe4f2f0345b7f697745c2"}, + {file = "frozenlist-1.5.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:366d8f93e3edfe5a918c874702f78faac300209a4d5bf38352b2c1bdc07a766d"}, + {file = "frozenlist-1.5.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:1b96af8c582b94d381a1c1f51ffaedeb77c821c690ea5f01da3d70a487dd0a9b"}, + {file = "frozenlist-1.5.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:c03eff4a41bd4e38415cbed054bbaff4a075b093e2394b6915dca34a40d1e38b"}, + {file = "frozenlist-1.5.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:50cf5e7ee9b98f22bdecbabf3800ae78ddcc26e4a435515fc72d97903e8488e0"}, + {file = "frozenlist-1.5.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:1e76bfbc72353269c44e0bc2cfe171900fbf7f722ad74c9a7b638052afe6a00c"}, + {file = "frozenlist-1.5.0-cp39-cp39-win32.whl", hash = "sha256:666534d15ba8f0fda3f53969117383d5dc021266b3c1a42c9ec4855e4b58b9d3"}, + {file = "frozenlist-1.5.0-cp39-cp39-win_amd64.whl", hash = 
"sha256:5c28f4b5dbef8a0d8aad0d4de24d1e9e981728628afaf4ea0792f5d0939372f0"}, + {file = "frozenlist-1.5.0-py3-none-any.whl", hash = "sha256:d994863bba198a4a518b467bb971c56e1db3f180a25c6cf7bb1949c267f748c3"}, + {file = "frozenlist-1.5.0.tar.gz", hash = "sha256:81d5af29e61b9c8348e876d442253723928dce6433e0e76cd925cd83f1b4b817"}, ] [[package]] name = "fsspec" -version = "2023.1.0" +version = "2025.3.2" description = "File-system specification" optional = false -python-versions = ">=3.7" +python-versions = ">=3.9" +groups = ["main"] files = [ - {file = "fsspec-2023.1.0-py3-none-any.whl", hash = "sha256:b833e2e541e9e8cde0ab549414187871243177feb3d344f9d27b25a93f5d8139"}, - {file = "fsspec-2023.1.0.tar.gz", hash = "sha256:fbae7f20ff801eb5f7d0bedf81f25c787c0dfac5e982d98fa3884a9cde2b5411"}, + {file = "fsspec-2025.3.2-py3-none-any.whl", hash = "sha256:2daf8dc3d1dfa65b6aa37748d112773a7a08416f6c70d96b264c96476ecaf711"}, + {file = "fsspec-2025.3.2.tar.gz", hash = "sha256:e52c77ef398680bbd6a98c0e628fbc469491282981209907bbc8aea76a04fdc6"}, ] [package.extras] @@ -489,8 +534,10 @@ abfs = ["adlfs"] adl = ["adlfs"] arrow = ["pyarrow (>=1)"] dask = ["dask", "distributed"] +dev = ["pre-commit", "ruff"] +doc = ["numpydoc", "sphinx", "sphinx-design", "sphinx-rtd-theme", "yarl"] dropbox = ["dropbox", "dropboxdrivefs", "requests"] -entrypoints = ["importlib-metadata"] +full = ["adlfs", "aiohttp (!=4.0.0a0,!=4.0.0a1)", "dask", "distributed", "dropbox", "dropboxdrivefs", "fusepy", "gcsfs", "libarchive-c", "ocifs", "panel", "paramiko", "pyarrow (>=1)", "pygit2", "requests", "s3fs", "smbprotocol", "tqdm"] fuse = ["fusepy"] gcs = ["gcsfs"] git = ["pygit2"] @@ -498,30 +545,33 @@ github = ["requests"] gs = ["gcsfs"] gui = ["panel"] hdfs = ["pyarrow (>=1)"] -http = ["aiohttp (!=4.0.0a0,!=4.0.0a1)", "requests"] +http = ["aiohttp (!=4.0.0a0,!=4.0.0a1)"] libarchive = ["libarchive-c"] oci = ["ocifs"] s3 = ["s3fs"] sftp = ["paramiko"] smb = ["smbprotocol"] ssh = ["paramiko"] +test = ["aiohttp (!=4.0.0a0,!=4.0.0a1)", "numpy", "pytest", "pytest-asyncio (!=0.22.0)", "pytest-benchmark", "pytest-cov", "pytest-mock", "pytest-recording", "pytest-rerunfailures", "requests"] +test-downstream = ["aiobotocore (>=2.5.4,<3.0.0)", "dask[dataframe,test]", "moto[server] (>4,<5)", "pytest-timeout", "xarray"] +test-full = ["adlfs", "aiohttp (!=4.0.0a0,!=4.0.0a1)", "cloudpickle", "dask", "distributed", "dropbox", "dropboxdrivefs", "fastparquet", "fusepy", "gcsfs", "jinja2", "kerchunk", "libarchive-c", "lz4", "notebook", "numpy", "ocifs", "pandas", "panel", "paramiko", "pyarrow", "pyarrow (>=1)", "pyftpdlib", "pygit2", "pytest", "pytest-asyncio (!=0.22.0)", "pytest-benchmark", "pytest-cov", "pytest-mock", "pytest-recording", "pytest-rerunfailures", "python-snappy", "requests", "smbprotocol", "tqdm", "urllib3", "zarr", "zstandard"] tqdm = ["tqdm"] [[package]] name = "huggingface-hub" -version = "0.16.4" +version = "0.30.2" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" optional = false -python-versions = ">=3.7.0" +python-versions = ">=3.8.0" +groups = ["main"] files = [ - {file = "huggingface_hub-0.16.4-py3-none-any.whl", hash = "sha256:0d3df29932f334fead024afc7cb4cc5149d955238b8b5e42dcf9740d6995a349"}, - {file = "huggingface_hub-0.16.4.tar.gz", hash = "sha256:608c7d4f3d368b326d1747f91523dbd1f692871e8e2e7a4750314a2dd8b63e14"}, + {file = "huggingface_hub-0.30.2-py3-none-any.whl", hash = "sha256:68ff05969927058cfa41df4f2155d4bb48f5f54f719dd0390103eefa9b191e28"}, + {file = 
"huggingface_hub-0.30.2.tar.gz", hash = "sha256:9a7897c5b6fd9dad3168a794a8998d6378210f5b9688d0dfc180b1a228dc2466"}, ] [package.dependencies] filelock = "*" -fsspec = "*" -importlib-metadata = {version = "*", markers = "python_version < \"3.8\""} +fsspec = ">=2023.5.0" packaging = ">=20.9" pyyaml = ">=5.1" requests = "*" @@ -529,314 +579,429 @@ tqdm = ">=4.42.1" typing-extensions = ">=3.7.4.3" [package.extras] -all = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "black (>=23.1,<24.0)", "gradio", "jedi", "mypy (==0.982)", "numpy", "pydantic", "pytest", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-vcr", "pytest-xdist", "ruff (>=0.0.241)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "urllib3 (<2.0)"] +all = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "fastapi", "gradio (>=4.0.0)", "jedi", "libcst (==1.4.0)", "mypy (==1.5.1)", "numpy", "pytest (>=8.1.1,<8.2.2)", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-mock", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "ruff (>=0.9.0)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)", "urllib3 (<2.0)"] cli = ["InquirerPy (==0.3.4)"] -dev = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "black (>=23.1,<24.0)", "gradio", "jedi", "mypy (==0.982)", "numpy", "pydantic", "pytest", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-vcr", "pytest-xdist", "ruff (>=0.0.241)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "urllib3 (<2.0)"] +dev = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "fastapi", "gradio (>=4.0.0)", "jedi", "libcst (==1.4.0)", "mypy (==1.5.1)", "numpy", "pytest (>=8.1.1,<8.2.2)", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-mock", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "ruff (>=0.9.0)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)", "urllib3 (<2.0)"] fastai = ["fastai (>=2.4)", "fastcore (>=1.3.27)", "toml"] -inference = ["aiohttp", "pydantic"] -quality = ["black (>=23.1,<24.0)", "mypy (==0.982)", "ruff (>=0.0.241)"] +hf-transfer = ["hf-transfer (>=0.1.4)"] +hf-xet = ["hf-xet (>=0.1.4)"] +inference = ["aiohttp"] +quality = ["libcst (==1.4.0)", "mypy (==1.5.1)", "ruff (>=0.9.0)"] tensorflow = ["graphviz", "pydot", "tensorflow"] -testing = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "gradio", "jedi", "numpy", "pydantic", "pytest", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-vcr", "pytest-xdist", "soundfile", "urllib3 (<2.0)"] -torch = ["torch"] -typing = ["pydantic", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3"] +tensorflow-testing = ["keras (<3.0)", "tensorflow"] +testing = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "fastapi", "gradio (>=4.0.0)", "jedi", "numpy", "pytest (>=8.1.1,<8.2.2)", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-mock", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "soundfile", "urllib3 (<2.0)"] +torch = ["safetensors[torch]", "torch"] +typing = ["types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)"] [[package]] name = "idna" -version = "3.4" +version = "3.10" description = "Internationalized Domain Names in Applications (IDNA)" optional 
= false -python-versions = ">=3.5" +python-versions = ">=3.6" +groups = ["main"] files = [ - {file = "idna-3.4-py3-none-any.whl", hash = "sha256:90b77e79eaa3eba6de819a0c442c0b4ceefc341a7a2ab77d7562bf49f425c5c2"}, - {file = "idna-3.4.tar.gz", hash = "sha256:814f528e8dead7d329833b91c5faa87d60bf71824cd12a7530b5526063d02cb4"}, + {file = "idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3"}, + {file = "idna-3.10.tar.gz", hash = "sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9"}, ] -[[package]] -name = "importlib-metadata" -version = "6.7.0" -description = "Read metadata from Python packages" -optional = false -python-versions = ">=3.7" -files = [ - {file = "importlib_metadata-6.7.0-py3-none-any.whl", hash = "sha256:cb52082e659e97afc5dac71e79de97d8681de3aa07ff18578330904a9d18e5b5"}, - {file = "importlib_metadata-6.7.0.tar.gz", hash = "sha256:1aaf550d4f73e5d6783e7acb77aec43d49da8017410afae93822cc9cca98c4d4"}, -] - -[package.dependencies] -typing-extensions = {version = ">=3.6.4", markers = "python_version < \"3.8\""} -zipp = ">=0.5" - [package.extras] -docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] -perf = ["ipython"] -testing = ["flufl.flake8", "importlib-resources (>=1.3)", "packaging", "pyfakefs", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-mypy (>=0.9.1)", "pytest-perf (>=0.9.2)", "pytest-ruff"] +all = ["flake8 (>=7.1.1)", "mypy (>=1.11.2)", "pytest (>=8.3.2)", "ruff (>=0.6.2)"] [[package]] name = "iniconfig" -version = "2.0.0" +version = "2.1.0" description = "brain-dead simple config-ini parsing" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" +groups = ["dev"] files = [ - {file = "iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374"}, - {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, + {file = "iniconfig-2.1.0-py3-none-any.whl", hash = "sha256:9deba5723312380e77435581c6bf4935c94cbfab9b1ed33ef8d238ea168eb760"}, + {file = "iniconfig-2.1.0.tar.gz", hash = "sha256:3abbd2e30b36733fee78f9c7f7308f2d0050e88f0087fd25c2645f63c773e1c7"}, ] [[package]] name = "multidict" -version = "6.0.4" +version = "6.4.3" description = "multidict implementation" optional = false -python-versions = ">=3.7" +python-versions = ">=3.9" +groups = ["main"] files = [ - {file = "multidict-6.0.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:0b1a97283e0c85772d613878028fec909f003993e1007eafa715b24b377cb9b8"}, - {file = "multidict-6.0.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:eeb6dcc05e911516ae3d1f207d4b0520d07f54484c49dfc294d6e7d63b734171"}, - {file = "multidict-6.0.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d6d635d5209b82a3492508cf5b365f3446afb65ae7ebd755e70e18f287b0adf7"}, - {file = "multidict-6.0.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c048099e4c9e9d615545e2001d3d8a4380bd403e1a0578734e0d31703d1b0c0b"}, - {file = "multidict-6.0.4-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ea20853c6dbbb53ed34cb4d080382169b6f4554d394015f1bef35e881bf83547"}, - {file = "multidict-6.0.4-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:16d232d4e5396c2efbbf4f6d4df89bfa905eb0d4dc5b3549d872ab898451f569"}, - {file = 
"multidict-6.0.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:36c63aaa167f6c6b04ef2c85704e93af16c11d20de1d133e39de6a0e84582a93"}, - {file = "multidict-6.0.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:64bdf1086b6043bf519869678f5f2757f473dee970d7abf6da91ec00acb9cb98"}, - {file = "multidict-6.0.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:43644e38f42e3af682690876cff722d301ac585c5b9e1eacc013b7a3f7b696a0"}, - {file = "multidict-6.0.4-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:7582a1d1030e15422262de9f58711774e02fa80df0d1578995c76214f6954988"}, - {file = "multidict-6.0.4-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:ddff9c4e225a63a5afab9dd15590432c22e8057e1a9a13d28ed128ecf047bbdc"}, - {file = "multidict-6.0.4-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:ee2a1ece51b9b9e7752e742cfb661d2a29e7bcdba2d27e66e28a99f1890e4fa0"}, - {file = "multidict-6.0.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a2e4369eb3d47d2034032a26c7a80fcb21a2cb22e1173d761a162f11e562caa5"}, - {file = "multidict-6.0.4-cp310-cp310-win32.whl", hash = "sha256:574b7eae1ab267e5f8285f0fe881f17efe4b98c39a40858247720935b893bba8"}, - {file = "multidict-6.0.4-cp310-cp310-win_amd64.whl", hash = "sha256:4dcbb0906e38440fa3e325df2359ac6cb043df8e58c965bb45f4e406ecb162cc"}, - {file = "multidict-6.0.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:0dfad7a5a1e39c53ed00d2dd0c2e36aed4650936dc18fd9a1826a5ae1cad6f03"}, - {file = "multidict-6.0.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:64da238a09d6039e3bd39bb3aee9c21a5e34f28bfa5aa22518581f910ff94af3"}, - {file = "multidict-6.0.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ff959bee35038c4624250473988b24f846cbeb2c6639de3602c073f10410ceba"}, - {file = "multidict-6.0.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:01a3a55bd90018c9c080fbb0b9f4891db37d148a0a18722b42f94694f8b6d4c9"}, - {file = "multidict-6.0.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c5cb09abb18c1ea940fb99360ea0396f34d46566f157122c92dfa069d3e0e982"}, - {file = "multidict-6.0.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:666daae833559deb2d609afa4490b85830ab0dfca811a98b70a205621a6109fe"}, - {file = "multidict-6.0.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:11bdf3f5e1518b24530b8241529d2050014c884cf18b6fc69c0c2b30ca248710"}, - {file = "multidict-6.0.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7d18748f2d30f94f498e852c67d61261c643b349b9d2a581131725595c45ec6c"}, - {file = "multidict-6.0.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:458f37be2d9e4c95e2d8866a851663cbc76e865b78395090786f6cd9b3bbf4f4"}, - {file = "multidict-6.0.4-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:b1a2eeedcead3a41694130495593a559a668f382eee0727352b9a41e1c45759a"}, - {file = "multidict-6.0.4-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:7d6ae9d593ef8641544d6263c7fa6408cc90370c8cb2bbb65f8d43e5b0351d9c"}, - {file = "multidict-6.0.4-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:5979b5632c3e3534e42ca6ff856bb24b2e3071b37861c2c727ce220d80eee9ed"}, - {file = "multidict-6.0.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:dcfe792765fab89c365123c81046ad4103fcabbc4f56d1c1997e6715e8015461"}, - {file = "multidict-6.0.4-cp311-cp311-win32.whl", hash = 
"sha256:3601a3cece3819534b11d4efc1eb76047488fddd0c85a3948099d5da4d504636"}, - {file = "multidict-6.0.4-cp311-cp311-win_amd64.whl", hash = "sha256:81a4f0b34bd92df3da93315c6a59034df95866014ac08535fc819f043bfd51f0"}, - {file = "multidict-6.0.4-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:67040058f37a2a51ed8ea8f6b0e6ee5bd78ca67f169ce6122f3e2ec80dfe9b78"}, - {file = "multidict-6.0.4-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:853888594621e6604c978ce2a0444a1e6e70c8d253ab65ba11657659dcc9100f"}, - {file = "multidict-6.0.4-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:39ff62e7d0f26c248b15e364517a72932a611a9b75f35b45be078d81bdb86603"}, - {file = "multidict-6.0.4-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:af048912e045a2dc732847d33821a9d84ba553f5c5f028adbd364dd4765092ac"}, - {file = "multidict-6.0.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b1e8b901e607795ec06c9e42530788c45ac21ef3aaa11dbd0c69de543bfb79a9"}, - {file = "multidict-6.0.4-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:62501642008a8b9871ddfccbf83e4222cf8ac0d5aeedf73da36153ef2ec222d2"}, - {file = "multidict-6.0.4-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:99b76c052e9f1bc0721f7541e5e8c05db3941eb9ebe7b8553c625ef88d6eefde"}, - {file = "multidict-6.0.4-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:509eac6cf09c794aa27bcacfd4d62c885cce62bef7b2c3e8b2e49d365b5003fe"}, - {file = "multidict-6.0.4-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:21a12c4eb6ddc9952c415f24eef97e3e55ba3af61f67c7bc388dcdec1404a067"}, - {file = "multidict-6.0.4-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:5cad9430ab3e2e4fa4a2ef4450f548768400a2ac635841bc2a56a2052cdbeb87"}, - {file = "multidict-6.0.4-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:ab55edc2e84460694295f401215f4a58597f8f7c9466faec545093045476327d"}, - {file = "multidict-6.0.4-cp37-cp37m-win32.whl", hash = "sha256:5a4dcf02b908c3b8b17a45fb0f15b695bf117a67b76b7ad18b73cf8e92608775"}, - {file = "multidict-6.0.4-cp37-cp37m-win_amd64.whl", hash = "sha256:6ed5f161328b7df384d71b07317f4d8656434e34591f20552c7bcef27b0ab88e"}, - {file = "multidict-6.0.4-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:5fc1b16f586f049820c5c5b17bb4ee7583092fa0d1c4e28b5239181ff9532e0c"}, - {file = "multidict-6.0.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1502e24330eb681bdaa3eb70d6358e818e8e8f908a22a1851dfd4e15bc2f8161"}, - {file = "multidict-6.0.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:b692f419760c0e65d060959df05f2a531945af31fda0c8a3b3195d4efd06de11"}, - {file = "multidict-6.0.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:45e1ecb0379bfaab5eef059f50115b54571acfbe422a14f668fc8c27ba410e7e"}, - {file = "multidict-6.0.4-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ddd3915998d93fbcd2566ddf9cf62cdb35c9e093075f862935573d265cf8f65d"}, - {file = "multidict-6.0.4-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:59d43b61c59d82f2effb39a93c48b845efe23a3852d201ed2d24ba830d0b4cf2"}, - {file = "multidict-6.0.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cc8e1d0c705233c5dd0c5e6460fbad7827d5d36f310a0fadfd45cc3029762258"}, - {file = "multidict-6.0.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:d6aa0418fcc838522256761b3415822626f866758ee0bc6632c9486b179d0b52"}, - {file = "multidict-6.0.4-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:6748717bb10339c4760c1e63da040f5f29f5ed6e59d76daee30305894069a660"}, - {file = "multidict-6.0.4-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:4d1a3d7ef5e96b1c9e92f973e43aa5e5b96c659c9bc3124acbbd81b0b9c8a951"}, - {file = "multidict-6.0.4-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:4372381634485bec7e46718edc71528024fcdc6f835baefe517b34a33c731d60"}, - {file = "multidict-6.0.4-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:fc35cb4676846ef752816d5be2193a1e8367b4c1397b74a565a9d0389c433a1d"}, - {file = "multidict-6.0.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:4b9d9e4e2b37daddb5c23ea33a3417901fa7c7b3dee2d855f63ee67a0b21e5b1"}, - {file = "multidict-6.0.4-cp38-cp38-win32.whl", hash = "sha256:e41b7e2b59679edfa309e8db64fdf22399eec4b0b24694e1b2104fb789207779"}, - {file = "multidict-6.0.4-cp38-cp38-win_amd64.whl", hash = "sha256:d6c254ba6e45d8e72739281ebc46ea5eb5f101234f3ce171f0e9f5cc86991480"}, - {file = "multidict-6.0.4-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:16ab77bbeb596e14212e7bab8429f24c1579234a3a462105cda4a66904998664"}, - {file = "multidict-6.0.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:bc779e9e6f7fda81b3f9aa58e3a6091d49ad528b11ed19f6621408806204ad35"}, - {file = "multidict-6.0.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:4ceef517eca3e03c1cceb22030a3e39cb399ac86bff4e426d4fc6ae49052cc60"}, - {file = "multidict-6.0.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:281af09f488903fde97923c7744bb001a9b23b039a909460d0f14edc7bf59706"}, - {file = "multidict-6.0.4-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:52f2dffc8acaba9a2f27174c41c9e57f60b907bb9f096b36b1a1f3be71c6284d"}, - {file = "multidict-6.0.4-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b41156839806aecb3641f3208c0dafd3ac7775b9c4c422d82ee2a45c34ba81ca"}, - {file = "multidict-6.0.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d5e3fc56f88cc98ef8139255cf8cd63eb2c586531e43310ff859d6bb3a6b51f1"}, - {file = "multidict-6.0.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8316a77808c501004802f9beebde51c9f857054a0c871bd6da8280e718444449"}, - {file = "multidict-6.0.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:f70b98cd94886b49d91170ef23ec5c0e8ebb6f242d734ed7ed677b24d50c82cf"}, - {file = "multidict-6.0.4-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:bf6774e60d67a9efe02b3616fee22441d86fab4c6d335f9d2051d19d90a40063"}, - {file = "multidict-6.0.4-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:e69924bfcdda39b722ef4d9aa762b2dd38e4632b3641b1d9a57ca9cd18f2f83a"}, - {file = "multidict-6.0.4-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:6b181d8c23da913d4ff585afd1155a0e1194c0b50c54fcfe286f70cdaf2b7176"}, - {file = "multidict-6.0.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:52509b5be062d9eafc8170e53026fbc54cf3b32759a23d07fd935fb04fc22d95"}, - {file = "multidict-6.0.4-cp39-cp39-win32.whl", hash = "sha256:27c523fbfbdfd19c6867af7346332b62b586eed663887392cff78d614f9ec313"}, - {file = "multidict-6.0.4-cp39-cp39-win_amd64.whl", hash = "sha256:33029f5734336aa0d4c0384525da0387ef89148dc7191aae00ca5fb23d7aafc2"}, - {file = "multidict-6.0.4.tar.gz", hash = "sha256:3666906492efb76453c0e7b97f2cf459b0682e7402c0489a95484965dbc1da49"}, + {file = 
"multidict-6.4.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:32a998bd8a64ca48616eac5a8c1cc4fa38fb244a3facf2eeb14abe186e0f6cc5"}, + {file = "multidict-6.4.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:a54ec568f1fc7f3c313c2f3b16e5db346bf3660e1309746e7fccbbfded856188"}, + {file = "multidict-6.4.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:a7be07e5df178430621c716a63151165684d3e9958f2bbfcb644246162007ab7"}, + {file = "multidict-6.4.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b128dbf1c939674a50dd0b28f12c244d90e5015e751a4f339a96c54f7275e291"}, + {file = "multidict-6.4.3-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:b9cb19dfd83d35b6ff24a4022376ea6e45a2beba8ef3f0836b8a4b288b6ad685"}, + {file = "multidict-6.4.3-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3cf62f8e447ea2c1395afa289b332e49e13d07435369b6f4e41f887db65b40bf"}, + {file = "multidict-6.4.3-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:909f7d43ff8f13d1adccb6a397094adc369d4da794407f8dd592c51cf0eae4b1"}, + {file = "multidict-6.4.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0bb8f8302fbc7122033df959e25777b0b7659b1fd6bcb9cb6bed76b5de67afef"}, + {file = "multidict-6.4.3-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:224b79471b4f21169ea25ebc37ed6f058040c578e50ade532e2066562597b8a9"}, + {file = "multidict-6.4.3-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:a7bd27f7ab3204f16967a6f899b3e8e9eb3362c0ab91f2ee659e0345445e0078"}, + {file = "multidict-6.4.3-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:99592bd3162e9c664671fd14e578a33bfdba487ea64bcb41d281286d3c870ad7"}, + {file = "multidict-6.4.3-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:a62d78a1c9072949018cdb05d3c533924ef8ac9bcb06cbf96f6d14772c5cd451"}, + {file = "multidict-6.4.3-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:3ccdde001578347e877ca4f629450973c510e88e8865d5aefbcb89b852ccc666"}, + {file = "multidict-6.4.3-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:eccb67b0e78aa2e38a04c5ecc13bab325a43e5159a181a9d1a6723db913cbb3c"}, + {file = "multidict-6.4.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:8b6fcf6054fc4114a27aa865f8840ef3d675f9316e81868e0ad5866184a6cba5"}, + {file = "multidict-6.4.3-cp310-cp310-win32.whl", hash = "sha256:f92c7f62d59373cd93bc9969d2da9b4b21f78283b1379ba012f7ee8127b3152e"}, + {file = "multidict-6.4.3-cp310-cp310-win_amd64.whl", hash = "sha256:b57e28dbc031d13916b946719f213c494a517b442d7b48b29443e79610acd887"}, + {file = "multidict-6.4.3-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:f6f19170197cc29baccd33ccc5b5d6a331058796485857cf34f7635aa25fb0cd"}, + {file = "multidict-6.4.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f2882bf27037eb687e49591690e5d491e677272964f9ec7bc2abbe09108bdfb8"}, + {file = "multidict-6.4.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:fbf226ac85f7d6b6b9ba77db4ec0704fde88463dc17717aec78ec3c8546c70ad"}, + {file = "multidict-6.4.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2e329114f82ad4b9dd291bef614ea8971ec119ecd0f54795109976de75c9a852"}, + {file = "multidict-6.4.3-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:1f4e0334d7a555c63f5c8952c57ab6f1c7b4f8c7f3442df689fc9f03df315c08"}, + {file = 
"multidict-6.4.3-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:740915eb776617b57142ce0bb13b7596933496e2f798d3d15a20614adf30d229"}, + {file = "multidict-6.4.3-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:255dac25134d2b141c944b59a0d2f7211ca12a6d4779f7586a98b4b03ea80508"}, + {file = "multidict-6.4.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d4e8535bd4d741039b5aad4285ecd9b902ef9e224711f0b6afda6e38d7ac02c7"}, + {file = "multidict-6.4.3-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:30c433a33be000dd968f5750722eaa0991037be0be4a9d453eba121774985bc8"}, + {file = "multidict-6.4.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:4eb33b0bdc50acd538f45041f5f19945a1f32b909b76d7b117c0c25d8063df56"}, + {file = "multidict-6.4.3-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:75482f43465edefd8a5d72724887ccdcd0c83778ded8f0cb1e0594bf71736cc0"}, + {file = "multidict-6.4.3-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:ce5b3082e86aee80b3925ab4928198450d8e5b6466e11501fe03ad2191c6d777"}, + {file = "multidict-6.4.3-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:e413152e3212c4d39f82cf83c6f91be44bec9ddea950ce17af87fbf4e32ca6b2"}, + {file = "multidict-6.4.3-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:8aac2eeff69b71f229a405c0a4b61b54bade8e10163bc7b44fcd257949620618"}, + {file = "multidict-6.4.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:ab583ac203af1d09034be41458feeab7863c0635c650a16f15771e1386abf2d7"}, + {file = "multidict-6.4.3-cp311-cp311-win32.whl", hash = "sha256:1b2019317726f41e81154df636a897de1bfe9228c3724a433894e44cd2512378"}, + {file = "multidict-6.4.3-cp311-cp311-win_amd64.whl", hash = "sha256:43173924fa93c7486402217fab99b60baf78d33806af299c56133a3755f69589"}, + {file = "multidict-6.4.3-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:1f1c2f58f08b36f8475f3ec6f5aeb95270921d418bf18f90dffd6be5c7b0e676"}, + {file = "multidict-6.4.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:26ae9ad364fc61b936fb7bf4c9d8bd53f3a5b4417142cd0be5c509d6f767e2f1"}, + {file = "multidict-6.4.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:659318c6c8a85f6ecfc06b4e57529e5a78dfdd697260cc81f683492ad7e9435a"}, + {file = "multidict-6.4.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e1eb72c741fd24d5a28242ce72bb61bc91f8451877131fa3fe930edb195f7054"}, + {file = "multidict-6.4.3-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:3cd06d88cb7398252284ee75c8db8e680aa0d321451132d0dba12bc995f0adcc"}, + {file = "multidict-6.4.3-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4543d8dc6470a82fde92b035a92529317191ce993533c3c0c68f56811164ed07"}, + {file = "multidict-6.4.3-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:30a3ebdc068c27e9d6081fca0e2c33fdf132ecea703a72ea216b81a66860adde"}, + {file = "multidict-6.4.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b038f10e23f277153f86f95c777ba1958bcd5993194fda26a1d06fae98b2f00c"}, + {file = "multidict-6.4.3-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c605a2b2dc14282b580454b9b5d14ebe0668381a3a26d0ac39daa0ca115eb2ae"}, + {file = "multidict-6.4.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8bd2b875f4ca2bb527fe23e318ddd509b7df163407b0fb717df229041c6df5d3"}, + {file 
= "multidict-6.4.3-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:c2e98c840c9c8e65c0e04b40c6c5066c8632678cd50c8721fdbcd2e09f21a507"}, + {file = "multidict-6.4.3-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:66eb80dd0ab36dbd559635e62fba3083a48a252633164857a1d1684f14326427"}, + {file = "multidict-6.4.3-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:c23831bdee0a2a3cf21be057b5e5326292f60472fb6c6f86392bbf0de70ba731"}, + {file = "multidict-6.4.3-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:1535cec6443bfd80d028052e9d17ba6ff8a5a3534c51d285ba56c18af97e9713"}, + {file = "multidict-6.4.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:3b73e7227681f85d19dec46e5b881827cd354aabe46049e1a61d2f9aaa4e285a"}, + {file = "multidict-6.4.3-cp312-cp312-win32.whl", hash = "sha256:8eac0c49df91b88bf91f818e0a24c1c46f3622978e2c27035bfdca98e0e18124"}, + {file = "multidict-6.4.3-cp312-cp312-win_amd64.whl", hash = "sha256:11990b5c757d956cd1db7cb140be50a63216af32cd6506329c2c59d732d802db"}, + {file = "multidict-6.4.3-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:7a76534263d03ae0cfa721fea40fd2b5b9d17a6f85e98025931d41dc49504474"}, + {file = "multidict-6.4.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:805031c2f599eee62ac579843555ed1ce389ae00c7e9f74c2a1b45e0564a88dd"}, + {file = "multidict-6.4.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:c56c179839d5dcf51d565132185409d1d5dd8e614ba501eb79023a6cab25576b"}, + {file = "multidict-6.4.3-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9c64f4ddb3886dd8ab71b68a7431ad4aa01a8fa5be5b11543b29674f29ca0ba3"}, + {file = "multidict-6.4.3-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:3002a856367c0b41cad6784f5b8d3ab008eda194ed7864aaa58f65312e2abcac"}, + {file = "multidict-6.4.3-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3d75e621e7d887d539d6e1d789f0c64271c250276c333480a9e1de089611f790"}, + {file = "multidict-6.4.3-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:995015cf4a3c0d72cbf453b10a999b92c5629eaf3a0c3e1efb4b5c1f602253bb"}, + {file = "multidict-6.4.3-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a2b0fabae7939d09d7d16a711468c385272fa1b9b7fb0d37e51143585d8e72e0"}, + {file = "multidict-6.4.3-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:61ed4d82f8a1e67eb9eb04f8587970d78fe7cddb4e4d6230b77eda23d27938f9"}, + {file = "multidict-6.4.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:062428944a8dc69df9fdc5d5fc6279421e5f9c75a9ee3f586f274ba7b05ab3c8"}, + {file = "multidict-6.4.3-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:b90e27b4674e6c405ad6c64e515a505c6d113b832df52fdacb6b1ffd1fa9a1d1"}, + {file = "multidict-6.4.3-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:7d50d4abf6729921e9613d98344b74241572b751c6b37feed75fb0c37bd5a817"}, + {file = "multidict-6.4.3-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:43fe10524fb0a0514be3954be53258e61d87341008ce4914f8e8b92bee6f875d"}, + {file = "multidict-6.4.3-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:236966ca6c472ea4e2d3f02f6673ebfd36ba3f23159c323f5a496869bc8e47c9"}, + {file = "multidict-6.4.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:422a5ec315018e606473ba1f5431e064cf8b2a7468019233dcf8082fabad64c8"}, + {file = "multidict-6.4.3-cp313-cp313-win32.whl", hash = 
"sha256:f901a5aace8e8c25d78960dcc24c870c8d356660d3b49b93a78bf38eb682aac3"}, + {file = "multidict-6.4.3-cp313-cp313-win_amd64.whl", hash = "sha256:1c152c49e42277bc9a2f7b78bd5fa10b13e88d1b0328221e7aef89d5c60a99a5"}, + {file = "multidict-6.4.3-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:be8751869e28b9c0d368d94f5afcb4234db66fe8496144547b4b6d6a0645cfc6"}, + {file = "multidict-6.4.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:0d4b31f8a68dccbcd2c0ea04f0e014f1defc6b78f0eb8b35f2265e8716a6df0c"}, + {file = "multidict-6.4.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:032efeab3049e37eef2ff91271884303becc9e54d740b492a93b7e7266e23756"}, + {file = "multidict-6.4.3-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9e78006af1a7c8a8007e4f56629d7252668344442f66982368ac06522445e375"}, + {file = "multidict-6.4.3-cp313-cp313t-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:daeac9dd30cda8703c417e4fddccd7c4dc0c73421a0b54a7da2713be125846be"}, + {file = "multidict-6.4.3-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1f6f90700881438953eae443a9c6f8a509808bc3b185246992c4233ccee37fea"}, + {file = "multidict-6.4.3-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f84627997008390dd15762128dcf73c3365f4ec0106739cde6c20a07ed198ec8"}, + {file = "multidict-6.4.3-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3307b48cd156153b117c0ea54890a3bdbf858a5b296ddd40dc3852e5f16e9b02"}, + {file = "multidict-6.4.3-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ead46b0fa1dcf5af503a46e9f1c2e80b5d95c6011526352fa5f42ea201526124"}, + {file = "multidict-6.4.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:1748cb2743bedc339d63eb1bca314061568793acd603a6e37b09a326334c9f44"}, + {file = "multidict-6.4.3-cp313-cp313t-musllinux_1_2_armv7l.whl", hash = "sha256:acc9fa606f76fc111b4569348cc23a771cb52c61516dcc6bcef46d612edb483b"}, + {file = "multidict-6.4.3-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:31469d5832b5885adeb70982e531ce86f8c992334edd2f2254a10fa3182ac504"}, + {file = "multidict-6.4.3-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:ba46b51b6e51b4ef7bfb84b82f5db0dc5e300fb222a8a13b8cd4111898a869cf"}, + {file = "multidict-6.4.3-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:389cfefb599edf3fcfd5f64c0410da686f90f5f5e2c4d84e14f6797a5a337af4"}, + {file = "multidict-6.4.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:64bc2bbc5fba7b9db5c2c8d750824f41c6994e3882e6d73c903c2afa78d091e4"}, + {file = "multidict-6.4.3-cp313-cp313t-win32.whl", hash = "sha256:0ecdc12ea44bab2807d6b4a7e5eef25109ab1c82a8240d86d3c1fc9f3b72efd5"}, + {file = "multidict-6.4.3-cp313-cp313t-win_amd64.whl", hash = "sha256:7146a8742ea71b5d7d955bffcef58a9e6e04efba704b52a460134fefd10a8208"}, + {file = "multidict-6.4.3-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:5427a2679e95a642b7f8b0f761e660c845c8e6fe3141cddd6b62005bd133fc21"}, + {file = "multidict-6.4.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:24a8caa26521b9ad09732972927d7b45b66453e6ebd91a3c6a46d811eeb7349b"}, + {file = "multidict-6.4.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:6b5a272bc7c36a2cd1b56ddc6bff02e9ce499f9f14ee4a45c45434ef083f2459"}, + {file = "multidict-6.4.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:edf74dc5e212b8c75165b435c43eb0d5e81b6b300a938a4eb82827119115e840"}, + {file = 
"multidict-6.4.3-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:9f35de41aec4b323c71f54b0ca461ebf694fb48bec62f65221f52e0017955b39"}, + {file = "multidict-6.4.3-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ae93e0ff43b6f6892999af64097b18561691ffd835e21a8348a441e256592e1f"}, + {file = "multidict-6.4.3-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5e3929269e9d7eff905d6971d8b8c85e7dbc72c18fb99c8eae6fe0a152f2e343"}, + {file = "multidict-6.4.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fb6214fe1750adc2a1b801a199d64b5a67671bf76ebf24c730b157846d0e90d2"}, + {file = "multidict-6.4.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6d79cf5c0c6284e90f72123f4a3e4add52d6c6ebb4a9054e88df15b8d08444c6"}, + {file = "multidict-6.4.3-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:2427370f4a255262928cd14533a70d9738dfacadb7563bc3b7f704cc2360fc4e"}, + {file = "multidict-6.4.3-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:fbd8d737867912b6c5f99f56782b8cb81f978a97b4437a1c476de90a3e41c9a1"}, + {file = "multidict-6.4.3-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:0ee1bf613c448997f73fc4efb4ecebebb1c02268028dd4f11f011f02300cf1e8"}, + {file = "multidict-6.4.3-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:578568c4ba5f2b8abd956baf8b23790dbfdc953e87d5b110bce343b4a54fc9e7"}, + {file = "multidict-6.4.3-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:a059ad6b80de5b84b9fa02a39400319e62edd39d210b4e4f8c4f1243bdac4752"}, + {file = "multidict-6.4.3-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:dd53893675b729a965088aaadd6a1f326a72b83742b056c1065bdd2e2a42b4df"}, + {file = "multidict-6.4.3-cp39-cp39-win32.whl", hash = "sha256:abcfed2c4c139f25c2355e180bcc077a7cae91eefbb8b3927bb3f836c9586f1f"}, + {file = "multidict-6.4.3-cp39-cp39-win_amd64.whl", hash = "sha256:b1b389ae17296dd739015d5ddb222ee99fd66adeae910de21ac950e00979d897"}, + {file = "multidict-6.4.3-py3-none-any.whl", hash = "sha256:59fe01ee8e2a1e8ceb3f6dbb216b09c8d9f4ef1c22c4fc825d045a147fa2ebc9"}, + {file = "multidict-6.4.3.tar.gz", hash = "sha256:3ada0b058c9f213c5f95ba301f922d402ac234f1111a7d8fd70f1b99f3c281ec"}, ] +[package.dependencies] +typing-extensions = {version = ">=4.1.0", markers = "python_version < \"3.11\""} + [[package]] name = "packaging" -version = "23.1" +version = "24.2" description = "Core utilities for Python packages" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" +groups = ["main", "dev"] files = [ - {file = "packaging-23.1-py3-none-any.whl", hash = "sha256:994793af429502c4ea2ebf6bf664629d07c1a9fe974af92966e4b8d2df7edc61"}, - {file = "packaging-23.1.tar.gz", hash = "sha256:a392980d2b6cffa644431898be54b0045151319d1e7ec34f0cfed48767dd334f"}, + {file = "packaging-24.2-py3-none-any.whl", hash = "sha256:09abb1bccd265c01f4a3aa3f7a7db064b36514d2cba19a2f694fe6150451a759"}, + {file = "packaging-24.2.tar.gz", hash = "sha256:c228a6dc5e932d346bc5739379109d49e8853dd8223571c7c5b55260edc0b97f"}, ] [[package]] name = "pluggy" -version = "1.2.0" +version = "1.5.0" description = "plugin and hook calling mechanisms for python" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" +groups = ["dev"] files = [ - {file = "pluggy-1.2.0-py3-none-any.whl", hash = "sha256:c2fd55a7d7a3863cba1a013e4e2414658b1d07b6bc57b3919e0c63c9abb99849"}, - {file = "pluggy-1.2.0.tar.gz", hash = 
"sha256:d12f0c4b579b15f5e054301bb226ee85eeeba08ffec228092f8defbaa3a4c4b3"}, + {file = "pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669"}, + {file = "pluggy-1.5.0.tar.gz", hash = "sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1"}, ] -[package.dependencies] -importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""} - [package.extras] dev = ["pre-commit", "tox"] testing = ["pytest", "pytest-benchmark"] [[package]] -name = "py" -version = "1.11.0" -description = "library with cross-python path, ini-parsing, io, code, log facilities" +name = "propcache" +version = "0.3.1" +description = "Accelerated property cache" optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +python-versions = ">=3.9" +groups = ["main"] files = [ - {file = "py-1.11.0-py2.py3-none-any.whl", hash = "sha256:607c53218732647dff4acdfcd50cb62615cedf612e72d1724fb1a0cc6405b378"}, - {file = "py-1.11.0.tar.gz", hash = "sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719"}, + {file = "propcache-0.3.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:f27785888d2fdd918bc36de8b8739f2d6c791399552333721b58193f68ea3e98"}, + {file = "propcache-0.3.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d4e89cde74154c7b5957f87a355bb9c8ec929c167b59c83d90654ea36aeb6180"}, + {file = "propcache-0.3.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:730178f476ef03d3d4d255f0c9fa186cb1d13fd33ffe89d39f2cda4da90ceb71"}, + {file = "propcache-0.3.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:967a8eec513dbe08330f10137eacb427b2ca52118769e82ebcfcab0fba92a649"}, + {file = "propcache-0.3.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5b9145c35cc87313b5fd480144f8078716007656093d23059e8993d3a8fa730f"}, + {file = "propcache-0.3.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9e64e948ab41411958670f1093c0a57acfdc3bee5cf5b935671bbd5313bcf229"}, + {file = "propcache-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:319fa8765bfd6a265e5fa661547556da381e53274bc05094fc9ea50da51bfd46"}, + {file = "propcache-0.3.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c66d8ccbc902ad548312b96ed8d5d266d0d2c6d006fd0f66323e9d8f2dd49be7"}, + {file = "propcache-0.3.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:2d219b0dbabe75e15e581fc1ae796109b07c8ba7d25b9ae8d650da582bed01b0"}, + {file = "propcache-0.3.1-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:cd6a55f65241c551eb53f8cf4d2f4af33512c39da5d9777694e9d9c60872f519"}, + {file = "propcache-0.3.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:9979643ffc69b799d50d3a7b72b5164a2e97e117009d7af6dfdd2ab906cb72cd"}, + {file = "propcache-0.3.1-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:4cf9e93a81979f1424f1a3d155213dc928f1069d697e4353edb8a5eba67c6259"}, + {file = "propcache-0.3.1-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:2fce1df66915909ff6c824bbb5eb403d2d15f98f1518e583074671a30fe0c21e"}, + {file = "propcache-0.3.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:4d0dfdd9a2ebc77b869a0b04423591ea8823f791293b527dc1bb896c1d6f1136"}, + {file = "propcache-0.3.1-cp310-cp310-win32.whl", hash = "sha256:1f6cc0ad7b4560e5637eb2c994e97b4fa41ba8226069c9277eb5ea7101845b42"}, + {file = "propcache-0.3.1-cp310-cp310-win_amd64.whl", hash = 
"sha256:47ef24aa6511e388e9894ec16f0fbf3313a53ee68402bc428744a367ec55b833"}, + {file = "propcache-0.3.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:7f30241577d2fef2602113b70ef7231bf4c69a97e04693bde08ddab913ba0ce5"}, + {file = "propcache-0.3.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:43593c6772aa12abc3af7784bff4a41ffa921608dd38b77cf1dfd7f5c4e71371"}, + {file = "propcache-0.3.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a75801768bbe65499495660b777e018cbe90c7980f07f8aa57d6be79ea6f71da"}, + {file = "propcache-0.3.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f6f1324db48f001c2ca26a25fa25af60711e09b9aaf4b28488602776f4f9a744"}, + {file = "propcache-0.3.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5cdb0f3e1eb6dfc9965d19734d8f9c481b294b5274337a8cb5cb01b462dcb7e0"}, + {file = "propcache-0.3.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1eb34d90aac9bfbced9a58b266f8946cb5935869ff01b164573a7634d39fbcb5"}, + {file = "propcache-0.3.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f35c7070eeec2cdaac6fd3fe245226ed2a6292d3ee8c938e5bb645b434c5f256"}, + {file = "propcache-0.3.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b23c11c2c9e6d4e7300c92e022046ad09b91fd00e36e83c44483df4afa990073"}, + {file = "propcache-0.3.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:3e19ea4ea0bf46179f8a3652ac1426e6dcbaf577ce4b4f65be581e237340420d"}, + {file = "propcache-0.3.1-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:bd39c92e4c8f6cbf5f08257d6360123af72af9f4da75a690bef50da77362d25f"}, + {file = "propcache-0.3.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:b0313e8b923b3814d1c4a524c93dfecea5f39fa95601f6a9b1ac96cd66f89ea0"}, + {file = "propcache-0.3.1-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:e861ad82892408487be144906a368ddbe2dc6297074ade2d892341b35c59844a"}, + {file = "propcache-0.3.1-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:61014615c1274df8da5991a1e5da85a3ccb00c2d4701ac6f3383afd3ca47ab0a"}, + {file = "propcache-0.3.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:71ebe3fe42656a2328ab08933d420df5f3ab121772eef78f2dc63624157f0ed9"}, + {file = "propcache-0.3.1-cp311-cp311-win32.whl", hash = "sha256:58aa11f4ca8b60113d4b8e32d37e7e78bd8af4d1a5b5cb4979ed856a45e62005"}, + {file = "propcache-0.3.1-cp311-cp311-win_amd64.whl", hash = "sha256:9532ea0b26a401264b1365146c440a6d78269ed41f83f23818d4b79497aeabe7"}, + {file = "propcache-0.3.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:f78eb8422acc93d7b69964012ad7048764bb45a54ba7a39bb9e146c72ea29723"}, + {file = "propcache-0.3.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:89498dd49c2f9a026ee057965cdf8192e5ae070ce7d7a7bd4b66a8e257d0c976"}, + {file = "propcache-0.3.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:09400e98545c998d57d10035ff623266927cb784d13dd2b31fd33b8a5316b85b"}, + {file = "propcache-0.3.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:aa8efd8c5adc5a2c9d3b952815ff8f7710cefdcaf5f2c36d26aff51aeca2f12f"}, + {file = "propcache-0.3.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c2fe5c910f6007e716a06d269608d307b4f36e7babee5f36533722660e8c4a70"}, + {file = "propcache-0.3.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a0ab8cf8cdd2194f8ff979a43ab43049b1df0b37aa64ab7eca04ac14429baeb7"}, + {file = 
"propcache-0.3.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:563f9d8c03ad645597b8d010ef4e9eab359faeb11a0a2ac9f7b4bc8c28ebef25"}, + {file = "propcache-0.3.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fb6e0faf8cb6b4beea5d6ed7b5a578254c6d7df54c36ccd3d8b3eb00d6770277"}, + {file = "propcache-0.3.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:1c5c7ab7f2bb3f573d1cb921993006ba2d39e8621019dffb1c5bc94cdbae81e8"}, + {file = "propcache-0.3.1-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:050b571b2e96ec942898f8eb46ea4bfbb19bd5502424747e83badc2d4a99a44e"}, + {file = "propcache-0.3.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:e1c4d24b804b3a87e9350f79e2371a705a188d292fd310e663483af6ee6718ee"}, + {file = "propcache-0.3.1-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:e4fe2a6d5ce975c117a6bb1e8ccda772d1e7029c1cca1acd209f91d30fa72815"}, + {file = "propcache-0.3.1-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:feccd282de1f6322f56f6845bf1207a537227812f0a9bf5571df52bb418d79d5"}, + {file = "propcache-0.3.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:ec314cde7314d2dd0510c6787326bbffcbdc317ecee6b7401ce218b3099075a7"}, + {file = "propcache-0.3.1-cp312-cp312-win32.whl", hash = "sha256:7d2d5a0028d920738372630870e7d9644ce437142197f8c827194fca404bf03b"}, + {file = "propcache-0.3.1-cp312-cp312-win_amd64.whl", hash = "sha256:88c423efef9d7a59dae0614eaed718449c09a5ac79a5f224a8b9664d603f04a3"}, + {file = "propcache-0.3.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:f1528ec4374617a7a753f90f20e2f551121bb558fcb35926f99e3c42367164b8"}, + {file = "propcache-0.3.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:dc1915ec523b3b494933b5424980831b636fe483d7d543f7afb7b3bf00f0c10f"}, + {file = "propcache-0.3.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:a110205022d077da24e60b3df8bcee73971be9575dec5573dd17ae5d81751111"}, + {file = "propcache-0.3.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d249609e547c04d190e820d0d4c8ca03ed4582bcf8e4e160a6969ddfb57b62e5"}, + {file = "propcache-0.3.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5ced33d827625d0a589e831126ccb4f5c29dfdf6766cac441d23995a65825dcb"}, + {file = "propcache-0.3.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4114c4ada8f3181af20808bedb250da6bae56660e4b8dfd9cd95d4549c0962f7"}, + {file = "propcache-0.3.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:975af16f406ce48f1333ec5e912fe11064605d5c5b3f6746969077cc3adeb120"}, + {file = "propcache-0.3.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a34aa3a1abc50740be6ac0ab9d594e274f59960d3ad253cd318af76b996dd654"}, + {file = "propcache-0.3.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:9cec3239c85ed15bfaded997773fdad9fb5662b0a7cbc854a43f291eb183179e"}, + {file = "propcache-0.3.1-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:05543250deac8e61084234d5fc54f8ebd254e8f2b39a16b1dce48904f45b744b"}, + {file = "propcache-0.3.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:5cb5918253912e088edbf023788de539219718d3b10aef334476b62d2b53de53"}, + {file = "propcache-0.3.1-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:f3bbecd2f34d0e6d3c543fdb3b15d6b60dd69970c2b4c822379e5ec8f6f621d5"}, + {file = "propcache-0.3.1-cp313-cp313-musllinux_1_2_s390x.whl", hash = 
"sha256:aca63103895c7d960a5b9b044a83f544b233c95e0dcff114389d64d762017af7"}, + {file = "propcache-0.3.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:5a0a9898fdb99bf11786265468571e628ba60af80dc3f6eb89a3545540c6b0ef"}, + {file = "propcache-0.3.1-cp313-cp313-win32.whl", hash = "sha256:3a02a28095b5e63128bcae98eb59025924f121f048a62393db682f049bf4ac24"}, + {file = "propcache-0.3.1-cp313-cp313-win_amd64.whl", hash = "sha256:813fbb8b6aea2fc9659815e585e548fe706d6f663fa73dff59a1677d4595a037"}, + {file = "propcache-0.3.1-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:a444192f20f5ce8a5e52761a031b90f5ea6288b1eef42ad4c7e64fef33540b8f"}, + {file = "propcache-0.3.1-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:0fbe94666e62ebe36cd652f5fc012abfbc2342de99b523f8267a678e4dfdee3c"}, + {file = "propcache-0.3.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:f011f104db880f4e2166bcdcf7f58250f7a465bc6b068dc84c824a3d4a5c94dc"}, + {file = "propcache-0.3.1-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3e584b6d388aeb0001d6d5c2bd86b26304adde6d9bb9bfa9c4889805021b96de"}, + {file = "propcache-0.3.1-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8a17583515a04358b034e241f952f1715243482fc2c2945fd99a1b03a0bd77d6"}, + {file = "propcache-0.3.1-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5aed8d8308215089c0734a2af4f2e95eeb360660184ad3912686c181e500b2e7"}, + {file = "propcache-0.3.1-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6d8e309ff9a0503ef70dc9a0ebd3e69cf7b3894c9ae2ae81fc10943c37762458"}, + {file = "propcache-0.3.1-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b655032b202028a582d27aeedc2e813299f82cb232f969f87a4fde491a233f11"}, + {file = "propcache-0.3.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:9f64d91b751df77931336b5ff7bafbe8845c5770b06630e27acd5dbb71e1931c"}, + {file = "propcache-0.3.1-cp313-cp313t-musllinux_1_2_armv7l.whl", hash = "sha256:19a06db789a4bd896ee91ebc50d059e23b3639c25d58eb35be3ca1cbe967c3bf"}, + {file = "propcache-0.3.1-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:bef100c88d8692864651b5f98e871fb090bd65c8a41a1cb0ff2322db39c96c27"}, + {file = "propcache-0.3.1-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:87380fb1f3089d2a0b8b00f006ed12bd41bd858fabfa7330c954c70f50ed8757"}, + {file = "propcache-0.3.1-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:e474fc718e73ba5ec5180358aa07f6aded0ff5f2abe700e3115c37d75c947e18"}, + {file = "propcache-0.3.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:17d1c688a443355234f3c031349da69444be052613483f3e4158eef751abcd8a"}, + {file = "propcache-0.3.1-cp313-cp313t-win32.whl", hash = "sha256:359e81a949a7619802eb601d66d37072b79b79c2505e6d3fd8b945538411400d"}, + {file = "propcache-0.3.1-cp313-cp313t-win_amd64.whl", hash = "sha256:e7fb9a84c9abbf2b2683fa3e7b0d7da4d8ecf139a1c635732a8bda29c5214b0e"}, + {file = "propcache-0.3.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:ed5f6d2edbf349bd8d630e81f474d33d6ae5d07760c44d33cd808e2f5c8f4ae6"}, + {file = "propcache-0.3.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:668ddddc9f3075af019f784456267eb504cb77c2c4bd46cc8402d723b4d200bf"}, + {file = "propcache-0.3.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:0c86e7ceea56376216eba345aa1fc6a8a6b27ac236181f840d1d7e6a1ea9ba5c"}, + {file = "propcache-0.3.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:83be47aa4e35b87c106fc0c84c0fc069d3f9b9b06d3c494cd404ec6747544894"}, + {file = "propcache-0.3.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:27c6ac6aa9fc7bc662f594ef380707494cb42c22786a558d95fcdedb9aa5d035"}, + {file = "propcache-0.3.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:64a956dff37080b352c1c40b2966b09defb014347043e740d420ca1eb7c9b908"}, + {file = "propcache-0.3.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:82de5da8c8893056603ac2d6a89eb8b4df49abf1a7c19d536984c8dd63f481d5"}, + {file = "propcache-0.3.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0c3c3a203c375b08fd06a20da3cf7aac293b834b6f4f4db71190e8422750cca5"}, + {file = "propcache-0.3.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:b303b194c2e6f171cfddf8b8ba30baefccf03d36a4d9cab7fd0bb68ba476a3d7"}, + {file = "propcache-0.3.1-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:916cd229b0150129d645ec51614d38129ee74c03293a9f3f17537be0029a9641"}, + {file = "propcache-0.3.1-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:a461959ead5b38e2581998700b26346b78cd98540b5524796c175722f18b0294"}, + {file = "propcache-0.3.1-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:069e7212890b0bcf9b2be0a03afb0c2d5161d91e1bf51569a64f629acc7defbf"}, + {file = "propcache-0.3.1-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:ef2e4e91fb3945769e14ce82ed53007195e616a63aa43b40fb7ebaaf907c8d4c"}, + {file = "propcache-0.3.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:8638f99dca15b9dff328fb6273e09f03d1c50d9b6512f3b65a4154588a7595fe"}, + {file = "propcache-0.3.1-cp39-cp39-win32.whl", hash = "sha256:6f173bbfe976105aaa890b712d1759de339d8a7cef2fc0a1714cc1a1e1c47f64"}, + {file = "propcache-0.3.1-cp39-cp39-win_amd64.whl", hash = "sha256:603f1fe4144420374f1a69b907494c3acbc867a581c2d49d4175b0de7cc64566"}, + {file = "propcache-0.3.1-py3-none-any.whl", hash = "sha256:9a8ecf38de50a7f518c21568c80f985e776397b902f1ce0b01f799aba1608b40"}, + {file = "propcache-0.3.1.tar.gz", hash = "sha256:40d980c33765359098837527e18eddefc9a24cea5b45e078a7f3bb5b032c6ecf"}, ] [[package]] name = "pydantic" -version = "2.5.3" +version = "2.11.3" description = "Data validation using Python type hints" optional = false -python-versions = ">=3.7" +python-versions = ">=3.9" +groups = ["main"] files = [ - {file = "pydantic-2.5.3-py3-none-any.whl", hash = "sha256:d0caf5954bee831b6bfe7e338c32b9e30c85dfe080c843680783ac2b631673b4"}, - {file = "pydantic-2.5.3.tar.gz", hash = "sha256:b3ef57c62535b0941697cce638c08900d87fcb67e29cfa99e8a68f747f393f7a"}, + {file = "pydantic-2.11.3-py3-none-any.whl", hash = "sha256:a082753436a07f9ba1289c6ffa01cd93db3548776088aa917cc43b63f68fa60f"}, + {file = "pydantic-2.11.3.tar.gz", hash = "sha256:7471657138c16adad9322fe3070c0116dd6c3ad8d649300e3cbdfe91f4db4ec3"}, ] [package.dependencies] -annotated-types = ">=0.4.0" -importlib-metadata = {version = "*", markers = "python_version == \"3.7\""} -pydantic-core = "2.14.6" -typing-extensions = ">=4.6.1" +annotated-types = ">=0.6.0" +pydantic-core = "2.33.1" +typing-extensions = ">=4.12.2" +typing-inspection = ">=0.4.0" [package.extras] email = ["email-validator (>=2.0.0)"] +timezone = ["tzdata"] [[package]] name = "pydantic-core" -version = "2.14.6" -description = "" +version = "2.33.1" +description = "Core functionality for Pydantic validation and serialization" optional = false -python-versions = ">=3.7" +python-versions = ">=3.9" +groups = ["main"] files = 
[ - {file = "pydantic_core-2.14.6-cp310-cp310-macosx_10_7_x86_64.whl", hash = "sha256:72f9a942d739f09cd42fffe5dc759928217649f070056f03c70df14f5770acf9"}, - {file = "pydantic_core-2.14.6-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:6a31d98c0d69776c2576dda4b77b8e0c69ad08e8b539c25c7d0ca0dc19a50d6c"}, - {file = "pydantic_core-2.14.6-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5aa90562bc079c6c290f0512b21768967f9968e4cfea84ea4ff5af5d917016e4"}, - {file = "pydantic_core-2.14.6-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:370ffecb5316ed23b667d99ce4debe53ea664b99cc37bfa2af47bc769056d534"}, - {file = "pydantic_core-2.14.6-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f85f3843bdb1fe80e8c206fe6eed7a1caeae897e496542cee499c374a85c6e08"}, - {file = "pydantic_core-2.14.6-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9862bf828112e19685b76ca499b379338fd4c5c269d897e218b2ae8fcb80139d"}, - {file = "pydantic_core-2.14.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:036137b5ad0cb0004c75b579445a1efccd072387a36c7f217bb8efd1afbe5245"}, - {file = "pydantic_core-2.14.6-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:92879bce89f91f4b2416eba4429c7b5ca22c45ef4a499c39f0c5c69257522c7c"}, - {file = "pydantic_core-2.14.6-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:0c08de15d50fa190d577e8591f0329a643eeaed696d7771760295998aca6bc66"}, - {file = "pydantic_core-2.14.6-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:36099c69f6b14fc2c49d7996cbf4f87ec4f0e66d1c74aa05228583225a07b590"}, - {file = "pydantic_core-2.14.6-cp310-none-win32.whl", hash = "sha256:7be719e4d2ae6c314f72844ba9d69e38dff342bc360379f7c8537c48e23034b7"}, - {file = "pydantic_core-2.14.6-cp310-none-win_amd64.whl", hash = "sha256:36fa402dcdc8ea7f1b0ddcf0df4254cc6b2e08f8cd80e7010d4c4ae6e86b2a87"}, - {file = "pydantic_core-2.14.6-cp311-cp311-macosx_10_7_x86_64.whl", hash = "sha256:dea7fcd62915fb150cdc373212141a30037e11b761fbced340e9db3379b892d4"}, - {file = "pydantic_core-2.14.6-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ffff855100bc066ff2cd3aa4a60bc9534661816b110f0243e59503ec2df38421"}, - {file = "pydantic_core-2.14.6-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1b027c86c66b8627eb90e57aee1f526df77dc6d8b354ec498be9a757d513b92b"}, - {file = "pydantic_core-2.14.6-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:00b1087dabcee0b0ffd104f9f53d7d3eaddfaa314cdd6726143af6bc713aa27e"}, - {file = "pydantic_core-2.14.6-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:75ec284328b60a4e91010c1acade0c30584f28a1f345bc8f72fe8b9e46ec6a96"}, - {file = "pydantic_core-2.14.6-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7e1f4744eea1501404b20b0ac059ff7e3f96a97d3e3f48ce27a139e053bb370b"}, - {file = "pydantic_core-2.14.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b2602177668f89b38b9f84b7b3435d0a72511ddef45dc14446811759b82235a1"}, - {file = "pydantic_core-2.14.6-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6c8edaea3089bf908dd27da8f5d9e395c5b4dc092dbcce9b65e7156099b4b937"}, - {file = "pydantic_core-2.14.6-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:478e9e7b360dfec451daafe286998d4a1eeaecf6d69c427b834ae771cad4b622"}, - {file = "pydantic_core-2.14.6-cp311-cp311-musllinux_1_1_x86_64.whl", hash = 
"sha256:b6ca36c12a5120bad343eef193cc0122928c5c7466121da7c20f41160ba00ba2"}, - {file = "pydantic_core-2.14.6-cp311-none-win32.whl", hash = "sha256:2b8719037e570639e6b665a4050add43134d80b687288ba3ade18b22bbb29dd2"}, - {file = "pydantic_core-2.14.6-cp311-none-win_amd64.whl", hash = "sha256:78ee52ecc088c61cce32b2d30a826f929e1708f7b9247dc3b921aec367dc1b23"}, - {file = "pydantic_core-2.14.6-cp311-none-win_arm64.whl", hash = "sha256:a19b794f8fe6569472ff77602437ec4430f9b2b9ec7a1105cfd2232f9ba355e6"}, - {file = "pydantic_core-2.14.6-cp312-cp312-macosx_10_7_x86_64.whl", hash = "sha256:667aa2eac9cd0700af1ddb38b7b1ef246d8cf94c85637cbb03d7757ca4c3fdec"}, - {file = "pydantic_core-2.14.6-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:cdee837710ef6b56ebd20245b83799fce40b265b3b406e51e8ccc5b85b9099b7"}, - {file = "pydantic_core-2.14.6-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2c5bcf3414367e29f83fd66f7de64509a8fd2368b1edf4351e862910727d3e51"}, - {file = "pydantic_core-2.14.6-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:26a92ae76f75d1915806b77cf459811e772d8f71fd1e4339c99750f0e7f6324f"}, - {file = "pydantic_core-2.14.6-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a983cca5ed1dd9a35e9e42ebf9f278d344603bfcb174ff99a5815f953925140a"}, - {file = "pydantic_core-2.14.6-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cb92f9061657287eded380d7dc455bbf115430b3aa4741bdc662d02977e7d0af"}, - {file = "pydantic_core-2.14.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e4ace1e220b078c8e48e82c081e35002038657e4b37d403ce940fa679e57113b"}, - {file = "pydantic_core-2.14.6-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:ef633add81832f4b56d3b4c9408b43d530dfca29e68fb1b797dcb861a2c734cd"}, - {file = "pydantic_core-2.14.6-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:7e90d6cc4aad2cc1f5e16ed56e46cebf4877c62403a311af20459c15da76fd91"}, - {file = "pydantic_core-2.14.6-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:e8a5ac97ea521d7bde7621d86c30e86b798cdecd985723c4ed737a2aa9e77d0c"}, - {file = "pydantic_core-2.14.6-cp312-none-win32.whl", hash = "sha256:f27207e8ca3e5e021e2402ba942e5b4c629718e665c81b8b306f3c8b1ddbb786"}, - {file = "pydantic_core-2.14.6-cp312-none-win_amd64.whl", hash = "sha256:b3e5fe4538001bb82e2295b8d2a39356a84694c97cb73a566dc36328b9f83b40"}, - {file = "pydantic_core-2.14.6-cp312-none-win_arm64.whl", hash = "sha256:64634ccf9d671c6be242a664a33c4acf12882670b09b3f163cd00a24cffbd74e"}, - {file = "pydantic_core-2.14.6-cp37-cp37m-macosx_10_7_x86_64.whl", hash = "sha256:24368e31be2c88bd69340fbfe741b405302993242ccb476c5c3ff48aeee1afe0"}, - {file = "pydantic_core-2.14.6-cp37-cp37m-macosx_11_0_arm64.whl", hash = "sha256:e33b0834f1cf779aa839975f9d8755a7c2420510c0fa1e9fa0497de77cd35d2c"}, - {file = "pydantic_core-2.14.6-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6af4b3f52cc65f8a0bc8b1cd9676f8c21ef3e9132f21fed250f6958bd7223bed"}, - {file = "pydantic_core-2.14.6-cp37-cp37m-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d15687d7d7f40333bd8266f3814c591c2e2cd263fa2116e314f60d82086e353a"}, - {file = "pydantic_core-2.14.6-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:095b707bb287bfd534044166ab767bec70a9bba3175dcdc3371782175c14e43c"}, - {file = "pydantic_core-2.14.6-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:94fc0e6621e07d1e91c44e016cc0b189b48db053061cc22d6298a611de8071bb"}, - {file = "pydantic_core-2.14.6-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1ce830e480f6774608dedfd4a90c42aac4a7af0a711f1b52f807130c2e434c06"}, - {file = "pydantic_core-2.14.6-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a306cdd2ad3a7d795d8e617a58c3a2ed0f76c8496fb7621b6cd514eb1532cae8"}, - {file = "pydantic_core-2.14.6-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:2f5fa187bde8524b1e37ba894db13aadd64faa884657473b03a019f625cee9a8"}, - {file = "pydantic_core-2.14.6-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:438027a975cc213a47c5d70672e0d29776082155cfae540c4e225716586be75e"}, - {file = "pydantic_core-2.14.6-cp37-none-win32.whl", hash = "sha256:f96ae96a060a8072ceff4cfde89d261837b4294a4f28b84a28765470d502ccc6"}, - {file = "pydantic_core-2.14.6-cp37-none-win_amd64.whl", hash = "sha256:e646c0e282e960345314f42f2cea5e0b5f56938c093541ea6dbf11aec2862391"}, - {file = "pydantic_core-2.14.6-cp38-cp38-macosx_10_7_x86_64.whl", hash = "sha256:db453f2da3f59a348f514cfbfeb042393b68720787bbef2b4c6068ea362c8149"}, - {file = "pydantic_core-2.14.6-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:3860c62057acd95cc84044e758e47b18dcd8871a328ebc8ccdefd18b0d26a21b"}, - {file = "pydantic_core-2.14.6-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:36026d8f99c58d7044413e1b819a67ca0e0b8ebe0f25e775e6c3d1fabb3c38fb"}, - {file = "pydantic_core-2.14.6-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:8ed1af8692bd8d2a29d702f1a2e6065416d76897d726e45a1775b1444f5928a7"}, - {file = "pydantic_core-2.14.6-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:314ccc4264ce7d854941231cf71b592e30d8d368a71e50197c905874feacc8a8"}, - {file = "pydantic_core-2.14.6-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:982487f8931067a32e72d40ab6b47b1628a9c5d344be7f1a4e668fb462d2da42"}, - {file = "pydantic_core-2.14.6-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2dbe357bc4ddda078f79d2a36fc1dd0494a7f2fad83a0a684465b6f24b46fe80"}, - {file = "pydantic_core-2.14.6-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2f6ffc6701a0eb28648c845f4945a194dc7ab3c651f535b81793251e1185ac3d"}, - {file = "pydantic_core-2.14.6-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:7f5025db12fc6de7bc1104d826d5aee1d172f9ba6ca936bf6474c2148ac336c1"}, - {file = "pydantic_core-2.14.6-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:dab03ed811ed1c71d700ed08bde8431cf429bbe59e423394f0f4055f1ca0ea60"}, - {file = "pydantic_core-2.14.6-cp38-none-win32.whl", hash = "sha256:dfcbebdb3c4b6f739a91769aea5ed615023f3c88cb70df812849aef634c25fbe"}, - {file = "pydantic_core-2.14.6-cp38-none-win_amd64.whl", hash = "sha256:99b14dbea2fdb563d8b5a57c9badfcd72083f6006caf8e126b491519c7d64ca8"}, - {file = "pydantic_core-2.14.6-cp39-cp39-macosx_10_7_x86_64.whl", hash = "sha256:4ce8299b481bcb68e5c82002b96e411796b844d72b3e92a3fbedfe8e19813eab"}, - {file = "pydantic_core-2.14.6-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b9a9d92f10772d2a181b5ca339dee066ab7d1c9a34ae2421b2a52556e719756f"}, - {file = "pydantic_core-2.14.6-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fd9e98b408384989ea4ab60206b8e100d8687da18b5c813c11e92fd8212a98e0"}, - {file = "pydantic_core-2.14.6-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = 
"sha256:4f86f1f318e56f5cbb282fe61eb84767aee743ebe32c7c0834690ebea50c0a6b"}, - {file = "pydantic_core-2.14.6-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:86ce5fcfc3accf3a07a729779d0b86c5d0309a4764c897d86c11089be61da160"}, - {file = "pydantic_core-2.14.6-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3dcf1978be02153c6a31692d4fbcc2a3f1db9da36039ead23173bc256ee3b91b"}, - {file = "pydantic_core-2.14.6-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eedf97be7bc3dbc8addcef4142f4b4164066df0c6f36397ae4aaed3eb187d8ab"}, - {file = "pydantic_core-2.14.6-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d5f916acf8afbcab6bacbb376ba7dc61f845367901ecd5e328fc4d4aef2fcab0"}, - {file = "pydantic_core-2.14.6-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:8a14c192c1d724c3acbfb3f10a958c55a2638391319ce8078cb36c02283959b9"}, - {file = "pydantic_core-2.14.6-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:0348b1dc6b76041516e8a854ff95b21c55f5a411c3297d2ca52f5528e49d8411"}, - {file = "pydantic_core-2.14.6-cp39-none-win32.whl", hash = "sha256:de2a0645a923ba57c5527497daf8ec5df69c6eadf869e9cd46e86349146e5975"}, - {file = "pydantic_core-2.14.6-cp39-none-win_amd64.whl", hash = "sha256:aca48506a9c20f68ee61c87f2008f81f8ee99f8d7f0104bff3c47e2d148f89d9"}, - {file = "pydantic_core-2.14.6-pp310-pypy310_pp73-macosx_10_7_x86_64.whl", hash = "sha256:d5c28525c19f5bb1e09511669bb57353d22b94cf8b65f3a8d141c389a55dec95"}, - {file = "pydantic_core-2.14.6-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:78d0768ee59baa3de0f4adac9e3748b4b1fffc52143caebddfd5ea2961595277"}, - {file = "pydantic_core-2.14.6-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8b93785eadaef932e4fe9c6e12ba67beb1b3f1e5495631419c784ab87e975670"}, - {file = "pydantic_core-2.14.6-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a874f21f87c485310944b2b2734cd6d318765bcbb7515eead33af9641816506e"}, - {file = "pydantic_core-2.14.6-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b89f4477d915ea43b4ceea6756f63f0288941b6443a2b28c69004fe07fde0d0d"}, - {file = "pydantic_core-2.14.6-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:172de779e2a153d36ee690dbc49c6db568d7b33b18dc56b69a7514aecbcf380d"}, - {file = "pydantic_core-2.14.6-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:dfcebb950aa7e667ec226a442722134539e77c575f6cfaa423f24371bb8d2e94"}, - {file = "pydantic_core-2.14.6-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:55a23dcd98c858c0db44fc5c04fc7ed81c4b4d33c653a7c45ddaebf6563a2f66"}, - {file = "pydantic_core-2.14.6-pp37-pypy37_pp73-macosx_10_7_x86_64.whl", hash = "sha256:4241204e4b36ab5ae466ecec5c4c16527a054c69f99bba20f6f75232a6a534e2"}, - {file = "pydantic_core-2.14.6-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e574de99d735b3fc8364cba9912c2bec2da78775eba95cbb225ef7dda6acea24"}, - {file = "pydantic_core-2.14.6-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1302a54f87b5cd8528e4d6d1bf2133b6aa7c6122ff8e9dc5220fbc1e07bffebd"}, - {file = "pydantic_core-2.14.6-pp37-pypy37_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f8e81e4b55930e5ffab4a68db1af431629cf2e4066dbdbfef65348b8ab804ea8"}, - {file = "pydantic_core-2.14.6-pp37-pypy37_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:c99462ffc538717b3e60151dfaf91125f637e801f5ab008f81c402f1dff0cd0f"}, - {file = 
"pydantic_core-2.14.6-pp37-pypy37_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:e4cf2d5829f6963a5483ec01578ee76d329eb5caf330ecd05b3edd697e7d768a"}, - {file = "pydantic_core-2.14.6-pp38-pypy38_pp73-macosx_10_7_x86_64.whl", hash = "sha256:cf10b7d58ae4a1f07fccbf4a0a956d705356fea05fb4c70608bb6fa81d103cda"}, - {file = "pydantic_core-2.14.6-pp38-pypy38_pp73-macosx_11_0_arm64.whl", hash = "sha256:399ac0891c284fa8eb998bcfa323f2234858f5d2efca3950ae58c8f88830f145"}, - {file = "pydantic_core-2.14.6-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9c6a5c79b28003543db3ba67d1df336f253a87d3112dac3a51b94f7d48e4c0e1"}, - {file = "pydantic_core-2.14.6-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:599c87d79cab2a6a2a9df4aefe0455e61e7d2aeede2f8577c1b7c0aec643ee8e"}, - {file = "pydantic_core-2.14.6-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:43e166ad47ba900f2542a80d83f9fc65fe99eb63ceec4debec160ae729824052"}, - {file = "pydantic_core-2.14.6-pp38-pypy38_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:3a0b5db001b98e1c649dd55afa928e75aa4087e587b9524a4992316fa23c9fba"}, - {file = "pydantic_core-2.14.6-pp38-pypy38_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:747265448cb57a9f37572a488a57d873fd96bf51e5bb7edb52cfb37124516da4"}, - {file = "pydantic_core-2.14.6-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:7ebe3416785f65c28f4f9441e916bfc8a54179c8dea73c23023f7086fa601c5d"}, - {file = "pydantic_core-2.14.6-pp39-pypy39_pp73-macosx_10_7_x86_64.whl", hash = "sha256:86c963186ca5e50d5c8287b1d1c9d3f8f024cbe343d048c5bd282aec2d8641f2"}, - {file = "pydantic_core-2.14.6-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:e0641b506486f0b4cd1500a2a65740243e8670a2549bb02bc4556a83af84ae03"}, - {file = "pydantic_core-2.14.6-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:71d72ca5eaaa8d38c8df16b7deb1a2da4f650c41b58bb142f3fb75d5ad4a611f"}, - {file = "pydantic_core-2.14.6-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:27e524624eace5c59af499cd97dc18bb201dc6a7a2da24bfc66ef151c69a5f2a"}, - {file = "pydantic_core-2.14.6-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a3dde6cac75e0b0902778978d3b1646ca9f438654395a362cb21d9ad34b24acf"}, - {file = "pydantic_core-2.14.6-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:00646784f6cd993b1e1c0e7b0fdcbccc375d539db95555477771c27555e3c556"}, - {file = "pydantic_core-2.14.6-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:23598acb8ccaa3d1d875ef3b35cb6376535095e9405d91a3d57a8c7db5d29341"}, - {file = "pydantic_core-2.14.6-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:7f41533d7e3cf9520065f610b41ac1c76bc2161415955fbcead4981b22c7611e"}, - {file = "pydantic_core-2.14.6.tar.gz", hash = "sha256:1fd0c1d395372843fba13a51c28e3bb9d59bd7aebfeb17358ffaaa1e4dbbe948"}, + {file = "pydantic_core-2.33.1-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:3077cfdb6125cc8dab61b155fdd714663e401f0e6883f9632118ec12cf42df26"}, + {file = "pydantic_core-2.33.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8ffab8b2908d152e74862d276cf5017c81a2f3719f14e8e3e8d6b83fda863927"}, + {file = "pydantic_core-2.33.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5183e4f6a2d468787243ebcd70cf4098c247e60d73fb7d68d5bc1e1beaa0c4db"}, + {file = "pydantic_core-2.33.1-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = 
"sha256:398a38d323f37714023be1e0285765f0a27243a8b1506b7b7de87b647b517e48"}, + {file = "pydantic_core-2.33.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:87d3776f0001b43acebfa86f8c64019c043b55cc5a6a2e313d728b5c95b46969"}, + {file = "pydantic_core-2.33.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c566dd9c5f63d22226409553531f89de0cac55397f2ab8d97d6f06cfce6d947e"}, + {file = "pydantic_core-2.33.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a0d5f3acc81452c56895e90643a625302bd6be351e7010664151cc55b7b97f89"}, + {file = "pydantic_core-2.33.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d3a07fadec2a13274a8d861d3d37c61e97a816beae717efccaa4b36dfcaadcde"}, + {file = "pydantic_core-2.33.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:f99aeda58dce827f76963ee87a0ebe75e648c72ff9ba1174a253f6744f518f65"}, + {file = "pydantic_core-2.33.1-cp310-cp310-musllinux_1_1_armv7l.whl", hash = "sha256:902dbc832141aa0ec374f4310f1e4e7febeebc3256f00dc359a9ac3f264a45dc"}, + {file = "pydantic_core-2.33.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:fe44d56aa0b00d66640aa84a3cbe80b7a3ccdc6f0b1ca71090696a6d4777c091"}, + {file = "pydantic_core-2.33.1-cp310-cp310-win32.whl", hash = "sha256:ed3eb16d51257c763539bde21e011092f127a2202692afaeaccb50db55a31383"}, + {file = "pydantic_core-2.33.1-cp310-cp310-win_amd64.whl", hash = "sha256:694ad99a7f6718c1a498dc170ca430687a39894a60327f548e02a9c7ee4b6504"}, + {file = "pydantic_core-2.33.1-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:6e966fc3caaf9f1d96b349b0341c70c8d6573bf1bac7261f7b0ba88f96c56c24"}, + {file = "pydantic_core-2.33.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:bfd0adeee563d59c598ceabddf2c92eec77abcb3f4a391b19aa7366170bd9e30"}, + {file = "pydantic_core-2.33.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:91815221101ad3c6b507804178a7bb5cb7b2ead9ecd600041669c8d805ebd595"}, + {file = "pydantic_core-2.33.1-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:9fea9c1869bb4742d174a57b4700c6dadea951df8b06de40c2fedb4f02931c2e"}, + {file = "pydantic_core-2.33.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1d20eb4861329bb2484c021b9d9a977566ab16d84000a57e28061151c62b349a"}, + {file = "pydantic_core-2.33.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0fb935c5591573ae3201640579f30128ccc10739b45663f93c06796854405505"}, + {file = "pydantic_core-2.33.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c964fd24e6166420d18fb53996d8c9fd6eac9bf5ae3ec3d03015be4414ce497f"}, + {file = "pydantic_core-2.33.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:681d65e9011f7392db5aa002b7423cc442d6a673c635668c227c6c8d0e5a4f77"}, + {file = "pydantic_core-2.33.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:e100c52f7355a48413e2999bfb4e139d2977a904495441b374f3d4fb4a170961"}, + {file = "pydantic_core-2.33.1-cp311-cp311-musllinux_1_1_armv7l.whl", hash = "sha256:048831bd363490be79acdd3232f74a0e9951b11b2b4cc058aeb72b22fdc3abe1"}, + {file = "pydantic_core-2.33.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:bdc84017d28459c00db6f918a7272a5190bec3090058334e43a76afb279eac7c"}, + {file = "pydantic_core-2.33.1-cp311-cp311-win32.whl", hash = "sha256:32cd11c5914d1179df70406427097c7dcde19fddf1418c787540f4b730289896"}, + {file = "pydantic_core-2.33.1-cp311-cp311-win_amd64.whl", hash = 
"sha256:2ea62419ba8c397e7da28a9170a16219d310d2cf4970dbc65c32faf20d828c83"}, + {file = "pydantic_core-2.33.1-cp311-cp311-win_arm64.whl", hash = "sha256:fc903512177361e868bc1f5b80ac8c8a6e05fcdd574a5fb5ffeac5a9982b9e89"}, + {file = "pydantic_core-2.33.1-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:1293d7febb995e9d3ec3ea09caf1a26214eec45b0f29f6074abb004723fc1de8"}, + {file = "pydantic_core-2.33.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:99b56acd433386c8f20be5c4000786d1e7ca0523c8eefc995d14d79c7a081498"}, + {file = "pydantic_core-2.33.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:35a5ec3fa8c2fe6c53e1b2ccc2454398f95d5393ab398478f53e1afbbeb4d939"}, + {file = "pydantic_core-2.33.1-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:b172f7b9d2f3abc0efd12e3386f7e48b576ef309544ac3a63e5e9cdd2e24585d"}, + {file = "pydantic_core-2.33.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9097b9f17f91eea659b9ec58148c0747ec354a42f7389b9d50701610d86f812e"}, + {file = "pydantic_core-2.33.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cc77ec5b7e2118b152b0d886c7514a4653bcb58c6b1d760134a9fab915f777b3"}, + {file = "pydantic_core-2.33.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d5e3d15245b08fa4a84cefc6c9222e6f37c98111c8679fbd94aa145f9a0ae23d"}, + {file = "pydantic_core-2.33.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:ef99779001d7ac2e2461d8ab55d3373fe7315caefdbecd8ced75304ae5a6fc6b"}, + {file = "pydantic_core-2.33.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:fc6bf8869e193855e8d91d91f6bf59699a5cdfaa47a404e278e776dd7f168b39"}, + {file = "pydantic_core-2.33.1-cp312-cp312-musllinux_1_1_armv7l.whl", hash = "sha256:b1caa0bc2741b043db7823843e1bde8aaa58a55a58fda06083b0569f8b45693a"}, + {file = "pydantic_core-2.33.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:ec259f62538e8bf364903a7d0d0239447059f9434b284f5536e8402b7dd198db"}, + {file = "pydantic_core-2.33.1-cp312-cp312-win32.whl", hash = "sha256:e14f369c98a7c15772b9da98987f58e2b509a93235582838bd0d1d8c08b68fda"}, + {file = "pydantic_core-2.33.1-cp312-cp312-win_amd64.whl", hash = "sha256:1c607801d85e2e123357b3893f82c97a42856192997b95b4d8325deb1cd0c5f4"}, + {file = "pydantic_core-2.33.1-cp312-cp312-win_arm64.whl", hash = "sha256:8d13f0276806ee722e70a1c93da19748594f19ac4299c7e41237fc791d1861ea"}, + {file = "pydantic_core-2.33.1-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:70af6a21237b53d1fe7b9325b20e65cbf2f0a848cf77bed492b029139701e66a"}, + {file = "pydantic_core-2.33.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:282b3fe1bbbe5ae35224a0dbd05aed9ccabccd241e8e6b60370484234b456266"}, + {file = "pydantic_core-2.33.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4b315e596282bbb5822d0c7ee9d255595bd7506d1cb20c2911a4da0b970187d3"}, + {file = "pydantic_core-2.33.1-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:1dfae24cf9921875ca0ca6a8ecb4bb2f13c855794ed0d468d6abbec6e6dcd44a"}, + {file = "pydantic_core-2.33.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6dd8ecfde08d8bfadaea669e83c63939af76f4cf5538a72597016edfa3fad516"}, + {file = "pydantic_core-2.33.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2f593494876eae852dc98c43c6f260f45abdbfeec9e4324e31a481d948214764"}, + {file = 
"pydantic_core-2.33.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:948b73114f47fd7016088e5186d13faf5e1b2fe83f5e320e371f035557fd264d"}, + {file = "pydantic_core-2.33.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:e11f3864eb516af21b01e25fac915a82e9ddad3bb0fb9e95a246067398b435a4"}, + {file = "pydantic_core-2.33.1-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:549150be302428b56fdad0c23c2741dcdb5572413776826c965619a25d9c6bde"}, + {file = "pydantic_core-2.33.1-cp313-cp313-musllinux_1_1_armv7l.whl", hash = "sha256:495bc156026efafd9ef2d82372bd38afce78ddd82bf28ef5276c469e57c0c83e"}, + {file = "pydantic_core-2.33.1-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:ec79de2a8680b1a67a07490bddf9636d5c2fab609ba8c57597e855fa5fa4dacd"}, + {file = "pydantic_core-2.33.1-cp313-cp313-win32.whl", hash = "sha256:ee12a7be1742f81b8a65b36c6921022301d466b82d80315d215c4c691724986f"}, + {file = "pydantic_core-2.33.1-cp313-cp313-win_amd64.whl", hash = "sha256:ede9b407e39949d2afc46385ce6bd6e11588660c26f80576c11c958e6647bc40"}, + {file = "pydantic_core-2.33.1-cp313-cp313-win_arm64.whl", hash = "sha256:aa687a23d4b7871a00e03ca96a09cad0f28f443690d300500603bd0adba4b523"}, + {file = "pydantic_core-2.33.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:401d7b76e1000d0dd5538e6381d28febdcacb097c8d340dde7d7fc6e13e9f95d"}, + {file = "pydantic_core-2.33.1-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7aeb055a42d734c0255c9e489ac67e75397d59c6fbe60d155851e9782f276a9c"}, + {file = "pydantic_core-2.33.1-cp313-cp313t-win_amd64.whl", hash = "sha256:338ea9b73e6e109f15ab439e62cb3b78aa752c7fd9536794112e14bee02c8d18"}, + {file = "pydantic_core-2.33.1-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:5ab77f45d33d264de66e1884fca158bc920cb5e27fd0764a72f72f5756ae8bdb"}, + {file = "pydantic_core-2.33.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:e7aaba1b4b03aaea7bb59e1b5856d734be011d3e6d98f5bcaa98cb30f375f2ad"}, + {file = "pydantic_core-2.33.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7fb66263e9ba8fea2aa85e1e5578980d127fb37d7f2e292773e7bc3a38fb0c7b"}, + {file = "pydantic_core-2.33.1-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3f2648b9262607a7fb41d782cc263b48032ff7a03a835581abbf7a3bec62bcf5"}, + {file = "pydantic_core-2.33.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:723c5630c4259400818b4ad096735a829074601805d07f8cafc366d95786d331"}, + {file = "pydantic_core-2.33.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d100e3ae783d2167782391e0c1c7a20a31f55f8015f3293647544df3f9c67824"}, + {file = "pydantic_core-2.33.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:177d50460bc976a0369920b6c744d927b0ecb8606fb56858ff542560251b19e5"}, + {file = "pydantic_core-2.33.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a3edde68d1a1f9af1273b2fe798997b33f90308fb6d44d8550c89fc6a3647cf6"}, + {file = "pydantic_core-2.33.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:a62c3c3ef6a7e2c45f7853b10b5bc4ddefd6ee3cd31024754a1a5842da7d598d"}, + {file = "pydantic_core-2.33.1-cp39-cp39-musllinux_1_1_armv7l.whl", hash = "sha256:c91dbb0ab683fa0cd64a6e81907c8ff41d6497c346890e26b23de7ee55353f96"}, + {file = "pydantic_core-2.33.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:9f466e8bf0a62dc43e068c12166281c2eca72121dd2adc1040f3aa1e21ef8599"}, + {file = "pydantic_core-2.33.1-cp39-cp39-win32.whl", hash = 
"sha256:ab0277cedb698749caada82e5d099dc9fed3f906a30d4c382d1a21725777a1e5"}, + {file = "pydantic_core-2.33.1-cp39-cp39-win_amd64.whl", hash = "sha256:5773da0ee2d17136b1f1c6fbde543398d452a6ad2a7b54ea1033e2daa739b8d2"}, + {file = "pydantic_core-2.33.1-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:5c834f54f8f4640fd7e4b193f80eb25a0602bba9e19b3cd2fc7ffe8199f5ae02"}, + {file = "pydantic_core-2.33.1-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:049e0de24cf23766f12cc5cc71d8abc07d4a9deb9061b334b62093dedc7cb068"}, + {file = "pydantic_core-2.33.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1a28239037b3d6f16916a4c831a5a0eadf856bdd6d2e92c10a0da3a59eadcf3e"}, + {file = "pydantic_core-2.33.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9d3da303ab5f378a268fa7d45f37d7d85c3ec19769f28d2cc0c61826a8de21fe"}, + {file = "pydantic_core-2.33.1-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:25626fb37b3c543818c14821afe0fd3830bc327a43953bc88db924b68c5723f1"}, + {file = "pydantic_core-2.33.1-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:3ab2d36e20fbfcce8f02d73c33a8a7362980cff717926bbae030b93ae46b56c7"}, + {file = "pydantic_core-2.33.1-pp310-pypy310_pp73-musllinux_1_1_armv7l.whl", hash = "sha256:2f9284e11c751b003fd4215ad92d325d92c9cb19ee6729ebd87e3250072cdcde"}, + {file = "pydantic_core-2.33.1-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:048c01eee07d37cbd066fc512b9d8b5ea88ceeb4e629ab94b3e56965ad655add"}, + {file = "pydantic_core-2.33.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:5ccd429694cf26af7997595d627dd2637e7932214486f55b8a357edaac9dae8c"}, + {file = "pydantic_core-2.33.1-pp311-pypy311_pp73-macosx_10_12_x86_64.whl", hash = "sha256:3a371dc00282c4b84246509a5ddc808e61b9864aa1eae9ecc92bb1268b82db4a"}, + {file = "pydantic_core-2.33.1-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:f59295ecc75a1788af8ba92f2e8c6eeaa5a94c22fc4d151e8d9638814f85c8fc"}, + {file = "pydantic_core-2.33.1-pp311-pypy311_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:08530b8ac922003033f399128505f513e30ca770527cc8bbacf75a84fcc2c74b"}, + {file = "pydantic_core-2.33.1-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bae370459da6a5466978c0eacf90690cb57ec9d533f8e63e564ef3822bfa04fe"}, + {file = "pydantic_core-2.33.1-pp311-pypy311_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:e3de2777e3b9f4d603112f78006f4ae0acb936e95f06da6cb1a45fbad6bdb4b5"}, + {file = "pydantic_core-2.33.1-pp311-pypy311_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:3a64e81e8cba118e108d7126362ea30e021291b7805d47e4896e52c791be2761"}, + {file = "pydantic_core-2.33.1-pp311-pypy311_pp73-musllinux_1_1_armv7l.whl", hash = "sha256:52928d8c1b6bda03cc6d811e8923dffc87a2d3c8b3bfd2ce16471c7147a24850"}, + {file = "pydantic_core-2.33.1-pp311-pypy311_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:1b30d92c9412beb5ac6b10a3eb7ef92ccb14e3f2a8d7732e2d739f58b3aa7544"}, + {file = "pydantic_core-2.33.1-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:f995719707e0e29f0f41a8aa3bcea6e761a36c9136104d3189eafb83f5cec5e5"}, + {file = "pydantic_core-2.33.1-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:7edbc454a29fc6aeae1e1eecba4f07b63b8d76e76a748532233c4c167b4cb9ea"}, + {file = "pydantic_core-2.33.1-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:ad05b683963f69a1d5d2c2bdab1274a31221ca737dbbceaa32bcb67359453cdd"}, + {file = 
"pydantic_core-2.33.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:df6a94bf9452c6da9b5d76ed229a5683d0306ccb91cca8e1eea883189780d568"}, + {file = "pydantic_core-2.33.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7965c13b3967909a09ecc91f21d09cfc4576bf78140b988904e94f130f188396"}, + {file = "pydantic_core-2.33.1-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:3f1fdb790440a34f6ecf7679e1863b825cb5ffde858a9197f851168ed08371e5"}, + {file = "pydantic_core-2.33.1-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:5277aec8d879f8d05168fdd17ae811dd313b8ff894aeeaf7cd34ad28b4d77e33"}, + {file = "pydantic_core-2.33.1-pp39-pypy39_pp73-musllinux_1_1_armv7l.whl", hash = "sha256:8ab581d3530611897d863d1a649fb0644b860286b4718db919bfd51ece41f10b"}, + {file = "pydantic_core-2.33.1-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:0483847fa9ad5e3412265c1bd72aad35235512d9ce9d27d81a56d935ef489672"}, + {file = "pydantic_core-2.33.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:de9e06abe3cc5ec6a2d5f75bc99b0bdca4f5c719a5b34026f8c57efbdecd2ee3"}, + {file = "pydantic_core-2.33.1.tar.gz", hash = "sha256:bcc9c6fdb0ced789245b02b7d6603e17d1563064ddcfc36f046b61c0c05dd9df"}, ] [package.dependencies] @@ -844,134 +1009,139 @@ typing-extensions = ">=4.6.0,<4.7.0 || >4.7.0" [[package]] name = "pytest" -version = "6.2.5" +version = "8.3.5" description = "pytest: simple powerful testing with Python" optional = false -python-versions = ">=3.6" +python-versions = ">=3.8" +groups = ["dev"] files = [ - {file = "pytest-6.2.5-py3-none-any.whl", hash = "sha256:7310f8d27bc79ced999e760ca304d69f6ba6c6649c0b60fb0e04a4a77cacc134"}, - {file = "pytest-6.2.5.tar.gz", hash = "sha256:131b36680866a76e6781d13f101efb86cf674ebb9762eb70d3082b6f29889e89"}, + {file = "pytest-8.3.5-py3-none-any.whl", hash = "sha256:c69214aa47deac29fad6c2a4f590b9c4a9fdb16a403176fe154b79c0b4d4d820"}, + {file = "pytest-8.3.5.tar.gz", hash = "sha256:f4efe70cc14e511565ac476b57c279e12a855b11f48f212af1080ef2263d3845"}, ] [package.dependencies] -atomicwrites = {version = ">=1.0", markers = "sys_platform == \"win32\""} -attrs = ">=19.2.0" colorama = {version = "*", markers = "sys_platform == \"win32\""} -importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""} +exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""} iniconfig = "*" packaging = "*" -pluggy = ">=0.12,<2.0" -py = ">=1.8.2" -toml = "*" +pluggy = ">=1.5,<2" +tomli = {version = ">=1", markers = "python_version < \"3.11\""} [package.extras] -testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "requests", "xmlschema"] +dev = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] [[package]] name = "pytest-asyncio" -version = "0.17.2" +version = "0.26.0" description = "Pytest support for asyncio" optional = false -python-versions = ">=3.7" +python-versions = ">=3.9" +groups = ["dev"] files = [ - {file = "pytest-asyncio-0.17.2.tar.gz", hash = "sha256:6d895b02432c028e6957d25fc936494e78c6305736e785d9fee408b1efbc7ff4"}, - {file = "pytest_asyncio-0.17.2-py3-none-any.whl", hash = "sha256:e0fe5dbea40516b661ef1bcfe0bd9461c2847c4ef4bb40012324f2454fb7d56d"}, + {file = "pytest_asyncio-0.26.0-py3-none-any.whl", hash = "sha256:7b51ed894f4fbea1340262bdae5135797ebbe21d8638978e35d31c6d19f72fb0"}, + {file = "pytest_asyncio-0.26.0.tar.gz", hash = 
"sha256:c4df2a697648241ff39e7f0e4a73050b03f123f760673956cf0d72a4990e312f"}, ] [package.dependencies] -pytest = ">=6.1.0" -typing-extensions = {version = ">=4.0", markers = "python_version < \"3.8\""} +pytest = ">=8.2,<9" +typing-extensions = {version = ">=4.12", markers = "python_version < \"3.10\""} [package.extras] -testing = ["coverage (==6.2)", "flaky (>=3.5.0)", "hypothesis (>=5.7.1)", "mypy (==0.931)"] +docs = ["sphinx (>=5.3)", "sphinx-rtd-theme (>=1)"] +testing = ["coverage (>=6.2)", "hypothesis (>=5.7.1)"] [[package]] name = "pytest-cov" -version = "3.0.0" +version = "6.1.1" description = "Pytest plugin for measuring coverage." optional = false -python-versions = ">=3.6" +python-versions = ">=3.9" +groups = ["dev"] files = [ - {file = "pytest-cov-3.0.0.tar.gz", hash = "sha256:e7f0f5b1617d2210a2cabc266dfe2f4c75a8d32fb89eafb7ad9d06f6d076d470"}, - {file = "pytest_cov-3.0.0-py3-none-any.whl", hash = "sha256:578d5d15ac4a25e5f961c938b85a05b09fdaae9deef3bb6de9a6e766622ca7a6"}, + {file = "pytest_cov-6.1.1-py3-none-any.whl", hash = "sha256:bddf29ed2d0ab6f4df17b4c55b0a657287db8684af9c42ea546b21b1041b3dde"}, + {file = "pytest_cov-6.1.1.tar.gz", hash = "sha256:46935f7aaefba760e716c2ebfbe1c216240b9592966e7da99ea8292d4d3e2a0a"}, ] [package.dependencies] -coverage = {version = ">=5.2.1", extras = ["toml"]} +coverage = {version = ">=7.5", extras = ["toml"]} pytest = ">=4.6" [package.extras] -testing = ["fields", "hunter", "process-tests", "pytest-xdist", "six", "virtualenv"] +testing = ["fields", "hunter", "process-tests", "pytest-xdist", "virtualenv"] [[package]] name = "pyyaml" -version = "6.0.1" +version = "6.0.2" description = "YAML parser and emitter for Python" optional = false -python-versions = ">=3.6" +python-versions = ">=3.8" +groups = ["main"] files = [ - {file = "PyYAML-6.0.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d858aa552c999bc8a8d57426ed01e40bef403cd8ccdd0fc5f6f04a00414cac2a"}, - {file = "PyYAML-6.0.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:fd66fc5d0da6d9815ba2cebeb4205f95818ff4b79c3ebe268e75d961704af52f"}, - {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:69b023b2b4daa7548bcfbd4aa3da05b3a74b772db9e23b982788168117739938"}, - {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:81e0b275a9ecc9c0c0c07b4b90ba548307583c125f54d5b6946cfee6360c733d"}, - {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515"}, - {file = "PyYAML-6.0.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:326c013efe8048858a6d312ddd31d56e468118ad4cdeda36c719bf5bb6192290"}, - {file = "PyYAML-6.0.1-cp310-cp310-win32.whl", hash = "sha256:bd4af7373a854424dabd882decdc5579653d7868b8fb26dc7d0e99f823aa5924"}, - {file = "PyYAML-6.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d"}, - {file = "PyYAML-6.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6965a7bc3cf88e5a1c3bd2e0b5c22f8d677dc88a455344035f03399034eb3007"}, - {file = "PyYAML-6.0.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f003ed9ad21d6a4713f0a9b5a7a0a79e08dd0f221aff4525a2be4c346ee60aab"}, - {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42f8152b8dbc4fe7d96729ec2b99c7097d656dc1213a3229ca5383f973a5ed6d"}, - {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc"}, - {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d2b04aac4d386b172d5b9692e2d2da8de7bfb6c387fa4f801fbf6fb2e6ba4673"}, - {file = "PyYAML-6.0.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e7d73685e87afe9f3b36c799222440d6cf362062f78be1013661b00c5c6f678b"}, - {file = "PyYAML-6.0.1-cp311-cp311-win32.whl", hash = "sha256:1635fd110e8d85d55237ab316b5b011de701ea0f29d07611174a1b42f1444741"}, - {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, - {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, - {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, - {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, - {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, - {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, - {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, - {file = "PyYAML-6.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:0d3304d8c0adc42be59c5f8a4d9e3d7379e6955ad754aa9d6ab7a398b59dd1df"}, - {file = "PyYAML-6.0.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47"}, - {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1fe35611261b29bd1de0070f0b2f47cb6ff71fa6595c077e42bd0c419fa27b98"}, - {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:704219a11b772aea0d8ecd7058d0082713c3562b4e271b849ad7dc4a5c90c13c"}, - {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:afd7e57eddb1a54f0f1a974bc4391af8bcce0b444685d936840f125cf046d5bd"}, - {file = "PyYAML-6.0.1-cp36-cp36m-win32.whl", hash = "sha256:fca0e3a251908a499833aa292323f32437106001d436eca0e6e7833256674585"}, - {file = "PyYAML-6.0.1-cp36-cp36m-win_amd64.whl", hash = "sha256:f22ac1c3cac4dbc50079e965eba2c1058622631e526bd9afd45fedd49ba781fa"}, - {file = "PyYAML-6.0.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:b1275ad35a5d18c62a7220633c913e1b42d44b46ee12554e5fd39c70a243d6a3"}, - {file = "PyYAML-6.0.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:18aeb1bf9a78867dc38b259769503436b7c72f7a1f1f4c93ff9a17de54319b27"}, - {file = "PyYAML-6.0.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:596106435fa6ad000c2991a98fa58eeb8656ef2325d7e158344fb33864ed87e3"}, - {file = "PyYAML-6.0.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:baa90d3f661d43131ca170712d903e6295d1f7a0f595074f151c0aed377c9b9c"}, - {file = "PyYAML-6.0.1-cp37-cp37m-win32.whl", hash = "sha256:9046c58c4395dff28dd494285c82ba00b546adfc7ef001486fbf0324bc174fba"}, - {file = "PyYAML-6.0.1-cp37-cp37m-win_amd64.whl", hash = "sha256:4fb147e7a67ef577a588a0e2c17b6db51dda102c71de36f8549b6816a96e1867"}, - {file = 
"PyYAML-6.0.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1d4c7e777c441b20e32f52bd377e0c409713e8bb1386e1099c2415f26e479595"}, - {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a0cd17c15d3bb3fa06978b4e8958dcdc6e0174ccea823003a106c7d4d7899ac5"}, - {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:28c119d996beec18c05208a8bd78cbe4007878c6dd15091efb73a30e90539696"}, - {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7e07cbde391ba96ab58e532ff4803f79c4129397514e1413a7dc761ccd755735"}, - {file = "PyYAML-6.0.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:49a183be227561de579b4a36efbb21b3eab9651dd81b1858589f796549873dd6"}, - {file = "PyYAML-6.0.1-cp38-cp38-win32.whl", hash = "sha256:184c5108a2aca3c5b3d3bf9395d50893a7ab82a38004c8f61c258d4428e80206"}, - {file = "PyYAML-6.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:1e2722cc9fbb45d9b87631ac70924c11d3a401b2d7f410cc0e3bbf249f2dca62"}, - {file = "PyYAML-6.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9eb6caa9a297fc2c2fb8862bc5370d0303ddba53ba97e71f08023b6cd73d16a8"}, - {file = "PyYAML-6.0.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c8098ddcc2a85b61647b2590f825f3db38891662cfc2fc776415143f599bb859"}, - {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5773183b6446b2c99bb77e77595dd486303b4faab2b086e7b17bc6bef28865f6"}, - {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b786eecbdf8499b9ca1d697215862083bd6d2a99965554781d0d8d1ad31e13a0"}, - {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc1bf2925a1ecd43da378f4db9e4f799775d6367bdb94671027b73b393a7c42c"}, - {file = "PyYAML-6.0.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5"}, - {file = "PyYAML-6.0.1-cp39-cp39-win32.whl", hash = "sha256:faca3bdcf85b2fc05d06ff3fbc1f83e1391b3e724afa3feba7d13eeab355484c"}, - {file = "PyYAML-6.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:510c9deebc5c0225e8c96813043e62b680ba2f9c50a08d3724c7f28a747d1486"}, - {file = "PyYAML-6.0.1.tar.gz", hash = "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43"}, + {file = "PyYAML-6.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0a9a2848a5b7feac301353437eb7d5957887edbf81d56e903999a75a3d743086"}, + {file = "PyYAML-6.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:29717114e51c84ddfba879543fb232a6ed60086602313ca38cce623c1d62cfbf"}, + {file = "PyYAML-6.0.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8824b5a04a04a047e72eea5cec3bc266db09e35de6bdfe34c9436ac5ee27d237"}, + {file = "PyYAML-6.0.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7c36280e6fb8385e520936c3cb3b8042851904eba0e58d277dca80a5cfed590b"}, + {file = "PyYAML-6.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ec031d5d2feb36d1d1a24380e4db6d43695f3748343d99434e6f5f9156aaa2ed"}, + {file = "PyYAML-6.0.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:936d68689298c36b53b29f23c6dbb74de12b4ac12ca6cfe0e047bedceea56180"}, + {file = "PyYAML-6.0.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:23502f431948090f597378482b4812b0caae32c22213aecf3b55325e049a6c68"}, + {file = "PyYAML-6.0.2-cp310-cp310-win32.whl", hash = "sha256:2e99c6826ffa974fe6e27cdb5ed0021786b03fc98e5ee3c5bfe1fd5015f42b99"}, 
+ {file = "PyYAML-6.0.2-cp310-cp310-win_amd64.whl", hash = "sha256:a4d3091415f010369ae4ed1fc6b79def9416358877534caf6a0fdd2146c87a3e"}, + {file = "PyYAML-6.0.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:cc1c1159b3d456576af7a3e4d1ba7e6924cb39de8f67111c735f6fc832082774"}, + {file = "PyYAML-6.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1e2120ef853f59c7419231f3bf4e7021f1b936f6ebd222406c3b60212205d2ee"}, + {file = "PyYAML-6.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5d225db5a45f21e78dd9358e58a98702a0302f2659a3c6cd320564b75b86f47c"}, + {file = "PyYAML-6.0.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5ac9328ec4831237bec75defaf839f7d4564be1e6b25ac710bd1a96321cc8317"}, + {file = "PyYAML-6.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ad2a3decf9aaba3d29c8f537ac4b243e36bef957511b4766cb0057d32b0be85"}, + {file = "PyYAML-6.0.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:ff3824dc5261f50c9b0dfb3be22b4567a6f938ccce4587b38952d85fd9e9afe4"}, + {file = "PyYAML-6.0.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:797b4f722ffa07cc8d62053e4cff1486fa6dc094105d13fea7b1de7d8bf71c9e"}, + {file = "PyYAML-6.0.2-cp311-cp311-win32.whl", hash = "sha256:11d8f3dd2b9c1207dcaf2ee0bbbfd5991f571186ec9cc78427ba5bd32afae4b5"}, + {file = "PyYAML-6.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:e10ce637b18caea04431ce14fabcf5c64a1c61ec9c56b071a4b7ca131ca52d44"}, + {file = "PyYAML-6.0.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:c70c95198c015b85feafc136515252a261a84561b7b1d51e3384e0655ddf25ab"}, + {file = "PyYAML-6.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ce826d6ef20b1bc864f0a68340c8b3287705cae2f8b4b1d932177dcc76721725"}, + {file = "PyYAML-6.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1f71ea527786de97d1a0cc0eacd1defc0985dcf6b3f17bb77dcfc8c34bec4dc5"}, + {file = "PyYAML-6.0.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9b22676e8097e9e22e36d6b7bda33190d0d400f345f23d4065d48f4ca7ae0425"}, + {file = "PyYAML-6.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:80bab7bfc629882493af4aa31a4cfa43a4c57c83813253626916b8c7ada83476"}, + {file = "PyYAML-6.0.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:0833f8694549e586547b576dcfaba4a6b55b9e96098b36cdc7ebefe667dfed48"}, + {file = "PyYAML-6.0.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8b9c7197f7cb2738065c481a0461e50ad02f18c78cd75775628afb4d7137fb3b"}, + {file = "PyYAML-6.0.2-cp312-cp312-win32.whl", hash = "sha256:ef6107725bd54b262d6dedcc2af448a266975032bc85ef0172c5f059da6325b4"}, + {file = "PyYAML-6.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:7e7401d0de89a9a855c839bc697c079a4af81cf878373abd7dc625847d25cbd8"}, + {file = "PyYAML-6.0.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:efdca5630322a10774e8e98e1af481aad470dd62c3170801852d752aa7a783ba"}, + {file = "PyYAML-6.0.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:50187695423ffe49e2deacb8cd10510bc361faac997de9efef88badc3bb9e2d1"}, + {file = "PyYAML-6.0.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0ffe8360bab4910ef1b9e87fb812d8bc0a308b0d0eef8c8f44e0254ab3b07133"}, + {file = "PyYAML-6.0.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:17e311b6c678207928d649faa7cb0d7b4c26a0ba73d41e99c4fff6b6c3276484"}, + {file = "PyYAML-6.0.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:70b189594dbe54f75ab3a1acec5f1e3faa7e8cf2f1e08d9b561cb41b845f69d5"}, + {file = "PyYAML-6.0.2-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:41e4e3953a79407c794916fa277a82531dd93aad34e29c2a514c2c0c5fe971cc"}, + {file = "PyYAML-6.0.2-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:68ccc6023a3400877818152ad9a1033e3db8625d899c72eacb5a668902e4d652"}, + {file = "PyYAML-6.0.2-cp313-cp313-win32.whl", hash = "sha256:bc2fa7c6b47d6bc618dd7fb02ef6fdedb1090ec036abab80d4681424b84c1183"}, + {file = "PyYAML-6.0.2-cp313-cp313-win_amd64.whl", hash = "sha256:8388ee1976c416731879ac16da0aff3f63b286ffdd57cdeb95f3f2e085687563"}, + {file = "PyYAML-6.0.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:24471b829b3bf607e04e88d79542a9d48bb037c2267d7927a874e6c205ca7e9a"}, + {file = "PyYAML-6.0.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d7fded462629cfa4b685c5416b949ebad6cec74af5e2d42905d41e257e0869f5"}, + {file = "PyYAML-6.0.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d84a1718ee396f54f3a086ea0a66d8e552b2ab2017ef8b420e92edbc841c352d"}, + {file = "PyYAML-6.0.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9056c1ecd25795207ad294bcf39f2db3d845767be0ea6e6a34d856f006006083"}, + {file = "PyYAML-6.0.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:82d09873e40955485746739bcb8b4586983670466c23382c19cffecbf1fd8706"}, + {file = "PyYAML-6.0.2-cp38-cp38-win32.whl", hash = "sha256:43fa96a3ca0d6b1812e01ced1044a003533c47f6ee8aca31724f78e93ccc089a"}, + {file = "PyYAML-6.0.2-cp38-cp38-win_amd64.whl", hash = "sha256:01179a4a8559ab5de078078f37e5c1a30d76bb88519906844fd7bdea1b7729ff"}, + {file = "PyYAML-6.0.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:688ba32a1cffef67fd2e9398a2efebaea461578b0923624778664cc1c914db5d"}, + {file = "PyYAML-6.0.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a8786accb172bd8afb8be14490a16625cbc387036876ab6ba70912730faf8e1f"}, + {file = "PyYAML-6.0.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d8e03406cac8513435335dbab54c0d385e4a49e4945d2909a581c83647ca0290"}, + {file = "PyYAML-6.0.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f753120cb8181e736c57ef7636e83f31b9c0d1722c516f7e86cf15b7aa57ff12"}, + {file = "PyYAML-6.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3b1fdb9dc17f5a7677423d508ab4f243a726dea51fa5e70992e59a7411c89d19"}, + {file = "PyYAML-6.0.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:0b69e4ce7a131fe56b7e4d770c67429700908fc0752af059838b1cfb41960e4e"}, + {file = "PyYAML-6.0.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:a9f8c2e67970f13b16084e04f134610fd1d374bf477b17ec1599185cf611d725"}, + {file = "PyYAML-6.0.2-cp39-cp39-win32.whl", hash = "sha256:6395c297d42274772abc367baaa79683958044e5d3835486c16da75d2a694631"}, + {file = "PyYAML-6.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:39693e1f8320ae4f43943590b49779ffb98acb81f788220ea932a6b6c51004d8"}, + {file = "pyyaml-6.0.2.tar.gz", hash = "sha256:d584d9ec91ad65861cc08d42e834324ef890a082e591037abe114850ff7bbc3e"}, ] [[package]] name = "requests" -version = "2.31.0" +version = "2.32.3" description = "Python HTTP for Humans." 
optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" +groups = ["main"] files = [ - {file = "requests-2.31.0-py3-none-any.whl", hash = "sha256:58cd2187c01e70e6e26505bca751777aa9f2ee0b7f4300988b709f44e013003f"}, - {file = "requests-2.31.0.tar.gz", hash = "sha256:942c5a758f98d790eaed1a29cb6eefc7ffb0d1cf7af05c3d2791656dbd6ad1e1"}, + {file = "requests-2.32.3-py3-none-any.whl", hash = "sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6"}, + {file = "requests-2.32.3.tar.gz", hash = "sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760"}, ] [package.dependencies] @@ -984,180 +1154,220 @@ urllib3 = ">=1.21.1,<3" socks = ["PySocks (>=1.5.6,!=1.5.7)"] use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] -[[package]] -name = "toml" -version = "0.10.2" -description = "Python Library for Tom's Obvious, Minimal Language" -optional = false -python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" -files = [ - {file = "toml-0.10.2-py2.py3-none-any.whl", hash = "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b"}, - {file = "toml-0.10.2.tar.gz", hash = "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"}, -] - [[package]] name = "tomli" -version = "2.0.1" +version = "2.2.1" description = "A lil' TOML parser" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" +groups = ["dev"] +markers = "python_full_version <= \"3.11.0a6\"" files = [ - {file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"}, - {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, + {file = "tomli-2.2.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:678e4fa69e4575eb77d103de3df8a895e1591b48e740211bd1067378c69e8249"}, + {file = "tomli-2.2.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:023aa114dd824ade0100497eb2318602af309e5a55595f76b626d6d9f3b7b0a6"}, + {file = "tomli-2.2.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ece47d672db52ac607a3d9599a9d48dcb2f2f735c6c2d1f34130085bb12b112a"}, + {file = "tomli-2.2.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6972ca9c9cc9f0acaa56a8ca1ff51e7af152a9f87fb64623e31d5c83700080ee"}, + {file = "tomli-2.2.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c954d2250168d28797dd4e3ac5cf812a406cd5a92674ee4c8f123c889786aa8e"}, + {file = "tomli-2.2.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:8dd28b3e155b80f4d54beb40a441d366adcfe740969820caf156c019fb5c7ec4"}, + {file = "tomli-2.2.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:e59e304978767a54663af13c07b3d1af22ddee3bb2fb0618ca1593e4f593a106"}, + {file = "tomli-2.2.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:33580bccab0338d00994d7f16f4c4ec25b776af3ffaac1ed74e0b3fc95e885a8"}, + {file = "tomli-2.2.1-cp311-cp311-win32.whl", hash = "sha256:465af0e0875402f1d226519c9904f37254b3045fc5084697cefb9bdde1ff99ff"}, + {file = "tomli-2.2.1-cp311-cp311-win_amd64.whl", hash = "sha256:2d0f2fdd22b02c6d81637a3c95f8cd77f995846af7414c5c4b8d0545afa1bc4b"}, + {file = "tomli-2.2.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:4a8f6e44de52d5e6c657c9fe83b562f5f4256d8ebbfe4ff922c495620a7f6cea"}, + {file = "tomli-2.2.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8d57ca8095a641b8237d5b079147646153d22552f1c637fd3ba7f4b0b29167a8"}, + {file = 
"tomli-2.2.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4e340144ad7ae1533cb897d406382b4b6fede8890a03738ff1683af800d54192"}, + {file = "tomli-2.2.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:db2b95f9de79181805df90bedc5a5ab4c165e6ec3fe99f970d0e302f384ad222"}, + {file = "tomli-2.2.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:40741994320b232529c802f8bc86da4e1aa9f413db394617b9a256ae0f9a7f77"}, + {file = "tomli-2.2.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:400e720fe168c0f8521520190686ef8ef033fb19fc493da09779e592861b78c6"}, + {file = "tomli-2.2.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:02abe224de6ae62c19f090f68da4e27b10af2b93213d36cf44e6e1c5abd19fdd"}, + {file = "tomli-2.2.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:b82ebccc8c8a36f2094e969560a1b836758481f3dc360ce9a3277c65f374285e"}, + {file = "tomli-2.2.1-cp312-cp312-win32.whl", hash = "sha256:889f80ef92701b9dbb224e49ec87c645ce5df3fa2cc548664eb8a25e03127a98"}, + {file = "tomli-2.2.1-cp312-cp312-win_amd64.whl", hash = "sha256:7fc04e92e1d624a4a63c76474610238576942d6b8950a2d7f908a340494e67e4"}, + {file = "tomli-2.2.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:f4039b9cbc3048b2416cc57ab3bda989a6fcf9b36cf8937f01a6e731b64f80d7"}, + {file = "tomli-2.2.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:286f0ca2ffeeb5b9bd4fcc8d6c330534323ec51b2f52da063b11c502da16f30c"}, + {file = "tomli-2.2.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a92ef1a44547e894e2a17d24e7557a5e85a9e1d0048b0b5e7541f76c5032cb13"}, + {file = "tomli-2.2.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9316dc65bed1684c9a98ee68759ceaed29d229e985297003e494aa825ebb0281"}, + {file = "tomli-2.2.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e85e99945e688e32d5a35c1ff38ed0b3f41f43fad8df0bdf79f72b2ba7bc5272"}, + {file = "tomli-2.2.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:ac065718db92ca818f8d6141b5f66369833d4a80a9d74435a268c52bdfa73140"}, + {file = "tomli-2.2.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:d920f33822747519673ee656a4b6ac33e382eca9d331c87770faa3eef562aeb2"}, + {file = "tomli-2.2.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:a198f10c4d1b1375d7687bc25294306e551bf1abfa4eace6650070a5c1ae2744"}, + {file = "tomli-2.2.1-cp313-cp313-win32.whl", hash = "sha256:d3f5614314d758649ab2ab3a62d4f2004c825922f9e370b29416484086b264ec"}, + {file = "tomli-2.2.1-cp313-cp313-win_amd64.whl", hash = "sha256:a38aa0308e754b0e3c67e344754dff64999ff9b513e691d0e786265c93583c69"}, + {file = "tomli-2.2.1-py3-none-any.whl", hash = "sha256:cb55c73c5f4408779d0cf3eef9f762b9c9f147a77de7b258bef0a5628adc85cc"}, + {file = "tomli-2.2.1.tar.gz", hash = "sha256:cd45e1dc79c835ce60f7404ec8119f2eb06d38b1deba146f07ced3bbc44505ff"}, ] [[package]] name = "tqdm" -version = "4.66.1" +version = "4.67.1" description = "Fast, Extensible Progress Meter" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ - {file = "tqdm-4.66.1-py3-none-any.whl", hash = "sha256:d302b3c5b53d47bce91fea46679d9c3c6508cf6332229aa1e7d8653723793386"}, - {file = "tqdm-4.66.1.tar.gz", hash = "sha256:d88e651f9db8d8551a62556d3cff9e3034274ca5d66e93197cf2490e2dcb69c7"}, + {file = "tqdm-4.67.1-py3-none-any.whl", hash = "sha256:26445eca388f82e72884e0d580d5464cd801a3ea01e63e5601bdff9ba6a48de2"}, + {file = 
"tqdm-4.67.1.tar.gz", hash = "sha256:f8aef9c52c08c13a65f30ea34f4e5aac3fd1a34959879d7e59e63027286627f2"}, ] [package.dependencies] colorama = {version = "*", markers = "platform_system == \"Windows\""} [package.extras] -dev = ["pytest (>=6)", "pytest-cov", "pytest-timeout", "pytest-xdist"] +dev = ["nbval", "pytest (>=6)", "pytest-asyncio (>=0.24)", "pytest-cov", "pytest-timeout"] +discord = ["requests"] notebook = ["ipywidgets (>=6)"] slack = ["slack-sdk"] telegram = ["requests"] [[package]] name = "typing-extensions" -version = "4.7.1" -description = "Backported and Experimental Type Hints for Python 3.7+" +version = "4.13.2" +description = "Backported and Experimental Type Hints for Python 3.8+" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" +groups = ["main", "dev"] files = [ - {file = "typing_extensions-4.7.1-py3-none-any.whl", hash = "sha256:440d5dd3af93b060174bf433bccd69b0babc3b15b1a8dca43789fd7f61514b36"}, - {file = "typing_extensions-4.7.1.tar.gz", hash = "sha256:b75ddc264f0ba5615db7ba217daeb99701ad295353c45f9e95963337ceeeffb2"}, + {file = "typing_extensions-4.13.2-py3-none-any.whl", hash = "sha256:a439e7c04b49fec3e5d3e2beaa21755cadbbdc391694e28ccdd36ca4a1408f8c"}, + {file = "typing_extensions-4.13.2.tar.gz", hash = "sha256:e6c81219bd689f51865d9e372991c540bda33a0379d5573cddb9a3a23f7caaef"}, ] +markers = {dev = "python_version < \"3.10\""} + +[[package]] +name = "typing-inspection" +version = "0.4.0" +description = "Runtime typing introspection tools" +optional = false +python-versions = ">=3.9" +groups = ["main"] +files = [ + {file = "typing_inspection-0.4.0-py3-none-any.whl", hash = "sha256:50e72559fcd2a6367a19f7a7e610e6afcb9fac940c650290eed893d61386832f"}, + {file = "typing_inspection-0.4.0.tar.gz", hash = "sha256:9765c87de36671694a67904bf2c96e395be9c6439bb6c87b5142569dcdd65122"}, +] + +[package.dependencies] +typing-extensions = ">=4.12.0" [[package]] name = "urllib3" -version = "2.0.5" +version = "2.4.0" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false -python-versions = ">=3.7" +python-versions = ">=3.9" +groups = ["main"] files = [ - {file = "urllib3-2.0.5-py3-none-any.whl", hash = "sha256:ef16afa8ba34a1f989db38e1dbbe0c302e4289a47856990d0682e374563ce35e"}, - {file = "urllib3-2.0.5.tar.gz", hash = "sha256:13abf37382ea2ce6fb744d4dad67838eec857c9f4f57009891805e0b5e123594"}, + {file = "urllib3-2.4.0-py3-none-any.whl", hash = "sha256:4e16665048960a0900c702d4a66415956a584919c03361cac9f1df5c5dd7e813"}, + {file = "urllib3-2.4.0.tar.gz", hash = "sha256:414bc6535b787febd7567804cc015fee39daab8ad86268f1310a9250697de466"}, ] [package.extras] brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)"] -secure = ["certifi", "cryptography (>=1.9)", "idna (>=2.0.0)", "pyopenssl (>=17.1.0)", "urllib3-secure-extra"] +h2 = ["h2 (>=4,<5)"] socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"] zstd = ["zstandard (>=0.18.0)"] [[package]] name = "yarl" -version = "1.9.2" +version = "1.19.0" description = "Yet another URL library" optional = false -python-versions = ">=3.7" +python-versions = ">=3.9" +groups = ["main"] files = [ - {file = "yarl-1.9.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:8c2ad583743d16ddbdf6bb14b5cd76bf43b0d0006e918809d5d4ddf7bde8dd82"}, - {file = "yarl-1.9.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:82aa6264b36c50acfb2424ad5ca537a2060ab6de158a5bd2a72a032cc75b9eb8"}, - {file = "yarl-1.9.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c0c77533b5ed4bcc38e943178ccae29b9bcf48ffd1063f5821192f23a1bd27b9"}, - {file = "yarl-1.9.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ee4afac41415d52d53a9833ebae7e32b344be72835bbb589018c9e938045a560"}, - {file = "yarl-1.9.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9bf345c3a4f5ba7f766430f97f9cc1320786f19584acc7086491f45524a551ac"}, - {file = "yarl-1.9.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2a96c19c52ff442a808c105901d0bdfd2e28575b3d5f82e2f5fd67e20dc5f4ea"}, - {file = "yarl-1.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:891c0e3ec5ec881541f6c5113d8df0315ce5440e244a716b95f2525b7b9f3608"}, - {file = "yarl-1.9.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c3a53ba34a636a256d767c086ceb111358876e1fb6b50dfc4d3f4951d40133d5"}, - {file = "yarl-1.9.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:566185e8ebc0898b11f8026447eacd02e46226716229cea8db37496c8cdd26e0"}, - {file = "yarl-1.9.2-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:2b0738fb871812722a0ac2154be1f049c6223b9f6f22eec352996b69775b36d4"}, - {file = "yarl-1.9.2-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:32f1d071b3f362c80f1a7d322bfd7b2d11e33d2adf395cc1dd4df36c9c243095"}, - {file = "yarl-1.9.2-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:e9fdc7ac0d42bc3ea78818557fab03af6181e076a2944f43c38684b4b6bed8e3"}, - {file = "yarl-1.9.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:56ff08ab5df8429901ebdc5d15941b59f6253393cb5da07b4170beefcf1b2528"}, - {file = "yarl-1.9.2-cp310-cp310-win32.whl", hash = "sha256:8ea48e0a2f931064469bdabca50c2f578b565fc446f302a79ba6cc0ee7f384d3"}, - {file = "yarl-1.9.2-cp310-cp310-win_amd64.whl", hash = "sha256:50f33040f3836e912ed16d212f6cc1efb3231a8a60526a407aeb66c1c1956dde"}, - {file = "yarl-1.9.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:646d663eb2232d7909e6601f1a9107e66f9791f290a1b3dc7057818fe44fc2b6"}, - {file = "yarl-1.9.2-cp311-cp311-macosx_10_9_x86_64.whl", 
hash = "sha256:aff634b15beff8902d1f918012fc2a42e0dbae6f469fce134c8a0dc51ca423bb"}, - {file = "yarl-1.9.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a83503934c6273806aed765035716216cc9ab4e0364f7f066227e1aaea90b8d0"}, - {file = "yarl-1.9.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b25322201585c69abc7b0e89e72790469f7dad90d26754717f3310bfe30331c2"}, - {file = "yarl-1.9.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:22a94666751778629f1ec4280b08eb11815783c63f52092a5953faf73be24191"}, - {file = "yarl-1.9.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8ec53a0ea2a80c5cd1ab397925f94bff59222aa3cf9c6da938ce05c9ec20428d"}, - {file = "yarl-1.9.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:159d81f22d7a43e6eabc36d7194cb53f2f15f498dbbfa8edc8a3239350f59fe7"}, - {file = "yarl-1.9.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:832b7e711027c114d79dffb92576acd1bd2decc467dec60e1cac96912602d0e6"}, - {file = "yarl-1.9.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:95d2ecefbcf4e744ea952d073c6922e72ee650ffc79028eb1e320e732898d7e8"}, - {file = "yarl-1.9.2-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:d4e2c6d555e77b37288eaf45b8f60f0737c9efa3452c6c44626a5455aeb250b9"}, - {file = "yarl-1.9.2-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:783185c75c12a017cc345015ea359cc801c3b29a2966c2655cd12b233bf5a2be"}, - {file = "yarl-1.9.2-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:b8cc1863402472f16c600e3e93d542b7e7542a540f95c30afd472e8e549fc3f7"}, - {file = "yarl-1.9.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:822b30a0f22e588b32d3120f6d41e4ed021806418b4c9f0bc3048b8c8cb3f92a"}, - {file = "yarl-1.9.2-cp311-cp311-win32.whl", hash = "sha256:a60347f234c2212a9f0361955007fcf4033a75bf600a33c88a0a8e91af77c0e8"}, - {file = "yarl-1.9.2-cp311-cp311-win_amd64.whl", hash = "sha256:be6b3fdec5c62f2a67cb3f8c6dbf56bbf3f61c0f046f84645cd1ca73532ea051"}, - {file = "yarl-1.9.2-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:38a3928ae37558bc1b559f67410df446d1fbfa87318b124bf5032c31e3447b74"}, - {file = "yarl-1.9.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ac9bb4c5ce3975aeac288cfcb5061ce60e0d14d92209e780c93954076c7c4367"}, - {file = "yarl-1.9.2-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3da8a678ca8b96c8606bbb8bfacd99a12ad5dd288bc6f7979baddd62f71c63ef"}, - {file = "yarl-1.9.2-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:13414591ff516e04fcdee8dc051c13fd3db13b673c7a4cb1350e6b2ad9639ad3"}, - {file = "yarl-1.9.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bf74d08542c3a9ea97bb8f343d4fcbd4d8f91bba5ec9d5d7f792dbe727f88938"}, - {file = "yarl-1.9.2-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6e7221580dc1db478464cfeef9b03b95c5852cc22894e418562997df0d074ccc"}, - {file = "yarl-1.9.2-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:494053246b119b041960ddcd20fd76224149cfea8ed8777b687358727911dd33"}, - {file = "yarl-1.9.2-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:52a25809fcbecfc63ac9ba0c0fb586f90837f5425edfd1ec9f3372b119585e45"}, - {file = "yarl-1.9.2-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:e65610c5792870d45d7b68c677681376fcf9cc1c289f23e8e8b39c1485384185"}, - {file = "yarl-1.9.2-cp37-cp37m-musllinux_1_1_s390x.whl", 
hash = "sha256:1b1bba902cba32cdec51fca038fd53f8beee88b77efc373968d1ed021024cc04"}, - {file = "yarl-1.9.2-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:662e6016409828ee910f5d9602a2729a8a57d74b163c89a837de3fea050c7582"}, - {file = "yarl-1.9.2-cp37-cp37m-win32.whl", hash = "sha256:f364d3480bffd3aa566e886587eaca7c8c04d74f6e8933f3f2c996b7f09bee1b"}, - {file = "yarl-1.9.2-cp37-cp37m-win_amd64.whl", hash = "sha256:6a5883464143ab3ae9ba68daae8e7c5c95b969462bbe42e2464d60e7e2698368"}, - {file = "yarl-1.9.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:5610f80cf43b6202e2c33ba3ec2ee0a2884f8f423c8f4f62906731d876ef4fac"}, - {file = "yarl-1.9.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:b9a4e67ad7b646cd6f0938c7ebfd60e481b7410f574c560e455e938d2da8e0f4"}, - {file = "yarl-1.9.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:83fcc480d7549ccebe9415d96d9263e2d4226798c37ebd18c930fce43dfb9574"}, - {file = "yarl-1.9.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5fcd436ea16fee7d4207c045b1e340020e58a2597301cfbcfdbe5abd2356c2fb"}, - {file = "yarl-1.9.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:84e0b1599334b1e1478db01b756e55937d4614f8654311eb26012091be109d59"}, - {file = "yarl-1.9.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3458a24e4ea3fd8930e934c129b676c27452e4ebda80fbe47b56d8c6c7a63a9e"}, - {file = "yarl-1.9.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:838162460b3a08987546e881a2bfa573960bb559dfa739e7800ceeec92e64417"}, - {file = "yarl-1.9.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f4e2d08f07a3d7d3e12549052eb5ad3eab1c349c53ac51c209a0e5991bbada78"}, - {file = "yarl-1.9.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:de119f56f3c5f0e2fb4dee508531a32b069a5f2c6e827b272d1e0ff5ac040333"}, - {file = "yarl-1.9.2-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:149ddea5abf329752ea5051b61bd6c1d979e13fbf122d3a1f9f0c8be6cb6f63c"}, - {file = "yarl-1.9.2-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:674ca19cbee4a82c9f54e0d1eee28116e63bc6fd1e96c43031d11cbab8b2afd5"}, - {file = "yarl-1.9.2-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:9b3152f2f5677b997ae6c804b73da05a39daa6a9e85a512e0e6823d81cdad7cc"}, - {file = "yarl-1.9.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:5415d5a4b080dc9612b1b63cba008db84e908b95848369aa1da3686ae27b6d2b"}, - {file = "yarl-1.9.2-cp38-cp38-win32.whl", hash = "sha256:f7a3d8146575e08c29ed1cd287068e6d02f1c7bdff8970db96683b9591b86ee7"}, - {file = "yarl-1.9.2-cp38-cp38-win_amd64.whl", hash = "sha256:63c48f6cef34e6319a74c727376e95626f84ea091f92c0250a98e53e62c77c72"}, - {file = "yarl-1.9.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:75df5ef94c3fdc393c6b19d80e6ef1ecc9ae2f4263c09cacb178d871c02a5ba9"}, - {file = "yarl-1.9.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:c027a6e96ef77d401d8d5a5c8d6bc478e8042f1e448272e8d9752cb0aff8b5c8"}, - {file = "yarl-1.9.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:f3b078dbe227f79be488ffcfc7a9edb3409d018e0952cf13f15fd6512847f3f7"}, - {file = "yarl-1.9.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:59723a029760079b7d991a401386390c4be5bfec1e7dd83e25a6a0881859e716"}, - {file = "yarl-1.9.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b03917871bf859a81ccb180c9a2e6c1e04d2f6a51d953e6a5cdd70c93d4e5a2a"}, - {file = 
"yarl-1.9.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c1012fa63eb6c032f3ce5d2171c267992ae0c00b9e164efe4d73db818465fac3"}, - {file = "yarl-1.9.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a74dcbfe780e62f4b5a062714576f16c2f3493a0394e555ab141bf0d746bb955"}, - {file = "yarl-1.9.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8c56986609b057b4839968ba901944af91b8e92f1725d1a2d77cbac6972b9ed1"}, - {file = "yarl-1.9.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:2c315df3293cd521033533d242d15eab26583360b58f7ee5d9565f15fee1bef4"}, - {file = "yarl-1.9.2-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:b7232f8dfbd225d57340e441d8caf8652a6acd06b389ea2d3222b8bc89cbfca6"}, - {file = "yarl-1.9.2-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:53338749febd28935d55b41bf0bcc79d634881195a39f6b2f767870b72514caf"}, - {file = "yarl-1.9.2-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:066c163aec9d3d073dc9ffe5dd3ad05069bcb03fcaab8d221290ba99f9f69ee3"}, - {file = "yarl-1.9.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:8288d7cd28f8119b07dd49b7230d6b4562f9b61ee9a4ab02221060d21136be80"}, - {file = "yarl-1.9.2-cp39-cp39-win32.whl", hash = "sha256:b124e2a6d223b65ba8768d5706d103280914d61f5cae3afbc50fc3dfcc016623"}, - {file = "yarl-1.9.2-cp39-cp39-win_amd64.whl", hash = "sha256:61016e7d582bc46a5378ffdd02cd0314fb8ba52f40f9cf4d9a5e7dbef88dee18"}, - {file = "yarl-1.9.2.tar.gz", hash = "sha256:04ab9d4b9f587c06d801c2abfe9317b77cdf996c65a90d5e84ecc45010823571"}, + {file = "yarl-1.19.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:0bae32f8ebd35c04d6528cedb4a26b8bf25339d3616b04613b97347f919b76d3"}, + {file = "yarl-1.19.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:8015a076daf77823e7ebdcba474156587391dab4e70c732822960368c01251e6"}, + {file = "yarl-1.19.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9973ac95327f5d699eb620286c39365990b240031672b5c436a4cd00539596c5"}, + {file = "yarl-1.19.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fd4b5fbd7b9dde785cfeb486b8cca211a0b138d4f3a7da27db89a25b3c482e5c"}, + {file = "yarl-1.19.0-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:75460740005de5a912b19f657848aef419387426a40f581b1dc9fac0eb9addb5"}, + {file = "yarl-1.19.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:57abd66ca913f2cfbb51eb3dbbbac3648f1f6983f614a4446e0802e241441d2a"}, + {file = "yarl-1.19.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:46ade37911b7c99ce28a959147cb28bffbd14cea9e7dd91021e06a8d2359a5aa"}, + {file = "yarl-1.19.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8346ec72ada749a6b5d82bff7be72578eab056ad7ec38c04f668a685abde6af0"}, + {file = "yarl-1.19.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7e4cb14a6ee5b6649ccf1c6d648b4da9220e8277d4d4380593c03cc08d8fe937"}, + {file = "yarl-1.19.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:66fc1c2926a73a2fb46e4b92e3a6c03904d9bc3a0b65e01cb7d2b84146a8bd3b"}, + {file = "yarl-1.19.0-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:5a70201dd1e0a4304849b6445a9891d7210604c27e67da59091d5412bc19e51c"}, + {file = "yarl-1.19.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:e4807aab1bdeab6ae6f296be46337a260ae4b1f3a8c2fcd373e236b4b2b46efd"}, + {file = 
"yarl-1.19.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:ae584afe81a1de4c1bb06672481050f0d001cad13163e3c019477409f638f9b7"}, + {file = "yarl-1.19.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:30eaf4459df6e91f21b2999d1ee18f891bcd51e3cbe1de301b4858c84385895b"}, + {file = "yarl-1.19.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:0e617d45d03c8dec0dfce6f51f3e1b8a31aa81aaf4a4d1442fdb232bcf0c6d8c"}, + {file = "yarl-1.19.0-cp310-cp310-win32.whl", hash = "sha256:32ba32d0fa23893fd8ea8d05bdb05de6eb19d7f2106787024fd969f4ba5466cb"}, + {file = "yarl-1.19.0-cp310-cp310-win_amd64.whl", hash = "sha256:545575ecfcd465891b51546c2bcafdde0acd2c62c2097d8d71902050b20e4922"}, + {file = "yarl-1.19.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:163ff326680de5f6d4966954cf9e3fe1bf980f5fee2255e46e89b8cf0f3418b5"}, + {file = "yarl-1.19.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a626c4d9cca298d1be8625cff4b17004a9066330ac82d132bbda64a4c17c18d3"}, + {file = "yarl-1.19.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:961c3e401ea7f13d02b8bb7cb0c709152a632a6e14cdc8119e9c6ee5596cd45d"}, + {file = "yarl-1.19.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a39d7b807ab58e633ed760f80195cbd145b58ba265436af35f9080f1810dfe64"}, + {file = "yarl-1.19.0-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:c4228978fb59c6b10f60124ba8e311c26151e176df364e996f3f8ff8b93971b5"}, + {file = "yarl-1.19.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9ba536b17ecf3c74a94239ec1137a3ad3caea8c0e4deb8c8d2ffe847d870a8c5"}, + {file = "yarl-1.19.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a251e00e445d2e9df7b827c9843c0b87f58a3254aaa3f162fb610747491fe00f"}, + {file = "yarl-1.19.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f9b92431d8b4d4ca5ccbfdbac95b05a3a6cd70cd73aa62f32f9627acfde7549c"}, + {file = "yarl-1.19.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ec2f56edaf476f70b5831bbd59700b53d9dd011b1f77cd4846b5ab5c5eafdb3f"}, + {file = "yarl-1.19.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:acf9b92c4245ac8b59bc7ec66a38d3dcb8d1f97fac934672529562bb824ecadb"}, + {file = "yarl-1.19.0-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:57711f1465c06fee8825b95c0b83e82991e6d9425f9a042c3c19070a70ac92bf"}, + {file = "yarl-1.19.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:528e86f5b1de0ad8dd758ddef4e0ed24f5d946d4a1cef80ffb2d4fca4e10f122"}, + {file = "yarl-1.19.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:3b77173663e075d9e5a57e09d711e9da2f3266be729ecca0b8ae78190990d260"}, + {file = "yarl-1.19.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:d8717924cf0a825b62b1a96fc7d28aab7f55a81bf5338b8ef41d7a76ab9223e9"}, + {file = "yarl-1.19.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:0df9f0221a78d858793f40cbea3915c29f969c11366646a92ca47e080a14f881"}, + {file = "yarl-1.19.0-cp311-cp311-win32.whl", hash = "sha256:8b3ade62678ee2c7c10dcd6be19045135e9badad53108f7d2ed14896ee396045"}, + {file = "yarl-1.19.0-cp311-cp311-win_amd64.whl", hash = "sha256:0626ee31edb23ac36bdffe607231de2cca055ad3a5e2dc5da587ef8bc6a321bc"}, + {file = "yarl-1.19.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:7b687c334da3ff8eab848c9620c47a253d005e78335e9ce0d6868ed7e8fd170b"}, + {file = "yarl-1.19.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = 
"sha256:b0fe766febcf523a2930b819c87bb92407ae1368662c1bc267234e79b20ff894"}, + {file = "yarl-1.19.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:742ceffd3c7beeb2b20d47cdb92c513eef83c9ef88c46829f88d5b06be6734ee"}, + {file = "yarl-1.19.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2af682a1e97437382ee0791eacbf540318bd487a942e068e7e0a6c571fadbbd3"}, + {file = "yarl-1.19.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:63702f1a098d0eaaea755e9c9d63172be1acb9e2d4aeb28b187092bcc9ca2d17"}, + {file = "yarl-1.19.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3560dcba3c71ae7382975dc1e912ee76e50b4cd7c34b454ed620d55464f11876"}, + {file = "yarl-1.19.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:68972df6a0cc47c8abaf77525a76ee5c5f6ea9bbdb79b9565b3234ded3c5e675"}, + {file = "yarl-1.19.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5684e7ff93ea74e47542232bd132f608df4d449f8968fde6b05aaf9e08a140f9"}, + {file = "yarl-1.19.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8182ad422bfacdebd4759ce3adc6055c0c79d4740aea1104e05652a81cd868c6"}, + {file = "yarl-1.19.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:aee5b90a5a9b71ac57400a7bdd0feaa27c51e8f961decc8d412e720a004a1791"}, + {file = "yarl-1.19.0-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:8c0b2371858d5a814b08542d5d548adb03ff2d7ab32f23160e54e92250961a72"}, + {file = "yarl-1.19.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:cd430c2b7df4ae92498da09e9b12cad5bdbb140d22d138f9e507de1aa3edfea3"}, + {file = "yarl-1.19.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:a93208282c0ccdf73065fd76c6c129bd428dba5ff65d338ae7d2ab27169861a0"}, + {file = "yarl-1.19.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:b8179280cdeb4c36eb18d6534a328f9d40da60d2b96ac4a295c5f93e2799e9d9"}, + {file = "yarl-1.19.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:eda3c2b42dc0c389b7cfda2c4df81c12eeb552019e0de28bde8f913fc3d1fcf3"}, + {file = "yarl-1.19.0-cp312-cp312-win32.whl", hash = "sha256:57f3fed859af367b9ca316ecc05ce79ce327d6466342734305aa5cc380e4d8be"}, + {file = "yarl-1.19.0-cp312-cp312-win_amd64.whl", hash = "sha256:5507c1f7dd3d41251b67eecba331c8b2157cfd324849879bebf74676ce76aff7"}, + {file = "yarl-1.19.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:59281b9ed27bc410e0793833bcbe7fc149739d56ffa071d1e0fe70536a4f7b61"}, + {file = "yarl-1.19.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:d27a6482ad5e05e8bafd47bf42866f8a1c0c3345abcb48d4511b3c29ecc197dc"}, + {file = "yarl-1.19.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:7a8e19fd5a6fdf19a91f2409665c7a089ffe7b9b5394ab33c0eec04cbecdd01f"}, + {file = "yarl-1.19.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cda34ab19099c3a1685ad48fe45172536610c312b993310b5f1ca3eb83453b36"}, + {file = "yarl-1.19.0-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:7908a25d33f94852b479910f9cae6cdb9e2a509894e8d5f416c8342c0253c397"}, + {file = "yarl-1.19.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e66c14d162bac94973e767b24de5d7e6c5153f7305a64ff4fcba701210bcd638"}, + {file = "yarl-1.19.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c03607bf932aa4cfae371e2dc9ca8b76faf031f106dac6a6ff1458418140c165"}, + {file = 
"yarl-1.19.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9931343d1c1f4e77421687b6b94bbebd8a15a64ab8279adf6fbb047eff47e536"}, + {file = "yarl-1.19.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:262087a8a0d73e1d169d45c2baf968126f93c97cf403e1af23a7d5455d52721f"}, + {file = "yarl-1.19.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:70f384921c24e703d249a6ccdabeb57dd6312b568b504c69e428a8dd3e8e68ca"}, + {file = "yarl-1.19.0-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:756b9ea5292a2c180d1fe782a377bc4159b3cfefaca7e41b5b0a00328ef62fa9"}, + {file = "yarl-1.19.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:cbeb9c145d534c240a63b6ecc8a8dd451faeb67b3dc61d729ec197bb93e29497"}, + {file = "yarl-1.19.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:087ae8f8319848c18e0d114d0f56131a9c017f29200ab1413b0137ad7c83e2ae"}, + {file = "yarl-1.19.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:362f5480ba527b6c26ff58cff1f229afe8b7fdd54ee5ffac2ab827c1a75fc71c"}, + {file = "yarl-1.19.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:f408d4b4315e814e5c3668094e33d885f13c7809cbe831cbdc5b1bb8c7a448f4"}, + {file = "yarl-1.19.0-cp313-cp313-win32.whl", hash = "sha256:24e4c367ad69988a2283dd45ea88172561ca24b2326b9781e164eb46eea68345"}, + {file = "yarl-1.19.0-cp313-cp313-win_amd64.whl", hash = "sha256:0110f91c57ab43d1538dfa92d61c45e33b84df9257bd08fcfcda90cce931cbc9"}, + {file = "yarl-1.19.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:85ac908cd5a97bbd3048cca9f1bf37b932ea26c3885099444f34b0bf5d5e9fa6"}, + {file = "yarl-1.19.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6ba0931b559f1345df48a78521c31cfe356585670e8be22af84a33a39f7b9221"}, + {file = "yarl-1.19.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:5bc503e1c1fee1b86bcb58db67c032957a52cae39fe8ddd95441f414ffbab83e"}, + {file = "yarl-1.19.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d995122dcaf180fd4830a9aa425abddab7c0246107c21ecca2fa085611fa7ce9"}, + {file = "yarl-1.19.0-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:217f69e60a14da4eed454a030ea8283f8fbd01a7d6d81e57efb865856822489b"}, + {file = "yarl-1.19.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:aad67c8f13a4b79990082f72ef09c078a77de2b39899aabf3960a48069704973"}, + {file = "yarl-1.19.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dff065a1a8ed051d7e641369ba1ad030d5a707afac54cf4ede7069b959898835"}, + {file = "yarl-1.19.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ada882e26b16ee651ab6544ce956f2f4beaed38261238f67c2a96db748e17741"}, + {file = "yarl-1.19.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:67a56b1acc7093451ea2de0687aa3bd4e58d6b4ef6cbeeaad137b45203deaade"}, + {file = "yarl-1.19.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:e97d2f0a06b39e231e59ebab0e6eec45c7683b339e8262299ac952707bdf7688"}, + {file = "yarl-1.19.0-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:a5288adb7c59d0f54e4ad58d86fb06d4b26e08a59ed06d00a1aac978c0e32884"}, + {file = "yarl-1.19.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:1efbf4d03e6eddf5da27752e0b67a8e70599053436e9344d0969532baa99df53"}, + {file = "yarl-1.19.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:f228f42f29cc87db67020f7d71624102b2c837686e55317b16e1d3ef2747a993"}, + {file = 
"yarl-1.19.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:c515f7dd60ca724e4c62b34aeaa603188964abed2eb66bb8e220f7f104d5a187"}, + {file = "yarl-1.19.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:4815ec6d3d68a96557fa71bd36661b45ac773fb50e5cfa31a7e843edb098f060"}, + {file = "yarl-1.19.0-cp39-cp39-win32.whl", hash = "sha256:9fac2dd1c5ecb921359d9546bc23a6dcc18c6acd50c6d96f118188d68010f497"}, + {file = "yarl-1.19.0-cp39-cp39-win_amd64.whl", hash = "sha256:5864f539ce86b935053bfa18205fa08ce38e9a40ea4d51b19ce923345f0ed5db"}, + {file = "yarl-1.19.0-py3-none-any.whl", hash = "sha256:a727101eb27f66727576630d02985d8a065d09cd0b5fcbe38a5793f71b2a97ef"}, + {file = "yarl-1.19.0.tar.gz", hash = "sha256:01e02bb80ae0dbed44273c304095295106e1d9470460e773268a27d11e594892"}, ] [package.dependencies] idna = ">=2.0" multidict = ">=4.0" -typing-extensions = {version = ">=3.7.4", markers = "python_version < \"3.8\""} - -[[package]] -name = "zipp" -version = "3.15.0" -description = "Backport of pathlib-compatible object wrapper for zip files" -optional = false -python-versions = ">=3.7" -files = [ - {file = "zipp-3.15.0-py3-none-any.whl", hash = "sha256:48904fc76a60e542af151aded95726c1a5c34ed43ab4134b597665c86d7ad556"}, - {file = "zipp-3.15.0.tar.gz", hash = "sha256:112929ad649da941c23de50f356a2b5570c954b65150642bccdd66bf194d224b"}, -] - -[package.extras] -docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] -testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more-itertools", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)"] +propcache = ">=0.2.1" [metadata] -lock-version = "2.0" -python-versions = "^3.7" -content-hash = "b7fab8703967f2616ea59a98a437cd30f97f0c8d2a06e399d688814a2a2c64f8" +lock-version = "2.1" +python-versions = "^3.9" +content-hash = "f136e898d37b7c7db1ccceb1822ade280d3542ca19cdd9dcf583cb9aefef11c6" diff --git a/clients/python/pyproject.toml b/clients/python/pyproject.toml index 47ef9d717..1448d7618 100644 --- a/clients/python/pyproject.toml +++ b/clients/python/pyproject.toml @@ -11,15 +11,15 @@ repository = "https://github.com/huggingface/text-generation-inference" [tool.poetry.dependencies] -python = "^3.7" +python = "^3.9" pydantic = "> 2, < 3" -aiohttp = "^3.8" +aiohttp = "^3.11" huggingface-hub = ">= 0.12, < 1.0" -[tool.poetry.dev-dependencies] -pytest = "^6.2.5" -pytest-asyncio = "^0.17.2" -pytest-cov = "^3.0.0" +[tool.poetry.group.dev.dependencies] +pytest = "^8" +pytest-asyncio = "^0.26" +pytest-cov = "^6.0.0" [tool.pytest.ini_options] asyncio_mode = "auto" From 73e797528df1ceadabf687f1f15882f759f12a62 Mon Sep 17 00:00:00 2001 From: Mohit Sharma Date: Mon, 14 Apr 2025 22:13:53 +0530 Subject: [PATCH 177/183] L4 fixes (#3161) add fix --- router/src/config.rs | 9 ++++++--- router/src/lib.rs | 2 +- router/src/validation.rs | 6 +++++- server/text_generation_server/models/__init__.py | 1 - 4 files changed, 12 insertions(+), 6 deletions(-) diff --git a/router/src/config.rs b/router/src/config.rs index fcda26122..93b6f4fa4 100644 --- a/router/src/config.rs +++ b/router/src/config.rs @@ -229,10 +229,13 @@ impl Llama4 { pub fn pixel_shuffle_ratio(&self) -> f64 { self.vision_config.pixel_shuffle_ratio } - pub fn get_aspect_ratios(&self, height: usize, width: usize) -> (usize, usize) { + pub fn get_aspect_ratios( + &self, + height: usize, + width: usize, + max_chunks: usize, + ) -> (usize, usize) { 
let patch_size = self.vision_config.image_size; - // How to avoid hardcoding this? - let max_chunks = 15; let supported = find_supported_resolutions(max_chunks, patch_size); let (target_h, target_w) = get_best_fit(height, width, &supported, false); (target_h / patch_size, target_w / patch_size) diff --git a/router/src/lib.rs b/router/src/lib.rs index 50adb5cf6..3c1a01b3c 100644 --- a/router/src/lib.rs +++ b/router/src/lib.rs @@ -204,7 +204,7 @@ pub struct Gemma3Processor { #[derive(Clone, Debug, Serialize, Deserialize)] pub struct Llama4Processor { #[serde(default)] - do_image_splitting: bool, + max_patches: usize, } #[derive(Debug, Clone, Deserialize, Default)] diff --git a/router/src/validation.rs b/router/src/validation.rs index dfe9dd4d2..b29391b77 100644 --- a/router/src/validation.rs +++ b/router/src/validation.rs @@ -698,10 +698,14 @@ fn image_tokens( let image_height = config.image_size(); let patch_size = config.patch_size(); let pixel_shuffle_ratio = config.pixel_shuffle_ratio(); + let max_patches = match preprocessor_config { + Some(HubPreprocessorConfig::Llama4Processor(cfg)) => cfg.max_patches, + _ => panic!("Expected Llama4Processor in preprocessor_config"), + }; let downsample_ratio = (1.0 / (pixel_shuffle_ratio * pixel_shuffle_ratio)).round() as usize; - let (ratio_h, ratio_w) = config.get_aspect_ratios(height, width); + let (ratio_h, ratio_w) = config.get_aspect_ratios(height, width, max_patches); let image_width = image_height; // Assuming pixel shape: [H][W][C] let num_patches_per_chunk = diff --git a/server/text_generation_server/models/__init__.py b/server/text_generation_server/models/__init__.py index 84437bf32..291ee5fba 100644 --- a/server/text_generation_server/models/__init__.py +++ b/server/text_generation_server/models/__init__.py @@ -1041,7 +1041,6 @@ def get_model( trust_remote_code=trust_remote_code, processor_kwargs={ "use_fast": True, - "size": {"height": 336, "width": 336}, }, ) elif model_type == BAICHUAN: From 449cee49ca6eda19ff8fdb30c581a5f27de5c90b Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Tue, 15 Apr 2025 10:09:37 +0200 Subject: [PATCH 178/183] setuptools <= 70.0 is vulnerable: CVE-2024-6345 (#3171) --- backends/neuron/server/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/neuron/server/pyproject.toml b/backends/neuron/server/pyproject.toml index ed790fdb8..6bf4e5eee 100644 --- a/backends/neuron/server/pyproject.toml +++ b/backends/neuron/server/pyproject.toml @@ -1,5 +1,5 @@ [build-system] -requires = ["setuptools>=61.0"] +requires = ["setuptools>=78.1"] build-backend = "setuptools.build_meta" [project] From 459fbdebe3bfc055bb71ad0913a498e93f30ebe1 Mon Sep 17 00:00:00 2001 From: "Wang, Yi" Date: Tue, 15 Apr 2025 17:08:01 +0800 Subject: [PATCH 179/183] transformers flash llm/vlm enabling in ipex (#3152) * transformers flash llm/vlm enabling in xpu Signed-off-by: Wang, Yi A * ipex cpu could also support in function Signed-off-by: Wang, Yi A --------- Signed-off-by: Wang, Yi A --- Dockerfile_intel | 6 ++---- server/text_generation_server/models/__init__.py | 3 ++- .../models/transformers_flash_causal_lm.py | 9 ++++++--- .../models/transformers_flash_vlm.py | 8 ++++++-- server/text_generation_server/utils/dist.py | 7 +++++++ 5 files changed, 23 insertions(+), 10 deletions(-) diff --git a/Dockerfile_intel b/Dockerfile_intel index 5bf7632ce..b2a905ec9 100644 --- a/Dockerfile_intel +++ b/Dockerfile_intel @@ -87,7 +87,7 @@ RUN echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https:/ RUN 
mv /tmp/intel-for-pytorch-gpu-dev.list /etc/apt/sources.list.d -RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt install -y xpu-smi cmake ninja-build pciutils intel-ocloc +RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt install -y xpu-smi cmake ninja-build pciutils intel-ocloc libnl-genl-3-200 # Text Generation Inference base env ENV HF_HOME=/data \ @@ -98,9 +98,7 @@ ENV HF_HOME=/data \ WORKDIR /usr/src -RUN pip install torch==2.6.0 --index-url https://download.pytorch.org/whl/test/xpu - -RUN pip install triton-xpu==3.2.0b1 --no-cache-dir +RUN pip install torch==2.6.0 torchvision==0.21.0 --index-url https://download.pytorch.org/whl/xpu # Install server COPY proto proto diff --git a/server/text_generation_server/models/__init__.py b/server/text_generation_server/models/__init__.py index 291ee5fba..93a2f8bf4 100644 --- a/server/text_generation_server/models/__init__.py +++ b/server/text_generation_server/models/__init__.py @@ -201,7 +201,8 @@ except ImportError as e: if MAMBA_AVAILABLE: __all__.append(Mamba) -FLASH_TRANSFORMERS_BACKEND = torch.cuda.is_available() +FLASH_TRANSFORMERS_BACKEND = torch.cuda.is_available() or SYSTEM == "ipex" + try: from text_generation_server.models.transformers_flash_causal_lm import ( TransformersFlashCausalLM, diff --git a/server/text_generation_server/models/transformers_flash_causal_lm.py b/server/text_generation_server/models/transformers_flash_causal_lm.py index 77659dd0f..8c21ed58c 100644 --- a/server/text_generation_server/models/transformers_flash_causal_lm.py +++ b/server/text_generation_server/models/transformers_flash_causal_lm.py @@ -12,7 +12,7 @@ from text_generation_server.utils import initialize_torch_distributed from text_generation_server.layers.attention import paged_attention, attention, Seqlen from text_generation_server.layers.attention.kv_cache import KVScales, KVCache from text_generation_server.models.globals import ATTENTION - +from text_generation_server.utils.import_utils import SYSTEM tracer = trace.get_tracer(__name__) @@ -115,8 +115,11 @@ class TransformersFlashCausalLM(FlashCausalLM): if torch.cuda.is_available(): device = torch.device(f"cuda:{rank}") dtype = default_dtype if dtype is None else dtype - elif hasattr(torch, "xpu") and torch.xpu.is_available(): - device = torch.device("xpu") + elif SYSTEM == "ipex": + if hasattr(torch, "xpu") and torch.xpu.is_available(): + device = torch.device(f"xpu:{rank}") + else: + device = torch.device("cpu") dtype = default_dtype if dtype is None else dtype else: raise ValueError( diff --git a/server/text_generation_server/models/transformers_flash_vlm.py b/server/text_generation_server/models/transformers_flash_vlm.py index a7beb68b3..280fa0bd3 100644 --- a/server/text_generation_server/models/transformers_flash_vlm.py +++ b/server/text_generation_server/models/transformers_flash_vlm.py @@ -14,6 +14,7 @@ from text_generation_server.layers.attention import paged_attention, attention, from text_generation_server.layers.attention.kv_cache import KVScales, KVCache from text_generation_server.models.globals import ATTENTION import torch.nn.functional as F +from text_generation_server.utils.import_utils import SYSTEM tracer = trace.get_tracer(__name__) @@ -174,8 +175,11 @@ class TransformersFlashVlmCausalLM(VlmCausalLM): if torch.cuda.is_available(): device = torch.device(f"cuda:{rank}") dtype = default_dtype if dtype is None else dtype - elif hasattr(torch, "xpu") and torch.xpu.is_available(): - device = torch.device("xpu") + elif SYSTEM == "ipex": + if hasattr(torch, "xpu") and 
torch.xpu.is_available(): + device = torch.device(f"xpu:{rank}") + else: + device = torch.device("cpu") dtype = default_dtype if dtype is None else dtype else: raise ValueError( diff --git a/server/text_generation_server/utils/dist.py b/server/text_generation_server/utils/dist.py index 613c4784b..4a1bef6df 100644 --- a/server/text_generation_server/utils/dist.py +++ b/server/text_generation_server/utils/dist.py @@ -73,6 +73,13 @@ def initialize_torch_distributed(): if SYSTEM == "ipex": import intel_extension_for_pytorch as ipex + if torch.xpu.is_available(): + assert ( + WORLD_SIZE <= torch.xpu.device_count() + ), "Each process is one xpu" + device = RANK % torch.xpu.device_count() + torch.xpu.set_device(device) + ipex.distributed.init_process_group( backend="ccl", world_size=WORLD_SIZE, From 16b4b7974afd630f9e3798b1e214c7cc2d11be22 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Tue, 15 Apr 2025 11:49:06 +0200 Subject: [PATCH 180/183] Upgrading the dependencies in Gaudi backend. (#3170) * Upgrading the dependencies in Gaudi backend. * Upgrading transformers version. --- backends/gaudi/server/poetry.lock | 3883 +++++++++--------------- backends/gaudi/server/pyproject.toml | 35 +- backends/gaudi/server/requirements.txt | 197 +- 3 files changed, 1555 insertions(+), 2560 deletions(-) diff --git a/backends/gaudi/server/poetry.lock b/backends/gaudi/server/poetry.lock index 1987c3b31..b9b2e1388 100644 --- a/backends/gaudi/server/poetry.lock +++ b/backends/gaudi/server/poetry.lock @@ -1,19 +1,20 @@ -# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.0.0 and should not be changed by hand. [[package]] name = "accelerate" -version = "0.29.3" +version = "0.33.0" description = "Accelerate" -optional = true +optional = false python-versions = ">=3.8.0" +groups = ["main"] files = [ - {file = "accelerate-0.29.3-py3-none-any.whl", hash = "sha256:99d633d4b6126817c5e554487406748be95c8d1d1e659dd2fd60657e35f532dd"}, - {file = "accelerate-0.29.3.tar.gz", hash = "sha256:1a5a845b06b24b41736b219b2b20fd021ca5dff4070a252445fd6de736e347ac"}, + {file = "accelerate-0.33.0-py3-none-any.whl", hash = "sha256:0a7f33d60ba09afabd028d4f0856dd19c5a734b7a596d637d9dd6e3d0eadbaf3"}, + {file = "accelerate-0.33.0.tar.gz", hash = "sha256:11ba481ed6ea09191775df55ce464aeeba67a024bd0261a44b77b30fb439e26a"}, ] [package.dependencies] -huggingface-hub = "*" -numpy = ">=1.17" +huggingface-hub = ">=0.21.0" +numpy = ">=1.17,<2.0.0" packaging = ">=20.0" psutil = "*" pyyaml = "*" @@ -21,151 +22,15 @@ safetensors = ">=0.3.1" torch = ">=1.10.0" [package.extras] -dev = ["bitsandbytes", "black (>=23.1,<24.0)", "datasets", "deepspeed", "evaluate", "hf-doc-builder (>=0.3.0)", "parameterized", "pytest (>=7.2.0,<=8.0.0)", "pytest-subtests", "pytest-xdist", "rich", "ruff (>=0.2.1,<0.3.0)", "scikit-learn", "scipy", "timm", "torchpippy (>=0.2.0)", "tqdm", "transformers"] +deepspeed = ["deepspeed (<=0.14.0)"] +dev = ["bitsandbytes", "black (>=23.1,<24.0)", "datasets", "diffusers", "evaluate", "hf-doc-builder (>=0.3.0)", "parameterized", "pytest (>=7.2.0,<=8.0.0)", "pytest-subtests", "pytest-xdist", "rich", "ruff (>=0.2.1,<0.3.0)", "scikit-learn", "scipy", "timm", "torchpippy (>=0.2.0)", "tqdm", "transformers"] quality = ["black (>=23.1,<24.0)", "hf-doc-builder (>=0.3.0)", "ruff (>=0.2.1,<0.3.0)"] rich = ["rich"] sagemaker = ["sagemaker"] -test-dev = ["bitsandbytes", "datasets", "deepspeed", "evaluate", "scikit-learn", "scipy", "timm", "torchpippy (>=0.2.0)", 
"tqdm", "transformers"] +test-dev = ["bitsandbytes", "datasets", "diffusers", "evaluate", "scikit-learn", "scipy", "timm", "torchpippy (>=0.2.0)", "tqdm", "transformers"] test-prod = ["parameterized", "pytest (>=7.2.0,<=8.0.0)", "pytest-subtests", "pytest-xdist"] test-trackers = ["comet-ml", "dvclive", "tensorboard", "wandb"] -testing = ["bitsandbytes", "datasets", "deepspeed", "evaluate", "parameterized", "pytest (>=7.2.0,<=8.0.0)", "pytest-subtests", "pytest-xdist", "scikit-learn", "scipy", "timm", "torchpippy (>=0.2.0)", "tqdm", "transformers"] - -[[package]] -name = "aiohappyeyeballs" -version = "2.4.3" -description = "Happy Eyeballs for asyncio" -optional = true -python-versions = ">=3.8" -files = [ - {file = "aiohappyeyeballs-2.4.3-py3-none-any.whl", hash = "sha256:8a7a83727b2756f394ab2895ea0765a0a8c475e3c71e98d43d76f22b4b435572"}, - {file = "aiohappyeyeballs-2.4.3.tar.gz", hash = "sha256:75cf88a15106a5002a8eb1dab212525c00d1f4c0fa96e551c9fbe6f09a621586"}, -] - -[[package]] -name = "aiohttp" -version = "3.10.10" -description = "Async http client/server framework (asyncio)" -optional = true -python-versions = ">=3.8" -files = [ - {file = "aiohttp-3.10.10-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:be7443669ae9c016b71f402e43208e13ddf00912f47f623ee5994e12fc7d4b3f"}, - {file = "aiohttp-3.10.10-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:7b06b7843929e41a94ea09eb1ce3927865387e3e23ebe108e0d0d09b08d25be9"}, - {file = "aiohttp-3.10.10-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:333cf6cf8e65f6a1e06e9eb3e643a0c515bb850d470902274239fea02033e9a8"}, - {file = "aiohttp-3.10.10-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:274cfa632350225ce3fdeb318c23b4a10ec25c0e2c880eff951a3842cf358ac1"}, - {file = "aiohttp-3.10.10-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d9e5e4a85bdb56d224f412d9c98ae4cbd032cc4f3161818f692cd81766eee65a"}, - {file = "aiohttp-3.10.10-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2b606353da03edcc71130b52388d25f9a30a126e04caef1fd637e31683033abd"}, - {file = "aiohttp-3.10.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ab5a5a0c7a7991d90446a198689c0535be89bbd6b410a1f9a66688f0880ec026"}, - {file = "aiohttp-3.10.10-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:578a4b875af3e0daaf1ac6fa983d93e0bbfec3ead753b6d6f33d467100cdc67b"}, - {file = "aiohttp-3.10.10-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:8105fd8a890df77b76dd3054cddf01a879fc13e8af576805d667e0fa0224c35d"}, - {file = "aiohttp-3.10.10-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:3bcd391d083f636c06a68715e69467963d1f9600f85ef556ea82e9ef25f043f7"}, - {file = "aiohttp-3.10.10-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:fbc6264158392bad9df19537e872d476f7c57adf718944cc1e4495cbabf38e2a"}, - {file = "aiohttp-3.10.10-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:e48d5021a84d341bcaf95c8460b152cfbad770d28e5fe14a768988c461b821bc"}, - {file = "aiohttp-3.10.10-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:2609e9ab08474702cc67b7702dbb8a80e392c54613ebe80db7e8dbdb79837c68"}, - {file = "aiohttp-3.10.10-cp310-cp310-win32.whl", hash = "sha256:84afcdea18eda514c25bc68b9af2a2b1adea7c08899175a51fe7c4fb6d551257"}, - {file = "aiohttp-3.10.10-cp310-cp310-win_amd64.whl", hash = "sha256:9c72109213eb9d3874f7ac8c0c5fa90e072d678e117d9061c06e30c85b4cf0e6"}, - {file = 
"aiohttp-3.10.10-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:c30a0eafc89d28e7f959281b58198a9fa5e99405f716c0289b7892ca345fe45f"}, - {file = "aiohttp-3.10.10-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:258c5dd01afc10015866114e210fb7365f0d02d9d059c3c3415382ab633fcbcb"}, - {file = "aiohttp-3.10.10-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:15ecd889a709b0080f02721255b3f80bb261c2293d3c748151274dfea93ac871"}, - {file = "aiohttp-3.10.10-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f3935f82f6f4a3820270842e90456ebad3af15810cf65932bd24da4463bc0a4c"}, - {file = "aiohttp-3.10.10-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:413251f6fcf552a33c981c4709a6bba37b12710982fec8e558ae944bfb2abd38"}, - {file = "aiohttp-3.10.10-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d1720b4f14c78a3089562b8875b53e36b51c97c51adc53325a69b79b4b48ebcb"}, - {file = "aiohttp-3.10.10-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:679abe5d3858b33c2cf74faec299fda60ea9de62916e8b67e625d65bf069a3b7"}, - {file = "aiohttp-3.10.10-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:79019094f87c9fb44f8d769e41dbb664d6e8fcfd62f665ccce36762deaa0e911"}, - {file = "aiohttp-3.10.10-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:fe2fb38c2ed905a2582948e2de560675e9dfbee94c6d5ccdb1301c6d0a5bf092"}, - {file = "aiohttp-3.10.10-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:a3f00003de6eba42d6e94fabb4125600d6e484846dbf90ea8e48a800430cc142"}, - {file = "aiohttp-3.10.10-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:1bbb122c557a16fafc10354b9d99ebf2f2808a660d78202f10ba9d50786384b9"}, - {file = "aiohttp-3.10.10-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:30ca7c3b94708a9d7ae76ff281b2f47d8eaf2579cd05971b5dc681db8caac6e1"}, - {file = "aiohttp-3.10.10-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:df9270660711670e68803107d55c2b5949c2e0f2e4896da176e1ecfc068b974a"}, - {file = "aiohttp-3.10.10-cp311-cp311-win32.whl", hash = "sha256:aafc8ee9b742ce75044ae9a4d3e60e3d918d15a4c2e08a6c3c3e38fa59b92d94"}, - {file = "aiohttp-3.10.10-cp311-cp311-win_amd64.whl", hash = "sha256:362f641f9071e5f3ee6f8e7d37d5ed0d95aae656adf4ef578313ee585b585959"}, - {file = "aiohttp-3.10.10-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:9294bbb581f92770e6ed5c19559e1e99255e4ca604a22c5c6397b2f9dd3ee42c"}, - {file = "aiohttp-3.10.10-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:a8fa23fe62c436ccf23ff930149c047f060c7126eae3ccea005f0483f27b2e28"}, - {file = "aiohttp-3.10.10-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:5c6a5b8c7926ba5d8545c7dd22961a107526562da31a7a32fa2456baf040939f"}, - {file = "aiohttp-3.10.10-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:007ec22fbc573e5eb2fb7dec4198ef8f6bf2fe4ce20020798b2eb5d0abda6138"}, - {file = "aiohttp-3.10.10-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9627cc1a10c8c409b5822a92d57a77f383b554463d1884008e051c32ab1b3742"}, - {file = "aiohttp-3.10.10-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:50edbcad60d8f0e3eccc68da67f37268b5144ecc34d59f27a02f9611c1d4eec7"}, - {file = "aiohttp-3.10.10-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a45d85cf20b5e0d0aa5a8dca27cce8eddef3292bc29d72dcad1641f4ed50aa16"}, - {file = 
"aiohttp-3.10.10-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0b00807e2605f16e1e198f33a53ce3c4523114059b0c09c337209ae55e3823a8"}, - {file = "aiohttp-3.10.10-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:f2d4324a98062be0525d16f768a03e0bbb3b9fe301ceee99611dc9a7953124e6"}, - {file = "aiohttp-3.10.10-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:438cd072f75bb6612f2aca29f8bd7cdf6e35e8f160bc312e49fbecab77c99e3a"}, - {file = "aiohttp-3.10.10-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:baa42524a82f75303f714108fea528ccacf0386af429b69fff141ffef1c534f9"}, - {file = "aiohttp-3.10.10-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:a7d8d14fe962153fc681f6366bdec33d4356f98a3e3567782aac1b6e0e40109a"}, - {file = "aiohttp-3.10.10-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:c1277cd707c465cd09572a774559a3cc7c7a28802eb3a2a9472588f062097205"}, - {file = "aiohttp-3.10.10-cp312-cp312-win32.whl", hash = "sha256:59bb3c54aa420521dc4ce3cc2c3fe2ad82adf7b09403fa1f48ae45c0cbde6628"}, - {file = "aiohttp-3.10.10-cp312-cp312-win_amd64.whl", hash = "sha256:0e1b370d8007c4ae31ee6db7f9a2fe801a42b146cec80a86766e7ad5c4a259cf"}, - {file = "aiohttp-3.10.10-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:ad7593bb24b2ab09e65e8a1d385606f0f47c65b5a2ae6c551db67d6653e78c28"}, - {file = "aiohttp-3.10.10-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:1eb89d3d29adaf533588f209768a9c02e44e4baf832b08118749c5fad191781d"}, - {file = "aiohttp-3.10.10-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:3fe407bf93533a6fa82dece0e74dbcaaf5d684e5a51862887f9eaebe6372cd79"}, - {file = "aiohttp-3.10.10-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:50aed5155f819873d23520919e16703fc8925e509abbb1a1491b0087d1cd969e"}, - {file = "aiohttp-3.10.10-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4f05e9727ce409358baa615dbeb9b969db94324a79b5a5cea45d39bdb01d82e6"}, - {file = "aiohttp-3.10.10-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3dffb610a30d643983aeb185ce134f97f290f8935f0abccdd32c77bed9388b42"}, - {file = "aiohttp-3.10.10-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aa6658732517ddabe22c9036479eabce6036655ba87a0224c612e1ae6af2087e"}, - {file = "aiohttp-3.10.10-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:741a46d58677d8c733175d7e5aa618d277cd9d880301a380fd296975a9cdd7bc"}, - {file = "aiohttp-3.10.10-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:e00e3505cd80440f6c98c6d69269dcc2a119f86ad0a9fd70bccc59504bebd68a"}, - {file = "aiohttp-3.10.10-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:ffe595f10566f8276b76dc3a11ae4bb7eba1aac8ddd75811736a15b0d5311414"}, - {file = "aiohttp-3.10.10-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:bdfcf6443637c148c4e1a20c48c566aa694fa5e288d34b20fcdc58507882fed3"}, - {file = "aiohttp-3.10.10-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:d183cf9c797a5291e8301790ed6d053480ed94070637bfaad914dd38b0981f67"}, - {file = "aiohttp-3.10.10-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:77abf6665ae54000b98b3c742bc6ea1d1fb31c394bcabf8b5d2c1ac3ebfe7f3b"}, - {file = "aiohttp-3.10.10-cp313-cp313-win32.whl", hash = "sha256:4470c73c12cd9109db8277287d11f9dd98f77fc54155fc71a7738a83ffcc8ea8"}, - {file = "aiohttp-3.10.10-cp313-cp313-win_amd64.whl", hash = 
"sha256:486f7aabfa292719a2753c016cc3a8f8172965cabb3ea2e7f7436c7f5a22a151"}, - {file = "aiohttp-3.10.10-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:1b66ccafef7336a1e1f0e389901f60c1d920102315a56df85e49552308fc0486"}, - {file = "aiohttp-3.10.10-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:acd48d5b80ee80f9432a165c0ac8cbf9253eaddb6113269a5e18699b33958dbb"}, - {file = "aiohttp-3.10.10-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:3455522392fb15ff549d92fbf4b73b559d5e43dc522588f7eb3e54c3f38beee7"}, - {file = "aiohttp-3.10.10-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:45c3b868724137f713a38376fef8120c166d1eadd50da1855c112fe97954aed8"}, - {file = "aiohttp-3.10.10-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:da1dee8948d2137bb51fbb8a53cce6b1bcc86003c6b42565f008438b806cccd8"}, - {file = "aiohttp-3.10.10-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c5ce2ce7c997e1971b7184ee37deb6ea9922ef5163c6ee5aa3c274b05f9e12fa"}, - {file = "aiohttp-3.10.10-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:28529e08fde6f12eba8677f5a8608500ed33c086f974de68cc65ab218713a59d"}, - {file = "aiohttp-3.10.10-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f7db54c7914cc99d901d93a34704833568d86c20925b2762f9fa779f9cd2e70f"}, - {file = "aiohttp-3.10.10-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:03a42ac7895406220124c88911ebee31ba8b2d24c98507f4a8bf826b2937c7f2"}, - {file = "aiohttp-3.10.10-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:7e338c0523d024fad378b376a79faff37fafb3c001872a618cde1d322400a572"}, - {file = "aiohttp-3.10.10-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:038f514fe39e235e9fef6717fbf944057bfa24f9b3db9ee551a7ecf584b5b480"}, - {file = "aiohttp-3.10.10-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:64f6c17757251e2b8d885d728b6433d9d970573586a78b78ba8929b0f41d045a"}, - {file = "aiohttp-3.10.10-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:93429602396f3383a797a2a70e5f1de5df8e35535d7806c9f91df06f297e109b"}, - {file = "aiohttp-3.10.10-cp38-cp38-win32.whl", hash = "sha256:c823bc3971c44ab93e611ab1a46b1eafeae474c0c844aff4b7474287b75fe49c"}, - {file = "aiohttp-3.10.10-cp38-cp38-win_amd64.whl", hash = "sha256:54ca74df1be3c7ca1cf7f4c971c79c2daf48d9aa65dea1a662ae18926f5bc8ce"}, - {file = "aiohttp-3.10.10-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:01948b1d570f83ee7bbf5a60ea2375a89dfb09fd419170e7f5af029510033d24"}, - {file = "aiohttp-3.10.10-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9fc1500fd2a952c5c8e3b29aaf7e3cc6e27e9cfc0a8819b3bce48cc1b849e4cc"}, - {file = "aiohttp-3.10.10-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:f614ab0c76397661b90b6851a030004dac502e48260ea10f2441abd2207fbcc7"}, - {file = "aiohttp-3.10.10-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:00819de9e45d42584bed046314c40ea7e9aea95411b38971082cad449392b08c"}, - {file = "aiohttp-3.10.10-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:05646ebe6b94cc93407b3bf34b9eb26c20722384d068eb7339de802154d61bc5"}, - {file = "aiohttp-3.10.10-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:998f3bd3cfc95e9424a6acd7840cbdd39e45bc09ef87533c006f94ac47296090"}, - {file = "aiohttp-3.10.10-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d9010c31cd6fa59438da4e58a7f19e4753f7f264300cd152e7f90d4602449762"}, - {file = 
"aiohttp-3.10.10-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7ea7ffc6d6d6f8a11e6f40091a1040995cdff02cfc9ba4c2f30a516cb2633554"}, - {file = "aiohttp-3.10.10-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:ef9c33cc5cbca35808f6c74be11eb7f5f6b14d2311be84a15b594bd3e58b5527"}, - {file = "aiohttp-3.10.10-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:ce0cdc074d540265bfeb31336e678b4e37316849d13b308607efa527e981f5c2"}, - {file = "aiohttp-3.10.10-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:597a079284b7ee65ee102bc3a6ea226a37d2b96d0418cc9047490f231dc09fe8"}, - {file = "aiohttp-3.10.10-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:7789050d9e5d0c309c706953e5e8876e38662d57d45f936902e176d19f1c58ab"}, - {file = "aiohttp-3.10.10-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:e7f8b04d83483577fd9200461b057c9f14ced334dcb053090cea1da9c8321a91"}, - {file = "aiohttp-3.10.10-cp39-cp39-win32.whl", hash = "sha256:c02a30b904282777d872266b87b20ed8cc0d1501855e27f831320f471d54d983"}, - {file = "aiohttp-3.10.10-cp39-cp39-win_amd64.whl", hash = "sha256:edfe3341033a6b53a5c522c802deb2079eee5cbfbb0af032a55064bd65c73a23"}, - {file = "aiohttp-3.10.10.tar.gz", hash = "sha256:0631dd7c9f0822cc61c88586ca76d5b5ada26538097d0f1df510b082bad3411a"}, -] - -[package.dependencies] -aiohappyeyeballs = ">=2.3.0" -aiosignal = ">=1.1.2" -async-timeout = {version = ">=4.0,<5.0", markers = "python_version < \"3.11\""} -attrs = ">=17.3.0" -frozenlist = ">=1.1.1" -multidict = ">=4.5,<7.0" -yarl = ">=1.12.0,<2.0" - -[package.extras] -speedups = ["Brotli", "aiodns (>=3.2.0)", "brotlicffi"] - -[[package]] -name = "aiosignal" -version = "1.3.1" -description = "aiosignal: a list of registered asynchronous callbacks" -optional = true -python-versions = ">=3.7" -files = [ - {file = "aiosignal-1.3.1-py3-none-any.whl", hash = "sha256:f8376fb07dd1e86a584e4fcdec80b36b7f81aac666ebc724e2c090300dd83b17"}, - {file = "aiosignal-1.3.1.tar.gz", hash = "sha256:54cd96e15e1649b75d6c87526a6ff0b6c1b0dd3459f43d9ca11d48c339b68cfc"}, -] - -[package.dependencies] -frozenlist = ">=1.1.0" +testing = ["bitsandbytes", "datasets", "diffusers", "evaluate", "parameterized", "pytest (>=7.2.0,<=8.0.0)", "pytest-subtests", "pytest-xdist", "scikit-learn", "scipy", "timm", "torchpippy (>=0.2.0)", "tqdm", "transformers"] [[package]] name = "annotated-types" @@ -173,194 +38,156 @@ version = "0.7.0" description = "Reusable constraint types to use with typing.Annotated" optional = true python-versions = ">=3.8" +groups = ["main"] files = [ {file = "annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53"}, {file = "annotated_types-0.7.0.tar.gz", hash = "sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89"}, ] -[[package]] -name = "async-timeout" -version = "4.0.3" -description = "Timeout context manager for asyncio programs" -optional = true -python-versions = ">=3.7" -files = [ - {file = "async-timeout-4.0.3.tar.gz", hash = "sha256:4640d96be84d82d02ed59ea2b7105a0f7b33abe8703703cd0ab0bf87c427522f"}, - {file = "async_timeout-4.0.3-py3-none-any.whl", hash = "sha256:7405140ff1230c310e51dc27b3145b9092d659ce68ff733fb0cefe3ee42be028"}, -] - [[package]] name = "attrs" -version = "24.2.0" +version = "25.3.0" description = "Classes Without Boilerplate" optional = true -python-versions = ">=3.7" +python-versions = ">=3.8" +groups = ["main"] files = [ - {file = "attrs-24.2.0-py3-none-any.whl", hash = 
"sha256:81921eb96de3191c8258c199618104dd27ac608d9366f5e35d011eae1867ede2"}, - {file = "attrs-24.2.0.tar.gz", hash = "sha256:5cfb1b9148b5b086569baec03f20d7b6bf3bcacc9a42bebf87ffaaca362f6346"}, + {file = "attrs-25.3.0-py3-none-any.whl", hash = "sha256:427318ce031701fea540783410126f03899a97ffc6f61596ad581ac2e40e3bc3"}, + {file = "attrs-25.3.0.tar.gz", hash = "sha256:75d7cefc7fb576747b2c81b4442d4d4a1ce0900973527c011d1030fd3bf4af1b"}, ] [package.extras] benchmark = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-codspeed", "pytest-mypy-plugins", "pytest-xdist[psutil]"] cov = ["cloudpickle", "coverage[toml] (>=5.3)", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] -dev = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pre-commit", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] -docs = ["cogapp", "furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib-towncrier", "towncrier (<24.7)"] +dev = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pre-commit-uv", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] +docs = ["cogapp", "furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib-towncrier", "towncrier"] tests = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] tests-mypy = ["mypy (>=1.11.1)", "pytest-mypy-plugins"] -[[package]] -name = "bitsandbytes" -version = "0.43.3" -description = "k-bit optimizers and matrix multiplication routines." -optional = true -python-versions = "*" -files = [ - {file = "bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl", hash = "sha256:cc99507c352be0715098b2c7577b690dd158972dc4ea10c7495bac104c7c79f0"}, - {file = "bitsandbytes-0.43.3-py3-none-win_amd64.whl", hash = "sha256:257f6552f2144748a84e6c44e1f7a98f3da888f675ed74e18fd7f7eb13c6cafa"}, -] - -[package.dependencies] -numpy = "*" -torch = "*" - -[package.extras] -benchmark = ["matplotlib", "pandas"] -test = ["scipy"] - [[package]] name = "certifi" -version = "2024.8.30" +version = "2025.1.31" description = "Python package for providing Mozilla's CA Bundle." optional = false python-versions = ">=3.6" +groups = ["main"] files = [ - {file = "certifi-2024.8.30-py3-none-any.whl", hash = "sha256:922820b53db7a7257ffbda3f597266d435245903d80737e34f8a45ff3e3230d8"}, - {file = "certifi-2024.8.30.tar.gz", hash = "sha256:bec941d2aa8195e248a60b31ff9f0558284cf01a52591ceda73ea9afffd69fd9"}, + {file = "certifi-2025.1.31-py3-none-any.whl", hash = "sha256:ca78db4565a652026a4db2bcdf68f2fb589ea80d0be70e03929ed730746b84fe"}, + {file = "certifi-2025.1.31.tar.gz", hash = "sha256:3d5da6925056f6f18f119200434a4780a94263f10d1c21d032a6f6b2baa20651"}, ] [[package]] name = "charset-normalizer" -version = "3.4.0" +version = "3.4.1" description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." 
optional = false -python-versions = ">=3.7.0" +python-versions = ">=3.7" +groups = ["main"] files = [ - {file = "charset_normalizer-3.4.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:4f9fc98dad6c2eaa32fc3af1417d95b5e3d08aff968df0cd320066def971f9a6"}, - {file = "charset_normalizer-3.4.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0de7b687289d3c1b3e8660d0741874abe7888100efe14bd0f9fd7141bcbda92b"}, - {file = "charset_normalizer-3.4.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:5ed2e36c3e9b4f21dd9422f6893dec0abf2cca553af509b10cd630f878d3eb99"}, - {file = "charset_normalizer-3.4.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:40d3ff7fc90b98c637bda91c89d51264a3dcf210cade3a2c6f838c7268d7a4ca"}, - {file = "charset_normalizer-3.4.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1110e22af8ca26b90bd6364fe4c763329b0ebf1ee213ba32b68c73de5752323d"}, - {file = "charset_normalizer-3.4.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:86f4e8cca779080f66ff4f191a685ced73d2f72d50216f7112185dc02b90b9b7"}, - {file = "charset_normalizer-3.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7f683ddc7eedd742e2889d2bfb96d69573fde1d92fcb811979cdb7165bb9c7d3"}, - {file = "charset_normalizer-3.4.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:27623ba66c183eca01bf9ff833875b459cad267aeeb044477fedac35e19ba907"}, - {file = "charset_normalizer-3.4.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:f606a1881d2663630ea5b8ce2efe2111740df4b687bd78b34a8131baa007f79b"}, - {file = "charset_normalizer-3.4.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:0b309d1747110feb25d7ed6b01afdec269c647d382c857ef4663bbe6ad95a912"}, - {file = "charset_normalizer-3.4.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:136815f06a3ae311fae551c3df1f998a1ebd01ddd424aa5603a4336997629e95"}, - {file = "charset_normalizer-3.4.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:14215b71a762336254351b00ec720a8e85cada43b987da5a042e4ce3e82bd68e"}, - {file = "charset_normalizer-3.4.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:79983512b108e4a164b9c8d34de3992f76d48cadc9554c9e60b43f308988aabe"}, - {file = "charset_normalizer-3.4.0-cp310-cp310-win32.whl", hash = "sha256:c94057af19bc953643a33581844649a7fdab902624d2eb739738a30e2b3e60fc"}, - {file = "charset_normalizer-3.4.0-cp310-cp310-win_amd64.whl", hash = "sha256:55f56e2ebd4e3bc50442fbc0888c9d8c94e4e06a933804e2af3e89e2f9c1c749"}, - {file = "charset_normalizer-3.4.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:0d99dd8ff461990f12d6e42c7347fd9ab2532fb70e9621ba520f9e8637161d7c"}, - {file = "charset_normalizer-3.4.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c57516e58fd17d03ebe67e181a4e4e2ccab1168f8c2976c6a334d4f819fe5944"}, - {file = "charset_normalizer-3.4.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:6dba5d19c4dfab08e58d5b36304b3f92f3bd5d42c1a3fa37b5ba5cdf6dfcbcee"}, - {file = "charset_normalizer-3.4.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bf4475b82be41b07cc5e5ff94810e6a01f276e37c2d55571e3fe175e467a1a1c"}, - {file = "charset_normalizer-3.4.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ce031db0408e487fd2775d745ce30a7cd2923667cf3b69d48d219f1d8f5ddeb6"}, - {file = "charset_normalizer-3.4.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:8ff4e7cdfdb1ab5698e675ca622e72d58a6fa2a8aa58195de0c0061288e6e3ea"}, - {file = "charset_normalizer-3.4.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3710a9751938947e6327ea9f3ea6332a09bf0ba0c09cae9cb1f250bd1f1549bc"}, - {file = "charset_normalizer-3.4.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:82357d85de703176b5587dbe6ade8ff67f9f69a41c0733cf2425378b49954de5"}, - {file = "charset_normalizer-3.4.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:47334db71978b23ebcf3c0f9f5ee98b8d65992b65c9c4f2d34c2eaf5bcaf0594"}, - {file = "charset_normalizer-3.4.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:8ce7fd6767a1cc5a92a639b391891bf1c268b03ec7e021c7d6d902285259685c"}, - {file = "charset_normalizer-3.4.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:f1a2f519ae173b5b6a2c9d5fa3116ce16e48b3462c8b96dfdded11055e3d6365"}, - {file = "charset_normalizer-3.4.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:63bc5c4ae26e4bc6be6469943b8253c0fd4e4186c43ad46e713ea61a0ba49129"}, - {file = "charset_normalizer-3.4.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:bcb4f8ea87d03bc51ad04add8ceaf9b0f085ac045ab4d74e73bbc2dc033f0236"}, - {file = "charset_normalizer-3.4.0-cp311-cp311-win32.whl", hash = "sha256:9ae4ef0b3f6b41bad6366fb0ea4fc1d7ed051528e113a60fa2a65a9abb5b1d99"}, - {file = "charset_normalizer-3.4.0-cp311-cp311-win_amd64.whl", hash = "sha256:cee4373f4d3ad28f1ab6290684d8e2ebdb9e7a1b74fdc39e4c211995f77bec27"}, - {file = "charset_normalizer-3.4.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:0713f3adb9d03d49d365b70b84775d0a0d18e4ab08d12bc46baa6132ba78aaf6"}, - {file = "charset_normalizer-3.4.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:de7376c29d95d6719048c194a9cf1a1b0393fbe8488a22008610b0361d834ecf"}, - {file = "charset_normalizer-3.4.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:4a51b48f42d9358460b78725283f04bddaf44a9358197b889657deba38f329db"}, - {file = "charset_normalizer-3.4.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b295729485b06c1a0683af02a9e42d2caa9db04a373dc38a6a58cdd1e8abddf1"}, - {file = "charset_normalizer-3.4.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ee803480535c44e7f5ad00788526da7d85525cfefaf8acf8ab9a310000be4b03"}, - {file = "charset_normalizer-3.4.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3d59d125ffbd6d552765510e3f31ed75ebac2c7470c7274195b9161a32350284"}, - {file = "charset_normalizer-3.4.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8cda06946eac330cbe6598f77bb54e690b4ca93f593dee1568ad22b04f347c15"}, - {file = "charset_normalizer-3.4.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:07afec21bbbbf8a5cc3651aa96b980afe2526e7f048fdfb7f1014d84acc8b6d8"}, - {file = "charset_normalizer-3.4.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6b40e8d38afe634559e398cc32b1472f376a4099c75fe6299ae607e404c033b2"}, - {file = "charset_normalizer-3.4.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:b8dcd239c743aa2f9c22ce674a145e0a25cb1566c495928440a181ca1ccf6719"}, - {file = "charset_normalizer-3.4.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:84450ba661fb96e9fd67629b93d2941c871ca86fc38d835d19d4225ff946a631"}, - {file = "charset_normalizer-3.4.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = 
"sha256:44aeb140295a2f0659e113b31cfe92c9061622cadbc9e2a2f7b8ef6b1e29ef4b"}, - {file = "charset_normalizer-3.4.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:1db4e7fefefd0f548d73e2e2e041f9df5c59e178b4c72fbac4cc6f535cfb1565"}, - {file = "charset_normalizer-3.4.0-cp312-cp312-win32.whl", hash = "sha256:5726cf76c982532c1863fb64d8c6dd0e4c90b6ece9feb06c9f202417a31f7dd7"}, - {file = "charset_normalizer-3.4.0-cp312-cp312-win_amd64.whl", hash = "sha256:b197e7094f232959f8f20541ead1d9862ac5ebea1d58e9849c1bf979255dfac9"}, - {file = "charset_normalizer-3.4.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:dd4eda173a9fcccb5f2e2bd2a9f423d180194b1bf17cf59e3269899235b2a114"}, - {file = "charset_normalizer-3.4.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e9e3c4c9e1ed40ea53acf11e2a386383c3304212c965773704e4603d589343ed"}, - {file = "charset_normalizer-3.4.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:92a7e36b000bf022ef3dbb9c46bfe2d52c047d5e3f3343f43204263c5addc250"}, - {file = "charset_normalizer-3.4.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:54b6a92d009cbe2fb11054ba694bc9e284dad30a26757b1e372a1fdddaf21920"}, - {file = "charset_normalizer-3.4.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1ffd9493de4c922f2a38c2bf62b831dcec90ac673ed1ca182fe11b4d8e9f2a64"}, - {file = "charset_normalizer-3.4.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:35c404d74c2926d0287fbd63ed5d27eb911eb9e4a3bb2c6d294f3cfd4a9e0c23"}, - {file = "charset_normalizer-3.4.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4796efc4faf6b53a18e3d46343535caed491776a22af773f366534056c4e1fbc"}, - {file = "charset_normalizer-3.4.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e7fdd52961feb4c96507aa649550ec2a0d527c086d284749b2f582f2d40a2e0d"}, - {file = "charset_normalizer-3.4.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:92db3c28b5b2a273346bebb24857fda45601aef6ae1c011c0a997106581e8a88"}, - {file = "charset_normalizer-3.4.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:ab973df98fc99ab39080bfb0eb3a925181454d7c3ac8a1e695fddfae696d9e90"}, - {file = "charset_normalizer-3.4.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:4b67fdab07fdd3c10bb21edab3cbfe8cf5696f453afce75d815d9d7223fbe88b"}, - {file = "charset_normalizer-3.4.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:aa41e526a5d4a9dfcfbab0716c7e8a1b215abd3f3df5a45cf18a12721d31cb5d"}, - {file = "charset_normalizer-3.4.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ffc519621dce0c767e96b9c53f09c5d215578e10b02c285809f76509a3931482"}, - {file = "charset_normalizer-3.4.0-cp313-cp313-win32.whl", hash = "sha256:f19c1585933c82098c2a520f8ec1227f20e339e33aca8fa6f956f6691b784e67"}, - {file = "charset_normalizer-3.4.0-cp313-cp313-win_amd64.whl", hash = "sha256:707b82d19e65c9bd28b81dde95249b07bf9f5b90ebe1ef17d9b57473f8a64b7b"}, - {file = "charset_normalizer-3.4.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:dbe03226baf438ac4fda9e2d0715022fd579cb641c4cf639fa40d53b2fe6f3e2"}, - {file = "charset_normalizer-3.4.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dd9a8bd8900e65504a305bf8ae6fa9fbc66de94178c420791d0293702fce2df7"}, - {file = "charset_normalizer-3.4.0-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b8831399554b92b72af5932cdbbd4ddc55c55f631bb13ff8fe4e6536a06c5c51"}, - {file = 
"charset_normalizer-3.4.0-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a14969b8691f7998e74663b77b4c36c0337cb1df552da83d5c9004a93afdb574"}, - {file = "charset_normalizer-3.4.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dcaf7c1524c0542ee2fc82cc8ec337f7a9f7edee2532421ab200d2b920fc97cf"}, - {file = "charset_normalizer-3.4.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:425c5f215d0eecee9a56cdb703203dda90423247421bf0d67125add85d0c4455"}, - {file = "charset_normalizer-3.4.0-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:d5b054862739d276e09928de37c79ddeec42a6e1bfc55863be96a36ba22926f6"}, - {file = "charset_normalizer-3.4.0-cp37-cp37m-musllinux_1_2_i686.whl", hash = "sha256:f3e73a4255342d4eb26ef6df01e3962e73aa29baa3124a8e824c5d3364a65748"}, - {file = "charset_normalizer-3.4.0-cp37-cp37m-musllinux_1_2_ppc64le.whl", hash = "sha256:2f6c34da58ea9c1a9515621f4d9ac379871a8f21168ba1b5e09d74250de5ad62"}, - {file = "charset_normalizer-3.4.0-cp37-cp37m-musllinux_1_2_s390x.whl", hash = "sha256:f09cb5a7bbe1ecae6e87901a2eb23e0256bb524a79ccc53eb0b7629fbe7677c4"}, - {file = "charset_normalizer-3.4.0-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:0099d79bdfcf5c1f0c2c72f91516702ebf8b0b8ddd8905f97a8aecf49712c621"}, - {file = "charset_normalizer-3.4.0-cp37-cp37m-win32.whl", hash = "sha256:9c98230f5042f4945f957d006edccc2af1e03ed5e37ce7c373f00a5a4daa6149"}, - {file = "charset_normalizer-3.4.0-cp37-cp37m-win_amd64.whl", hash = "sha256:62f60aebecfc7f4b82e3f639a7d1433a20ec32824db2199a11ad4f5e146ef5ee"}, - {file = "charset_normalizer-3.4.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:af73657b7a68211996527dbfeffbb0864e043d270580c5aef06dc4b659a4b578"}, - {file = "charset_normalizer-3.4.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:cab5d0b79d987c67f3b9e9c53f54a61360422a5a0bc075f43cab5621d530c3b6"}, - {file = "charset_normalizer-3.4.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:9289fd5dddcf57bab41d044f1756550f9e7cf0c8e373b8cdf0ce8773dc4bd417"}, - {file = "charset_normalizer-3.4.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6b493a043635eb376e50eedf7818f2f322eabbaa974e948bd8bdd29eb7ef2a51"}, - {file = "charset_normalizer-3.4.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9fa2566ca27d67c86569e8c85297aaf413ffab85a8960500f12ea34ff98e4c41"}, - {file = "charset_normalizer-3.4.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a8e538f46104c815be19c975572d74afb53f29650ea2025bbfaef359d2de2f7f"}, - {file = "charset_normalizer-3.4.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6fd30dc99682dc2c603c2b315bded2799019cea829f8bf57dc6b61efde6611c8"}, - {file = "charset_normalizer-3.4.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2006769bd1640bdf4d5641c69a3d63b71b81445473cac5ded39740a226fa88ab"}, - {file = "charset_normalizer-3.4.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:dc15e99b2d8a656f8e666854404f1ba54765871104e50c8e9813af8a7db07f12"}, - {file = "charset_normalizer-3.4.0-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:ab2e5bef076f5a235c3774b4f4028a680432cded7cad37bba0fd90d64b187d19"}, - {file = "charset_normalizer-3.4.0-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:4ec9dd88a5b71abfc74e9df5ebe7921c35cbb3b641181a531ca65cdb5e8e4dea"}, - {file = 
"charset_normalizer-3.4.0-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:43193c5cda5d612f247172016c4bb71251c784d7a4d9314677186a838ad34858"}, - {file = "charset_normalizer-3.4.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:aa693779a8b50cd97570e5a0f343538a8dbd3e496fa5dcb87e29406ad0299654"}, - {file = "charset_normalizer-3.4.0-cp38-cp38-win32.whl", hash = "sha256:7706f5850360ac01d80c89bcef1640683cc12ed87f42579dab6c5d3ed6888613"}, - {file = "charset_normalizer-3.4.0-cp38-cp38-win_amd64.whl", hash = "sha256:c3e446d253bd88f6377260d07c895816ebf33ffffd56c1c792b13bff9c3e1ade"}, - {file = "charset_normalizer-3.4.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:980b4f289d1d90ca5efcf07958d3eb38ed9c0b7676bf2831a54d4f66f9c27dfa"}, - {file = "charset_normalizer-3.4.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:f28f891ccd15c514a0981f3b9db9aa23d62fe1a99997512b0491d2ed323d229a"}, - {file = "charset_normalizer-3.4.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a8aacce6e2e1edcb6ac625fb0f8c3a9570ccc7bfba1f63419b3769ccf6a00ed0"}, - {file = "charset_normalizer-3.4.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bd7af3717683bea4c87acd8c0d3d5b44d56120b26fd3f8a692bdd2d5260c620a"}, - {file = "charset_normalizer-3.4.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5ff2ed8194587faf56555927b3aa10e6fb69d931e33953943bc4f837dfee2242"}, - {file = "charset_normalizer-3.4.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e91f541a85298cf35433bf66f3fab2a4a2cff05c127eeca4af174f6d497f0d4b"}, - {file = "charset_normalizer-3.4.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:309a7de0a0ff3040acaebb35ec45d18db4b28232f21998851cfa709eeff49d62"}, - {file = "charset_normalizer-3.4.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:285e96d9d53422efc0d7a17c60e59f37fbf3dfa942073f666db4ac71e8d726d0"}, - {file = "charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:5d447056e2ca60382d460a604b6302d8db69476fd2015c81e7c35417cfabe4cd"}, - {file = "charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:20587d20f557fe189b7947d8e7ec5afa110ccf72a3128d61a2a387c3313f46be"}, - {file = "charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:130272c698667a982a5d0e626851ceff662565379baf0ff2cc58067b81d4f11d"}, - {file = "charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:ab22fbd9765e6954bc0bcff24c25ff71dcbfdb185fcdaca49e81bac68fe724d3"}, - {file = "charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:7782afc9b6b42200f7362858f9e73b1f8316afb276d316336c0ec3bd73312742"}, - {file = "charset_normalizer-3.4.0-cp39-cp39-win32.whl", hash = "sha256:2de62e8801ddfff069cd5c504ce3bc9672b23266597d4e4f50eda28846c322f2"}, - {file = "charset_normalizer-3.4.0-cp39-cp39-win_amd64.whl", hash = "sha256:95c3c157765b031331dd4db3c775e58deaee050a3042fcad72cbc4189d7c8dca"}, - {file = "charset_normalizer-3.4.0-py3-none-any.whl", hash = "sha256:fe9f97feb71aa9896b81973a7bbada8c49501dc73e58a10fcef6663af95e5079"}, - {file = "charset_normalizer-3.4.0.tar.gz", hash = "sha256:223217c3d4f82c3ac5e29032b3f1c2eb0fb591b72161f86d93f5719079dae93e"}, + {file = "charset_normalizer-3.4.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:91b36a978b5ae0ee86c394f5a54d6ef44db1de0815eb43de826d41d21e4af3de"}, + {file = 
"charset_normalizer-3.4.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7461baadb4dc00fd9e0acbe254e3d7d2112e7f92ced2adc96e54ef6501c5f176"}, + {file = "charset_normalizer-3.4.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e218488cd232553829be0664c2292d3af2eeeb94b32bea483cf79ac6a694e037"}, + {file = "charset_normalizer-3.4.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:80ed5e856eb7f30115aaf94e4a08114ccc8813e6ed1b5efa74f9f82e8509858f"}, + {file = "charset_normalizer-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b010a7a4fd316c3c484d482922d13044979e78d1861f0e0650423144c616a46a"}, + {file = "charset_normalizer-3.4.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4532bff1b8421fd0a320463030c7520f56a79c9024a4e88f01c537316019005a"}, + {file = "charset_normalizer-3.4.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:d973f03c0cb71c5ed99037b870f2be986c3c05e63622c017ea9816881d2dd247"}, + {file = "charset_normalizer-3.4.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:3a3bd0dcd373514dcec91c411ddb9632c0d7d92aed7093b8c3bbb6d69ca74408"}, + {file = "charset_normalizer-3.4.1-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:d9c3cdf5390dcd29aa8056d13e8e99526cda0305acc038b96b30352aff5ff2bb"}, + {file = "charset_normalizer-3.4.1-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:2bdfe3ac2e1bbe5b59a1a63721eb3b95fc9b6817ae4a46debbb4e11f6232428d"}, + {file = "charset_normalizer-3.4.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:eab677309cdb30d047996b36d34caeda1dc91149e4fdca0b1a039b3f79d9a807"}, + {file = "charset_normalizer-3.4.1-cp310-cp310-win32.whl", hash = "sha256:c0429126cf75e16c4f0ad00ee0eae4242dc652290f940152ca8c75c3a4b6ee8f"}, + {file = "charset_normalizer-3.4.1-cp310-cp310-win_amd64.whl", hash = "sha256:9f0b8b1c6d84c8034a44893aba5e767bf9c7a211e313a9605d9c617d7083829f"}, + {file = "charset_normalizer-3.4.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:8bfa33f4f2672964266e940dd22a195989ba31669bd84629f05fab3ef4e2d125"}, + {file = "charset_normalizer-3.4.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:28bf57629c75e810b6ae989f03c0828d64d6b26a5e205535585f96093e405ed1"}, + {file = "charset_normalizer-3.4.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f08ff5e948271dc7e18a35641d2f11a4cd8dfd5634f55228b691e62b37125eb3"}, + {file = "charset_normalizer-3.4.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:234ac59ea147c59ee4da87a0c0f098e9c8d169f4dc2a159ef720f1a61bbe27cd"}, + {file = "charset_normalizer-3.4.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fd4ec41f914fa74ad1b8304bbc634b3de73d2a0889bd32076342a573e0779e00"}, + {file = "charset_normalizer-3.4.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:eea6ee1db730b3483adf394ea72f808b6e18cf3cb6454b4d86e04fa8c4327a12"}, + {file = "charset_normalizer-3.4.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c96836c97b1238e9c9e3fe90844c947d5afbf4f4c92762679acfe19927d81d77"}, + {file = "charset_normalizer-3.4.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:4d86f7aff21ee58f26dcf5ae81a9addbd914115cdebcbb2217e4f0ed8982e146"}, + {file = "charset_normalizer-3.4.1-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = 
"sha256:09b5e6733cbd160dcc09589227187e242a30a49ca5cefa5a7edd3f9d19ed53fd"}, + {file = "charset_normalizer-3.4.1-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:5777ee0881f9499ed0f71cc82cf873d9a0ca8af166dfa0af8ec4e675b7df48e6"}, + {file = "charset_normalizer-3.4.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:237bdbe6159cff53b4f24f397d43c6336c6b0b42affbe857970cefbb620911c8"}, + {file = "charset_normalizer-3.4.1-cp311-cp311-win32.whl", hash = "sha256:8417cb1f36cc0bc7eaba8ccb0e04d55f0ee52df06df3ad55259b9a323555fc8b"}, + {file = "charset_normalizer-3.4.1-cp311-cp311-win_amd64.whl", hash = "sha256:d7f50a1f8c450f3925cb367d011448c39239bb3eb4117c36a6d354794de4ce76"}, + {file = "charset_normalizer-3.4.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:73d94b58ec7fecbc7366247d3b0b10a21681004153238750bb67bd9012414545"}, + {file = "charset_normalizer-3.4.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dad3e487649f498dd991eeb901125411559b22e8d7ab25d3aeb1af367df5efd7"}, + {file = "charset_normalizer-3.4.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c30197aa96e8eed02200a83fba2657b4c3acd0f0aa4bdc9f6c1af8e8962e0757"}, + {file = "charset_normalizer-3.4.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2369eea1ee4a7610a860d88f268eb39b95cb588acd7235e02fd5a5601773d4fa"}, + {file = "charset_normalizer-3.4.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc2722592d8998c870fa4e290c2eec2c1569b87fe58618e67d38b4665dfa680d"}, + {file = "charset_normalizer-3.4.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ffc9202a29ab3920fa812879e95a9e78b2465fd10be7fcbd042899695d75e616"}, + {file = "charset_normalizer-3.4.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:804a4d582ba6e5b747c625bf1255e6b1507465494a40a2130978bda7b932c90b"}, + {file = "charset_normalizer-3.4.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:0f55e69f030f7163dffe9fd0752b32f070566451afe180f99dbeeb81f511ad8d"}, + {file = "charset_normalizer-3.4.1-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:c4c3e6da02df6fa1410a7680bd3f63d4f710232d3139089536310d027950696a"}, + {file = "charset_normalizer-3.4.1-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:5df196eb874dae23dcfb968c83d4f8fdccb333330fe1fc278ac5ceeb101003a9"}, + {file = "charset_normalizer-3.4.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:e358e64305fe12299a08e08978f51fc21fac060dcfcddd95453eabe5b93ed0e1"}, + {file = "charset_normalizer-3.4.1-cp312-cp312-win32.whl", hash = "sha256:9b23ca7ef998bc739bf6ffc077c2116917eabcc901f88da1b9856b210ef63f35"}, + {file = "charset_normalizer-3.4.1-cp312-cp312-win_amd64.whl", hash = "sha256:6ff8a4a60c227ad87030d76e99cd1698345d4491638dfa6673027c48b3cd395f"}, + {file = "charset_normalizer-3.4.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:aabfa34badd18f1da5ec1bc2715cadc8dca465868a4e73a0173466b688f29dda"}, + {file = "charset_normalizer-3.4.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:22e14b5d70560b8dd51ec22863f370d1e595ac3d024cb8ad7d308b4cd95f8313"}, + {file = "charset_normalizer-3.4.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8436c508b408b82d87dc5f62496973a1805cd46727c34440b0d29d8a2f50a6c9"}, + {file = "charset_normalizer-3.4.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:2d074908e1aecee37a7635990b2c6d504cd4766c7bc9fc86d63f9c09af3fa11b"}, + {file = "charset_normalizer-3.4.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:955f8851919303c92343d2f66165294848d57e9bba6cf6e3625485a70a038d11"}, + {file = "charset_normalizer-3.4.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:44ecbf16649486d4aebafeaa7ec4c9fed8b88101f4dd612dcaf65d5e815f837f"}, + {file = "charset_normalizer-3.4.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:0924e81d3d5e70f8126529951dac65c1010cdf117bb75eb02dd12339b57749dd"}, + {file = "charset_normalizer-3.4.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:2967f74ad52c3b98de4c3b32e1a44e32975e008a9cd2a8cc8966d6a5218c5cb2"}, + {file = "charset_normalizer-3.4.1-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:c75cb2a3e389853835e84a2d8fb2b81a10645b503eca9bcb98df6b5a43eb8886"}, + {file = "charset_normalizer-3.4.1-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:09b26ae6b1abf0d27570633b2b078a2a20419c99d66fb2823173d73f188ce601"}, + {file = "charset_normalizer-3.4.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:fa88b843d6e211393a37219e6a1c1df99d35e8fd90446f1118f4216e307e48cd"}, + {file = "charset_normalizer-3.4.1-cp313-cp313-win32.whl", hash = "sha256:eb8178fe3dba6450a3e024e95ac49ed3400e506fd4e9e5c32d30adda88cbd407"}, + {file = "charset_normalizer-3.4.1-cp313-cp313-win_amd64.whl", hash = "sha256:b1ac5992a838106edb89654e0aebfc24f5848ae2547d22c2c3f66454daa11971"}, + {file = "charset_normalizer-3.4.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f30bf9fd9be89ecb2360c7d94a711f00c09b976258846efe40db3d05828e8089"}, + {file = "charset_normalizer-3.4.1-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:97f68b8d6831127e4787ad15e6757232e14e12060bec17091b85eb1486b91d8d"}, + {file = "charset_normalizer-3.4.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7974a0b5ecd505609e3b19742b60cee7aa2aa2fb3151bc917e6e2646d7667dcf"}, + {file = "charset_normalizer-3.4.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fc54db6c8593ef7d4b2a331b58653356cf04f67c960f584edb7c3d8c97e8f39e"}, + {file = "charset_normalizer-3.4.1-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:311f30128d7d333eebd7896965bfcfbd0065f1716ec92bd5638d7748eb6f936a"}, + {file = "charset_normalizer-3.4.1-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:7d053096f67cd1241601111b698f5cad775f97ab25d81567d3f59219b5f1adbd"}, + {file = "charset_normalizer-3.4.1-cp37-cp37m-musllinux_1_2_i686.whl", hash = "sha256:807f52c1f798eef6cf26beb819eeb8819b1622ddfeef9d0977a8502d4db6d534"}, + {file = "charset_normalizer-3.4.1-cp37-cp37m-musllinux_1_2_ppc64le.whl", hash = "sha256:dccbe65bd2f7f7ec22c4ff99ed56faa1e9f785482b9bbd7c717e26fd723a1d1e"}, + {file = "charset_normalizer-3.4.1-cp37-cp37m-musllinux_1_2_s390x.whl", hash = "sha256:2fb9bd477fdea8684f78791a6de97a953c51831ee2981f8e4f583ff3b9d9687e"}, + {file = "charset_normalizer-3.4.1-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:01732659ba9b5b873fc117534143e4feefecf3b2078b0a6a2e925271bb6f4cfa"}, + {file = "charset_normalizer-3.4.1-cp37-cp37m-win32.whl", hash = "sha256:7a4f97a081603d2050bfaffdefa5b02a9ec823f8348a572e39032caa8404a487"}, + {file = "charset_normalizer-3.4.1-cp37-cp37m-win_amd64.whl", hash = "sha256:7b1bef6280950ee6c177b326508f86cad7ad4dff12454483b51d8b7d673a2c5d"}, + {file 
= "charset_normalizer-3.4.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:ecddf25bee22fe4fe3737a399d0d177d72bc22be6913acfab364b40bce1ba83c"}, + {file = "charset_normalizer-3.4.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8c60ca7339acd497a55b0ea5d506b2a2612afb2826560416f6894e8b5770d4a9"}, + {file = "charset_normalizer-3.4.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b7b2d86dd06bfc2ade3312a83a5c364c7ec2e3498f8734282c6c3d4b07b346b8"}, + {file = "charset_normalizer-3.4.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dd78cfcda14a1ef52584dbb008f7ac81c1328c0f58184bf9a84c49c605002da6"}, + {file = "charset_normalizer-3.4.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6e27f48bcd0957c6d4cb9d6fa6b61d192d0b13d5ef563e5f2ae35feafc0d179c"}, + {file = "charset_normalizer-3.4.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:01ad647cdd609225c5350561d084b42ddf732f4eeefe6e678765636791e78b9a"}, + {file = "charset_normalizer-3.4.1-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:619a609aa74ae43d90ed2e89bdd784765de0a25ca761b93e196d938b8fd1dbbd"}, + {file = "charset_normalizer-3.4.1-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:89149166622f4db9b4b6a449256291dc87a99ee53151c74cbd82a53c8c2f6ccd"}, + {file = "charset_normalizer-3.4.1-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:7709f51f5f7c853f0fb938bcd3bc59cdfdc5203635ffd18bf354f6967ea0f824"}, + {file = "charset_normalizer-3.4.1-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:345b0426edd4e18138d6528aed636de7a9ed169b4aaf9d61a8c19e39d26838ca"}, + {file = "charset_normalizer-3.4.1-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:0907f11d019260cdc3f94fbdb23ff9125f6b5d1039b76003b5b0ac9d6a6c9d5b"}, + {file = "charset_normalizer-3.4.1-cp38-cp38-win32.whl", hash = "sha256:ea0d8d539afa5eb2728aa1932a988a9a7af94f18582ffae4bc10b3fbdad0626e"}, + {file = "charset_normalizer-3.4.1-cp38-cp38-win_amd64.whl", hash = "sha256:329ce159e82018d646c7ac45b01a430369d526569ec08516081727a20e9e4af4"}, + {file = "charset_normalizer-3.4.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:b97e690a2118911e39b4042088092771b4ae3fc3aa86518f84b8cf6888dbdb41"}, + {file = "charset_normalizer-3.4.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:78baa6d91634dfb69ec52a463534bc0df05dbd546209b79a3880a34487f4b84f"}, + {file = "charset_normalizer-3.4.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1a2bc9f351a75ef49d664206d51f8e5ede9da246602dc2d2726837620ea034b2"}, + {file = "charset_normalizer-3.4.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:75832c08354f595c760a804588b9357d34ec00ba1c940c15e31e96d902093770"}, + {file = "charset_normalizer-3.4.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0af291f4fe114be0280cdd29d533696a77b5b49cfde5467176ecab32353395c4"}, + {file = "charset_normalizer-3.4.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0167ddc8ab6508fe81860a57dd472b2ef4060e8d378f0cc555707126830f2537"}, + {file = "charset_normalizer-3.4.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:2a75d49014d118e4198bcee5ee0a6f25856b29b12dbf7cd012791f8a6cc5c496"}, + {file = "charset_normalizer-3.4.1-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:363e2f92b0f0174b2f8238240a1a30142e3db7b957a5dd5689b0e75fb717cc78"}, + {file = 
"charset_normalizer-3.4.1-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:ab36c8eb7e454e34e60eb55ca5d241a5d18b2c6244f6827a30e451c42410b5f7"}, + {file = "charset_normalizer-3.4.1-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:4c0907b1928a36d5a998d72d64d8eaa7244989f7aaaf947500d3a800c83a3fd6"}, + {file = "charset_normalizer-3.4.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:04432ad9479fa40ec0f387795ddad4437a2b50417c69fa275e212933519ff294"}, + {file = "charset_normalizer-3.4.1-cp39-cp39-win32.whl", hash = "sha256:3bed14e9c89dcb10e8f3a29f9ccac4955aebe93c71ae803af79265c9ca5644c5"}, + {file = "charset_normalizer-3.4.1-cp39-cp39-win_amd64.whl", hash = "sha256:49402233c892a461407c512a19435d1ce275543138294f7ef013f0b63d5d3765"}, + {file = "charset_normalizer-3.4.1-py3-none-any.whl", hash = "sha256:d98b1668f06378c6dbefec3b92299716b931cd4e6061f3c875a71ced1780ab85"}, + {file = "charset_normalizer-3.4.1.tar.gz", hash = "sha256:44251f18cd68a75b56585dd00dae26183e102cd5e0f9f1466e6df5da2ed64ea3"}, ] [[package]] name = "click" -version = "8.1.7" +version = "8.1.8" description = "Composable command line interface toolkit" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ - {file = "click-8.1.7-py3-none-any.whl", hash = "sha256:ae74fb96c20a0277a1d615f1e4d73c8414f5a98db8b799a7931d1582f3390c28"}, - {file = "click-8.1.7.tar.gz", hash = "sha256:ca9853ad459e787e2192211578cc907e7594e294c7ccc834310722b41b9ca6de"}, + {file = "click-8.1.8-py3-none-any.whl", hash = "sha256:63c132bbbed01578a06712a2d1f497bb62d9c1c0d329b7903a866228027263b2"}, + {file = "click-8.1.8.tar.gz", hash = "sha256:ed53c9d8990d83c2a27deae68e4ee337473f6330c040a31d4225c9574d16096a"}, ] [package.dependencies] @@ -368,13 +195,14 @@ colorama = {version = "*", markers = "platform_system == \"Windows\""} [[package]] name = "cloudpickle" -version = "3.1.0" +version = "3.1.1" description = "Pickler class to extend the standard pickle.Pickler functionality" optional = true python-versions = ">=3.8" +groups = ["main"] files = [ - {file = "cloudpickle-3.1.0-py3-none-any.whl", hash = "sha256:fe11acda67f61aaaec473e3afe030feb131d78a43461b718185363384f1ba12e"}, - {file = "cloudpickle-3.1.0.tar.gz", hash = "sha256:81a929b6e3c7335c863c771d673d105f02efdb89dfaba0c90495d1c64796601b"}, + {file = "cloudpickle-3.1.1-py3-none-any.whl", hash = "sha256:c8c5a44295039331ee9dad40ba100a9c7297b6f988e50e87ccdf3765a668350e"}, + {file = "cloudpickle-3.1.1.tar.gz", hash = "sha256:b216fa8ae4019d5482a8ac3c95d8f6346115d8835911fd4aefd1a445e4242c64"}, ] [[package]] @@ -383,83 +211,61 @@ version = "0.4.6" description = "Cross-platform colored terminal text." 
optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" +groups = ["main", "dev"] files = [ {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, ] - -[[package]] -name = "datasets" -version = "2.14.4" -description = "HuggingFace community-driven open-source library of datasets" -optional = true -python-versions = ">=3.8.0" -files = [ - {file = "datasets-2.14.4-py3-none-any.whl", hash = "sha256:29336bd316a7d827ccd4da2236596279b20ca2ac78f64c04c9483da7cbc2459b"}, - {file = "datasets-2.14.4.tar.gz", hash = "sha256:ef29c2b5841de488cd343cfc26ab979bff77efa4d2285af51f1ad7db5c46a83b"}, -] - -[package.dependencies] -aiohttp = "*" -dill = ">=0.3.0,<0.3.8" -fsspec = {version = ">=2021.11.1", extras = ["http"]} -huggingface-hub = ">=0.14.0,<1.0.0" -multiprocess = "*" -numpy = ">=1.17" -packaging = "*" -pandas = "*" -pyarrow = ">=8.0.0" -pyyaml = ">=5.1" -requests = ">=2.19.0" -tqdm = ">=4.62.1" -xxhash = "*" - -[package.extras] -apache-beam = ["apache-beam (>=2.26.0,<2.44.0)"] -audio = ["librosa", "soundfile (>=0.12.1)"] -benchmarks = ["tensorflow (==2.12.0)", "torch (==2.0.1)", "transformers (==4.30.1)"] -dev = ["Pillow (>=6.2.1)", "absl-py", "apache-beam (>=2.26.0,<2.44.0)", "black (>=23.1,<24.0)", "elasticsearch (<8.0.0)", "faiss-cpu (>=1.6.4)", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "pyyaml (>=5.3.1)", "rarfile (>=4.0)", "ruff (>=0.0.241)", "s3fs", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "sqlalchemy (<2.0.0)", "tensorflow (>=2.2.0,!=2.6.0,!=2.6.1)", "tensorflow (>=2.3,!=2.6.0,!=2.6.1)", "tensorflow-macos", "tiktoken", "torch", "transformers", "zstandard"] -docs = ["s3fs", "tensorflow (>=2.2.0,!=2.6.0,!=2.6.1)", "tensorflow-macos", "torch", "transformers"] -jax = ["jax (>=0.2.8,!=0.3.2,<=0.3.25)", "jaxlib (>=0.1.65,<=0.3.25)"] -metrics-tests = ["Werkzeug (>=1.0.1)", "accelerate", "bert-score (>=0.3.6)", "jiwer", "langdetect", "mauve-text", "nltk", "requests-file (>=1.5.1)", "rouge-score", "sacrebleu", "sacremoses", "scikit-learn", "scipy", "sentencepiece", "seqeval", "six (>=1.15.0,<1.16.0)", "spacy (>=3.0.0)", "texttable (>=1.6.3)", "tldextract", "tldextract (>=3.1.0)", "toml (>=0.10.1)", "typer (<0.5.0)"] -quality = ["black (>=23.1,<24.0)", "pyyaml (>=5.3.1)", "ruff (>=0.0.241)"] -s3 = ["s3fs"] -tensorflow = ["tensorflow (>=2.2.0,!=2.6.0,!=2.6.1)", "tensorflow-macos"] -tensorflow-gpu = ["tensorflow-gpu (>=2.2.0,!=2.6.0,!=2.6.1)"] -tests = ["Pillow (>=6.2.1)", "absl-py", "apache-beam (>=2.26.0,<2.44.0)", "elasticsearch (<8.0.0)", "faiss-cpu (>=1.6.4)", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "sqlalchemy (<2.0.0)", "tensorflow (>=2.3,!=2.6.0,!=2.6.1)", "tensorflow-macos", "tiktoken", "torch", "transformers", "zstandard"] -torch = ["torch"] -vision = ["Pillow (>=6.2.1)"] +markers = {main = "platform_system == \"Windows\" or sys_platform == \"win32\"", dev = "sys_platform == \"win32\""} [[package]] name = "deprecated" -version = "1.2.14" +version = "1.2.18" description = "Python @deprecated decorator to deprecate old python classes, functions or methods." 
optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,>=2.7" +groups = ["main"] files = [ - {file = "Deprecated-1.2.14-py2.py3-none-any.whl", hash = "sha256:6fac8b097794a90302bdbb17b9b815e732d3c4720583ff1b198499d78470466c"}, - {file = "Deprecated-1.2.14.tar.gz", hash = "sha256:e5323eb936458dccc2582dc6f9c322c852a775a27065ff2b0c4970b9d53d01b3"}, + {file = "Deprecated-1.2.18-py2.py3-none-any.whl", hash = "sha256:bd5011788200372a32418f888e326a09ff80d0214bd961147cfed01b5c018eec"}, + {file = "deprecated-1.2.18.tar.gz", hash = "sha256:422b6f6d859da6f2ef57857761bfb392480502a64c3028ca9bbe86085d72115d"}, ] [package.dependencies] wrapt = ">=1.10,<2" [package.extras] -dev = ["PyTest", "PyTest-Cov", "bump2version (<1)", "sphinx (<2)", "tox"] +dev = ["PyTest", "PyTest-Cov", "bump2version (<1)", "setuptools", "tox"] [[package]] -name = "dill" -version = "0.3.7" -description = "serialize all of Python" +name = "diffusers" +version = "0.31.0" +description = "State-of-the-art diffusion in PyTorch and JAX." optional = false -python-versions = ">=3.7" +python-versions = ">=3.8.0" +groups = ["main"] files = [ - {file = "dill-0.3.7-py3-none-any.whl", hash = "sha256:76b122c08ef4ce2eedcd4d1abd8e641114bfc6c2867f49f3c41facf65bf19f5e"}, - {file = "dill-0.3.7.tar.gz", hash = "sha256:cc1c8b182eb3013e24bd475ff2e9295af86c1a38eb1aff128dac8962a9ce3c03"}, + {file = "diffusers-0.31.0-py3-none-any.whl", hash = "sha256:cbc498ae63f4abfc7c3a07649cdcbee229ef2f9a9a1f0d19c9bbaf22f8d30c1f"}, + {file = "diffusers-0.31.0.tar.gz", hash = "sha256:b1d01a73e45d43a0630c299173915dddd69fc50f2ae8f2ab5de4fd245eaed72f"}, ] +[package.dependencies] +filelock = "*" +huggingface-hub = ">=0.23.2" +importlib-metadata = "*" +numpy = "*" +Pillow = "*" +regex = "!=2019.12.17" +requests = "*" +safetensors = ">=0.3.1" + [package.extras] -graph = ["objgraph (>=1.7.2)"] +dev = ["GitPython (<3.1.19)", "Jinja2", "accelerate (>=0.31.0)", "compel (==0.1.8)", "datasets", "flax (>=0.4.1)", "hf-doc-builder (>=0.3.0)", "invisible-watermark (>=0.2.0)", "isort (>=5.5.4)", "jax (>=0.4.1)", "jaxlib (>=0.4.1)", "k-diffusion (>=0.0.12)", "librosa", "parameterized", "peft (>=0.6.0)", "protobuf (>=3.20.3,<4)", "pytest", "pytest-timeout", "pytest-xdist", "requests-mock (==1.10.0)", "ruff (==0.1.5)", "safetensors (>=0.3.1)", "scipy", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "torch (>=1.4,<2.5.0)", "torchvision", "transformers (>=4.41.2)", "urllib3 (<=2.0.0)"] +docs = ["hf-doc-builder (>=0.3.0)"] +flax = ["flax (>=0.4.1)", "jax (>=0.4.1)", "jaxlib (>=0.4.1)"] +quality = ["hf-doc-builder (>=0.3.0)", "isort (>=5.5.4)", "ruff (==0.1.5)", "urllib3 (<=2.0.0)"] +test = ["GitPython (<3.1.19)", "Jinja2", "compel (==0.1.8)", "datasets", "invisible-watermark (>=0.2.0)", "k-diffusion (>=0.0.12)", "librosa", "parameterized", "pytest", "pytest-timeout", "pytest-xdist", "requests-mock (==1.10.0)", "safetensors (>=0.3.1)", "scipy", "sentencepiece (>=0.1.91,!=0.1.92)", "torchvision", "transformers (>=4.41.2)"] +torch = ["accelerate (>=0.31.0)", "torch (>=1.4,<2.5.0)"] +training = ["Jinja2", "accelerate (>=0.31.0)", "datasets", "peft (>=0.6.0)", "protobuf (>=3.20.3,<4)", "tensorboard"] [[package]] name = "diskcache" @@ -467,28 +273,20 @@ version = "5.6.3" description = "Disk Cache -- Disk and file backed persistent cache." 
optional = true python-versions = ">=3" +groups = ["main"] files = [ {file = "diskcache-5.6.3-py3-none-any.whl", hash = "sha256:5e31b2d5fbad117cc363ebaf6b689474db18a1f6438bc82358b024abd4c2ca19"}, {file = "diskcache-5.6.3.tar.gz", hash = "sha256:2c3a3fa2743d8535d832ec61c2054a1641f41775aa7c556758a109941e33e4fc"}, ] -[[package]] -name = "einops" -version = "0.6.1" -description = "A new flavour of deep learning operations" -optional = false -python-versions = ">=3.7" -files = [ - {file = "einops-0.6.1-py3-none-any.whl", hash = "sha256:99149e46cc808956b174932fe563d920db4d6e5dadb8c6ecdaa7483b7ef7cfc3"}, - {file = "einops-0.6.1.tar.gz", hash = "sha256:f95f8d00f4ded90dbc4b19b6f98b177332614b0357dde66997f3ae5d474dc8c8"}, -] - [[package]] name = "exceptiongroup" version = "1.2.2" description = "Backport of PEP 654 (exception groups)" optional = false python-versions = ">=3.7" +groups = ["dev"] +markers = "python_version < \"3.11\"" files = [ {file = "exceptiongroup-1.2.2-py3-none-any.whl", hash = "sha256:3111b9d131c238bec2f8f516e123e14ba243563fb135d3fe885990585aa7795b"}, {file = "exceptiongroup-1.2.2.tar.gz", hash = "sha256:47c2edf7c6738fafb49fd34290706d1a1a2f4d1c6df275526b62cbb4aa5393cc"}, @@ -499,120 +297,33 @@ test = ["pytest (>=6)"] [[package]] name = "filelock" -version = "3.16.1" +version = "3.18.0" description = "A platform independent file lock." optional = false -python-versions = ">=3.8" +python-versions = ">=3.9" +groups = ["main"] files = [ - {file = "filelock-3.16.1-py3-none-any.whl", hash = "sha256:2082e5703d51fbf98ea75855d9d5527e33d8ff23099bec374a134febee6946b0"}, - {file = "filelock-3.16.1.tar.gz", hash = "sha256:c249fbfcd5db47e5e2d6d62198e565475ee65e4831e2561c8e313fa7eb961435"}, + {file = "filelock-3.18.0-py3-none-any.whl", hash = "sha256:c401f4f8377c4464e6db25fff06205fd89bdd83b65eb0488ed1b160f780e21de"}, + {file = "filelock-3.18.0.tar.gz", hash = "sha256:adbc88eabb99d2fec8c9c1b229b171f18afa655400173ddc653d5d01501fb9f2"}, ] [package.extras] -docs = ["furo (>=2024.8.6)", "sphinx (>=8.0.2)", "sphinx-autodoc-typehints (>=2.4.1)"] -testing = ["covdefaults (>=2.3)", "coverage (>=7.6.1)", "diff-cover (>=9.2)", "pytest (>=8.3.3)", "pytest-asyncio (>=0.24)", "pytest-cov (>=5)", "pytest-mock (>=3.14)", "pytest-timeout (>=2.3.1)", "virtualenv (>=20.26.4)"] +docs = ["furo (>=2024.8.6)", "sphinx (>=8.1.3)", "sphinx-autodoc-typehints (>=3)"] +testing = ["covdefaults (>=2.3)", "coverage (>=7.6.10)", "diff-cover (>=9.2.1)", "pytest (>=8.3.4)", "pytest-asyncio (>=0.25.2)", "pytest-cov (>=6)", "pytest-mock (>=3.14)", "pytest-timeout (>=2.3.1)", "virtualenv (>=20.28.1)"] typing = ["typing-extensions (>=4.12.2)"] -[[package]] -name = "frozenlist" -version = "1.4.1" -description = "A list-like structure which implements collections.abc.MutableSequence" -optional = true -python-versions = ">=3.8" -files = [ - {file = "frozenlist-1.4.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:f9aa1878d1083b276b0196f2dfbe00c9b7e752475ed3b682025ff20c1c1f51ac"}, - {file = "frozenlist-1.4.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:29acab3f66f0f24674b7dc4736477bcd4bc3ad4b896f5f45379a67bce8b96868"}, - {file = "frozenlist-1.4.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:74fb4bee6880b529a0c6560885fce4dc95936920f9f20f53d99a213f7bf66776"}, - {file = "frozenlist-1.4.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:590344787a90ae57d62511dd7c736ed56b428f04cd8c161fcc5e7232c130c69a"}, - {file = 
"frozenlist-1.4.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:068b63f23b17df8569b7fdca5517edef76171cf3897eb68beb01341131fbd2ad"}, - {file = "frozenlist-1.4.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5c849d495bf5154cd8da18a9eb15db127d4dba2968d88831aff6f0331ea9bd4c"}, - {file = "frozenlist-1.4.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9750cc7fe1ae3b1611bb8cfc3f9ec11d532244235d75901fb6b8e42ce9229dfe"}, - {file = "frozenlist-1.4.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9b2de4cf0cdd5bd2dee4c4f63a653c61d2408055ab77b151c1957f221cabf2a"}, - {file = "frozenlist-1.4.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:0633c8d5337cb5c77acbccc6357ac49a1770b8c487e5b3505c57b949b4b82e98"}, - {file = "frozenlist-1.4.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:27657df69e8801be6c3638054e202a135c7f299267f1a55ed3a598934f6c0d75"}, - {file = "frozenlist-1.4.1-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:f9a3ea26252bd92f570600098783d1371354d89d5f6b7dfd87359d669f2109b5"}, - {file = "frozenlist-1.4.1-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:4f57dab5fe3407b6c0c1cc907ac98e8a189f9e418f3b6e54d65a718aaafe3950"}, - {file = "frozenlist-1.4.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:e02a0e11cf6597299b9f3bbd3f93d79217cb90cfd1411aec33848b13f5c656cc"}, - {file = "frozenlist-1.4.1-cp310-cp310-win32.whl", hash = "sha256:a828c57f00f729620a442881cc60e57cfcec6842ba38e1b19fd3e47ac0ff8dc1"}, - {file = "frozenlist-1.4.1-cp310-cp310-win_amd64.whl", hash = "sha256:f56e2333dda1fe0f909e7cc59f021eba0d2307bc6f012a1ccf2beca6ba362439"}, - {file = "frozenlist-1.4.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:a0cb6f11204443f27a1628b0e460f37fb30f624be6051d490fa7d7e26d4af3d0"}, - {file = "frozenlist-1.4.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b46c8ae3a8f1f41a0d2ef350c0b6e65822d80772fe46b653ab6b6274f61d4a49"}, - {file = "frozenlist-1.4.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:fde5bd59ab5357e3853313127f4d3565fc7dad314a74d7b5d43c22c6a5ed2ced"}, - {file = "frozenlist-1.4.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:722e1124aec435320ae01ee3ac7bec11a5d47f25d0ed6328f2273d287bc3abb0"}, - {file = "frozenlist-1.4.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2471c201b70d58a0f0c1f91261542a03d9a5e088ed3dc6c160d614c01649c106"}, - {file = "frozenlist-1.4.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c757a9dd70d72b076d6f68efdbb9bc943665ae954dad2801b874c8c69e185068"}, - {file = "frozenlist-1.4.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f146e0911cb2f1da549fc58fc7bcd2b836a44b79ef871980d605ec392ff6b0d2"}, - {file = "frozenlist-1.4.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4f9c515e7914626b2a2e1e311794b4c35720a0be87af52b79ff8e1429fc25f19"}, - {file = "frozenlist-1.4.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:c302220494f5c1ebeb0912ea782bcd5e2f8308037b3c7553fad0e48ebad6ad82"}, - {file = "frozenlist-1.4.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:442acde1e068288a4ba7acfe05f5f343e19fac87bfc96d89eb886b0363e977ec"}, - {file = "frozenlist-1.4.1-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = 
"sha256:1b280e6507ea8a4fa0c0a7150b4e526a8d113989e28eaaef946cc77ffd7efc0a"}, - {file = "frozenlist-1.4.1-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:fe1a06da377e3a1062ae5fe0926e12b84eceb8a50b350ddca72dc85015873f74"}, - {file = "frozenlist-1.4.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:db9e724bebd621d9beca794f2a4ff1d26eed5965b004a97f1f1685a173b869c2"}, - {file = "frozenlist-1.4.1-cp311-cp311-win32.whl", hash = "sha256:e774d53b1a477a67838a904131c4b0eef6b3d8a651f8b138b04f748fccfefe17"}, - {file = "frozenlist-1.4.1-cp311-cp311-win_amd64.whl", hash = "sha256:fb3c2db03683b5767dedb5769b8a40ebb47d6f7f45b1b3e3b4b51ec8ad9d9825"}, - {file = "frozenlist-1.4.1-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:1979bc0aeb89b33b588c51c54ab0161791149f2461ea7c7c946d95d5f93b56ae"}, - {file = "frozenlist-1.4.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:cc7b01b3754ea68a62bd77ce6020afaffb44a590c2289089289363472d13aedb"}, - {file = "frozenlist-1.4.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:c9c92be9fd329ac801cc420e08452b70e7aeab94ea4233a4804f0915c14eba9b"}, - {file = "frozenlist-1.4.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5c3894db91f5a489fc8fa6a9991820f368f0b3cbdb9cd8849547ccfab3392d86"}, - {file = "frozenlist-1.4.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ba60bb19387e13597fb059f32cd4d59445d7b18b69a745b8f8e5db0346f33480"}, - {file = "frozenlist-1.4.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8aefbba5f69d42246543407ed2461db31006b0f76c4e32dfd6f42215a2c41d09"}, - {file = "frozenlist-1.4.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:780d3a35680ced9ce682fbcf4cb9c2bad3136eeff760ab33707b71db84664e3a"}, - {file = "frozenlist-1.4.1-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9acbb16f06fe7f52f441bb6f413ebae6c37baa6ef9edd49cdd567216da8600cd"}, - {file = "frozenlist-1.4.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:23b701e65c7b36e4bf15546a89279bd4d8675faabc287d06bbcfac7d3c33e1e6"}, - {file = "frozenlist-1.4.1-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:3e0153a805a98f5ada7e09826255ba99fb4f7524bb81bf6b47fb702666484ae1"}, - {file = "frozenlist-1.4.1-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:dd9b1baec094d91bf36ec729445f7769d0d0cf6b64d04d86e45baf89e2b9059b"}, - {file = "frozenlist-1.4.1-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:1a4471094e146b6790f61b98616ab8e44f72661879cc63fa1049d13ef711e71e"}, - {file = "frozenlist-1.4.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:5667ed53d68d91920defdf4035d1cdaa3c3121dc0b113255124bcfada1cfa1b8"}, - {file = "frozenlist-1.4.1-cp312-cp312-win32.whl", hash = "sha256:beee944ae828747fd7cb216a70f120767fc9f4f00bacae8543c14a6831673f89"}, - {file = "frozenlist-1.4.1-cp312-cp312-win_amd64.whl", hash = "sha256:64536573d0a2cb6e625cf309984e2d873979709f2cf22839bf2d61790b448ad5"}, - {file = "frozenlist-1.4.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:20b51fa3f588ff2fe658663db52a41a4f7aa6c04f6201449c6c7c476bd255c0d"}, - {file = "frozenlist-1.4.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:410478a0c562d1a5bcc2f7ea448359fcb050ed48b3c6f6f4f18c313a9bdb1826"}, - {file = "frozenlist-1.4.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:c6321c9efe29975232da3bd0af0ad216800a47e93d763ce64f291917a381b8eb"}, - {file = 
"frozenlist-1.4.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:48f6a4533887e189dae092f1cf981f2e3885175f7a0f33c91fb5b7b682b6bab6"}, - {file = "frozenlist-1.4.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6eb73fa5426ea69ee0e012fb59cdc76a15b1283d6e32e4f8dc4482ec67d1194d"}, - {file = "frozenlist-1.4.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fbeb989b5cc29e8daf7f976b421c220f1b8c731cbf22b9130d8815418ea45887"}, - {file = "frozenlist-1.4.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:32453c1de775c889eb4e22f1197fe3bdfe457d16476ea407472b9442e6295f7a"}, - {file = "frozenlist-1.4.1-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:693945278a31f2086d9bf3df0fe8254bbeaef1fe71e1351c3bd730aa7d31c41b"}, - {file = "frozenlist-1.4.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:1d0ce09d36d53bbbe566fe296965b23b961764c0bcf3ce2fa45f463745c04701"}, - {file = "frozenlist-1.4.1-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:3a670dc61eb0d0eb7080890c13de3066790f9049b47b0de04007090807c776b0"}, - {file = "frozenlist-1.4.1-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:dca69045298ce5c11fd539682cff879cc1e664c245d1c64da929813e54241d11"}, - {file = "frozenlist-1.4.1-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:a06339f38e9ed3a64e4c4e43aec7f59084033647f908e4259d279a52d3757d09"}, - {file = "frozenlist-1.4.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:b7f2f9f912dca3934c1baec2e4585a674ef16fe00218d833856408c48d5beee7"}, - {file = "frozenlist-1.4.1-cp38-cp38-win32.whl", hash = "sha256:e7004be74cbb7d9f34553a5ce5fb08be14fb33bc86f332fb71cbe5216362a497"}, - {file = "frozenlist-1.4.1-cp38-cp38-win_amd64.whl", hash = "sha256:5a7d70357e7cee13f470c7883a063aae5fe209a493c57d86eb7f5a6f910fae09"}, - {file = "frozenlist-1.4.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:bfa4a17e17ce9abf47a74ae02f32d014c5e9404b6d9ac7f729e01562bbee601e"}, - {file = "frozenlist-1.4.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b7e3ed87d4138356775346e6845cccbe66cd9e207f3cd11d2f0b9fd13681359d"}, - {file = "frozenlist-1.4.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c99169d4ff810155ca50b4da3b075cbde79752443117d89429595c2e8e37fed8"}, - {file = "frozenlist-1.4.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:edb678da49d9f72c9f6c609fbe41a5dfb9a9282f9e6a2253d5a91e0fc382d7c0"}, - {file = "frozenlist-1.4.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6db4667b187a6742b33afbbaf05a7bc551ffcf1ced0000a571aedbb4aa42fc7b"}, - {file = "frozenlist-1.4.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:55fdc093b5a3cb41d420884cdaf37a1e74c3c37a31f46e66286d9145d2063bd0"}, - {file = "frozenlist-1.4.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:82e8211d69a4f4bc360ea22cd6555f8e61a1bd211d1d5d39d3d228b48c83a897"}, - {file = "frozenlist-1.4.1-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:89aa2c2eeb20957be2d950b85974b30a01a762f3308cd02bb15e1ad632e22dc7"}, - {file = "frozenlist-1.4.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:9d3e0c25a2350080e9319724dede4f31f43a6c9779be48021a7f4ebde8b2d742"}, - {file = "frozenlist-1.4.1-cp39-cp39-musllinux_1_1_i686.whl", hash = 
"sha256:7268252af60904bf52c26173cbadc3a071cece75f873705419c8681f24d3edea"}, - {file = "frozenlist-1.4.1-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:0c250a29735d4f15321007fb02865f0e6b6a41a6b88f1f523ca1596ab5f50bd5"}, - {file = "frozenlist-1.4.1-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:96ec70beabbd3b10e8bfe52616a13561e58fe84c0101dd031dc78f250d5128b9"}, - {file = "frozenlist-1.4.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:23b2d7679b73fe0e5a4560b672a39f98dfc6f60df63823b0a9970525325b95f6"}, - {file = "frozenlist-1.4.1-cp39-cp39-win32.whl", hash = "sha256:a7496bfe1da7fb1a4e1cc23bb67c58fab69311cc7d32b5a99c2007b4b2a0e932"}, - {file = "frozenlist-1.4.1-cp39-cp39-win_amd64.whl", hash = "sha256:e6a20a581f9ce92d389a8c7d7c3dd47c81fd5d6e655c8dddf341e14aa48659d0"}, - {file = "frozenlist-1.4.1-py3-none-any.whl", hash = "sha256:04ced3e6a46b4cfffe20f9ae482818e34eba9b5fb0ce4056e4cc9b6e212d09b7"}, - {file = "frozenlist-1.4.1.tar.gz", hash = "sha256:c037a86e8513059a2613aaba4d817bb90b9d9b6b69aace3ce9c877e8c8ed402b"}, -] - [[package]] name = "fsspec" -version = "2024.10.0" +version = "2025.3.2" description = "File-system specification" optional = false -python-versions = ">=3.8" +python-versions = ">=3.9" +groups = ["main"] files = [ - {file = "fsspec-2024.10.0-py3-none-any.whl", hash = "sha256:03b9a6785766a4de40368b88906366755e2819e758b83705c88cd7cb5fe81871"}, - {file = "fsspec-2024.10.0.tar.gz", hash = "sha256:eda2d8a4116d4f2429db8550f2457da57279247dd930bb12f821b58391359493"}, + {file = "fsspec-2025.3.2-py3-none-any.whl", hash = "sha256:2daf8dc3d1dfa65b6aa37748d112773a7a08416f6c70d96b264c96476ecaf711"}, + {file = "fsspec-2025.3.2.tar.gz", hash = "sha256:e52c77ef398680bbd6a98c0e628fbc469491282981209907bbc8aea76a04fdc6"}, ] -[package.dependencies] -aiohttp = {version = "<4.0.0a0 || >4.0.0a0,<4.0.0a1 || >4.0.0a1", optional = true, markers = "extra == \"http\""} - [package.extras] abfs = ["adlfs"] adl = ["adlfs"] @@ -637,26 +348,27 @@ sftp = ["paramiko"] smb = ["smbprotocol"] ssh = ["paramiko"] test = ["aiohttp (!=4.0.0a0,!=4.0.0a1)", "numpy", "pytest", "pytest-asyncio (!=0.22.0)", "pytest-benchmark", "pytest-cov", "pytest-mock", "pytest-recording", "pytest-rerunfailures", "requests"] -test-downstream = ["aiobotocore (>=2.5.4,<3.0.0)", "dask-expr", "dask[dataframe,test]", "moto[server] (>4,<5)", "pytest-timeout", "xarray"] +test-downstream = ["aiobotocore (>=2.5.4,<3.0.0)", "dask[dataframe,test]", "moto[server] (>4,<5)", "pytest-timeout", "xarray"] test-full = ["adlfs", "aiohttp (!=4.0.0a0,!=4.0.0a1)", "cloudpickle", "dask", "distributed", "dropbox", "dropboxdrivefs", "fastparquet", "fusepy", "gcsfs", "jinja2", "kerchunk", "libarchive-c", "lz4", "notebook", "numpy", "ocifs", "pandas", "panel", "paramiko", "pyarrow", "pyarrow (>=1)", "pyftpdlib", "pygit2", "pytest", "pytest-asyncio (!=0.22.0)", "pytest-benchmark", "pytest-cov", "pytest-mock", "pytest-recording", "pytest-rerunfailures", "python-snappy", "requests", "smbprotocol", "tqdm", "urllib3", "zarr", "zstandard"] tqdm = ["tqdm"] [[package]] name = "googleapis-common-protos" -version = "1.65.0" +version = "1.70.0" description = "Common protobufs used in Google APIs" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ - {file = "googleapis_common_protos-1.65.0-py2.py3-none-any.whl", hash = "sha256:2972e6c496f435b92590fd54045060867f3fe9be2c82ab148fc8885035479a63"}, - {file = "googleapis_common_protos-1.65.0.tar.gz", hash = "sha256:334a29d07cddc3aa01dee4988f9afd9b2916ee2ff49d6b757155dc0d197852c0"}, + {file = 
"googleapis_common_protos-1.70.0-py3-none-any.whl", hash = "sha256:b8bfcca8c25a2bb253e0e0b0adaf8c00773e5e6af6fd92397576680b807e0fd8"}, + {file = "googleapis_common_protos-1.70.0.tar.gz", hash = "sha256:0e1b44e0ea153e6594f9f394fef15193a68aaaea2d843f83e2742717ca753257"}, ] [package.dependencies] -protobuf = ">=3.20.2,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4.21.4 || >4.21.4,<4.21.5 || >4.21.5,<6.0.0.dev0" +protobuf = ">=3.20.2,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4.21.4 || >4.21.4,<4.21.5 || >4.21.5,<7.0.0" [package.extras] -grpc = ["grpcio (>=1.44.0,<2.0.0.dev0)"] +grpc = ["grpcio (>=1.44.0,<2.0.0)"] [[package]] name = "grpc-interceptor" @@ -664,6 +376,7 @@ version = "0.15.4" description = "Simplifies gRPC interceptors" optional = false python-versions = ">=3.7,<4.0" +groups = ["main"] files = [ {file = "grpc-interceptor-0.15.4.tar.gz", hash = "sha256:1f45c0bcb58b6f332f37c637632247c9b02bc6af0fdceb7ba7ce8d2ebbfb0926"}, {file = "grpc_interceptor-0.15.4-py3-none-any.whl", hash = "sha256:0035f33228693ed3767ee49d937bac424318db173fef4d2d0170b3215f254d9d"}, @@ -677,237 +390,212 @@ testing = ["protobuf (>=4.21.9)"] [[package]] name = "grpcio" -version = "1.67.0" +version = "1.72.0rc1" description = "HTTP/2-based RPC framework" optional = false -python-versions = ">=3.8" +python-versions = ">=3.9" +groups = ["main", "dev"] files = [ - {file = "grpcio-1.67.0-cp310-cp310-linux_armv7l.whl", hash = "sha256:bd79929b3bb96b54df1296cd3bf4d2b770bd1df6c2bdf549b49bab286b925cdc"}, - {file = "grpcio-1.67.0-cp310-cp310-macosx_12_0_universal2.whl", hash = "sha256:16724ffc956ea42967f5758c2f043faef43cb7e48a51948ab593570570d1e68b"}, - {file = "grpcio-1.67.0-cp310-cp310-manylinux_2_17_aarch64.whl", hash = "sha256:2b7183c80b602b0ad816315d66f2fb7887614ead950416d60913a9a71c12560d"}, - {file = "grpcio-1.67.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:efe32b45dd6d118f5ea2e5deaed417d8a14976325c93812dd831908522b402c9"}, - {file = "grpcio-1.67.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fe89295219b9c9e47780a0f1c75ca44211e706d1c598242249fe717af3385ec8"}, - {file = "grpcio-1.67.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:aa8d025fae1595a207b4e47c2e087cb88d47008494db258ac561c00877d4c8f8"}, - {file = "grpcio-1.67.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:f95e15db43e75a534420e04822df91f645664bf4ad21dfaad7d51773c80e6bb4"}, - {file = "grpcio-1.67.0-cp310-cp310-win32.whl", hash = "sha256:a6b9a5c18863fd4b6624a42e2712103fb0f57799a3b29651c0e5b8119a519d65"}, - {file = "grpcio-1.67.0-cp310-cp310-win_amd64.whl", hash = "sha256:b6eb68493a05d38b426604e1dc93bfc0137c4157f7ab4fac5771fd9a104bbaa6"}, - {file = "grpcio-1.67.0-cp311-cp311-linux_armv7l.whl", hash = "sha256:e91d154689639932305b6ea6f45c6e46bb51ecc8ea77c10ef25aa77f75443ad4"}, - {file = "grpcio-1.67.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:cb204a742997277da678611a809a8409657b1398aaeebf73b3d9563b7d154c13"}, - {file = "grpcio-1.67.0-cp311-cp311-manylinux_2_17_aarch64.whl", hash = "sha256:ae6de510f670137e755eb2a74b04d1041e7210af2444103c8c95f193340d17ee"}, - {file = "grpcio-1.67.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:74b900566bdf68241118f2918d312d3bf554b2ce0b12b90178091ea7d0a17b3d"}, - {file = "grpcio-1.67.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a4e95e43447a02aa603abcc6b5e727d093d161a869c83b073f50b9390ecf0fa8"}, - {file = "grpcio-1.67.0-cp311-cp311-musllinux_1_1_i686.whl", hash = 
"sha256:0bb94e66cd8f0baf29bd3184b6aa09aeb1a660f9ec3d85da615c5003154bc2bf"}, - {file = "grpcio-1.67.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:82e5bd4b67b17c8c597273663794a6a46a45e44165b960517fe6d8a2f7f16d23"}, - {file = "grpcio-1.67.0-cp311-cp311-win32.whl", hash = "sha256:7fc1d2b9fd549264ae585026b266ac2db53735510a207381be509c315b4af4e8"}, - {file = "grpcio-1.67.0-cp311-cp311-win_amd64.whl", hash = "sha256:ac11ecb34a86b831239cc38245403a8de25037b448464f95c3315819e7519772"}, - {file = "grpcio-1.67.0-cp312-cp312-linux_armv7l.whl", hash = "sha256:227316b5631260e0bef8a3ce04fa7db4cc81756fea1258b007950b6efc90c05d"}, - {file = "grpcio-1.67.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:d90cfdafcf4b45a7a076e3e2a58e7bc3d59c698c4f6470b0bb13a4d869cf2273"}, - {file = "grpcio-1.67.0-cp312-cp312-manylinux_2_17_aarch64.whl", hash = "sha256:77196216d5dd6f99af1c51e235af2dd339159f657280e65ce7e12c1a8feffd1d"}, - {file = "grpcio-1.67.0-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:15c05a26a0f7047f720da41dc49406b395c1470eef44ff7e2c506a47ac2c0591"}, - {file = "grpcio-1.67.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3840994689cc8cbb73d60485c594424ad8adb56c71a30d8948d6453083624b52"}, - {file = "grpcio-1.67.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:5a1e03c3102b6451028d5dc9f8591131d6ab3c8a0e023d94c28cb930ed4b5f81"}, - {file = "grpcio-1.67.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:682968427a63d898759474e3b3178d42546e878fdce034fd7474ef75143b64e3"}, - {file = "grpcio-1.67.0-cp312-cp312-win32.whl", hash = "sha256:d01793653248f49cf47e5695e0a79805b1d9d4eacef85b310118ba1dfcd1b955"}, - {file = "grpcio-1.67.0-cp312-cp312-win_amd64.whl", hash = "sha256:985b2686f786f3e20326c4367eebdaed3e7aa65848260ff0c6644f817042cb15"}, - {file = "grpcio-1.67.0-cp313-cp313-linux_armv7l.whl", hash = "sha256:8c9a35b8bc50db35ab8e3e02a4f2a35cfba46c8705c3911c34ce343bd777813a"}, - {file = "grpcio-1.67.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:42199e704095b62688998c2d84c89e59a26a7d5d32eed86d43dc90e7a3bd04aa"}, - {file = "grpcio-1.67.0-cp313-cp313-manylinux_2_17_aarch64.whl", hash = "sha256:c4c425f440fb81f8d0237c07b9322fc0fb6ee2b29fbef5f62a322ff8fcce240d"}, - {file = "grpcio-1.67.0-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:323741b6699cd2b04a71cb38f502db98f90532e8a40cb675393d248126a268af"}, - {file = "grpcio-1.67.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:662c8e105c5e5cee0317d500eb186ed7a93229586e431c1bf0c9236c2407352c"}, - {file = "grpcio-1.67.0-cp313-cp313-musllinux_1_1_i686.whl", hash = "sha256:f6bd2ab135c64a4d1e9e44679a616c9bc944547357c830fafea5c3caa3de5153"}, - {file = "grpcio-1.67.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:2f55c1e0e2ae9bdd23b3c63459ee4c06d223b68aeb1961d83c48fb63dc29bc03"}, - {file = "grpcio-1.67.0-cp313-cp313-win32.whl", hash = "sha256:fd6bc27861e460fe28e94226e3673d46e294ca4673d46b224428d197c5935e69"}, - {file = "grpcio-1.67.0-cp313-cp313-win_amd64.whl", hash = "sha256:cf51d28063338608cd8d3cd64677e922134837902b70ce00dad7f116e3998210"}, - {file = "grpcio-1.67.0-cp38-cp38-linux_armv7l.whl", hash = "sha256:7f200aca719c1c5dc72ab68be3479b9dafccdf03df530d137632c534bb6f1ee3"}, - {file = "grpcio-1.67.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:0892dd200ece4822d72dd0952f7112c542a487fc48fe77568deaaa399c1e717d"}, - {file = "grpcio-1.67.0-cp38-cp38-manylinux_2_17_aarch64.whl", hash = 
"sha256:f4d613fbf868b2e2444f490d18af472ccb47660ea3df52f068c9c8801e1f3e85"}, - {file = "grpcio-1.67.0-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0c69bf11894cad9da00047f46584d5758d6ebc9b5950c0dc96fec7e0bce5cde9"}, - {file = "grpcio-1.67.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b9bca3ca0c5e74dea44bf57d27e15a3a3996ce7e5780d61b7c72386356d231db"}, - {file = "grpcio-1.67.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:014dfc020e28a0d9be7e93a91f85ff9f4a87158b7df9952fe23cc42d29d31e1e"}, - {file = "grpcio-1.67.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:d4ea4509d42c6797539e9ec7496c15473177ce9abc89bc5c71e7abe50fc25737"}, - {file = "grpcio-1.67.0-cp38-cp38-win32.whl", hash = "sha256:9d75641a2fca9ae1ae86454fd25d4c298ea8cc195dbc962852234d54a07060ad"}, - {file = "grpcio-1.67.0-cp38-cp38-win_amd64.whl", hash = "sha256:cff8e54d6a463883cda2fab94d2062aad2f5edd7f06ae3ed030f2a74756db365"}, - {file = "grpcio-1.67.0-cp39-cp39-linux_armv7l.whl", hash = "sha256:62492bd534979e6d7127b8a6b29093161a742dee3875873e01964049d5250a74"}, - {file = "grpcio-1.67.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:eef1dce9d1a46119fd09f9a992cf6ab9d9178b696382439446ca5f399d7b96fe"}, - {file = "grpcio-1.67.0-cp39-cp39-manylinux_2_17_aarch64.whl", hash = "sha256:f623c57a5321461c84498a99dddf9d13dac0e40ee056d884d6ec4ebcab647a78"}, - {file = "grpcio-1.67.0-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:54d16383044e681f8beb50f905249e4e7261dd169d4aaf6e52eab67b01cbbbe2"}, - {file = "grpcio-1.67.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b2a44e572fb762c668e4812156b81835f7aba8a721b027e2d4bb29fb50ff4d33"}, - {file = "grpcio-1.67.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:391df8b0faac84d42f5b8dfc65f5152c48ed914e13c522fd05f2aca211f8bfad"}, - {file = "grpcio-1.67.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:cfd9306511fdfc623a1ba1dc3bc07fbd24e6cfbe3c28b4d1e05177baa2f99617"}, - {file = "grpcio-1.67.0-cp39-cp39-win32.whl", hash = "sha256:30d47dbacfd20cbd0c8be9bfa52fdb833b395d4ec32fe5cff7220afc05d08571"}, - {file = "grpcio-1.67.0-cp39-cp39-win_amd64.whl", hash = "sha256:f55f077685f61f0fbd06ea355142b71e47e4a26d2d678b3ba27248abfe67163a"}, - {file = "grpcio-1.67.0.tar.gz", hash = "sha256:e090b2553e0da1c875449c8e75073dd4415dd71c9bde6a406240fdf4c0ee467c"}, + {file = "grpcio-1.72.0rc1-cp310-cp310-linux_armv7l.whl", hash = "sha256:db7db4b246a7fb21aeb70e7220be480948aa9c535eaa777ea0c840416ed8cac9"}, + {file = "grpcio-1.72.0rc1-cp310-cp310-macosx_12_0_universal2.whl", hash = "sha256:baf028e61662fd320c18fb50070b6e330fa24b2b3a4d113f4d57b41e0f5b5873"}, + {file = "grpcio-1.72.0rc1-cp310-cp310-manylinux_2_17_aarch64.whl", hash = "sha256:bf84cf17dfbf49ebe11b081b7a3c83b23625a80c979741e2e98b0ddb41080397"}, + {file = "grpcio-1.72.0rc1-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3fd6f8700d34754b32d13af234da2e413f408c8b741c8039f11beb06d53c3f6a"}, + {file = "grpcio-1.72.0rc1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f05d243b8d814dd1c6fca19e4e0c5986fc70e2c3aa29e2c7c67e877e4c03ede6"}, + {file = "grpcio-1.72.0rc1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:390a70394e2c315d7c480496db259ec16c00baeebf759c8967247269f0fee981"}, + {file = "grpcio-1.72.0rc1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:b08973c62eda11343e7131d78635d50ae0c138a8f39eb817ca83cca842527d04"}, + {file = "grpcio-1.72.0rc1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = 
"sha256:ce539397a258af1dee26118c40327004d023617bc99493baaf8e7938491f7361"}, + {file = "grpcio-1.72.0rc1-cp310-cp310-win32.whl", hash = "sha256:4f97f628095bbdf6d4c2c15c1bc18f0514f90781528bc6082bb697ccc71d4f42"}, + {file = "grpcio-1.72.0rc1-cp310-cp310-win_amd64.whl", hash = "sha256:dbcdf7a5463b61fca1586b54f7ea3c9dfd159f535224f457ae307f52d8d4a839"}, + {file = "grpcio-1.72.0rc1-cp311-cp311-linux_armv7l.whl", hash = "sha256:23ebb3947783f10fec3e1d0b29b94db8e72f721900d1dd9c1d6db5876da69066"}, + {file = "grpcio-1.72.0rc1-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:fd96b20846907ed4cd95bf1d628f16732f450114bde897eedb323fc3bc1eddb3"}, + {file = "grpcio-1.72.0rc1-cp311-cp311-manylinux_2_17_aarch64.whl", hash = "sha256:6df1ba4a5f5793ae210699e1b1745f77a4ac17f73510fc36ee12c215f02523b4"}, + {file = "grpcio-1.72.0rc1-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a3398957c611f0af7cee4fdd34268b6664be8689eae0327440efb794e544908b"}, + {file = "grpcio-1.72.0rc1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9ef66029da9cbe94ba3047c1b04653e1d5096ca8d036eb6e24092f0e847d2c4f"}, + {file = "grpcio-1.72.0rc1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6566e3e3458805381f8714492e8f559f082f8955ccd1c98d71f8afc0612dc841"}, + {file = "grpcio-1.72.0rc1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:3c799bfa92450e95d3f1f9cc4b7d8cbefc8bd4356d3f6573d2fb5e698353192a"}, + {file = "grpcio-1.72.0rc1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:a251992531f3b16be3c013ec45a9caa69ecfe9b45335652d5681659f6d117233"}, + {file = "grpcio-1.72.0rc1-cp311-cp311-win32.whl", hash = "sha256:c9e5f2c628dedf0886b774eee17e003a043941024e68ee2ebe76be6981a7baab"}, + {file = "grpcio-1.72.0rc1-cp311-cp311-win_amd64.whl", hash = "sha256:8b9c0a84ff584da3f5c0cb04ee3d87c0bc70d41ab5a21d3b943963a94c622892"}, + {file = "grpcio-1.72.0rc1-cp312-cp312-linux_armv7l.whl", hash = "sha256:188ac9d8cb05c250e212ba946a65a8541419bdfd803373d6b7fb8b10fe5ff991"}, + {file = "grpcio-1.72.0rc1-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:8bd956711dc21235bc78a70bf04a28b3f747c6576b9bb79362803707fec9f705"}, + {file = "grpcio-1.72.0rc1-cp312-cp312-manylinux_2_17_aarch64.whl", hash = "sha256:b032b9cbb325e28ff847b6aae1df5a090aa49b682dc80c926b24a96de43c01aa"}, + {file = "grpcio-1.72.0rc1-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1ca12a4388a40eb0411264af291184e2cca38176996b591ac047844abd81d40b"}, + {file = "grpcio-1.72.0rc1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e7cefd52f392f4d6747b401f825901c48176737f7b03b17be0a0a638da194749"}, + {file = "grpcio-1.72.0rc1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:1a24408fb051b70efa440b95f7e1acbb1c3067609934aa53a953d8d2cfc4d824"}, + {file = "grpcio-1.72.0rc1-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:c7b37608d14792d3dacb9aba55b96a17a074e139c4567b0ac5c1926302add910"}, + {file = "grpcio-1.72.0rc1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:81ca42a96299ca617f3bc7b60660f15cabb98de6fce440ecd4d0640a5554345f"}, + {file = "grpcio-1.72.0rc1-cp312-cp312-win32.whl", hash = "sha256:9ff2ef2a553d4edc8c620df3735b15a1e7dc05a60262e8c28445f2676fb09189"}, + {file = "grpcio-1.72.0rc1-cp312-cp312-win_amd64.whl", hash = "sha256:3c9a6613662591c198d9e4e499f3336bc5c1c0e3fe3f0922cf48e74b37b3dcd1"}, + {file = "grpcio-1.72.0rc1-cp313-cp313-linux_armv7l.whl", hash = "sha256:995e3e5c43cab6d0f1922b43b3c01a2624a4497ce91c3124e807497654301c59"}, + {file = 
"grpcio-1.72.0rc1-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:8dfb0ff2ddd708dbecdffa37245b79aef707e789ffb0fc6a8be01608d982afcd"}, + {file = "grpcio-1.72.0rc1-cp313-cp313-manylinux_2_17_aarch64.whl", hash = "sha256:7e08eb53d6123995da63df90ce50e5b834de0a8ebfb1a3ac0890a2e246d2771c"}, + {file = "grpcio-1.72.0rc1-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:71cb52c0956fe7868692b490fda341a52d8187fab94e1136f5bd253c8e3560ac"}, + {file = "grpcio-1.72.0rc1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dcf76ce8d4a6829f112ad88c4e6d528dbef922e01834d4a5cc3718bf599f7e84"}, + {file = "grpcio-1.72.0rc1-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:8852b6234a52b6b694a5f9a5a687d59127b3e71c8e345eebd6d483abbc412217"}, + {file = "grpcio-1.72.0rc1-cp313-cp313-musllinux_1_1_i686.whl", hash = "sha256:d1a0fee8420d9e453dc8cba1c7c067ca2d3054487cb6616ab8dad41f15e57465"}, + {file = "grpcio-1.72.0rc1-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:a13149f4fd3904093fa2dba484744dd7205f536650a533ab24dd95cca393c14c"}, + {file = "grpcio-1.72.0rc1-cp313-cp313-win32.whl", hash = "sha256:cebe148511a1965363fc6aafd60a488fe9dc5d74dd92a59a8ecba66ddd53c573"}, + {file = "grpcio-1.72.0rc1-cp313-cp313-win_amd64.whl", hash = "sha256:843352c352970a1df5bbf7da68d2770781f4bff2c85a4a0d20cc6eaaadf26e59"}, + {file = "grpcio-1.72.0rc1-cp39-cp39-linux_armv7l.whl", hash = "sha256:2083c0cdff47ff7d4b093d05d703baeeef8db3b2c1f43c9f9d4288a99e444cdd"}, + {file = "grpcio-1.72.0rc1-cp39-cp39-macosx_11_0_universal2.whl", hash = "sha256:42df7e0f9d66f5c9b246d8e1da74605bce27b10dec20b6fc204edd6e7178da2d"}, + {file = "grpcio-1.72.0rc1-cp39-cp39-manylinux_2_17_aarch64.whl", hash = "sha256:1190c2e4f221b5bd0e6eba3e44d6758ef48eeb2216dcb9734c158e8a5d8ce6a3"}, + {file = "grpcio-1.72.0rc1-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6d6c8d2ea63e1cdaaa81271e5c867fcd9732050324df372ff9d3163968be68c8"}, + {file = "grpcio-1.72.0rc1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f6ee161b9d112232e5d6be437bf56383dca2334bd17e8b7a4a3f97f33722bdd"}, + {file = "grpcio-1.72.0rc1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:9abbdf945e3b151603d642f2bc7a637b87af2e3480ed047689bad9eb4fa9c712"}, + {file = "grpcio-1.72.0rc1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:2edab5d26319a1fed695ec658efe3846b75e0c7f3a6202b042099c9b11dc10fd"}, + {file = "grpcio-1.72.0rc1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:03b46e0041bee18a786ccef978bc29a26e4bd1b73a6ca0b21252387167843ff1"}, + {file = "grpcio-1.72.0rc1-cp39-cp39-win32.whl", hash = "sha256:9b861cbfb63433e02b52f9971644095bec4a5fcd1e4d3f94e18cfad38f649d53"}, + {file = "grpcio-1.72.0rc1-cp39-cp39-win_amd64.whl", hash = "sha256:2416792a567cba9f92bffc1a55ce0f2c8106956a2e32bfe8a22a8094a56b7108"}, + {file = "grpcio-1.72.0rc1.tar.gz", hash = "sha256:221793dccd3332060f426975a041d319d6d57323d857d4afc25257ec4a5a67f3"}, ] [package.extras] -protobuf = ["grpcio-tools (>=1.67.0)"] +protobuf = ["grpcio-tools (>=1.72.0rc1)"] [[package]] name = "grpcio-reflection" -version = "1.62.3" +version = "1.71.0" description = "Standard Protobuf Reflection Service for gRPC" optional = false -python-versions = ">=3.6" +python-versions = ">=3.9" +groups = ["main"] files = [ - {file = "grpcio-reflection-1.62.3.tar.gz", hash = "sha256:cb84682933c400bddf94dd94f928d1c6570f500b6dd255973d4bfb495b82585f"}, - {file = "grpcio_reflection-1.62.3-py3-none-any.whl", hash = 
"sha256:a48ef37df81a3bada78261fc92ef382f061112f989d1312398b945cc69838b9c"}, + {file = "grpcio_reflection-1.71.0-py3-none-any.whl", hash = "sha256:8c88bdd9c92fcdd4d5df119997be05ecd0d7e10d377ec4a5072db507d2894612"}, + {file = "grpcio_reflection-1.71.0.tar.gz", hash = "sha256:51504e977057ffabe66d1ed55557b15e969c42bb3a1f28ee45d730dd5f983bb5"}, ] [package.dependencies] -grpcio = ">=1.62.3" -protobuf = ">=4.21.6" +grpcio = ">=1.71.0" +protobuf = ">=5.26.1,<6.0dev" [[package]] name = "grpcio-status" -version = "1.62.3" +version = "1.71.0" description = "Status proto mapping for gRPC" optional = false -python-versions = ">=3.6" +python-versions = ">=3.9" +groups = ["main"] files = [ - {file = "grpcio-status-1.62.3.tar.gz", hash = "sha256:289bdd7b2459794a12cf95dc0cb727bd4a1742c37bd823f760236c937e53a485"}, - {file = "grpcio_status-1.62.3-py3-none-any.whl", hash = "sha256:f9049b762ba8de6b1086789d8315846e094edac2c50beaf462338b301a8fd4b8"}, + {file = "grpcio_status-1.71.0-py3-none-any.whl", hash = "sha256:843934ef8c09e3e858952887467f8256aac3910c55f077a359a65b2b3cde3e68"}, + {file = "grpcio_status-1.71.0.tar.gz", hash = "sha256:11405fed67b68f406b3f3c7c5ae5104a79d2d309666d10d61b152e91d28fb968"}, ] [package.dependencies] googleapis-common-protos = ">=1.5.5" -grpcio = ">=1.62.3" -protobuf = ">=4.21.6" +grpcio = ">=1.71.0" +protobuf = ">=5.26.1,<6.0dev" [[package]] name = "grpcio-tools" -version = "1.62.3" +version = "1.71.0" description = "Protobuf code generator for gRPC" optional = false -python-versions = ">=3.7" +python-versions = ">=3.9" +groups = ["dev"] files = [ - {file = "grpcio-tools-1.62.3.tar.gz", hash = "sha256:7c7136015c3d62c3eef493efabaf9e3380e3e66d24ee8e94c01cb71377f57833"}, - {file = "grpcio_tools-1.62.3-cp310-cp310-macosx_12_0_universal2.whl", hash = "sha256:2f968b049c2849540751ec2100ab05e8086c24bead769ca734fdab58698408c1"}, - {file = "grpcio_tools-1.62.3-cp310-cp310-manylinux_2_17_aarch64.whl", hash = "sha256:0a8c0c4724ae9c2181b7dbc9b186df46e4f62cb18dc184e46d06c0ebeccf569e"}, - {file = "grpcio_tools-1.62.3-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5782883a27d3fae8c425b29a9d3dcf5f47d992848a1b76970da3b5a28d424b26"}, - {file = "grpcio_tools-1.62.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f3d812daffd0c2d2794756bd45a353f89e55dc8f91eb2fc840c51b9f6be62667"}, - {file = "grpcio_tools-1.62.3-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:b47d0dda1bdb0a0ba7a9a6de88e5a1ed61f07fad613964879954961e36d49193"}, - {file = "grpcio_tools-1.62.3-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:ca246dffeca0498be9b4e1ee169b62e64694b0f92e6d0be2573e65522f39eea9"}, - {file = "grpcio_tools-1.62.3-cp310-cp310-win32.whl", hash = "sha256:6a56d344b0bab30bf342a67e33d386b0b3c4e65868ffe93c341c51e1a8853ca5"}, - {file = "grpcio_tools-1.62.3-cp310-cp310-win_amd64.whl", hash = "sha256:710fecf6a171dcbfa263a0a3e7070e0df65ba73158d4c539cec50978f11dad5d"}, - {file = "grpcio_tools-1.62.3-cp311-cp311-macosx_10_10_universal2.whl", hash = "sha256:703f46e0012af83a36082b5f30341113474ed0d91e36640da713355cd0ea5d23"}, - {file = "grpcio_tools-1.62.3-cp311-cp311-manylinux_2_17_aarch64.whl", hash = "sha256:7cc83023acd8bc72cf74c2edbe85b52098501d5b74d8377bfa06f3e929803492"}, - {file = "grpcio_tools-1.62.3-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7ff7d58a45b75df67d25f8f144936a3e44aabd91afec833ee06826bd02b7fbe7"}, - {file = "grpcio_tools-1.62.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:7f2483ea232bd72d98a6dc6d7aefd97e5bc80b15cd909b9e356d6f3e326b6e43"}, - {file = "grpcio_tools-1.62.3-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:962c84b4da0f3b14b3cdb10bc3837ebc5f136b67d919aea8d7bb3fd3df39528a"}, - {file = "grpcio_tools-1.62.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:8ad0473af5544f89fc5a1ece8676dd03bdf160fb3230f967e05d0f4bf89620e3"}, - {file = "grpcio_tools-1.62.3-cp311-cp311-win32.whl", hash = "sha256:db3bc9fa39afc5e4e2767da4459df82b095ef0cab2f257707be06c44a1c2c3e5"}, - {file = "grpcio_tools-1.62.3-cp311-cp311-win_amd64.whl", hash = "sha256:e0898d412a434e768a0c7e365acabe13ff1558b767e400936e26b5b6ed1ee51f"}, - {file = "grpcio_tools-1.62.3-cp312-cp312-macosx_10_10_universal2.whl", hash = "sha256:d102b9b21c4e1e40af9a2ab3c6d41afba6bd29c0aa50ca013bf85c99cdc44ac5"}, - {file = "grpcio_tools-1.62.3-cp312-cp312-manylinux_2_17_aarch64.whl", hash = "sha256:0a52cc9444df978438b8d2332c0ca99000521895229934a59f94f37ed896b133"}, - {file = "grpcio_tools-1.62.3-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:141d028bf5762d4a97f981c501da873589df3f7e02f4c1260e1921e565b376fa"}, - {file = "grpcio_tools-1.62.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:47a5c093ab256dec5714a7a345f8cc89315cb57c298b276fa244f37a0ba507f0"}, - {file = "grpcio_tools-1.62.3-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:f6831fdec2b853c9daa3358535c55eed3694325889aa714070528cf8f92d7d6d"}, - {file = "grpcio_tools-1.62.3-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:e02d7c1a02e3814c94ba0cfe43d93e872c758bd8fd5c2797f894d0c49b4a1dfc"}, - {file = "grpcio_tools-1.62.3-cp312-cp312-win32.whl", hash = "sha256:b881fd9505a84457e9f7e99362eeedd86497b659030cf57c6f0070df6d9c2b9b"}, - {file = "grpcio_tools-1.62.3-cp312-cp312-win_amd64.whl", hash = "sha256:11c625eebefd1fd40a228fc8bae385e448c7e32a6ae134e43cf13bbc23f902b7"}, - {file = "grpcio_tools-1.62.3-cp37-cp37m-macosx_10_10_universal2.whl", hash = "sha256:ec6fbded0c61afe6f84e3c2a43e6d656791d95747d6d28b73eff1af64108c434"}, - {file = "grpcio_tools-1.62.3-cp37-cp37m-manylinux_2_17_aarch64.whl", hash = "sha256:bfda6ee8990997a9df95c5606f3096dae65f09af7ca03a1e9ca28f088caca5cf"}, - {file = "grpcio_tools-1.62.3-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b77f9f9cee87cd798f0fe26b7024344d1b03a7cd2d2cba7035f8433b13986325"}, - {file = "grpcio_tools-1.62.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2e02d3b96f2d0e4bab9ceaa30f37d4f75571e40c6272e95364bff3125a64d184"}, - {file = "grpcio_tools-1.62.3-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:1da38070738da53556a4b35ab67c1b9884a5dd48fa2f243db35dc14079ea3d0c"}, - {file = "grpcio_tools-1.62.3-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:ace43b26d88a58dcff16c20d23ff72b04d0a415f64d2820f4ff06b1166f50557"}, - {file = "grpcio_tools-1.62.3-cp37-cp37m-win_amd64.whl", hash = "sha256:350a80485e302daaa95d335a931f97b693e170e02d43767ab06552c708808950"}, - {file = "grpcio_tools-1.62.3-cp38-cp38-macosx_10_10_universal2.whl", hash = "sha256:c3a1ac9d394f8e229eb28eec2e04b9a6f5433fa19c9d32f1cb6066e3c5114a1d"}, - {file = "grpcio_tools-1.62.3-cp38-cp38-manylinux_2_17_aarch64.whl", hash = "sha256:11f363570dea661dde99e04a51bd108a5807b5df32a6f8bdf4860e34e94a4dbf"}, - {file = "grpcio_tools-1.62.3-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dc9ad9950119d8ae27634e68b7663cc8d340ae535a0f80d85a55e56a6973ab1f"}, - {file = 
"grpcio_tools-1.62.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8c5d22b252dcef11dd1e0fbbe5bbfb9b4ae048e8880d33338215e8ccbdb03edc"}, - {file = "grpcio_tools-1.62.3-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:27cd9ef5c5d68d5ed104b6dcb96fe9c66b82050e546c9e255716903c3d8f0373"}, - {file = "grpcio_tools-1.62.3-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:f4b1615adf67bd8bb71f3464146a6f9949972d06d21a4f5e87e73f6464d97f57"}, - {file = "grpcio_tools-1.62.3-cp38-cp38-win32.whl", hash = "sha256:e18e15287c31baf574fcdf8251fb7f997d64e96c6ecf467906e576da0a079af6"}, - {file = "grpcio_tools-1.62.3-cp38-cp38-win_amd64.whl", hash = "sha256:6c3064610826f50bd69410c63101954676edc703e03f9e8f978a135f1aaf97c1"}, - {file = "grpcio_tools-1.62.3-cp39-cp39-macosx_10_10_universal2.whl", hash = "sha256:8e62cc7164b0b7c5128e637e394eb2ef3db0e61fc798e80c301de3b2379203ed"}, - {file = "grpcio_tools-1.62.3-cp39-cp39-manylinux_2_17_aarch64.whl", hash = "sha256:c8ad5cce554e2fcaf8842dee5d9462583b601a3a78f8b76a153c38c963f58c10"}, - {file = "grpcio_tools-1.62.3-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ec279dcf3518201fc592c65002754f58a6b542798cd7f3ecd4af086422f33f29"}, - {file = "grpcio_tools-1.62.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1c989246c2aebc13253f08be32538a4039a64e12d9c18f6d662d7aee641dc8b5"}, - {file = "grpcio_tools-1.62.3-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:ca4f5eeadbb57cf03317d6a2857823239a63a59cc935f5bd6cf6e8b7af7a7ecc"}, - {file = "grpcio_tools-1.62.3-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:0cb3a3436ac119cbd37a7d3331d9bdf85dad21a6ac233a3411dff716dcbf401e"}, - {file = "grpcio_tools-1.62.3-cp39-cp39-win32.whl", hash = "sha256:3eae6ea76d62fcac091e1f15c2dcedf1dc3f114f8df1a972a8a0745e89f4cf61"}, - {file = "grpcio_tools-1.62.3-cp39-cp39-win_amd64.whl", hash = "sha256:eec73a005443061f4759b71a056f745e3b000dc0dc125c9f20560232dfbcbd14"}, + {file = "grpcio_tools-1.71.0-cp310-cp310-linux_armv7l.whl", hash = "sha256:f4ad7f0d756546902597053d70b3af2606fbd70d7972876cd75c1e241d22ae00"}, + {file = "grpcio_tools-1.71.0-cp310-cp310-macosx_12_0_universal2.whl", hash = "sha256:64bdb291df61cf570b5256777ad5fe2b1db6d67bc46e55dc56a0a862722ae329"}, + {file = "grpcio_tools-1.71.0-cp310-cp310-manylinux_2_17_aarch64.whl", hash = "sha256:8dd9795e982d77a4b496f7278b943c2563d9afde2069cdee78c111a40cc4d675"}, + {file = "grpcio_tools-1.71.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c1b5860c41a36b26fec4f52998f1a451d0525a5c9a4fb06b6ea3e9211abdb925"}, + {file = "grpcio_tools-1.71.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3059c14035e5dc03d462f261e5900b9a077fd1a36976c3865b8507474520bad4"}, + {file = "grpcio_tools-1.71.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:f360981b215b1d5aff9235b37e7e1826246e35bbac32a53e41d4e990a37b8f4c"}, + {file = "grpcio_tools-1.71.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:bfe3888c3bbe16a5aa39409bc38744a31c0c3d2daa2b0095978c56e106c85b42"}, + {file = "grpcio_tools-1.71.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:145985c0bf12131f0a1503e65763e0f060473f7f3928ed1ff3fb0e8aad5bc8ac"}, + {file = "grpcio_tools-1.71.0-cp310-cp310-win32.whl", hash = "sha256:82c430edd939bb863550ee0fecf067d78feff828908a1b529bbe33cc57f2419c"}, + {file = "grpcio_tools-1.71.0-cp310-cp310-win_amd64.whl", hash = "sha256:83e90724e3f02415c628e4ead1d6ffe063820aaaa078d9a39176793df958cd5a"}, + {file = 
"grpcio_tools-1.71.0-cp311-cp311-linux_armv7l.whl", hash = "sha256:1f19b16b49afa5d21473f49c0966dd430c88d089cd52ac02404d8cef67134efb"}, + {file = "grpcio_tools-1.71.0-cp311-cp311-macosx_10_14_universal2.whl", hash = "sha256:459c8f5e00e390aecd5b89de67deb3ec7188a274bc6cb50e43cef35ab3a3f45d"}, + {file = "grpcio_tools-1.71.0-cp311-cp311-manylinux_2_17_aarch64.whl", hash = "sha256:edab7e6518de01196be37f96cb1e138c3819986bf5e2a6c9e1519b4d716b2f5a"}, + {file = "grpcio_tools-1.71.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8b93b9f6adc7491d4c10144c0643409db298e5e63c997106a804f6f0248dbaf4"}, + {file = "grpcio_tools-1.71.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6ae5f2efa9e644c10bf1021600bfc099dfbd8e02b184d2d25dc31fcd6c2bc59e"}, + {file = "grpcio_tools-1.71.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:65aa082f4435571d65d5ce07fc444f23c3eff4f3e34abef599ef8c9e1f6f360f"}, + {file = "grpcio_tools-1.71.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:1331e726e08b7bdcbf2075fcf4b47dff07842b04845e6e220a08a4663e232d7f"}, + {file = "grpcio_tools-1.71.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:6693a7d3ba138b0e693b3d1f687cdd9db9e68976c3fa2b951c17a072fea8b583"}, + {file = "grpcio_tools-1.71.0-cp311-cp311-win32.whl", hash = "sha256:6d11ed3ff7b6023b5c72a8654975324bb98c1092426ba5b481af406ff559df00"}, + {file = "grpcio_tools-1.71.0-cp311-cp311-win_amd64.whl", hash = "sha256:072b2a5805ac97e4623b3aa8f7818275f3fb087f4aa131b0fce00471065f6eaa"}, + {file = "grpcio_tools-1.71.0-cp312-cp312-linux_armv7l.whl", hash = "sha256:61c0409d5bdac57a7bd0ce0ab01c1c916728fe4c8a03d77a25135ad481eb505c"}, + {file = "grpcio_tools-1.71.0-cp312-cp312-macosx_10_14_universal2.whl", hash = "sha256:28784f39921d061d2164a9dcda5164a69d07bf29f91f0ea50b505958292312c9"}, + {file = "grpcio_tools-1.71.0-cp312-cp312-manylinux_2_17_aarch64.whl", hash = "sha256:192808cf553cedca73f0479cc61d5684ad61f24db7a5f3c4dfe1500342425866"}, + {file = "grpcio_tools-1.71.0-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:989ee9da61098230d3d4c8f8f8e27c2de796f1ff21b1c90110e636d9acd9432b"}, + {file = "grpcio_tools-1.71.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:541a756276c8a55dec991f6c0106ae20c8c8f5ce8d0bdbfcb01e2338d1a8192b"}, + {file = "grpcio_tools-1.71.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:870c0097700d13c403e5517cb7750ab5b4a791ce3e71791c411a38c5468b64bd"}, + {file = "grpcio_tools-1.71.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:abd57f615e88bf93c3c6fd31f923106e3beb12f8cd2df95b0d256fa07a7a0a57"}, + {file = "grpcio_tools-1.71.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:753270e2d06d37e6d7af8967d1d059ec635ad215882041a36294f4e2fd502b2e"}, + {file = "grpcio_tools-1.71.0-cp312-cp312-win32.whl", hash = "sha256:0e647794bd7138b8c215e86277a9711a95cf6a03ff6f9e555d54fdf7378b9f9d"}, + {file = "grpcio_tools-1.71.0-cp312-cp312-win_amd64.whl", hash = "sha256:48debc879570972d28bfe98e4970eff25bb26da3f383e0e49829b2d2cd35ad87"}, + {file = "grpcio_tools-1.71.0-cp313-cp313-linux_armv7l.whl", hash = "sha256:9a78d07d6c301a25ef5ede962920a522556a1dfee1ccc05795994ceb867f766c"}, + {file = "grpcio_tools-1.71.0-cp313-cp313-macosx_10_14_universal2.whl", hash = "sha256:580ac88141c9815557e63c9c04f5b1cdb19b4db8d0cb792b573354bde1ee8b12"}, + {file = "grpcio_tools-1.71.0-cp313-cp313-manylinux_2_17_aarch64.whl", hash = "sha256:f7c678e68ece0ae908ecae1c4314a0c2c7f83e26e281738b9609860cc2c82d96"}, + {file = 
"grpcio_tools-1.71.0-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:56ecd6cc89b5e5eed1de5eb9cafce86c9c9043ee3840888cc464d16200290b53"}, + {file = "grpcio_tools-1.71.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e52a041afc20ab2431d756b6295d727bd7adee813b21b06a3483f4a7a15ea15f"}, + {file = "grpcio_tools-1.71.0-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:2a1712f12102b60c8d92779b89d0504e0d6f3a59f2b933e5622b8583f5c02992"}, + {file = "grpcio_tools-1.71.0-cp313-cp313-musllinux_1_1_i686.whl", hash = "sha256:41878cb7a75477e62fdd45e7e9155b3af1b7a5332844021e2511deaf99ac9e6c"}, + {file = "grpcio_tools-1.71.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:682e958b476049ccc14c71bedf3f979bced01f6e0c04852efc5887841a32ad6b"}, + {file = "grpcio_tools-1.71.0-cp313-cp313-win32.whl", hash = "sha256:0ccfb837152b7b858b9f26bb110b3ae8c46675d56130f6c2f03605c4f129be13"}, + {file = "grpcio_tools-1.71.0-cp313-cp313-win_amd64.whl", hash = "sha256:ffff9bc5eacb34dd26b487194f7d44a3e64e752fc2cf049d798021bf25053b87"}, + {file = "grpcio_tools-1.71.0-cp39-cp39-linux_armv7l.whl", hash = "sha256:834959b6eceb85de5217a411aba1643b5f782798680c122202d6a06177226644"}, + {file = "grpcio_tools-1.71.0-cp39-cp39-macosx_10_14_universal2.whl", hash = "sha256:e3ae9556e2a1cd70e7d7b0e0459c35af71d51a7dae4cf36075068011a69f13ec"}, + {file = "grpcio_tools-1.71.0-cp39-cp39-manylinux_2_17_aarch64.whl", hash = "sha256:77fe6db1334e0ce318b2cb4e70afa94e0c173ed1a533d37aea69ad9f61ae8ea9"}, + {file = "grpcio_tools-1.71.0-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:57e3e2544c306b60ef2d76570bac4e977be1ad548641c9eec130c3bc47e80141"}, + {file = "grpcio_tools-1.71.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:af39e245fa56f7f5c2fe86b7d6c1b78f395c07e54d5613cbdbb3c24769a92b6e"}, + {file = "grpcio_tools-1.71.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:8f987d0053351217954543b174b0bddbf51d45b3cfcf8d6de97b0a43d264d753"}, + {file = "grpcio_tools-1.71.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:8e6cdbba4dae7b37b0d25d074614be9936fb720144420f03d9f142a80be69ba2"}, + {file = "grpcio_tools-1.71.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:d3adc8b229e60c77bab5a5d62b415667133bd5ced7d59b5f71d6317c9143631e"}, + {file = "grpcio_tools-1.71.0-cp39-cp39-win32.whl", hash = "sha256:f68334d28a267fabec6e70cb5986e9999cfbfd14db654094ddf9aedd804a293a"}, + {file = "grpcio_tools-1.71.0-cp39-cp39-win_amd64.whl", hash = "sha256:1291a6136c07a86c3bb09f6c33f5cf227cc14956edd1b85cb572327a36e0aef8"}, + {file = "grpcio_tools-1.71.0.tar.gz", hash = "sha256:38dba8e0d5e0fb23a034e09644fdc6ed862be2371887eee54901999e8f6792a8"}, ] [package.dependencies] -grpcio = ">=1.62.3" -protobuf = ">=4.21.6,<5.0dev" +grpcio = ">=1.71.0" +protobuf = ">=5.26.1,<6.0dev" setuptools = "*" [[package]] name = "hf-transfer" -version = "0.1.8" +version = "0.1.9" description = "Speed up file transfers with the Hugging Face Hub." 
optional = false python-versions = ">=3.7" +groups = ["main"] files = [ - {file = "hf_transfer-0.1.8-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:70858f9e94286738ed300484a45beb5cfee6a7ddac4c5886f9c6fce7823ac5ab"}, - {file = "hf_transfer-0.1.8-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:38adc73f0a8526319d90f7cc5dc2d5e4bb66f487a513d94b98aa6725be732e4a"}, - {file = "hf_transfer-0.1.8-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:44d2f0c08198d8d899fe9d66e86aee2dd844bd7ce33888f261373fcec81d2a54"}, - {file = "hf_transfer-0.1.8-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:1de2a4ef36f9e60b3d3bec00193c0aafd75771709f2ca51b9b162373f5af3d32"}, - {file = "hf_transfer-0.1.8-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e319269e3606a5ff2979296841766649ac73598a4a8eee2a968f86c8071fea5a"}, - {file = "hf_transfer-0.1.8-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0f6026cf3be6a53ea42f92172f60c1c0675baaa9073f865e671b661dde5fd157"}, - {file = "hf_transfer-0.1.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f865c33ada5bd3650c2b46e59979f2d7755c3f517f8d0facc78576a0c7d26406"}, - {file = "hf_transfer-0.1.8-cp310-none-win32.whl", hash = "sha256:2054730e8d8ed21917c64be7199e06424b2bd08df1c43a72766afaed7992f2d3"}, - {file = "hf_transfer-0.1.8-cp310-none-win_amd64.whl", hash = "sha256:2b4f1a9446ba31170b5b1eca4e916504d18378a6b5fe959896bdac8a736a5ecb"}, - {file = "hf_transfer-0.1.8-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:e27c15fcc5869ad7e52bbc0bdec6106b288d1c463f8d2da92f28615a3b181361"}, - {file = "hf_transfer-0.1.8-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:871a0032d011ebc6409a73a8406b98b84ff2cd3ed7d9e1af8cdf4d660b9fab9b"}, - {file = "hf_transfer-0.1.8-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:686fa756e1e0214bb6327d33c66732c52274d94a8460beb50604ad988b391cf6"}, - {file = "hf_transfer-0.1.8-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:36a03b1b2911b0cf15b1b9d971a34b32dadcc4f2fd979aaff5979d6ce4017c34"}, - {file = "hf_transfer-0.1.8-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:079db90c81f41f4cf3227dfaaa855a9b8e9aef45bc7c2be29ce7232cd83ff881"}, - {file = "hf_transfer-0.1.8-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ac08a4524127fdd14c234d4bcbe49d1c498acf5335c781714823179bcc8dc039"}, - {file = "hf_transfer-0.1.8-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:837432e73cb17274a6782b6216e8ce058aa325a475dc44a5a6a753d48b86d18a"}, - {file = "hf_transfer-0.1.8-cp311-none-win32.whl", hash = "sha256:b180f9823dde35aba9bc0f1d0c04ac8a873baebd3732a7ffe4f11940abc7df0d"}, - {file = "hf_transfer-0.1.8-cp311-none-win_amd64.whl", hash = "sha256:37907d2135cebcf8b6d419bb575148d89c224f16b69357f027bd29d0e85c6529"}, - {file = "hf_transfer-0.1.8-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:baf948f4f493949309cbe60529620b9b0aef854a22b6e526753364acc57c09b6"}, - {file = "hf_transfer-0.1.8-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:0bce5c8bdefa478c5d5eaa646cc4ce1df5cfe764d98572ad0c6b8773e98d49f6"}, - {file = "hf_transfer-0.1.8-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:54d6f8a1a86128d651a3799e1267c343d60f81f2c565d7c5416eb8e674e4cf0e"}, - {file = "hf_transfer-0.1.8-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = 
"sha256:f79fd1b0c2ed93efb4c5f684118d7a762ecdd218e170df8208c4e13d3dcd4959"}, - {file = "hf_transfer-0.1.8-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:414df35692670683bf5623498ef9d88a8df5d77e9516515da6e2b34d1054c11f"}, - {file = "hf_transfer-0.1.8-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3c9798d5f951f66b96d40a7a53910260cb5874fda56cf5944dddb7c571f37ec3"}, - {file = "hf_transfer-0.1.8-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:060c661691f85a61392e57579c80eb64b5ee277434e81fb582f605c1c8ff05d5"}, - {file = "hf_transfer-0.1.8-cp312-none-win32.whl", hash = "sha256:f7840e32379820c3e1571a480238e05ea043e970c99d2e999578004a2eb17788"}, - {file = "hf_transfer-0.1.8-cp312-none-win_amd64.whl", hash = "sha256:9a3204ec423cc5e659872e8179f8704ad9ce2abb1e6a991f8838aedf1dc07830"}, - {file = "hf_transfer-0.1.8-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:09949e86ad63ee139e463fd0dfaf401515ae70445854199f61d545514c65f744"}, - {file = "hf_transfer-0.1.8-cp37-cp37m-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:bf1a74552845b93ea972e6e7131ef54e56056aa54137e93a40faf3fbcb2442ff"}, - {file = "hf_transfer-0.1.8-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:959bcb3afb4ee6f2a07031a947dba98ec0b64c001bc914fbd8fc32e13a287162"}, - {file = "hf_transfer-0.1.8-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e01eecdb8162bd61dab9090fbd9f8034dd8b5755ef727a21ca8a057f80cb91ee"}, - {file = "hf_transfer-0.1.8-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:50650a38e9d31f5ad8f010e4598bf304ecd99c17162e7d93f67e031571b864ee"}, - {file = "hf_transfer-0.1.8-cp37-none-win32.whl", hash = "sha256:e29b9d1d378138f2f4eae0e93ca94af3b5d45f4532eef69f1ab97fe06f9c9d9e"}, - {file = "hf_transfer-0.1.8-cp37-none-win_amd64.whl", hash = "sha256:cfd6cef43ae883103117a371f8ebae4e7f9637bc6fb480f1be5568e2fe22a8a7"}, - {file = "hf_transfer-0.1.8-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:92a68f7a0043cca8a0de4decc760dca177530944cbab502afac503bd1b2fa01a"}, - {file = "hf_transfer-0.1.8-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e3138e408179f80a5480598e32f8e1abb564915cbde4d3bc8da52811c75dc3ea"}, - {file = "hf_transfer-0.1.8-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4544d148930ad34442d43b8fa911c8479c04a95b858b1d1f91e0b7da77082fad"}, - {file = "hf_transfer-0.1.8-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a851794b9f029965664f8c3002c957fccf21685e9397ceb4f9f19c986dee8ad3"}, - {file = "hf_transfer-0.1.8-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:791aaf87c5319ac83edb6ab2994b3db19924c49d6ff667dd3d8a610b455ff70a"}, - {file = "hf_transfer-0.1.8-cp38-none-win32.whl", hash = "sha256:8f71e5d35d3a3160dcca12fdcc8119033aeacaa6a32838a7ad9f9cb1008bbe58"}, - {file = "hf_transfer-0.1.8-cp38-none-win_amd64.whl", hash = "sha256:543287b4ceb1e25501580b99690f7f0df9d3631d29306f37cbd97e918c732944"}, - {file = "hf_transfer-0.1.8-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:7ce02a18bd0bb2343e707ac85b68c946bc37623ee24150c69158f6b2b2c7a98f"}, - {file = "hf_transfer-0.1.8-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:64d7f8dbd64ba183ed1df75d47c84e075ff666ceaa335bff1de16b09eaac5b80"}, - {file = "hf_transfer-0.1.8-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:1e7858694e11419ae27e542fb8fc0d0e54d46ff7768fe73bc359d70b8f5aa578"}, - {file = "hf_transfer-0.1.8-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:bed116cd9d1edfa32c0136d7cb8e5f1afd2b32df43c49085d428f108fc8e1c8f"}, - {file = "hf_transfer-0.1.8-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6e385d0da9c6b3472ab29285d2d46c9f9903205b8d108f88a82f3f85aafae0ab"}, - {file = "hf_transfer-0.1.8-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:98f75fa4b86ef15433cd907807ac77d1fb39d7e7b790bfd39c7ae9c385bf0200"}, - {file = "hf_transfer-0.1.8-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d1a63ad947d2901425ac0a3ed70c3696dfde27fadb0482ed763bdd5cc946b278"}, - {file = "hf_transfer-0.1.8-cp39-none-win32.whl", hash = "sha256:3e74096915813ae842ea6a5bdf10c0fef960aa51a35a560955b3e61cdfe3db57"}, - {file = "hf_transfer-0.1.8-cp39-none-win_amd64.whl", hash = "sha256:05ea16307bf4a5eb097cbc6e5057e4eb5e080a138af23ef639fd38857723c288"}, - {file = "hf_transfer-0.1.8-pp310-pypy310_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:928ff036c3e98e10dcfbdb4fcdfc4592d37a5cc8e365a7ba8dfd4337e849d675"}, - {file = "hf_transfer-0.1.8-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d49ba3ce67035f460ae1924fe2feafec155cb535eec7f31ed5109c19064cd294"}, - {file = "hf_transfer-0.1.8-pp39-pypy39_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b01f5872c62cfee3ec9ca5c738818296f69f8adf84b4d8d15f2a5601d9dda339"}, - {file = "hf_transfer-0.1.8-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:659d4212d50847a5165666bf43d67727679b4f694ef9c413613cc27093136527"}, - {file = "hf_transfer-0.1.8.tar.gz", hash = "sha256:26d229468152e7a3ec12664cac86b8c2800695fd85f9c9a96677a775cc04f0b3"}, + {file = "hf_transfer-0.1.9-cp313-cp313t-macosx_10_12_x86_64.whl", hash = "sha256:6e94e8822da79573c9b6ae4d6b2f847c59a7a06c5327d7db20751b68538dc4f6"}, + {file = "hf_transfer-0.1.9-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:3ebc4ab9023414880c8b1d3c38174d1c9989eb5022d37e814fa91a3060123eb0"}, + {file = "hf_transfer-0.1.9-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8674026f21ed369aa2a0a4b46000aca850fc44cd2b54af33a172ce5325b4fc82"}, + {file = "hf_transfer-0.1.9-cp313-cp313t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3a736dfbb2c84f5a2c975478ad200c0c8bfcb58a25a35db402678fb87ce17fa4"}, + {file = "hf_transfer-0.1.9-cp313-cp313t-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:504b8427fd785dd8546d53b9fafe6e436bd7a3adf76b9dce556507650a7b4567"}, + {file = "hf_transfer-0.1.9-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2c7fc1b85f4d0f76e452765d7648c9f4bfd0aedb9ced2ae1ebfece2d8cfaf8e2"}, + {file = "hf_transfer-0.1.9-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0d991376f0eac70a60f0cbc95602aa708a6f7c8617f28b4945c1431d67b8e3c8"}, + {file = "hf_transfer-0.1.9-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:e6ac4eddcd99575ed3735ed911ddf9d1697e2bd13aa3f0ad7e3904dd4863842e"}, + {file = "hf_transfer-0.1.9-cp313-cp313t-musllinux_1_2_armv7l.whl", hash = "sha256:57fd9880da1ee0f47250f735f791fab788f0aa1ee36afc49f761349869c8b4d9"}, + {file = "hf_transfer-0.1.9-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:5d561f0520f493c66b016d99ceabe69c23289aa90be38dd802d2aef279f15751"}, + {file = "hf_transfer-0.1.9-cp313-cp313t-musllinux_1_2_x86_64.whl", 
hash = "sha256:a5b366d34cd449fe9b20ef25941e6eef0460a2f74e7389f02e673e1f88ebd538"}, + {file = "hf_transfer-0.1.9-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:e66acf91df4a8b72f60223059df3003062a5ae111757187ed1a06750a30e911b"}, + {file = "hf_transfer-0.1.9-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:8669dbcc7a3e2e8d61d42cd24da9c50d57770bd74b445c65123291ca842a7e7a"}, + {file = "hf_transfer-0.1.9-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8fd0167c4407a3bc4cdd0307e65ada2294ec04f1813d8a69a5243e379b22e9d8"}, + {file = "hf_transfer-0.1.9-cp38-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ee8b10afedcb75f71091bcc197c526a6ebf5c58bbbadb34fdeee6160f55f619f"}, + {file = "hf_transfer-0.1.9-cp38-abi3-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5828057e313de59300dd1abb489444bc452efe3f479d3c55b31a8f680936ba42"}, + {file = "hf_transfer-0.1.9-cp38-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:fc6bd19e1cc177c66bdef15ef8636ad3bde79d5a4f608c158021153b4573509d"}, + {file = "hf_transfer-0.1.9-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cdca9bfb89e6f8f281890cc61a8aff2d3cecaff7e1a4d275574d96ca70098557"}, + {file = "hf_transfer-0.1.9-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:89a23f58b7b7effbc047b8ca286f131b17728c99a9f972723323003ffd1bb916"}, + {file = "hf_transfer-0.1.9-cp38-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:dc7fff1345980d6c0ebb92c811d24afa4b98b3e07ed070c8e38cc91fd80478c5"}, + {file = "hf_transfer-0.1.9-cp38-abi3-musllinux_1_2_i686.whl", hash = "sha256:1a6bd16c667ebe89a069ca163060127a794fa3a3525292c900b8c8cc47985b0d"}, + {file = "hf_transfer-0.1.9-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:d2fde99d502093ade3ab1b53f80da18480e9902aa960dab7f74fb1b9e5bc5746"}, + {file = "hf_transfer-0.1.9-cp38-abi3-win32.whl", hash = "sha256:435cc3cdc8524ce57b074032b8fd76eed70a4224d2091232fa6a8cef8fd6803e"}, + {file = "hf_transfer-0.1.9-cp38-abi3-win_amd64.whl", hash = "sha256:16f208fc678911c37e11aa7b586bc66a37d02e636208f18b6bc53d29b5df40ad"}, + {file = "hf_transfer-0.1.9.tar.gz", hash = "sha256:035572865dab29d17e783fbf1e84cf1cb24f3fcf8f1b17db1cfc7fdf139f02bf"}, ] [[package]] name = "huggingface-hub" -version = "0.26.1" +version = "0.30.2" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" optional = false python-versions = ">=3.8.0" +groups = ["main"] files = [ - {file = "huggingface_hub-0.26.1-py3-none-any.whl", hash = "sha256:5927a8fc64ae68859cd954b7cc29d1c8390a5e15caba6d3d349c973be8fdacf3"}, - {file = "huggingface_hub-0.26.1.tar.gz", hash = "sha256:414c0d9b769eecc86c70f9d939d0f48bb28e8461dd1130021542eff0212db890"}, + {file = "huggingface_hub-0.30.2-py3-none-any.whl", hash = "sha256:68ff05969927058cfa41df4f2155d4bb48f5f54f719dd0390103eefa9b191e28"}, + {file = "huggingface_hub-0.30.2.tar.gz", hash = "sha256:9a7897c5b6fd9dad3168a794a8998d6378210f5b9688d0dfc180b1a228dc2466"}, ] [package.dependencies] @@ -920,13 +608,14 @@ tqdm = ">=4.42.1" typing-extensions = ">=3.7.4.3" [package.extras] -all = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "fastapi", "gradio (>=4.0.0)", "jedi", "libcst (==1.4.0)", "mypy (==1.5.1)", "numpy", "pytest (>=8.1.1,<8.2.2)", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-mock", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "ruff (>=0.5.0)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", 
"typing-extensions (>=4.8.0)", "urllib3 (<2.0)"] +all = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "fastapi", "gradio (>=4.0.0)", "jedi", "libcst (==1.4.0)", "mypy (==1.5.1)", "numpy", "pytest (>=8.1.1,<8.2.2)", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-mock", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "ruff (>=0.9.0)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)", "urllib3 (<2.0)"] cli = ["InquirerPy (==0.3.4)"] -dev = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "fastapi", "gradio (>=4.0.0)", "jedi", "libcst (==1.4.0)", "mypy (==1.5.1)", "numpy", "pytest (>=8.1.1,<8.2.2)", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-mock", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "ruff (>=0.5.0)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)", "urllib3 (<2.0)"] +dev = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "fastapi", "gradio (>=4.0.0)", "jedi", "libcst (==1.4.0)", "mypy (==1.5.1)", "numpy", "pytest (>=8.1.1,<8.2.2)", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-mock", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "ruff (>=0.9.0)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)", "urllib3 (<2.0)"] fastai = ["fastai (>=2.4)", "fastcore (>=1.3.27)", "toml"] hf-transfer = ["hf-transfer (>=0.1.4)"] +hf-xet = ["hf-xet (>=0.1.4)"] inference = ["aiohttp"] -quality = ["libcst (==1.4.0)", "mypy (==1.5.1)", "ruff (>=0.5.0)"] +quality = ["libcst (==1.4.0)", "mypy (==1.5.1)", "ruff (>=0.9.0)"] tensorflow = ["graphviz", "pydot", "tensorflow"] tensorflow-testing = ["keras (<3.0)", "tensorflow"] testing = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "fastapi", "gradio (>=4.0.0)", "jedi", "numpy", "pytest (>=8.1.1,<8.2.2)", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-mock", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "soundfile", "urllib3 (<2.0)"] @@ -939,6 +628,7 @@ version = "3.10" description = "Internationalized Domain Names in Applications (IDNA)" optional = false python-versions = ">=3.6" +groups = ["main"] files = [ {file = "idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3"}, {file = "idna-3.10.tar.gz", hash = "sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9"}, @@ -949,13 +639,14 @@ all = ["flake8 (>=7.1.1)", "mypy (>=1.11.2)", "pytest (>=8.3.2)", "ruff (>=0.6.2 [[package]] name = "importlib-metadata" -version = "8.5.0" +version = "8.6.1" description = "Read metadata from Python packages" optional = false -python-versions = ">=3.8" +python-versions = ">=3.9" +groups = ["main"] files = [ - {file = "importlib_metadata-8.5.0-py3-none-any.whl", hash = "sha256:45e54197d28b7a7f1559e60b95e7c567032b602131fbd588f1497f47880aa68b"}, - {file = "importlib_metadata-8.5.0.tar.gz", hash = "sha256:71522656f0abace1d072b9e5481a48f07c138e00f079c38c8f883823f9c26bd7"}, + {file = "importlib_metadata-8.6.1-py3-none-any.whl", hash = "sha256:02a89390c1e15fdfdc0d7c6b25cb3e62650d0494005c97d6f148bf5b9787525e"}, + {file = "importlib_metadata-8.6.1.tar.gz", hash = "sha256:310b41d755445d74569f993ccfc22838295d9fe005425094fad953d7f15c8580"}, ] [package.dependencies] @@ -967,18 +658,19 @@ cover = ["pytest-cov"] doc = ["furo", "jaraco.packaging 
(>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] enabler = ["pytest-enabler (>=2.2)"] perf = ["ipython"] -test = ["flufl.flake8", "importlib-resources (>=1.3)", "jaraco.test (>=5.4)", "packaging", "pyfakefs", "pytest (>=6,!=8.1.*)", "pytest-perf (>=0.9.2)"] +test = ["flufl.flake8", "importlib_resources (>=1.3)", "jaraco.test (>=5.4)", "packaging", "pyfakefs", "pytest (>=6,!=8.1.*)", "pytest-perf (>=0.9.2)"] type = ["pytest-mypy"] [[package]] name = "iniconfig" -version = "2.0.0" +version = "2.1.0" description = "brain-dead simple config-ini parsing" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" +groups = ["dev"] files = [ - {file = "iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374"}, - {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, + {file = "iniconfig-2.1.0-py3-none-any.whl", hash = "sha256:9deba5723312380e77435581c6bf4935c94cbfab9b1ed33ef8d238ea168eb760"}, + {file = "iniconfig-2.1.0.tar.gz", hash = "sha256:3abbd2e30b36733fee78f9c7f7308f2d0050e88f0087fd25c2645f63c773e1c7"}, ] [[package]] @@ -987,6 +679,7 @@ version = "0.3.3" description = "a regex intersection checker" optional = true python-versions = ">=3.7" +groups = ["main"] files = [ {file = "interegular-0.3.3-py37-none-any.whl", hash = "sha256:b0c07007d48c89d6d19f7204972d369b2a77222722e126b6aa63aa721dc3b19c"}, {file = "interegular-0.3.3.tar.gz", hash = "sha256:d9b697b21b34884711399ba0f0376914b81899ce670032486d0d048344a76600"}, @@ -994,13 +687,14 @@ files = [ [[package]] name = "jinja2" -version = "3.1.4" +version = "3.1.6" description = "A very fast and expressive template engine." -optional = true +optional = false python-versions = ">=3.7" +groups = ["main"] files = [ - {file = "jinja2-3.1.4-py3-none-any.whl", hash = "sha256:bc5dd2abb727a5319567b7a813e6a2e7318c39f4f487cfe6c89c6f9c7d25197d"}, - {file = "jinja2-3.1.4.tar.gz", hash = "sha256:4a3aee7acbbe7303aede8e9648d13b8bf88a429282aa6122a993f0ac800cb369"}, + {file = "jinja2-3.1.6-py3-none-any.whl", hash = "sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67"}, + {file = "jinja2-3.1.6.tar.gz", hash = "sha256:0137fb05990d35f1275a587e9aee6d56da821fc83491a0fb838183be43f66d6d"}, ] [package.dependencies] @@ -1013,8 +707,9 @@ i18n = ["Babel (>=2.7)"] name = "joblib" version = "1.4.2" description = "Lightweight pipelining with Python functions" -optional = true +optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "joblib-1.4.2-py3-none-any.whl", hash = "sha256:06d478d5674cbc267e7496a410ee875abd68e4340feff4490bcb7afb88060ae6"}, {file = "joblib-1.4.2.tar.gz", hash = "sha256:2382c5816b2636fbd20a09e0f4e9dad4736765fdfb7dca582943b9c1366b3f0e"}, @@ -1026,6 +721,7 @@ version = "4.23.0" description = "An implementation of JSON Schema validation for Python" optional = true python-versions = ">=3.8" +groups = ["main"] files = [ {file = "jsonschema-4.23.0-py3-none-any.whl", hash = "sha256:fbadb6f8b144a8f8cf9f0b89ba94501d143e50411a1278633f56a7acf7fd5566"}, {file = "jsonschema-4.23.0.tar.gz", hash = "sha256:d71497fef26351a33265337fa77ffeb82423f3ea21283cd9467bb03999266bc4"}, @@ -1047,6 +743,7 @@ version = "2024.10.1" description = "The JSON Schema meta-schemas and vocabularies, exposed as a Registry" optional = true python-versions = ">=3.9" +groups = ["main"] files = [ {file = "jsonschema_specifications-2024.10.1-py3-none-any.whl", hash = 
"sha256:a09a0680616357d9a0ecf05c12ad234479f549239d0f5b55f3deea67475da9bf"}, {file = "jsonschema_specifications-2024.10.1.tar.gz", hash = "sha256:0f38b83639958ce1152d02a7f062902c41c8fd20d558b0c34344292d417ae272"}, @@ -1061,6 +758,7 @@ version = "1.2.2" description = "a modern parsing library" optional = true python-versions = ">=3.8" +groups = ["main"] files = [ {file = "lark-1.2.2-py3-none-any.whl", hash = "sha256:c2276486b02f0f1b90be155f2c8ba4a8e194d42775786db622faccd652d8e80c"}, {file = "lark-1.2.2.tar.gz", hash = "sha256:ca807d0162cd16cef15a8feecb862d7319e7a09bdb13aef927968e45040fed80"}, @@ -1078,6 +776,7 @@ version = "0.43.0" description = "lightweight wrapper around basic LLVM functionality" optional = true python-versions = ">=3.9" +groups = ["main"] files = [ {file = "llvmlite-0.43.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:a289af9a1687c6cf463478f0fa8e8aa3b6fb813317b0d70bf1ed0759eab6f761"}, {file = "llvmlite-0.43.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:6d4fd101f571a31acb1559ae1af30f30b1dc4b3186669f92ad780e17c81e91bc"}, @@ -1104,13 +803,14 @@ files = [ [[package]] name = "loguru" -version = "0.6.0" +version = "0.7.3" description = "Python logging made (stupidly) simple" optional = false -python-versions = ">=3.5" +python-versions = "<4.0,>=3.5" +groups = ["main"] files = [ - {file = "loguru-0.6.0-py3-none-any.whl", hash = "sha256:4e2414d534a2ab57573365b3e6d0234dfb1d84b68b7f3b948e6fb743860a77c3"}, - {file = "loguru-0.6.0.tar.gz", hash = "sha256:066bd06758d0a513e9836fd9c6b5a75bfb3fd36841f4b996bc60b547a309d41c"}, + {file = "loguru-0.7.3-py3-none-any.whl", hash = "sha256:31a33c10c8e1e10422bfd431aeb5d351c7cf7fa671e3c4df004162264b28220c"}, + {file = "loguru-0.7.3.tar.gz", hash = "sha256:19480589e77d47b8d85b2c827ad95d49bf31b0dcde16593892eb51dd18706eb6"}, ] [package.dependencies] @@ -1118,7 +818,7 @@ colorama = {version = ">=0.3.4", markers = "sys_platform == \"win32\""} win32-setctime = {version = ">=1.0.0", markers = "sys_platform == \"win32\""} [package.extras] -dev = ["Sphinx (>=4.1.1)", "black (>=19.10b0)", "colorama (>=0.3.4)", "docutils (==0.16)", "flake8 (>=3.7.7)", "isort (>=5.1.1)", "pytest (>=4.6.2)", "pytest-cov (>=2.7.1)", "sphinx-autobuild (>=0.7.1)", "sphinx-rtd-theme (>=0.4.3)", "tox (>=3.9.0)"] +dev = ["Sphinx (==8.1.3)", "build (==1.2.2)", "colorama (==0.4.5)", "colorama (==0.4.6)", "exceptiongroup (==1.1.3)", "freezegun (==1.1.0)", "freezegun (==1.5.0)", "mypy (==v0.910)", "mypy (==v0.971)", "mypy (==v1.13.0)", "mypy (==v1.4.1)", "myst-parser (==4.0.0)", "pre-commit (==4.0.1)", "pytest (==6.1.2)", "pytest (==8.3.2)", "pytest-cov (==2.12.1)", "pytest-cov (==5.0.0)", "pytest-cov (==6.0.0)", "pytest-mypy-plugins (==1.9.3)", "pytest-mypy-plugins (==3.1.0)", "sphinx-rtd-theme (==3.0.2)", "tox (==3.27.1)", "tox (==4.23.2)", "twine (==6.0.1)"] [[package]] name = "markdown-it-py" @@ -1126,6 +826,7 @@ version = "3.0.0" description = "Python port of markdown-it. Markdown parsing, done right!" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "markdown-it-py-3.0.0.tar.gz", hash = "sha256:e3f60a94fa066dc52ec76661e37c851cb232d92f9886b15cb560aaada2df8feb"}, {file = "markdown_it_py-3.0.0-py3-none-any.whl", hash = "sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1"}, @@ -1150,6 +851,7 @@ version = "3.0.2" description = "Safely add untrusted strings to HTML/XML markup." 
optional = false python-versions = ">=3.9" +groups = ["main"] files = [ {file = "MarkupSafe-3.0.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:7e94c425039cde14257288fd61dcfb01963e658efbc0ff54f5306b06054700f8"}, {file = "MarkupSafe-3.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9e2d922824181480953426608b81967de705c3cef4d1af983af849d7bd619158"}, @@ -1214,167 +916,25 @@ files = [ {file = "markupsafe-3.0.2.tar.gz", hash = "sha256:ee55d3edf80167e48ea11a923c7386f4669df67d7994554387f84e7d8b0a2bf0"}, ] -[[package]] -name = "marlin-kernels" -version = "0.2.0" -description = "Marlin quantization kernels" -optional = true -python-versions = ">=3.7" -files = [ - {file = "marlin_kernels-0.2.0+cu123torch2.4-cp310-cp310-linux_x86_64.whl", hash = "sha256:9a5afcf19b0f5917e43353cc19873fb3c4d4d0b924e2a95a37884f9ce208d0bd"}, -] - -[package.dependencies] -torch = "*" - -[package.source] -type = "url" -url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.2.0/marlin_kernels-0.2.0+cu123torch2.4-cp310-cp310-linux_x86_64.whl" - -[[package]] -name = "marlin-kernels" -version = "0.2.0" -description = "Marlin quantization kernels" -optional = true -python-versions = ">=3.7" -files = [ - {file = "marlin_kernels-0.2.0+cu123torch2.4-cp311-cp311-linux_x86_64.whl", hash = "sha256:1e64fcc7ebadfaffa60091ee9201ae3daaf5c1be3be60c8c054143a3dcb72d5d"}, -] - -[package.dependencies] -torch = "*" - -[package.source] -type = "url" -url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.2.0/marlin_kernels-0.2.0+cu123torch2.4-cp311-cp311-linux_x86_64.whl" - -[[package]] -name = "marlin-kernels" -version = "0.2.0" -description = "Marlin quantization kernels" -optional = true -python-versions = ">=3.7" -files = [ - {file = "marlin_kernels-0.2.0+cu123torch2.4-cp312-cp312-linux_x86_64.whl", hash = "sha256:e75f3ce9b1c13a4ed43a380d88e1d34d297259452db037ec1973ec33dc2eb78e"}, -] - -[package.dependencies] -torch = "*" - -[package.source] -type = "url" -url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.2.0/marlin_kernels-0.2.0+cu123torch2.4-cp312-cp312-linux_x86_64.whl" - -[[package]] -name = "marlin-kernels" -version = "0.2.0" -description = "Marlin quantization kernels" -optional = true -python-versions = ">=3.7" -files = [ - {file = "marlin_kernels-0.2.0+cu123torch2.4-cp39-cp39-linux_x86_64.whl", hash = "sha256:2f99a27f70b391887ee6adffeeee7c3f4df7fac37393f9fb16d4cace2b3f6457"}, -] - -[package.dependencies] -torch = "*" - -[package.source] -type = "url" -url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.2.0/marlin_kernels-0.2.0+cu123torch2.4-cp39-cp39-linux_x86_64.whl" - [[package]] name = "mdurl" version = "0.1.2" description = "Markdown URL utilities" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8"}, {file = "mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba"}, ] -[[package]] -name = "moe-kernels" -version = "0.4.0" -description = "MoE kernels" -optional = true -python-versions = ">=3.7" -files = [ - {file = "moe_kernels-0.4.0+cu123torch2.4-cp310-cp310-linux_x86_64.whl", hash = "sha256:3fc0475bb3b9c09bbf08f6f6e9767d10eaba55b558f67a605fe70ae0cbb5e6a4"}, -] - -[package.dependencies] -nvidia-ml-py = "*" -torch = "*" -triton = "*" - -[package.source] -type = "url" -url = 
"https://github.com/danieldk/moe-kernels/releases/download/v0.4.0/moe_kernels-0.4.0+cu123torch2.4-cp310-cp310-linux_x86_64.whl" - -[[package]] -name = "moe-kernels" -version = "0.4.0" -description = "MoE kernels" -optional = true -python-versions = ">=3.7" -files = [ - {file = "moe_kernels-0.4.0+cu123torch2.4-cp311-cp311-linux_x86_64.whl", hash = "sha256:8ca72a064ceb84a23a3437cc6e6363907ad41588877f6acb1febc010fc7beb22"}, -] - -[package.dependencies] -nvidia-ml-py = "*" -torch = "*" -triton = "*" - -[package.source] -type = "url" -url = "https://github.com/danieldk/moe-kernels/releases/download/v0.4.0/moe_kernels-0.4.0+cu123torch2.4-cp311-cp311-linux_x86_64.whl" - -[[package]] -name = "moe-kernels" -version = "0.4.0" -description = "MoE kernels" -optional = true -python-versions = ">=3.7" -files = [ - {file = "moe_kernels-0.4.0+cu123torch2.4-cp312-cp312-linux_x86_64.whl", hash = "sha256:d302d6b16bb4905b2312dc68da6a6f51e87d0cd3c4bf1f23d995501162399a8e"}, -] - -[package.dependencies] -nvidia-ml-py = "*" -torch = "*" -triton = "*" - -[package.source] -type = "url" -url = "https://github.com/danieldk/moe-kernels/releases/download/v0.4.0/moe_kernels-0.4.0+cu123torch2.4-cp312-cp312-linux_x86_64.whl" - -[[package]] -name = "moe-kernels" -version = "0.4.0" -description = "MoE kernels" -optional = true -python-versions = ">=3.7" -files = [ - {file = "moe_kernels-0.4.0+cu123torch2.4-cp39-cp39-linux_x86_64.whl", hash = "sha256:6aee3e723efa5113c338b40e6cb20fa62da6c442c65c1a6cc97751d34158a93a"}, -] - -[package.dependencies] -nvidia-ml-py = "*" -torch = "*" -triton = "*" - -[package.source] -type = "url" -url = "https://github.com/danieldk/moe-kernels/releases/download/v0.4.0/moe_kernels-0.4.0+cu123torch2.4-cp39-cp39-linux_x86_64.whl" - [[package]] name = "mpmath" version = "1.3.0" description = "Python library for arbitrary-precision floating-point arithmetic" -optional = true +optional = false python-versions = "*" +groups = ["main"] files = [ {file = "mpmath-1.3.0-py3-none-any.whl", hash = "sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c"}, {file = "mpmath-1.3.0.tar.gz", hash = "sha256:7a28eb2a9774d00c7bc92411c19a89209d5da7c4c9a9e227be8330a23a25b91f"}, @@ -1386,144 +946,13 @@ docs = ["sphinx"] gmpy = ["gmpy2 (>=2.1.0a4)"] tests = ["pytest (>=4.6)"] -[[package]] -name = "multidict" -version = "6.1.0" -description = "multidict implementation" -optional = false -python-versions = ">=3.8" -files = [ - {file = "multidict-6.1.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:3380252550e372e8511d49481bd836264c009adb826b23fefcc5dd3c69692f60"}, - {file = "multidict-6.1.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:99f826cbf970077383d7de805c0681799491cb939c25450b9b5b3ced03ca99f1"}, - {file = "multidict-6.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:a114d03b938376557927ab23f1e950827c3b893ccb94b62fd95d430fd0e5cf53"}, - {file = "multidict-6.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b1c416351ee6271b2f49b56ad7f308072f6f44b37118d69c2cad94f3fa8a40d5"}, - {file = "multidict-6.1.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6b5d83030255983181005e6cfbac1617ce9746b219bc2aad52201ad121226581"}, - {file = "multidict-6.1.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3e97b5e938051226dc025ec80980c285b053ffb1e25a3db2a3aa3bc046bf7f56"}, - {file = "multidict-6.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:d618649d4e70ac6efcbba75be98b26ef5078faad23592f9b51ca492953012429"}, - {file = "multidict-6.1.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:10524ebd769727ac77ef2278390fb0068d83f3acb7773792a5080f2b0abf7748"}, - {file = "multidict-6.1.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:ff3827aef427c89a25cc96ded1759271a93603aba9fb977a6d264648ebf989db"}, - {file = "multidict-6.1.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:06809f4f0f7ab7ea2cabf9caca7d79c22c0758b58a71f9d32943ae13c7ace056"}, - {file = "multidict-6.1.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:f179dee3b863ab1c59580ff60f9d99f632f34ccb38bf67a33ec6b3ecadd0fd76"}, - {file = "multidict-6.1.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:aaed8b0562be4a0876ee3b6946f6869b7bcdb571a5d1496683505944e268b160"}, - {file = "multidict-6.1.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:3c8b88a2ccf5493b6c8da9076fb151ba106960a2df90c2633f342f120751a9e7"}, - {file = "multidict-6.1.0-cp310-cp310-win32.whl", hash = "sha256:4a9cb68166a34117d6646c0023c7b759bf197bee5ad4272f420a0141d7eb03a0"}, - {file = "multidict-6.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:20b9b5fbe0b88d0bdef2012ef7dee867f874b72528cf1d08f1d59b0e3850129d"}, - {file = "multidict-6.1.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:3efe2c2cb5763f2f1b275ad2bf7a287d3f7ebbef35648a9726e3b69284a4f3d6"}, - {file = "multidict-6.1.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c7053d3b0353a8b9de430a4f4b4268ac9a4fb3481af37dfe49825bf45ca24156"}, - {file = "multidict-6.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:27e5fc84ccef8dfaabb09d82b7d179c7cf1a3fbc8a966f8274fcb4ab2eb4cadb"}, - {file = "multidict-6.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0e2b90b43e696f25c62656389d32236e049568b39320e2735d51f08fd362761b"}, - {file = "multidict-6.1.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d83a047959d38a7ff552ff94be767b7fd79b831ad1cd9920662db05fec24fe72"}, - {file = "multidict-6.1.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d1a9dd711d0877a1ece3d2e4fea11a8e75741ca21954c919406b44e7cf971304"}, - {file = "multidict-6.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ec2abea24d98246b94913b76a125e855eb5c434f7c46546046372fe60f666351"}, - {file = "multidict-6.1.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4867cafcbc6585e4b678876c489b9273b13e9fff9f6d6d66add5e15d11d926cb"}, - {file = "multidict-6.1.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:5b48204e8d955c47c55b72779802b219a39acc3ee3d0116d5080c388970b76e3"}, - {file = "multidict-6.1.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:d8fff389528cad1618fb4b26b95550327495462cd745d879a8c7c2115248e399"}, - {file = "multidict-6.1.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:a7a9541cd308eed5e30318430a9c74d2132e9a8cb46b901326272d780bf2d423"}, - {file = "multidict-6.1.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:da1758c76f50c39a2efd5e9859ce7d776317eb1dd34317c8152ac9251fc574a3"}, - {file = "multidict-6.1.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:c943a53e9186688b45b323602298ab727d8865d8c9ee0b17f8d62d14b56f0753"}, - {file = "multidict-6.1.0-cp311-cp311-win32.whl", hash = "sha256:90f8717cb649eea3504091e640a1b8568faad18bd4b9fcd692853a04475a4b80"}, - {file = 
"multidict-6.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:82176036e65644a6cc5bd619f65f6f19781e8ec2e5330f51aa9ada7504cc1926"}, - {file = "multidict-6.1.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:b04772ed465fa3cc947db808fa306d79b43e896beb677a56fb2347ca1a49c1fa"}, - {file = "multidict-6.1.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:6180c0ae073bddeb5a97a38c03f30c233e0a4d39cd86166251617d1bbd0af436"}, - {file = "multidict-6.1.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:071120490b47aa997cca00666923a83f02c7fbb44f71cf7f136df753f7fa8761"}, - {file = "multidict-6.1.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:50b3a2710631848991d0bf7de077502e8994c804bb805aeb2925a981de58ec2e"}, - {file = "multidict-6.1.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b58c621844d55e71c1b7f7c498ce5aa6985d743a1a59034c57a905b3f153c1ef"}, - {file = "multidict-6.1.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:55b6d90641869892caa9ca42ff913f7ff1c5ece06474fbd32fb2cf6834726c95"}, - {file = "multidict-6.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4b820514bfc0b98a30e3d85462084779900347e4d49267f747ff54060cc33925"}, - {file = "multidict-6.1.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:10a9b09aba0c5b48c53761b7c720aaaf7cf236d5fe394cd399c7ba662d5f9966"}, - {file = "multidict-6.1.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:1e16bf3e5fc9f44632affb159d30a437bfe286ce9e02754759be5536b169b305"}, - {file = "multidict-6.1.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:76f364861c3bfc98cbbcbd402d83454ed9e01a5224bb3a28bf70002a230f73e2"}, - {file = "multidict-6.1.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:820c661588bd01a0aa62a1283f20d2be4281b086f80dad9e955e690c75fb54a2"}, - {file = "multidict-6.1.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:0e5f362e895bc5b9e67fe6e4ded2492d8124bdf817827f33c5b46c2fe3ffaca6"}, - {file = "multidict-6.1.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:3ec660d19bbc671e3a6443325f07263be452c453ac9e512f5eb935e7d4ac28b3"}, - {file = "multidict-6.1.0-cp312-cp312-win32.whl", hash = "sha256:58130ecf8f7b8112cdb841486404f1282b9c86ccb30d3519faf301b2e5659133"}, - {file = "multidict-6.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:188215fc0aafb8e03341995e7c4797860181562380f81ed0a87ff455b70bf1f1"}, - {file = "multidict-6.1.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:d569388c381b24671589335a3be6e1d45546c2988c2ebe30fdcada8457a31008"}, - {file = "multidict-6.1.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:052e10d2d37810b99cc170b785945421141bf7bb7d2f8799d431e7db229c385f"}, - {file = "multidict-6.1.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f90c822a402cb865e396a504f9fc8173ef34212a342d92e362ca498cad308e28"}, - {file = "multidict-6.1.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b225d95519a5bf73860323e633a664b0d85ad3d5bede6d30d95b35d4dfe8805b"}, - {file = "multidict-6.1.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:23bfd518810af7de1116313ebd9092cb9aa629beb12f6ed631ad53356ed6b86c"}, - {file = "multidict-6.1.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5c09fcfdccdd0b57867577b719c69e347a436b86cd83747f179dbf0cc0d4c1f3"}, - {file = "multidict-6.1.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:bf6bea52ec97e95560af5ae576bdac3aa3aae0b6758c6efa115236d9e07dae44"}, - {file = "multidict-6.1.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:57feec87371dbb3520da6192213c7d6fc892d5589a93db548331954de8248fd2"}, - {file = "multidict-6.1.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:0c3f390dc53279cbc8ba976e5f8035eab997829066756d811616b652b00a23a3"}, - {file = "multidict-6.1.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:59bfeae4b25ec05b34f1956eaa1cb38032282cd4dfabc5056d0a1ec4d696d3aa"}, - {file = "multidict-6.1.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:b2f59caeaf7632cc633b5cf6fc449372b83bbdf0da4ae04d5be36118e46cc0aa"}, - {file = "multidict-6.1.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:37bb93b2178e02b7b618893990941900fd25b6b9ac0fa49931a40aecdf083fe4"}, - {file = "multidict-6.1.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4e9f48f58c2c523d5a06faea47866cd35b32655c46b443f163d08c6d0ddb17d6"}, - {file = "multidict-6.1.0-cp313-cp313-win32.whl", hash = "sha256:3a37ffb35399029b45c6cc33640a92bef403c9fd388acce75cdc88f58bd19a81"}, - {file = "multidict-6.1.0-cp313-cp313-win_amd64.whl", hash = "sha256:e9aa71e15d9d9beaad2c6b9319edcdc0a49a43ef5c0a4c8265ca9ee7d6c67774"}, - {file = "multidict-6.1.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:db7457bac39421addd0c8449933ac32d8042aae84a14911a757ae6ca3eef1392"}, - {file = "multidict-6.1.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:d094ddec350a2fb899fec68d8353c78233debde9b7d8b4beeafa70825f1c281a"}, - {file = "multidict-6.1.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:5845c1fd4866bb5dd3125d89b90e57ed3138241540897de748cdf19de8a2fca2"}, - {file = "multidict-6.1.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9079dfc6a70abe341f521f78405b8949f96db48da98aeb43f9907f342f627cdc"}, - {file = "multidict-6.1.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3914f5aaa0f36d5d60e8ece6a308ee1c9784cd75ec8151062614657a114c4478"}, - {file = "multidict-6.1.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c08be4f460903e5a9d0f76818db3250f12e9c344e79314d1d570fc69d7f4eae4"}, - {file = "multidict-6.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d093be959277cb7dee84b801eb1af388b6ad3ca6a6b6bf1ed7585895789d027d"}, - {file = "multidict-6.1.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3702ea6872c5a2a4eeefa6ffd36b042e9773f05b1f37ae3ef7264b1163c2dcf6"}, - {file = "multidict-6.1.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:2090f6a85cafc5b2db085124d752757c9d251548cedabe9bd31afe6363e0aff2"}, - {file = "multidict-6.1.0-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:f67f217af4b1ff66c68a87318012de788dd95fcfeb24cc889011f4e1c7454dfd"}, - {file = "multidict-6.1.0-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:189f652a87e876098bbc67b4da1049afb5f5dfbaa310dd67c594b01c10388db6"}, - {file = "multidict-6.1.0-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:6bb5992037f7a9eff7991ebe4273ea7f51f1c1c511e6a2ce511d0e7bdb754492"}, - {file = "multidict-6.1.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:ac10f4c2b9e770c4e393876e35a7046879d195cd123b4f116d299d442b335bcd"}, - {file = "multidict-6.1.0-cp38-cp38-win32.whl", hash = "sha256:e27bbb6d14416713a8bd7aaa1313c0fc8d44ee48d74497a0ff4c3a1b6ccb5167"}, - {file = "multidict-6.1.0-cp38-cp38-win_amd64.whl", hash = 
"sha256:22f3105d4fb15c8f57ff3959a58fcab6ce36814486500cd7485651230ad4d4ef"}, - {file = "multidict-6.1.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:4e18b656c5e844539d506a0a06432274d7bd52a7487e6828c63a63d69185626c"}, - {file = "multidict-6.1.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:a185f876e69897a6f3325c3f19f26a297fa058c5e456bfcff8015e9a27e83ae1"}, - {file = "multidict-6.1.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ab7c4ceb38d91570a650dba194e1ca87c2b543488fe9309b4212694174fd539c"}, - {file = "multidict-6.1.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e617fb6b0b6953fffd762669610c1c4ffd05632c138d61ac7e14ad187870669c"}, - {file = "multidict-6.1.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:16e5f4bf4e603eb1fdd5d8180f1a25f30056f22e55ce51fb3d6ad4ab29f7d96f"}, - {file = "multidict-6.1.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f4c035da3f544b1882bac24115f3e2e8760f10a0107614fc9839fd232200b875"}, - {file = "multidict-6.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:957cf8e4b6e123a9eea554fa7ebc85674674b713551de587eb318a2df3e00255"}, - {file = "multidict-6.1.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:483a6aea59cb89904e1ceabd2b47368b5600fb7de78a6e4a2c2987b2d256cf30"}, - {file = "multidict-6.1.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:87701f25a2352e5bf7454caa64757642734da9f6b11384c1f9d1a8e699758057"}, - {file = "multidict-6.1.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:682b987361e5fd7a139ed565e30d81fd81e9629acc7d925a205366877d8c8657"}, - {file = "multidict-6.1.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:ce2186a7df133a9c895dea3331ddc5ddad42cdd0d1ea2f0a51e5d161e4762f28"}, - {file = "multidict-6.1.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:9f636b730f7e8cb19feb87094949ba54ee5357440b9658b2a32a5ce4bce53972"}, - {file = "multidict-6.1.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:73eae06aa53af2ea5270cc066dcaf02cc60d2994bbb2c4ef5764949257d10f43"}, - {file = "multidict-6.1.0-cp39-cp39-win32.whl", hash = "sha256:1ca0083e80e791cffc6efce7660ad24af66c8d4079d2a750b29001b53ff59ada"}, - {file = "multidict-6.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:aa466da5b15ccea564bdab9c89175c762bc12825f4659c11227f515cee76fa4a"}, - {file = "multidict-6.1.0-py3-none-any.whl", hash = "sha256:48e171e52d1c4d33888e529b999e5900356b9ae588c2f09a52dcefb158b27506"}, - {file = "multidict-6.1.0.tar.gz", hash = "sha256:22ae2ebf9b0c69d206c003e2f6a914ea33f0a932d4aa16f236afc049d9958f4a"}, -] - -[package.dependencies] -typing-extensions = {version = ">=4.1.0", markers = "python_version < \"3.11\""} - -[[package]] -name = "multiprocess" -version = "0.70.15" -description = "better multiprocessing and multithreading in Python" -optional = false -python-versions = ">=3.7" -files = [ - {file = "multiprocess-0.70.15-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:aa36c7ed16f508091438687fe9baa393a7a8e206731d321e443745e743a0d4e5"}, - {file = "multiprocess-0.70.15-pp37-pypy37_pp73-macosx_10_9_x86_64.whl", hash = "sha256:20e024018c46d0d1602024c613007ac948f9754659e3853b0aa705e83f6931d8"}, - {file = "multiprocess-0.70.15-pp37-pypy37_pp73-manylinux_2_24_i686.whl", hash = "sha256:e576062981c91f0fe8a463c3d52506e598dfc51320a8dd8d78b987dfca91c5db"}, - {file = "multiprocess-0.70.15-pp37-pypy37_pp73-manylinux_2_24_x86_64.whl", hash = 
"sha256:e73f497e6696a0f5433ada2b3d599ae733b87a6e8b008e387c62ac9127add177"}, - {file = "multiprocess-0.70.15-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:73db2e7b32dcc7f9b0f075c2ffa45c90b6729d3f1805f27e88534c8d321a1be5"}, - {file = "multiprocess-0.70.15-pp38-pypy38_pp73-manylinux_2_24_i686.whl", hash = "sha256:4271647bd8a49c28ecd6eb56a7fdbd3c212c45529ad5303b40b3c65fc6928e5f"}, - {file = "multiprocess-0.70.15-pp38-pypy38_pp73-manylinux_2_24_x86_64.whl", hash = "sha256:cf981fb998d6ec3208cb14f0cf2e9e80216e834f5d51fd09ebc937c32b960902"}, - {file = "multiprocess-0.70.15-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:18f9f2c7063346d1617bd1684fdcae8d33380ae96b99427260f562e1a1228b67"}, - {file = "multiprocess-0.70.15-pp39-pypy39_pp73-manylinux_2_24_i686.whl", hash = "sha256:0eac53214d664c49a34695e5824872db4006b1a465edd7459a251809c3773370"}, - {file = "multiprocess-0.70.15-pp39-pypy39_pp73-manylinux_2_24_x86_64.whl", hash = "sha256:1a51dd34096db47fb21fa2b839e615b051d51b97af9a67afbcdaa67186b44883"}, - {file = "multiprocess-0.70.15-py310-none-any.whl", hash = "sha256:7dd58e33235e83cf09d625e55cffd7b0f0eede7ee9223cdd666a87624f60c21a"}, - {file = "multiprocess-0.70.15-py311-none-any.whl", hash = "sha256:134f89053d82c9ed3b73edd3a2531eb791e602d4f4156fc92a79259590bd9670"}, - {file = "multiprocess-0.70.15-py37-none-any.whl", hash = "sha256:f7d4a1629bccb433114c3b4885f69eccc200994323c80f6feee73b0edc9199c5"}, - {file = "multiprocess-0.70.15-py38-none-any.whl", hash = "sha256:bee9afba476c91f9ebee7beeee0601face9eff67d822e893f9a893725fbd6316"}, - {file = "multiprocess-0.70.15-py39-none-any.whl", hash = "sha256:3e0953f5d52b4c76f1c973eaf8214554d146f2be5decb48e928e55c7a2d19338"}, - {file = "multiprocess-0.70.15.tar.gz", hash = "sha256:f20eed3036c0ef477b07a4177cf7c1ba520d9a2677870a4f47fe026f0cd6787e"}, -] - -[package.dependencies] -dill = ">=0.3.7" - [[package]] name = "nest-asyncio" version = "1.6.0" description = "Patch asyncio to allow nested event loops" optional = true python-versions = ">=3.5" +groups = ["main"] files = [ {file = "nest_asyncio-1.6.0-py3-none-any.whl", hash = "sha256:87af6efd6b5e897c81050477ef65c62e2b2f35d51703cae01aff2905b1852e1c"}, {file = "nest_asyncio-1.6.0.tar.gz", hash = "sha256:6f172d5449aca15afd6c646851f4e31e02c598d553a667e38cafa997cfec55fe"}, @@ -1533,8 +962,9 @@ files = [ name = "networkx" version = "3.2.1" description = "Python package for creating and manipulating graphs and networks" -optional = true +optional = false python-versions = ">=3.9" +groups = ["main"] files = [ {file = "networkx-3.2.1-py3-none-any.whl", hash = "sha256:f18c69adc97877c42332c170849c96cefa91881c99a7cb3e95b7c659ebdc1ec2"}, {file = "networkx-3.2.1.tar.gz", hash = "sha256:9f1bb5cf3409bf324e0a722c20bdb4c20ee39bf1c30ce8ae499c8502b0b5e0c6"}, @@ -1553,6 +983,7 @@ version = "0.60.0" description = "compiling Python code using LLVM" optional = true python-versions = ">=3.9" +groups = ["main"] files = [ {file = "numba-0.60.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:5d761de835cd38fb400d2c26bb103a2726f548dc30368853121d66201672e651"}, {file = "numba-0.60.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:159e618ef213fba758837f9837fb402bbe65326e60ba0633dbe6c7f274d42c1b"}, @@ -1587,6 +1018,7 @@ version = "1.26.4" description = "Fundamental package for array computing in Python" optional = false python-versions = ">=3.9" +groups = ["main"] files = [ {file = "numpy-1.26.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:9ff0f4f29c51e2803569d7a51c2304de5554655a60c5d776e35b4a41413830d0"}, 
{file = "numpy-1.26.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2e4ee3380d6de9c9ec04745830fd9e2eccb3e6cf790d39d7b98ffd19b0dd754a"}, @@ -1628,54 +1060,68 @@ files = [ [[package]] name = "nvidia-cublas-cu12" -version = "12.1.3.1" +version = "12.4.5.8" description = "CUBLAS native runtime libraries" -optional = true +optional = false python-versions = ">=3" +groups = ["main"] +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\"" files = [ - {file = "nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl", hash = "sha256:ee53ccca76a6fc08fb9701aa95b6ceb242cdaab118c3bb152af4e579af792728"}, - {file = "nvidia_cublas_cu12-12.1.3.1-py3-none-win_amd64.whl", hash = "sha256:2b964d60e8cf11b5e1073d179d85fa340c120e99b3067558f3cf98dd69d02906"}, + {file = "nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_aarch64.whl", hash = "sha256:0f8aa1706812e00b9f19dfe0cdb3999b092ccb8ca168c0db5b8ea712456fd9b3"}, + {file = "nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl", hash = "sha256:2fc8da60df463fdefa81e323eef2e36489e1c94335b5358bcb38360adf75ac9b"}, + {file = "nvidia_cublas_cu12-12.4.5.8-py3-none-win_amd64.whl", hash = "sha256:5a796786da89203a0657eda402bcdcec6180254a8ac22d72213abc42069522dc"}, ] [[package]] name = "nvidia-cuda-cupti-cu12" -version = "12.1.105" +version = "12.4.127" description = "CUDA profiling tools runtime libs." -optional = true +optional = false python-versions = ">=3" +groups = ["main"] +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\"" files = [ - {file = "nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl", hash = "sha256:e54fde3983165c624cb79254ae9818a456eb6e87a7fd4d56a2352c24ee542d7e"}, - {file = "nvidia_cuda_cupti_cu12-12.1.105-py3-none-win_amd64.whl", hash = "sha256:bea8236d13a0ac7190bd2919c3e8e6ce1e402104276e6f9694479e48bb0eb2a4"}, + {file = "nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:79279b35cf6f91da114182a5ce1864997fd52294a87a16179ce275773799458a"}, + {file = "nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:9dec60f5ac126f7bb551c055072b69d85392b13311fcc1bcda2202d172df30fb"}, + {file = "nvidia_cuda_cupti_cu12-12.4.127-py3-none-win_amd64.whl", hash = "sha256:5688d203301ab051449a2b1cb6690fbe90d2b372f411521c86018b950f3d7922"}, ] [[package]] name = "nvidia-cuda-nvrtc-cu12" -version = "12.1.105" +version = "12.4.127" description = "NVRTC native runtime libraries" -optional = true +optional = false python-versions = ">=3" +groups = ["main"] +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\"" files = [ - {file = "nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl", hash = "sha256:339b385f50c309763ca65456ec75e17bbefcbbf2893f462cb8b90584cd27a1c2"}, - {file = "nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-win_amd64.whl", hash = "sha256:0a98a522d9ff138b96c010a65e145dc1b4850e9ecb75a0172371793752fd46ed"}, + {file = "nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:0eedf14185e04b76aa05b1fea04133e59f465b6f960c0cbf4e37c3cb6b0ea198"}, + {file = "nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:a178759ebb095827bd30ef56598ec182b85547f1508941a3d560eb7ea1fbf338"}, + {file = "nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-win_amd64.whl", hash = "sha256:a961b2f1d5f17b14867c619ceb99ef6fcec12e46612711bcec78eb05068a60ec"}, ] [[package]] name = "nvidia-cuda-runtime-cu12" -version = "12.1.105" +version = "12.4.127" description = "CUDA Runtime native 
Libraries" -optional = true +optional = false python-versions = ">=3" +groups = ["main"] +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\"" files = [ - {file = "nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl", hash = "sha256:6e258468ddf5796e25f1dc591a31029fa317d97a0a94ed93468fc86301d61e40"}, - {file = "nvidia_cuda_runtime_cu12-12.1.105-py3-none-win_amd64.whl", hash = "sha256:dfb46ef84d73fababab44cf03e3b83f80700d27ca300e537f85f636fac474344"}, + {file = "nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:961fe0e2e716a2a1d967aab7caee97512f71767f852f67432d572e36cb3a11f3"}, + {file = "nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:64403288fa2136ee8e467cdc9c9427e0434110899d07c779f25b5c068934faa5"}, + {file = "nvidia_cuda_runtime_cu12-12.4.127-py3-none-win_amd64.whl", hash = "sha256:09c2e35f48359752dfa822c09918211844a3d93c100a715d79b59591130c5e1e"}, ] [[package]] name = "nvidia-cudnn-cu12" version = "9.1.0.70" description = "cuDNN runtime libraries" -optional = true +optional = false python-versions = ">=3" +groups = ["main"] +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\"" files = [ {file = "nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl", hash = "sha256:165764f44ef8c61fcdfdfdbe769d687e06374059fbb388b6c89ecb0e28793a6f"}, {file = "nvidia_cudnn_cu12-9.1.0.70-py3-none-win_amd64.whl", hash = "sha256:6278562929433d68365a07a4a1546c237ba2849852c0d4b2262a486e805b977a"}, @@ -1686,35 +1132,47 @@ nvidia-cublas-cu12 = "*" [[package]] name = "nvidia-cufft-cu12" -version = "11.0.2.54" +version = "11.2.1.3" description = "CUFFT native runtime libraries" -optional = true +optional = false python-versions = ">=3" +groups = ["main"] +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\"" files = [ - {file = "nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl", hash = "sha256:794e3948a1aa71fd817c3775866943936774d1c14e7628c74f6f7417224cdf56"}, - {file = "nvidia_cufft_cu12-11.0.2.54-py3-none-win_amd64.whl", hash = "sha256:d9ac353f78ff89951da4af698f80870b1534ed69993f10a4cf1d96f21357e253"}, + {file = "nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_aarch64.whl", hash = "sha256:5dad8008fc7f92f5ddfa2101430917ce2ffacd86824914c82e28990ad7f00399"}, + {file = "nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl", hash = "sha256:f083fc24912aa410be21fa16d157fed2055dab1cc4b6934a0e03cba69eb242b9"}, + {file = "nvidia_cufft_cu12-11.2.1.3-py3-none-win_amd64.whl", hash = "sha256:d802f4954291101186078ccbe22fc285a902136f974d369540fd4a5333d1440b"}, ] +[package.dependencies] +nvidia-nvjitlink-cu12 = "*" + [[package]] name = "nvidia-curand-cu12" -version = "10.3.2.106" +version = "10.3.5.147" description = "CURAND native runtime libraries" -optional = true +optional = false python-versions = ">=3" +groups = ["main"] +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\"" files = [ - {file = "nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl", hash = "sha256:9d264c5036dde4e64f1de8c50ae753237c12e0b1348738169cd0f8a536c0e1e0"}, - {file = "nvidia_curand_cu12-10.3.2.106-py3-none-win_amd64.whl", hash = "sha256:75b6b0c574c0037839121317e17fd01f8a69fd2ef8e25853d826fec30bdba74a"}, + {file = "nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_aarch64.whl", hash = "sha256:1f173f09e3e3c76ab084aba0de819c49e56614feae5c12f69883f4ae9bb5fad9"}, + {file = "nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl", hash = 
"sha256:a88f583d4e0bb643c49743469964103aa59f7f708d862c3ddb0fc07f851e3b8b"}, + {file = "nvidia_curand_cu12-10.3.5.147-py3-none-win_amd64.whl", hash = "sha256:f307cc191f96efe9e8f05a87096abc20d08845a841889ef78cb06924437f6771"}, ] [[package]] name = "nvidia-cusolver-cu12" -version = "11.4.5.107" +version = "11.6.1.9" description = "CUDA solver native runtime libraries" -optional = true +optional = false python-versions = ">=3" +groups = ["main"] +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\"" files = [ - {file = "nvidia_cusolver_cu12-11.4.5.107-py3-none-manylinux1_x86_64.whl", hash = "sha256:8a7ec542f0412294b15072fa7dab71d31334014a69f953004ea7a118206fe0dd"}, - {file = "nvidia_cusolver_cu12-11.4.5.107-py3-none-win_amd64.whl", hash = "sha256:74e0c3a24c78612192a74fcd90dd117f1cf21dea4822e66d89e8ea80e3cd2da5"}, + {file = "nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_aarch64.whl", hash = "sha256:d338f155f174f90724bbde3758b7ac375a70ce8e706d70b018dd3375545fc84e"}, + {file = "nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl", hash = "sha256:19e33fa442bcfd085b3086c4ebf7e8debc07cfe01e11513cc6d332fd918ac260"}, + {file = "nvidia_cusolver_cu12-11.6.1.9-py3-none-win_amd64.whl", hash = "sha256:e77314c9d7b694fcebc84f58989f3aa4fb4cb442f12ca1a9bde50f5e8f6d1b9c"}, ] [package.dependencies] @@ -1724,254 +1182,279 @@ nvidia-nvjitlink-cu12 = "*" [[package]] name = "nvidia-cusparse-cu12" -version = "12.1.0.106" +version = "12.3.1.170" description = "CUSPARSE native runtime libraries" -optional = true +optional = false python-versions = ">=3" +groups = ["main"] +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\"" files = [ - {file = "nvidia_cusparse_cu12-12.1.0.106-py3-none-manylinux1_x86_64.whl", hash = "sha256:f3b50f42cf363f86ab21f720998517a659a48131e8d538dc02f8768237bd884c"}, - {file = "nvidia_cusparse_cu12-12.1.0.106-py3-none-win_amd64.whl", hash = "sha256:b798237e81b9719373e8fae8d4f091b70a0cf09d9d85c95a557e11df2d8e9a5a"}, + {file = "nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_aarch64.whl", hash = "sha256:9d32f62896231ebe0480efd8a7f702e143c98cfaa0e8a76df3386c1ba2b54df3"}, + {file = "nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl", hash = "sha256:ea4f11a2904e2a8dc4b1833cc1b5181cde564edd0d5cd33e3c168eff2d1863f1"}, + {file = "nvidia_cusparse_cu12-12.3.1.170-py3-none-win_amd64.whl", hash = "sha256:9bc90fb087bc7b4c15641521f31c0371e9a612fc2ba12c338d3ae032e6b6797f"}, ] [package.dependencies] nvidia-nvjitlink-cu12 = "*" [[package]] -name = "nvidia-ml-py" -version = "12.560.30" -description = "Python Bindings for the NVIDIA Management Library" -optional = true +name = "nvidia-cusparselt-cu12" +version = "0.6.2" +description = "NVIDIA cuSPARSELt" +optional = false python-versions = "*" +groups = ["main"] +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\"" files = [ - {file = "nvidia-ml-py-12.560.30.tar.gz", hash = "sha256:f0254dc7400647680a072ee02509bfd46102b60bdfeca321576d4d4817e7fe97"}, - {file = "nvidia_ml_py-12.560.30-py3-none-any.whl", hash = "sha256:fea371c94d63e38a611c17bbb85fe400e9c8ddb9e8684a9cd0e47786a4bc3c73"}, + {file = "nvidia_cusparselt_cu12-0.6.2-py3-none-manylinux2014_aarch64.whl", hash = "sha256:067a7f6d03ea0d4841c85f0c6f1991c5dda98211f6302cb83a4ab234ee95bef8"}, + {file = "nvidia_cusparselt_cu12-0.6.2-py3-none-manylinux2014_x86_64.whl", hash = "sha256:df2c24502fd76ebafe7457dbc4716b2fec071aabaed4fb7691a201cde03704d9"}, + {file = 
"nvidia_cusparselt_cu12-0.6.2-py3-none-win_amd64.whl", hash = "sha256:0057c91d230703924c0422feabe4ce768841f9b4b44d28586b6f6d2eb86fbe70"}, ] [[package]] name = "nvidia-nccl-cu12" -version = "2.20.5" +version = "2.21.5" description = "NVIDIA Collective Communication Library (NCCL) Runtime" -optional = true +optional = false python-versions = ">=3" +groups = ["main"] +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\"" files = [ - {file = "nvidia_nccl_cu12-2.20.5-py3-none-manylinux2014_aarch64.whl", hash = "sha256:1fc150d5c3250b170b29410ba682384b14581db722b2531b0d8d33c595f33d01"}, - {file = "nvidia_nccl_cu12-2.20.5-py3-none-manylinux2014_x86_64.whl", hash = "sha256:057f6bf9685f75215d0c53bf3ac4a10b3e6578351de307abad9e18a99182af56"}, + {file = "nvidia_nccl_cu12-2.21.5-py3-none-manylinux2014_x86_64.whl", hash = "sha256:8579076d30a8c24988834445f8d633c697d42397e92ffc3f63fa26766d25e0a0"}, ] [[package]] name = "nvidia-nvjitlink-cu12" -version = "12.6.77" +version = "12.4.127" description = "Nvidia JIT LTO Library" -optional = true +optional = false python-versions = ">=3" +groups = ["main"] +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\"" files = [ - {file = "nvidia_nvjitlink_cu12-12.6.77-py3-none-manylinux2014_aarch64.whl", hash = "sha256:3bf10d85bb1801e9c894c6e197e44dd137d2a0a9e43f8450e9ad13f2df0dd52d"}, - {file = "nvidia_nvjitlink_cu12-12.6.77-py3-none-manylinux2014_x86_64.whl", hash = "sha256:9ae346d16203ae4ea513be416495167a0101d33d2d14935aa9c1829a3fb45142"}, - {file = "nvidia_nvjitlink_cu12-12.6.77-py3-none-win_amd64.whl", hash = "sha256:410718cd44962bed862a31dd0318620f6f9a8b28a6291967bcfcb446a6516771"}, + {file = "nvidia_nvjitlink_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:4abe7fef64914ccfa909bc2ba39739670ecc9e820c83ccc7a6ed414122599b83"}, + {file = "nvidia_nvjitlink_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:06b3b9b25bf3f8af351d664978ca26a16d2c5127dbd53c0497e28d1fb9611d57"}, + {file = "nvidia_nvjitlink_cu12-12.4.127-py3-none-win_amd64.whl", hash = "sha256:fd9020c501d27d135f983c6d3e244b197a7ccad769e34df53a42e276b0e25fa1"}, ] [[package]] name = "nvidia-nvtx-cu12" -version = "12.1.105" +version = "12.4.127" description = "NVIDIA Tools Extension" -optional = true +optional = false python-versions = ">=3" +groups = ["main"] +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\"" files = [ - {file = "nvidia_nvtx_cu12-12.1.105-py3-none-manylinux1_x86_64.whl", hash = "sha256:dc21cf308ca5691e7c04d962e213f8a4aa9bbfa23d95412f452254c2caeb09e5"}, - {file = "nvidia_nvtx_cu12-12.1.105-py3-none-win_amd64.whl", hash = "sha256:65f4d98982b31b60026e0e6de73fbdfc09d08a96f4656dd3665ca616a11e1e82"}, + {file = "nvidia_nvtx_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:7959ad635db13edf4fc65c06a6e9f9e55fc2f92596db928d169c0bb031e88ef3"}, + {file = "nvidia_nvtx_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:781e950d9b9f60d8241ccea575b32f5105a5baf4c2351cab5256a24869f12a1a"}, + {file = "nvidia_nvtx_cu12-12.4.127-py3-none-win_amd64.whl", hash = "sha256:641dccaaa1139f3ffb0d3164b4b84f9d253397e38246a4f2f36728b48566d485"}, ] [[package]] name = "opentelemetry-api" -version = "1.25.0" +version = "1.32.0" description = "OpenTelemetry Python API" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ - {file = "opentelemetry_api-1.25.0-py3-none-any.whl", hash = "sha256:757fa1aa020a0f8fa139f8959e53dec2051cc26b832e76fa839a6d76ecefd737"}, - {file = 
"opentelemetry_api-1.25.0.tar.gz", hash = "sha256:77c4985f62f2614e42ce77ee4c9da5fa5f0bc1e1821085e9a47533a9323ae869"}, + {file = "opentelemetry_api-1.32.0-py3-none-any.whl", hash = "sha256:15df743c765078611f376037b0d9111ec5c1febf2ec9440cdd919370faa1ce55"}, + {file = "opentelemetry_api-1.32.0.tar.gz", hash = "sha256:2623280c916f9b19cad0aa4280cb171265f19fd2909b0d47e4f06f7c83b02cb5"}, ] [package.dependencies] deprecated = ">=1.2.6" -importlib-metadata = ">=6.0,<=7.1" +importlib-metadata = ">=6.0,<8.7.0" [[package]] name = "opentelemetry-exporter-otlp" -version = "1.25.0" +version = "1.32.0" description = "OpenTelemetry Collector Exporters" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ - {file = "opentelemetry_exporter_otlp-1.25.0-py3-none-any.whl", hash = "sha256:d67a831757014a3bc3174e4cd629ae1493b7ba8d189e8a007003cacb9f1a6b60"}, - {file = "opentelemetry_exporter_otlp-1.25.0.tar.gz", hash = "sha256:ce03199c1680a845f82e12c0a6a8f61036048c07ec7a0bd943142aca8fa6ced0"}, + {file = "opentelemetry_exporter_otlp-1.32.0-py3-none-any.whl", hash = "sha256:8b563bee30f05415fb51e075eb6461cdaa7bcef1cc79917cfd79caf12e5bb548"}, + {file = "opentelemetry_exporter_otlp-1.32.0.tar.gz", hash = "sha256:4c66681f8acd95dce44966842182e3690e77256e5791ceb34b76ea1c34b20463"}, ] [package.dependencies] -opentelemetry-exporter-otlp-proto-grpc = "1.25.0" -opentelemetry-exporter-otlp-proto-http = "1.25.0" +opentelemetry-exporter-otlp-proto-grpc = "1.32.0" +opentelemetry-exporter-otlp-proto-http = "1.32.0" [[package]] name = "opentelemetry-exporter-otlp-proto-common" -version = "1.25.0" +version = "1.32.0" description = "OpenTelemetry Protobuf encoding" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ - {file = "opentelemetry_exporter_otlp_proto_common-1.25.0-py3-none-any.whl", hash = "sha256:15637b7d580c2675f70246563363775b4e6de947871e01d0f4e3881d1848d693"}, - {file = "opentelemetry_exporter_otlp_proto_common-1.25.0.tar.gz", hash = "sha256:c93f4e30da4eee02bacd1e004eb82ce4da143a2f8e15b987a9f603e0a85407d3"}, + {file = "opentelemetry_exporter_otlp_proto_common-1.32.0-py3-none-any.whl", hash = "sha256:277a63a18768b3b460d082a489f6f80d4ae2c1e6b185bb701c6bd4e91405e4bd"}, + {file = "opentelemetry_exporter_otlp_proto_common-1.32.0.tar.gz", hash = "sha256:2bca672f2a279c4f517115e635c0cc1269d07b2982a36681c521f7e56179a222"}, ] [package.dependencies] -opentelemetry-proto = "1.25.0" +opentelemetry-proto = "1.32.0" [[package]] name = "opentelemetry-exporter-otlp-proto-grpc" -version = "1.25.0" +version = "1.32.0" description = "OpenTelemetry Collector Protobuf over gRPC Exporter" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ - {file = "opentelemetry_exporter_otlp_proto_grpc-1.25.0-py3-none-any.whl", hash = "sha256:3131028f0c0a155a64c430ca600fd658e8e37043cb13209f0109db5c1a3e4eb4"}, - {file = "opentelemetry_exporter_otlp_proto_grpc-1.25.0.tar.gz", hash = "sha256:c0b1661415acec5af87625587efa1ccab68b873745ca0ee96b69bb1042087eac"}, + {file = "opentelemetry_exporter_otlp_proto_grpc-1.32.0-py3-none-any.whl", hash = "sha256:85b7c42bebe48ef55866793a3123ebf357dcaf629d961b27067025fd60104dbe"}, + {file = "opentelemetry_exporter_otlp_proto_grpc-1.32.0.tar.gz", hash = "sha256:c069c5d5f429a46fb1001f38191730939f593789c847648e4cea26dc8b6018a8"}, ] [package.dependencies] deprecated = ">=1.2.6" googleapis-common-protos = ">=1.52,<2.0" -grpcio = ">=1.0.0,<2.0.0" +grpcio = {version = ">=1.63.2,<2.0.0", markers = "python_version < \"3.13\""} opentelemetry-api = ">=1.15,<2.0" 
-opentelemetry-exporter-otlp-proto-common = "1.25.0" -opentelemetry-proto = "1.25.0" -opentelemetry-sdk = ">=1.25.0,<1.26.0" +opentelemetry-exporter-otlp-proto-common = "1.32.0" +opentelemetry-proto = "1.32.0" +opentelemetry-sdk = ">=1.32.0,<1.33.0" [[package]] name = "opentelemetry-exporter-otlp-proto-http" -version = "1.25.0" +version = "1.32.0" description = "OpenTelemetry Collector Protobuf over HTTP Exporter" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ - {file = "opentelemetry_exporter_otlp_proto_http-1.25.0-py3-none-any.whl", hash = "sha256:2eca686ee11b27acd28198b3ea5e5863a53d1266b91cda47c839d95d5e0541a6"}, - {file = "opentelemetry_exporter_otlp_proto_http-1.25.0.tar.gz", hash = "sha256:9f8723859e37c75183ea7afa73a3542f01d0fd274a5b97487ea24cb683d7d684"}, + {file = "opentelemetry_exporter_otlp_proto_http-1.32.0-py3-none-any.whl", hash = "sha256:e2ffecd6d2220eaf1291a46339f109bc0a57ee7c4c6abb8174df418bf00ce01f"}, + {file = "opentelemetry_exporter_otlp_proto_http-1.32.0.tar.gz", hash = "sha256:a5dfd94603da86e313e4f4fb8d181fd3b64a7c2a9c7b408c3653d2b1bc68d14f"}, ] [package.dependencies] deprecated = ">=1.2.6" googleapis-common-protos = ">=1.52,<2.0" opentelemetry-api = ">=1.15,<2.0" -opentelemetry-exporter-otlp-proto-common = "1.25.0" -opentelemetry-proto = "1.25.0" -opentelemetry-sdk = ">=1.25.0,<1.26.0" +opentelemetry-exporter-otlp-proto-common = "1.32.0" +opentelemetry-proto = "1.32.0" +opentelemetry-sdk = ">=1.32.0,<1.33.0" requests = ">=2.7,<3.0" [[package]] name = "opentelemetry-instrumentation" -version = "0.46b0" +version = "0.53b0" description = "Instrumentation Tools & Auto Instrumentation for OpenTelemetry Python" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ - {file = "opentelemetry_instrumentation-0.46b0-py3-none-any.whl", hash = "sha256:89cd721b9c18c014ca848ccd11181e6b3fd3f6c7669e35d59c48dc527408c18b"}, - {file = "opentelemetry_instrumentation-0.46b0.tar.gz", hash = "sha256:974e0888fb2a1e01c38fbacc9483d024bb1132aad92d6d24e2e5543887a7adda"}, + {file = "opentelemetry_instrumentation-0.53b0-py3-none-any.whl", hash = "sha256:70600778fd567c9c5fbfca181378ae179c0dec3ff613171707d3d77c360ff105"}, + {file = "opentelemetry_instrumentation-0.53b0.tar.gz", hash = "sha256:f2c21d71a3cdf28c656e3d90d247ee7558fb9b0239b3d9e9190266499dbed9d2"}, ] [package.dependencies] opentelemetry-api = ">=1.4,<2.0" -setuptools = ">=16.0" +opentelemetry-semantic-conventions = "0.53b0" +packaging = ">=18.0" wrapt = ">=1.0.0,<2.0.0" [[package]] name = "opentelemetry-instrumentation-grpc" -version = "0.46b0" +version = "0.53b0" description = "OpenTelemetry gRPC instrumentation" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ - {file = "opentelemetry_instrumentation_grpc-0.46b0-py3-none-any.whl", hash = "sha256:cccfb28db07c28849709f2dcf330237fae0fca9f86971bfce27b28bb9a8b0577"}, - {file = "opentelemetry_instrumentation_grpc-0.46b0.tar.gz", hash = "sha256:9c5738592cf82672805099826b676d352324b54e03f9ac72a1368ba0605d6ff9"}, + {file = "opentelemetry_instrumentation_grpc-0.53b0-py3-none-any.whl", hash = "sha256:bd44f113c58fd66614b07bd9b8115ec311389ec58ef7e48a06581e302971c3f4"}, + {file = "opentelemetry_instrumentation_grpc-0.53b0.tar.gz", hash = "sha256:a95b752e0782e7b503379de1c64a5afa2c7c1cd8196fa5f2b5c090d01c15e517"}, ] [package.dependencies] opentelemetry-api = ">=1.12,<2.0" -opentelemetry-instrumentation = "0.46b0" -opentelemetry-semantic-conventions = "0.46b0" +opentelemetry-instrumentation = "0.53b0" +opentelemetry-semantic-conventions = 
"0.53b0" wrapt = ">=1.0.0,<2.0.0" [package.extras] -instruments = ["grpcio (>=1.27,<2.0)"] +instruments = ["grpcio (>=1.42.0)"] [[package]] name = "opentelemetry-proto" -version = "1.25.0" +version = "1.32.0" description = "OpenTelemetry Python Proto" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ - {file = "opentelemetry_proto-1.25.0-py3-none-any.whl", hash = "sha256:f07e3341c78d835d9b86665903b199893befa5e98866f63d22b00d0b7ca4972f"}, - {file = "opentelemetry_proto-1.25.0.tar.gz", hash = "sha256:35b6ef9dc4a9f7853ecc5006738ad40443701e52c26099e197895cbda8b815a3"}, + {file = "opentelemetry_proto-1.32.0-py3-none-any.whl", hash = "sha256:f699269dc037e18fba05442580a8682c9fbd0f4c7f5addfed82c44be0c53c5ff"}, + {file = "opentelemetry_proto-1.32.0.tar.gz", hash = "sha256:f8b70ae52f4ef8a4e4c0760e87c9071e07ece2618c080d4839bef44c0156cd44"}, ] [package.dependencies] -protobuf = ">=3.19,<5.0" +protobuf = ">=5.0,<6.0" [[package]] name = "opentelemetry-sdk" -version = "1.25.0" +version = "1.32.0" description = "OpenTelemetry Python SDK" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ - {file = "opentelemetry_sdk-1.25.0-py3-none-any.whl", hash = "sha256:d97ff7ec4b351692e9d5a15af570c693b8715ad78b8aafbec5c7100fe966b4c9"}, - {file = "opentelemetry_sdk-1.25.0.tar.gz", hash = "sha256:ce7fc319c57707ef5bf8b74fb9f8ebdb8bfafbe11898410e0d2a761d08a98ec7"}, + {file = "opentelemetry_sdk-1.32.0-py3-none-any.whl", hash = "sha256:ed252d035c22a15536c1f603ca089298daab60850fc2f5ddfa95d95cc1c043ea"}, + {file = "opentelemetry_sdk-1.32.0.tar.gz", hash = "sha256:5ff07fb371d1ab1189fa7047702e2e888b5403c5efcbb18083cae0d5aa5f58d2"}, ] [package.dependencies] -opentelemetry-api = "1.25.0" -opentelemetry-semantic-conventions = "0.46b0" +opentelemetry-api = "1.32.0" +opentelemetry-semantic-conventions = "0.53b0" typing-extensions = ">=3.7.4" [[package]] name = "opentelemetry-semantic-conventions" -version = "0.46b0" +version = "0.53b0" description = "OpenTelemetry Semantic Conventions" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" +groups = ["main"] files = [ - {file = "opentelemetry_semantic_conventions-0.36b0-py3-none-any.whl", hash = "sha256:adc05635e87b9d3e007c9f530eed487fc3ef2177d02f82f674f28ebf9aff8243"}, - {file = "opentelemetry_semantic_conventions-0.36b0.tar.gz", hash = "sha256:829dc221795467d98b773c04096e29be038d77526dc8d6ac76f546fb6279bf01"}, -] - -[[package]] -name = "optimum" -version = "1.23.2" -description = "Optimum Library is an extension of the Hugging Face Transformers library, providing a framework to integrate third-party libraries from Hardware Partners and interface with their specific functionality." 
-optional = false -python-versions = ">=3.7.0" -files = [ - {file = "optimum-1.23.2-py3-none-any.whl", hash = "sha256:1b6f450b2c0203377b76cf4dd1e05e45b116d8069612fe2ad73df8b5743d5a57"}, - {file = "optimum-1.23.2.tar.gz", hash = "sha256:245415add6e621c6c5b7c29eee968bc325fc1a982d9c42eb83d97b7ec844ef39"}, + {file = "opentelemetry_semantic_conventions-0.53b0-py3-none-any.whl", hash = "sha256:561da89f766ab51615c0e72b12329e0a1bc16945dbd62c8646ffc74e36a1edff"}, + {file = "opentelemetry_semantic_conventions-0.53b0.tar.gz", hash = "sha256:05b7908e1da62d72f9bf717ed25c72f566fe005a2dd260c61b11e025f2552cf6"}, +] + +[package.dependencies] +deprecated = ">=1.2.6" +opentelemetry-api = "1.32.0" + +[[package]] +name = "optimum" +version = "1.24.0" +description = "Optimum Library is an extension of the Hugging Face Transformers library, providing a framework to integrate third-party libraries from Hardware Partners and interface with their specific functionality." +optional = false +python-versions = ">=3.9.0" +groups = ["main"] +files = [ + {file = "optimum-1.24.0-py3-none-any.whl", hash = "sha256:196776949183cd3a56a15097a02be41e6f37aa92d824bd053de89c39ee6b0087"}, + {file = "optimum-1.24.0.tar.gz", hash = "sha256:b502a2afbf78bb73370ebb1eff07b93108a1b386116e87eb17e882d210150551"}, ] [package.dependencies] -coloredlogs = "*" -datasets = "*" huggingface-hub = ">=0.8.0" numpy = "*" packaging = "*" -sympy = "*" torch = ">=1.11" -transformers = {version = ">=4.29", extras = ["sentencepiece"]} +transformers = ">=4.29" [package.extras] amd = ["optimum-amd"] benchmark = ["evaluate (>=0.2.0)", "optuna", "scikit-learn", "seqeval", "torchvision", "tqdm"] -dev = ["Pillow", "accelerate", "black (>=23.1,<24.0)", "diffusers (>=0.17.0)", "einops", "invisible-watermark", "parameterized", "pytest (<=8.0.0)", "pytest-xdist", "requests", "rjieba", "ruff (==0.1.5)", "sacremoses", "scikit-learn", "timm", "torchaudio", "torchvision"] +dev = ["Pillow", "accelerate", "black (>=23.1,<24.0)", "einops", "parameterized", "pytest (<=8.0.0)", "pytest-xdist", "requests", "rjieba", "ruff (==0.1.5)", "sacremoses", "scikit-learn", "sentencepiece", "timm", "torchaudio", "torchvision"] diffusers = ["diffusers"] doc-build = ["accelerate"] -exporters = ["onnx", "onnxruntime", "timm", "transformers (<4.46.0)"] -exporters-gpu = ["onnx", "onnxruntime-gpu", "timm", "transformers (<4.46.0)"] -exporters-tf = ["datasets (<=2.16)", "h5py", "numpy (<1.24.0)", "onnx", "onnxruntime", "tensorflow (>=2.4,<=2.12.1)", "tf2onnx", "timm", "transformers[sentencepiece] (>=4.26,<4.38)"] +exporters = ["onnx", "onnxruntime", "timm", "transformers (>=4.36,<4.49.0)"] +exporters-gpu = ["onnx", "onnxruntime-gpu", "timm", "transformers (>=4.36,<4.49.0)"] +exporters-tf = ["datasets (<=2.16)", "h5py", "numpy (<1.24.0)", "onnx", "onnxruntime", "tensorflow (>=2.4,<=2.12.1)", "tf2onnx", "timm", "transformers (>=4.36,<4.38)"] furiosa = ["optimum-furiosa"] graphcore = ["optimum-graphcore"] habana = ["optimum-habana", "transformers (>=4.45.0,<4.46.0)"] @@ -1981,46 +1464,49 @@ neural-compressor = ["optimum-intel[neural-compressor] (>=1.18.0)"] neuron = ["optimum-neuron[neuron] (>=0.0.20)", "transformers (>=4.36.2,<4.42.0)"] neuronx = ["optimum-neuron[neuronx] (>=0.0.20)", "transformers (>=4.36.2,<4.42.0)"] nncf = ["optimum-intel[nncf] (>=1.18.0)"] -onnxruntime = ["datasets (>=1.2.1)", "evaluate", "onnx", "onnxruntime (>=1.11.0)", "protobuf (>=3.20.1)", "transformers (<4.46.0)"] -onnxruntime-gpu = ["accelerate", "datasets (>=1.2.1)", "evaluate", "onnx", "onnxruntime-gpu (>=1.11.0)", 
"protobuf (>=3.20.1)", "transformers (<4.46.0)"] +onnxruntime = ["datasets (>=1.2.1)", "evaluate", "onnx", "onnxruntime (>=1.11.0)", "protobuf (>=3.20.1)", "transformers (>=4.36,<4.49.0)"] +onnxruntime-gpu = ["datasets (>=1.2.1)", "evaluate", "onnx", "onnxruntime-gpu (>=1.11.0)", "protobuf (>=3.20.1)", "transformers (>=4.36,<4.49.0)"] +onnxruntime-training = ["accelerate", "datasets (>=1.2.1)", "evaluate", "onnxruntime-training (>=1.11.0)", "protobuf (>=3.20.1)", "torch-ort", "transformers (>=4.36,<4.49.0)"] openvino = ["optimum-intel[openvino] (>=1.18.0)"] quality = ["black (>=23.1,<24.0)", "ruff (==0.1.5)"] quanto = ["optimum-quanto (>=0.2.4)"] -tests = ["Pillow", "accelerate", "diffusers (>=0.17.0)", "einops", "invisible-watermark", "parameterized", "pytest (<=8.0.0)", "pytest-xdist", "requests", "rjieba", "sacremoses", "scikit-learn", "timm", "torchaudio", "torchvision"] +tests = ["Pillow", "accelerate", "einops", "parameterized", "pytest (<=8.0.0)", "pytest-xdist", "requests", "rjieba", "sacremoses", "scikit-learn", "sentencepiece", "timm", "torchaudio", "torchvision"] [[package]] name = "optimum-habana" -version = "1.14.1" +version = "1.17.0" description = "Optimum Habana is the interface between the Hugging Face Transformers and Diffusers libraries and Habana's Gaudi processor (HPU). It provides a set of tools enabling easy model loading, training and inference on single- and multi-HPU settings for different downstream tasks." optional = false python-versions = "*" +groups = ["main"] files = [ - {file = "optimum_habana-1.14.1-py3-none-any.whl", hash = "sha256:557dd06a5fa2597b1487384a354c3a1618390c35e6b088f5d7a80c0611fe2bfd"}, - {file = "optimum_habana-1.14.1.tar.gz", hash = "sha256:627626860b82452b75f73087df3e14004278ed30dacf767c100f1324675c8120"}, + {file = "optimum_habana-1.17.0-py3-none-any.whl", hash = "sha256:4f1008c7e84248b62778c8d5f79443237026a5a281b50ee67c7db211c5ca7d2a"}, + {file = "optimum_habana-1.17.0.tar.gz", hash = "sha256:634adaa775c5c1694a164bdec46133e9712676237e996aadf3392c820573ce92"}, ] [package.dependencies] accelerate = ">=0.33.0,<0.34.0" -diffusers = "0.29.2" -huggingface-hub = ">=0.24.7" +diffusers = ">=0.31.0,<0.32.0" +huggingface_hub = ">=0.24.7" optimum = "*" -sentence-transformers = {version = "3.0.1", extras = ["train"]} +sentence-transformers = "3.3.1" torch = "*" -transformers = ">=4.45.2,<4.46.0" +transformers = ">=4.49.0,<4.50.0" [package.extras] -quality = ["hf-doc-builder", "ruff"] -tests = ["GitPython", "datasets", "optuna", "parameterized", "peft", "psutil", "pytest (<8.0.0)", "safetensors", "scipy", "sentencepiece", "timm", "torchsde"] +quality = ["hf_doc_builder", "ruff"] +tests = ["GitPython", "datasets", "optuna", "parameterized", "peft", "psutil", "pytest (<8.0.0)", "safetensors", "scipy", "sentencepiece", "timm", "timm", "torchsde"] [[package]] name = "outlines" -version = "0.0.34" +version = "0.0.36" description = "Probabilistic Generative Model Programming" optional = true python-versions = ">=3.8" +groups = ["main"] files = [ - {file = "outlines-0.0.34-py3-none-any.whl", hash = "sha256:911588a7e64a4f193b97fb4c501d98ccfd4e95a98f6a3ada67a280bf0c373c50"}, - {file = "outlines-0.0.34.tar.gz", hash = "sha256:594e7204c770b47a62eb5c2ba7d25ea0ab2e16882b5f04556712a0228d3d3309"}, + {file = "outlines-0.0.36-py3-none-any.whl", hash = "sha256:afa02ca5c449c47731fa06af66d13c2f5ee8b30f8b82b4db90e08215d6f111d1"}, + {file = "outlines-0.0.36.tar.gz", hash = "sha256:3cffb43143548cd78c6061990feb461cffd5479999391b8390471ea839c2d46e"}, ] [package.dependencies] 
@@ -2043,119 +1529,35 @@ transformers = "*" [package.extras] serve = ["fastapi", "pydantic (>=2.0)", "ray (==2.9.0)", "uvicorn", "vllm (>=0.3.0)"] -test = ["accelerate", "beartype (<0.16.0)", "coverage[toml] (>=5.1)", "datasets", "diff-cover", "huggingface-hub", "llama-cpp-python (>=0.2.42)", "pre-commit", "pytest", "pytest-benchmark", "pytest-cov", "pytest-mock", "responses", "transformers"] +test = ["accelerate", "beartype (<0.16.0)", "coverage[toml] (>=5.1)", "datasets", "diff-cover", "huggingface-hub", "llama-cpp-python", "openai (>=1.0.0)", "pre-commit", "pytest", "pytest-benchmark", "pytest-cov", "pytest-mock", "responses", "transformers"] [[package]] name = "packaging" -version = "24.1" +version = "24.2" description = "Core utilities for Python packages" optional = false python-versions = ">=3.8" +groups = ["main", "dev"] files = [ - {file = "packaging-24.1-py3-none-any.whl", hash = "sha256:5b8f2217dbdbd2f7f384c41c628544e6d52f2d0f53c6d0c3ea61aa5d1d7ff124"}, - {file = "packaging-24.1.tar.gz", hash = "sha256:026ed72c8ed3fcce5bf8950572258698927fd1dbda10a5e981cdf0ac37f4f002"}, + {file = "packaging-24.2-py3-none-any.whl", hash = "sha256:09abb1bccd265c01f4a3aa3f7a7db064b36514d2cba19a2f694fe6150451a759"}, + {file = "packaging-24.2.tar.gz", hash = "sha256:c228a6dc5e932d346bc5739379109d49e8853dd8223571c7c5b55260edc0b97f"}, ] -[[package]] -name = "pandas" -version = "2.2.3" -description = "Powerful data structures for data analysis, time series, and statistics" -optional = true -python-versions = ">=3.9" -files = [ - {file = "pandas-2.2.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1948ddde24197a0f7add2bdc4ca83bf2b1ef84a1bc8ccffd95eda17fd836ecb5"}, - {file = "pandas-2.2.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:381175499d3802cde0eabbaf6324cce0c4f5d52ca6f8c377c29ad442f50f6348"}, - {file = "pandas-2.2.3-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d9c45366def9a3dd85a6454c0e7908f2b3b8e9c138f5dc38fed7ce720d8453ed"}, - {file = "pandas-2.2.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:86976a1c5b25ae3f8ccae3a5306e443569ee3c3faf444dfd0f41cda24667ad57"}, - {file = "pandas-2.2.3-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:b8661b0238a69d7aafe156b7fa86c44b881387509653fdf857bebc5e4008ad42"}, - {file = "pandas-2.2.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:37e0aced3e8f539eccf2e099f65cdb9c8aa85109b0be6e93e2baff94264bdc6f"}, - {file = "pandas-2.2.3-cp310-cp310-win_amd64.whl", hash = "sha256:56534ce0746a58afaf7942ba4863e0ef81c9c50d3f0ae93e9497d6a41a057645"}, - {file = "pandas-2.2.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:66108071e1b935240e74525006034333f98bcdb87ea116de573a6a0dccb6c039"}, - {file = "pandas-2.2.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7c2875855b0ff77b2a64a0365e24455d9990730d6431b9e0ee18ad8acee13dbd"}, - {file = "pandas-2.2.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cd8d0c3be0515c12fed0bdbae072551c8b54b7192c7b1fda0ba56059a0179698"}, - {file = "pandas-2.2.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c124333816c3a9b03fbeef3a9f230ba9a737e9e5bb4060aa2107a86cc0a497fc"}, - {file = "pandas-2.2.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:63cc132e40a2e084cf01adf0775b15ac515ba905d7dcca47e9a251819c575ef3"}, - {file = "pandas-2.2.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:29401dbfa9ad77319367d36940cd8a0b3a11aba16063e39632d98b0e931ddf32"}, - {file = 
"pandas-2.2.3-cp311-cp311-win_amd64.whl", hash = "sha256:3fc6873a41186404dad67245896a6e440baacc92f5b716ccd1bc9ed2995ab2c5"}, - {file = "pandas-2.2.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b1d432e8d08679a40e2a6d8b2f9770a5c21793a6f9f47fdd52c5ce1948a5a8a9"}, - {file = "pandas-2.2.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:a5a1595fe639f5988ba6a8e5bc9649af3baf26df3998a0abe56c02609392e0a4"}, - {file = "pandas-2.2.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:5de54125a92bb4d1c051c0659e6fcb75256bf799a732a87184e5ea503965bce3"}, - {file = "pandas-2.2.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fffb8ae78d8af97f849404f21411c95062db1496aeb3e56f146f0355c9989319"}, - {file = "pandas-2.2.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6dfcb5ee8d4d50c06a51c2fffa6cff6272098ad6540aed1a76d15fb9318194d8"}, - {file = "pandas-2.2.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:062309c1b9ea12a50e8ce661145c6aab431b1e99530d3cd60640e255778bd43a"}, - {file = "pandas-2.2.3-cp312-cp312-win_amd64.whl", hash = "sha256:59ef3764d0fe818125a5097d2ae867ca3fa64df032331b7e0917cf5d7bf66b13"}, - {file = "pandas-2.2.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:f00d1345d84d8c86a63e476bb4955e46458b304b9575dcf71102b5c705320015"}, - {file = "pandas-2.2.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:3508d914817e153ad359d7e069d752cdd736a247c322d932eb89e6bc84217f28"}, - {file = "pandas-2.2.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:22a9d949bfc9a502d320aa04e5d02feab689d61da4e7764b62c30b991c42c5f0"}, - {file = "pandas-2.2.3-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f3a255b2c19987fbbe62a9dfd6cff7ff2aa9ccab3fc75218fd4b7530f01efa24"}, - {file = "pandas-2.2.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:800250ecdadb6d9c78eae4990da62743b857b470883fa27f652db8bdde7f6659"}, - {file = "pandas-2.2.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:6374c452ff3ec675a8f46fd9ab25c4ad0ba590b71cf0656f8b6daa5202bca3fb"}, - {file = "pandas-2.2.3-cp313-cp313-win_amd64.whl", hash = "sha256:61c5ad4043f791b61dd4752191d9f07f0ae412515d59ba8f005832a532f8736d"}, - {file = "pandas-2.2.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:3b71f27954685ee685317063bf13c7709a7ba74fc996b84fc6821c59b0f06468"}, - {file = "pandas-2.2.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:38cf8125c40dae9d5acc10fa66af8ea6fdf760b2714ee482ca691fc66e6fcb18"}, - {file = "pandas-2.2.3-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ba96630bc17c875161df3818780af30e43be9b166ce51c9a18c1feae342906c2"}, - {file = "pandas-2.2.3-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1db71525a1538b30142094edb9adc10be3f3e176748cd7acc2240c2f2e5aa3a4"}, - {file = "pandas-2.2.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:15c0e1e02e93116177d29ff83e8b1619c93ddc9c49083f237d4312337a61165d"}, - {file = "pandas-2.2.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:ad5b65698ab28ed8d7f18790a0dc58005c7629f227be9ecc1072aa74c0c1d43a"}, - {file = "pandas-2.2.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:bc6b93f9b966093cb0fd62ff1a7e4c09e6d546ad7c1de191767baffc57628f39"}, - {file = "pandas-2.2.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:5dbca4c1acd72e8eeef4753eeca07de9b1db4f398669d5994086f788a5d7cc30"}, - {file = "pandas-2.2.3-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = 
"sha256:8cd6d7cc958a3910f934ea8dbdf17b2364827bb4dafc38ce6eef6bb3d65ff09c"}, - {file = "pandas-2.2.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:99df71520d25fade9db7c1076ac94eb994f4d2673ef2aa2e86ee039b6746d20c"}, - {file = "pandas-2.2.3-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:31d0ced62d4ea3e231a9f228366919a5ea0b07440d9d4dac345376fd8e1477ea"}, - {file = "pandas-2.2.3-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:7eee9e7cea6adf3e3d24e304ac6b8300646e2a5d1cd3a3c2abed9101b0846761"}, - {file = "pandas-2.2.3-cp39-cp39-win_amd64.whl", hash = "sha256:4850ba03528b6dd51d6c5d273c46f183f39a9baf3f0143e566b89450965b105e"}, - {file = "pandas-2.2.3.tar.gz", hash = "sha256:4f18ba62b61d7e192368b84517265a99b4d7ee8912f8708660fb4a366cc82667"}, -] - -[package.dependencies] -numpy = [ - {version = ">=1.22.4", markers = "python_version < \"3.11\""}, - {version = ">=1.23.2", markers = "python_version == \"3.11\""}, - {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, -] -python-dateutil = ">=2.8.2" -pytz = ">=2020.1" -tzdata = ">=2022.7" - -[package.extras] -all = ["PyQt5 (>=5.15.9)", "SQLAlchemy (>=2.0.0)", "adbc-driver-postgresql (>=0.8.0)", "adbc-driver-sqlite (>=0.8.0)", "beautifulsoup4 (>=4.11.2)", "bottleneck (>=1.3.6)", "dataframe-api-compat (>=0.1.7)", "fastparquet (>=2022.12.0)", "fsspec (>=2022.11.0)", "gcsfs (>=2022.11.0)", "html5lib (>=1.1)", "hypothesis (>=6.46.1)", "jinja2 (>=3.1.2)", "lxml (>=4.9.2)", "matplotlib (>=3.6.3)", "numba (>=0.56.4)", "numexpr (>=2.8.4)", "odfpy (>=1.4.1)", "openpyxl (>=3.1.0)", "pandas-gbq (>=0.19.0)", "psycopg2 (>=2.9.6)", "pyarrow (>=10.0.1)", "pymysql (>=1.0.2)", "pyreadstat (>=1.2.0)", "pytest (>=7.3.2)", "pytest-xdist (>=2.2.0)", "python-calamine (>=0.1.7)", "pyxlsb (>=1.0.10)", "qtpy (>=2.3.0)", "s3fs (>=2022.11.0)", "scipy (>=1.10.0)", "tables (>=3.8.0)", "tabulate (>=0.9.0)", "xarray (>=2022.12.0)", "xlrd (>=2.0.1)", "xlsxwriter (>=3.0.5)", "zstandard (>=0.19.0)"] -aws = ["s3fs (>=2022.11.0)"] -clipboard = ["PyQt5 (>=5.15.9)", "qtpy (>=2.3.0)"] -compression = ["zstandard (>=0.19.0)"] -computation = ["scipy (>=1.10.0)", "xarray (>=2022.12.0)"] -consortium-standard = ["dataframe-api-compat (>=0.1.7)"] -excel = ["odfpy (>=1.4.1)", "openpyxl (>=3.1.0)", "python-calamine (>=0.1.7)", "pyxlsb (>=1.0.10)", "xlrd (>=2.0.1)", "xlsxwriter (>=3.0.5)"] -feather = ["pyarrow (>=10.0.1)"] -fss = ["fsspec (>=2022.11.0)"] -gcp = ["gcsfs (>=2022.11.0)", "pandas-gbq (>=0.19.0)"] -hdf5 = ["tables (>=3.8.0)"] -html = ["beautifulsoup4 (>=4.11.2)", "html5lib (>=1.1)", "lxml (>=4.9.2)"] -mysql = ["SQLAlchemy (>=2.0.0)", "pymysql (>=1.0.2)"] -output-formatting = ["jinja2 (>=3.1.2)", "tabulate (>=0.9.0)"] -parquet = ["pyarrow (>=10.0.1)"] -performance = ["bottleneck (>=1.3.6)", "numba (>=0.56.4)", "numexpr (>=2.8.4)"] -plot = ["matplotlib (>=3.6.3)"] -postgresql = ["SQLAlchemy (>=2.0.0)", "adbc-driver-postgresql (>=0.8.0)", "psycopg2 (>=2.9.6)"] -pyarrow = ["pyarrow (>=10.0.1)"] -spss = ["pyreadstat (>=1.2.0)"] -sql-other = ["SQLAlchemy (>=2.0.0)", "adbc-driver-postgresql (>=0.8.0)", "adbc-driver-sqlite (>=0.8.0)"] -test = ["hypothesis (>=6.46.1)", "pytest (>=7.3.2)", "pytest-xdist (>=2.2.0)"] -xml = ["lxml (>=4.9.2)"] - [[package]] name = "peft" -version = "0.10.0" +version = "0.15.1" description = "Parameter-Efficient Fine-Tuning (PEFT)" -optional = true -python-versions = ">=3.8.0" +optional = false +python-versions = ">=3.9.0" +groups = ["main"] files = [ - {file = "peft-0.10.0-py3-none-any.whl", hash = 
"sha256:d5249c97e818d3e31f92553c73c2953acd0ec12649b8b749afff7152cbc86cbb"}, - {file = "peft-0.10.0.tar.gz", hash = "sha256:36a7628c15f88d37abb26cfc74c22468f9037ee02e9c9b65de943cfe7c672049"}, + {file = "peft-0.15.1-py3-none-any.whl", hash = "sha256:5fb3960beb518f00668f2cdc53424a5cc495c78281697821ce24609c90ca0a10"}, + {file = "peft-0.15.1.tar.gz", hash = "sha256:e4c65af70683a9ef3baf1ab450710f1eb7181f369ef6172ca8bf15bf4ae6ff71"}, ] [package.dependencies] accelerate = ">=0.21.0" -huggingface-hub = ">=0.17.0" +huggingface_hub = ">=0.25.0" numpy = ">=1.17" packaging = ">=20.0" psutil = "*" @@ -2166,100 +1568,108 @@ tqdm = "*" transformers = "*" [package.extras] -dev = ["black", "hf-doc-builder", "ruff (>=0.2.1,<0.3.0)"] +dev = ["black", "black", "hf-doc-builder", "hf-doc-builder", "ruff (>=0.9.2,<0.10.0)"] docs-specific = ["black", "hf-doc-builder"] -quality = ["black", "hf-doc-builder", "ruff (>=0.2.1,<0.3.0)"] -test = ["black", "datasets", "diffusers (<0.21.0)", "hf-doc-builder", "parameterized", "pytest", "pytest-cov", "pytest-xdist", "ruff (>=0.2.1,<0.3.0)", "scipy"] +quality = ["black", "hf-doc-builder", "ruff (>=0.9.2,<0.10.0)"] +test = ["black", "black", "datasets", "diffusers", "hf-doc-builder", "hf-doc-builder", "parameterized", "protobuf", "pytest", "pytest-cov", "pytest-xdist", "ruff (>=0.9.2,<0.10.0)", "scipy", "sentencepiece"] [[package]] name = "pillow" -version = "11.0.0" +version = "11.2.1" description = "Python Imaging Library (Fork)" optional = false python-versions = ">=3.9" +groups = ["main"] files = [ - {file = "pillow-11.0.0-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:6619654954dc4936fcff82db8eb6401d3159ec6be81e33c6000dfd76ae189947"}, - {file = "pillow-11.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:b3c5ac4bed7519088103d9450a1107f76308ecf91d6dabc8a33a2fcfb18d0fba"}, - {file = "pillow-11.0.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a65149d8ada1055029fcb665452b2814fe7d7082fcb0c5bed6db851cb69b2086"}, - {file = "pillow-11.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:88a58d8ac0cc0e7f3a014509f0455248a76629ca9b604eca7dc5927cc593c5e9"}, - {file = "pillow-11.0.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:c26845094b1af3c91852745ae78e3ea47abf3dbcd1cf962f16b9a5fbe3ee8488"}, - {file = "pillow-11.0.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:1a61b54f87ab5786b8479f81c4b11f4d61702830354520837f8cc791ebba0f5f"}, - {file = "pillow-11.0.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:674629ff60030d144b7bca2b8330225a9b11c482ed408813924619c6f302fdbb"}, - {file = "pillow-11.0.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:598b4e238f13276e0008299bd2482003f48158e2b11826862b1eb2ad7c768b97"}, - {file = "pillow-11.0.0-cp310-cp310-win32.whl", hash = "sha256:9a0f748eaa434a41fccf8e1ee7a3eed68af1b690e75328fd7a60af123c193b50"}, - {file = "pillow-11.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:a5629742881bcbc1f42e840af185fd4d83a5edeb96475a575f4da50d6ede337c"}, - {file = "pillow-11.0.0-cp310-cp310-win_arm64.whl", hash = "sha256:ee217c198f2e41f184f3869f3e485557296d505b5195c513b2bfe0062dc537f1"}, - {file = "pillow-11.0.0-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:1c1d72714f429a521d8d2d018badc42414c3077eb187a59579f28e4270b4b0fc"}, - {file = "pillow-11.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:499c3a1b0d6fc8213519e193796eb1a86a1be4b1877d678b30f83fd979811d1a"}, - {file = "pillow-11.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:c8b2351c85d855293a299038e1f89db92a2f35e8d2f783489c6f0b2b5f3fe8a3"}, - {file = "pillow-11.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6f4dba50cfa56f910241eb7f883c20f1e7b1d8f7d91c750cd0b318bad443f4d5"}, - {file = "pillow-11.0.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:5ddbfd761ee00c12ee1be86c9c0683ecf5bb14c9772ddbd782085779a63dd55b"}, - {file = "pillow-11.0.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:45c566eb10b8967d71bf1ab8e4a525e5a93519e29ea071459ce517f6b903d7fa"}, - {file = "pillow-11.0.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:b4fd7bd29610a83a8c9b564d457cf5bd92b4e11e79a4ee4716a63c959699b306"}, - {file = "pillow-11.0.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:cb929ca942d0ec4fac404cbf520ee6cac37bf35be479b970c4ffadf2b6a1cad9"}, - {file = "pillow-11.0.0-cp311-cp311-win32.whl", hash = "sha256:006bcdd307cc47ba43e924099a038cbf9591062e6c50e570819743f5607404f5"}, - {file = "pillow-11.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:52a2d8323a465f84faaba5236567d212c3668f2ab53e1c74c15583cf507a0291"}, - {file = "pillow-11.0.0-cp311-cp311-win_arm64.whl", hash = "sha256:16095692a253047fe3ec028e951fa4221a1f3ed3d80c397e83541a3037ff67c9"}, - {file = "pillow-11.0.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:d2c0a187a92a1cb5ef2c8ed5412dd8d4334272617f532d4ad4de31e0495bd923"}, - {file = "pillow-11.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:084a07ef0821cfe4858fe86652fffac8e187b6ae677e9906e192aafcc1b69903"}, - {file = "pillow-11.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8069c5179902dcdce0be9bfc8235347fdbac249d23bd90514b7a47a72d9fecf4"}, - {file = "pillow-11.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f02541ef64077f22bf4924f225c0fd1248c168f86e4b7abdedd87d6ebaceab0f"}, - {file = "pillow-11.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:fcb4621042ac4b7865c179bb972ed0da0218a076dc1820ffc48b1d74c1e37fe9"}, - {file = "pillow-11.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:00177a63030d612148e659b55ba99527803288cea7c75fb05766ab7981a8c1b7"}, - {file = "pillow-11.0.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8853a3bf12afddfdf15f57c4b02d7ded92c7a75a5d7331d19f4f9572a89c17e6"}, - {file = "pillow-11.0.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:3107c66e43bda25359d5ef446f59c497de2b5ed4c7fdba0894f8d6cf3822dafc"}, - {file = "pillow-11.0.0-cp312-cp312-win32.whl", hash = "sha256:86510e3f5eca0ab87429dd77fafc04693195eec7fd6a137c389c3eeb4cfb77c6"}, - {file = "pillow-11.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:8ec4a89295cd6cd4d1058a5e6aec6bf51e0eaaf9714774e1bfac7cfc9051db47"}, - {file = "pillow-11.0.0-cp312-cp312-win_arm64.whl", hash = "sha256:27a7860107500d813fcd203b4ea19b04babe79448268403172782754870dac25"}, - {file = "pillow-11.0.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:bcd1fb5bb7b07f64c15618c89efcc2cfa3e95f0e3bcdbaf4642509de1942a699"}, - {file = "pillow-11.0.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:0e038b0745997c7dcaae350d35859c9715c71e92ffb7e0f4a8e8a16732150f38"}, - {file = "pillow-11.0.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0ae08bd8ffc41aebf578c2af2f9d8749d91f448b3bfd41d7d9ff573d74f2a6b2"}, - {file = "pillow-11.0.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d69bfd8ec3219ae71bcde1f942b728903cad25fafe3100ba2258b973bd2bc1b2"}, - {file = 
"pillow-11.0.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:61b887f9ddba63ddf62fd02a3ba7add935d053b6dd7d58998c630e6dbade8527"}, - {file = "pillow-11.0.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:c6a660307ca9d4867caa8d9ca2c2658ab685de83792d1876274991adec7b93fa"}, - {file = "pillow-11.0.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:73e3a0200cdda995c7e43dd47436c1548f87a30bb27fb871f352a22ab8dcf45f"}, - {file = "pillow-11.0.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:fba162b8872d30fea8c52b258a542c5dfd7b235fb5cb352240c8d63b414013eb"}, - {file = "pillow-11.0.0-cp313-cp313-win32.whl", hash = "sha256:f1b82c27e89fffc6da125d5eb0ca6e68017faf5efc078128cfaa42cf5cb38798"}, - {file = "pillow-11.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:8ba470552b48e5835f1d23ecb936bb7f71d206f9dfeee64245f30c3270b994de"}, - {file = "pillow-11.0.0-cp313-cp313-win_arm64.whl", hash = "sha256:846e193e103b41e984ac921b335df59195356ce3f71dcfd155aa79c603873b84"}, - {file = "pillow-11.0.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:4ad70c4214f67d7466bea6a08061eba35c01b1b89eaa098040a35272a8efb22b"}, - {file = "pillow-11.0.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:6ec0d5af64f2e3d64a165f490d96368bb5dea8b8f9ad04487f9ab60dc4bb6003"}, - {file = "pillow-11.0.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c809a70e43c7977c4a42aefd62f0131823ebf7dd73556fa5d5950f5b354087e2"}, - {file = "pillow-11.0.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:4b60c9520f7207aaf2e1d94de026682fc227806c6e1f55bba7606d1c94dd623a"}, - {file = "pillow-11.0.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:1e2688958a840c822279fda0086fec1fdab2f95bf2b717b66871c4ad9859d7e8"}, - {file = "pillow-11.0.0-cp313-cp313t-win32.whl", hash = "sha256:607bbe123c74e272e381a8d1957083a9463401f7bd01287f50521ecb05a313f8"}, - {file = "pillow-11.0.0-cp313-cp313t-win_amd64.whl", hash = "sha256:5c39ed17edea3bc69c743a8dd3e9853b7509625c2462532e62baa0732163a904"}, - {file = "pillow-11.0.0-cp313-cp313t-win_arm64.whl", hash = "sha256:75acbbeb05b86bc53cbe7b7e6fe00fbcf82ad7c684b3ad82e3d711da9ba287d3"}, - {file = "pillow-11.0.0-cp39-cp39-macosx_10_10_x86_64.whl", hash = "sha256:2e46773dc9f35a1dd28bd6981332fd7f27bec001a918a72a79b4133cf5291dba"}, - {file = "pillow-11.0.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:2679d2258b7f1192b378e2893a8a0a0ca472234d4c2c0e6bdd3380e8dfa21b6a"}, - {file = "pillow-11.0.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:eda2616eb2313cbb3eebbe51f19362eb434b18e3bb599466a1ffa76a033fb916"}, - {file = "pillow-11.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:20ec184af98a121fb2da42642dea8a29ec80fc3efbaefb86d8fdd2606619045d"}, - {file = "pillow-11.0.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:8594f42df584e5b4bb9281799698403f7af489fba84c34d53d1c4bfb71b7c4e7"}, - {file = "pillow-11.0.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:c12b5ae868897c7338519c03049a806af85b9b8c237b7d675b8c5e089e4a618e"}, - {file = "pillow-11.0.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:70fbbdacd1d271b77b7721fe3cdd2d537bbbd75d29e6300c672ec6bb38d9672f"}, - {file = "pillow-11.0.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:5178952973e588b3f1360868847334e9e3bf49d19e169bbbdfaf8398002419ae"}, - {file = "pillow-11.0.0-cp39-cp39-win32.whl", hash = "sha256:8c676b587da5673d3c75bd67dd2a8cdfeb282ca38a30f37950511766b26858c4"}, - {file = "pillow-11.0.0-cp39-cp39-win_amd64.whl", hash = 
"sha256:94f3e1780abb45062287b4614a5bc0874519c86a777d4a7ad34978e86428b8dd"}, - {file = "pillow-11.0.0-cp39-cp39-win_arm64.whl", hash = "sha256:290f2cc809f9da7d6d622550bbf4c1e57518212da51b6a30fe8e0a270a5b78bd"}, - {file = "pillow-11.0.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:1187739620f2b365de756ce086fdb3604573337cc28a0d3ac4a01ab6b2d2a6d2"}, - {file = "pillow-11.0.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:fbbcb7b57dc9c794843e3d1258c0fbf0f48656d46ffe9e09b63bbd6e8cd5d0a2"}, - {file = "pillow-11.0.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5d203af30149ae339ad1b4f710d9844ed8796e97fda23ffbc4cc472968a47d0b"}, - {file = "pillow-11.0.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:21a0d3b115009ebb8ac3d2ebec5c2982cc693da935f4ab7bb5c8ebe2f47d36f2"}, - {file = "pillow-11.0.0-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:73853108f56df97baf2bb8b522f3578221e56f646ba345a372c78326710d3830"}, - {file = "pillow-11.0.0-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:e58876c91f97b0952eb766123bfef372792ab3f4e3e1f1a2267834c2ab131734"}, - {file = "pillow-11.0.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:224aaa38177597bb179f3ec87eeefcce8e4f85e608025e9cfac60de237ba6316"}, - {file = "pillow-11.0.0-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:5bd2d3bdb846d757055910f0a59792d33b555800813c3b39ada1829c372ccb06"}, - {file = "pillow-11.0.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:375b8dd15a1f5d2feafff536d47e22f69625c1aa92f12b339ec0b2ca40263273"}, - {file = "pillow-11.0.0-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:daffdf51ee5db69a82dd127eabecce20729e21f7a3680cf7cbb23f0829189790"}, - {file = "pillow-11.0.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:7326a1787e3c7b0429659e0a944725e1b03eeaa10edd945a86dead1913383944"}, - {file = "pillow-11.0.0.tar.gz", hash = "sha256:72bacbaf24ac003fea9bff9837d1eedb6088758d41e100c1552930151f677739"}, + {file = "pillow-11.2.1-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:d57a75d53922fc20c165016a20d9c44f73305e67c351bbc60d1adaf662e74047"}, + {file = "pillow-11.2.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:127bf6ac4a5b58b3d32fc8289656f77f80567d65660bc46f72c0d77e6600cc95"}, + {file = "pillow-11.2.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b4ba4be812c7a40280629e55ae0b14a0aafa150dd6451297562e1764808bbe61"}, + {file = "pillow-11.2.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c8bd62331e5032bc396a93609982a9ab6b411c05078a52f5fe3cc59234a3abd1"}, + {file = "pillow-11.2.1-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:562d11134c97a62fe3af29581f083033179f7ff435f78392565a1ad2d1c2c45c"}, + {file = "pillow-11.2.1-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:c97209e85b5be259994eb5b69ff50c5d20cca0f458ef9abd835e262d9d88b39d"}, + {file = "pillow-11.2.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:0c3e6d0f59171dfa2e25d7116217543310908dfa2770aa64b8f87605f8cacc97"}, + {file = "pillow-11.2.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:cc1c3bc53befb6096b84165956e886b1729634a799e9d6329a0c512ab651e579"}, + {file = "pillow-11.2.1-cp310-cp310-win32.whl", hash = "sha256:312c77b7f07ab2139924d2639860e084ec2a13e72af54d4f08ac843a5fc9c79d"}, + {file = "pillow-11.2.1-cp310-cp310-win_amd64.whl", hash = "sha256:9bc7ae48b8057a611e5fe9f853baa88093b9a76303937449397899385da06fad"}, + {file 
= "pillow-11.2.1-cp310-cp310-win_arm64.whl", hash = "sha256:2728567e249cdd939f6cc3d1f049595c66e4187f3c34078cbc0a7d21c47482d2"}, + {file = "pillow-11.2.1-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:35ca289f712ccfc699508c4658a1d14652e8033e9b69839edf83cbdd0ba39e70"}, + {file = "pillow-11.2.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e0409af9f829f87a2dfb7e259f78f317a5351f2045158be321fd135973fff7bf"}, + {file = "pillow-11.2.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d4e5c5edee874dce4f653dbe59db7c73a600119fbea8d31f53423586ee2aafd7"}, + {file = "pillow-11.2.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b93a07e76d13bff9444f1a029e0af2964e654bfc2e2c2d46bfd080df5ad5f3d8"}, + {file = "pillow-11.2.1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:e6def7eed9e7fa90fde255afaf08060dc4b343bbe524a8f69bdd2a2f0018f600"}, + {file = "pillow-11.2.1-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:8f4f3724c068be008c08257207210c138d5f3731af6c155a81c2b09a9eb3a788"}, + {file = "pillow-11.2.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:a0a6709b47019dff32e678bc12c63008311b82b9327613f534e496dacaefb71e"}, + {file = "pillow-11.2.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:f6b0c664ccb879109ee3ca702a9272d877f4fcd21e5eb63c26422fd6e415365e"}, + {file = "pillow-11.2.1-cp311-cp311-win32.whl", hash = "sha256:cc5d875d56e49f112b6def6813c4e3d3036d269c008bf8aef72cd08d20ca6df6"}, + {file = "pillow-11.2.1-cp311-cp311-win_amd64.whl", hash = "sha256:0f5c7eda47bf8e3c8a283762cab94e496ba977a420868cb819159980b6709193"}, + {file = "pillow-11.2.1-cp311-cp311-win_arm64.whl", hash = "sha256:4d375eb838755f2528ac8cbc926c3e31cc49ca4ad0cf79cff48b20e30634a4a7"}, + {file = "pillow-11.2.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:78afba22027b4accef10dbd5eed84425930ba41b3ea0a86fa8d20baaf19d807f"}, + {file = "pillow-11.2.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:78092232a4ab376a35d68c4e6d5e00dfd73454bd12b230420025fbe178ee3b0b"}, + {file = "pillow-11.2.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:25a5f306095c6780c52e6bbb6109624b95c5b18e40aab1c3041da3e9e0cd3e2d"}, + {file = "pillow-11.2.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0c7b29dbd4281923a2bfe562acb734cee96bbb129e96e6972d315ed9f232bef4"}, + {file = "pillow-11.2.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:3e645b020f3209a0181a418bffe7b4a93171eef6c4ef6cc20980b30bebf17b7d"}, + {file = "pillow-11.2.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:b2dbea1012ccb784a65349f57bbc93730b96e85b42e9bf7b01ef40443db720b4"}, + {file = "pillow-11.2.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:da3104c57bbd72948d75f6a9389e6727d2ab6333c3617f0a89d72d4940aa0443"}, + {file = "pillow-11.2.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:598174aef4589af795f66f9caab87ba4ff860ce08cd5bb447c6fc553ffee603c"}, + {file = "pillow-11.2.1-cp312-cp312-win32.whl", hash = "sha256:1d535df14716e7f8776b9e7fee118576d65572b4aad3ed639be9e4fa88a1cad3"}, + {file = "pillow-11.2.1-cp312-cp312-win_amd64.whl", hash = "sha256:14e33b28bf17c7a38eede290f77db7c664e4eb01f7869e37fa98a5aa95978941"}, + {file = "pillow-11.2.1-cp312-cp312-win_arm64.whl", hash = "sha256:21e1470ac9e5739ff880c211fc3af01e3ae505859392bf65458c224d0bf283eb"}, + {file = "pillow-11.2.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:fdec757fea0b793056419bca3e9932eb2b0ceec90ef4813ea4c1e072c389eb28"}, + {file = 
"pillow-11.2.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:b0e130705d568e2f43a17bcbe74d90958e8a16263868a12c3e0d9c8162690830"}, + {file = "pillow-11.2.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7bdb5e09068332578214cadd9c05e3d64d99e0e87591be22a324bdbc18925be0"}, + {file = "pillow-11.2.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d189ba1bebfbc0c0e529159631ec72bb9e9bc041f01ec6d3233d6d82eb823bc1"}, + {file = "pillow-11.2.1-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:191955c55d8a712fab8934a42bfefbf99dd0b5875078240943f913bb66d46d9f"}, + {file = "pillow-11.2.1-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:ad275964d52e2243430472fc5d2c2334b4fc3ff9c16cb0a19254e25efa03a155"}, + {file = "pillow-11.2.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:750f96efe0597382660d8b53e90dd1dd44568a8edb51cb7f9d5d918b80d4de14"}, + {file = "pillow-11.2.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:fe15238d3798788d00716637b3d4e7bb6bde18b26e5d08335a96e88564a36b6b"}, + {file = "pillow-11.2.1-cp313-cp313-win32.whl", hash = "sha256:3fe735ced9a607fee4f481423a9c36701a39719252a9bb251679635f99d0f7d2"}, + {file = "pillow-11.2.1-cp313-cp313-win_amd64.whl", hash = "sha256:74ee3d7ecb3f3c05459ba95eed5efa28d6092d751ce9bf20e3e253a4e497e691"}, + {file = "pillow-11.2.1-cp313-cp313-win_arm64.whl", hash = "sha256:5119225c622403afb4b44bad4c1ca6c1f98eed79db8d3bc6e4e160fc6339d66c"}, + {file = "pillow-11.2.1-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:8ce2e8411c7aaef53e6bb29fe98f28cd4fbd9a1d9be2eeea434331aac0536b22"}, + {file = "pillow-11.2.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:9ee66787e095127116d91dea2143db65c7bb1e232f617aa5957c0d9d2a3f23a7"}, + {file = "pillow-11.2.1-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9622e3b6c1d8b551b6e6f21873bdcc55762b4b2126633014cea1803368a9aa16"}, + {file = "pillow-11.2.1-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:63b5dff3a68f371ea06025a1a6966c9a1e1ee452fc8020c2cd0ea41b83e9037b"}, + {file = "pillow-11.2.1-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:31df6e2d3d8fc99f993fd253e97fae451a8db2e7207acf97859732273e108406"}, + {file = "pillow-11.2.1-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:062b7a42d672c45a70fa1f8b43d1d38ff76b63421cbbe7f88146b39e8a558d91"}, + {file = "pillow-11.2.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:4eb92eca2711ef8be42fd3f67533765d9fd043b8c80db204f16c8ea62ee1a751"}, + {file = "pillow-11.2.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:f91ebf30830a48c825590aede79376cb40f110b387c17ee9bd59932c961044f9"}, + {file = "pillow-11.2.1-cp313-cp313t-win32.whl", hash = "sha256:e0b55f27f584ed623221cfe995c912c61606be8513bfa0e07d2c674b4516d9dd"}, + {file = "pillow-11.2.1-cp313-cp313t-win_amd64.whl", hash = "sha256:36d6b82164c39ce5482f649b437382c0fb2395eabc1e2b1702a6deb8ad647d6e"}, + {file = "pillow-11.2.1-cp313-cp313t-win_arm64.whl", hash = "sha256:225c832a13326e34f212d2072982bb1adb210e0cc0b153e688743018c94a2681"}, + {file = "pillow-11.2.1-cp39-cp39-macosx_10_10_x86_64.whl", hash = "sha256:7491cf8a79b8eb867d419648fff2f83cb0b3891c8b36da92cc7f1931d46108c8"}, + {file = "pillow-11.2.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:8b02d8f9cb83c52578a0b4beadba92e37d83a4ef11570a8688bbf43f4ca50909"}, + {file = "pillow-11.2.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:014ca0050c85003620526b0ac1ac53f56fc93af128f7546623cc8e31875ab928"}, + {file = "pillow-11.2.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3692b68c87096ac6308296d96354eddd25f98740c9d2ab54e1549d6c8aea9d79"}, + {file = "pillow-11.2.1-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:f781dcb0bc9929adc77bad571b8621ecb1e4cdef86e940fe2e5b5ee24fd33b35"}, + {file = "pillow-11.2.1-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:2b490402c96f907a166615e9a5afacf2519e28295f157ec3a2bb9bd57de638cb"}, + {file = "pillow-11.2.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:dd6b20b93b3ccc9c1b597999209e4bc5cf2853f9ee66e3fc9a400a78733ffc9a"}, + {file = "pillow-11.2.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:4b835d89c08a6c2ee7781b8dd0a30209a8012b5f09c0a665b65b0eb3560b6f36"}, + {file = "pillow-11.2.1-cp39-cp39-win32.whl", hash = "sha256:b10428b3416d4f9c61f94b494681280be7686bda15898a3a9e08eb66a6d92d67"}, + {file = "pillow-11.2.1-cp39-cp39-win_amd64.whl", hash = "sha256:6ebce70c3f486acf7591a3d73431fa504a4e18a9b97ff27f5f47b7368e4b9dd1"}, + {file = "pillow-11.2.1-cp39-cp39-win_arm64.whl", hash = "sha256:c27476257b2fdcd7872d54cfd119b3a9ce4610fb85c8e32b70b42e3680a29a1e"}, + {file = "pillow-11.2.1-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:9b7b0d4fd2635f54ad82785d56bc0d94f147096493a79985d0ab57aedd563156"}, + {file = "pillow-11.2.1-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:aa442755e31c64037aa7c1cb186e0b369f8416c567381852c63444dd666fb772"}, + {file = "pillow-11.2.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f0d3348c95b766f54b76116d53d4cb171b52992a1027e7ca50c81b43b9d9e363"}, + {file = "pillow-11.2.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:85d27ea4c889342f7e35f6d56e7e1cb345632ad592e8c51b693d7b7556043ce0"}, + {file = "pillow-11.2.1-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:bf2c33d6791c598142f00c9c4c7d47f6476731c31081331664eb26d6ab583e01"}, + {file = "pillow-11.2.1-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:e616e7154c37669fc1dfc14584f11e284e05d1c650e1c0f972f281c4ccc53193"}, + {file = "pillow-11.2.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:39ad2e0f424394e3aebc40168845fee52df1394a4673a6ee512d840d14ab3013"}, + {file = "pillow-11.2.1-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:80f1df8dbe9572b4b7abdfa17eb5d78dd620b1d55d9e25f834efdbee872d3aed"}, + {file = "pillow-11.2.1-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:ea926cfbc3957090becbcbbb65ad177161a2ff2ad578b5a6ec9bb1e1cd78753c"}, + {file = "pillow-11.2.1-pp311-pypy311_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:738db0e0941ca0376804d4de6a782c005245264edaa253ffce24e5a15cbdc7bd"}, + {file = "pillow-11.2.1-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9db98ab6565c69082ec9b0d4e40dd9f6181dab0dd236d26f7a50b8b9bfbd5076"}, + {file = "pillow-11.2.1-pp311-pypy311_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:036e53f4170e270ddb8797d4c590e6dd14d28e15c7da375c18978045f7e6c37b"}, + {file = "pillow-11.2.1-pp311-pypy311_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:14f73f7c291279bd65fda51ee87affd7c1e097709f7fdd0188957a16c264601f"}, + {file = "pillow-11.2.1-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:208653868d5c9ecc2b327f9b9ef34e0e42a4cdd172c2988fd81d62d2bc9bc044"}, + {file = "pillow-11.2.1.tar.gz", hash = 
"sha256:a64dd61998416367b7ef979b73d3a85853ba9bec4c2925f74e588879a58716b6"}, ] [package.extras] -docs = ["furo", "olefile", "sphinx (>=8.1)", "sphinx-copybutton", "sphinx-inline-tabs", "sphinxext-opengraph"] +docs = ["furo", "olefile", "sphinx (>=8.2)", "sphinx-copybutton", "sphinx-inline-tabs", "sphinxext-opengraph"] fpx = ["olefile"] mic = ["olefile"] -tests = ["check-manifest", "coverage", "defusedxml", "markdown2", "olefile", "packaging", "pyroma", "pytest", "pytest-cov", "pytest-timeout"] +test-arrow = ["pyarrow"] +tests = ["check-manifest", "coverage (>=7.4.2)", "defusedxml", "markdown2", "olefile", "packaging", "pyroma", "pytest", "pytest-cov", "pytest-timeout", "trove-classifiers (>=2024.10.12)"] typing = ["typing-extensions"] xmp = ["defusedxml"] @@ -2269,6 +1679,7 @@ version = "1.5.0" description = "plugin and hook calling mechanisms for python" optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669"}, {file = "pluggy-1.5.0.tar.gz", hash = "sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1"}, @@ -2280,173 +1691,62 @@ testing = ["pytest", "pytest-benchmark"] [[package]] name = "prometheus-client" -version = "0.20.0" +version = "0.21.1" description = "Python client for the Prometheus monitoring system." optional = false python-versions = ">=3.8" +groups = ["main"] files = [ - {file = "prometheus_client-0.20.0-py3-none-any.whl", hash = "sha256:cde524a85bce83ca359cc837f28b8c0db5cac7aa653a588fd7e84ba061c329e7"}, - {file = "prometheus_client-0.20.0.tar.gz", hash = "sha256:287629d00b147a32dcb2be0b9df905da599b2d82f80377083ec8463309a4bb89"}, + {file = "prometheus_client-0.21.1-py3-none-any.whl", hash = "sha256:594b45c410d6f4f8888940fe80b5cc2521b305a1fafe1c58609ef715a001f301"}, + {file = "prometheus_client-0.21.1.tar.gz", hash = "sha256:252505a722ac04b0456be05c05f75f45d760c2911ffc45f2a06bcaed9f3ae3fb"}, ] [package.extras] twisted = ["twisted"] -[[package]] -name = "propcache" -version = "0.2.0" -description = "Accelerated property cache" -optional = false -python-versions = ">=3.8" -files = [ - {file = "propcache-0.2.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:c5869b8fd70b81835a6f187c5fdbe67917a04d7e52b6e7cc4e5fe39d55c39d58"}, - {file = "propcache-0.2.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:952e0d9d07609d9c5be361f33b0d6d650cd2bae393aabb11d9b719364521984b"}, - {file = "propcache-0.2.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:33ac8f098df0585c0b53009f039dfd913b38c1d2edafed0cedcc0c32a05aa110"}, - {file = "propcache-0.2.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:97e48e8875e6c13909c800fa344cd54cc4b2b0db1d5f911f840458a500fde2c2"}, - {file = "propcache-0.2.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:388f3217649d6d59292b722d940d4d2e1e6a7003259eb835724092a1cca0203a"}, - {file = "propcache-0.2.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f571aea50ba5623c308aa146eb650eebf7dbe0fd8c5d946e28343cb3b5aad577"}, - {file = "propcache-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3dfafb44f7bb35c0c06eda6b2ab4bfd58f02729e7c4045e179f9a861b07c9850"}, - {file = "propcache-0.2.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a3ebe9a75be7ab0b7da2464a77bb27febcb4fab46a34f9288f39d74833db7f61"}, - {file = 
"propcache-0.2.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:d2f0d0f976985f85dfb5f3d685697ef769faa6b71993b46b295cdbbd6be8cc37"}, - {file = "propcache-0.2.0-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:a3dc1a4b165283bd865e8f8cb5f0c64c05001e0718ed06250d8cac9bec115b48"}, - {file = "propcache-0.2.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:9e0f07b42d2a50c7dd2d8675d50f7343d998c64008f1da5fef888396b7f84630"}, - {file = "propcache-0.2.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:e63e3e1e0271f374ed489ff5ee73d4b6e7c60710e1f76af5f0e1a6117cd26394"}, - {file = "propcache-0.2.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:56bb5c98f058a41bb58eead194b4db8c05b088c93d94d5161728515bd52b052b"}, - {file = "propcache-0.2.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:7665f04d0c7f26ff8bb534e1c65068409bf4687aa2534faf7104d7182debb336"}, - {file = "propcache-0.2.0-cp310-cp310-win32.whl", hash = "sha256:7cf18abf9764746b9c8704774d8b06714bcb0a63641518a3a89c7f85cc02c2ad"}, - {file = "propcache-0.2.0-cp310-cp310-win_amd64.whl", hash = "sha256:cfac69017ef97db2438efb854edf24f5a29fd09a536ff3a992b75990720cdc99"}, - {file = "propcache-0.2.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:63f13bf09cc3336eb04a837490b8f332e0db41da66995c9fd1ba04552e516354"}, - {file = "propcache-0.2.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:608cce1da6f2672a56b24a015b42db4ac612ee709f3d29f27a00c943d9e851de"}, - {file = "propcache-0.2.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:466c219deee4536fbc83c08d09115249db301550625c7fef1c5563a584c9bc87"}, - {file = "propcache-0.2.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fc2db02409338bf36590aa985a461b2c96fce91f8e7e0f14c50c5fcc4f229016"}, - {file = "propcache-0.2.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a6ed8db0a556343d566a5c124ee483ae113acc9a557a807d439bcecc44e7dfbb"}, - {file = "propcache-0.2.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:91997d9cb4a325b60d4e3f20967f8eb08dfcb32b22554d5ef78e6fd1dda743a2"}, - {file = "propcache-0.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4c7dde9e533c0a49d802b4f3f218fa9ad0a1ce21f2c2eb80d5216565202acab4"}, - {file = "propcache-0.2.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ffcad6c564fe6b9b8916c1aefbb37a362deebf9394bd2974e9d84232e3e08504"}, - {file = "propcache-0.2.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:97a58a28bcf63284e8b4d7b460cbee1edaab24634e82059c7b8c09e65284f178"}, - {file = "propcache-0.2.0-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:945db8ee295d3af9dbdbb698cce9bbc5c59b5c3fe328bbc4387f59a8a35f998d"}, - {file = "propcache-0.2.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:39e104da444a34830751715f45ef9fc537475ba21b7f1f5b0f4d71a3b60d7fe2"}, - {file = "propcache-0.2.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:c5ecca8f9bab618340c8e848d340baf68bcd8ad90a8ecd7a4524a81c1764b3db"}, - {file = "propcache-0.2.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:c436130cc779806bdf5d5fae0d848713105472b8566b75ff70048c47d3961c5b"}, - {file = "propcache-0.2.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:191db28dc6dcd29d1a3e063c3be0b40688ed76434622c53a284e5427565bbd9b"}, - {file = "propcache-0.2.0-cp311-cp311-win32.whl", hash = "sha256:5f2564ec89058ee7c7989a7b719115bdfe2a2fb8e7a4543b8d1c0cc4cf6478c1"}, - {file = 
"propcache-0.2.0-cp311-cp311-win_amd64.whl", hash = "sha256:6e2e54267980349b723cff366d1e29b138b9a60fa376664a157a342689553f71"}, - {file = "propcache-0.2.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:2ee7606193fb267be4b2e3b32714f2d58cad27217638db98a60f9efb5efeccc2"}, - {file = "propcache-0.2.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:91ee8fc02ca52e24bcb77b234f22afc03288e1dafbb1f88fe24db308910c4ac7"}, - {file = "propcache-0.2.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2e900bad2a8456d00a113cad8c13343f3b1f327534e3589acc2219729237a2e8"}, - {file = "propcache-0.2.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f52a68c21363c45297aca15561812d542f8fc683c85201df0bebe209e349f793"}, - {file = "propcache-0.2.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1e41d67757ff4fbc8ef2af99b338bfb955010444b92929e9e55a6d4dcc3c4f09"}, - {file = "propcache-0.2.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a64e32f8bd94c105cc27f42d3b658902b5bcc947ece3c8fe7bc1b05982f60e89"}, - {file = "propcache-0.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:55346705687dbd7ef0d77883ab4f6fabc48232f587925bdaf95219bae072491e"}, - {file = "propcache-0.2.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:00181262b17e517df2cd85656fcd6b4e70946fe62cd625b9d74ac9977b64d8d9"}, - {file = "propcache-0.2.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6994984550eaf25dd7fc7bd1b700ff45c894149341725bb4edc67f0ffa94efa4"}, - {file = "propcache-0.2.0-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:56295eb1e5f3aecd516d91b00cfd8bf3a13991de5a479df9e27dd569ea23959c"}, - {file = "propcache-0.2.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:439e76255daa0f8151d3cb325f6dd4a3e93043e6403e6491813bcaaaa8733887"}, - {file = "propcache-0.2.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:f6475a1b2ecb310c98c28d271a30df74f9dd436ee46d09236a6b750a7599ce57"}, - {file = "propcache-0.2.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:3444cdba6628accf384e349014084b1cacd866fbb88433cd9d279d90a54e0b23"}, - {file = "propcache-0.2.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:4a9d9b4d0a9b38d1c391bb4ad24aa65f306c6f01b512e10a8a34a2dc5675d348"}, - {file = "propcache-0.2.0-cp312-cp312-win32.whl", hash = "sha256:69d3a98eebae99a420d4b28756c8ce6ea5a29291baf2dc9ff9414b42676f61d5"}, - {file = "propcache-0.2.0-cp312-cp312-win_amd64.whl", hash = "sha256:ad9c9b99b05f163109466638bd30ada1722abb01bbb85c739c50b6dc11f92dc3"}, - {file = "propcache-0.2.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:ecddc221a077a8132cf7c747d5352a15ed763b674c0448d811f408bf803d9ad7"}, - {file = "propcache-0.2.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:0e53cb83fdd61cbd67202735e6a6687a7b491c8742dfc39c9e01e80354956763"}, - {file = "propcache-0.2.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:92fe151145a990c22cbccf9ae15cae8ae9eddabfc949a219c9f667877e40853d"}, - {file = "propcache-0.2.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d6a21ef516d36909931a2967621eecb256018aeb11fc48656e3257e73e2e247a"}, - {file = "propcache-0.2.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3f88a4095e913f98988f5b338c1d4d5d07dbb0b6bad19892fd447484e483ba6b"}, - {file = "propcache-0.2.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:5a5b3bb545ead161be780ee85a2b54fdf7092815995661947812dde94a40f6fb"}, - {file = "propcache-0.2.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:67aeb72e0f482709991aa91345a831d0b707d16b0257e8ef88a2ad246a7280bf"}, - {file = "propcache-0.2.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3c997f8c44ec9b9b0bcbf2d422cc00a1d9b9c681f56efa6ca149a941e5560da2"}, - {file = "propcache-0.2.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:2a66df3d4992bc1d725b9aa803e8c5a66c010c65c741ad901e260ece77f58d2f"}, - {file = "propcache-0.2.0-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:3ebbcf2a07621f29638799828b8d8668c421bfb94c6cb04269130d8de4fb7136"}, - {file = "propcache-0.2.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:1235c01ddaa80da8235741e80815ce381c5267f96cc49b1477fdcf8c047ef325"}, - {file = "propcache-0.2.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:3947483a381259c06921612550867b37d22e1df6d6d7e8361264b6d037595f44"}, - {file = "propcache-0.2.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:d5bed7f9805cc29c780f3aee05de3262ee7ce1f47083cfe9f77471e9d6777e83"}, - {file = "propcache-0.2.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:e4a91d44379f45f5e540971d41e4626dacd7f01004826a18cb048e7da7e96544"}, - {file = "propcache-0.2.0-cp313-cp313-win32.whl", hash = "sha256:f902804113e032e2cdf8c71015651c97af6418363bea8d78dc0911d56c335032"}, - {file = "propcache-0.2.0-cp313-cp313-win_amd64.whl", hash = "sha256:8f188cfcc64fb1266f4684206c9de0e80f54622c3f22a910cbd200478aeae61e"}, - {file = "propcache-0.2.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:53d1bd3f979ed529f0805dd35ddaca330f80a9a6d90bc0121d2ff398f8ed8861"}, - {file = "propcache-0.2.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:83928404adf8fb3d26793665633ea79b7361efa0287dfbd372a7e74311d51ee6"}, - {file = "propcache-0.2.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:77a86c261679ea5f3896ec060be9dc8e365788248cc1e049632a1be682442063"}, - {file = "propcache-0.2.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:218db2a3c297a3768c11a34812e63b3ac1c3234c3a086def9c0fee50d35add1f"}, - {file = "propcache-0.2.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7735e82e3498c27bcb2d17cb65d62c14f1100b71723b68362872bca7d0913d90"}, - {file = "propcache-0.2.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:20a617c776f520c3875cf4511e0d1db847a076d720714ae35ffe0df3e440be68"}, - {file = "propcache-0.2.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:67b69535c870670c9f9b14a75d28baa32221d06f6b6fa6f77a0a13c5a7b0a5b9"}, - {file = "propcache-0.2.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4569158070180c3855e9c0791c56be3ceeb192defa2cdf6a3f39e54319e56b89"}, - {file = "propcache-0.2.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:db47514ffdbd91ccdc7e6f8407aac4ee94cc871b15b577c1c324236b013ddd04"}, - {file = "propcache-0.2.0-cp38-cp38-musllinux_1_2_armv7l.whl", hash = "sha256:2a60ad3e2553a74168d275a0ef35e8c0a965448ffbc3b300ab3a5bb9956c2162"}, - {file = "propcache-0.2.0-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:662dd62358bdeaca0aee5761de8727cfd6861432e3bb828dc2a693aa0471a563"}, - {file = "propcache-0.2.0-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:25a1f88b471b3bc911d18b935ecb7115dff3a192b6fef46f0bfaf71ff4f12418"}, - {file = 
"propcache-0.2.0-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:f60f0ac7005b9f5a6091009b09a419ace1610e163fa5deaba5ce3484341840e7"}, - {file = "propcache-0.2.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:74acd6e291f885678631b7ebc85d2d4aec458dd849b8c841b57ef04047833bed"}, - {file = "propcache-0.2.0-cp38-cp38-win32.whl", hash = "sha256:d9b6ddac6408194e934002a69bcaadbc88c10b5f38fb9307779d1c629181815d"}, - {file = "propcache-0.2.0-cp38-cp38-win_amd64.whl", hash = "sha256:676135dcf3262c9c5081cc8f19ad55c8a64e3f7282a21266d05544450bffc3a5"}, - {file = "propcache-0.2.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:25c8d773a62ce0451b020c7b29a35cfbc05de8b291163a7a0f3b7904f27253e6"}, - {file = "propcache-0.2.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:375a12d7556d462dc64d70475a9ee5982465fbb3d2b364f16b86ba9135793638"}, - {file = "propcache-0.2.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:1ec43d76b9677637a89d6ab86e1fef70d739217fefa208c65352ecf0282be957"}, - {file = "propcache-0.2.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f45eec587dafd4b2d41ac189c2156461ebd0c1082d2fe7013571598abb8505d1"}, - {file = "propcache-0.2.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bc092ba439d91df90aea38168e11f75c655880c12782facf5cf9c00f3d42b562"}, - {file = "propcache-0.2.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fa1076244f54bb76e65e22cb6910365779d5c3d71d1f18b275f1dfc7b0d71b4d"}, - {file = "propcache-0.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:682a7c79a2fbf40f5dbb1eb6bfe2cd865376deeac65acf9beb607505dced9e12"}, - {file = "propcache-0.2.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8e40876731f99b6f3c897b66b803c9e1c07a989b366c6b5b475fafd1f7ba3fb8"}, - {file = "propcache-0.2.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:363ea8cd3c5cb6679f1c2f5f1f9669587361c062e4899fce56758efa928728f8"}, - {file = "propcache-0.2.0-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:140fbf08ab3588b3468932974a9331aff43c0ab8a2ec2c608b6d7d1756dbb6cb"}, - {file = "propcache-0.2.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:e70fac33e8b4ac63dfc4c956fd7d85a0b1139adcfc0d964ce288b7c527537fea"}, - {file = "propcache-0.2.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:b33d7a286c0dc1a15f5fc864cc48ae92a846df287ceac2dd499926c3801054a6"}, - {file = "propcache-0.2.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:f6d5749fdd33d90e34c2efb174c7e236829147a2713334d708746e94c4bde40d"}, - {file = "propcache-0.2.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:22aa8f2272d81d9317ff5756bb108021a056805ce63dd3630e27d042c8092798"}, - {file = "propcache-0.2.0-cp39-cp39-win32.whl", hash = "sha256:73e4b40ea0eda421b115248d7e79b59214411109a5bc47d0d48e4c73e3b8fcf9"}, - {file = "propcache-0.2.0-cp39-cp39-win_amd64.whl", hash = "sha256:9517d5e9e0731957468c29dbfd0f976736a0e55afaea843726e887f36fe017df"}, - {file = "propcache-0.2.0-py3-none-any.whl", hash = "sha256:2ccc28197af5313706511fab3a8b66dcd6da067a1331372c82ea1cb74285e036"}, - {file = "propcache-0.2.0.tar.gz", hash = "sha256:df81779732feb9d01e5d513fad0122efb3d53bbc75f61b2a4f29a020bc985e70"}, -] - [[package]] name = "protobuf" -version = "4.25.5" +version = "5.29.4" description = "" optional = false python-versions = ">=3.8" +groups = ["main", "dev"] files = [ - {file = "protobuf-4.25.5-cp310-abi3-win32.whl", hash = 
"sha256:5e61fd921603f58d2f5acb2806a929b4675f8874ff5f330b7d6f7e2e784bbcd8"}, - {file = "protobuf-4.25.5-cp310-abi3-win_amd64.whl", hash = "sha256:4be0571adcbe712b282a330c6e89eae24281344429ae95c6d85e79e84780f5ea"}, - {file = "protobuf-4.25.5-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:b2fde3d805354df675ea4c7c6338c1aecd254dfc9925e88c6d31a2bcb97eb173"}, - {file = "protobuf-4.25.5-cp37-abi3-manylinux2014_aarch64.whl", hash = "sha256:919ad92d9b0310070f8356c24b855c98df2b8bd207ebc1c0c6fcc9ab1e007f3d"}, - {file = "protobuf-4.25.5-cp37-abi3-manylinux2014_x86_64.whl", hash = "sha256:fe14e16c22be926d3abfcb500e60cab068baf10b542b8c858fa27e098123e331"}, - {file = "protobuf-4.25.5-cp38-cp38-win32.whl", hash = "sha256:98d8d8aa50de6a2747efd9cceba361c9034050ecce3e09136f90de37ddba66e1"}, - {file = "protobuf-4.25.5-cp38-cp38-win_amd64.whl", hash = "sha256:b0234dd5a03049e4ddd94b93400b67803c823cfc405689688f59b34e0742381a"}, - {file = "protobuf-4.25.5-cp39-cp39-win32.whl", hash = "sha256:abe32aad8561aa7cc94fc7ba4fdef646e576983edb94a73381b03c53728a626f"}, - {file = "protobuf-4.25.5-cp39-cp39-win_amd64.whl", hash = "sha256:7a183f592dc80aa7c8da7ad9e55091c4ffc9497b3054452d629bb85fa27c2a45"}, - {file = "protobuf-4.25.5-py3-none-any.whl", hash = "sha256:0aebecb809cae990f8129ada5ca273d9d670b76d9bfc9b1809f0a9c02b7dbf41"}, - {file = "protobuf-4.25.5.tar.gz", hash = "sha256:7f8249476b4a9473645db7f8ab42b02fe1488cbe5fb72fddd445e0665afd8584"}, + {file = "protobuf-5.29.4-cp310-abi3-win32.whl", hash = "sha256:13eb236f8eb9ec34e63fc8b1d6efd2777d062fa6aaa68268fb67cf77f6839ad7"}, + {file = "protobuf-5.29.4-cp310-abi3-win_amd64.whl", hash = "sha256:bcefcdf3976233f8a502d265eb65ea740c989bacc6c30a58290ed0e519eb4b8d"}, + {file = "protobuf-5.29.4-cp38-abi3-macosx_10_9_universal2.whl", hash = "sha256:307ecba1d852ec237e9ba668e087326a67564ef83e45a0189a772ede9e854dd0"}, + {file = "protobuf-5.29.4-cp38-abi3-manylinux2014_aarch64.whl", hash = "sha256:aec4962f9ea93c431d5714ed1be1c93f13e1a8618e70035ba2b0564d9e633f2e"}, + {file = "protobuf-5.29.4-cp38-abi3-manylinux2014_x86_64.whl", hash = "sha256:d7d3f7d1d5a66ed4942d4fefb12ac4b14a29028b209d4bfb25c68ae172059922"}, + {file = "protobuf-5.29.4-cp38-cp38-win32.whl", hash = "sha256:1832f0515b62d12d8e6ffc078d7e9eb06969aa6dc13c13e1036e39d73bebc2de"}, + {file = "protobuf-5.29.4-cp38-cp38-win_amd64.whl", hash = "sha256:476cb7b14914c780605a8cf62e38c2a85f8caff2e28a6a0bad827ec7d6c85d68"}, + {file = "protobuf-5.29.4-cp39-cp39-win32.whl", hash = "sha256:fd32223020cb25a2cc100366f1dedc904e2d71d9322403224cdde5fdced0dabe"}, + {file = "protobuf-5.29.4-cp39-cp39-win_amd64.whl", hash = "sha256:678974e1e3a9b975b8bc2447fca458db5f93a2fb6b0c8db46b6675b5b5346812"}, + {file = "protobuf-5.29.4-py3-none-any.whl", hash = "sha256:3fde11b505e1597f71b875ef2fc52062b6a9740e5f7c8997ce878b6009145862"}, + {file = "protobuf-5.29.4.tar.gz", hash = "sha256:4f1dfcd7997b31ef8f53ec82781ff434a28bf71d9102ddde14d076adcfc78c99"}, ] [[package]] name = "psutil" -version = "6.1.0" -description = "Cross-platform lib for process and system monitoring in Python." -optional = true -python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7" +version = "7.0.0" +description = "Cross-platform lib for process and system monitoring in Python. NOTE: the syntax of this script MUST be kept compatible with Python 2.7." 
+optional = false +python-versions = ">=3.6" +groups = ["main"] files = [ - {file = "psutil-6.1.0-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:ff34df86226c0227c52f38b919213157588a678d049688eded74c76c8ba4a5d0"}, - {file = "psutil-6.1.0-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:c0e0c00aa18ca2d3b2b991643b799a15fc8f0563d2ebb6040f64ce8dc027b942"}, - {file = "psutil-6.1.0-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:000d1d1ebd634b4efb383f4034437384e44a6d455260aaee2eca1e9c1b55f047"}, - {file = "psutil-6.1.0-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:5cd2bcdc75b452ba2e10f0e8ecc0b57b827dd5d7aaffbc6821b2a9a242823a76"}, - {file = "psutil-6.1.0-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:045f00a43c737f960d273a83973b2511430d61f283a44c96bf13a6e829ba8fdc"}, - {file = "psutil-6.1.0-cp27-none-win32.whl", hash = "sha256:9118f27452b70bb1d9ab3198c1f626c2499384935aaf55388211ad982611407e"}, - {file = "psutil-6.1.0-cp27-none-win_amd64.whl", hash = "sha256:a8506f6119cff7015678e2bce904a4da21025cc70ad283a53b099e7620061d85"}, - {file = "psutil-6.1.0-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:6e2dcd475ce8b80522e51d923d10c7871e45f20918e027ab682f94f1c6351688"}, - {file = "psutil-6.1.0-cp36-abi3-macosx_11_0_arm64.whl", hash = "sha256:0895b8414afafc526712c498bd9de2b063deaac4021a3b3c34566283464aff8e"}, - {file = "psutil-6.1.0-cp36-abi3-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9dcbfce5d89f1d1f2546a2090f4fcf87c7f669d1d90aacb7d7582addece9fb38"}, - {file = "psutil-6.1.0-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:498c6979f9c6637ebc3a73b3f87f9eb1ec24e1ce53a7c5173b8508981614a90b"}, - {file = "psutil-6.1.0-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d905186d647b16755a800e7263d43df08b790d709d575105d419f8b6ef65423a"}, - {file = "psutil-6.1.0-cp36-cp36m-win32.whl", hash = "sha256:6d3fbbc8d23fcdcb500d2c9f94e07b1342df8ed71b948a2649b5cb060a7c94ca"}, - {file = "psutil-6.1.0-cp36-cp36m-win_amd64.whl", hash = "sha256:1209036fbd0421afde505a4879dee3b2fd7b1e14fee81c0069807adcbbcca747"}, - {file = "psutil-6.1.0-cp37-abi3-win32.whl", hash = "sha256:1ad45a1f5d0b608253b11508f80940985d1d0c8f6111b5cb637533a0e6ddc13e"}, - {file = "psutil-6.1.0-cp37-abi3-win_amd64.whl", hash = "sha256:a8fb3752b491d246034fa4d279ff076501588ce8cbcdbb62c32fd7a377d996be"}, - {file = "psutil-6.1.0.tar.gz", hash = "sha256:353815f59a7f64cdaca1c0307ee13558a0512f6db064e92fe833784f08539c7a"}, + {file = "psutil-7.0.0-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:101d71dc322e3cffd7cea0650b09b3d08b8e7c4109dd6809fe452dfd00e58b25"}, + {file = "psutil-7.0.0-cp36-abi3-macosx_11_0_arm64.whl", hash = "sha256:39db632f6bb862eeccf56660871433e111b6ea58f2caea825571951d4b6aa3da"}, + {file = "psutil-7.0.0-cp36-abi3-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1fcee592b4c6f146991ca55919ea3d1f8926497a713ed7faaf8225e174581e91"}, + {file = "psutil-7.0.0-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4b1388a4f6875d7e2aff5c4ca1cc16c545ed41dd8bb596cefea80111db353a34"}, + {file = "psutil-7.0.0-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a5f098451abc2828f7dc6b58d44b532b22f2088f4999a937557b603ce72b1993"}, + {file = "psutil-7.0.0-cp36-cp36m-win32.whl", hash = "sha256:84df4eb63e16849689f76b1ffcb36db7b8de703d1bc1fe41773db487621b6c17"}, + 
{file = "psutil-7.0.0-cp36-cp36m-win_amd64.whl", hash = "sha256:1e744154a6580bc968a0195fd25e80432d3afec619daf145b9e5ba16cc1d688e"}, + {file = "psutil-7.0.0-cp37-abi3-win32.whl", hash = "sha256:ba3fcef7523064a6c9da440fc4d6bd07da93ac726b5733c29027d7dc95b39d99"}, + {file = "psutil-7.0.0-cp37-abi3-win_amd64.whl", hash = "sha256:4cf3d4eb1aa9b348dec30105c55cd9b7d4629285735a102beb4441e38db90553"}, + {file = "psutil-7.0.0.tar.gz", hash = "sha256:7be9c3eba38beccb6495ea33afd982a44074b78f28c434a1f51cc07fd315c456"}, ] [package.extras] -dev = ["black", "check-manifest", "coverage", "packaging", "pylint", "pyperf", "pypinfo", "pytest-cov", "requests", "rstcheck", "ruff", "sphinx", "sphinx_rtd_theme", "toml-sort", "twine", "virtualenv", "wheel"] +dev = ["abi3audit", "black (==24.10.0)", "check-manifest", "coverage", "packaging", "pylint", "pyperf", "pypinfo", "pytest", "pytest-cov", "pytest-xdist", "requests", "rstcheck", "ruff", "setuptools", "sphinx", "sphinx_rtd_theme", "toml-sort", "twine", "virtualenv", "vulture", "wheel"] test = ["pytest", "pytest-xdist", "setuptools"] [[package]] @@ -2455,77 +1755,29 @@ version = "9.0.0" description = "Get CPU info with pure Python" optional = false python-versions = "*" +groups = ["main"] files = [ {file = "py-cpuinfo-9.0.0.tar.gz", hash = "sha256:3cdbbf3fac90dc6f118bfd64384f309edeadd902d7c8fb17f02ffa1fc3f49690"}, {file = "py_cpuinfo-9.0.0-py3-none-any.whl", hash = "sha256:859625bc251f64e21f077d099d4162689c762b5d6a4c3c97553d56241c9674d5"}, ] -[[package]] -name = "pyarrow" -version = "17.0.0" -description = "Python library for Apache Arrow" -optional = true -python-versions = ">=3.8" -files = [ - {file = "pyarrow-17.0.0-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:a5c8b238d47e48812ee577ee20c9a2779e6a5904f1708ae240f53ecbee7c9f07"}, - {file = "pyarrow-17.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:db023dc4c6cae1015de9e198d41250688383c3f9af8f565370ab2b4cb5f62655"}, - {file = "pyarrow-17.0.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:da1e060b3876faa11cee287839f9cc7cdc00649f475714b8680a05fd9071d545"}, - {file = "pyarrow-17.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:75c06d4624c0ad6674364bb46ef38c3132768139ddec1c56582dbac54f2663e2"}, - {file = "pyarrow-17.0.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:fa3c246cc58cb5a4a5cb407a18f193354ea47dd0648194e6265bd24177982fe8"}, - {file = "pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:f7ae2de664e0b158d1607699a16a488de3d008ba99b3a7aa5de1cbc13574d047"}, - {file = "pyarrow-17.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:5984f416552eea15fd9cee03da53542bf4cddaef5afecefb9aa8d1010c335087"}, - {file = "pyarrow-17.0.0-cp311-cp311-macosx_10_15_x86_64.whl", hash = "sha256:1c8856e2ef09eb87ecf937104aacfa0708f22dfeb039c363ec99735190ffb977"}, - {file = "pyarrow-17.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2e19f569567efcbbd42084e87f948778eb371d308e137a0f97afe19bb860ccb3"}, - {file = "pyarrow-17.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6b244dc8e08a23b3e352899a006a26ae7b4d0da7bb636872fa8f5884e70acf15"}, - {file = "pyarrow-17.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b72e87fe3e1db343995562f7fff8aee354b55ee83d13afba65400c178ab2597"}, - {file = "pyarrow-17.0.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:dc5c31c37409dfbc5d014047817cb4ccd8c1ea25d19576acf1a001fe07f5b420"}, - {file = 
"pyarrow-17.0.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:e3343cb1e88bc2ea605986d4b94948716edc7a8d14afd4e2c097232f729758b4"}, - {file = "pyarrow-17.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:a27532c38f3de9eb3e90ecab63dfda948a8ca859a66e3a47f5f42d1e403c4d03"}, - {file = "pyarrow-17.0.0-cp312-cp312-macosx_10_15_x86_64.whl", hash = "sha256:9b8a823cea605221e61f34859dcc03207e52e409ccf6354634143e23af7c8d22"}, - {file = "pyarrow-17.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f1e70de6cb5790a50b01d2b686d54aaf73da01266850b05e3af2a1bc89e16053"}, - {file = "pyarrow-17.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0071ce35788c6f9077ff9ecba4858108eebe2ea5a3f7cf2cf55ebc1dbc6ee24a"}, - {file = "pyarrow-17.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:757074882f844411fcca735e39aae74248a1531367a7c80799b4266390ae51cc"}, - {file = "pyarrow-17.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:9ba11c4f16976e89146781a83833df7f82077cdab7dc6232c897789343f7891a"}, - {file = "pyarrow-17.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:b0c6ac301093b42d34410b187bba560b17c0330f64907bfa4f7f7f2444b0cf9b"}, - {file = "pyarrow-17.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:392bc9feabc647338e6c89267635e111d71edad5fcffba204425a7c8d13610d7"}, - {file = "pyarrow-17.0.0-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:af5ff82a04b2171415f1410cff7ebb79861afc5dae50be73ce06d6e870615204"}, - {file = "pyarrow-17.0.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:edca18eaca89cd6382dfbcff3dd2d87633433043650c07375d095cd3517561d8"}, - {file = "pyarrow-17.0.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7c7916bff914ac5d4a8fe25b7a25e432ff921e72f6f2b7547d1e325c1ad9d155"}, - {file = "pyarrow-17.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f553ca691b9e94b202ff741bdd40f6ccb70cdd5fbf65c187af132f1317de6145"}, - {file = "pyarrow-17.0.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:0cdb0e627c86c373205a2f94a510ac4376fdc523f8bb36beab2e7f204416163c"}, - {file = "pyarrow-17.0.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:d7d192305d9d8bc9082d10f361fc70a73590a4c65cf31c3e6926cd72b76bc35c"}, - {file = "pyarrow-17.0.0-cp38-cp38-win_amd64.whl", hash = "sha256:02dae06ce212d8b3244dd3e7d12d9c4d3046945a5933d28026598e9dbbda1fca"}, - {file = "pyarrow-17.0.0-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:13d7a460b412f31e4c0efa1148e1d29bdf18ad1411eb6757d38f8fbdcc8645fb"}, - {file = "pyarrow-17.0.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9b564a51fbccfab5a04a80453e5ac6c9954a9c5ef2890d1bcf63741909c3f8df"}, - {file = "pyarrow-17.0.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:32503827abbc5aadedfa235f5ece8c4f8f8b0a3cf01066bc8d29de7539532687"}, - {file = "pyarrow-17.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a155acc7f154b9ffcc85497509bcd0d43efb80d6f733b0dc3bb14e281f131c8b"}, - {file = "pyarrow-17.0.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:dec8d129254d0188a49f8a1fc99e0560dc1b85f60af729f47de4046015f9b0a5"}, - {file = "pyarrow-17.0.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:a48ddf5c3c6a6c505904545c25a4ae13646ae1f8ba703c4df4a1bfe4f4006bda"}, - {file = "pyarrow-17.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:42bf93249a083aca230ba7e2786c5f673507fa97bbd9725a1e2754715151a204"}, - {file = "pyarrow-17.0.0.tar.gz", hash = 
"sha256:4beca9521ed2c0921c1023e68d097d0299b62c362639ea315572a58f3f50fd28"}, -] - -[package.dependencies] -numpy = ">=1.16.6" - -[package.extras] -test = ["cffi", "hypothesis", "pandas", "pytest", "pytz"] - [[package]] name = "pydantic" -version = "2.9.2" +version = "2.11.3" description = "Data validation using Python type hints" optional = true -python-versions = ">=3.8" +python-versions = ">=3.9" +groups = ["main"] files = [ - {file = "pydantic-2.9.2-py3-none-any.whl", hash = "sha256:f048cec7b26778210e28a0459867920654d48e5e62db0958433636cde4254f12"}, - {file = "pydantic-2.9.2.tar.gz", hash = "sha256:d155cef71265d1e9807ed1c32b4c8deec042a44a50a4188b25ac67ecd81a9c0f"}, + {file = "pydantic-2.11.3-py3-none-any.whl", hash = "sha256:a082753436a07f9ba1289c6ffa01cd93db3548776088aa917cc43b63f68fa60f"}, + {file = "pydantic-2.11.3.tar.gz", hash = "sha256:7471657138c16adad9322fe3070c0116dd6c3ad8d649300e3cbdfe91f4db4ec3"}, ] [package.dependencies] annotated-types = ">=0.6.0" -pydantic-core = "2.23.4" -typing-extensions = {version = ">=4.6.1", markers = "python_version < \"3.13\""} +pydantic-core = "2.33.1" +typing-extensions = ">=4.12.2" +typing-inspection = ">=0.4.0" [package.extras] email = ["email-validator (>=2.0.0)"] @@ -2533,128 +1785,141 @@ timezone = ["tzdata"] [[package]] name = "pydantic-core" -version = "2.23.4" +version = "2.33.1" description = "Core functionality for Pydantic validation and serialization" optional = true -python-versions = ">=3.8" +python-versions = ">=3.9" +groups = ["main"] files = [ - {file = "pydantic_core-2.23.4-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:b10bd51f823d891193d4717448fab065733958bdb6a6b351967bd349d48d5c9b"}, - {file = "pydantic_core-2.23.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:4fc714bdbfb534f94034efaa6eadd74e5b93c8fa6315565a222f7b6f42ca1166"}, - {file = "pydantic_core-2.23.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:63e46b3169866bd62849936de036f901a9356e36376079b05efa83caeaa02ceb"}, - {file = "pydantic_core-2.23.4-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ed1a53de42fbe34853ba90513cea21673481cd81ed1be739f7f2efb931b24916"}, - {file = "pydantic_core-2.23.4-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:cfdd16ab5e59fc31b5e906d1a3f666571abc367598e3e02c83403acabc092e07"}, - {file = "pydantic_core-2.23.4-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:255a8ef062cbf6674450e668482456abac99a5583bbafb73f9ad469540a3a232"}, - {file = "pydantic_core-2.23.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4a7cd62e831afe623fbb7aabbb4fe583212115b3ef38a9f6b71869ba644624a2"}, - {file = "pydantic_core-2.23.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f09e2ff1f17c2b51f2bc76d1cc33da96298f0a036a137f5440ab3ec5360b624f"}, - {file = "pydantic_core-2.23.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:e38e63e6f3d1cec5a27e0afe90a085af8b6806ee208b33030e65b6516353f1a3"}, - {file = "pydantic_core-2.23.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:0dbd8dbed2085ed23b5c04afa29d8fd2771674223135dc9bc937f3c09284d071"}, - {file = "pydantic_core-2.23.4-cp310-none-win32.whl", hash = "sha256:6531b7ca5f951d663c339002e91aaebda765ec7d61b7d1e3991051906ddde119"}, - {file = "pydantic_core-2.23.4-cp310-none-win_amd64.whl", hash = "sha256:7c9129eb40958b3d4500fa2467e6a83356b3b61bfff1b414c7361d9220f9ae8f"}, - {file = "pydantic_core-2.23.4-cp311-cp311-macosx_10_12_x86_64.whl", hash = 
"sha256:77733e3892bb0a7fa797826361ce8a9184d25c8dffaec60b7ffe928153680ba8"}, - {file = "pydantic_core-2.23.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1b84d168f6c48fabd1f2027a3d1bdfe62f92cade1fb273a5d68e621da0e44e6d"}, - {file = "pydantic_core-2.23.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:df49e7a0861a8c36d089c1ed57d308623d60416dab2647a4a17fe050ba85de0e"}, - {file = "pydantic_core-2.23.4-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ff02b6d461a6de369f07ec15e465a88895f3223eb75073ffea56b84d9331f607"}, - {file = "pydantic_core-2.23.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:996a38a83508c54c78a5f41456b0103c30508fed9abcad0a59b876d7398f25fd"}, - {file = "pydantic_core-2.23.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d97683ddee4723ae8c95d1eddac7c192e8c552da0c73a925a89fa8649bf13eea"}, - {file = "pydantic_core-2.23.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:216f9b2d7713eb98cb83c80b9c794de1f6b7e3145eef40400c62e86cee5f4e1e"}, - {file = "pydantic_core-2.23.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6f783e0ec4803c787bcea93e13e9932edab72068f68ecffdf86a99fd5918878b"}, - {file = "pydantic_core-2.23.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:d0776dea117cf5272382634bd2a5c1b6eb16767c223c6a5317cd3e2a757c61a0"}, - {file = "pydantic_core-2.23.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:d5f7a395a8cf1621939692dba2a6b6a830efa6b3cee787d82c7de1ad2930de64"}, - {file = "pydantic_core-2.23.4-cp311-none-win32.whl", hash = "sha256:74b9127ffea03643e998e0c5ad9bd3811d3dac8c676e47db17b0ee7c3c3bf35f"}, - {file = "pydantic_core-2.23.4-cp311-none-win_amd64.whl", hash = "sha256:98d134c954828488b153d88ba1f34e14259284f256180ce659e8d83e9c05eaa3"}, - {file = "pydantic_core-2.23.4-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:f3e0da4ebaef65158d4dfd7d3678aad692f7666877df0002b8a522cdf088f231"}, - {file = "pydantic_core-2.23.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f69a8e0b033b747bb3e36a44e7732f0c99f7edd5cea723d45bc0d6e95377ffee"}, - {file = "pydantic_core-2.23.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:723314c1d51722ab28bfcd5240d858512ffd3116449c557a1336cbe3919beb87"}, - {file = "pydantic_core-2.23.4-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:bb2802e667b7051a1bebbfe93684841cc9351004e2badbd6411bf357ab8d5ac8"}, - {file = "pydantic_core-2.23.4-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d18ca8148bebe1b0a382a27a8ee60350091a6ddaf475fa05ef50dc35b5df6327"}, - {file = "pydantic_core-2.23.4-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:33e3d65a85a2a4a0dc3b092b938a4062b1a05f3a9abde65ea93b233bca0e03f2"}, - {file = "pydantic_core-2.23.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:128585782e5bfa515c590ccee4b727fb76925dd04a98864182b22e89a4e6ed36"}, - {file = "pydantic_core-2.23.4-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:68665f4c17edcceecc112dfed5dbe6f92261fb9d6054b47d01bf6371a6196126"}, - {file = "pydantic_core-2.23.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:20152074317d9bed6b7a95ade3b7d6054845d70584216160860425f4fbd5ee9e"}, - {file = "pydantic_core-2.23.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:9261d3ce84fa1d38ed649c3638feefeae23d32ba9182963e465d58d62203bd24"}, - {file = 
"pydantic_core-2.23.4-cp312-none-win32.whl", hash = "sha256:4ba762ed58e8d68657fc1281e9bb72e1c3e79cc5d464be146e260c541ec12d84"}, - {file = "pydantic_core-2.23.4-cp312-none-win_amd64.whl", hash = "sha256:97df63000f4fea395b2824da80e169731088656d1818a11b95f3b173747b6cd9"}, - {file = "pydantic_core-2.23.4-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:7530e201d10d7d14abce4fb54cfe5b94a0aefc87da539d0346a484ead376c3cc"}, - {file = "pydantic_core-2.23.4-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:df933278128ea1cd77772673c73954e53a1c95a4fdf41eef97c2b779271bd0bd"}, - {file = "pydantic_core-2.23.4-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0cb3da3fd1b6a5d0279a01877713dbda118a2a4fc6f0d821a57da2e464793f05"}, - {file = "pydantic_core-2.23.4-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:42c6dcb030aefb668a2b7009c85b27f90e51e6a3b4d5c9bc4c57631292015b0d"}, - {file = "pydantic_core-2.23.4-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:696dd8d674d6ce621ab9d45b205df149399e4bb9aa34102c970b721554828510"}, - {file = "pydantic_core-2.23.4-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2971bb5ffe72cc0f555c13e19b23c85b654dd2a8f7ab493c262071377bfce9f6"}, - {file = "pydantic_core-2.23.4-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8394d940e5d400d04cad4f75c0598665cbb81aecefaca82ca85bd28264af7f9b"}, - {file = "pydantic_core-2.23.4-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:0dff76e0602ca7d4cdaacc1ac4c005e0ce0dcfe095d5b5259163a80d3a10d327"}, - {file = "pydantic_core-2.23.4-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:7d32706badfe136888bdea71c0def994644e09fff0bfe47441deaed8e96fdbc6"}, - {file = "pydantic_core-2.23.4-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:ed541d70698978a20eb63d8c5d72f2cc6d7079d9d90f6b50bad07826f1320f5f"}, - {file = "pydantic_core-2.23.4-cp313-none-win32.whl", hash = "sha256:3d5639516376dce1940ea36edf408c554475369f5da2abd45d44621cb616f769"}, - {file = "pydantic_core-2.23.4-cp313-none-win_amd64.whl", hash = "sha256:5a1504ad17ba4210df3a045132a7baeeba5a200e930f57512ee02909fc5c4cb5"}, - {file = "pydantic_core-2.23.4-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:d4488a93b071c04dc20f5cecc3631fc78b9789dd72483ba15d423b5b3689b555"}, - {file = "pydantic_core-2.23.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:81965a16b675b35e1d09dd14df53f190f9129c0202356ed44ab2728b1c905658"}, - {file = "pydantic_core-2.23.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4ffa2ebd4c8530079140dd2d7f794a9d9a73cbb8e9d59ffe24c63436efa8f271"}, - {file = "pydantic_core-2.23.4-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:61817945f2fe7d166e75fbfb28004034b48e44878177fc54d81688e7b85a3665"}, - {file = "pydantic_core-2.23.4-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:29d2c342c4bc01b88402d60189f3df065fb0dda3654744d5a165a5288a657368"}, - {file = "pydantic_core-2.23.4-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5e11661ce0fd30a6790e8bcdf263b9ec5988e95e63cf901972107efc49218b13"}, - {file = "pydantic_core-2.23.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9d18368b137c6295db49ce7218b1a9ba15c5bc254c96d7c9f9e924a9bc7825ad"}, - {file = "pydantic_core-2.23.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:ec4e55f79b1c4ffb2eecd8a0cfba9955a2588497d96851f4c8f99aa4a1d39b12"}, - 
{file = "pydantic_core-2.23.4-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:374a5e5049eda9e0a44c696c7ade3ff355f06b1fe0bb945ea3cac2bc336478a2"}, - {file = "pydantic_core-2.23.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:5c364564d17da23db1106787675fc7af45f2f7b58b4173bfdd105564e132e6fb"}, - {file = "pydantic_core-2.23.4-cp38-none-win32.whl", hash = "sha256:d7a80d21d613eec45e3d41eb22f8f94ddc758a6c4720842dc74c0581f54993d6"}, - {file = "pydantic_core-2.23.4-cp38-none-win_amd64.whl", hash = "sha256:5f5ff8d839f4566a474a969508fe1c5e59c31c80d9e140566f9a37bba7b8d556"}, - {file = "pydantic_core-2.23.4-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:a4fa4fc04dff799089689f4fd502ce7d59de529fc2f40a2c8836886c03e0175a"}, - {file = "pydantic_core-2.23.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:0a7df63886be5e270da67e0966cf4afbae86069501d35c8c1b3b6c168f42cb36"}, - {file = "pydantic_core-2.23.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dcedcd19a557e182628afa1d553c3895a9f825b936415d0dbd3cd0bbcfd29b4b"}, - {file = "pydantic_core-2.23.4-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5f54b118ce5de9ac21c363d9b3caa6c800341e8c47a508787e5868c6b79c9323"}, - {file = "pydantic_core-2.23.4-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:86d2f57d3e1379a9525c5ab067b27dbb8a0642fb5d454e17a9ac434f9ce523e3"}, - {file = "pydantic_core-2.23.4-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:de6d1d1b9e5101508cb37ab0d972357cac5235f5c6533d1071964c47139257df"}, - {file = "pydantic_core-2.23.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1278e0d324f6908e872730c9102b0112477a7f7cf88b308e4fc36ce1bdb6d58c"}, - {file = "pydantic_core-2.23.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:9a6b5099eeec78827553827f4c6b8615978bb4b6a88e5d9b93eddf8bb6790f55"}, - {file = "pydantic_core-2.23.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:e55541f756f9b3ee346b840103f32779c695a19826a4c442b7954550a0972040"}, - {file = "pydantic_core-2.23.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:a5c7ba8ffb6d6f8f2ab08743be203654bb1aaa8c9dcb09f82ddd34eadb695605"}, - {file = "pydantic_core-2.23.4-cp39-none-win32.whl", hash = "sha256:37b0fe330e4a58d3c58b24d91d1eb102aeec675a3db4c292ec3928ecd892a9a6"}, - {file = "pydantic_core-2.23.4-cp39-none-win_amd64.whl", hash = "sha256:1498bec4c05c9c787bde9125cfdcc63a41004ff167f495063191b863399b1a29"}, - {file = "pydantic_core-2.23.4-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:f455ee30a9d61d3e1a15abd5068827773d6e4dc513e795f380cdd59932c782d5"}, - {file = "pydantic_core-2.23.4-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:1e90d2e3bd2c3863d48525d297cd143fe541be8bbf6f579504b9712cb6b643ec"}, - {file = "pydantic_core-2.23.4-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2e203fdf807ac7e12ab59ca2bfcabb38c7cf0b33c41efeb00f8e5da1d86af480"}, - {file = "pydantic_core-2.23.4-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e08277a400de01bc72436a0ccd02bdf596631411f592ad985dcee21445bd0068"}, - {file = "pydantic_core-2.23.4-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f220b0eea5965dec25480b6333c788fb72ce5f9129e8759ef876a1d805d00801"}, - {file = "pydantic_core-2.23.4-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:d06b0c8da4f16d1d1e352134427cb194a0a6e19ad5db9161bf32b2113409e728"}, - {file = 
"pydantic_core-2.23.4-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:ba1a0996f6c2773bd83e63f18914c1de3c9dd26d55f4ac302a7efe93fb8e7433"}, - {file = "pydantic_core-2.23.4-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:9a5bce9d23aac8f0cf0836ecfc033896aa8443b501c58d0602dbfd5bd5b37753"}, - {file = "pydantic_core-2.23.4-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:78ddaaa81421a29574a682b3179d4cf9e6d405a09b99d93ddcf7e5239c742e21"}, - {file = "pydantic_core-2.23.4-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:883a91b5dd7d26492ff2f04f40fbb652de40fcc0afe07e8129e8ae779c2110eb"}, - {file = "pydantic_core-2.23.4-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:88ad334a15b32a791ea935af224b9de1bf99bcd62fabf745d5f3442199d86d59"}, - {file = "pydantic_core-2.23.4-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:233710f069d251feb12a56da21e14cca67994eab08362207785cf8c598e74577"}, - {file = "pydantic_core-2.23.4-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:19442362866a753485ba5e4be408964644dd6a09123d9416c54cd49171f50744"}, - {file = "pydantic_core-2.23.4-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:624e278a7d29b6445e4e813af92af37820fafb6dcc55c012c834f9e26f9aaaef"}, - {file = "pydantic_core-2.23.4-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:f5ef8f42bec47f21d07668a043f077d507e5bf4e668d5c6dfe6aaba89de1a5b8"}, - {file = "pydantic_core-2.23.4-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:aea443fffa9fbe3af1a9ba721a87f926fe548d32cab71d188a6ede77d0ff244e"}, - {file = "pydantic_core-2.23.4.tar.gz", hash = "sha256:2584f7cf844ac4d970fba483a717dbe10c1c1c96a969bf65d61ffe94df1b2863"}, + {file = "pydantic_core-2.33.1-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:3077cfdb6125cc8dab61b155fdd714663e401f0e6883f9632118ec12cf42df26"}, + {file = "pydantic_core-2.33.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8ffab8b2908d152e74862d276cf5017c81a2f3719f14e8e3e8d6b83fda863927"}, + {file = "pydantic_core-2.33.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5183e4f6a2d468787243ebcd70cf4098c247e60d73fb7d68d5bc1e1beaa0c4db"}, + {file = "pydantic_core-2.33.1-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:398a38d323f37714023be1e0285765f0a27243a8b1506b7b7de87b647b517e48"}, + {file = "pydantic_core-2.33.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:87d3776f0001b43acebfa86f8c64019c043b55cc5a6a2e313d728b5c95b46969"}, + {file = "pydantic_core-2.33.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c566dd9c5f63d22226409553531f89de0cac55397f2ab8d97d6f06cfce6d947e"}, + {file = "pydantic_core-2.33.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a0d5f3acc81452c56895e90643a625302bd6be351e7010664151cc55b7b97f89"}, + {file = "pydantic_core-2.33.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d3a07fadec2a13274a8d861d3d37c61e97a816beae717efccaa4b36dfcaadcde"}, + {file = "pydantic_core-2.33.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:f99aeda58dce827f76963ee87a0ebe75e648c72ff9ba1174a253f6744f518f65"}, + {file = "pydantic_core-2.33.1-cp310-cp310-musllinux_1_1_armv7l.whl", hash = "sha256:902dbc832141aa0ec374f4310f1e4e7febeebc3256f00dc359a9ac3f264a45dc"}, + {file = "pydantic_core-2.33.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = 
"sha256:fe44d56aa0b00d66640aa84a3cbe80b7a3ccdc6f0b1ca71090696a6d4777c091"}, + {file = "pydantic_core-2.33.1-cp310-cp310-win32.whl", hash = "sha256:ed3eb16d51257c763539bde21e011092f127a2202692afaeaccb50db55a31383"}, + {file = "pydantic_core-2.33.1-cp310-cp310-win_amd64.whl", hash = "sha256:694ad99a7f6718c1a498dc170ca430687a39894a60327f548e02a9c7ee4b6504"}, + {file = "pydantic_core-2.33.1-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:6e966fc3caaf9f1d96b349b0341c70c8d6573bf1bac7261f7b0ba88f96c56c24"}, + {file = "pydantic_core-2.33.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:bfd0adeee563d59c598ceabddf2c92eec77abcb3f4a391b19aa7366170bd9e30"}, + {file = "pydantic_core-2.33.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:91815221101ad3c6b507804178a7bb5cb7b2ead9ecd600041669c8d805ebd595"}, + {file = "pydantic_core-2.33.1-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:9fea9c1869bb4742d174a57b4700c6dadea951df8b06de40c2fedb4f02931c2e"}, + {file = "pydantic_core-2.33.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1d20eb4861329bb2484c021b9d9a977566ab16d84000a57e28061151c62b349a"}, + {file = "pydantic_core-2.33.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0fb935c5591573ae3201640579f30128ccc10739b45663f93c06796854405505"}, + {file = "pydantic_core-2.33.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c964fd24e6166420d18fb53996d8c9fd6eac9bf5ae3ec3d03015be4414ce497f"}, + {file = "pydantic_core-2.33.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:681d65e9011f7392db5aa002b7423cc442d6a673c635668c227c6c8d0e5a4f77"}, + {file = "pydantic_core-2.33.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:e100c52f7355a48413e2999bfb4e139d2977a904495441b374f3d4fb4a170961"}, + {file = "pydantic_core-2.33.1-cp311-cp311-musllinux_1_1_armv7l.whl", hash = "sha256:048831bd363490be79acdd3232f74a0e9951b11b2b4cc058aeb72b22fdc3abe1"}, + {file = "pydantic_core-2.33.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:bdc84017d28459c00db6f918a7272a5190bec3090058334e43a76afb279eac7c"}, + {file = "pydantic_core-2.33.1-cp311-cp311-win32.whl", hash = "sha256:32cd11c5914d1179df70406427097c7dcde19fddf1418c787540f4b730289896"}, + {file = "pydantic_core-2.33.1-cp311-cp311-win_amd64.whl", hash = "sha256:2ea62419ba8c397e7da28a9170a16219d310d2cf4970dbc65c32faf20d828c83"}, + {file = "pydantic_core-2.33.1-cp311-cp311-win_arm64.whl", hash = "sha256:fc903512177361e868bc1f5b80ac8c8a6e05fcdd574a5fb5ffeac5a9982b9e89"}, + {file = "pydantic_core-2.33.1-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:1293d7febb995e9d3ec3ea09caf1a26214eec45b0f29f6074abb004723fc1de8"}, + {file = "pydantic_core-2.33.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:99b56acd433386c8f20be5c4000786d1e7ca0523c8eefc995d14d79c7a081498"}, + {file = "pydantic_core-2.33.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:35a5ec3fa8c2fe6c53e1b2ccc2454398f95d5393ab398478f53e1afbbeb4d939"}, + {file = "pydantic_core-2.33.1-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:b172f7b9d2f3abc0efd12e3386f7e48b576ef309544ac3a63e5e9cdd2e24585d"}, + {file = "pydantic_core-2.33.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9097b9f17f91eea659b9ec58148c0747ec354a42f7389b9d50701610d86f812e"}, + {file = "pydantic_core-2.33.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:cc77ec5b7e2118b152b0d886c7514a4653bcb58c6b1d760134a9fab915f777b3"}, + {file = "pydantic_core-2.33.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d5e3d15245b08fa4a84cefc6c9222e6f37c98111c8679fbd94aa145f9a0ae23d"}, + {file = "pydantic_core-2.33.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:ef99779001d7ac2e2461d8ab55d3373fe7315caefdbecd8ced75304ae5a6fc6b"}, + {file = "pydantic_core-2.33.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:fc6bf8869e193855e8d91d91f6bf59699a5cdfaa47a404e278e776dd7f168b39"}, + {file = "pydantic_core-2.33.1-cp312-cp312-musllinux_1_1_armv7l.whl", hash = "sha256:b1caa0bc2741b043db7823843e1bde8aaa58a55a58fda06083b0569f8b45693a"}, + {file = "pydantic_core-2.33.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:ec259f62538e8bf364903a7d0d0239447059f9434b284f5536e8402b7dd198db"}, + {file = "pydantic_core-2.33.1-cp312-cp312-win32.whl", hash = "sha256:e14f369c98a7c15772b9da98987f58e2b509a93235582838bd0d1d8c08b68fda"}, + {file = "pydantic_core-2.33.1-cp312-cp312-win_amd64.whl", hash = "sha256:1c607801d85e2e123357b3893f82c97a42856192997b95b4d8325deb1cd0c5f4"}, + {file = "pydantic_core-2.33.1-cp312-cp312-win_arm64.whl", hash = "sha256:8d13f0276806ee722e70a1c93da19748594f19ac4299c7e41237fc791d1861ea"}, + {file = "pydantic_core-2.33.1-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:70af6a21237b53d1fe7b9325b20e65cbf2f0a848cf77bed492b029139701e66a"}, + {file = "pydantic_core-2.33.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:282b3fe1bbbe5ae35224a0dbd05aed9ccabccd241e8e6b60370484234b456266"}, + {file = "pydantic_core-2.33.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4b315e596282bbb5822d0c7ee9d255595bd7506d1cb20c2911a4da0b970187d3"}, + {file = "pydantic_core-2.33.1-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:1dfae24cf9921875ca0ca6a8ecb4bb2f13c855794ed0d468d6abbec6e6dcd44a"}, + {file = "pydantic_core-2.33.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6dd8ecfde08d8bfadaea669e83c63939af76f4cf5538a72597016edfa3fad516"}, + {file = "pydantic_core-2.33.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2f593494876eae852dc98c43c6f260f45abdbfeec9e4324e31a481d948214764"}, + {file = "pydantic_core-2.33.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:948b73114f47fd7016088e5186d13faf5e1b2fe83f5e320e371f035557fd264d"}, + {file = "pydantic_core-2.33.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:e11f3864eb516af21b01e25fac915a82e9ddad3bb0fb9e95a246067398b435a4"}, + {file = "pydantic_core-2.33.1-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:549150be302428b56fdad0c23c2741dcdb5572413776826c965619a25d9c6bde"}, + {file = "pydantic_core-2.33.1-cp313-cp313-musllinux_1_1_armv7l.whl", hash = "sha256:495bc156026efafd9ef2d82372bd38afce78ddd82bf28ef5276c469e57c0c83e"}, + {file = "pydantic_core-2.33.1-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:ec79de2a8680b1a67a07490bddf9636d5c2fab609ba8c57597e855fa5fa4dacd"}, + {file = "pydantic_core-2.33.1-cp313-cp313-win32.whl", hash = "sha256:ee12a7be1742f81b8a65b36c6921022301d466b82d80315d215c4c691724986f"}, + {file = "pydantic_core-2.33.1-cp313-cp313-win_amd64.whl", hash = "sha256:ede9b407e39949d2afc46385ce6bd6e11588660c26f80576c11c958e6647bc40"}, + {file = "pydantic_core-2.33.1-cp313-cp313-win_arm64.whl", hash = "sha256:aa687a23d4b7871a00e03ca96a09cad0f28f443690d300500603bd0adba4b523"}, + 
{file = "pydantic_core-2.33.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:401d7b76e1000d0dd5538e6381d28febdcacb097c8d340dde7d7fc6e13e9f95d"}, + {file = "pydantic_core-2.33.1-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7aeb055a42d734c0255c9e489ac67e75397d59c6fbe60d155851e9782f276a9c"}, + {file = "pydantic_core-2.33.1-cp313-cp313t-win_amd64.whl", hash = "sha256:338ea9b73e6e109f15ab439e62cb3b78aa752c7fd9536794112e14bee02c8d18"}, + {file = "pydantic_core-2.33.1-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:5ab77f45d33d264de66e1884fca158bc920cb5e27fd0764a72f72f5756ae8bdb"}, + {file = "pydantic_core-2.33.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:e7aaba1b4b03aaea7bb59e1b5856d734be011d3e6d98f5bcaa98cb30f375f2ad"}, + {file = "pydantic_core-2.33.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7fb66263e9ba8fea2aa85e1e5578980d127fb37d7f2e292773e7bc3a38fb0c7b"}, + {file = "pydantic_core-2.33.1-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3f2648b9262607a7fb41d782cc263b48032ff7a03a835581abbf7a3bec62bcf5"}, + {file = "pydantic_core-2.33.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:723c5630c4259400818b4ad096735a829074601805d07f8cafc366d95786d331"}, + {file = "pydantic_core-2.33.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d100e3ae783d2167782391e0c1c7a20a31f55f8015f3293647544df3f9c67824"}, + {file = "pydantic_core-2.33.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:177d50460bc976a0369920b6c744d927b0ecb8606fb56858ff542560251b19e5"}, + {file = "pydantic_core-2.33.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a3edde68d1a1f9af1273b2fe798997b33f90308fb6d44d8550c89fc6a3647cf6"}, + {file = "pydantic_core-2.33.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:a62c3c3ef6a7e2c45f7853b10b5bc4ddefd6ee3cd31024754a1a5842da7d598d"}, + {file = "pydantic_core-2.33.1-cp39-cp39-musllinux_1_1_armv7l.whl", hash = "sha256:c91dbb0ab683fa0cd64a6e81907c8ff41d6497c346890e26b23de7ee55353f96"}, + {file = "pydantic_core-2.33.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:9f466e8bf0a62dc43e068c12166281c2eca72121dd2adc1040f3aa1e21ef8599"}, + {file = "pydantic_core-2.33.1-cp39-cp39-win32.whl", hash = "sha256:ab0277cedb698749caada82e5d099dc9fed3f906a30d4c382d1a21725777a1e5"}, + {file = "pydantic_core-2.33.1-cp39-cp39-win_amd64.whl", hash = "sha256:5773da0ee2d17136b1f1c6fbde543398d452a6ad2a7b54ea1033e2daa739b8d2"}, + {file = "pydantic_core-2.33.1-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:5c834f54f8f4640fd7e4b193f80eb25a0602bba9e19b3cd2fc7ffe8199f5ae02"}, + {file = "pydantic_core-2.33.1-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:049e0de24cf23766f12cc5cc71d8abc07d4a9deb9061b334b62093dedc7cb068"}, + {file = "pydantic_core-2.33.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1a28239037b3d6f16916a4c831a5a0eadf856bdd6d2e92c10a0da3a59eadcf3e"}, + {file = "pydantic_core-2.33.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9d3da303ab5f378a268fa7d45f37d7d85c3ec19769f28d2cc0c61826a8de21fe"}, + {file = "pydantic_core-2.33.1-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:25626fb37b3c543818c14821afe0fd3830bc327a43953bc88db924b68c5723f1"}, + {file = "pydantic_core-2.33.1-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = 
"sha256:3ab2d36e20fbfcce8f02d73c33a8a7362980cff717926bbae030b93ae46b56c7"}, + {file = "pydantic_core-2.33.1-pp310-pypy310_pp73-musllinux_1_1_armv7l.whl", hash = "sha256:2f9284e11c751b003fd4215ad92d325d92c9cb19ee6729ebd87e3250072cdcde"}, + {file = "pydantic_core-2.33.1-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:048c01eee07d37cbd066fc512b9d8b5ea88ceeb4e629ab94b3e56965ad655add"}, + {file = "pydantic_core-2.33.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:5ccd429694cf26af7997595d627dd2637e7932214486f55b8a357edaac9dae8c"}, + {file = "pydantic_core-2.33.1-pp311-pypy311_pp73-macosx_10_12_x86_64.whl", hash = "sha256:3a371dc00282c4b84246509a5ddc808e61b9864aa1eae9ecc92bb1268b82db4a"}, + {file = "pydantic_core-2.33.1-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:f59295ecc75a1788af8ba92f2e8c6eeaa5a94c22fc4d151e8d9638814f85c8fc"}, + {file = "pydantic_core-2.33.1-pp311-pypy311_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:08530b8ac922003033f399128505f513e30ca770527cc8bbacf75a84fcc2c74b"}, + {file = "pydantic_core-2.33.1-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bae370459da6a5466978c0eacf90690cb57ec9d533f8e63e564ef3822bfa04fe"}, + {file = "pydantic_core-2.33.1-pp311-pypy311_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:e3de2777e3b9f4d603112f78006f4ae0acb936e95f06da6cb1a45fbad6bdb4b5"}, + {file = "pydantic_core-2.33.1-pp311-pypy311_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:3a64e81e8cba118e108d7126362ea30e021291b7805d47e4896e52c791be2761"}, + {file = "pydantic_core-2.33.1-pp311-pypy311_pp73-musllinux_1_1_armv7l.whl", hash = "sha256:52928d8c1b6bda03cc6d811e8923dffc87a2d3c8b3bfd2ce16471c7147a24850"}, + {file = "pydantic_core-2.33.1-pp311-pypy311_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:1b30d92c9412beb5ac6b10a3eb7ef92ccb14e3f2a8d7732e2d739f58b3aa7544"}, + {file = "pydantic_core-2.33.1-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:f995719707e0e29f0f41a8aa3bcea6e761a36c9136104d3189eafb83f5cec5e5"}, + {file = "pydantic_core-2.33.1-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:7edbc454a29fc6aeae1e1eecba4f07b63b8d76e76a748532233c4c167b4cb9ea"}, + {file = "pydantic_core-2.33.1-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:ad05b683963f69a1d5d2c2bdab1274a31221ca737dbbceaa32bcb67359453cdd"}, + {file = "pydantic_core-2.33.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:df6a94bf9452c6da9b5d76ed229a5683d0306ccb91cca8e1eea883189780d568"}, + {file = "pydantic_core-2.33.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7965c13b3967909a09ecc91f21d09cfc4576bf78140b988904e94f130f188396"}, + {file = "pydantic_core-2.33.1-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:3f1fdb790440a34f6ecf7679e1863b825cb5ffde858a9197f851168ed08371e5"}, + {file = "pydantic_core-2.33.1-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:5277aec8d879f8d05168fdd17ae811dd313b8ff894aeeaf7cd34ad28b4d77e33"}, + {file = "pydantic_core-2.33.1-pp39-pypy39_pp73-musllinux_1_1_armv7l.whl", hash = "sha256:8ab581d3530611897d863d1a649fb0644b860286b4718db919bfd51ece41f10b"}, + {file = "pydantic_core-2.33.1-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:0483847fa9ad5e3412265c1bd72aad35235512d9ce9d27d81a56d935ef489672"}, + {file = "pydantic_core-2.33.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:de9e06abe3cc5ec6a2d5f75bc99b0bdca4f5c719a5b34026f8c57efbdecd2ee3"}, + {file = 
"pydantic_core-2.33.1.tar.gz", hash = "sha256:bcc9c6fdb0ced789245b02b7d6603e17d1563064ddcfc36f046b61c0c05dd9df"}, ] [package.dependencies] typing-extensions = ">=4.6.0,<4.7.0 || >4.7.0" [[package]] -name = "pyreadline3" -version = "3.5.4" -description = "A python implementation of GNU readline." +name = "pygments" +version = "2.19.1" +description = "Pygments is a syntax highlighting package written in Python." optional = false python-versions = ">=3.8" +groups = ["main"] files = [ - {file = "pyreadline3-3.5.4-py3-none-any.whl", hash = "sha256:eaf8e6cc3c49bcccf145fc6067ba8643d1df34d604a1ec0eccbf7a18e6d3fae6"}, - {file = "pyreadline3-3.5.4.tar.gz", hash = "sha256:8d57d53039a1c75adba8e50dd3d992b28143480816187ea5efbd5c78e6c885b7"}, + {file = "pygments-2.19.1-py3-none-any.whl", hash = "sha256:9ea1544ad55cecf4b8242fab6dd35a93bbce657034b0611ee383099054ab6d8c"}, + {file = "pygments-2.19.1.tar.gz", hash = "sha256:61c16d2a8576dc0649d9f39e089b5f02bcd27fba10d8fb4dcc28173f7a45151f"}, ] [package.extras] -dev = ["build", "flake8", "mypy", "pytest", "twine"] +windows-terminal = ["colorama (>=0.4.6)"] [[package]] name = "pytest" -version = "7.4.4" +version = "8.3.5" description = "pytest: simple powerful testing with Python" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" +groups = ["dev"] files = [ - {file = "pytest-7.4.4-py3-none-any.whl", hash = "sha256:b090cdf5ed60bf4c45261be03239c2c1c22df034fbffe691abe93cd80cea01d8"}, - {file = "pytest-7.4.4.tar.gz", hash = "sha256:2cf0005922c6ace4a3e2ec8b4080eb0d9753fdc93107415332f50ce9e7994280"}, + {file = "pytest-8.3.5-py3-none-any.whl", hash = "sha256:c69214aa47deac29fad6c2a4f590b9c4a9fdb16a403176fe154b79c0b4d4d820"}, + {file = "pytest-8.3.5.tar.gz", hash = "sha256:f4efe70cc14e511565ac476b57c279e12a855b11f48f212af1080ef2263d3845"}, ] [package.dependencies] @@ -2662,36 +1927,11 @@ colorama = {version = "*", markers = "sys_platform == \"win32\""} exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""} iniconfig = "*" packaging = "*" -pluggy = ">=0.12,<2.0" -tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} +pluggy = ">=1.5,<2" +tomli = {version = ">=1", markers = "python_version < \"3.11\""} [package.extras] -testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] - -[[package]] -name = "python-dateutil" -version = "2.9.0.post0" -description = "Extensions to the standard Python datetime module" -optional = true -python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" -files = [ - {file = "python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3"}, - {file = "python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427"}, -] - -[package.dependencies] -six = ">=1.5" - -[[package]] -name = "pytz" -version = "2024.2" -description = "World timezone definitions, modern and historical" -optional = true -python-versions = "*" -files = [ - {file = "pytz-2024.2-py2.py3-none-any.whl", hash = "sha256:31c7c1817eb7fae7ca4b8c7ee50c72f93aa2dd863de768e1ef4245d426aa0725"}, - {file = "pytz-2024.2.tar.gz", hash = "sha256:2aa355083c50a0f93fa581709deac0c9ad65cca8a9e9beac660adcbd493c798a"}, -] +dev = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] [[package]] name = "pyyaml" @@ -2699,6 +1939,7 @@ version = "6.0.2" description = "YAML 
parser and emitter for Python" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "PyYAML-6.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0a9a2848a5b7feac301353437eb7d5957887edbf81d56e903999a75a3d743086"}, {file = "PyYAML-6.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:29717114e51c84ddfba879543fb232a6ed60086602313ca38cce623c1d62cfbf"}, @@ -2757,120 +1998,123 @@ files = [ [[package]] name = "referencing" -version = "0.35.1" +version = "0.36.2" description = "JSON Referencing + Python" optional = true -python-versions = ">=3.8" +python-versions = ">=3.9" +groups = ["main"] files = [ - {file = "referencing-0.35.1-py3-none-any.whl", hash = "sha256:eda6d3234d62814d1c64e305c1331c9a3a6132da475ab6382eaa997b21ee75de"}, - {file = "referencing-0.35.1.tar.gz", hash = "sha256:25b42124a6c8b632a425174f24087783efb348a6f1e0008e63cd4466fedf703c"}, + {file = "referencing-0.36.2-py3-none-any.whl", hash = "sha256:e8699adbbf8b5c7de96d8ffa0eb5c158b3beafce084968e2ea8bb08c6794dcd0"}, + {file = "referencing-0.36.2.tar.gz", hash = "sha256:df2e89862cd09deabbdba16944cc3f10feb6b3e6f18e902f7cc25609a34775aa"}, ] [package.dependencies] attrs = ">=22.2.0" rpds-py = ">=0.7.0" +typing-extensions = {version = ">=4.4.0", markers = "python_version < \"3.13\""} [[package]] name = "regex" -version = "2024.9.11" +version = "2024.11.6" description = "Alternative regular expression module, to replace re." optional = false python-versions = ">=3.8" +groups = ["main"] files = [ - {file = "regex-2024.9.11-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:1494fa8725c285a81d01dc8c06b55287a1ee5e0e382d8413adc0a9197aac6408"}, - {file = "regex-2024.9.11-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0e12c481ad92d129c78f13a2a3662317e46ee7ef96c94fd332e1c29131875b7d"}, - {file = "regex-2024.9.11-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:16e13a7929791ac1216afde26f712802e3df7bf0360b32e4914dca3ab8baeea5"}, - {file = "regex-2024.9.11-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:46989629904bad940bbec2106528140a218b4a36bb3042d8406980be1941429c"}, - {file = "regex-2024.9.11-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a906ed5e47a0ce5f04b2c981af1c9acf9e8696066900bf03b9d7879a6f679fc8"}, - {file = "regex-2024.9.11-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e9a091b0550b3b0207784a7d6d0f1a00d1d1c8a11699c1a4d93db3fbefc3ad35"}, - {file = "regex-2024.9.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5ddcd9a179c0a6fa8add279a4444015acddcd7f232a49071ae57fa6e278f1f71"}, - {file = "regex-2024.9.11-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6b41e1adc61fa347662b09398e31ad446afadff932a24807d3ceb955ed865cc8"}, - {file = "regex-2024.9.11-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:ced479f601cd2f8ca1fd7b23925a7e0ad512a56d6e9476f79b8f381d9d37090a"}, - {file = "regex-2024.9.11-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:635a1d96665f84b292e401c3d62775851aedc31d4f8784117b3c68c4fcd4118d"}, - {file = "regex-2024.9.11-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:c0256beda696edcf7d97ef16b2a33a8e5a875affd6fa6567b54f7c577b30a137"}, - {file = "regex-2024.9.11-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:3ce4f1185db3fbde8ed8aa223fc9620f276c58de8b0d4f8cc86fd1360829edb6"}, - {file = 
"regex-2024.9.11-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:09d77559e80dcc9d24570da3745ab859a9cf91953062e4ab126ba9d5993688ca"}, - {file = "regex-2024.9.11-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:7a22ccefd4db3f12b526eccb129390942fe874a3a9fdbdd24cf55773a1faab1a"}, - {file = "regex-2024.9.11-cp310-cp310-win32.whl", hash = "sha256:f745ec09bc1b0bd15cfc73df6fa4f726dcc26bb16c23a03f9e3367d357eeedd0"}, - {file = "regex-2024.9.11-cp310-cp310-win_amd64.whl", hash = "sha256:01c2acb51f8a7d6494c8c5eafe3d8e06d76563d8a8a4643b37e9b2dd8a2ff623"}, - {file = "regex-2024.9.11-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:2cce2449e5927a0bf084d346da6cd5eb016b2beca10d0013ab50e3c226ffc0df"}, - {file = "regex-2024.9.11-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:3b37fa423beefa44919e009745ccbf353d8c981516e807995b2bd11c2c77d268"}, - {file = "regex-2024.9.11-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:64ce2799bd75039b480cc0360907c4fb2f50022f030bf9e7a8705b636e408fad"}, - {file = "regex-2024.9.11-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a4cc92bb6db56ab0c1cbd17294e14f5e9224f0cc6521167ef388332604e92679"}, - {file = "regex-2024.9.11-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d05ac6fa06959c4172eccd99a222e1fbf17b5670c4d596cb1e5cde99600674c4"}, - {file = "regex-2024.9.11-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:040562757795eeea356394a7fb13076ad4f99d3c62ab0f8bdfb21f99a1f85664"}, - {file = "regex-2024.9.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6113c008a7780792efc80f9dfe10ba0cd043cbf8dc9a76ef757850f51b4edc50"}, - {file = "regex-2024.9.11-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8e5fb5f77c8745a60105403a774fe2c1759b71d3e7b4ca237a5e67ad066c7199"}, - {file = "regex-2024.9.11-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:54d9ff35d4515debf14bc27f1e3b38bfc453eff3220f5bce159642fa762fe5d4"}, - {file = "regex-2024.9.11-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:df5cbb1fbc74a8305b6065d4ade43b993be03dbe0f8b30032cced0d7740994bd"}, - {file = "regex-2024.9.11-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:7fb89ee5d106e4a7a51bce305ac4efb981536301895f7bdcf93ec92ae0d91c7f"}, - {file = "regex-2024.9.11-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:a738b937d512b30bf75995c0159c0ddf9eec0775c9d72ac0202076c72f24aa96"}, - {file = "regex-2024.9.11-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:e28f9faeb14b6f23ac55bfbbfd3643f5c7c18ede093977f1df249f73fd22c7b1"}, - {file = "regex-2024.9.11-cp311-cp311-win32.whl", hash = "sha256:18e707ce6c92d7282dfce370cd205098384b8ee21544e7cb29b8aab955b66fa9"}, - {file = "regex-2024.9.11-cp311-cp311-win_amd64.whl", hash = "sha256:313ea15e5ff2a8cbbad96ccef6be638393041b0a7863183c2d31e0c6116688cf"}, - {file = "regex-2024.9.11-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:b0d0a6c64fcc4ef9c69bd5b3b3626cc3776520a1637d8abaa62b9edc147a58f7"}, - {file = "regex-2024.9.11-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:49b0e06786ea663f933f3710a51e9385ce0cba0ea56b67107fd841a55d56a231"}, - {file = "regex-2024.9.11-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:5b513b6997a0b2f10e4fd3a1313568e373926e8c252bd76c960f96fd039cd28d"}, - {file = "regex-2024.9.11-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ee439691d8c23e76f9802c42a95cfeebf9d47cf4ffd06f18489122dbb0a7ad64"}, - {file = 
"regex-2024.9.11-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a8f877c89719d759e52783f7fe6e1c67121076b87b40542966c02de5503ace42"}, - {file = "regex-2024.9.11-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:23b30c62d0f16827f2ae9f2bb87619bc4fba2044911e2e6c2eb1af0161cdb766"}, - {file = "regex-2024.9.11-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:85ab7824093d8f10d44330fe1e6493f756f252d145323dd17ab6b48733ff6c0a"}, - {file = "regex-2024.9.11-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8dee5b4810a89447151999428fe096977346cf2f29f4d5e29609d2e19e0199c9"}, - {file = "regex-2024.9.11-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:98eeee2f2e63edae2181c886d7911ce502e1292794f4c5ee71e60e23e8d26b5d"}, - {file = "regex-2024.9.11-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:57fdd2e0b2694ce6fc2e5ccf189789c3e2962916fb38779d3e3521ff8fe7a822"}, - {file = "regex-2024.9.11-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:d552c78411f60b1fdaafd117a1fca2f02e562e309223b9d44b7de8be451ec5e0"}, - {file = "regex-2024.9.11-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:a0b2b80321c2ed3fcf0385ec9e51a12253c50f146fddb2abbb10f033fe3d049a"}, - {file = "regex-2024.9.11-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:18406efb2f5a0e57e3a5881cd9354c1512d3bb4f5c45d96d110a66114d84d23a"}, - {file = "regex-2024.9.11-cp312-cp312-win32.whl", hash = "sha256:e464b467f1588e2c42d26814231edecbcfe77f5ac414d92cbf4e7b55b2c2a776"}, - {file = "regex-2024.9.11-cp312-cp312-win_amd64.whl", hash = "sha256:9e8719792ca63c6b8340380352c24dcb8cd7ec49dae36e963742a275dfae6009"}, - {file = "regex-2024.9.11-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:c157bb447303070f256e084668b702073db99bbb61d44f85d811025fcf38f784"}, - {file = "regex-2024.9.11-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:4db21ece84dfeefc5d8a3863f101995de646c6cb0536952c321a2650aa202c36"}, - {file = "regex-2024.9.11-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:220e92a30b426daf23bb67a7962900ed4613589bab80382be09b48896d211e92"}, - {file = "regex-2024.9.11-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:eb1ae19e64c14c7ec1995f40bd932448713d3c73509e82d8cd7744dc00e29e86"}, - {file = "regex-2024.9.11-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f47cd43a5bfa48f86925fe26fbdd0a488ff15b62468abb5d2a1e092a4fb10e85"}, - {file = "regex-2024.9.11-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9d4a76b96f398697fe01117093613166e6aa8195d63f1b4ec3f21ab637632963"}, - {file = "regex-2024.9.11-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0ea51dcc0835eea2ea31d66456210a4e01a076d820e9039b04ae8d17ac11dee6"}, - {file = "regex-2024.9.11-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b7aaa315101c6567a9a45d2839322c51c8d6e81f67683d529512f5bcfb99c802"}, - {file = "regex-2024.9.11-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:c57d08ad67aba97af57a7263c2d9006d5c404d721c5f7542f077f109ec2a4a29"}, - {file = "regex-2024.9.11-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:f8404bf61298bb6f8224bb9176c1424548ee1181130818fcd2cbffddc768bed8"}, - {file = "regex-2024.9.11-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:dd4490a33eb909ef5078ab20f5f000087afa2a4daa27b4c072ccb3cb3050ad84"}, - {file = 
"regex-2024.9.11-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:eee9130eaad130649fd73e5cd92f60e55708952260ede70da64de420cdcad554"}, - {file = "regex-2024.9.11-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:6a2644a93da36c784e546de579ec1806bfd2763ef47babc1b03d765fe560c9f8"}, - {file = "regex-2024.9.11-cp313-cp313-win32.whl", hash = "sha256:e997fd30430c57138adc06bba4c7c2968fb13d101e57dd5bb9355bf8ce3fa7e8"}, - {file = "regex-2024.9.11-cp313-cp313-win_amd64.whl", hash = "sha256:042c55879cfeb21a8adacc84ea347721d3d83a159da6acdf1116859e2427c43f"}, - {file = "regex-2024.9.11-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:35f4a6f96aa6cb3f2f7247027b07b15a374f0d5b912c0001418d1d55024d5cb4"}, - {file = "regex-2024.9.11-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:55b96e7ce3a69a8449a66984c268062fbaa0d8ae437b285428e12797baefce7e"}, - {file = "regex-2024.9.11-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:cb130fccd1a37ed894824b8c046321540263013da72745d755f2d35114b81a60"}, - {file = "regex-2024.9.11-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:323c1f04be6b2968944d730e5c2091c8c89767903ecaa135203eec4565ed2b2b"}, - {file = "regex-2024.9.11-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:be1c8ed48c4c4065ecb19d882a0ce1afe0745dfad8ce48c49586b90a55f02366"}, - {file = "regex-2024.9.11-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b5b029322e6e7b94fff16cd120ab35a253236a5f99a79fb04fda7ae71ca20ae8"}, - {file = "regex-2024.9.11-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f6fff13ef6b5f29221d6904aa816c34701462956aa72a77f1f151a8ec4f56aeb"}, - {file = "regex-2024.9.11-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:587d4af3979376652010e400accc30404e6c16b7df574048ab1f581af82065e4"}, - {file = "regex-2024.9.11-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:079400a8269544b955ffa9e31f186f01d96829110a3bf79dc338e9910f794fca"}, - {file = "regex-2024.9.11-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:f9268774428ec173654985ce55fc6caf4c6d11ade0f6f914d48ef4719eb05ebb"}, - {file = "regex-2024.9.11-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:23f9985c8784e544d53fc2930fc1ac1a7319f5d5332d228437acc9f418f2f168"}, - {file = "regex-2024.9.11-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:ae2941333154baff9838e88aa71c1d84f4438189ecc6021a12c7573728b5838e"}, - {file = "regex-2024.9.11-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:e93f1c331ca8e86fe877a48ad64e77882c0c4da0097f2212873a69bbfea95d0c"}, - {file = "regex-2024.9.11-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:846bc79ee753acf93aef4184c040d709940c9d001029ceb7b7a52747b80ed2dd"}, - {file = "regex-2024.9.11-cp38-cp38-win32.whl", hash = "sha256:c94bb0a9f1db10a1d16c00880bdebd5f9faf267273b8f5bd1878126e0fbde771"}, - {file = "regex-2024.9.11-cp38-cp38-win_amd64.whl", hash = "sha256:2b08fce89fbd45664d3df6ad93e554b6c16933ffa9d55cb7e01182baaf971508"}, - {file = "regex-2024.9.11-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:07f45f287469039ffc2c53caf6803cd506eb5f5f637f1d4acb37a738f71dd066"}, - {file = "regex-2024.9.11-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:4838e24ee015101d9f901988001038f7f0d90dc0c3b115541a1365fb439add62"}, - {file = "regex-2024.9.11-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:6edd623bae6a737f10ce853ea076f56f507fd7726bee96a41ee3d68d347e4d16"}, - {file = 
"regex-2024.9.11-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c69ada171c2d0e97a4b5aa78fbb835e0ffbb6b13fc5da968c09811346564f0d3"}, - {file = "regex-2024.9.11-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:02087ea0a03b4af1ed6ebab2c54d7118127fee8d71b26398e8e4b05b78963199"}, - {file = "regex-2024.9.11-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:69dee6a020693d12a3cf892aba4808fe168d2a4cef368eb9bf74f5398bfd4ee8"}, - {file = "regex-2024.9.11-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:297f54910247508e6e5cae669f2bc308985c60540a4edd1c77203ef19bfa63ca"}, - {file = "regex-2024.9.11-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ecea58b43a67b1b79805f1a0255730edaf5191ecef84dbc4cc85eb30bc8b63b9"}, - {file = "regex-2024.9.11-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:eab4bb380f15e189d1313195b062a6aa908f5bd687a0ceccd47c8211e9cf0d4a"}, - {file = "regex-2024.9.11-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:0cbff728659ce4bbf4c30b2a1be040faafaa9eca6ecde40aaff86f7889f4ab39"}, - {file = "regex-2024.9.11-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:54c4a097b8bc5bb0dfc83ae498061d53ad7b5762e00f4adaa23bee22b012e6ba"}, - {file = "regex-2024.9.11-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:73d6d2f64f4d894c96626a75578b0bf7d9e56dcda8c3d037a2118fdfe9b1c664"}, - {file = "regex-2024.9.11-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:e53b5fbab5d675aec9f0c501274c467c0f9a5d23696cfc94247e1fb56501ed89"}, - {file = "regex-2024.9.11-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:0ffbcf9221e04502fc35e54d1ce9567541979c3fdfb93d2c554f0ca583a19b35"}, - {file = "regex-2024.9.11-cp39-cp39-win32.whl", hash = "sha256:e4c22e1ac1f1ec1e09f72e6c44d8f2244173db7eb9629cc3a346a8d7ccc31142"}, - {file = "regex-2024.9.11-cp39-cp39-win_amd64.whl", hash = "sha256:faa3c142464efec496967359ca99696c896c591c56c53506bac1ad465f66e919"}, - {file = "regex-2024.9.11.tar.gz", hash = "sha256:6c188c307e8433bcb63dc1915022deb553b4203a70722fc542c363bf120a01fd"}, + {file = "regex-2024.11.6-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:ff590880083d60acc0433f9c3f713c51f7ac6ebb9adf889c79a261ecf541aa91"}, + {file = "regex-2024.11.6-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:658f90550f38270639e83ce492f27d2c8d2cd63805c65a13a14d36ca126753f0"}, + {file = "regex-2024.11.6-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:164d8b7b3b4bcb2068b97428060b2a53be050085ef94eca7f240e7947f1b080e"}, + {file = "regex-2024.11.6-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d3660c82f209655a06b587d55e723f0b813d3a7db2e32e5e7dc64ac2a9e86fde"}, + {file = "regex-2024.11.6-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d22326fcdef5e08c154280b71163ced384b428343ae16a5ab2b3354aed12436e"}, + {file = "regex-2024.11.6-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f1ac758ef6aebfc8943560194e9fd0fa18bcb34d89fd8bd2af18183afd8da3a2"}, + {file = "regex-2024.11.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:997d6a487ff00807ba810e0f8332c18b4eb8d29463cfb7c820dc4b6e7562d0cf"}, + {file = "regex-2024.11.6-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:02a02d2bb04fec86ad61f3ea7f49c015a0681bf76abb9857f945d26159d2968c"}, + {file = 
"regex-2024.11.6-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:f02f93b92358ee3f78660e43b4b0091229260c5d5c408d17d60bf26b6c900e86"}, + {file = "regex-2024.11.6-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:06eb1be98df10e81ebaded73fcd51989dcf534e3c753466e4b60c4697a003b67"}, + {file = "regex-2024.11.6-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:040df6fe1a5504eb0f04f048e6d09cd7c7110fef851d7c567a6b6e09942feb7d"}, + {file = "regex-2024.11.6-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:fdabbfc59f2c6edba2a6622c647b716e34e8e3867e0ab975412c5c2f79b82da2"}, + {file = "regex-2024.11.6-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:8447d2d39b5abe381419319f942de20b7ecd60ce86f16a23b0698f22e1b70008"}, + {file = "regex-2024.11.6-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:da8f5fc57d1933de22a9e23eec290a0d8a5927a5370d24bda9a6abe50683fe62"}, + {file = "regex-2024.11.6-cp310-cp310-win32.whl", hash = "sha256:b489578720afb782f6ccf2840920f3a32e31ba28a4b162e13900c3e6bd3f930e"}, + {file = "regex-2024.11.6-cp310-cp310-win_amd64.whl", hash = "sha256:5071b2093e793357c9d8b2929dfc13ac5f0a6c650559503bb81189d0a3814519"}, + {file = "regex-2024.11.6-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:5478c6962ad548b54a591778e93cd7c456a7a29f8eca9c49e4f9a806dcc5d638"}, + {file = "regex-2024.11.6-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:2c89a8cc122b25ce6945f0423dc1352cb9593c68abd19223eebbd4e56612c5b7"}, + {file = "regex-2024.11.6-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:94d87b689cdd831934fa3ce16cc15cd65748e6d689f5d2b8f4f4df2065c9fa20"}, + {file = "regex-2024.11.6-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1062b39a0a2b75a9c694f7a08e7183a80c63c0d62b301418ffd9c35f55aaa114"}, + {file = "regex-2024.11.6-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:167ed4852351d8a750da48712c3930b031f6efdaa0f22fa1933716bfcd6bf4a3"}, + {file = "regex-2024.11.6-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2d548dafee61f06ebdb584080621f3e0c23fff312f0de1afc776e2a2ba99a74f"}, + {file = "regex-2024.11.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f2a19f302cd1ce5dd01a9099aaa19cae6173306d1302a43b627f62e21cf18ac0"}, + {file = "regex-2024.11.6-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bec9931dfb61ddd8ef2ebc05646293812cb6b16b60cf7c9511a832b6f1854b55"}, + {file = "regex-2024.11.6-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:9714398225f299aa85267fd222f7142fcb5c769e73d7733344efc46f2ef5cf89"}, + {file = "regex-2024.11.6-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:202eb32e89f60fc147a41e55cb086db2a3f8cb82f9a9a88440dcfc5d37faae8d"}, + {file = "regex-2024.11.6-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:4181b814e56078e9b00427ca358ec44333765f5ca1b45597ec7446d3a1ef6e34"}, + {file = "regex-2024.11.6-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:068376da5a7e4da51968ce4c122a7cd31afaaec4fccc7856c92f63876e57b51d"}, + {file = "regex-2024.11.6-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:ac10f2c4184420d881a3475fb2c6f4d95d53a8d50209a2500723d831036f7c45"}, + {file = "regex-2024.11.6-cp311-cp311-win32.whl", hash = "sha256:c36f9b6f5f8649bb251a5f3f66564438977b7ef8386a52460ae77e6070d309d9"}, + {file = "regex-2024.11.6-cp311-cp311-win_amd64.whl", hash = 
"sha256:02e28184be537f0e75c1f9b2f8847dc51e08e6e171c6bde130b2687e0c33cf60"}, + {file = "regex-2024.11.6-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:52fb28f528778f184f870b7cf8f225f5eef0a8f6e3778529bdd40c7b3920796a"}, + {file = "regex-2024.11.6-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:fdd6028445d2460f33136c55eeb1f601ab06d74cb3347132e1c24250187500d9"}, + {file = "regex-2024.11.6-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:805e6b60c54bf766b251e94526ebad60b7de0c70f70a4e6210ee2891acb70bf2"}, + {file = "regex-2024.11.6-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b85c2530be953a890eaffde05485238f07029600e8f098cdf1848d414a8b45e4"}, + {file = "regex-2024.11.6-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bb26437975da7dc36b7efad18aa9dd4ea569d2357ae6b783bf1118dabd9ea577"}, + {file = "regex-2024.11.6-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:abfa5080c374a76a251ba60683242bc17eeb2c9818d0d30117b4486be10c59d3"}, + {file = "regex-2024.11.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:70b7fa6606c2881c1db9479b0eaa11ed5dfa11c8d60a474ff0e095099f39d98e"}, + {file = "regex-2024.11.6-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0c32f75920cf99fe6b6c539c399a4a128452eaf1af27f39bce8909c9a3fd8cbe"}, + {file = "regex-2024.11.6-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:982e6d21414e78e1f51cf595d7f321dcd14de1f2881c5dc6a6e23bbbbd68435e"}, + {file = "regex-2024.11.6-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:a7c2155f790e2fb448faed6dd241386719802296ec588a8b9051c1f5c481bc29"}, + {file = "regex-2024.11.6-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:149f5008d286636e48cd0b1dd65018548944e495b0265b45e1bffecce1ef7f39"}, + {file = "regex-2024.11.6-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:e5364a4502efca094731680e80009632ad6624084aff9a23ce8c8c6820de3e51"}, + {file = "regex-2024.11.6-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:0a86e7eeca091c09e021db8eb72d54751e527fa47b8d5787caf96d9831bd02ad"}, + {file = "regex-2024.11.6-cp312-cp312-win32.whl", hash = "sha256:32f9a4c643baad4efa81d549c2aadefaeba12249b2adc5af541759237eee1c54"}, + {file = "regex-2024.11.6-cp312-cp312-win_amd64.whl", hash = "sha256:a93c194e2df18f7d264092dc8539b8ffb86b45b899ab976aa15d48214138e81b"}, + {file = "regex-2024.11.6-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:a6ba92c0bcdf96cbf43a12c717eae4bc98325ca3730f6b130ffa2e3c3c723d84"}, + {file = "regex-2024.11.6-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:525eab0b789891ac3be914d36893bdf972d483fe66551f79d3e27146191a37d4"}, + {file = "regex-2024.11.6-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:086a27a0b4ca227941700e0b31425e7a28ef1ae8e5e05a33826e17e47fbfdba0"}, + {file = "regex-2024.11.6-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bde01f35767c4a7899b7eb6e823b125a64de314a8ee9791367c9a34d56af18d0"}, + {file = "regex-2024.11.6-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b583904576650166b3d920d2bcce13971f6f9e9a396c673187f49811b2769dc7"}, + {file = "regex-2024.11.6-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1c4de13f06a0d54fa0d5ab1b7138bfa0d883220965a29616e3ea61b35d5f5fc7"}, + {file = "regex-2024.11.6-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:3cde6e9f2580eb1665965ce9bf17ff4952f34f5b126beb509fee8f4e994f143c"}, + {file = "regex-2024.11.6-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0d7f453dca13f40a02b79636a339c5b62b670141e63efd511d3f8f73fba162b3"}, + {file = "regex-2024.11.6-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:59dfe1ed21aea057a65c6b586afd2a945de04fc7db3de0a6e3ed5397ad491b07"}, + {file = "regex-2024.11.6-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:b97c1e0bd37c5cd7902e65f410779d39eeda155800b65fc4d04cc432efa9bc6e"}, + {file = "regex-2024.11.6-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:f9d1e379028e0fc2ae3654bac3cbbef81bf3fd571272a42d56c24007979bafb6"}, + {file = "regex-2024.11.6-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:13291b39131e2d002a7940fb176e120bec5145f3aeb7621be6534e46251912c4"}, + {file = "regex-2024.11.6-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4f51f88c126370dcec4908576c5a627220da6c09d0bff31cfa89f2523843316d"}, + {file = "regex-2024.11.6-cp313-cp313-win32.whl", hash = "sha256:63b13cfd72e9601125027202cad74995ab26921d8cd935c25f09c630436348ff"}, + {file = "regex-2024.11.6-cp313-cp313-win_amd64.whl", hash = "sha256:2b3361af3198667e99927da8b84c1b010752fa4b1115ee30beaa332cabc3ef1a"}, + {file = "regex-2024.11.6-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:3a51ccc315653ba012774efca4f23d1d2a8a8f278a6072e29c7147eee7da446b"}, + {file = "regex-2024.11.6-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:ad182d02e40de7459b73155deb8996bbd8e96852267879396fb274e8700190e3"}, + {file = "regex-2024.11.6-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:ba9b72e5643641b7d41fa1f6d5abda2c9a263ae835b917348fc3c928182ad467"}, + {file = "regex-2024.11.6-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:40291b1b89ca6ad8d3f2b82782cc33807f1406cf68c8d440861da6304d8ffbbd"}, + {file = "regex-2024.11.6-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:cdf58d0e516ee426a48f7b2c03a332a4114420716d55769ff7108c37a09951bf"}, + {file = "regex-2024.11.6-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a36fdf2af13c2b14738f6e973aba563623cb77d753bbbd8d414d18bfaa3105dd"}, + {file = "regex-2024.11.6-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d1cee317bfc014c2419a76bcc87f071405e3966da434e03e13beb45f8aced1a6"}, + {file = "regex-2024.11.6-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:50153825ee016b91549962f970d6a4442fa106832e14c918acd1c8e479916c4f"}, + {file = "regex-2024.11.6-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:ea1bfda2f7162605f6e8178223576856b3d791109f15ea99a9f95c16a7636fb5"}, + {file = "regex-2024.11.6-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:df951c5f4a1b1910f1a99ff42c473ff60f8225baa1cdd3539fe2819d9543e9df"}, + {file = "regex-2024.11.6-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:072623554418a9911446278f16ecb398fb3b540147a7828c06e2011fa531e773"}, + {file = "regex-2024.11.6-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:f654882311409afb1d780b940234208a252322c24a93b442ca714d119e68086c"}, + {file = "regex-2024.11.6-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:89d75e7293d2b3e674db7d4d9b1bee7f8f3d1609428e293771d1a962617150cc"}, + {file = "regex-2024.11.6-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:f65557897fc977a44ab205ea871b690adaef6b9da6afda4790a2484b04293a5f"}, + 
{file = "regex-2024.11.6-cp38-cp38-win32.whl", hash = "sha256:6f44ec28b1f858c98d3036ad5d7d0bfc568bdd7a74f9c24e25f41ef1ebfd81a4"}, + {file = "regex-2024.11.6-cp38-cp38-win_amd64.whl", hash = "sha256:bb8f74f2f10dbf13a0be8de623ba4f9491faf58c24064f32b65679b021ed0001"}, + {file = "regex-2024.11.6-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:5704e174f8ccab2026bd2f1ab6c510345ae8eac818b613d7d73e785f1310f839"}, + {file = "regex-2024.11.6-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:220902c3c5cc6af55d4fe19ead504de80eb91f786dc102fbd74894b1551f095e"}, + {file = "regex-2024.11.6-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:5e7e351589da0850c125f1600a4c4ba3c722efefe16b297de54300f08d734fbf"}, + {file = "regex-2024.11.6-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5056b185ca113c88e18223183aa1a50e66507769c9640a6ff75859619d73957b"}, + {file = "regex-2024.11.6-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2e34b51b650b23ed3354b5a07aab37034d9f923db2a40519139af34f485f77d0"}, + {file = "regex-2024.11.6-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5670bce7b200273eee1840ef307bfa07cda90b38ae56e9a6ebcc9f50da9c469b"}, + {file = "regex-2024.11.6-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:08986dce1339bc932923e7d1232ce9881499a0e02925f7402fb7c982515419ef"}, + {file = "regex-2024.11.6-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:93c0b12d3d3bc25af4ebbf38f9ee780a487e8bf6954c115b9f015822d3bb8e48"}, + {file = "regex-2024.11.6-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:764e71f22ab3b305e7f4c21f1a97e1526a25ebdd22513e251cf376760213da13"}, + {file = "regex-2024.11.6-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:f056bf21105c2515c32372bbc057f43eb02aae2fda61052e2f7622c801f0b4e2"}, + {file = "regex-2024.11.6-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:69ab78f848845569401469da20df3e081e6b5a11cb086de3eed1d48f5ed57c95"}, + {file = "regex-2024.11.6-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:86fddba590aad9208e2fa8b43b4c098bb0ec74f15718bb6a704e3c63e2cef3e9"}, + {file = "regex-2024.11.6-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:684d7a212682996d21ca12ef3c17353c021fe9de6049e19ac8481ec35574a70f"}, + {file = "regex-2024.11.6-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:a03e02f48cd1abbd9f3b7e3586d97c8f7a9721c436f51a5245b3b9483044480b"}, + {file = "regex-2024.11.6-cp39-cp39-win32.whl", hash = "sha256:41758407fc32d5c3c5de163888068cfee69cb4c2be844e7ac517a52770f9af57"}, + {file = "regex-2024.11.6-cp39-cp39-win_amd64.whl", hash = "sha256:b2837718570f95dd41675328e111345f9b7095d821bac435aac173ac80b19983"}, + {file = "regex-2024.11.6.tar.gz", hash = "sha256:7ab159b063c52a0333c884e4679f8d7a85112ee3078fe3d9004b2dd875585519"}, ] [[package]] @@ -2879,6 +2123,7 @@ version = "2.32.3" description = "Python HTTP for Humans." 
optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "requests-2.32.3-py3-none-any.whl", hash = "sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6"}, {file = "requests-2.32.3.tar.gz", hash = "sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760"}, @@ -2896,251 +2141,171 @@ use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] [[package]] name = "rich" -version = "13.8.1" +version = "14.0.0" description = "Render rich text, tables, progress bars, syntax highlighting, markdown and more to the terminal" optional = false -python-versions = ">=3.7.0" +python-versions = ">=3.8.0" +groups = ["main"] files = [ - {file = "rich-13.8.1-py3-none-any.whl", hash = "sha256:1760a3c0848469b97b558fc61c85233e3dafb69c7a071b4d60c38099d3cd4c06"}, - {file = "rich-13.8.1.tar.gz", hash = "sha256:8260cda28e3db6bf04d2d1ef4dbc03ba80a824c88b0e7668a0f23126a424844a"}, + {file = "rich-14.0.0-py3-none-any.whl", hash = "sha256:1c9491e1951aac09caffd42f448ee3d04e58923ffe14993f6e83068dc395d7e0"}, + {file = "rich-14.0.0.tar.gz", hash = "sha256:82f1bc23a6a21ebca4ae0c45af9bdbc492ed20231dcb63f297d6d1021a9d5725"}, ] [package.dependencies] markdown-it-py = ">=2.2.0" pygments = ">=2.13.0,<3.0.0" +typing-extensions = {version = ">=4.0.0,<5.0", markers = "python_version < \"3.11\""} [package.extras] jupyter = ["ipywidgets (>=7.5.1,<9)"] [[package]] name = "rpds-py" -version = "0.20.0" +version = "0.24.0" description = "Python bindings to Rust's persistent data structures (rpds)" optional = true -python-versions = ">=3.8" +python-versions = ">=3.9" +groups = ["main"] files = [ - {file = "rpds_py-0.20.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:3ad0fda1635f8439cde85c700f964b23ed5fc2d28016b32b9ee5fe30da5c84e2"}, - {file = "rpds_py-0.20.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9bb4a0d90fdb03437c109a17eade42dfbf6190408f29b2744114d11586611d6f"}, - {file = "rpds_py-0.20.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c6377e647bbfd0a0b159fe557f2c6c602c159fc752fa316572f012fc0bf67150"}, - {file = "rpds_py-0.20.0-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:eb851b7df9dda52dc1415ebee12362047ce771fc36914586b2e9fcbd7d293b3e"}, - {file = "rpds_py-0.20.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1e0f80b739e5a8f54837be5d5c924483996b603d5502bfff79bf33da06164ee2"}, - {file = "rpds_py-0.20.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5a8c94dad2e45324fc74dce25e1645d4d14df9a4e54a30fa0ae8bad9a63928e3"}, - {file = "rpds_py-0.20.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f8e604fe73ba048c06085beaf51147eaec7df856824bfe7b98657cf436623daf"}, - {file = "rpds_py-0.20.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:df3de6b7726b52966edf29663e57306b23ef775faf0ac01a3e9f4012a24a4140"}, - {file = "rpds_py-0.20.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:cf258ede5bc22a45c8e726b29835b9303c285ab46fc7c3a4cc770736b5304c9f"}, - {file = "rpds_py-0.20.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:55fea87029cded5df854ca7e192ec7bdb7ecd1d9a3f63d5c4eb09148acf4a7ce"}, - {file = "rpds_py-0.20.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:ae94bd0b2f02c28e199e9bc51485d0c5601f58780636185660f86bf80c89af94"}, - {file = "rpds_py-0.20.0-cp310-none-win32.whl", hash = "sha256:28527c685f237c05445efec62426d285e47a58fb05ba0090a4340b73ecda6dee"}, - {file = "rpds_py-0.20.0-cp310-none-win_amd64.whl", 
hash = "sha256:238a2d5b1cad28cdc6ed15faf93a998336eb041c4e440dd7f902528b8891b399"}, - {file = "rpds_py-0.20.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:ac2f4f7a98934c2ed6505aead07b979e6f999389f16b714448fb39bbaa86a489"}, - {file = "rpds_py-0.20.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:220002c1b846db9afd83371d08d239fdc865e8f8c5795bbaec20916a76db3318"}, - {file = "rpds_py-0.20.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8d7919548df3f25374a1f5d01fbcd38dacab338ef5f33e044744b5c36729c8db"}, - {file = "rpds_py-0.20.0-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:758406267907b3781beee0f0edfe4a179fbd97c0be2e9b1154d7f0a1279cf8e5"}, - {file = "rpds_py-0.20.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3d61339e9f84a3f0767b1995adfb171a0d00a1185192718a17af6e124728e0f5"}, - {file = "rpds_py-0.20.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1259c7b3705ac0a0bd38197565a5d603218591d3f6cee6e614e380b6ba61c6f6"}, - {file = "rpds_py-0.20.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5c1dc0f53856b9cc9a0ccca0a7cc61d3d20a7088201c0937f3f4048c1718a209"}, - {file = "rpds_py-0.20.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:7e60cb630f674a31f0368ed32b2a6b4331b8350d67de53c0359992444b116dd3"}, - {file = "rpds_py-0.20.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:dbe982f38565bb50cb7fb061ebf762c2f254ca3d8c20d4006878766e84266272"}, - {file = "rpds_py-0.20.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:514b3293b64187172bc77c8fb0cdae26981618021053b30d8371c3a902d4d5ad"}, - {file = "rpds_py-0.20.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:d0a26ffe9d4dd35e4dfdd1e71f46401cff0181c75ac174711ccff0459135fa58"}, - {file = "rpds_py-0.20.0-cp311-none-win32.whl", hash = "sha256:89c19a494bf3ad08c1da49445cc5d13d8fefc265f48ee7e7556839acdacf69d0"}, - {file = "rpds_py-0.20.0-cp311-none-win_amd64.whl", hash = "sha256:c638144ce971df84650d3ed0096e2ae7af8e62ecbbb7b201c8935c370df00a2c"}, - {file = "rpds_py-0.20.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:a84ab91cbe7aab97f7446652d0ed37d35b68a465aeef8fc41932a9d7eee2c1a6"}, - {file = "rpds_py-0.20.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:56e27147a5a4c2c21633ff8475d185734c0e4befd1c989b5b95a5d0db699b21b"}, - {file = "rpds_py-0.20.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2580b0c34583b85efec8c5c5ec9edf2dfe817330cc882ee972ae650e7b5ef739"}, - {file = "rpds_py-0.20.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:b80d4a7900cf6b66bb9cee5c352b2d708e29e5a37fe9bf784fa97fc11504bf6c"}, - {file = "rpds_py-0.20.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:50eccbf054e62a7b2209b28dc7a22d6254860209d6753e6b78cfaeb0075d7bee"}, - {file = "rpds_py-0.20.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:49a8063ea4296b3a7e81a5dfb8f7b2d73f0b1c20c2af401fb0cdf22e14711a96"}, - {file = "rpds_py-0.20.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ea438162a9fcbee3ecf36c23e6c68237479f89f962f82dae83dc15feeceb37e4"}, - {file = "rpds_py-0.20.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:18d7585c463087bddcfa74c2ba267339f14f2515158ac4db30b1f9cbdb62c8ef"}, - {file = "rpds_py-0.20.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:d4c7d1a051eeb39f5c9547e82ea27cbcc28338482242e3e0b7768033cb083821"}, 
- {file = "rpds_py-0.20.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:e4df1e3b3bec320790f699890d41c59d250f6beda159ea3c44c3f5bac1976940"}, - {file = "rpds_py-0.20.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:2cf126d33a91ee6eedc7f3197b53e87a2acdac63602c0f03a02dd69e4b138174"}, - {file = "rpds_py-0.20.0-cp312-none-win32.whl", hash = "sha256:8bc7690f7caee50b04a79bf017a8d020c1f48c2a1077ffe172abec59870f1139"}, - {file = "rpds_py-0.20.0-cp312-none-win_amd64.whl", hash = "sha256:0e13e6952ef264c40587d510ad676a988df19adea20444c2b295e536457bc585"}, - {file = "rpds_py-0.20.0-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:aa9a0521aeca7d4941499a73ad7d4f8ffa3d1affc50b9ea11d992cd7eff18a29"}, - {file = "rpds_py-0.20.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:4a1f1d51eccb7e6c32ae89243cb352389228ea62f89cd80823ea7dd1b98e0b91"}, - {file = "rpds_py-0.20.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8a86a9b96070674fc88b6f9f71a97d2c1d3e5165574615d1f9168ecba4cecb24"}, - {file = "rpds_py-0.20.0-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:6c8ef2ebf76df43f5750b46851ed1cdf8f109d7787ca40035fe19fbdc1acc5a7"}, - {file = "rpds_py-0.20.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b74b25f024b421d5859d156750ea9a65651793d51b76a2e9238c05c9d5f203a9"}, - {file = "rpds_py-0.20.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:57eb94a8c16ab08fef6404301c38318e2c5a32216bf5de453e2714c964c125c8"}, - {file = "rpds_py-0.20.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e1940dae14e715e2e02dfd5b0f64a52e8374a517a1e531ad9412319dc3ac7879"}, - {file = "rpds_py-0.20.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d20277fd62e1b992a50c43f13fbe13277a31f8c9f70d59759c88f644d66c619f"}, - {file = "rpds_py-0.20.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:06db23d43f26478303e954c34c75182356ca9aa7797d22c5345b16871ab9c45c"}, - {file = "rpds_py-0.20.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:b2a5db5397d82fa847e4c624b0c98fe59d2d9b7cf0ce6de09e4d2e80f8f5b3f2"}, - {file = "rpds_py-0.20.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:5a35df9f5548fd79cb2f52d27182108c3e6641a4feb0f39067911bf2adaa3e57"}, - {file = "rpds_py-0.20.0-cp313-none-win32.whl", hash = "sha256:fd2d84f40633bc475ef2d5490b9c19543fbf18596dcb1b291e3a12ea5d722f7a"}, - {file = "rpds_py-0.20.0-cp313-none-win_amd64.whl", hash = "sha256:9bc2d153989e3216b0559251b0c260cfd168ec78b1fac33dd485750a228db5a2"}, - {file = "rpds_py-0.20.0-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:f2fbf7db2012d4876fb0d66b5b9ba6591197b0f165db8d99371d976546472a24"}, - {file = "rpds_py-0.20.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:1e5f3cd7397c8f86c8cc72d5a791071431c108edd79872cdd96e00abd8497d29"}, - {file = "rpds_py-0.20.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ce9845054c13696f7af7f2b353e6b4f676dab1b4b215d7fe5e05c6f8bb06f965"}, - {file = "rpds_py-0.20.0-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c3e130fd0ec56cb76eb49ef52faead8ff09d13f4527e9b0c400307ff72b408e1"}, - {file = "rpds_py-0.20.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4b16aa0107ecb512b568244ef461f27697164d9a68d8b35090e9b0c1c8b27752"}, - {file = "rpds_py-0.20.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:aa7f429242aae2947246587d2964fad750b79e8c233a2367f71b554e9447949c"}, - {file = 
"rpds_py-0.20.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:af0fc424a5842a11e28956e69395fbbeab2c97c42253169d87e90aac2886d751"}, - {file = "rpds_py-0.20.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b8c00a3b1e70c1d3891f0db1b05292747f0dbcfb49c43f9244d04c70fbc40eb8"}, - {file = "rpds_py-0.20.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:40ce74fc86ee4645d0a225498d091d8bc61f39b709ebef8204cb8b5a464d3c0e"}, - {file = "rpds_py-0.20.0-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:4fe84294c7019456e56d93e8ababdad5a329cd25975be749c3f5f558abb48253"}, - {file = "rpds_py-0.20.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:338ca4539aad4ce70a656e5187a3a31c5204f261aef9f6ab50e50bcdffaf050a"}, - {file = "rpds_py-0.20.0-cp38-none-win32.whl", hash = "sha256:54b43a2b07db18314669092bb2de584524d1ef414588780261e31e85846c26a5"}, - {file = "rpds_py-0.20.0-cp38-none-win_amd64.whl", hash = "sha256:a1862d2d7ce1674cffa6d186d53ca95c6e17ed2b06b3f4c476173565c862d232"}, - {file = "rpds_py-0.20.0-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:3fde368e9140312b6e8b6c09fb9f8c8c2f00999d1823403ae90cc00480221b22"}, - {file = "rpds_py-0.20.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9824fb430c9cf9af743cf7aaf6707bf14323fb51ee74425c380f4c846ea70789"}, - {file = "rpds_py-0.20.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:11ef6ce74616342888b69878d45e9f779b95d4bd48b382a229fe624a409b72c5"}, - {file = "rpds_py-0.20.0-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c52d3f2f82b763a24ef52f5d24358553e8403ce05f893b5347098014f2d9eff2"}, - {file = "rpds_py-0.20.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9d35cef91e59ebbeaa45214861874bc6f19eb35de96db73e467a8358d701a96c"}, - {file = "rpds_py-0.20.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d72278a30111e5b5525c1dd96120d9e958464316f55adb030433ea905866f4de"}, - {file = "rpds_py-0.20.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b4c29cbbba378759ac5786730d1c3cb4ec6f8ababf5c42a9ce303dc4b3d08cda"}, - {file = "rpds_py-0.20.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6632f2d04f15d1bd6fe0eedd3b86d9061b836ddca4c03d5cf5c7e9e6b7c14580"}, - {file = "rpds_py-0.20.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:d0b67d87bb45ed1cd020e8fbf2307d449b68abc45402fe1a4ac9e46c3c8b192b"}, - {file = "rpds_py-0.20.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:ec31a99ca63bf3cd7f1a5ac9fe95c5e2d060d3c768a09bc1d16e235840861420"}, - {file = "rpds_py-0.20.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:22e6c9976e38f4d8c4a63bd8a8edac5307dffd3ee7e6026d97f3cc3a2dc02a0b"}, - {file = "rpds_py-0.20.0-cp39-none-win32.whl", hash = "sha256:569b3ea770c2717b730b61998b6c54996adee3cef69fc28d444f3e7920313cf7"}, - {file = "rpds_py-0.20.0-cp39-none-win_amd64.whl", hash = "sha256:e6900ecdd50ce0facf703f7a00df12374b74bbc8ad9fe0f6559947fb20f82364"}, - {file = "rpds_py-0.20.0-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:617c7357272c67696fd052811e352ac54ed1d9b49ab370261a80d3b6ce385045"}, - {file = "rpds_py-0.20.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:9426133526f69fcaba6e42146b4e12d6bc6c839b8b555097020e2b78ce908dcc"}, - {file = "rpds_py-0.20.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:deb62214c42a261cb3eb04d474f7155279c1a8a8c30ac89b7dcb1721d92c3c02"}, - {file = 
"rpds_py-0.20.0-pp310-pypy310_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:fcaeb7b57f1a1e071ebd748984359fef83ecb026325b9d4ca847c95bc7311c92"}, - {file = "rpds_py-0.20.0-pp310-pypy310_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d454b8749b4bd70dd0a79f428731ee263fa6995f83ccb8bada706e8d1d3ff89d"}, - {file = "rpds_py-0.20.0-pp310-pypy310_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d807dc2051abe041b6649681dce568f8e10668e3c1c6543ebae58f2d7e617855"}, - {file = "rpds_py-0.20.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c3c20f0ddeb6e29126d45f89206b8291352b8c5b44384e78a6499d68b52ae511"}, - {file = "rpds_py-0.20.0-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b7f19250ceef892adf27f0399b9e5afad019288e9be756d6919cb58892129f51"}, - {file = "rpds_py-0.20.0-pp310-pypy310_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:4f1ed4749a08379555cebf4650453f14452eaa9c43d0a95c49db50c18b7da075"}, - {file = "rpds_py-0.20.0-pp310-pypy310_pp73-musllinux_1_2_i686.whl", hash = "sha256:dcedf0b42bcb4cfff4101d7771a10532415a6106062f005ab97d1d0ab5681c60"}, - {file = "rpds_py-0.20.0-pp310-pypy310_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:39ed0d010457a78f54090fafb5d108501b5aa5604cc22408fc1c0c77eac14344"}, - {file = "rpds_py-0.20.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:bb273176be34a746bdac0b0d7e4e2c467323d13640b736c4c477881a3220a989"}, - {file = "rpds_py-0.20.0-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:f918a1a130a6dfe1d7fe0f105064141342e7dd1611f2e6a21cd2f5c8cb1cfb3e"}, - {file = "rpds_py-0.20.0-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:f60012a73aa396be721558caa3a6fd49b3dd0033d1675c6d59c4502e870fcf0c"}, - {file = "rpds_py-0.20.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3d2b1ad682a3dfda2a4e8ad8572f3100f95fad98cb99faf37ff0ddfe9cbf9d03"}, - {file = "rpds_py-0.20.0-pp39-pypy39_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:614fdafe9f5f19c63ea02817fa4861c606a59a604a77c8cdef5aa01d28b97921"}, - {file = "rpds_py-0.20.0-pp39-pypy39_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:fa518bcd7600c584bf42e6617ee8132869e877db2f76bcdc281ec6a4113a53ab"}, - {file = "rpds_py-0.20.0-pp39-pypy39_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f0475242f447cc6cb8a9dd486d68b2ef7fbee84427124c232bff5f63b1fe11e5"}, - {file = "rpds_py-0.20.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f90a4cd061914a60bd51c68bcb4357086991bd0bb93d8aa66a6da7701370708f"}, - {file = "rpds_py-0.20.0-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:def7400461c3a3f26e49078302e1c1b38f6752342c77e3cf72ce91ca69fb1bc1"}, - {file = "rpds_py-0.20.0-pp39-pypy39_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:65794e4048ee837494aea3c21a28ad5fc080994dfba5b036cf84de37f7ad5074"}, - {file = "rpds_py-0.20.0-pp39-pypy39_pp73-musllinux_1_2_i686.whl", hash = "sha256:faefcc78f53a88f3076b7f8be0a8f8d35133a3ecf7f3770895c25f8813460f08"}, - {file = "rpds_py-0.20.0-pp39-pypy39_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:5b4f105deeffa28bbcdff6c49b34e74903139afa690e35d2d9e3c2c2fba18cec"}, - {file = "rpds_py-0.20.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:fdfc3a892927458d98f3d55428ae46b921d1f7543b89382fdb483f5640daaec8"}, - {file = "rpds_py-0.20.0.tar.gz", hash = "sha256:d72a210824facfdaf8768cf2d7ca25a042c30320b3020de2fa04640920d4e121"}, + {file = 
"rpds_py-0.24.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:006f4342fe729a368c6df36578d7a348c7c716be1da0a1a0f86e3021f8e98724"}, + {file = "rpds_py-0.24.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2d53747da70a4e4b17f559569d5f9506420966083a31c5fbd84e764461c4444b"}, + {file = "rpds_py-0.24.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e8acd55bd5b071156bae57b555f5d33697998752673b9de554dd82f5b5352727"}, + {file = "rpds_py-0.24.0-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:7e80d375134ddb04231a53800503752093dbb65dad8dabacce2c84cccc78e964"}, + {file = "rpds_py-0.24.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:60748789e028d2a46fc1c70750454f83c6bdd0d05db50f5ae83e2db500b34da5"}, + {file = "rpds_py-0.24.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6e1daf5bf6c2be39654beae83ee6b9a12347cb5aced9a29eecf12a2d25fff664"}, + {file = "rpds_py-0.24.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1b221c2457d92a1fb3c97bee9095c874144d196f47c038462ae6e4a14436f7bc"}, + {file = "rpds_py-0.24.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:66420986c9afff67ef0c5d1e4cdc2d0e5262f53ad11e4f90e5e22448df485bf0"}, + {file = "rpds_py-0.24.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:43dba99f00f1d37b2a0265a259592d05fcc8e7c19d140fe51c6e6f16faabeb1f"}, + {file = "rpds_py-0.24.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:a88c0d17d039333a41d9bf4616bd062f0bd7aa0edeb6cafe00a2fc2a804e944f"}, + {file = "rpds_py-0.24.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:cc31e13ce212e14a539d430428cd365e74f8b2d534f8bc22dd4c9c55b277b875"}, + {file = "rpds_py-0.24.0-cp310-cp310-win32.whl", hash = "sha256:fc2c1e1b00f88317d9de6b2c2b39b012ebbfe35fe5e7bef980fd2a91f6100a07"}, + {file = "rpds_py-0.24.0-cp310-cp310-win_amd64.whl", hash = "sha256:c0145295ca415668420ad142ee42189f78d27af806fcf1f32a18e51d47dd2052"}, + {file = "rpds_py-0.24.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:2d3ee4615df36ab8eb16c2507b11e764dcc11fd350bbf4da16d09cda11fcedef"}, + {file = "rpds_py-0.24.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e13ae74a8a3a0c2f22f450f773e35f893484fcfacb00bb4344a7e0f4f48e1f97"}, + {file = "rpds_py-0.24.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cf86f72d705fc2ef776bb7dd9e5fbba79d7e1f3e258bf9377f8204ad0fc1c51e"}, + {file = "rpds_py-0.24.0-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c43583ea8517ed2e780a345dd9960896afc1327e8cf3ac8239c167530397440d"}, + {file = "rpds_py-0.24.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4cd031e63bc5f05bdcda120646a0d32f6d729486d0067f09d79c8db5368f4586"}, + {file = "rpds_py-0.24.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:34d90ad8c045df9a4259c47d2e16a3f21fdb396665c94520dbfe8766e62187a4"}, + {file = "rpds_py-0.24.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e838bf2bb0b91ee67bf2b889a1a841e5ecac06dd7a2b1ef4e6151e2ce155c7ae"}, + {file = "rpds_py-0.24.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:04ecf5c1ff4d589987b4d9882872f80ba13da7d42427234fce8f22efb43133bc"}, + {file = "rpds_py-0.24.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:630d3d8ea77eabd6cbcd2ea712e1c5cecb5b558d39547ac988351195db433f6c"}, + {file = "rpds_py-0.24.0-cp311-cp311-musllinux_1_2_i686.whl", hash = 
"sha256:ebcb786b9ff30b994d5969213a8430cbb984cdd7ea9fd6df06663194bd3c450c"}, + {file = "rpds_py-0.24.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:174e46569968ddbbeb8a806d9922f17cd2b524aa753b468f35b97ff9c19cb718"}, + {file = "rpds_py-0.24.0-cp311-cp311-win32.whl", hash = "sha256:5ef877fa3bbfb40b388a5ae1cb00636a624690dcb9a29a65267054c9ea86d88a"}, + {file = "rpds_py-0.24.0-cp311-cp311-win_amd64.whl", hash = "sha256:e274f62cbd274359eff63e5c7e7274c913e8e09620f6a57aae66744b3df046d6"}, + {file = "rpds_py-0.24.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:d8551e733626afec514b5d15befabea0dd70a343a9f23322860c4f16a9430205"}, + {file = "rpds_py-0.24.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:0e374c0ce0ca82e5b67cd61fb964077d40ec177dd2c4eda67dba130de09085c7"}, + {file = "rpds_py-0.24.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d69d003296df4840bd445a5d15fa5b6ff6ac40496f956a221c4d1f6f7b4bc4d9"}, + {file = "rpds_py-0.24.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:8212ff58ac6dfde49946bea57474a386cca3f7706fc72c25b772b9ca4af6b79e"}, + {file = "rpds_py-0.24.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:528927e63a70b4d5f3f5ccc1fa988a35456eb5d15f804d276709c33fc2f19bda"}, + {file = "rpds_py-0.24.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a824d2c7a703ba6daaca848f9c3d5cb93af0505be505de70e7e66829affd676e"}, + {file = "rpds_py-0.24.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:44d51febb7a114293ffd56c6cf4736cb31cd68c0fddd6aa303ed09ea5a48e029"}, + {file = "rpds_py-0.24.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:3fab5f4a2c64a8fb64fc13b3d139848817a64d467dd6ed60dcdd6b479e7febc9"}, + {file = "rpds_py-0.24.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:9be4f99bee42ac107870c61dfdb294d912bf81c3c6d45538aad7aecab468b6b7"}, + {file = "rpds_py-0.24.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:564c96b6076a98215af52f55efa90d8419cc2ef45d99e314fddefe816bc24f91"}, + {file = "rpds_py-0.24.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:75a810b7664c17f24bf2ffd7f92416c00ec84b49bb68e6a0d93e542406336b56"}, + {file = "rpds_py-0.24.0-cp312-cp312-win32.whl", hash = "sha256:f6016bd950be4dcd047b7475fdf55fb1e1f59fc7403f387be0e8123e4a576d30"}, + {file = "rpds_py-0.24.0-cp312-cp312-win_amd64.whl", hash = "sha256:998c01b8e71cf051c28f5d6f1187abbdf5cf45fc0efce5da6c06447cba997034"}, + {file = "rpds_py-0.24.0-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:3d2d8e4508e15fc05b31285c4b00ddf2e0eb94259c2dc896771966a163122a0c"}, + {file = "rpds_py-0.24.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:0f00c16e089282ad68a3820fd0c831c35d3194b7cdc31d6e469511d9bffc535c"}, + {file = "rpds_py-0.24.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:951cc481c0c395c4a08639a469d53b7d4afa252529a085418b82a6b43c45c240"}, + {file = "rpds_py-0.24.0-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c9ca89938dff18828a328af41ffdf3902405a19f4131c88e22e776a8e228c5a8"}, + {file = "rpds_py-0.24.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ed0ef550042a8dbcd657dfb284a8ee00f0ba269d3f2286b0493b15a5694f9fe8"}, + {file = "rpds_py-0.24.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2b2356688e5d958c4d5cb964af865bea84db29971d3e563fb78e46e20fe1848b"}, + {file = 
"rpds_py-0.24.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:78884d155fd15d9f64f5d6124b486f3d3f7fd7cd71a78e9670a0f6f6ca06fb2d"}, + {file = "rpds_py-0.24.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6a4a535013aeeef13c5532f802708cecae8d66c282babb5cd916379b72110cf7"}, + {file = "rpds_py-0.24.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:84e0566f15cf4d769dade9b366b7b87c959be472c92dffb70462dd0844d7cbad"}, + {file = "rpds_py-0.24.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:823e74ab6fbaa028ec89615ff6acb409e90ff45580c45920d4dfdddb069f2120"}, + {file = "rpds_py-0.24.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:c61a2cb0085c8783906b2f8b1f16a7e65777823c7f4d0a6aaffe26dc0d358dd9"}, + {file = "rpds_py-0.24.0-cp313-cp313-win32.whl", hash = "sha256:60d9b630c8025b9458a9d114e3af579a2c54bd32df601c4581bd054e85258143"}, + {file = "rpds_py-0.24.0-cp313-cp313-win_amd64.whl", hash = "sha256:6eea559077d29486c68218178ea946263b87f1c41ae7f996b1f30a983c476a5a"}, + {file = "rpds_py-0.24.0-cp313-cp313t-macosx_10_12_x86_64.whl", hash = "sha256:d09dc82af2d3c17e7dd17120b202a79b578d79f2b5424bda209d9966efeed114"}, + {file = "rpds_py-0.24.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:5fc13b44de6419d1e7a7e592a4885b323fbc2f46e1f22151e3a8ed3b8b920405"}, + {file = "rpds_py-0.24.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c347a20d79cedc0a7bd51c4d4b7dbc613ca4e65a756b5c3e57ec84bd43505b47"}, + {file = "rpds_py-0.24.0-cp313-cp313t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:20f2712bd1cc26a3cc16c5a1bfee9ed1abc33d4cdf1aabd297fe0eb724df4272"}, + {file = "rpds_py-0.24.0-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:aad911555286884be1e427ef0dc0ba3929e6821cbeca2194b13dc415a462c7fd"}, + {file = "rpds_py-0.24.0-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0aeb3329c1721c43c58cae274d7d2ca85c1690d89485d9c63a006cb79a85771a"}, + {file = "rpds_py-0.24.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2a0f156e9509cee987283abd2296ec816225145a13ed0391df8f71bf1d789e2d"}, + {file = "rpds_py-0.24.0-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:aa6800adc8204ce898c8a424303969b7aa6a5e4ad2789c13f8648739830323b7"}, + {file = "rpds_py-0.24.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:a18fc371e900a21d7392517c6f60fe859e802547309e94313cd8181ad9db004d"}, + {file = "rpds_py-0.24.0-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:9168764133fd919f8dcca2ead66de0105f4ef5659cbb4fa044f7014bed9a1797"}, + {file = "rpds_py-0.24.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:5f6e3cec44ba05ee5cbdebe92d052f69b63ae792e7d05f1020ac5e964394080c"}, + {file = "rpds_py-0.24.0-cp313-cp313t-win32.whl", hash = "sha256:8ebc7e65ca4b111d928b669713865f021b7773350eeac4a31d3e70144297baba"}, + {file = "rpds_py-0.24.0-cp313-cp313t-win_amd64.whl", hash = "sha256:675269d407a257b8c00a6b58205b72eec8231656506c56fd429d924ca00bb350"}, + {file = "rpds_py-0.24.0-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:a36b452abbf29f68527cf52e181fced56685731c86b52e852053e38d8b60bc8d"}, + {file = "rpds_py-0.24.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:8b3b397eefecec8e8e39fa65c630ef70a24b09141a6f9fc17b3c3a50bed6b50e"}, + {file = "rpds_py-0.24.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cdabcd3beb2a6dca7027007473d8ef1c3b053347c76f685f5f060a00327b8b65"}, + {file = 
"rpds_py-0.24.0-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5db385bacd0c43f24be92b60c857cf760b7f10d8234f4bd4be67b5b20a7c0b6b"}, + {file = "rpds_py-0.24.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8097b3422d020ff1c44effc40ae58e67d93e60d540a65649d2cdaf9466030791"}, + {file = "rpds_py-0.24.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:493fe54318bed7d124ce272fc36adbf59d46729659b2c792e87c3b95649cdee9"}, + {file = "rpds_py-0.24.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8aa362811ccdc1f8dadcc916c6d47e554169ab79559319ae9fae7d7752d0d60c"}, + {file = "rpds_py-0.24.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d8f9a6e7fd5434817526815f09ea27f2746c4a51ee11bb3439065f5fc754db58"}, + {file = "rpds_py-0.24.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:8205ee14463248d3349131bb8099efe15cd3ce83b8ef3ace63c7e976998e7124"}, + {file = "rpds_py-0.24.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:921ae54f9ecba3b6325df425cf72c074cd469dea843fb5743a26ca7fb2ccb149"}, + {file = "rpds_py-0.24.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:32bab0a56eac685828e00cc2f5d1200c548f8bc11f2e44abf311d6b548ce2e45"}, + {file = "rpds_py-0.24.0-cp39-cp39-win32.whl", hash = "sha256:f5c0ed12926dec1dfe7d645333ea59cf93f4d07750986a586f511c0bc61fe103"}, + {file = "rpds_py-0.24.0-cp39-cp39-win_amd64.whl", hash = "sha256:afc6e35f344490faa8276b5f2f7cbf71f88bc2cda4328e00553bd451728c571f"}, + {file = "rpds_py-0.24.0-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:619ca56a5468f933d940e1bf431c6f4e13bef8e688698b067ae68eb4f9b30e3a"}, + {file = "rpds_py-0.24.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:4b28e5122829181de1898c2c97f81c0b3246d49f585f22743a1246420bb8d399"}, + {file = "rpds_py-0.24.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e8e5ab32cf9eb3647450bc74eb201b27c185d3857276162c101c0f8c6374e098"}, + {file = "rpds_py-0.24.0-pp310-pypy310_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:208b3a70a98cf3710e97cabdc308a51cd4f28aa6e7bb11de3d56cd8b74bab98d"}, + {file = "rpds_py-0.24.0-pp310-pypy310_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bbc4362e06f950c62cad3d4abf1191021b2ffaf0b31ac230fbf0526453eee75e"}, + {file = "rpds_py-0.24.0-pp310-pypy310_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ebea2821cdb5f9fef44933617be76185b80150632736f3d76e54829ab4a3b4d1"}, + {file = "rpds_py-0.24.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b9a4df06c35465ef4d81799999bba810c68d29972bf1c31db61bfdb81dd9d5bb"}, + {file = "rpds_py-0.24.0-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d3aa13bdf38630da298f2e0d77aca967b200b8cc1473ea05248f6c5e9c9bdb44"}, + {file = "rpds_py-0.24.0-pp310-pypy310_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:041f00419e1da7a03c46042453598479f45be3d787eb837af382bfc169c0db33"}, + {file = "rpds_py-0.24.0-pp310-pypy310_pp73-musllinux_1_2_i686.whl", hash = "sha256:d8754d872a5dfc3c5bf9c0e059e8107451364a30d9fd50f1f1a85c4fb9481164"}, + {file = "rpds_py-0.24.0-pp310-pypy310_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:896c41007931217a343eff197c34513c154267636c8056fb409eafd494c3dcdc"}, + {file = "rpds_py-0.24.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:92558d37d872e808944c3c96d0423b8604879a3d1c86fdad508d7ed91ea547d5"}, + {file = 
"rpds_py-0.24.0-pp311-pypy311_pp73-macosx_10_12_x86_64.whl", hash = "sha256:f9e0057a509e096e47c87f753136c9b10d7a91842d8042c2ee6866899a717c0d"}, + {file = "rpds_py-0.24.0-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:d6e109a454412ab82979c5b1b3aee0604eca4bbf9a02693bb9df027af2bfa91a"}, + {file = "rpds_py-0.24.0-pp311-pypy311_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fc1c892b1ec1f8cbd5da8de287577b455e388d9c328ad592eabbdcb6fc93bee5"}, + {file = "rpds_py-0.24.0-pp311-pypy311_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:9c39438c55983d48f4bb3487734d040e22dad200dab22c41e331cee145e7a50d"}, + {file = "rpds_py-0.24.0-pp311-pypy311_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9d7e8ce990ae17dda686f7e82fd41a055c668e13ddcf058e7fb5e9da20b57793"}, + {file = "rpds_py-0.24.0-pp311-pypy311_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9ea7f4174d2e4194289cb0c4e172d83e79a6404297ff95f2875cf9ac9bced8ba"}, + {file = "rpds_py-0.24.0-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bb2954155bb8f63bb19d56d80e5e5320b61d71084617ed89efedb861a684baea"}, + {file = "rpds_py-0.24.0-pp311-pypy311_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:04f2b712a2206e13800a8136b07aaedc23af3facab84918e7aa89e4be0260032"}, + {file = "rpds_py-0.24.0-pp311-pypy311_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:eda5c1e2a715a4cbbca2d6d304988460942551e4e5e3b7457b50943cd741626d"}, + {file = "rpds_py-0.24.0-pp311-pypy311_pp73-musllinux_1_2_i686.whl", hash = "sha256:9abc80fe8c1f87218db116016de575a7998ab1629078c90840e8d11ab423ee25"}, + {file = "rpds_py-0.24.0-pp311-pypy311_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:6a727fd083009bc83eb83d6950f0c32b3c94c8b80a9b667c87f4bd1274ca30ba"}, + {file = "rpds_py-0.24.0-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:e0f3ef95795efcd3b2ec3fe0a5bcfb5dadf5e3996ea2117427e524d4fbf309c6"}, + {file = "rpds_py-0.24.0-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:2c13777ecdbbba2077670285dd1fe50828c8742f6a4119dbef6f83ea13ad10fb"}, + {file = "rpds_py-0.24.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:79e8d804c2ccd618417e96720ad5cd076a86fa3f8cb310ea386a3e6229bae7d1"}, + {file = "rpds_py-0.24.0-pp39-pypy39_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:fd822f019ccccd75c832deb7aa040bb02d70a92eb15a2f16c7987b7ad4ee8d83"}, + {file = "rpds_py-0.24.0-pp39-pypy39_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0047638c3aa0dbcd0ab99ed1e549bbf0e142c9ecc173b6492868432d8989a046"}, + {file = "rpds_py-0.24.0-pp39-pypy39_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a5b66d1b201cc71bc3081bc2f1fc36b0c1f268b773e03bbc39066651b9e18391"}, + {file = "rpds_py-0.24.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dbcbb6db5582ea33ce46a5d20a5793134b5365110d84df4e30b9d37c6fd40ad3"}, + {file = "rpds_py-0.24.0-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:63981feca3f110ed132fd217bf7768ee8ed738a55549883628ee3da75bb9cb78"}, + {file = "rpds_py-0.24.0-pp39-pypy39_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:3a55fc10fdcbf1a4bd3c018eea422c52cf08700cf99c28b5cb10fe97ab77a0d3"}, + {file = "rpds_py-0.24.0-pp39-pypy39_pp73-musllinux_1_2_i686.whl", hash = "sha256:c30ff468163a48535ee7e9bf21bd14c7a81147c0e58a36c1078289a8ca7af0bd"}, + {file = "rpds_py-0.24.0-pp39-pypy39_pp73-musllinux_1_2_x86_64.whl", hash = 
"sha256:369d9c6d4c714e36d4a03957b4783217a3ccd1e222cdd67d464a3a479fc17796"}, + {file = "rpds_py-0.24.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:24795c099453e3721fda5d8ddd45f5dfcc8e5a547ce7b8e9da06fecc3832e26f"}, + {file = "rpds_py-0.24.0.tar.gz", hash = "sha256:772cc1b2cd963e7e17e6cc55fe0371fb9c704d63e44cacec7b9b7f523b78919e"}, ] [[package]] name = "safetensors" -version = "0.4.5" +version = "0.5.3" description = "" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ - {file = "safetensors-0.4.5-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:a63eaccd22243c67e4f2b1c3e258b257effc4acd78f3b9d397edc8cf8f1298a7"}, - {file = "safetensors-0.4.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:23fc9b4ec7b602915cbb4ec1a7c1ad96d2743c322f20ab709e2c35d1b66dad27"}, - {file = "safetensors-0.4.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6885016f34bef80ea1085b7e99b3c1f92cb1be78a49839203060f67b40aee761"}, - {file = "safetensors-0.4.5-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:133620f443450429322f238fda74d512c4008621227fccf2f8cf4a76206fea7c"}, - {file = "safetensors-0.4.5-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4fb3e0609ec12d2a77e882f07cced530b8262027f64b75d399f1504ffec0ba56"}, - {file = "safetensors-0.4.5-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d0f1dd769f064adc33831f5e97ad07babbd728427f98e3e1db6902e369122737"}, - {file = "safetensors-0.4.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c6d156bdb26732feada84f9388a9f135528c1ef5b05fae153da365ad4319c4c5"}, - {file = "safetensors-0.4.5-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:9e347d77e2c77eb7624400ccd09bed69d35c0332f417ce8c048d404a096c593b"}, - {file = "safetensors-0.4.5-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:9f556eea3aec1d3d955403159fe2123ddd68e880f83954ee9b4a3f2e15e716b6"}, - {file = "safetensors-0.4.5-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:9483f42be3b6bc8ff77dd67302de8ae411c4db39f7224dec66b0eb95822e4163"}, - {file = "safetensors-0.4.5-cp310-none-win32.whl", hash = "sha256:7389129c03fadd1ccc37fd1ebbc773f2b031483b04700923c3511d2a939252cc"}, - {file = "safetensors-0.4.5-cp310-none-win_amd64.whl", hash = "sha256:e98ef5524f8b6620c8cdef97220c0b6a5c1cef69852fcd2f174bb96c2bb316b1"}, - {file = "safetensors-0.4.5-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:21f848d7aebd5954f92538552d6d75f7c1b4500f51664078b5b49720d180e47c"}, - {file = "safetensors-0.4.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:bb07000b19d41e35eecef9a454f31a8b4718a185293f0d0b1c4b61d6e4487971"}, - {file = "safetensors-0.4.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:09dedf7c2fda934ee68143202acff6e9e8eb0ddeeb4cfc24182bef999efa9f42"}, - {file = "safetensors-0.4.5-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:59b77e4b7a708988d84f26de3ebead61ef1659c73dcbc9946c18f3b1786d2688"}, - {file = "safetensors-0.4.5-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5d3bc83e14d67adc2e9387e511097f254bd1b43c3020440e708858c684cbac68"}, - {file = "safetensors-0.4.5-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:39371fc551c1072976073ab258c3119395294cf49cdc1f8476794627de3130df"}, - {file = "safetensors-0.4.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:a6c19feda32b931cae0acd42748a670bdf56bee6476a046af20181ad3fee4090"}, - {file = "safetensors-0.4.5-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a659467495de201e2f282063808a41170448c78bada1e62707b07a27b05e6943"}, - {file = "safetensors-0.4.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:bad5e4b2476949bcd638a89f71b6916fa9a5cae5c1ae7eede337aca2100435c0"}, - {file = "safetensors-0.4.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:a3a315a6d0054bc6889a17f5668a73f94f7fe55121ff59e0a199e3519c08565f"}, - {file = "safetensors-0.4.5-cp311-none-win32.whl", hash = "sha256:a01e232e6d3d5cf8b1667bc3b657a77bdab73f0743c26c1d3c5dd7ce86bd3a92"}, - {file = "safetensors-0.4.5-cp311-none-win_amd64.whl", hash = "sha256:cbd39cae1ad3e3ef6f63a6f07296b080c951f24cec60188378e43d3713000c04"}, - {file = "safetensors-0.4.5-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:473300314e026bd1043cef391bb16a8689453363381561b8a3e443870937cc1e"}, - {file = "safetensors-0.4.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:801183a0f76dc647f51a2d9141ad341f9665602a7899a693207a82fb102cc53e"}, - {file = "safetensors-0.4.5-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1524b54246e422ad6fb6aea1ac71edeeb77666efa67230e1faf6999df9b2e27f"}, - {file = "safetensors-0.4.5-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:b3139098e3e8b2ad7afbca96d30ad29157b50c90861084e69fcb80dec7430461"}, - {file = "safetensors-0.4.5-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:65573dc35be9059770808e276b017256fa30058802c29e1038eb1c00028502ea"}, - {file = "safetensors-0.4.5-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fd33da8e9407559f8779c82a0448e2133737f922d71f884da27184549416bfed"}, - {file = "safetensors-0.4.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3685ce7ed036f916316b567152482b7e959dc754fcc4a8342333d222e05f407c"}, - {file = "safetensors-0.4.5-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:dde2bf390d25f67908278d6f5d59e46211ef98e44108727084d4637ee70ab4f1"}, - {file = "safetensors-0.4.5-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:7469d70d3de970b1698d47c11ebbf296a308702cbaae7fcb993944751cf985f4"}, - {file = "safetensors-0.4.5-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:3a6ba28118636a130ccbb968bc33d4684c48678695dba2590169d5ab03a45646"}, - {file = "safetensors-0.4.5-cp312-none-win32.whl", hash = "sha256:c859c7ed90b0047f58ee27751c8e56951452ed36a67afee1b0a87847d065eec6"}, - {file = "safetensors-0.4.5-cp312-none-win_amd64.whl", hash = "sha256:b5a8810ad6a6f933fff6c276eae92c1da217b39b4d8b1bc1c0b8af2d270dc532"}, - {file = "safetensors-0.4.5-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:25e5f8e2e92a74f05b4ca55686234c32aac19927903792b30ee6d7bd5653d54e"}, - {file = "safetensors-0.4.5-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:81efb124b58af39fcd684254c645e35692fea81c51627259cdf6d67ff4458916"}, - {file = "safetensors-0.4.5-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:585f1703a518b437f5103aa9cf70e9bd437cb78eea9c51024329e4fb8a3e3679"}, - {file = "safetensors-0.4.5-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:4b99fbf72e3faf0b2f5f16e5e3458b93b7d0a83984fe8d5364c60aa169f2da89"}, - {file = "safetensors-0.4.5-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b17b299ca9966ca983ecda1c0791a3f07f9ca6ab5ded8ef3d283fff45f6bcd5f"}, - {file 
= "safetensors-0.4.5-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:76ded72f69209c9780fdb23ea89e56d35c54ae6abcdec67ccb22af8e696e449a"}, - {file = "safetensors-0.4.5-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2783956926303dcfeb1de91a4d1204cd4089ab441e622e7caee0642281109db3"}, - {file = "safetensors-0.4.5-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d94581aab8c6b204def4d7320f07534d6ee34cd4855688004a4354e63b639a35"}, - {file = "safetensors-0.4.5-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:67e1e7cb8678bb1b37ac48ec0df04faf689e2f4e9e81e566b5c63d9f23748523"}, - {file = "safetensors-0.4.5-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:dbd280b07e6054ea68b0cb4b16ad9703e7d63cd6890f577cb98acc5354780142"}, - {file = "safetensors-0.4.5-cp37-cp37m-macosx_10_12_x86_64.whl", hash = "sha256:77d9b228da8374c7262046a36c1f656ba32a93df6cc51cd4453af932011e77f1"}, - {file = "safetensors-0.4.5-cp37-cp37m-macosx_11_0_arm64.whl", hash = "sha256:500cac01d50b301ab7bb192353317035011c5ceeef0fca652f9f43c000bb7f8d"}, - {file = "safetensors-0.4.5-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:75331c0c746f03158ded32465b7d0b0e24c5a22121743662a2393439c43a45cf"}, - {file = "safetensors-0.4.5-cp37-cp37m-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:670e95fe34e0d591d0529e5e59fd9d3d72bc77b1444fcaa14dccda4f36b5a38b"}, - {file = "safetensors-0.4.5-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:098923e2574ff237c517d6e840acada8e5b311cb1fa226019105ed82e9c3b62f"}, - {file = "safetensors-0.4.5-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:13ca0902d2648775089fa6a0c8fc9e6390c5f8ee576517d33f9261656f851e3f"}, - {file = "safetensors-0.4.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5f0032bedc869c56f8d26259fe39cd21c5199cd57f2228d817a0e23e8370af25"}, - {file = "safetensors-0.4.5-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f4b15f51b4f8f2a512341d9ce3475cacc19c5fdfc5db1f0e19449e75f95c7dc8"}, - {file = "safetensors-0.4.5-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:f6594d130d0ad933d885c6a7b75c5183cb0e8450f799b80a39eae2b8508955eb"}, - {file = "safetensors-0.4.5-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:60c828a27e852ded2c85fc0f87bf1ec20e464c5cd4d56ff0e0711855cc2e17f8"}, - {file = "safetensors-0.4.5-cp37-none-win32.whl", hash = "sha256:6d3de65718b86c3eeaa8b73a9c3d123f9307a96bbd7be9698e21e76a56443af5"}, - {file = "safetensors-0.4.5-cp37-none-win_amd64.whl", hash = "sha256:5a2d68a523a4cefd791156a4174189a4114cf0bf9c50ceb89f261600f3b2b81a"}, - {file = "safetensors-0.4.5-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:e7a97058f96340850da0601a3309f3d29d6191b0702b2da201e54c6e3e44ccf0"}, - {file = "safetensors-0.4.5-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:63bfd425e25f5c733f572e2246e08a1c38bd6f2e027d3f7c87e2e43f228d1345"}, - {file = "safetensors-0.4.5-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f3664ac565d0e809b0b929dae7ccd74e4d3273cd0c6d1220c6430035befb678e"}, - {file = "safetensors-0.4.5-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:313514b0b9b73ff4ddfb4edd71860696dbe3c1c9dc4d5cc13dbd74da283d2cbf"}, - {file = "safetensors-0.4.5-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:31fa33ee326f750a2f2134a6174773c281d9a266ccd000bd4686d8021f1f3dac"}, - {file = 
"safetensors-0.4.5-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:09566792588d77b68abe53754c9f1308fadd35c9f87be939e22c623eaacbed6b"}, - {file = "safetensors-0.4.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:309aaec9b66cbf07ad3a2e5cb8a03205663324fea024ba391594423d0f00d9fe"}, - {file = "safetensors-0.4.5-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:53946c5813b8f9e26103c5efff4a931cc45d874f45229edd68557ffb35ffb9f8"}, - {file = "safetensors-0.4.5-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:868f9df9e99ad1e7f38c52194063a982bc88fedc7d05096f4f8160403aaf4bd6"}, - {file = "safetensors-0.4.5-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:9cc9449bd0b0bc538bd5e268221f0c5590bc5c14c1934a6ae359d44410dc68c4"}, - {file = "safetensors-0.4.5-cp38-none-win32.whl", hash = "sha256:83c4f13a9e687335c3928f615cd63a37e3f8ef072a3f2a0599fa09f863fb06a2"}, - {file = "safetensors-0.4.5-cp38-none-win_amd64.whl", hash = "sha256:b98d40a2ffa560653f6274e15b27b3544e8e3713a44627ce268f419f35c49478"}, - {file = "safetensors-0.4.5-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:cf727bb1281d66699bef5683b04d98c894a2803442c490a8d45cd365abfbdeb2"}, - {file = "safetensors-0.4.5-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:96f1d038c827cdc552d97e71f522e1049fef0542be575421f7684756a748e457"}, - {file = "safetensors-0.4.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:139fbee92570ecea774e6344fee908907db79646d00b12c535f66bc78bd5ea2c"}, - {file = "safetensors-0.4.5-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c36302c1c69eebb383775a89645a32b9d266878fab619819ce660309d6176c9b"}, - {file = "safetensors-0.4.5-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d641f5b8149ea98deb5ffcf604d764aad1de38a8285f86771ce1abf8e74c4891"}, - {file = "safetensors-0.4.5-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b4db6a61d968de73722b858038c616a1bebd4a86abe2688e46ca0cc2d17558f2"}, - {file = "safetensors-0.4.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b75a616e02f21b6f1d5785b20cecbab5e2bd3f6358a90e8925b813d557666ec1"}, - {file = "safetensors-0.4.5-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:788ee7d04cc0e0e7f944c52ff05f52a4415b312f5efd2ee66389fb7685ee030c"}, - {file = "safetensors-0.4.5-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:87bc42bd04fd9ca31396d3ca0433db0be1411b6b53ac5a32b7845a85d01ffc2e"}, - {file = "safetensors-0.4.5-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:4037676c86365a721a8c9510323a51861d703b399b78a6b4486a54a65a975fca"}, - {file = "safetensors-0.4.5-cp39-none-win32.whl", hash = "sha256:1500418454529d0ed5c1564bda376c4ddff43f30fce9517d9bee7bcce5a8ef50"}, - {file = "safetensors-0.4.5-cp39-none-win_amd64.whl", hash = "sha256:9d1a94b9d793ed8fe35ab6d5cea28d540a46559bafc6aae98f30ee0867000cab"}, - {file = "safetensors-0.4.5-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:fdadf66b5a22ceb645d5435a0be7a0292ce59648ca1d46b352f13cff3ea80410"}, - {file = "safetensors-0.4.5-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:d42ffd4c2259f31832cb17ff866c111684c87bd930892a1ba53fed28370c918c"}, - {file = "safetensors-0.4.5-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dd8a1f6d2063a92cd04145c7fd9e31a1c7d85fbec20113a14b487563fdbc0597"}, - {file = "safetensors-0.4.5-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:951d2fcf1817f4fb0ef0b48f6696688a4e852a95922a042b3f96aaa67eedc920"}, - {file = "safetensors-0.4.5-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6ac85d9a8c1af0e3132371d9f2d134695a06a96993c2e2f0bbe25debb9e3f67a"}, - {file = "safetensors-0.4.5-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:e3cec4a29eb7fe8da0b1c7988bc3828183080439dd559f720414450de076fcab"}, - {file = "safetensors-0.4.5-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:21742b391b859e67b26c0b2ac37f52c9c0944a879a25ad2f9f9f3cd61e7fda8f"}, - {file = "safetensors-0.4.5-pp37-pypy37_pp73-macosx_10_12_x86_64.whl", hash = "sha256:c7db3006a4915151ce1913652e907cdede299b974641a83fbc092102ac41b644"}, - {file = "safetensors-0.4.5-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f68bf99ea970960a237f416ea394e266e0361895753df06e3e06e6ea7907d98b"}, - {file = "safetensors-0.4.5-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8158938cf3324172df024da511839d373c40fbfaa83e9abf467174b2910d7b4c"}, - {file = "safetensors-0.4.5-pp37-pypy37_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:540ce6c4bf6b58cb0fd93fa5f143bc0ee341c93bb4f9287ccd92cf898cc1b0dd"}, - {file = "safetensors-0.4.5-pp37-pypy37_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:bfeaa1a699c6b9ed514bd15e6a91e74738b71125a9292159e3d6b7f0a53d2cde"}, - {file = "safetensors-0.4.5-pp37-pypy37_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:01c8f00da537af711979e1b42a69a8ec9e1d7112f208e0e9b8a35d2c381085ef"}, - {file = "safetensors-0.4.5-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:a0dd565f83b30f2ca79b5d35748d0d99dd4b3454f80e03dfb41f0038e3bdf180"}, - {file = "safetensors-0.4.5-pp38-pypy38_pp73-macosx_11_0_arm64.whl", hash = "sha256:023b6e5facda76989f4cba95a861b7e656b87e225f61811065d5c501f78cdb3f"}, - {file = "safetensors-0.4.5-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9633b663393d5796f0b60249549371e392b75a0b955c07e9c6f8708a87fc841f"}, - {file = "safetensors-0.4.5-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:78dd8adfb48716233c45f676d6e48534d34b4bceb50162c13d1f0bdf6f78590a"}, - {file = "safetensors-0.4.5-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:8e8deb16c4321d61ae72533b8451ec4a9af8656d1c61ff81aa49f966406e4b68"}, - {file = "safetensors-0.4.5-pp38-pypy38_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:52452fa5999dc50c4decaf0c53aa28371f7f1e0fe5c2dd9129059fbe1e1599c7"}, - {file = "safetensors-0.4.5-pp38-pypy38_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:d5f23198821e227cfc52d50fa989813513db381255c6d100927b012f0cfec63d"}, - {file = "safetensors-0.4.5-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:f4beb84b6073b1247a773141a6331117e35d07134b3bb0383003f39971d414bb"}, - {file = "safetensors-0.4.5-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:68814d599d25ed2fdd045ed54d370d1d03cf35e02dce56de44c651f828fb9b7b"}, - {file = "safetensors-0.4.5-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f0b6453c54c57c1781292c46593f8a37254b8b99004c68d6c3ce229688931a22"}, - {file = "safetensors-0.4.5-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:adaa9c6dead67e2dd90d634f89131e43162012479d86e25618e821a03d1eb1dc"}, - {file = "safetensors-0.4.5-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:73e7d408e9012cd17511b382b43547850969c7979efc2bc353f317abaf23c84c"}, - {file 
= "safetensors-0.4.5-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:775409ce0fcc58b10773fdb4221ed1eb007de10fe7adbdf8f5e8a56096b6f0bc"}, - {file = "safetensors-0.4.5-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:834001bed193e4440c4a3950a31059523ee5090605c907c66808664c932b549c"}, - {file = "safetensors-0.4.5.tar.gz", hash = "sha256:d73de19682deabb02524b3d5d1f8b3aaba94c72f1bbfc7911b9b9d5d391c0310"}, + {file = "safetensors-0.5.3-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:bd20eb133db8ed15b40110b7c00c6df51655a2998132193de2f75f72d99c7073"}, + {file = "safetensors-0.5.3-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:21d01c14ff6c415c485616b8b0bf961c46b3b343ca59110d38d744e577f9cce7"}, + {file = "safetensors-0.5.3-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:11bce6164887cd491ca75c2326a113ba934be596e22b28b1742ce27b1d076467"}, + {file = "safetensors-0.5.3-cp38-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:4a243be3590bc3301c821da7a18d87224ef35cbd3e5f5727e4e0728b8172411e"}, + {file = "safetensors-0.5.3-cp38-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8bd84b12b1670a6f8e50f01e28156422a2bc07fb16fc4e98bded13039d688a0d"}, + {file = "safetensors-0.5.3-cp38-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:391ac8cab7c829452175f871fcaf414aa1e292b5448bd02620f675a7f3e7abb9"}, + {file = "safetensors-0.5.3-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cead1fa41fc54b1e61089fa57452e8834f798cb1dc7a09ba3524f1eb08e0317a"}, + {file = "safetensors-0.5.3-cp38-abi3-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:1077f3e94182d72618357b04b5ced540ceb71c8a813d3319f1aba448e68a770d"}, + {file = "safetensors-0.5.3-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:799021e78287bac619c7b3f3606730a22da4cda27759ddf55d37c8db7511c74b"}, + {file = "safetensors-0.5.3-cp38-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:df26da01aaac504334644e1b7642fa000bfec820e7cef83aeac4e355e03195ff"}, + {file = "safetensors-0.5.3-cp38-abi3-musllinux_1_2_i686.whl", hash = "sha256:32c3ef2d7af8b9f52ff685ed0bc43913cdcde135089ae322ee576de93eae5135"}, + {file = "safetensors-0.5.3-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:37f1521be045e56fc2b54c606d4455573e717b2d887c579ee1dbba5f868ece04"}, + {file = "safetensors-0.5.3-cp38-abi3-win32.whl", hash = "sha256:cfc0ec0846dcf6763b0ed3d1846ff36008c6e7290683b61616c4b040f6a54ace"}, + {file = "safetensors-0.5.3-cp38-abi3-win_amd64.whl", hash = "sha256:836cbbc320b47e80acd40e44c8682db0e8ad7123209f69b093def21ec7cafd11"}, + {file = "safetensors-0.5.3.tar.gz", hash = "sha256:b6b0d6ecacec39a4fdd99cc19f4576f5219ce858e6fd8dbe7609df0b8dc56965"}, ] [package.extras] @@ -3150,7 +2315,7 @@ jax = ["flax (>=0.6.3)", "jax (>=0.3.25)", "jaxlib (>=0.3.25)", "safetensors[num mlx = ["mlx (>=0.0.9)"] numpy = ["numpy (>=1.21.6)"] paddlepaddle = ["paddlepaddle (>=2.4.1)", "safetensors[numpy]"] -pinned-tf = ["safetensors[numpy]", "tensorflow (==2.11.0)"] +pinned-tf = ["safetensors[numpy]", "tensorflow (==2.18.0)"] quality = ["black (==22.3)", "click (==8.0.4)", "flake8 (>=3.8.3)", "isort (>=5.5.4)"] tensorflow = ["safetensors[numpy]", "tensorflow (>=2.11.0)"] testing = ["h5py (>=3.7.0)", "huggingface-hub (>=0.12.1)", "hypothesis (>=6.70.2)", "pytest (>=7.2.0)", "pytest-benchmark (>=4.0.0)", "safetensors[numpy]", "setuptools-rust (>=1.5.2)"] @@ -3158,37 +2323,42 @@ torch = ["safetensors[numpy]", "torch (>=1.10)"] [[package]] name = "scikit-learn" -version = "1.5.2" 
+version = "1.6.1" description = "A set of python modules for machine learning and data mining" optional = false python-versions = ">=3.9" +groups = ["main"] files = [ - {file = "scikit_learn-1.5.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:299406827fb9a4f862626d0fe6c122f5f87f8910b86fe5daa4c32dcd742139b6"}, - {file = "scikit_learn-1.5.2-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:2d4cad1119c77930b235579ad0dc25e65c917e756fe80cab96aa3b9428bd3fb0"}, - {file = "scikit_learn-1.5.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8c412ccc2ad9bf3755915e3908e677b367ebc8d010acbb3f182814524f2e5540"}, - {file = "scikit_learn-1.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a686885a4b3818d9e62904d91b57fa757fc2bed3e465c8b177be652f4dd37c8"}, - {file = "scikit_learn-1.5.2-cp310-cp310-win_amd64.whl", hash = "sha256:c15b1ca23d7c5f33cc2cb0a0d6aaacf893792271cddff0edbd6a40e8319bc113"}, - {file = "scikit_learn-1.5.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:03b6158efa3faaf1feea3faa884c840ebd61b6484167c711548fce208ea09445"}, - {file = "scikit_learn-1.5.2-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:1ff45e26928d3b4eb767a8f14a9a6efbf1cbff7c05d1fb0f95f211a89fd4f5de"}, - {file = "scikit_learn-1.5.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f763897fe92d0e903aa4847b0aec0e68cadfff77e8a0687cabd946c89d17e675"}, - {file = "scikit_learn-1.5.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f8b0ccd4a902836493e026c03256e8b206656f91fbcc4fde28c57a5b752561f1"}, - {file = "scikit_learn-1.5.2-cp311-cp311-win_amd64.whl", hash = "sha256:6c16d84a0d45e4894832b3c4d0bf73050939e21b99b01b6fd59cbb0cf39163b6"}, - {file = "scikit_learn-1.5.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:f932a02c3f4956dfb981391ab24bda1dbd90fe3d628e4b42caef3e041c67707a"}, - {file = "scikit_learn-1.5.2-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:3b923d119d65b7bd555c73be5423bf06c0105678ce7e1f558cb4b40b0a5502b1"}, - {file = "scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f60021ec1574e56632be2a36b946f8143bf4e5e6af4a06d85281adc22938e0dd"}, - {file = "scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:394397841449853c2290a32050382edaec3da89e35b3e03d6cc966aebc6a8ae6"}, - {file = "scikit_learn-1.5.2-cp312-cp312-win_amd64.whl", hash = "sha256:57cc1786cfd6bd118220a92ede80270132aa353647684efa385a74244a41e3b1"}, - {file = "scikit_learn-1.5.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e9a702e2de732bbb20d3bad29ebd77fc05a6b427dc49964300340e4c9328b3f5"}, - {file = "scikit_learn-1.5.2-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:b0768ad641981f5d3a198430a1d31c3e044ed2e8a6f22166b4d546a5116d7908"}, - {file = "scikit_learn-1.5.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:178ddd0a5cb0044464fc1bfc4cca5b1833bfc7bb022d70b05db8530da4bb3dd3"}, - {file = "scikit_learn-1.5.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f7284ade780084d94505632241bf78c44ab3b6f1e8ccab3d2af58e0e950f9c12"}, - {file = "scikit_learn-1.5.2-cp313-cp313-win_amd64.whl", hash = "sha256:b7b0f9a0b1040830d38c39b91b3a44e1b643f4b36e36567b80b7c6bd2202a27f"}, - {file = "scikit_learn-1.5.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:757c7d514ddb00ae249832fe87100d9c73c6ea91423802872d9e74970a0e40b9"}, - {file = "scikit_learn-1.5.2-cp39-cp39-macosx_12_0_arm64.whl", hash = 
"sha256:52788f48b5d8bca5c0736c175fa6bdaab2ef00a8f536cda698db61bd89c551c1"}, - {file = "scikit_learn-1.5.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:643964678f4b5fbdc95cbf8aec638acc7aa70f5f79ee2cdad1eec3df4ba6ead8"}, - {file = "scikit_learn-1.5.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ca64b3089a6d9b9363cd3546f8978229dcbb737aceb2c12144ee3f70f95684b7"}, - {file = "scikit_learn-1.5.2-cp39-cp39-win_amd64.whl", hash = "sha256:3bed4909ba187aca80580fe2ef370d9180dcf18e621a27c4cf2ef10d279a7efe"}, - {file = "scikit_learn-1.5.2.tar.gz", hash = "sha256:b4237ed7b3fdd0a4882792e68ef2545d5baa50aca3bb45aa7df468138ad8f94d"}, + {file = "scikit_learn-1.6.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d056391530ccd1e501056160e3c9673b4da4805eb67eb2bdf4e983e1f9c9204e"}, + {file = "scikit_learn-1.6.1-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:0c8d036eb937dbb568c6242fa598d551d88fb4399c0344d95c001980ec1c7d36"}, + {file = "scikit_learn-1.6.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8634c4bd21a2a813e0a7e3900464e6d593162a29dd35d25bdf0103b3fce60ed5"}, + {file = "scikit_learn-1.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:775da975a471c4f6f467725dff0ced5c7ac7bda5e9316b260225b48475279a1b"}, + {file = "scikit_learn-1.6.1-cp310-cp310-win_amd64.whl", hash = "sha256:8a600c31592bd7dab31e1c61b9bbd6dea1b3433e67d264d17ce1017dbdce8002"}, + {file = "scikit_learn-1.6.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:72abc587c75234935e97d09aa4913a82f7b03ee0b74111dcc2881cba3c5a7b33"}, + {file = "scikit_learn-1.6.1-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:b3b00cdc8f1317b5f33191df1386c0befd16625f49d979fe77a8d44cae82410d"}, + {file = "scikit_learn-1.6.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dc4765af3386811c3ca21638f63b9cf5ecf66261cc4815c1db3f1e7dc7b79db2"}, + {file = "scikit_learn-1.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:25fc636bdaf1cc2f4a124a116312d837148b5e10872147bdaf4887926b8c03d8"}, + {file = "scikit_learn-1.6.1-cp311-cp311-win_amd64.whl", hash = "sha256:fa909b1a36e000a03c382aade0bd2063fd5680ff8b8e501660c0f59f021a6415"}, + {file = "scikit_learn-1.6.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:926f207c804104677af4857b2c609940b743d04c4c35ce0ddc8ff4f053cddc1b"}, + {file = "scikit_learn-1.6.1-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:2c2cae262064e6a9b77eee1c8e768fc46aa0b8338c6a8297b9b6759720ec0ff2"}, + {file = "scikit_learn-1.6.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1061b7c028a8663fb9a1a1baf9317b64a257fcb036dae5c8752b2abef31d136f"}, + {file = "scikit_learn-1.6.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2e69fab4ebfc9c9b580a7a80111b43d214ab06250f8a7ef590a4edf72464dd86"}, + {file = "scikit_learn-1.6.1-cp312-cp312-win_amd64.whl", hash = "sha256:70b1d7e85b1c96383f872a519b3375f92f14731e279a7b4c6cfd650cf5dffc52"}, + {file = "scikit_learn-1.6.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:2ffa1e9e25b3d93990e74a4be2c2fc61ee5af85811562f1288d5d055880c4322"}, + {file = "scikit_learn-1.6.1-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:dc5cf3d68c5a20ad6d571584c0750ec641cc46aeef1c1507be51300e6003a7e1"}, + {file = "scikit_learn-1.6.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c06beb2e839ecc641366000ca84f3cf6fa9faa1777e29cf0c04be6e4d096a348"}, + {file = 
"scikit_learn-1.6.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e8ca8cb270fee8f1f76fa9bfd5c3507d60c6438bbee5687f81042e2bb98e5a97"}, + {file = "scikit_learn-1.6.1-cp313-cp313-win_amd64.whl", hash = "sha256:7a1c43c8ec9fde528d664d947dc4c0789be4077a3647f232869f41d9bf50e0fb"}, + {file = "scikit_learn-1.6.1-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:a17c1dea1d56dcda2fac315712f3651a1fea86565b64b48fa1bc090249cbf236"}, + {file = "scikit_learn-1.6.1-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:6a7aa5f9908f0f28f4edaa6963c0a6183f1911e63a69aa03782f0d924c830a35"}, + {file = "scikit_learn-1.6.1-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0650e730afb87402baa88afbf31c07b84c98272622aaba002559b614600ca691"}, + {file = "scikit_learn-1.6.1-cp313-cp313t-win_amd64.whl", hash = "sha256:3f59fe08dc03ea158605170eb52b22a105f238a5d512c4470ddeca71feae8e5f"}, + {file = "scikit_learn-1.6.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6849dd3234e87f55dce1db34c89a810b489ead832aaf4d4550b7ea85628be6c1"}, + {file = "scikit_learn-1.6.1-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:e7be3fa5d2eb9be7d77c3734ff1d599151bb523674be9b834e8da6abe132f44e"}, + {file = "scikit_learn-1.6.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:44a17798172df1d3c1065e8fcf9019183f06c87609b49a124ebdf57ae6cb0107"}, + {file = "scikit_learn-1.6.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b8b7a3b86e411e4bce21186e1c180d792f3d99223dcfa3b4f597ecc92fa1a422"}, + {file = "scikit_learn-1.6.1-cp39-cp39-win_amd64.whl", hash = "sha256:7a73d457070e3318e32bdb3aa79a8d990474f19035464dfd8bede2883ab5dc3b"}, + {file = "scikit_learn-1.6.1.tar.gz", hash = "sha256:b4fc2525eca2c69a59260f583c56a7557c6ccdf8deafdba6e060f94c1c59738e"}, ] [package.dependencies] @@ -3200,11 +2370,11 @@ threadpoolctl = ">=3.1.0" [package.extras] benchmark = ["matplotlib (>=3.3.4)", "memory_profiler (>=0.57.0)", "pandas (>=1.1.5)"] build = ["cython (>=3.0.10)", "meson-python (>=0.16.0)", "numpy (>=1.19.5)", "scipy (>=1.6.0)"] -docs = ["Pillow (>=7.1.2)", "matplotlib (>=3.3.4)", "memory_profiler (>=0.57.0)", "numpydoc (>=1.2.0)", "pandas (>=1.1.5)", "plotly (>=5.14.0)", "polars (>=0.20.30)", "pooch (>=1.6.0)", "pydata-sphinx-theme (>=0.15.3)", "scikit-image (>=0.17.2)", "seaborn (>=0.9.0)", "sphinx (>=7.3.7)", "sphinx-copybutton (>=0.5.2)", "sphinx-design (>=0.5.0)", "sphinx-design (>=0.6.0)", "sphinx-gallery (>=0.16.0)", "sphinx-prompt (>=1.4.0)", "sphinx-remove-toctrees (>=1.0.0.post1)", "sphinxcontrib-sass (>=0.3.4)", "sphinxext-opengraph (>=0.9.1)"] +docs = ["Pillow (>=7.1.2)", "matplotlib (>=3.3.4)", "memory_profiler (>=0.57.0)", "numpydoc (>=1.2.0)", "pandas (>=1.1.5)", "plotly (>=5.14.0)", "polars (>=0.20.30)", "pooch (>=1.6.0)", "pydata-sphinx-theme (>=0.15.3)", "scikit-image (>=0.17.2)", "seaborn (>=0.9.0)", "sphinx (>=7.3.7)", "sphinx-copybutton (>=0.5.2)", "sphinx-design (>=0.5.0)", "sphinx-design (>=0.6.0)", "sphinx-gallery (>=0.17.1)", "sphinx-prompt (>=1.4.0)", "sphinx-remove-toctrees (>=1.0.0.post1)", "sphinxcontrib-sass (>=0.3.4)", "sphinxext-opengraph (>=0.9.1)", "towncrier (>=24.8.0)"] examples = ["matplotlib (>=3.3.4)", "pandas (>=1.1.5)", "plotly (>=5.14.0)", "pooch (>=1.6.0)", "scikit-image (>=0.17.2)", "seaborn (>=0.9.0)"] install = ["joblib (>=1.2.0)", "numpy (>=1.19.5)", "scipy (>=1.6.0)", "threadpoolctl (>=3.1.0)"] maintenance = ["conda-lock (==2.5.6)"] -tests = ["black (>=24.3.0)", "matplotlib (>=3.3.4)", "mypy (>=1.9)", 
"numpydoc (>=1.2.0)", "pandas (>=1.1.5)", "polars (>=0.20.30)", "pooch (>=1.6.0)", "pyamg (>=4.0.0)", "pyarrow (>=12.0.0)", "pytest (>=7.1.2)", "pytest-cov (>=2.9.0)", "ruff (>=0.2.1)", "scikit-image (>=0.17.2)"] +tests = ["black (>=24.3.0)", "matplotlib (>=3.3.4)", "mypy (>=1.9)", "numpydoc (>=1.2.0)", "pandas (>=1.1.5)", "polars (>=0.20.30)", "pooch (>=1.6.0)", "pyamg (>=4.0.0)", "pyarrow (>=12.0.0)", "pytest (>=7.1.2)", "pytest-cov (>=2.9.0)", "ruff (>=0.5.1)", "scikit-image (>=0.17.2)"] [[package]] name = "scipy" @@ -3212,6 +2382,7 @@ version = "1.13.1" description = "Fundamental algorithms for scientific computing in Python" optional = false python-versions = ">=3.9" +groups = ["main"] files = [ {file = "scipy-1.13.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:20335853b85e9a49ff7572ab453794298bcf0354d8068c5f6775a0eabf350aca"}, {file = "scipy-1.13.1-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:d605e9c23906d1994f55ace80e0125c587f96c020037ea6aa98d01b4bd2e222f"}, @@ -3248,12 +2419,41 @@ dev = ["cython-lint (>=0.12.2)", "doit (>=0.36.0)", "mypy", "pycodestyle", "pyde doc = ["jupyterlite-pyodide-kernel", "jupyterlite-sphinx (>=0.12.0)", "jupytext", "matplotlib (>=3.5)", "myst-nb", "numpydoc", "pooch", "pydata-sphinx-theme (>=0.15.2)", "sphinx (>=5.0.0)", "sphinx-design (>=0.4.0)"] test = ["array-api-strict", "asv", "gmpy2", "hypothesis (>=6.30)", "mpmath", "pooch", "pytest", "pytest-cov", "pytest-timeout", "pytest-xdist", "scikit-umfpack", "threadpoolctl"] +[[package]] +name = "sentence-transformers" +version = "3.3.1" +description = "State-of-the-Art Text Embeddings" +optional = false +python-versions = ">=3.9" +groups = ["main"] +files = [ + {file = "sentence_transformers-3.3.1-py3-none-any.whl", hash = "sha256:abffcc79dab37b7d18d21a26d5914223dd42239cfe18cb5e111c66c54b658ae7"}, + {file = "sentence_transformers-3.3.1.tar.gz", hash = "sha256:9635dbfb11c6b01d036b9cfcee29f7716ab64cf2407ad9f403a2e607da2ac48b"}, +] + +[package.dependencies] +huggingface-hub = ">=0.20.0" +Pillow = "*" +scikit-learn = "*" +scipy = "*" +torch = ">=1.11.0" +tqdm = "*" +transformers = ">=4.41.0,<5.0.0" + +[package.extras] +dev = ["accelerate (>=0.20.3)", "datasets", "peft", "pre-commit", "pytest", "pytest-cov"] +onnx = ["optimum[onnxruntime] (>=1.23.1)"] +onnx-gpu = ["optimum[onnxruntime-gpu] (>=1.23.1)"] +openvino = ["optimum-intel[openvino] (>=1.20.0)"] +train = ["accelerate (>=0.20.3)", "datasets"] + [[package]] name = "sentencepiece" version = "0.2.0" description = "SentencePiece python wrapper" optional = false python-versions = "*" +groups = ["main"] files = [ {file = "sentencepiece-0.2.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:188779e1298a1c8b8253c7d3ad729cb0a9891e5cef5e5d07ce4592c54869e227"}, {file = "sentencepiece-0.2.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:bed9cf85b296fa2b76fc2547b9cbb691a523864cebaee86304c43a7b4cb1b452"}, @@ -3312,44 +2512,48 @@ files = [ [[package]] name = "setuptools" -version = "75.2.0" +version = "78.1.0" description = "Easily download, build, install, upgrade, and uninstall Python packages" optional = false -python-versions = ">=3.8" +python-versions = ">=3.9" +groups = ["main", "dev"] files = [ - {file = "setuptools-75.2.0-py3-none-any.whl", hash = "sha256:a7fcb66f68b4d9e8e66b42f9876150a3371558f98fa32222ffaa5bced76406f8"}, - {file = "setuptools-75.2.0.tar.gz", hash = "sha256:753bb6ebf1f465a1912e19ed1d41f403a79173a9acf66a42e7e6aec45c3c16ec"}, + {file = "setuptools-78.1.0-py3-none-any.whl", hash = 
"sha256:3e386e96793c8702ae83d17b853fb93d3e09ef82ec62722e61da5cd22376dcd8"}, + {file = "setuptools-78.1.0.tar.gz", hash = "sha256:18fd474d4a82a5f83dac888df697af65afa82dec7323d09c3e37d1f14288da54"}, ] +markers = {main = "python_version >= \"3.12\""} [package.extras] -check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1)", "ruff (>=0.5.2)"] -core = ["importlib-metadata (>=6)", "importlib-resources (>=5.10.2)", "jaraco.collections", "jaraco.functools", "jaraco.text (>=3.7)", "more-itertools", "more-itertools (>=8.8)", "packaging", "packaging (>=24)", "platformdirs (>=2.6.2)", "tomli (>=2.0.1)", "wheel (>=0.43.0)"] +check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1)", "ruff (>=0.8.0)"] +core = ["importlib_metadata (>=6)", "jaraco.functools (>=4)", "jaraco.text (>=3.7)", "more_itertools", "more_itertools (>=8.8)", "packaging (>=24.2)", "platformdirs (>=4.2.2)", "tomli (>=2.0.1)", "wheel (>=0.43.0)"] cover = ["pytest-cov"] doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "pyproject-hooks (!=1.1)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier", "towncrier (<24.7)"] enabler = ["pytest-enabler (>=2.2)"] -test = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "jaraco.test", "packaging (>=23.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.*)", "pytest-home (>=0.5)", "pytest-perf", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel (>=0.44.0)"] -type = ["importlib-metadata (>=7.0.2)", "jaraco.develop (>=7.21)", "mypy (==1.11.*)", "pytest-mypy"] +test = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.7.2)", "jaraco.test (>=5.5)", "packaging (>=24.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.*)", "pytest-home (>=0.5)", "pytest-perf", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel (>=0.44.0)"] +type = ["importlib_metadata (>=7.0.2)", "jaraco.develop (>=7.21)", "mypy (==1.14.*)", "pytest-mypy"] [[package]] -name = "six" -version = "1.16.0" -description = "Python 2 and 3 compatibility utilities" -optional = true -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" +name = "shellingham" +version = "1.5.4" +description = "Tool to Detect Surrounding Shell" +optional = false +python-versions = ">=3.7" +groups = ["main"] files = [ - {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, - {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, + {file = "shellingham-1.5.4-py2.py3-none-any.whl", hash = "sha256:7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686"}, + {file = "shellingham-1.5.4.tar.gz", hash = "sha256:8dbca0739d487e5bd35ab3ca4b36e11c4078f3a234bfce294b0a0291363404de"}, ] [[package]] name = "sympy" -version = "1.13.3" +version = "1.13.1" description = "Computer algebra system (CAS) in Python" -optional = true +optional = false python-versions = ">=3.8" +groups = ["main"] files = [ - {file = "sympy-1.13.3-py3-none-any.whl", hash = 
"sha256:54612cf55a62755ee71824ce692986f23c88ffa77207b30c1368eda4a7060f73"}, - {file = "sympy-1.13.3.tar.gz", hash = "sha256:b27fd2c6530e0ab39e275fc9b683895367e51d5da91baa8d3d64db2565fec4d9"}, + {file = "sympy-1.13.1-py3-none-any.whl", hash = "sha256:db36cdc64bf61b9b24578b6f7bab1ecdd2452cf008f34faa33776680c26d66f8"}, + {file = "sympy-1.13.1.tar.gz", hash = "sha256:9cebf7e04ff162015ce31c9c6c9144daa34a93bd082f54fd8f12deca4f47515f"}, ] [package.dependencies] @@ -3359,123 +2563,40 @@ mpmath = ">=1.1.0,<1.4" dev = ["hypothesis (>=6.70.0)", "pytest (>=7.1.0)"] [[package]] -name = "texttable" -version = "1.7.0" -description = "module to create simple ASCII tables" -optional = true -python-versions = "*" +name = "threadpoolctl" +version = "3.6.0" +description = "threadpoolctl" +optional = false +python-versions = ">=3.9" +groups = ["main"] files = [ - {file = "texttable-1.7.0-py2.py3-none-any.whl", hash = "sha256:72227d592c82b3d7f672731ae73e4d1f88cd8e2ef5b075a7a7f01a23a3743917"}, - {file = "texttable-1.7.0.tar.gz", hash = "sha256:2d2068fb55115807d3ac77a4ca68fa48803e84ebb0ee2340f858107a36522638"}, + {file = "threadpoolctl-3.6.0-py3-none-any.whl", hash = "sha256:43a0b8fd5a2928500110039e43a5eed8480b918967083ea48dc3ab9f13c4a7fb"}, + {file = "threadpoolctl-3.6.0.tar.gz", hash = "sha256:8ab8b4aa3491d812b623328249fab5302a68d2d71745c8a4c719a2fcaba9f44e"}, ] [[package]] name = "tokenizers" -version = "0.20.1" +version = "0.21.1" description = "" optional = false -python-versions = ">=3.7" +python-versions = ">=3.9" +groups = ["main"] files = [ - {file = "tokenizers-0.20.1-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:439261da7c0a5c88bda97acb284d49fbdaf67e9d3b623c0bfd107512d22787a9"}, - {file = "tokenizers-0.20.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:03dae629d99068b1ea5416d50de0fea13008f04129cc79af77a2a6392792d93c"}, - {file = "tokenizers-0.20.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b61f561f329ffe4b28367798b89d60c4abf3f815d37413b6352bc6412a359867"}, - {file = "tokenizers-0.20.1-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ec870fce1ee5248a10be69f7a8408a234d6f2109f8ea827b4f7ecdbf08c9fd15"}, - {file = "tokenizers-0.20.1-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d388d1ea8b7447da784e32e3b86a75cce55887e3b22b31c19d0b186b1c677800"}, - {file = "tokenizers-0.20.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:299c85c1d21135bc01542237979bf25c32efa0d66595dd0069ae259b97fb2dbe"}, - {file = "tokenizers-0.20.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e96f6c14c9752bb82145636b614d5a78e9cde95edfbe0a85dad0dd5ddd6ec95c"}, - {file = "tokenizers-0.20.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fc9e95ad49c932b80abfbfeaf63b155761e695ad9f8a58c52a47d962d76e310f"}, - {file = "tokenizers-0.20.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:f22dee205329a636148c325921c73cf3e412e87d31f4d9c3153b302a0200057b"}, - {file = "tokenizers-0.20.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a2ffd9a8895575ac636d44500c66dffaef133823b6b25067604fa73bbc5ec09d"}, - {file = "tokenizers-0.20.1-cp310-none-win32.whl", hash = "sha256:2847843c53f445e0f19ea842a4e48b89dd0db4e62ba6e1e47a2749d6ec11f50d"}, - {file = "tokenizers-0.20.1-cp310-none-win_amd64.whl", hash = "sha256:f9aa93eacd865f2798b9e62f7ce4533cfff4f5fbd50c02926a78e81c74e432cd"}, - {file = "tokenizers-0.20.1-cp311-cp311-macosx_10_12_x86_64.whl", hash = 
"sha256:4a717dcb08f2dabbf27ae4b6b20cbbb2ad7ed78ce05a829fae100ff4b3c7ff15"}, - {file = "tokenizers-0.20.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:3f84dad1ff1863c648d80628b1b55353d16303431283e4efbb6ab1af56a75832"}, - {file = "tokenizers-0.20.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:929c8f3afa16a5130a81ab5079c589226273ec618949cce79b46d96e59a84f61"}, - {file = "tokenizers-0.20.1-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d10766473954397e2d370f215ebed1cc46dcf6fd3906a2a116aa1d6219bfedc3"}, - {file = "tokenizers-0.20.1-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9300fac73ddc7e4b0330acbdda4efaabf74929a4a61e119a32a181f534a11b47"}, - {file = "tokenizers-0.20.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0ecaf7b0e39caeb1aa6dd6e0975c405716c82c1312b55ac4f716ef563a906969"}, - {file = "tokenizers-0.20.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5170be9ec942f3d1d317817ced8d749b3e1202670865e4fd465e35d8c259de83"}, - {file = "tokenizers-0.20.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ef3f1ae08fa9aea5891cbd69df29913e11d3841798e0bfb1ff78b78e4e7ea0a4"}, - {file = "tokenizers-0.20.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:ee86d4095d3542d73579e953c2e5e07d9321af2ffea6ecc097d16d538a2dea16"}, - {file = "tokenizers-0.20.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:86dcd08da163912e17b27bbaba5efdc71b4fbffb841530fdb74c5707f3c49216"}, - {file = "tokenizers-0.20.1-cp311-none-win32.whl", hash = "sha256:9af2dc4ee97d037bc6b05fa4429ddc87532c706316c5e11ce2f0596dfcfa77af"}, - {file = "tokenizers-0.20.1-cp311-none-win_amd64.whl", hash = "sha256:899152a78b095559c287b4c6d0099469573bb2055347bb8154db106651296f39"}, - {file = "tokenizers-0.20.1-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:407ab666b38e02228fa785e81f7cf79ef929f104bcccf68a64525a54a93ceac9"}, - {file = "tokenizers-0.20.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2f13a2d16032ebc8bd812eb8099b035ac65887d8f0c207261472803b9633cf3e"}, - {file = "tokenizers-0.20.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e98eee4dca22849fbb56a80acaa899eec5b72055d79637dd6aa15d5e4b8628c9"}, - {file = "tokenizers-0.20.1-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:47c1bcdd61e61136087459cb9e0b069ff23b5568b008265e5cbc927eae3387ce"}, - {file = "tokenizers-0.20.1-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:128c1110e950534426e2274837fc06b118ab5f2fa61c3436e60e0aada0ccfd67"}, - {file = "tokenizers-0.20.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e2e2d47a819d2954f2c1cd0ad51bb58ffac6f53a872d5d82d65d79bf76b9896d"}, - {file = "tokenizers-0.20.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:bdd67a0e3503a9a7cf8bc5a4a49cdde5fa5bada09a51e4c7e1c73900297539bd"}, - {file = "tokenizers-0.20.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:689b93d2e26d04da337ac407acec8b5d081d8d135e3e5066a88edd5bdb5aff89"}, - {file = "tokenizers-0.20.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:0c6a796ddcd9a19ad13cf146997cd5895a421fe6aec8fd970d69f9117bddb45c"}, - {file = "tokenizers-0.20.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:3ea919687aa7001a8ff1ba36ac64f165c4e89035f57998fa6cedcfd877be619d"}, - {file = "tokenizers-0.20.1-cp312-none-win32.whl", hash = 
"sha256:6d3ac5c1f48358ffe20086bf065e843c0d0a9fce0d7f0f45d5f2f9fba3609ca5"}, - {file = "tokenizers-0.20.1-cp312-none-win_amd64.whl", hash = "sha256:b0874481aea54a178f2bccc45aa2d0c99cd3f79143a0948af6a9a21dcc49173b"}, - {file = "tokenizers-0.20.1-cp37-cp37m-macosx_10_12_x86_64.whl", hash = "sha256:96af92e833bd44760fb17f23f402e07a66339c1dcbe17d79a9b55bb0cc4f038e"}, - {file = "tokenizers-0.20.1-cp37-cp37m-macosx_11_0_arm64.whl", hash = "sha256:65f34e5b731a262dfa562820818533c38ce32a45864437f3d9c82f26c139ca7f"}, - {file = "tokenizers-0.20.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:17f98fccb5c12ab1ce1f471731a9cd86df5d4bd2cf2880c5a66b229802d96145"}, - {file = "tokenizers-0.20.1-cp37-cp37m-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:b8c0fc3542cf9370bf92c932eb71bdeb33d2d4aeeb4126d9fd567b60bd04cb30"}, - {file = "tokenizers-0.20.1-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4b39356df4575d37f9b187bb623aab5abb7b62c8cb702867a1768002f814800c"}, - {file = "tokenizers-0.20.1-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bfdad27b0e50544f6b838895a373db6114b85112ba5c0cefadffa78d6daae563"}, - {file = "tokenizers-0.20.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:094663dd0e85ee2e573126918747bdb40044a848fde388efb5b09d57bc74c680"}, - {file = "tokenizers-0.20.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:14e4cf033a2aa207d7ac790e91adca598b679999710a632c4a494aab0fc3a1b2"}, - {file = "tokenizers-0.20.1-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:9310951c92c9fb91660de0c19a923c432f110dbfad1a2d429fbc44fa956bf64f"}, - {file = "tokenizers-0.20.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:05e41e302c315bd2ed86c02e917bf03a6cf7d2f652c9cee1a0eb0d0f1ca0d32c"}, - {file = "tokenizers-0.20.1-cp37-none-win32.whl", hash = "sha256:212231ab7dfcdc879baf4892ca87c726259fa7c887e1688e3f3cead384d8c305"}, - {file = "tokenizers-0.20.1-cp37-none-win_amd64.whl", hash = "sha256:896195eb9dfdc85c8c052e29947169c1fcbe75a254c4b5792cdbd451587bce85"}, - {file = "tokenizers-0.20.1-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:741fb22788482d09d68e73ece1495cfc6d9b29a06c37b3df90564a9cfa688e6d"}, - {file = "tokenizers-0.20.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:10be14ebd8082086a342d969e17fc2d6edc856c59dbdbddd25f158fa40eaf043"}, - {file = "tokenizers-0.20.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:514cf279b22fa1ae0bc08e143458c74ad3b56cd078b319464959685a35c53d5e"}, - {file = "tokenizers-0.20.1-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:a647c5b7cb896d6430cf3e01b4e9a2d77f719c84cefcef825d404830c2071da2"}, - {file = "tokenizers-0.20.1-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7cdf379219e1e1dd432091058dab325a2e6235ebb23e0aec8d0508567c90cd01"}, - {file = "tokenizers-0.20.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1ba72260449e16c4c2f6f3252823b059fbf2d31b32617e582003f2b18b415c39"}, - {file = "tokenizers-0.20.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:910b96ed87316e4277b23c7bcaf667ce849c7cc379a453fa179e7e09290eeb25"}, - {file = "tokenizers-0.20.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e53975a6694428a0586534cc1354b2408d4e010a3103117f617cbb550299797c"}, - {file = "tokenizers-0.20.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = 
"sha256:07c4b7be58da142b0730cc4e5fd66bb7bf6f57f4986ddda73833cd39efef8a01"}, - {file = "tokenizers-0.20.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:b605c540753e62199bf15cf69c333e934077ef2350262af2ccada46026f83d1c"}, - {file = "tokenizers-0.20.1-cp38-none-win32.whl", hash = "sha256:88b3bc76ab4db1ab95ead623d49c95205411e26302cf9f74203e762ac7e85685"}, - {file = "tokenizers-0.20.1-cp38-none-win_amd64.whl", hash = "sha256:d412a74cf5b3f68a90c615611a5aa4478bb303d1c65961d22db45001df68afcb"}, - {file = "tokenizers-0.20.1-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:a25dcb2f41a0a6aac31999e6c96a75e9152fa0127af8ece46c2f784f23b8197a"}, - {file = "tokenizers-0.20.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a12c3cebb8c92e9c35a23ab10d3852aee522f385c28d0b4fe48c0b7527d59762"}, - {file = "tokenizers-0.20.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:02e18da58cf115b7c40de973609c35bde95856012ba42a41ee919c77935af251"}, - {file = "tokenizers-0.20.1-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f326a1ac51ae909b9760e34671c26cd0dfe15662f447302a9d5bb2d872bab8ab"}, - {file = "tokenizers-0.20.1-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0b4872647ea6f25224e2833b044b0b19084e39400e8ead3cfe751238b0802140"}, - {file = "tokenizers-0.20.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ce6238a3311bb8e4c15b12600927d35c267b92a52c881ef5717a900ca14793f7"}, - {file = "tokenizers-0.20.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:57b7a8880b208866508b06ce365dc631e7a2472a3faa24daa430d046fb56c885"}, - {file = "tokenizers-0.20.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a908c69c2897a68f412aa05ba38bfa87a02980df70f5a72fa8490479308b1f2d"}, - {file = "tokenizers-0.20.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:da1001aa46f4490099c82e2facc4fbc06a6a32bf7de3918ba798010954b775e0"}, - {file = "tokenizers-0.20.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:42c097390e2f0ed0a5c5d569e6669dd4e9fff7b31c6a5ce6e9c66a61687197de"}, - {file = "tokenizers-0.20.1-cp39-none-win32.whl", hash = "sha256:3d4d218573a3d8b121a1f8c801029d70444ffb6d8f129d4cca1c7b672ee4a24c"}, - {file = "tokenizers-0.20.1-cp39-none-win_amd64.whl", hash = "sha256:37d1e6f616c84fceefa7c6484a01df05caf1e207669121c66213cb5b2911d653"}, - {file = "tokenizers-0.20.1-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:48689da7a395df41114f516208d6550e3e905e1239cc5ad386686d9358e9cef0"}, - {file = "tokenizers-0.20.1-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:712f90ea33f9bd2586b4a90d697c26d56d0a22fd3c91104c5858c4b5b6489a79"}, - {file = "tokenizers-0.20.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:359eceb6a620c965988fc559cebc0a98db26713758ec4df43fb76d41486a8ed5"}, - {file = "tokenizers-0.20.1-pp310-pypy310_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0d3caf244ce89d24c87545aafc3448be15870096e796c703a0d68547187192e1"}, - {file = "tokenizers-0.20.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:03b03cf8b9a32254b1bf8a305fb95c6daf1baae0c1f93b27f2b08c9759f41dee"}, - {file = "tokenizers-0.20.1-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:218e5a3561561ea0f0ef1559c6d95b825308dbec23fb55b70b92589e7ff2e1e8"}, - {file = "tokenizers-0.20.1-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:f40df5e0294a95131cc5f0e0eb91fe86d88837abfbee46b9b3610b09860195a7"}, - 
{file = "tokenizers-0.20.1-pp37-pypy37_pp73-macosx_10_12_x86_64.whl", hash = "sha256:08aaa0d72bb65058e8c4b0455f61b840b156c557e2aca57627056624c3a93976"}, - {file = "tokenizers-0.20.1-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:998700177b45f70afeb206ad22c08d9e5f3a80639dae1032bf41e8cbc4dada4b"}, - {file = "tokenizers-0.20.1-pp37-pypy37_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:62f7fbd3c2c38b179556d879edae442b45f68312019c3a6013e56c3947a4e648"}, - {file = "tokenizers-0.20.1-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:31e87fca4f6bbf5cc67481b562147fe932f73d5602734de7dd18a8f2eee9c6dd"}, - {file = "tokenizers-0.20.1-pp37-pypy37_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:956f21d359ae29dd51ca5726d2c9a44ffafa041c623f5aa33749da87cfa809b9"}, - {file = "tokenizers-0.20.1-pp37-pypy37_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:1fbbaf17a393c78d8aedb6a334097c91cb4119a9ced4764ab8cfdc8d254dc9f9"}, - {file = "tokenizers-0.20.1-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:ebe63e31f9c1a970c53866d814e35ec2ec26fda03097c486f82f3891cee60830"}, - {file = "tokenizers-0.20.1-pp38-pypy38_pp73-macosx_11_0_arm64.whl", hash = "sha256:81970b80b8ac126910295f8aab2d7ef962009ea39e0d86d304769493f69aaa1e"}, - {file = "tokenizers-0.20.1-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:130e35e76f9337ed6c31be386e75d4925ea807055acf18ca1a9b0eec03d8fe23"}, - {file = "tokenizers-0.20.1-pp38-pypy38_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cd28a8614f5c82a54ab2463554e84ad79526c5184cf4573bbac2efbbbcead457"}, - {file = "tokenizers-0.20.1-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9041ee665d0fa7f5c4ccf0f81f5e6b7087f797f85b143c094126fc2611fec9d0"}, - {file = "tokenizers-0.20.1-pp38-pypy38_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:62eb9daea2a2c06bcd8113a5824af8ef8ee7405d3a71123ba4d52c79bb3d9f1a"}, - {file = "tokenizers-0.20.1-pp38-pypy38_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:f861889707b54a9ab1204030b65fd6c22bdd4a95205deec7994dc22a8baa2ea4"}, - {file = "tokenizers-0.20.1-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:89d5c337d74ea6e5e7dc8af124cf177be843bbb9ca6e58c01f75ea103c12c8a9"}, - {file = "tokenizers-0.20.1-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:0b7f515c83397e73292accdbbbedc62264e070bae9682f06061e2ddce67cacaf"}, - {file = "tokenizers-0.20.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3e0305fc1ec6b1e5052d30d9c1d5c807081a7bd0cae46a33d03117082e91908c"}, - {file = "tokenizers-0.20.1-pp39-pypy39_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5dc611e6ac0fa00a41de19c3bf6391a05ea201d2d22b757d63f5491ec0e67faa"}, - {file = "tokenizers-0.20.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c5ffe0d7f7bfcfa3b2585776ecf11da2e01c317027c8573c78ebcb8985279e23"}, - {file = "tokenizers-0.20.1-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:e7edb8ec12c100d5458d15b1e47c0eb30ad606a05641f19af7563bc3d1608c14"}, - {file = "tokenizers-0.20.1-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:de291633fb9303555793cc544d4a86e858da529b7d0b752bcaf721ae1d74b2c9"}, - {file = "tokenizers-0.20.1.tar.gz", hash = "sha256:84edcc7cdeeee45ceedb65d518fffb77aec69311c9c8e30f77ad84da3025f002"}, + {file = "tokenizers-0.21.1-cp39-abi3-macosx_10_12_x86_64.whl", hash = 
"sha256:e78e413e9e668ad790a29456e677d9d3aa50a9ad311a40905d6861ba7692cf41"}, + {file = "tokenizers-0.21.1-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:cd51cd0a91ecc801633829fcd1fda9cf8682ed3477c6243b9a095539de4aecf3"}, + {file = "tokenizers-0.21.1-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:28da6b72d4fb14ee200a1bd386ff74ade8992d7f725f2bde2c495a9a98cf4d9f"}, + {file = "tokenizers-0.21.1-cp39-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:34d8cfde551c9916cb92014e040806122295a6800914bab5865deb85623931cf"}, + {file = "tokenizers-0.21.1-cp39-abi3-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:aaa852d23e125b73d283c98f007e06d4595732104b65402f46e8ef24b588d9f8"}, + {file = "tokenizers-0.21.1-cp39-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a21a15d5c8e603331b8a59548bbe113564136dc0f5ad8306dd5033459a226da0"}, + {file = "tokenizers-0.21.1-cp39-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2fdbd4c067c60a0ac7eca14b6bd18a5bebace54eb757c706b47ea93204f7a37c"}, + {file = "tokenizers-0.21.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2dd9a0061e403546f7377df940e866c3e678d7d4e9643d0461ea442b4f89e61a"}, + {file = "tokenizers-0.21.1-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:db9484aeb2e200c43b915a1a0150ea885e35f357a5a8fabf7373af333dcc8dbf"}, + {file = "tokenizers-0.21.1-cp39-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:ed248ab5279e601a30a4d67bdb897ecbe955a50f1e7bb62bd99f07dd11c2f5b6"}, + {file = "tokenizers-0.21.1-cp39-abi3-musllinux_1_2_i686.whl", hash = "sha256:9ac78b12e541d4ce67b4dfd970e44c060a2147b9b2a21f509566d556a509c67d"}, + {file = "tokenizers-0.21.1-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:e5a69c1a4496b81a5ee5d2c1f3f7fbdf95e90a0196101b0ee89ed9956b8a168f"}, + {file = "tokenizers-0.21.1-cp39-abi3-win32.whl", hash = "sha256:1039a3a5734944e09de1d48761ade94e00d0fa760c0e0551151d4dd851ba63e3"}, + {file = "tokenizers-0.21.1-cp39-abi3-win_amd64.whl", hash = "sha256:0f0dcbcc9f6e13e675a66d7a5f2f225a736745ce484c1a4e07476a89ccdad382"}, + {file = "tokenizers-0.21.1.tar.gz", hash = "sha256:a1bb04dc5b448985f86ecd4b05407f5a8d97cb2c0532199b2a302a604a0165ab"}, ] [package.dependencies] @@ -3488,71 +2609,162 @@ testing = ["black (==22.3)", "datasets", "numpy", "pytest", "requests", "ruff"] [[package]] name = "tomli" -version = "2.0.2" +version = "2.2.1" description = "A lil' TOML parser" optional = false python-versions = ">=3.8" +groups = ["dev"] +markers = "python_version < \"3.11\"" files = [ - {file = "tomli-2.0.2-py3-none-any.whl", hash = "sha256:2ebe24485c53d303f690b0ec092806a085f07af5a5aa1464f3931eec36caaa38"}, - {file = "tomli-2.0.2.tar.gz", hash = "sha256:d46d457a85337051c36524bc5349dd91b1877838e2979ac5ced3e710ed8a60ed"}, + {file = "tomli-2.2.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:678e4fa69e4575eb77d103de3df8a895e1591b48e740211bd1067378c69e8249"}, + {file = "tomli-2.2.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:023aa114dd824ade0100497eb2318602af309e5a55595f76b626d6d9f3b7b0a6"}, + {file = "tomli-2.2.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ece47d672db52ac607a3d9599a9d48dcb2f2f735c6c2d1f34130085bb12b112a"}, + {file = "tomli-2.2.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6972ca9c9cc9f0acaa56a8ca1ff51e7af152a9f87fb64623e31d5c83700080ee"}, + {file = "tomli-2.2.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash 
= "sha256:c954d2250168d28797dd4e3ac5cf812a406cd5a92674ee4c8f123c889786aa8e"}, + {file = "tomli-2.2.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:8dd28b3e155b80f4d54beb40a441d366adcfe740969820caf156c019fb5c7ec4"}, + {file = "tomli-2.2.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:e59e304978767a54663af13c07b3d1af22ddee3bb2fb0618ca1593e4f593a106"}, + {file = "tomli-2.2.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:33580bccab0338d00994d7f16f4c4ec25b776af3ffaac1ed74e0b3fc95e885a8"}, + {file = "tomli-2.2.1-cp311-cp311-win32.whl", hash = "sha256:465af0e0875402f1d226519c9904f37254b3045fc5084697cefb9bdde1ff99ff"}, + {file = "tomli-2.2.1-cp311-cp311-win_amd64.whl", hash = "sha256:2d0f2fdd22b02c6d81637a3c95f8cd77f995846af7414c5c4b8d0545afa1bc4b"}, + {file = "tomli-2.2.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:4a8f6e44de52d5e6c657c9fe83b562f5f4256d8ebbfe4ff922c495620a7f6cea"}, + {file = "tomli-2.2.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8d57ca8095a641b8237d5b079147646153d22552f1c637fd3ba7f4b0b29167a8"}, + {file = "tomli-2.2.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4e340144ad7ae1533cb897d406382b4b6fede8890a03738ff1683af800d54192"}, + {file = "tomli-2.2.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:db2b95f9de79181805df90bedc5a5ab4c165e6ec3fe99f970d0e302f384ad222"}, + {file = "tomli-2.2.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:40741994320b232529c802f8bc86da4e1aa9f413db394617b9a256ae0f9a7f77"}, + {file = "tomli-2.2.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:400e720fe168c0f8521520190686ef8ef033fb19fc493da09779e592861b78c6"}, + {file = "tomli-2.2.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:02abe224de6ae62c19f090f68da4e27b10af2b93213d36cf44e6e1c5abd19fdd"}, + {file = "tomli-2.2.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:b82ebccc8c8a36f2094e969560a1b836758481f3dc360ce9a3277c65f374285e"}, + {file = "tomli-2.2.1-cp312-cp312-win32.whl", hash = "sha256:889f80ef92701b9dbb224e49ec87c645ce5df3fa2cc548664eb8a25e03127a98"}, + {file = "tomli-2.2.1-cp312-cp312-win_amd64.whl", hash = "sha256:7fc04e92e1d624a4a63c76474610238576942d6b8950a2d7f908a340494e67e4"}, + {file = "tomli-2.2.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:f4039b9cbc3048b2416cc57ab3bda989a6fcf9b36cf8937f01a6e731b64f80d7"}, + {file = "tomli-2.2.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:286f0ca2ffeeb5b9bd4fcc8d6c330534323ec51b2f52da063b11c502da16f30c"}, + {file = "tomli-2.2.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a92ef1a44547e894e2a17d24e7557a5e85a9e1d0048b0b5e7541f76c5032cb13"}, + {file = "tomli-2.2.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9316dc65bed1684c9a98ee68759ceaed29d229e985297003e494aa825ebb0281"}, + {file = "tomli-2.2.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e85e99945e688e32d5a35c1ff38ed0b3f41f43fad8df0bdf79f72b2ba7bc5272"}, + {file = "tomli-2.2.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:ac065718db92ca818f8d6141b5f66369833d4a80a9d74435a268c52bdfa73140"}, + {file = "tomli-2.2.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:d920f33822747519673ee656a4b6ac33e382eca9d331c87770faa3eef562aeb2"}, + {file = "tomli-2.2.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:a198f10c4d1b1375d7687bc25294306e551bf1abfa4eace6650070a5c1ae2744"}, + 
{file = "tomli-2.2.1-cp313-cp313-win32.whl", hash = "sha256:d3f5614314d758649ab2ab3a62d4f2004c825922f9e370b29416484086b264ec"}, + {file = "tomli-2.2.1-cp313-cp313-win_amd64.whl", hash = "sha256:a38aa0308e754b0e3c67e344754dff64999ff9b513e691d0e786265c93583c69"}, + {file = "tomli-2.2.1-py3-none-any.whl", hash = "sha256:cb55c73c5f4408779d0cf3eef9f762b9c9f147a77de7b258bef0a5628adc85cc"}, + {file = "tomli-2.2.1.tar.gz", hash = "sha256:cd45e1dc79c835ce60f7404ec8119f2eb06d38b1deba146f07ced3bbc44505ff"}, ] +[[package]] +name = "torch" +version = "2.6.0" +description = "Tensors and Dynamic neural networks in Python with strong GPU acceleration" +optional = false +python-versions = ">=3.9.0" +groups = ["main"] +files = [ + {file = "torch-2.6.0-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:6860df13d9911ac158f4c44031609700e1eba07916fff62e21e6ffa0a9e01961"}, + {file = "torch-2.6.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:c4f103a49830ce4c7561ef4434cc7926e5a5fe4e5eb100c19ab36ea1e2b634ab"}, + {file = "torch-2.6.0-cp310-cp310-win_amd64.whl", hash = "sha256:56eeaf2ecac90da5d9e35f7f35eb286da82673ec3c582e310a8d1631a1c02341"}, + {file = "torch-2.6.0-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:09e06f9949e1a0518c5b09fe95295bc9661f219d9ecb6f9893e5123e10696628"}, + {file = "torch-2.6.0-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:7979834102cd5b7a43cc64e87f2f3b14bd0e1458f06e9f88ffa386d07c7446e1"}, + {file = "torch-2.6.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:ccbd0320411fe1a3b3fec7b4d3185aa7d0c52adac94480ab024b5c8f74a0bf1d"}, + {file = "torch-2.6.0-cp311-cp311-win_amd64.whl", hash = "sha256:46763dcb051180ce1ed23d1891d9b1598e07d051ce4c9d14307029809c4d64f7"}, + {file = "torch-2.6.0-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:94fc63b3b4bedd327af588696559f68c264440e2503cc9e6954019473d74ae21"}, + {file = "torch-2.6.0-cp312-cp312-manylinux1_x86_64.whl", hash = "sha256:2bb8987f3bb1ef2675897034402373ddfc8f5ef0e156e2d8cfc47cacafdda4a9"}, + {file = "torch-2.6.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:b789069020c5588c70d5c2158ac0aa23fd24a028f34a8b4fcb8fcb4d7efcf5fb"}, + {file = "torch-2.6.0-cp312-cp312-win_amd64.whl", hash = "sha256:7e1448426d0ba3620408218b50aa6ada88aeae34f7a239ba5431f6c8774b1239"}, + {file = "torch-2.6.0-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:9a610afe216a85a8b9bc9f8365ed561535c93e804c2a317ef7fabcc5deda0989"}, + {file = "torch-2.6.0-cp313-cp313-manylinux1_x86_64.whl", hash = "sha256:4874a73507a300a5d089ceaff616a569e7bb7c613c56f37f63ec3ffac65259cf"}, + {file = "torch-2.6.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:a0d5e1b9874c1a6c25556840ab8920569a7a4137afa8a63a32cee0bc7d89bd4b"}, + {file = "torch-2.6.0-cp313-cp313-win_amd64.whl", hash = "sha256:510c73251bee9ba02ae1cb6c9d4ee0907b3ce6020e62784e2d7598e0cfa4d6cc"}, + {file = "torch-2.6.0-cp313-none-macosx_11_0_arm64.whl", hash = "sha256:ff96f4038f8af9f7ec4231710ed4549da1bdebad95923953a25045dcf6fd87e2"}, + {file = "torch-2.6.0-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:9ea955317cfcd3852b1402b62af258ce735c2edeee42ca9419b6bc889e5ae053"}, + {file = "torch-2.6.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:bb2c6c3e65049f081940f5ab15c9136c7de40d3f01192541c920a07c7c585b7e"}, + {file = "torch-2.6.0-cp39-cp39-win_amd64.whl", hash = "sha256:683410f97984103148e31b38a8631acf31c3034c020c0f4d26171e7626d8317a"}, + {file = "torch-2.6.0-cp39-none-macosx_11_0_arm64.whl", hash = "sha256:265f70de5fd45b864d924b64be1797f86e76c8e48a02c2a3a6fc7ec247d2226c"}, +] + 
+[package.dependencies] +filelock = "*" +fsspec = "*" +jinja2 = "*" +networkx = "*" +nvidia-cublas-cu12 = {version = "12.4.5.8", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} +nvidia-cuda-cupti-cu12 = {version = "12.4.127", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} +nvidia-cuda-nvrtc-cu12 = {version = "12.4.127", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} +nvidia-cuda-runtime-cu12 = {version = "12.4.127", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} +nvidia-cudnn-cu12 = {version = "9.1.0.70", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} +nvidia-cufft-cu12 = {version = "11.2.1.3", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} +nvidia-curand-cu12 = {version = "10.3.5.147", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} +nvidia-cusolver-cu12 = {version = "11.6.1.9", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} +nvidia-cusparse-cu12 = {version = "12.3.1.170", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} +nvidia-cusparselt-cu12 = {version = "0.6.2", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} +nvidia-nccl-cu12 = {version = "2.21.5", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} +nvidia-nvjitlink-cu12 = {version = "12.4.127", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} +nvidia-nvtx-cu12 = {version = "12.4.127", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} +setuptools = {version = "*", markers = "python_version >= \"3.12\""} +sympy = {version = "1.13.1", markers = "python_version >= \"3.9\""} +triton = {version = "3.2.0", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} +typing-extensions = ">=4.10.0" + +[package.extras] +opt-einsum = ["opt-einsum (>=3.3)"] +optree = ["optree (>=0.13.0)"] + [[package]] name = "tqdm" -version = "4.66.5" +version = "4.67.1" description = "Fast, Extensible Progress Meter" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ - {file = "tqdm-4.66.5-py3-none-any.whl", hash = "sha256:90279a3770753eafc9194a0364852159802111925aa30eb3f9d85b0e805ac7cd"}, - {file = "tqdm-4.66.5.tar.gz", hash = "sha256:e1020aef2e5096702d8a025ac7d16b1577279c9d63f8375b63083e9a5f0fcbad"}, + {file = "tqdm-4.67.1-py3-none-any.whl", hash = "sha256:26445eca388f82e72884e0d580d5464cd801a3ea01e63e5601bdff9ba6a48de2"}, + {file = "tqdm-4.67.1.tar.gz", hash = "sha256:f8aef9c52c08c13a65f30ea34f4e5aac3fd1a34959879d7e59e63027286627f2"}, ] [package.dependencies] colorama = {version = "*", markers = "platform_system == \"Windows\""} [package.extras] -dev = ["pytest (>=6)", "pytest-cov", "pytest-timeout", "pytest-xdist"] +dev = ["nbval", "pytest (>=6)", "pytest-asyncio (>=0.24)", "pytest-cov", "pytest-timeout"] +discord = ["requests"] notebook = ["ipywidgets (>=6)"] slack = ["slack-sdk"] telegram = ["requests"] [[package]] name = "transformers" -version = "4.45.2" +version = "4.49.0" description = "State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow" optional = false -python-versions = ">=3.8.0" +python-versions = ">=3.9.0" +groups = ["main"] files = [ - {file = "transformers-4.45.2-py3-none-any.whl", hash = "sha256:c551b33660cfc815bae1f9f097ecfd1e65be623f13c6ee0dda372bd881460210"}, - {file = "transformers-4.45.2.tar.gz", hash = 
"sha256:72bc390f6b203892561f05f86bbfaa0e234aab8e927a83e62b9d92ea7e3ae101"}, + {file = "transformers-4.49.0-py3-none-any.whl", hash = "sha256:6b4fded1c5fee04d384b1014495b4235a2b53c87503d7d592423c06128cbbe03"}, + {file = "transformers-4.49.0.tar.gz", hash = "sha256:7e40e640b5b8dc3f48743f5f5adbdce3660c82baafbd3afdfc04143cdbd2089e"}, ] [package.dependencies] filelock = "*" -huggingface-hub = ">=0.23.2,<1.0" +huggingface-hub = ">=0.26.0,<1.0" numpy = ">=1.17" packaging = ">=20.0" pyyaml = ">=5.1" regex = "!=2019.12.17" requests = "*" safetensors = ">=0.4.1" -sentencepiece = {version = ">=0.1.91,<0.1.92 || >0.1.92", optional = true, markers = "extra == \"sentencepiece\""} -tokenizers = ">=0.20,<0.21" +tokenizers = ">=0.21,<0.22" tqdm = ">=4.27" [package.extras] accelerate = ["accelerate (>=0.26.0)"] -agents = ["Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "datasets (!=2.5.0)", "diffusers", "opencv-python", "sentencepiece (>=0.1.91,!=0.1.92)", "torch"] -all = ["Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "av (==9.2.0)", "codecarbon (==1.2.0)", "decord (==0.6.0)", "flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "librosa", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "phonemizer", "protobuf", "pyctcdecode (>=0.4.0)", "ray[tune] (>=2.7.0)", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timm (<=0.9.16)", "tokenizers (>=0.20,<0.21)", "torch", "torchaudio", "torchvision"] +agents = ["Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "datasets (!=2.5.0)", "diffusers", "opencv-python", "sentencepiece (>=0.1.91,!=0.1.92)", "torch (>=2.0)"] +all = ["Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "av", "codecarbon (>=2.8.1)", "flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "librosa", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "phonemizer", "protobuf", "pyctcdecode (>=0.4.0)", "ray[tune] (>=2.7.0)", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timm (<=1.0.11)", "tokenizers (>=0.21,<0.22)", "torch (>=2.0)", "torchaudio", "torchvision"] audio = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"] benchmark = ["optimum-benchmark (>=0.3.0)"] -codecarbon = ["codecarbon (==1.2.0)"] +codecarbon = ["codecarbon (>=2.8.1)"] deepspeed = ["accelerate (>=0.26.0)", "deepspeed (>=0.9.3)"] -deepspeed-testing = ["GitPython (<3.1.19)", "accelerate (>=0.26.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "deepspeed (>=0.9.3)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "nltk (<=3.8.1)", "optuna", "parameterized", "protobuf", "psutil", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "timeout-decorator"] -dev = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "av (==9.2.0)", "beautifulsoup4", "codecarbon (==1.2.0)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "decord (==0.6.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "flax (>=0.4.1,<=0.7.0)", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", 
"keras-nlp (>=0.3.1,<0.14.0)", "libcst", "librosa", "nltk (<=3.8.1)", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "timm (<=0.9.16)", "tokenizers (>=0.20,<0.21)", "torch", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"] -dev-tensorflow = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "isort (>=5.5.4)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "libcst", "librosa", "nltk (<=3.8.1)", "onnxconverter-common", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "tokenizers (>=0.20,<0.21)", "urllib3 (<2.0.0)"] -dev-torch = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "beautifulsoup4", "codecarbon (==1.2.0)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "kenlm", "libcst", "librosa", "nltk (<=3.8.1)", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "timeout-decorator", "timm (<=0.9.16)", "tokenizers (>=0.20,<0.21)", "torch", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"] +deepspeed-testing = ["GitPython (<3.1.19)", "accelerate (>=0.26.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "deepspeed (>=0.9.3)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "nltk (<=3.8.1)", "optuna", "parameterized", "protobuf", "psutil", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-asyncio", "pytest-rich", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "timeout-decorator"] +dev = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "av", "beautifulsoup4", "codecarbon (>=2.8.1)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill 
(<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "flax (>=0.4.1,<=0.7.0)", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "libcst", "librosa", "nltk (<=3.8.1)", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-asyncio", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "timm (<=1.0.11)", "tokenizers (>=0.21,<0.22)", "torch (>=2.0)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"] +dev-tensorflow = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "isort (>=5.5.4)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "libcst", "librosa", "nltk (<=3.8.1)", "onnxconverter-common", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-asyncio", "pytest-rich", "pytest-timeout", "pytest-xdist", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "tokenizers (>=0.21,<0.22)", "urllib3 (<2.0.0)"] +dev-torch = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "beautifulsoup4", "codecarbon (>=2.8.1)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "kenlm", "libcst", "librosa", "nltk (<=3.8.1)", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-asyncio", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "timeout-decorator", "timm (<=1.0.11)", "tokenizers (>=0.21,<0.22)", "torch (>=2.0)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"] flax = ["flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "optax (>=0.0.8,<=0.1.4)", "scipy (<1.13.0)"] flax-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"] ftfy = ["ftfy"] @@ -3573,42 +2785,36 @@ serving = ["fastapi", "pydantic", "starlette", "uvicorn"] sigopt = ["sigopt"] sklearn = ["scikit-learn"] speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)", "torchaudio"] -testing = ["GitPython (<3.1.19)", "beautifulsoup4", 
"cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "nltk (<=3.8.1)", "parameterized", "psutil", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "timeout-decorator"] +testing = ["GitPython (<3.1.19)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "nltk (<=3.8.1)", "parameterized", "psutil", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-asyncio", "pytest-rich", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "timeout-decorator"] tf = ["keras-nlp (>=0.3.1,<0.14.0)", "onnxconverter-common", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx"] tf-cpu = ["keras (>2.9,<2.16)", "keras-nlp (>=0.3.1,<0.14.0)", "onnxconverter-common", "tensorflow-cpu (>2.9,<2.16)", "tensorflow-probability (<0.24)", "tensorflow-text (<2.16)", "tf2onnx"] tf-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"] tiktoken = ["blobfile", "tiktoken"] -timm = ["timm (<=0.9.16)"] -tokenizers = ["tokenizers (>=0.20,<0.21)"] -torch = ["accelerate (>=0.26.0)", "torch"] +timm = ["timm (<=1.0.11)"] +tokenizers = ["tokenizers (>=0.21,<0.22)"] +torch = ["accelerate (>=0.26.0)", "torch (>=2.0)"] torch-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)", "torchaudio"] torch-vision = ["Pillow (>=10.0.1,<=15.0)", "torchvision"] -torchhub = ["filelock", "huggingface-hub (>=0.23.2,<1.0)", "importlib-metadata", "numpy (>=1.17)", "packaging (>=20.0)", "protobuf", "regex (!=2019.12.17)", "requests", "sentencepiece (>=0.1.91,!=0.1.92)", "tokenizers (>=0.20,<0.21)", "torch", "tqdm (>=4.27)"] -video = ["av (==9.2.0)", "decord (==0.6.0)"] +torchhub = ["filelock", "huggingface-hub (>=0.26.0,<1.0)", "importlib-metadata", "numpy (>=1.17)", "packaging (>=20.0)", "protobuf", "regex (!=2019.12.17)", "requests", "sentencepiece (>=0.1.91,!=0.1.92)", "tokenizers (>=0.21,<0.22)", "torch (>=2.0)", "tqdm (>=4.27)"] +video = ["av"] vision = ["Pillow (>=10.0.1,<=15.0)"] [[package]] name = "triton" -version = "3.0.0" +version = "3.2.0" description = "A language and compiler for custom Deep Learning operations" -optional = true +optional = false python-versions = "*" +groups = ["main"] +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\"" files = [ - {file = "triton-3.0.0-1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:e1efef76935b2febc365bfadf74bcb65a6f959a9872e5bddf44cc9e0adce1e1a"}, - {file = "triton-3.0.0-1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5ce8520437c602fb633f1324cc3871c47bee3b67acf9756c1a66309b60e3216c"}, - {file = "triton-3.0.0-1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:34e509deb77f1c067d8640725ef00c5cbfcb2052a1a3cb6a6d343841f92624eb"}, - {file = "triton-3.0.0-1-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:bcbf3b1c48af6a28011a5c40a5b3b9b5330530c3827716b5fbf6d7adcc1e53e9"}, - {file = "triton-3.0.0-1-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:6e5727202f7078c56f91ff13ad0c1abab14a0e7f2c87e91b12b6f64f3e8ae609"}, - {file = 
"triton-3.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:39b052da883351fdf6be3d93cedae6db3b8e3988d3b09ed221bccecfa9612230"}, - {file = "triton-3.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cd34f19a8582af96e6291d4afce25dac08cb2a5d218c599163761e8e0827208e"}, - {file = "triton-3.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0d5e10de8c011adeb7c878c6ce0dd6073b14367749e34467f1cff2bde1b78253"}, - {file = "triton-3.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e8903767951bf86ec960b4fe4e21bc970055afc65e9d57e916d79ae3c93665e3"}, - {file = "triton-3.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:41004fb1ae9a53fcb3e970745feb87f0e3c94c6ce1ba86e95fa3b8537894bef7"}, + {file = "triton-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b3e54983cd51875855da7c68ec05c05cf8bb08df361b1d5b69e05e40b0c9bd62"}, + {file = "triton-3.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8009a1fb093ee8546495e96731336a33fb8856a38e45bb4ab6affd6dbc3ba220"}, + {file = "triton-3.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8d9b215efc1c26fa7eefb9a157915c92d52e000d2bf83e5f69704047e63f125c"}, + {file = "triton-3.2.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e5dfa23ba84541d7c0a531dfce76d8bcd19159d50a4a8b14ad01e91734a5c1b0"}, + {file = "triton-3.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:30ceed0eff2c4a73b14eb63e052992f44bbdf175f3fad21e1ac8097a772de7ee"}, ] -[package.dependencies] -filelock = "*" - [package.extras] build = ["cmake (>=3.20)", "lit"] tests = ["autopep8", "flake8", "isort", "llnl-hatchet", "numpy", "pytest", "scipy (>=1.7.1)"] @@ -3616,55 +2822,59 @@ tutorials = ["matplotlib", "pandas", "tabulate"] [[package]] name = "typer" -version = "0.6.1" +version = "0.15.2" description = "Typer, build great CLIs. Easy to code. Based on Python type hints." 
optional = false -python-versions = ">=3.6" +python-versions = ">=3.7" +groups = ["main"] files = [ - {file = "typer-0.6.1-py3-none-any.whl", hash = "sha256:54b19e5df18654070a82f8c2aa1da456a4ac16a2a83e6dcd9f170e291c56338e"}, - {file = "typer-0.6.1.tar.gz", hash = "sha256:2d5720a5e63f73eaf31edaa15f6ab87f35f0690f8ca233017d7d23d743a91d73"}, + {file = "typer-0.15.2-py3-none-any.whl", hash = "sha256:46a499c6107d645a9c13f7ee46c5d5096cae6f5fc57dd11eccbbb9ae3e44ddfc"}, + {file = "typer-0.15.2.tar.gz", hash = "sha256:ab2fab47533a813c49fe1f16b1a370fd5819099c00b119e0633df65f22144ba5"}, ] [package.dependencies] -click = ">=7.1.1,<9.0.0" - -[package.extras] -all = ["colorama (>=0.4.3,<0.5.0)", "rich (>=10.11.0,<13.0.0)", "shellingham (>=1.3.0,<2.0.0)"] -dev = ["autoflake (>=1.3.1,<2.0.0)", "flake8 (>=3.8.3,<4.0.0)", "pre-commit (>=2.17.0,<3.0.0)"] -doc = ["mdx-include (>=1.4.1,<2.0.0)", "mkdocs (>=1.1.2,<2.0.0)", "mkdocs-material (>=8.1.4,<9.0.0)"] -test = ["black (>=22.3.0,<23.0.0)", "coverage (>=5.2,<6.0)", "isort (>=5.0.6,<6.0.0)", "mypy (==0.910)", "pytest (>=4.4.0,<5.4.0)", "pytest-cov (>=2.10.0,<3.0.0)", "pytest-sugar (>=0.9.4,<0.10.0)", "pytest-xdist (>=1.32.0,<2.0.0)", "rich (>=10.11.0,<13.0.0)", "shellingham (>=1.3.0,<2.0.0)"] +click = ">=8.0.0" +rich = ">=10.11.0" +shellingham = ">=1.3.0" +typing-extensions = ">=3.7.4.3" [[package]] name = "typing-extensions" -version = "4.12.2" +version = "4.13.2" description = "Backported and Experimental Type Hints for Python 3.8+" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ - {file = "typing_extensions-4.12.2-py3-none-any.whl", hash = "sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d"}, - {file = "typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8"}, + {file = "typing_extensions-4.13.2-py3-none-any.whl", hash = "sha256:a439e7c04b49fec3e5d3e2beaa21755cadbbdc391694e28ccdd36ca4a1408f8c"}, + {file = "typing_extensions-4.13.2.tar.gz", hash = "sha256:e6c81219bd689f51865d9e372991c540bda33a0379d5573cddb9a3a23f7caaef"}, ] [[package]] -name = "tzdata" -version = "2024.2" -description = "Provider of IANA time zone data" +name = "typing-inspection" +version = "0.4.0" +description = "Runtime typing introspection tools" optional = true -python-versions = ">=2" +python-versions = ">=3.9" +groups = ["main"] files = [ - {file = "tzdata-2024.2-py2.py3-none-any.whl", hash = "sha256:a48093786cdcde33cad18c2555e8532f34422074448fbc874186f0abd79565cd"}, - {file = "tzdata-2024.2.tar.gz", hash = "sha256:7d85cc416e9382e69095b7bdf4afd9e3880418a2413feec7069d533d6b4e31cc"}, + {file = "typing_inspection-0.4.0-py3-none-any.whl", hash = "sha256:50e72559fcd2a6367a19f7a7e610e6afcb9fac940c650290eed893d61386832f"}, + {file = "typing_inspection-0.4.0.tar.gz", hash = "sha256:9765c87de36671694a67904bf2c96e395be9c6439bb6c87b5142569dcdd65122"}, ] +[package.dependencies] +typing-extensions = ">=4.12.0" + [[package]] name = "urllib3" -version = "2.2.3" +version = "2.4.0" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false -python-versions = ">=3.8" +python-versions = ">=3.9" +groups = ["main"] files = [ - {file = "urllib3-2.2.3-py3-none-any.whl", hash = "sha256:ca899ca043dcb1bafa3e262d73aa25c465bfb49e0bd9dd5d59f1d0acba2f8fac"}, - {file = "urllib3-2.2.3.tar.gz", hash = "sha256:e7d814a81dad81e6caf2ec9fdedb284ecc9c73076b62654547cc64ccdcae26e9"}, + {file = "urllib3-2.4.0-py3-none-any.whl", hash = "sha256:4e16665048960a0900c702d4a66415956a584919c03361cac9f1df5c5dd7e813"}, + {file = "urllib3-2.4.0.tar.gz", hash = "sha256:414bc6535b787febd7567804cc015fee39daab8ad86268f1310a9250697de466"}, ] [package.extras] @@ -3675,13 +2885,15 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "win32-setctime" -version = "1.1.0" +version = "1.2.0" description = "A small Python utility to set file creation time on Windows" optional = false python-versions = ">=3.5" +groups = ["main"] +markers = "sys_platform == \"win32\"" files = [ - {file = "win32_setctime-1.1.0-py3-none-any.whl", hash = "sha256:231db239e959c2fe7eb1d7dc129f11172354f98361c4fa2d6d2d7e278baa8aad"}, - {file = "win32_setctime-1.1.0.tar.gz", hash = "sha256:15cf5750465118d6929ae4de4eb46e8edae9a5634350c01ba582df868e932cb2"}, + {file = "win32_setctime-1.2.0-py3-none-any.whl", hash = "sha256:95d644c4e708aba81dc3704a116d8cbc974d70b3bdb8be1d150e36be6e9d1390"}, + {file = "win32_setctime-1.2.0.tar.gz", hash = "sha256:ae1fdf948f5640aae05c511ade119313fb6a30d7eabe25fef9764dca5873c4c0"}, ] [package.extras] @@ -3689,320 +2901,103 @@ dev = ["black (>=19.3b0)", "pytest (>=4.6.2)"] [[package]] name = "wrapt" -version = "1.16.0" +version = "1.17.2" description = "Module for decorators, wrappers and monkey patching." optional = false -python-versions = ">=3.6" +python-versions = ">=3.8" +groups = ["main"] files = [ - {file = "wrapt-1.16.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ffa565331890b90056c01db69c0fe634a776f8019c143a5ae265f9c6bc4bd6d4"}, - {file = "wrapt-1.16.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:e4fdb9275308292e880dcbeb12546df7f3e0f96c6b41197e0cf37d2826359020"}, - {file = "wrapt-1.16.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bb2dee3874a500de01c93d5c71415fcaef1d858370d405824783e7a8ef5db440"}, - {file = "wrapt-1.16.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2a88e6010048489cda82b1326889ec075a8c856c2e6a256072b28eaee3ccf487"}, - {file = "wrapt-1.16.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ac83a914ebaf589b69f7d0a1277602ff494e21f4c2f743313414378f8f50a4cf"}, - {file = "wrapt-1.16.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:73aa7d98215d39b8455f103de64391cb79dfcad601701a3aa0dddacf74911d72"}, - {file = "wrapt-1.16.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:807cc8543a477ab7422f1120a217054f958a66ef7314f76dd9e77d3f02cdccd0"}, - {file = "wrapt-1.16.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:bf5703fdeb350e36885f2875d853ce13172ae281c56e509f4e6eca049bdfb136"}, - {file = "wrapt-1.16.0-cp310-cp310-win32.whl", hash = "sha256:f6b2d0c6703c988d334f297aa5df18c45e97b0af3679bb75059e0e0bd8b1069d"}, - {file = "wrapt-1.16.0-cp310-cp310-win_amd64.whl", hash = "sha256:decbfa2f618fa8ed81c95ee18a387ff973143c656ef800c9f24fb7e9c16054e2"}, - {file = "wrapt-1.16.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:1a5db485fe2de4403f13fafdc231b0dbae5eca4359232d2efc79025527375b09"}, - {file = "wrapt-1.16.0-cp311-cp311-macosx_11_0_arm64.whl", hash = 
"sha256:75ea7d0ee2a15733684badb16de6794894ed9c55aa5e9903260922f0482e687d"}, - {file = "wrapt-1.16.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a452f9ca3e3267cd4d0fcf2edd0d035b1934ac2bd7e0e57ac91ad6b95c0c6389"}, - {file = "wrapt-1.16.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:43aa59eadec7890d9958748db829df269f0368521ba6dc68cc172d5d03ed8060"}, - {file = "wrapt-1.16.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:72554a23c78a8e7aa02abbd699d129eead8b147a23c56e08d08dfc29cfdddca1"}, - {file = "wrapt-1.16.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:d2efee35b4b0a347e0d99d28e884dfd82797852d62fcd7ebdeee26f3ceb72cf3"}, - {file = "wrapt-1.16.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:6dcfcffe73710be01d90cae08c3e548d90932d37b39ef83969ae135d36ef3956"}, - {file = "wrapt-1.16.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:eb6e651000a19c96f452c85132811d25e9264d836951022d6e81df2fff38337d"}, - {file = "wrapt-1.16.0-cp311-cp311-win32.whl", hash = "sha256:66027d667efe95cc4fa945af59f92c5a02c6f5bb6012bff9e60542c74c75c362"}, - {file = "wrapt-1.16.0-cp311-cp311-win_amd64.whl", hash = "sha256:aefbc4cb0a54f91af643660a0a150ce2c090d3652cf4052a5397fb2de549cd89"}, - {file = "wrapt-1.16.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:5eb404d89131ec9b4f748fa5cfb5346802e5ee8836f57d516576e61f304f3b7b"}, - {file = "wrapt-1.16.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:9090c9e676d5236a6948330e83cb89969f433b1943a558968f659ead07cb3b36"}, - {file = "wrapt-1.16.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:94265b00870aa407bd0cbcfd536f17ecde43b94fb8d228560a1e9d3041462d73"}, - {file = "wrapt-1.16.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f2058f813d4f2b5e3a9eb2eb3faf8f1d99b81c3e51aeda4b168406443e8ba809"}, - {file = "wrapt-1.16.0-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:98b5e1f498a8ca1858a1cdbffb023bfd954da4e3fa2c0cb5853d40014557248b"}, - {file = "wrapt-1.16.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:14d7dc606219cdd7405133c713f2c218d4252f2a469003f8c46bb92d5d095d81"}, - {file = "wrapt-1.16.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:49aac49dc4782cb04f58986e81ea0b4768e4ff197b57324dcbd7699c5dfb40b9"}, - {file = "wrapt-1.16.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:418abb18146475c310d7a6dc71143d6f7adec5b004ac9ce08dc7a34e2babdc5c"}, - {file = "wrapt-1.16.0-cp312-cp312-win32.whl", hash = "sha256:685f568fa5e627e93f3b52fda002c7ed2fa1800b50ce51f6ed1d572d8ab3e7fc"}, - {file = "wrapt-1.16.0-cp312-cp312-win_amd64.whl", hash = "sha256:dcdba5c86e368442528f7060039eda390cc4091bfd1dca41e8046af7c910dda8"}, - {file = "wrapt-1.16.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:d462f28826f4657968ae51d2181a074dfe03c200d6131690b7d65d55b0f360f8"}, - {file = "wrapt-1.16.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a33a747400b94b6d6b8a165e4480264a64a78c8a4c734b62136062e9a248dd39"}, - {file = "wrapt-1.16.0-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b3646eefa23daeba62643a58aac816945cadc0afaf21800a1421eeba5f6cfb9c"}, - {file = "wrapt-1.16.0-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", 
hash = "sha256:3ebf019be5c09d400cf7b024aa52b1f3aeebeff51550d007e92c3c1c4afc2a40"}, - {file = "wrapt-1.16.0-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:0d2691979e93d06a95a26257adb7bfd0c93818e89b1406f5a28f36e0d8c1e1fc"}, - {file = "wrapt-1.16.0-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:1acd723ee2a8826f3d53910255643e33673e1d11db84ce5880675954183ec47e"}, - {file = "wrapt-1.16.0-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:bc57efac2da352a51cc4658878a68d2b1b67dbe9d33c36cb826ca449d80a8465"}, - {file = "wrapt-1.16.0-cp36-cp36m-win32.whl", hash = "sha256:da4813f751142436b075ed7aa012a8778aa43a99f7b36afe9b742d3ed8bdc95e"}, - {file = "wrapt-1.16.0-cp36-cp36m-win_amd64.whl", hash = "sha256:6f6eac2360f2d543cc875a0e5efd413b6cbd483cb3ad7ebf888884a6e0d2e966"}, - {file = "wrapt-1.16.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:a0ea261ce52b5952bf669684a251a66df239ec6d441ccb59ec7afa882265d593"}, - {file = "wrapt-1.16.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7bd2d7ff69a2cac767fbf7a2b206add2e9a210e57947dd7ce03e25d03d2de292"}, - {file = "wrapt-1.16.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9159485323798c8dc530a224bd3ffcf76659319ccc7bbd52e01e73bd0241a0c5"}, - {file = "wrapt-1.16.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a86373cf37cd7764f2201b76496aba58a52e76dedfaa698ef9e9688bfd9e41cf"}, - {file = "wrapt-1.16.0-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:73870c364c11f03ed072dda68ff7aea6d2a3a5c3fe250d917a429c7432e15228"}, - {file = "wrapt-1.16.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:b935ae30c6e7400022b50f8d359c03ed233d45b725cfdd299462f41ee5ffba6f"}, - {file = "wrapt-1.16.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:db98ad84a55eb09b3c32a96c576476777e87c520a34e2519d3e59c44710c002c"}, - {file = "wrapt-1.16.0-cp37-cp37m-win32.whl", hash = "sha256:9153ed35fc5e4fa3b2fe97bddaa7cbec0ed22412b85bcdaf54aeba92ea37428c"}, - {file = "wrapt-1.16.0-cp37-cp37m-win_amd64.whl", hash = "sha256:66dfbaa7cfa3eb707bbfcd46dab2bc6207b005cbc9caa2199bcbc81d95071a00"}, - {file = "wrapt-1.16.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1dd50a2696ff89f57bd8847647a1c363b687d3d796dc30d4dd4a9d1689a706f0"}, - {file = "wrapt-1.16.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:44a2754372e32ab315734c6c73b24351d06e77ffff6ae27d2ecf14cf3d229202"}, - {file = "wrapt-1.16.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8e9723528b9f787dc59168369e42ae1c3b0d3fadb2f1a71de14531d321ee05b0"}, - {file = "wrapt-1.16.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dbed418ba5c3dce92619656802cc5355cb679e58d0d89b50f116e4a9d5a9603e"}, - {file = "wrapt-1.16.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:941988b89b4fd6b41c3f0bfb20e92bd23746579736b7343283297c4c8cbae68f"}, - {file = "wrapt-1.16.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:6a42cd0cfa8ffc1915aef79cb4284f6383d8a3e9dcca70c445dcfdd639d51267"}, - {file = "wrapt-1.16.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:1ca9b6085e4f866bd584fb135a041bfc32cab916e69f714a7d1d397f8c4891ca"}, - {file = "wrapt-1.16.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:d5e49454f19ef621089e204f862388d29e6e8d8b162efce05208913dde5b9ad6"}, - {file = "wrapt-1.16.0-cp38-cp38-win32.whl", hash = 
"sha256:c31f72b1b6624c9d863fc095da460802f43a7c6868c5dda140f51da24fd47d7b"}, - {file = "wrapt-1.16.0-cp38-cp38-win_amd64.whl", hash = "sha256:490b0ee15c1a55be9c1bd8609b8cecd60e325f0575fc98f50058eae366e01f41"}, - {file = "wrapt-1.16.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9b201ae332c3637a42f02d1045e1d0cccfdc41f1f2f801dafbaa7e9b4797bfc2"}, - {file = "wrapt-1.16.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:2076fad65c6736184e77d7d4729b63a6d1ae0b70da4868adeec40989858eb3fb"}, - {file = "wrapt-1.16.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c5cd603b575ebceca7da5a3a251e69561bec509e0b46e4993e1cac402b7247b8"}, - {file = "wrapt-1.16.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b47cfad9e9bbbed2339081f4e346c93ecd7ab504299403320bf85f7f85c7d46c"}, - {file = "wrapt-1.16.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f8212564d49c50eb4565e502814f694e240c55551a5f1bc841d4fcaabb0a9b8a"}, - {file = "wrapt-1.16.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:5f15814a33e42b04e3de432e573aa557f9f0f56458745c2074952f564c50e664"}, - {file = "wrapt-1.16.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:db2e408d983b0e61e238cf579c09ef7020560441906ca990fe8412153e3b291f"}, - {file = "wrapt-1.16.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:edfad1d29c73f9b863ebe7082ae9321374ccb10879eeabc84ba3b69f2579d537"}, - {file = "wrapt-1.16.0-cp39-cp39-win32.whl", hash = "sha256:ed867c42c268f876097248e05b6117a65bcd1e63b779e916fe2e33cd6fd0d3c3"}, - {file = "wrapt-1.16.0-cp39-cp39-win_amd64.whl", hash = "sha256:eb1b046be06b0fce7249f1d025cd359b4b80fc1c3e24ad9eca33e0dcdb2e4a35"}, - {file = "wrapt-1.16.0-py3-none-any.whl", hash = "sha256:6906c4100a8fcbf2fa735f6059214bb13b97f75b1a61777fcf6432121ef12ef1"}, - {file = "wrapt-1.16.0.tar.gz", hash = "sha256:5f370f952971e7d17c7d1ead40e49f32345a7f7a5373571ef44d800d06b1899d"}, + {file = "wrapt-1.17.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:3d57c572081fed831ad2d26fd430d565b76aa277ed1d30ff4d40670b1c0dd984"}, + {file = "wrapt-1.17.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b5e251054542ae57ac7f3fba5d10bfff615b6c2fb09abeb37d2f1463f841ae22"}, + {file = "wrapt-1.17.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:80dd7db6a7cb57ffbc279c4394246414ec99537ae81ffd702443335a61dbf3a7"}, + {file = "wrapt-1.17.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0a6e821770cf99cc586d33833b2ff32faebdbe886bd6322395606cf55153246c"}, + {file = "wrapt-1.17.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b60fb58b90c6d63779cb0c0c54eeb38941bae3ecf7a73c764c52c88c2dcb9d72"}, + {file = "wrapt-1.17.2-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b870b5df5b71d8c3359d21be8f0d6c485fa0ebdb6477dda51a1ea54a9b558061"}, + {file = "wrapt-1.17.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:4011d137b9955791f9084749cba9a367c68d50ab8d11d64c50ba1688c9b457f2"}, + {file = "wrapt-1.17.2-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:1473400e5b2733e58b396a04eb7f35f541e1fb976d0c0724d0223dd607e0f74c"}, + {file = "wrapt-1.17.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:3cedbfa9c940fdad3e6e941db7138e26ce8aad38ab5fe9dcfadfed9db7a54e62"}, + {file = "wrapt-1.17.2-cp310-cp310-win32.whl", hash = 
"sha256:582530701bff1dec6779efa00c516496968edd851fba224fbd86e46cc6b73563"}, + {file = "wrapt-1.17.2-cp310-cp310-win_amd64.whl", hash = "sha256:58705da316756681ad3c9c73fd15499aa4d8c69f9fd38dc8a35e06c12468582f"}, + {file = "wrapt-1.17.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:ff04ef6eec3eee8a5efef2401495967a916feaa353643defcc03fc74fe213b58"}, + {file = "wrapt-1.17.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4db983e7bca53819efdbd64590ee96c9213894272c776966ca6306b73e4affda"}, + {file = "wrapt-1.17.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:9abc77a4ce4c6f2a3168ff34b1da9b0f311a8f1cfd694ec96b0603dff1c79438"}, + {file = "wrapt-1.17.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0b929ac182f5ace000d459c59c2c9c33047e20e935f8e39371fa6e3b85d56f4a"}, + {file = "wrapt-1.17.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f09b286faeff3c750a879d336fb6d8713206fc97af3adc14def0cdd349df6000"}, + {file = "wrapt-1.17.2-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1a7ed2d9d039bd41e889f6fb9364554052ca21ce823580f6a07c4ec245c1f5d6"}, + {file = "wrapt-1.17.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:129a150f5c445165ff941fc02ee27df65940fcb8a22a61828b1853c98763a64b"}, + {file = "wrapt-1.17.2-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:1fb5699e4464afe5c7e65fa51d4f99e0b2eadcc176e4aa33600a3df7801d6662"}, + {file = "wrapt-1.17.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:9a2bce789a5ea90e51a02dfcc39e31b7f1e662bc3317979aa7e5538e3a034f72"}, + {file = "wrapt-1.17.2-cp311-cp311-win32.whl", hash = "sha256:4afd5814270fdf6380616b321fd31435a462019d834f83c8611a0ce7484c7317"}, + {file = "wrapt-1.17.2-cp311-cp311-win_amd64.whl", hash = "sha256:acc130bc0375999da18e3d19e5a86403667ac0c4042a094fefb7eec8ebac7cf3"}, + {file = "wrapt-1.17.2-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:d5e2439eecc762cd85e7bd37161d4714aa03a33c5ba884e26c81559817ca0925"}, + {file = "wrapt-1.17.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:3fc7cb4c1c744f8c05cd5f9438a3caa6ab94ce8344e952d7c45a8ed59dd88392"}, + {file = "wrapt-1.17.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8fdbdb757d5390f7c675e558fd3186d590973244fab0c5fe63d373ade3e99d40"}, + {file = "wrapt-1.17.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5bb1d0dbf99411f3d871deb6faa9aabb9d4e744d67dcaaa05399af89d847a91d"}, + {file = "wrapt-1.17.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d18a4865f46b8579d44e4fe1e2bcbc6472ad83d98e22a26c963d46e4c125ef0b"}, + {file = "wrapt-1.17.2-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc570b5f14a79734437cb7b0500376b6b791153314986074486e0b0fa8d71d98"}, + {file = "wrapt-1.17.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6d9187b01bebc3875bac9b087948a2bccefe464a7d8f627cf6e48b1bbae30f82"}, + {file = "wrapt-1.17.2-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:9e8659775f1adf02eb1e6f109751268e493c73716ca5761f8acb695e52a756ae"}, + {file = "wrapt-1.17.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:e8b2816ebef96d83657b56306152a93909a83f23994f4b30ad4573b00bd11bb9"}, + {file = "wrapt-1.17.2-cp312-cp312-win32.whl", hash = "sha256:468090021f391fe0056ad3e807e3d9034e0fd01adcd3bdfba977b6fdf4213ea9"}, + {file = 
"wrapt-1.17.2-cp312-cp312-win_amd64.whl", hash = "sha256:ec89ed91f2fa8e3f52ae53cd3cf640d6feff92ba90d62236a81e4e563ac0e991"}, + {file = "wrapt-1.17.2-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:6ed6ffac43aecfe6d86ec5b74b06a5be33d5bb9243d055141e8cabb12aa08125"}, + {file = "wrapt-1.17.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:35621ae4c00e056adb0009f8e86e28eb4a41a4bfa8f9bfa9fca7d343fe94f998"}, + {file = "wrapt-1.17.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:a604bf7a053f8362d27eb9fefd2097f82600b856d5abe996d623babd067b1ab5"}, + {file = "wrapt-1.17.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5cbabee4f083b6b4cd282f5b817a867cf0b1028c54d445b7ec7cfe6505057cf8"}, + {file = "wrapt-1.17.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:49703ce2ddc220df165bd2962f8e03b84c89fee2d65e1c24a7defff6f988f4d6"}, + {file = "wrapt-1.17.2-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8112e52c5822fc4253f3901b676c55ddf288614dc7011634e2719718eaa187dc"}, + {file = "wrapt-1.17.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:9fee687dce376205d9a494e9c121e27183b2a3df18037f89d69bd7b35bcf59e2"}, + {file = "wrapt-1.17.2-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:18983c537e04d11cf027fbb60a1e8dfd5190e2b60cc27bc0808e653e7b218d1b"}, + {file = "wrapt-1.17.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:703919b1633412ab54bcf920ab388735832fdcb9f9a00ae49387f0fe67dad504"}, + {file = "wrapt-1.17.2-cp313-cp313-win32.whl", hash = "sha256:abbb9e76177c35d4e8568e58650aa6926040d6a9f6f03435b7a522bf1c487f9a"}, + {file = "wrapt-1.17.2-cp313-cp313-win_amd64.whl", hash = "sha256:69606d7bb691b50a4240ce6b22ebb319c1cfb164e5f6569835058196e0f3a845"}, + {file = "wrapt-1.17.2-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:4a721d3c943dae44f8e243b380cb645a709ba5bd35d3ad27bc2ed947e9c68192"}, + {file = "wrapt-1.17.2-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:766d8bbefcb9e00c3ac3b000d9acc51f1b399513f44d77dfe0eb026ad7c9a19b"}, + {file = "wrapt-1.17.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:e496a8ce2c256da1eb98bd15803a79bee00fc351f5dfb9ea82594a3f058309e0"}, + {file = "wrapt-1.17.2-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:40d615e4fe22f4ad3528448c193b218e077656ca9ccb22ce2cb20db730f8d306"}, + {file = "wrapt-1.17.2-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a5aaeff38654462bc4b09023918b7f21790efb807f54c000a39d41d69cf552cb"}, + {file = "wrapt-1.17.2-cp313-cp313t-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9a7d15bbd2bc99e92e39f49a04653062ee6085c0e18b3b7512a4f2fe91f2d681"}, + {file = "wrapt-1.17.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:e3890b508a23299083e065f435a492b5435eba6e304a7114d2f919d400888cc6"}, + {file = "wrapt-1.17.2-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:8c8b293cd65ad716d13d8dd3624e42e5a19cc2a2f1acc74b30c2c13f15cb61a6"}, + {file = "wrapt-1.17.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:4c82b8785d98cdd9fed4cac84d765d234ed3251bd6afe34cb7ac523cb93e8b4f"}, + {file = "wrapt-1.17.2-cp313-cp313t-win32.whl", hash = "sha256:13e6afb7fe71fe7485a4550a8844cc9ffbe263c0f1a1eea569bc7091d4898555"}, + {file = "wrapt-1.17.2-cp313-cp313t-win_amd64.whl", hash = 
"sha256:eaf675418ed6b3b31c7a989fd007fa7c3be66ce14e5c3b27336383604c9da85c"}, + {file = "wrapt-1.17.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:5c803c401ea1c1c18de70a06a6f79fcc9c5acfc79133e9869e730ad7f8ad8ef9"}, + {file = "wrapt-1.17.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:f917c1180fdb8623c2b75a99192f4025e412597c50b2ac870f156de8fb101119"}, + {file = "wrapt-1.17.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:ecc840861360ba9d176d413a5489b9a0aff6d6303d7e733e2c4623cfa26904a6"}, + {file = "wrapt-1.17.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bb87745b2e6dc56361bfde481d5a378dc314b252a98d7dd19a651a3fa58f24a9"}, + {file = "wrapt-1.17.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:58455b79ec2661c3600e65c0a716955adc2410f7383755d537584b0de41b1d8a"}, + {file = "wrapt-1.17.2-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b4e42a40a5e164cbfdb7b386c966a588b1047558a990981ace551ed7e12ca9c2"}, + {file = "wrapt-1.17.2-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:91bd7d1773e64019f9288b7a5101f3ae50d3d8e6b1de7edee9c2ccc1d32f0c0a"}, + {file = "wrapt-1.17.2-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:bb90fb8bda722a1b9d48ac1e6c38f923ea757b3baf8ebd0c82e09c5c1a0e7a04"}, + {file = "wrapt-1.17.2-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:08e7ce672e35efa54c5024936e559469436f8b8096253404faeb54d2a878416f"}, + {file = "wrapt-1.17.2-cp38-cp38-win32.whl", hash = "sha256:410a92fefd2e0e10d26210e1dfb4a876ddaf8439ef60d6434f21ef8d87efc5b7"}, + {file = "wrapt-1.17.2-cp38-cp38-win_amd64.whl", hash = "sha256:95c658736ec15602da0ed73f312d410117723914a5c91a14ee4cdd72f1d790b3"}, + {file = "wrapt-1.17.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:99039fa9e6306880572915728d7f6c24a86ec57b0a83f6b2491e1d8ab0235b9a"}, + {file = "wrapt-1.17.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:2696993ee1eebd20b8e4ee4356483c4cb696066ddc24bd70bcbb80fa56ff9061"}, + {file = "wrapt-1.17.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:612dff5db80beef9e649c6d803a8d50c409082f1fedc9dbcdfde2983b2025b82"}, + {file = "wrapt-1.17.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:62c2caa1585c82b3f7a7ab56afef7b3602021d6da34fbc1cf234ff139fed3cd9"}, + {file = "wrapt-1.17.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c958bcfd59bacc2d0249dcfe575e71da54f9dcf4a8bdf89c4cb9a68a1170d73f"}, + {file = "wrapt-1.17.2-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fc78a84e2dfbc27afe4b2bd7c80c8db9bca75cc5b85df52bfe634596a1da846b"}, + {file = "wrapt-1.17.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:ba0f0eb61ef00ea10e00eb53a9129501f52385c44853dbd6c4ad3f403603083f"}, + {file = "wrapt-1.17.2-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:1e1fe0e6ab7775fd842bc39e86f6dcfc4507ab0ffe206093e76d61cde37225c8"}, + {file = "wrapt-1.17.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:c86563182421896d73858e08e1db93afdd2b947a70064b813d515d66549e15f9"}, + {file = "wrapt-1.17.2-cp39-cp39-win32.whl", hash = "sha256:f393cda562f79828f38a819f4788641ac7c4085f30f1ce1a68672baa686482bb"}, + {file = "wrapt-1.17.2-cp39-cp39-win_amd64.whl", hash = "sha256:36ccae62f64235cf8ddb682073a60519426fdd4725524ae38874adf72b5f2aeb"}, + {file = "wrapt-1.17.2-py3-none-any.whl", hash = 
"sha256:b18f2d1533a71f069c7f82d524a52599053d4c7166e9dd374ae2136b7f40f7c8"}, + {file = "wrapt-1.17.2.tar.gz", hash = "sha256:41388e9d4d1522446fe79d3213196bd9e3b301a336965b9e27ca2788ebd122f3"}, ] -[[package]] -name = "xxhash" -version = "3.5.0" -description = "Python binding for xxHash" -optional = true -python-versions = ">=3.7" -files = [ - {file = "xxhash-3.5.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ece616532c499ee9afbb83078b1b952beffef121d989841f7f4b3dc5ac0fd212"}, - {file = "xxhash-3.5.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3171f693dbc2cef6477054a665dc255d996646b4023fe56cb4db80e26f4cc520"}, - {file = "xxhash-3.5.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7c5d3e570ef46adaf93fc81b44aca6002b5a4d8ca11bd0580c07eac537f36680"}, - {file = "xxhash-3.5.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7cb29a034301e2982df8b1fe6328a84f4b676106a13e9135a0d7e0c3e9f806da"}, - {file = "xxhash-3.5.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5d0d307d27099bb0cbeea7260eb39ed4fdb99c5542e21e94bb6fd29e49c57a23"}, - {file = "xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c0342aafd421795d740e514bc9858ebddfc705a75a8c5046ac56d85fe97bf196"}, - {file = "xxhash-3.5.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3dbbd9892c5ebffeca1ed620cf0ade13eb55a0d8c84e0751a6653adc6ac40d0c"}, - {file = "xxhash-3.5.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:4cc2d67fdb4d057730c75a64c5923abfa17775ae234a71b0200346bfb0a7f482"}, - {file = "xxhash-3.5.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:ec28adb204b759306a3d64358a5e5c07d7b1dd0ccbce04aa76cb9377b7b70296"}, - {file = "xxhash-3.5.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:1328f6d8cca2b86acb14104e381225a3d7b42c92c4b86ceae814e5c400dbb415"}, - {file = "xxhash-3.5.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:8d47ebd9f5d9607fd039c1fbf4994e3b071ea23eff42f4ecef246ab2b7334198"}, - {file = "xxhash-3.5.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:b96d559e0fcddd3343c510a0fe2b127fbff16bf346dd76280b82292567523442"}, - {file = "xxhash-3.5.0-cp310-cp310-win32.whl", hash = "sha256:61c722ed8d49ac9bc26c7071eeaa1f6ff24053d553146d5df031802deffd03da"}, - {file = "xxhash-3.5.0-cp310-cp310-win_amd64.whl", hash = "sha256:9bed5144c6923cc902cd14bb8963f2d5e034def4486ab0bbe1f58f03f042f9a9"}, - {file = "xxhash-3.5.0-cp310-cp310-win_arm64.whl", hash = "sha256:893074d651cf25c1cc14e3bea4fceefd67f2921b1bb8e40fcfeba56820de80c6"}, - {file = "xxhash-3.5.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:02c2e816896dc6f85922ced60097bcf6f008dedfc5073dcba32f9c8dd786f3c1"}, - {file = "xxhash-3.5.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:6027dcd885e21581e46d3c7f682cfb2b870942feeed58a21c29583512c3f09f8"}, - {file = "xxhash-3.5.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1308fa542bbdbf2fa85e9e66b1077eea3a88bef38ee8a06270b4298a7a62a166"}, - {file = "xxhash-3.5.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c28b2fdcee797e1c1961cd3bcd3d545cab22ad202c846235197935e1df2f8ef7"}, - {file = "xxhash-3.5.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:924361811732ddad75ff23e90efd9ccfda4f664132feecb90895bade6a1b4623"}, - {file = "xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:89997aa1c4b6a5b1e5b588979d1da048a3c6f15e55c11d117a56b75c84531f5a"}, - {file = "xxhash-3.5.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:685c4f4e8c59837de103344eb1c8a3851f670309eb5c361f746805c5471b8c88"}, - {file = "xxhash-3.5.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:dbd2ecfbfee70bc1a4acb7461fa6af7748ec2ab08ac0fa298f281c51518f982c"}, - {file = "xxhash-3.5.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:25b5a51dc3dfb20a10833c8eee25903fd2e14059e9afcd329c9da20609a307b2"}, - {file = "xxhash-3.5.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:a8fb786fb754ef6ff8c120cb96629fb518f8eb5a61a16aac3a979a9dbd40a084"}, - {file = "xxhash-3.5.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:a905ad00ad1e1c34fe4e9d7c1d949ab09c6fa90c919860c1534ff479f40fd12d"}, - {file = "xxhash-3.5.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:963be41bcd49f53af6d795f65c0da9b4cc518c0dd9c47145c98f61cb464f4839"}, - {file = "xxhash-3.5.0-cp311-cp311-win32.whl", hash = "sha256:109b436096d0a2dd039c355fa3414160ec4d843dfecc64a14077332a00aeb7da"}, - {file = "xxhash-3.5.0-cp311-cp311-win_amd64.whl", hash = "sha256:b702f806693201ad6c0a05ddbbe4c8f359626d0b3305f766077d51388a6bac58"}, - {file = "xxhash-3.5.0-cp311-cp311-win_arm64.whl", hash = "sha256:c4dcb4120d0cc3cc448624147dba64e9021b278c63e34a38789b688fd0da9bf3"}, - {file = "xxhash-3.5.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:14470ace8bd3b5d51318782cd94e6f94431974f16cb3b8dc15d52f3b69df8e00"}, - {file = "xxhash-3.5.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:59aa1203de1cb96dbeab595ded0ad0c0056bb2245ae11fac11c0ceea861382b9"}, - {file = "xxhash-3.5.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:08424f6648526076e28fae6ea2806c0a7d504b9ef05ae61d196d571e5c879c84"}, - {file = "xxhash-3.5.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:61a1ff00674879725b194695e17f23d3248998b843eb5e933007ca743310f793"}, - {file = "xxhash-3.5.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f2f2c61bee5844d41c3eb015ac652a0229e901074951ae48581d58bfb2ba01be"}, - {file = "xxhash-3.5.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9d32a592cac88d18cc09a89172e1c32d7f2a6e516c3dfde1b9adb90ab5df54a6"}, - {file = "xxhash-3.5.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:70dabf941dede727cca579e8c205e61121afc9b28516752fd65724be1355cc90"}, - {file = "xxhash-3.5.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:e5d0ddaca65ecca9c10dcf01730165fd858533d0be84c75c327487c37a906a27"}, - {file = "xxhash-3.5.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:3e5b5e16c5a480fe5f59f56c30abdeba09ffd75da8d13f6b9b6fd224d0b4d0a2"}, - {file = "xxhash-3.5.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:149b7914451eb154b3dfaa721315117ea1dac2cc55a01bfbd4df7c68c5dd683d"}, - {file = "xxhash-3.5.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:eade977f5c96c677035ff39c56ac74d851b1cca7d607ab3d8f23c6b859379cab"}, - {file = "xxhash-3.5.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:fa9f547bd98f5553d03160967866a71056a60960be00356a15ecc44efb40ba8e"}, - {file = "xxhash-3.5.0-cp312-cp312-win32.whl", hash = "sha256:f7b58d1fd3551b8c80a971199543379be1cee3d0d409e1f6d8b01c1a2eebf1f8"}, - {file = "xxhash-3.5.0-cp312-cp312-win_amd64.whl", hash = 
"sha256:fa0cafd3a2af231b4e113fba24a65d7922af91aeb23774a8b78228e6cd785e3e"}, - {file = "xxhash-3.5.0-cp312-cp312-win_arm64.whl", hash = "sha256:586886c7e89cb9828bcd8a5686b12e161368e0064d040e225e72607b43858ba2"}, - {file = "xxhash-3.5.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:37889a0d13b0b7d739cfc128b1c902f04e32de17b33d74b637ad42f1c55101f6"}, - {file = "xxhash-3.5.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:97a662338797c660178e682f3bc180277b9569a59abfb5925e8620fba00b9fc5"}, - {file = "xxhash-3.5.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7f85e0108d51092bdda90672476c7d909c04ada6923c14ff9d913c4f7dc8a3bc"}, - {file = "xxhash-3.5.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:cd2fd827b0ba763ac919440042302315c564fdb797294d86e8cdd4578e3bc7f3"}, - {file = "xxhash-3.5.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:82085c2abec437abebf457c1d12fccb30cc8b3774a0814872511f0f0562c768c"}, - {file = "xxhash-3.5.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:07fda5de378626e502b42b311b049848c2ef38784d0d67b6f30bb5008642f8eb"}, - {file = "xxhash-3.5.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c279f0d2b34ef15f922b77966640ade58b4ccdfef1c4d94b20f2a364617a493f"}, - {file = "xxhash-3.5.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:89e66ceed67b213dec5a773e2f7a9e8c58f64daeb38c7859d8815d2c89f39ad7"}, - {file = "xxhash-3.5.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:bcd51708a633410737111e998ceb3b45d3dbc98c0931f743d9bb0a209033a326"}, - {file = "xxhash-3.5.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:3ff2c0a34eae7df88c868be53a8dd56fbdf592109e21d4bfa092a27b0bf4a7bf"}, - {file = "xxhash-3.5.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:4e28503dccc7d32e0b9817aa0cbfc1f45f563b2c995b7a66c4c8a0d232e840c7"}, - {file = "xxhash-3.5.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:a6c50017518329ed65a9e4829154626f008916d36295b6a3ba336e2458824c8c"}, - {file = "xxhash-3.5.0-cp313-cp313-win32.whl", hash = "sha256:53a068fe70301ec30d868ece566ac90d873e3bb059cf83c32e76012c889b8637"}, - {file = "xxhash-3.5.0-cp313-cp313-win_amd64.whl", hash = "sha256:80babcc30e7a1a484eab952d76a4f4673ff601f54d5142c26826502740e70b43"}, - {file = "xxhash-3.5.0-cp313-cp313-win_arm64.whl", hash = "sha256:4811336f1ce11cac89dcbd18f3a25c527c16311709a89313c3acaf771def2d4b"}, - {file = "xxhash-3.5.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:6e5f70f6dca1d3b09bccb7daf4e087075ff776e3da9ac870f86ca316736bb4aa"}, - {file = "xxhash-3.5.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2e76e83efc7b443052dd1e585a76201e40b3411fe3da7af4fe434ec51b2f163b"}, - {file = "xxhash-3.5.0-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:33eac61d0796ca0591f94548dcfe37bb193671e0c9bcf065789b5792f2eda644"}, - {file = "xxhash-3.5.0-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0ec70a89be933ea49222fafc3999987d7899fc676f688dd12252509434636622"}, - {file = "xxhash-3.5.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd86b8e7f703ec6ff4f351cfdb9f428955859537125904aa8c963604f2e9d3e7"}, - {file = "xxhash-3.5.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0adfbd36003d9f86c8c97110039f7539b379f28656a04097e7434d3eaf9aa131"}, - {file = 
"xxhash-3.5.0-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:63107013578c8a730419adc05608756c3fa640bdc6abe806c3123a49fb829f43"}, - {file = "xxhash-3.5.0-cp37-cp37m-musllinux_1_2_i686.whl", hash = "sha256:683b94dbd1ca67557850b86423318a2e323511648f9f3f7b1840408a02b9a48c"}, - {file = "xxhash-3.5.0-cp37-cp37m-musllinux_1_2_ppc64le.whl", hash = "sha256:5d2a01dcce81789cf4b12d478b5464632204f4c834dc2d064902ee27d2d1f0ee"}, - {file = "xxhash-3.5.0-cp37-cp37m-musllinux_1_2_s390x.whl", hash = "sha256:a9d360a792cbcce2fe7b66b8d51274ec297c53cbc423401480e53b26161a290d"}, - {file = "xxhash-3.5.0-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:f0b48edbebea1b7421a9c687c304f7b44d0677c46498a046079d445454504737"}, - {file = "xxhash-3.5.0-cp37-cp37m-win32.whl", hash = "sha256:7ccb800c9418e438b44b060a32adeb8393764da7441eb52aa2aa195448935306"}, - {file = "xxhash-3.5.0-cp37-cp37m-win_amd64.whl", hash = "sha256:c3bc7bf8cb8806f8d1c9bf149c18708cb1c406520097d6b0a73977460ea03602"}, - {file = "xxhash-3.5.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:74752ecaa544657d88b1d1c94ae68031e364a4d47005a90288f3bab3da3c970f"}, - {file = "xxhash-3.5.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:dee1316133c9b463aa81aca676bc506d3f80d8f65aeb0bba2b78d0b30c51d7bd"}, - {file = "xxhash-3.5.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:602d339548d35a8579c6b013339fb34aee2df9b4e105f985443d2860e4d7ffaa"}, - {file = "xxhash-3.5.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:695735deeddfb35da1677dbc16a083445360e37ff46d8ac5c6fcd64917ff9ade"}, - {file = "xxhash-3.5.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1030a39ba01b0c519b1a82f80e8802630d16ab95dc3f2b2386a0b5c8ed5cbb10"}, - {file = "xxhash-3.5.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a5bc08f33c4966f4eb6590d6ff3ceae76151ad744576b5fc6c4ba8edd459fdec"}, - {file = "xxhash-3.5.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:160e0c19ee500482ddfb5d5570a0415f565d8ae2b3fd69c5dcfce8a58107b1c3"}, - {file = "xxhash-3.5.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:f1abffa122452481a61c3551ab3c89d72238e279e517705b8b03847b1d93d738"}, - {file = "xxhash-3.5.0-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:d5e9db7ef3ecbfc0b4733579cea45713a76852b002cf605420b12ef3ef1ec148"}, - {file = "xxhash-3.5.0-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:23241ff6423378a731d84864bf923a41649dc67b144debd1077f02e6249a0d54"}, - {file = "xxhash-3.5.0-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:82b833d5563fefd6fceafb1aed2f3f3ebe19f84760fdd289f8b926731c2e6e91"}, - {file = "xxhash-3.5.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:0a80ad0ffd78bef9509eee27b4a29e56f5414b87fb01a888353e3d5bda7038bd"}, - {file = "xxhash-3.5.0-cp38-cp38-win32.whl", hash = "sha256:50ac2184ffb1b999e11e27c7e3e70cc1139047e7ebc1aa95ed12f4269abe98d4"}, - {file = "xxhash-3.5.0-cp38-cp38-win_amd64.whl", hash = "sha256:392f52ebbb932db566973693de48f15ce787cabd15cf6334e855ed22ea0be5b3"}, - {file = "xxhash-3.5.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:bfc8cdd7f33d57f0468b0614ae634cc38ab9202c6957a60e31d285a71ebe0301"}, - {file = "xxhash-3.5.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:e0c48b6300cd0b0106bf49169c3e0536408dfbeb1ccb53180068a18b03c662ab"}, - {file = "xxhash-3.5.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:fe1a92cfbaa0a1253e339ccec42dbe6db262615e52df591b68726ab10338003f"}, - {file = "xxhash-3.5.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:33513d6cc3ed3b559134fb307aae9bdd94d7e7c02907b37896a6c45ff9ce51bd"}, - {file = "xxhash-3.5.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:eefc37f6138f522e771ac6db71a6d4838ec7933939676f3753eafd7d3f4c40bc"}, - {file = "xxhash-3.5.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a606c8070ada8aa2a88e181773fa1ef17ba65ce5dd168b9d08038e2a61b33754"}, - {file = "xxhash-3.5.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:42eca420c8fa072cc1dd62597635d140e78e384a79bb4944f825fbef8bfeeef6"}, - {file = "xxhash-3.5.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:604253b2143e13218ff1ef0b59ce67f18b8bd1c4205d2ffda22b09b426386898"}, - {file = "xxhash-3.5.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:6e93a5ad22f434d7876665444a97e713a8f60b5b1a3521e8df11b98309bff833"}, - {file = "xxhash-3.5.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:7a46e1d6d2817ba8024de44c4fd79913a90e5f7265434cef97026215b7d30df6"}, - {file = "xxhash-3.5.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:30eb2efe6503c379b7ab99c81ba4a779748e3830241f032ab46bd182bf5873af"}, - {file = "xxhash-3.5.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:c8aa771ff2c13dd9cda8166d685d7333d389fae30a4d2bb39d63ab5775de8606"}, - {file = "xxhash-3.5.0-cp39-cp39-win32.whl", hash = "sha256:5ed9ebc46f24cf91034544b26b131241b699edbfc99ec5e7f8f3d02d6eb7fba4"}, - {file = "xxhash-3.5.0-cp39-cp39-win_amd64.whl", hash = "sha256:220f3f896c6b8d0316f63f16c077d52c412619e475f9372333474ee15133a558"}, - {file = "xxhash-3.5.0-cp39-cp39-win_arm64.whl", hash = "sha256:a7b1d8315d9b5e9f89eb2933b73afae6ec9597a258d52190944437158b49d38e"}, - {file = "xxhash-3.5.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:2014c5b3ff15e64feecb6b713af12093f75b7926049e26a580e94dcad3c73d8c"}, - {file = "xxhash-3.5.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fab81ef75003eda96239a23eda4e4543cedc22e34c373edcaf744e721a163986"}, - {file = "xxhash-3.5.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4e2febf914ace002132aa09169cc572e0d8959d0f305f93d5828c4836f9bc5a6"}, - {file = "xxhash-3.5.0-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5d3a10609c51da2a1c0ea0293fc3968ca0a18bd73838455b5bca3069d7f8e32b"}, - {file = "xxhash-3.5.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:5a74f23335b9689b66eb6dbe2a931a88fcd7a4c2cc4b1cb0edba8ce381c7a1da"}, - {file = "xxhash-3.5.0-pp37-pypy37_pp73-macosx_10_9_x86_64.whl", hash = "sha256:2b4154c00eb22e4d543f472cfca430e7962a0f1d0f3778334f2e08a7ba59363c"}, - {file = "xxhash-3.5.0-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d30bbc1644f726b825b3278764240f449d75f1a8bdda892e641d4a688b1494ae"}, - {file = "xxhash-3.5.0-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6fa0b72f2423e2aa53077e54a61c28e181d23effeaafd73fcb9c494e60930c8e"}, - {file = "xxhash-3.5.0-pp37-pypy37_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:13de2b76c1835399b2e419a296d5b38dc4855385d9e96916299170085ef72f57"}, - {file = "xxhash-3.5.0-pp37-pypy37_pp73-win_amd64.whl", hash = 
"sha256:0691bfcc4f9c656bcb96cc5db94b4d75980b9d5589f2e59de790091028580837"}, - {file = "xxhash-3.5.0-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:297595fe6138d4da2c8ce9e72a04d73e58725bb60f3a19048bc96ab2ff31c692"}, - {file = "xxhash-3.5.0-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cc1276d369452040cbb943300dc8abeedab14245ea44056a2943183822513a18"}, - {file = "xxhash-3.5.0-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2061188a1ba352fc699c82bff722f4baacb4b4b8b2f0c745d2001e56d0dfb514"}, - {file = "xxhash-3.5.0-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:38c384c434021e4f62b8d9ba0bc9467e14d394893077e2c66d826243025e1f81"}, - {file = "xxhash-3.5.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:e6a4dd644d72ab316b580a1c120b375890e4c52ec392d4aef3c63361ec4d77d1"}, - {file = "xxhash-3.5.0-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:531af8845aaadcadf951b7e0c1345c6b9c68a990eeb74ff9acd8501a0ad6a1c9"}, - {file = "xxhash-3.5.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7ce379bcaa9fcc00f19affa7773084dd09f5b59947b3fb47a1ceb0179f91aaa1"}, - {file = "xxhash-3.5.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fd1b2281d01723f076df3c8188f43f2472248a6b63118b036e641243656b1b0f"}, - {file = "xxhash-3.5.0-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9c770750cc80e8694492244bca7251385188bc5597b6a39d98a9f30e8da984e0"}, - {file = "xxhash-3.5.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:b150b8467852e1bd844387459aa6fbe11d7f38b56e901f9f3b3e6aba0d660240"}, - {file = "xxhash-3.5.0.tar.gz", hash = "sha256:84f2caddf951c9cbf8dc2e22a89d4ccf5d86391ac6418fe81e3c67d0cf60b45f"}, -] - -[[package]] -name = "yarl" -version = "1.16.0" -description = "Yet another URL library" -optional = false -python-versions = ">=3.9" -files = [ - {file = "yarl-1.16.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:32468f41242d72b87ab793a86d92f885355bcf35b3355aa650bfa846a5c60058"}, - {file = "yarl-1.16.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:234f3a3032b505b90e65b5bc6652c2329ea7ea8855d8de61e1642b74b4ee65d2"}, - {file = "yarl-1.16.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8a0296040e5cddf074c7f5af4a60f3fc42c0237440df7bcf5183be5f6c802ed5"}, - {file = "yarl-1.16.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:de6c14dd7c7c0badba48157474ea1f03ebee991530ba742d381b28d4f314d6f3"}, - {file = "yarl-1.16.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b140e532fe0266003c936d017c1ac301e72ee4a3fd51784574c05f53718a55d8"}, - {file = "yarl-1.16.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:019f5d58093402aa8f6661e60fd82a28746ad6d156f6c5336a70a39bd7b162b9"}, - {file = "yarl-1.16.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8c42998fd1cbeb53cd985bff0e4bc25fbe55fd6eb3a545a724c1012d69d5ec84"}, - {file = "yarl-1.16.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7c7c30fb38c300fe8140df30a046a01769105e4cf4282567a29b5cdb635b66c4"}, - {file = "yarl-1.16.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:e49e0fd86c295e743fd5be69b8b0712f70a686bc79a16e5268386c2defacaade"}, - {file = "yarl-1.16.0-cp310-cp310-musllinux_1_2_armv7l.whl", hash = 
"sha256:b9ca7b9147eb1365c8bab03c003baa1300599575effad765e0b07dd3501ea9af"}, - {file = "yarl-1.16.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:27e11db3f1e6a51081a981509f75617b09810529de508a181319193d320bc5c7"}, - {file = "yarl-1.16.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:8994c42f4ca25df5380ddf59f315c518c81df6a68fed5bb0c159c6cb6b92f120"}, - {file = "yarl-1.16.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:542fa8e09a581bcdcbb30607c7224beff3fdfb598c798ccd28a8184ffc18b7eb"}, - {file = "yarl-1.16.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:2bd6a51010c7284d191b79d3b56e51a87d8e1c03b0902362945f15c3d50ed46b"}, - {file = "yarl-1.16.0-cp310-cp310-win32.whl", hash = "sha256:178ccb856e265174a79f59721031060f885aca428983e75c06f78aa24b91d929"}, - {file = "yarl-1.16.0-cp310-cp310-win_amd64.whl", hash = "sha256:fe8bba2545427418efc1929c5c42852bdb4143eb8d0a46b09de88d1fe99258e7"}, - {file = "yarl-1.16.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:d8643975a0080f361639787415a038bfc32d29208a4bf6b783ab3075a20b1ef3"}, - {file = "yarl-1.16.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:676d96bafc8c2d0039cea0cd3fd44cee7aa88b8185551a2bb93354668e8315c2"}, - {file = "yarl-1.16.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d9525f03269e64310416dbe6c68d3b23e5d34aaa8f47193a1c45ac568cecbc49"}, - {file = "yarl-1.16.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8b37d5ec034e668b22cf0ce1074d6c21fd2a08b90d11b1b73139b750a8b0dd97"}, - {file = "yarl-1.16.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4f32c4cb7386b41936894685f6e093c8dfaf0960124d91fe0ec29fe439e201d0"}, - {file = "yarl-1.16.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5b8e265a0545637492a7e12fd7038370d66c9375a61d88c5567d0e044ded9202"}, - {file = "yarl-1.16.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:789a3423f28a5fff46fbd04e339863c169ece97c827b44de16e1a7a42bc915d2"}, - {file = "yarl-1.16.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f1d1f45e3e8d37c804dca99ab3cf4ab3ed2e7a62cd82542924b14c0a4f46d243"}, - {file = "yarl-1.16.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:621280719c4c5dad4c1391160a9b88925bb8b0ff6a7d5af3224643024871675f"}, - {file = "yarl-1.16.0-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:ed097b26f18a1f5ff05f661dc36528c5f6735ba4ce8c9645e83b064665131349"}, - {file = "yarl-1.16.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:2f1fe2b2e3ee418862f5ebc0c0083c97f6f6625781382f828f6d4e9b614eba9b"}, - {file = "yarl-1.16.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:87dd10bc0618991c66cee0cc65fa74a45f4ecb13bceec3c62d78ad2e42b27a16"}, - {file = "yarl-1.16.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:4199db024b58a8abb2cfcedac7b1292c3ad421684571aeb622a02f242280e8d6"}, - {file = "yarl-1.16.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:99a9dcd4b71dd5f5f949737ab3f356cfc058c709b4f49833aeffedc2652dac56"}, - {file = "yarl-1.16.0-cp311-cp311-win32.whl", hash = "sha256:a9394c65ae0ed95679717d391c862dece9afacd8fa311683fc8b4362ce8a410c"}, - {file = "yarl-1.16.0-cp311-cp311-win_amd64.whl", hash = "sha256:5b9101f528ae0f8f65ac9d64dda2bb0627de8a50344b2f582779f32fda747c1d"}, - {file = "yarl-1.16.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:4ffb7c129707dd76ced0a4a4128ff452cecf0b0e929f2668ea05a371d9e5c104"}, - {file = 
"yarl-1.16.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:1a5e9d8ce1185723419c487758d81ac2bde693711947032cce600ca7c9cda7d6"}, - {file = "yarl-1.16.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:d743e3118b2640cef7768ea955378c3536482d95550222f908f392167fe62059"}, - {file = "yarl-1.16.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:26768342f256e6e3c37533bf9433f5f15f3e59e3c14b2409098291b3efaceacb"}, - {file = "yarl-1.16.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d1b0796168b953bca6600c5f97f5ed407479889a36ad7d17183366260f29a6b9"}, - {file = "yarl-1.16.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:858728086914f3a407aa7979cab743bbda1fe2bdf39ffcd991469a370dd7414d"}, - {file = "yarl-1.16.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5570e6d47bcb03215baf4c9ad7bf7c013e56285d9d35013541f9ac2b372593e7"}, - {file = "yarl-1.16.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:66ea8311422a7ba1fc79b4c42c2baa10566469fe5a78500d4e7754d6e6db8724"}, - {file = "yarl-1.16.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:649bddcedee692ee8a9b7b6e38582cb4062dc4253de9711568e5620d8707c2a3"}, - {file = "yarl-1.16.0-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:3a91654adb7643cb21b46f04244c5a315a440dcad63213033826549fa2435f71"}, - {file = "yarl-1.16.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:b439cae82034ade094526a8f692b9a2b5ee936452de5e4c5f0f6c48df23f8604"}, - {file = "yarl-1.16.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:571f781ae8ac463ce30bacebfaef2c6581543776d5970b2372fbe31d7bf31a07"}, - {file = "yarl-1.16.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:aa7943f04f36d6cafc0cf53ea89824ac2c37acbdb4b316a654176ab8ffd0f968"}, - {file = "yarl-1.16.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:1a5cf32539373ff39d97723e39a9283a7277cbf1224f7aef0c56c9598b6486c3"}, - {file = "yarl-1.16.0-cp312-cp312-win32.whl", hash = "sha256:a5b6c09b9b4253d6a208b0f4a2f9206e511ec68dce9198e0fbec4f160137aa67"}, - {file = "yarl-1.16.0-cp312-cp312-win_amd64.whl", hash = "sha256:1208ca14eed2fda324042adf8d6c0adf4a31522fa95e0929027cd487875f0240"}, - {file = "yarl-1.16.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:a5ace0177520bd4caa99295a9b6fb831d0e9a57d8e0501a22ffaa61b4c024283"}, - {file = "yarl-1.16.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:7118bdb5e3ed81acaa2095cba7ec02a0fe74b52a16ab9f9ac8e28e53ee299732"}, - {file = "yarl-1.16.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:38fec8a2a94c58bd47c9a50a45d321ab2285ad133adefbbadf3012c054b7e656"}, - {file = "yarl-1.16.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8791d66d81ee45866a7bb15a517b01a2bcf583a18ebf5d72a84e6064c417e64b"}, - {file = "yarl-1.16.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1cf936ba67bc6c734f3aa1c01391da74ab7fc046a9f8bbfa230b8393b90cf472"}, - {file = "yarl-1.16.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d1aab176dd55b59f77a63b27cffaca67d29987d91a5b615cbead41331e6b7428"}, - {file = "yarl-1.16.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:995d0759004c08abd5d1b81300a91d18c8577c6389300bed1c7c11675105a44d"}, - {file = "yarl-1.16.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:1bc22e00edeb068f71967ab99081e9406cd56dbed864fc3a8259442999d71552"}, - {file = "yarl-1.16.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:35b4f7842154176523e0a63c9b871168c69b98065d05a4f637fce342a6a2693a"}, - {file = "yarl-1.16.0-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:7ace71c4b7a0c41f317ae24be62bb61e9d80838d38acb20e70697c625e71f120"}, - {file = "yarl-1.16.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:8f639e3f5795a6568aa4f7d2ac6057c757dcd187593679f035adbf12b892bb00"}, - {file = "yarl-1.16.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:e8be3aff14f0120ad049121322b107f8a759be76a6a62138322d4c8a337a9e2c"}, - {file = "yarl-1.16.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:122d8e7986043d0549e9eb23c7fd23be078be4b70c9eb42a20052b3d3149c6f2"}, - {file = "yarl-1.16.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:0fd9c227990f609c165f56b46107d0bc34553fe0387818c42c02f77974402c36"}, - {file = "yarl-1.16.0-cp313-cp313-win32.whl", hash = "sha256:595ca5e943baed31d56b33b34736461a371c6ea0038d3baec399949dd628560b"}, - {file = "yarl-1.16.0-cp313-cp313-win_amd64.whl", hash = "sha256:921b81b8d78f0e60242fb3db615ea3f368827a76af095d5a69f1c3366db3f596"}, - {file = "yarl-1.16.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:ab2b2ac232110a1fdb0d3ffcd087783edd3d4a6ced432a1bf75caf7b7be70916"}, - {file = "yarl-1.16.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7f8713717a09acbfee7c47bfc5777e685539fefdd34fa72faf504c8be2f3df4e"}, - {file = "yarl-1.16.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:cdcffe1dbcb4477d2b4202f63cd972d5baa155ff5a3d9e35801c46a415b7f71a"}, - {file = "yarl-1.16.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9a91217208306d82357c67daeef5162a41a28c8352dab7e16daa82e3718852a7"}, - {file = "yarl-1.16.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3ab3ed42c78275477ea8e917491365e9a9b69bb615cb46169020bd0aa5e2d6d3"}, - {file = "yarl-1.16.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:707ae579ccb3262dfaef093e202b4c3fb23c3810e8df544b1111bd2401fd7b09"}, - {file = "yarl-1.16.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ad7a852d1cd0b8d8b37fc9d7f8581152add917a98cfe2ea6e241878795f917ae"}, - {file = "yarl-1.16.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d3f1cc3d3d4dc574bebc9b387f6875e228ace5748a7c24f49d8f01ac1bc6c31b"}, - {file = "yarl-1.16.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:5ff96da263740779b0893d02b718293cc03400c3a208fc8d8cd79d9b0993e532"}, - {file = "yarl-1.16.0-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:3d375a19ba2bfe320b6d873f3fb165313b002cef8b7cc0a368ad8b8a57453837"}, - {file = "yarl-1.16.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:62c7da0ad93a07da048b500514ca47b759459ec41924143e2ddb5d7e20fd3db5"}, - {file = "yarl-1.16.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:147b0fcd0ee33b4b5f6edfea80452d80e419e51b9a3f7a96ce98eaee145c1581"}, - {file = "yarl-1.16.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:504e1fe1cc4f170195320eb033d2b0ccf5c6114ce5bf2f617535c01699479bca"}, - {file = "yarl-1.16.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:bdcf667a5dec12a48f669e485d70c54189f0639c2157b538a4cffd24a853624f"}, - {file = "yarl-1.16.0-cp39-cp39-win32.whl", hash = "sha256:e9951afe6557c75a71045148890052cb942689ee4c9ec29f5436240e1fcc73b7"}, - {file = "yarl-1.16.0-cp39-cp39-win_amd64.whl", hash = 
"sha256:7d7aaa8ff95d0840e289423e7dc35696c2b058d635f945bf05b5cd633146b027"}, - {file = "yarl-1.16.0-py3-none-any.whl", hash = "sha256:e6980a558d8461230c457218bd6c92dfc1d10205548215c2c21d79dc8d0a96f3"}, - {file = "yarl-1.16.0.tar.gz", hash = "sha256:b6f687ced5510a9a2474bbae96a4352e5ace5fa34dc44a217b0537fec1db00b4"}, -] - -[package.dependencies] -idna = ">=2.0" -multidict = ">=4.0" -propcache = ">=0.2.0" - [[package]] name = "zipp" -version = "3.20.2" +version = "3.21.0" description = "Backport of pathlib-compatible object wrapper for zip files" optional = false -python-versions = ">=3.8" +python-versions = ">=3.9" +groups = ["main"] files = [ - {file = "zipp-3.20.2-py3-none-any.whl", hash = "sha256:a817ac80d6cf4b23bf7f2828b7cabf326f15a001bea8b1f9b49631780ba28350"}, - {file = "zipp-3.20.2.tar.gz", hash = "sha256:bc9eb26f4506fda01b81bcde0ca78103b6e62f991b381fec825435c836edbc29"}, + {file = "zipp-3.21.0-py3-none-any.whl", hash = "sha256:ac1bbe05fd2991f160ebce24ffbac5f6d11d83dc90891255885223d42b3cd931"}, + {file = "zipp-3.21.0.tar.gz", hash = "sha256:2c9958f6430a2040341a52eb608ed6dd93ef4392e02ffe219417c1b28b5dd1f4"}, ] [package.extras] @@ -4014,6 +3009,6 @@ test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", type = ["pytest-mypy"] [metadata] -lock-version = "2.0" +lock-version = "2.1" python-versions = ">=3.9,<3.13" -content-hash = "d169e52a2daaf41488cf7191e3f4ff209d39125941a6d0e656001aae534b1e8d" +content-hash = "cb3921d3df77dd5a7c9c7f09fcdf4f4f61b307b04e5bd58c52cfb299ae053da3" diff --git a/backends/gaudi/server/pyproject.toml b/backends/gaudi/server/pyproject.toml index b38f45627..3f2676cbe 100644 --- a/backends/gaudi/server/pyproject.toml +++ b/backends/gaudi/server/pyproject.toml @@ -9,30 +9,30 @@ text-generation-server = 'text_generation_server.cli:app' [tool.poetry.dependencies] python = ">=3.9,<3.13" -protobuf = "^3.20.3" -grpcio = "^1.51.1" +protobuf = "^5.0" +grpcio = "^1.71.1" grpcio-status = "*" grpcio-reflection = "*" grpc-interceptor = "^0.15.0" -typer = "^0.7.0" -loguru = "^0.6.0" -opentelemetry-api = "^1.15.0" -opentelemetry-exporter-otlp = "^1.15.0" -opentelemetry-instrumentation-grpc = "^0.36b0" -hf-transfer = "^0.1.2" -sentencepiece = "^0.1.97" -peft = "^0.10" -optimum-habana = "1.16.0" -transformers = "4.45.2" -numpy = "1.26.4" -accelerate = "0.33.0" +typer = "^0.15.0" +loguru = "^0.7.3" +opentelemetry-api = "^1.32.0" +opentelemetry-exporter-otlp = "^1.32.0" +opentelemetry-instrumentation-grpc = "^0.53b0" +hf-transfer = "^0.1.9" +sentencepiece = "^0.2.0" +peft = "^0.15" +optimum-habana = "1.17" +transformers = "^4.49" +numpy = "^1.26" +accelerate = "^0.33" outlines= { version = "^0.0.36", optional = true } -prometheus-client = "^0.20.0" +prometheus-client = "^0.21.1" py-cpuinfo = "^9.0.0" [tool.poetry.group.dev.dependencies] grpcio-tools = "*" -pytest = "^7.3.0" +pytest = "^8.3.5" [tool.pytest.ini_options] markers = ["private: marks tests as requiring an admin hf token (deselect with '-m \"not private\"')"] @@ -40,3 +40,6 @@ markers = ["private: marks tests as requiring an admin hf token (deselect with ' [build-system] requires = ["poetry-core>=1.0.0"] build-backend = "poetry.core.masonry.api" + +[tool.poetry.requires-plugins] +poetry-plugin-export = ">=1.8" diff --git a/backends/gaudi/server/requirements.txt b/backends/gaudi/server/requirements.txt index b71d29d95..6644f8cfb 100644 --- a/backends/gaudi/server/requirements.txt +++ b/backends/gaudi/server/requirements.txt @@ -1,104 +1,101 @@ accelerate==0.33.0 ; python_version >= "3.9" and 
python_version < "3.13" -aiohappyeyeballs==2.4.3 ; python_version >= "3.9" and python_version < "3.13" -aiohttp==3.10.10 ; python_version >= "3.9" and python_version < "3.13" -aiosignal==1.3.1 ; python_version >= "3.9" and python_version < "3.13" -async-timeout==4.0.3 ; python_version >= "3.9" and python_version < "3.11" -attrs==24.2.0 ; python_version >= "3.9" and python_version < "3.13" -backoff==2.2.1 ; python_version >= "3.9" and python_version < "3.13" -certifi==2024.8.30 ; python_version >= "3.9" and python_version < "3.13" -charset-normalizer==3.4.0 ; python_version >= "3.9" and python_version < "3.13" -click==8.1.7 ; python_version >= "3.9" and python_version < "3.13" -colorama==0.4.6 ; python_version >= "3.9" and python_version < "3.13" and (sys_platform == "win32" or platform_system == "Windows") -coloredlogs==15.0.1 ; python_version >= "3.9" and python_version < "3.13" -datasets==3.0.1 ; python_version >= "3.9" and python_version < "3.13" -deprecated==1.2.14 ; python_version >= "3.9" and python_version < "3.13" -diffusers==0.31.0 ; python_version >= "3.9" and python_version < "3.13" -dill==0.3.7 ; python_version >= "3.9" and python_version < "3.13" -filelock==3.16.1 ; python_version >= "3.9" and python_version < "3.13" -frozenlist==1.4.1 ; python_version >= "3.9" and python_version < "3.13" -fsspec==2024.6.1 ; python_version >= "3.9" and python_version < "3.13" -fsspec[http]==2024.6.1 ; python_version >= "3.9" and python_version < "3.13" -googleapis-common-protos==1.65.0 ; python_version >= "3.9" and python_version < "3.13" -grpc-interceptor==0.15.4 ; python_version >= "3.9" and python_version < "3.13" -grpcio-reflection==1.48.2 ; python_version >= "3.9" and python_version < "3.13" -grpcio-status==1.48.2 ; python_version >= "3.9" and python_version < "3.13" -grpcio==1.67.0 ; python_version >= "3.9" and python_version < "3.13" -hf-transfer==0.1.8 ; python_version >= "3.9" and python_version < "3.13" -huggingface-hub==0.26.1 ; python_version >= "3.9" and python_version < "3.13" -humanfriendly==10.0 ; python_version >= "3.9" and python_version < "3.13" -idna==3.10 ; python_version >= "3.9" and python_version < "3.13" -importlib-metadata==8.5.0 ; python_version >= "3.9" and python_version < "3.13" -jinja2==3.1.4 ; python_version >= "3.9" and python_version < "3.13" -joblib==1.4.2 ; python_version >= "3.9" and python_version < "3.13" -loguru==0.6.0 ; python_version >= "3.9" and python_version < "3.13" -markupsafe==3.0.2 ; python_version >= "3.9" and python_version < "3.13" -mpmath==1.3.0 ; python_version >= "3.9" and python_version < "3.13" -multidict==6.1.0 ; python_version >= "3.9" and python_version < "3.13" -multiprocess==0.70.15 ; python_version >= "3.9" and python_version < "3.13" -networkx==3.2.1 ; python_version >= "3.9" and python_version < "3.13" -numpy==1.26.4 ; python_version >= "3.9" and python_version < "3.13" -opentelemetry-api==1.15.0 ; python_version >= "3.9" and python_version < "3.13" -opentelemetry-exporter-otlp-proto-grpc==1.15.0 ; python_version >= "3.9" and python_version < "3.13" -opentelemetry-exporter-otlp-proto-http==1.15.0 ; python_version >= "3.9" and python_version < "3.13" -opentelemetry-exporter-otlp==1.15.0 ; python_version >= "3.9" and python_version < "3.13" -opentelemetry-instrumentation-grpc==0.36b0 ; python_version >= "3.9" and python_version < "3.13" -opentelemetry-instrumentation==0.36b0 ; python_version >= "3.9" and python_version < "3.13" -opentelemetry-proto==1.15.0 ; python_version >= "3.9" and python_version < "3.13" 
-opentelemetry-sdk==1.15.0 ; python_version >= "3.9" and python_version < "3.13" -opentelemetry-semantic-conventions==0.36b0 ; python_version >= "3.9" and python_version < "3.13" -optimum-habana==1.16.0 ; python_version >= "3.9" and python_version < "3.13" -optimum==1.23.2 ; python_version >= "3.9" and python_version < "3.13" -packaging==24.1 ; python_version >= "3.9" and python_version < "3.13" -pandas==2.2.3 ; python_version >= "3.9" and python_version < "3.13" -peft==0.10.0 ; python_version >= "3.9" and python_version < "3.13" -pillow==11.0.0 ; python_version >= "3.9" and python_version < "3.13" -prometheus-client==0.20.0 ; python_version >= "3.9" and python_version < "3.13" -propcache==0.2.0 ; python_version >= "3.9" and python_version < "3.13" -protobuf==3.20.3 ; python_version >= "3.9" and python_version < "3.13" -psutil==6.1.0 ; python_version >= "3.9" and python_version < "3.13" -py-cpuinfo==9.0.0 ; python_version >= "3.9" and python_version < "3.13" -pyarrow==17.0.0 ; python_version >= "3.9" and python_version < "3.13" -pyreadline3==3.5.4 ; sys_platform == "win32" and python_version >= "3.9" and python_version < "3.13" -python-dateutil==2.9.0.post0 ; python_version >= "3.9" and python_version < "3.13" -pytz==2024.2 ; python_version >= "3.9" and python_version < "3.13" -pyyaml==6.0.2 ; python_version >= "3.9" and python_version < "3.13" -regex==2024.9.11 ; python_version >= "3.9" and python_version < "3.13" -requests==2.32.3 ; python_version >= "3.9" and python_version < "3.13" -safetensors==0.4.5 ; python_version >= "3.9" and python_version < "3.13" -scikit-learn==1.5.2 ; python_version >= "3.9" and python_version < "3.13" -scipy==1.13.1 ; python_version >= "3.9" and python_version < "3.13" -sentence-transformers[train]==3.2.1 ; python_version >= "3.9" and python_version < "3.13" -sentencepiece==0.1.99 ; python_version >= "3.9" and python_version < "3.13" -setuptools==75.2.0 ; python_version >= "3.9" and python_version < "3.13" -six==1.16.0 ; python_version >= "3.9" and python_version < "3.13" -sympy==1.12.1 ; python_version >= "3.9" and python_version < "3.13" -threadpoolctl==3.5.0 ; python_version >= "3.9" and python_version < "3.13" -tokenizers==0.20.1 ; python_version >= "3.9" and python_version < "3.13" -tqdm==4.66.5 ; python_version >= "3.9" and python_version < "3.13" -transformers==4.45.2 ; python_version >= "3.9" and python_version < "3.13" -transformers[sentencepiece]==4.45.2 ; python_version >= "3.9" and python_version < "3.13" -triton==3.0.0 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version < "3.13" and python_version >= "3.9" -typer==0.7.0 ; python_version >= "3.9" and python_version < "3.13" -typing-extensions==4.12.2 ; python_version >= "3.9" and python_version < "3.13" -tzdata==2024.2 ; python_version >= "3.9" and python_version < "3.13" -urllib3==2.2.3 ; python_version >= "3.9" and python_version < "3.13" -win32-setctime==1.1.0 ; python_version >= "3.9" and python_version < "3.13" and sys_platform == "win32" -wrapt==1.16.0 ; python_version >= "3.9" and python_version < "3.13" -xxhash==3.5.0 ; python_version >= "3.9" and python_version < "3.13" -yarl==1.16.0 ; python_version >= "3.9" and python_version < "3.13" -zipp==3.20.2 ; python_version >= "3.9" and python_version < "3.13" -outlines==0.0.34 ; python_version >= "3.9" and python_version < "3.13" -interegular==0.3.3 ; python_version >= "3.9" and python_version < "3.13" -lark==1.2.2 ; python_version >= "3.9" and python_version < "3.13" -cloudpickle==3.1.0 ; python_version >= "3.9" 
and python_version < "3.13" -diskcache==5.6.3 ; python_version >= "3.9" and python_version < "3.13" -numba==0.60.0 ; python_version >= "3.9" and python_version < "3.13" -llvmlite==0.43.0 ; python_version >= "3.9" and python_version < "3.13" -jsonschema==4.23.0 ; python_version >= "3.9" and python_version < "3.13" annotated-types==0.7.0 ; python_version >= "3.9" and python_version < "3.13" +attrs==25.3.0 ; python_version >= "3.9" and python_version < "3.13" +certifi==2025.1.31 ; python_version >= "3.9" and python_version < "3.13" +charset-normalizer==3.4.1 ; python_version >= "3.9" and python_version < "3.13" +click==8.1.8 ; python_version >= "3.9" and python_version < "3.13" +cloudpickle==3.1.1 ; python_version >= "3.9" and python_version < "3.13" +colorama==0.4.6 ; python_version >= "3.9" and python_version < "3.13" and platform_system == "Windows" or python_version >= "3.9" and python_version < "3.13" and sys_platform == "win32" +deprecated==1.2.18 ; python_version >= "3.9" and python_version < "3.13" +diffusers==0.31.0 ; python_version >= "3.9" and python_version < "3.13" +diskcache==5.6.3 ; python_version >= "3.9" and python_version < "3.13" +filelock==3.18.0 ; python_version >= "3.9" and python_version < "3.13" +fsspec==2025.3.2 ; python_version >= "3.9" and python_version < "3.13" +googleapis-common-protos==1.70.0 ; python_version >= "3.9" and python_version < "3.13" +grpc-interceptor==0.15.4 ; python_version >= "3.9" and python_version < "3.13" +grpcio-reflection==1.71.0 ; python_version >= "3.9" and python_version < "3.13" +grpcio-status==1.71.0 ; python_version >= "3.9" and python_version < "3.13" +grpcio==1.72.0rc1 ; python_version >= "3.9" and python_version < "3.13" +hf-transfer==0.1.9 ; python_version >= "3.9" and python_version < "3.13" +huggingface-hub==0.30.2 ; python_version >= "3.9" and python_version < "3.13" +idna==3.10 ; python_version >= "3.9" and python_version < "3.13" +importlib-metadata==8.6.1 ; python_version >= "3.9" and python_version < "3.13" +interegular==0.3.3 ; python_version >= "3.9" and python_version < "3.13" +jinja2==3.1.6 ; python_version >= "3.9" and python_version < "3.13" +joblib==1.4.2 ; python_version >= "3.9" and python_version < "3.13" jsonschema-specifications==2024.10.1 ; python_version >= "3.9" and python_version < "3.13" -nest-asyncio==1.6.0; python_version >= "3.9" and python_version < "3.13" -pydantic==2.10.6; python_version >= "3.9" and python_version < "3.13" -pydantic-core==2.27.2 ; python_version >= "3.9" and python_version < "3.13" +jsonschema==4.23.0 ; python_version >= "3.9" and python_version < "3.13" +lark==1.2.2 ; python_version >= "3.9" and python_version < "3.13" +llvmlite==0.43.0 ; python_version >= "3.9" and python_version < "3.13" +loguru==0.7.3 ; python_version >= "3.9" and python_version < "3.13" +markdown-it-py==3.0.0 ; python_version >= "3.9" and python_version < "3.13" +markupsafe==3.0.2 ; python_version >= "3.9" and python_version < "3.13" +mdurl==0.1.2 ; python_version >= "3.9" and python_version < "3.13" +mpmath==1.3.0 ; python_version >= "3.9" and python_version < "3.13" +nest-asyncio==1.6.0 ; python_version >= "3.9" and python_version < "3.13" +networkx==3.2.1 ; python_version >= "3.9" and python_version < "3.13" +numba==0.60.0 ; python_version >= "3.9" and python_version < "3.13" +numpy==1.26.4 ; python_version >= "3.9" and python_version < "3.13" +nvidia-cublas-cu12==12.4.5.8 ; python_version >= "3.9" and python_version < "3.13" and platform_system == "Linux" and platform_machine == "x86_64" 
+nvidia-cuda-cupti-cu12==12.4.127 ; python_version >= "3.9" and python_version < "3.13" and platform_system == "Linux" and platform_machine == "x86_64" +nvidia-cuda-nvrtc-cu12==12.4.127 ; python_version >= "3.9" and python_version < "3.13" and platform_system == "Linux" and platform_machine == "x86_64" +nvidia-cuda-runtime-cu12==12.4.127 ; python_version >= "3.9" and python_version < "3.13" and platform_system == "Linux" and platform_machine == "x86_64" +nvidia-cudnn-cu12==9.1.0.70 ; python_version >= "3.9" and python_version < "3.13" and platform_system == "Linux" and platform_machine == "x86_64" +nvidia-cufft-cu12==11.2.1.3 ; python_version >= "3.9" and python_version < "3.13" and platform_system == "Linux" and platform_machine == "x86_64" +nvidia-curand-cu12==10.3.5.147 ; python_version >= "3.9" and python_version < "3.13" and platform_system == "Linux" and platform_machine == "x86_64" +nvidia-cusolver-cu12==11.6.1.9 ; python_version >= "3.9" and python_version < "3.13" and platform_system == "Linux" and platform_machine == "x86_64" +nvidia-cusparse-cu12==12.3.1.170 ; python_version >= "3.9" and python_version < "3.13" and platform_system == "Linux" and platform_machine == "x86_64" +nvidia-cusparselt-cu12==0.6.2 ; python_version >= "3.9" and python_version < "3.13" and platform_system == "Linux" and platform_machine == "x86_64" +nvidia-nccl-cu12==2.21.5 ; python_version >= "3.9" and python_version < "3.13" and platform_system == "Linux" and platform_machine == "x86_64" +nvidia-nvjitlink-cu12==12.4.127 ; python_version >= "3.9" and python_version < "3.13" and platform_system == "Linux" and platform_machine == "x86_64" +nvidia-nvtx-cu12==12.4.127 ; python_version >= "3.9" and python_version < "3.13" and platform_system == "Linux" and platform_machine == "x86_64" +opentelemetry-api==1.32.0 ; python_version >= "3.9" and python_version < "3.13" +opentelemetry-exporter-otlp-proto-common==1.32.0 ; python_version >= "3.9" and python_version < "3.13" +opentelemetry-exporter-otlp-proto-grpc==1.32.0 ; python_version >= "3.9" and python_version < "3.13" +opentelemetry-exporter-otlp-proto-http==1.32.0 ; python_version >= "3.9" and python_version < "3.13" +opentelemetry-exporter-otlp==1.32.0 ; python_version >= "3.9" and python_version < "3.13" +opentelemetry-instrumentation-grpc==0.53b0 ; python_version >= "3.9" and python_version < "3.13" +opentelemetry-instrumentation==0.53b0 ; python_version >= "3.9" and python_version < "3.13" +opentelemetry-proto==1.32.0 ; python_version >= "3.9" and python_version < "3.13" +opentelemetry-sdk==1.32.0 ; python_version >= "3.9" and python_version < "3.13" +opentelemetry-semantic-conventions==0.53b0 ; python_version >= "3.9" and python_version < "3.13" +optimum-habana==1.16.0 ; python_version >= "3.9" and python_version < "3.13" +optimum==1.24.0 ; python_version >= "3.9" and python_version < "3.13" +outlines==0.0.36 ; python_version >= "3.9" and python_version < "3.13" +packaging==24.2 ; python_version >= "3.9" and python_version < "3.13" +peft==0.10.0 ; python_version >= "3.9" and python_version < "3.13" +pillow==11.2.1 ; python_version >= "3.9" and python_version < "3.13" +prometheus-client==0.21.1 ; python_version >= "3.9" and python_version < "3.13" +protobuf==5.29.4 ; python_version >= "3.9" and python_version < "3.13" +psutil==7.0.0 ; python_version >= "3.9" and python_version < "3.13" +py-cpuinfo==9.0.0 ; python_version >= "3.9" and python_version < "3.13" +pydantic-core==2.33.1 ; python_version >= "3.9" and python_version < "3.13" +pydantic==2.11.3 ; 
python_version >= "3.9" and python_version < "3.13" +pygments==2.19.1 ; python_version >= "3.9" and python_version < "3.13" +pyyaml==6.0.2 ; python_version >= "3.9" and python_version < "3.13" referencing==0.36.2 ; python_version >= "3.9" and python_version < "3.13" -rpds-py==0.22.3 ; python_version >= "3.9" and python_version < "3.13" +regex==2024.11.6 ; python_version >= "3.9" and python_version < "3.13" +requests==2.32.3 ; python_version >= "3.9" and python_version < "3.13" +rich==14.0.0 ; python_version >= "3.9" and python_version < "3.13" +rpds-py==0.24.0 ; python_version >= "3.9" and python_version < "3.13" +safetensors==0.5.3 ; python_version >= "3.9" and python_version < "3.13" +scikit-learn==1.6.1 ; python_version >= "3.9" and python_version < "3.13" +scipy==1.13.1 ; python_version >= "3.9" and python_version < "3.13" +sentence-transformers==3.3.1 ; python_version >= "3.9" and python_version < "3.13" +sentencepiece==0.2.0 ; python_version >= "3.9" and python_version < "3.13" +setuptools==78.1.0 ; python_version >= "3.12" and python_version < "3.13" +shellingham==1.5.4 ; python_version >= "3.9" and python_version < "3.13" +sympy==1.13.1 ; python_version >= "3.9" and python_version < "3.13" +threadpoolctl==3.6.0 ; python_version >= "3.9" and python_version < "3.13" +tokenizers==0.20.3 ; python_version >= "3.9" and python_version < "3.13" +torch==2.6.0 ; python_version >= "3.9" and python_version < "3.13" +tqdm==4.67.1 ; python_version >= "3.9" and python_version < "3.13" +transformers==4.45.2 ; python_version >= "3.9" and python_version < "3.13" +triton==3.2.0 ; python_version >= "3.9" and python_version < "3.13" and platform_system == "Linux" and platform_machine == "x86_64" +typer==0.15.2 ; python_version >= "3.9" and python_version < "3.13" +typing-extensions==4.13.2 ; python_version >= "3.9" and python_version < "3.13" +typing-inspection==0.4.0 ; python_version >= "3.9" and python_version < "3.13" +urllib3==2.4.0 ; python_version >= "3.9" and python_version < "3.13" +win32-setctime==1.2.0 ; python_version >= "3.9" and python_version < "3.13" and sys_platform == "win32" +wrapt==1.17.2 ; python_version >= "3.9" and python_version < "3.13" +zipp==3.21.0 ; python_version >= "3.9" and python_version < "3.13" From ad765cd06b8aa4b4da52b49194cfbaa5ac0bc5d5 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Tue, 15 Apr 2025 11:55:28 +0200 Subject: [PATCH 181/183] Hotfixing gaudi deps. 
 (#3174)
---
 backends/gaudi/server/requirements.txt | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/backends/gaudi/server/requirements.txt b/backends/gaudi/server/requirements.txt
index 6644f8cfb..1a5d767f8 100644
--- a/backends/gaudi/server/requirements.txt
+++ b/backends/gaudi/server/requirements.txt
@@ -59,11 +59,11 @@ opentelemetry-instrumentation==0.53b0 ; python_version >= "3.9" and python_versi
 opentelemetry-proto==1.32.0 ; python_version >= "3.9" and python_version < "3.13"
 opentelemetry-sdk==1.32.0 ; python_version >= "3.9" and python_version < "3.13"
 opentelemetry-semantic-conventions==0.53b0 ; python_version >= "3.9" and python_version < "3.13"
-optimum-habana==1.16.0 ; python_version >= "3.9" and python_version < "3.13"
+optimum-habana==1.17.0 ; python_version >= "3.9" and python_version < "3.13"
 optimum==1.24.0 ; python_version >= "3.9" and python_version < "3.13"
 outlines==0.0.36 ; python_version >= "3.9" and python_version < "3.13"
 packaging==24.2 ; python_version >= "3.9" and python_version < "3.13"
-peft==0.10.0 ; python_version >= "3.9" and python_version < "3.13"
+peft==0.15.1 ; python_version >= "3.9" and python_version < "3.13"
 pillow==11.2.1 ; python_version >= "3.9" and python_version < "3.13"
 prometheus-client==0.21.1 ; python_version >= "3.9" and python_version < "3.13"
 protobuf==5.29.4 ; python_version >= "3.9" and python_version < "3.13"
@@ -87,10 +87,10 @@ setuptools==78.1.0 ; python_version >= "3.12" and python_version < "3.13"
 shellingham==1.5.4 ; python_version >= "3.9" and python_version < "3.13"
 sympy==1.13.1 ; python_version >= "3.9" and python_version < "3.13"
 threadpoolctl==3.6.0 ; python_version >= "3.9" and python_version < "3.13"
-tokenizers==0.20.3 ; python_version >= "3.9" and python_version < "3.13"
+tokenizers==0.21.1 ; python_version >= "3.9" and python_version < "3.13"
 torch==2.6.0 ; python_version >= "3.9" and python_version < "3.13"
 tqdm==4.67.1 ; python_version >= "3.9" and python_version < "3.13"
-transformers==4.45.2 ; python_version >= "3.9" and python_version < "3.13"
+transformers==4.49.0 ; python_version >= "3.9" and python_version < "3.13"
 triton==3.2.0 ; python_version >= "3.9" and python_version < "3.13" and platform_system == "Linux" and platform_machine == "x86_64"
 typer==0.15.2 ; python_version >= "3.9" and python_version < "3.13"
 typing-extensions==4.13.2 ; python_version >= "3.9" and python_version < "3.13"

From 4645678ff0ce07c9da3ecfc042f46becf9d4b191 Mon Sep 17 00:00:00 2001
From: Nicolas Patry
Date: Tue, 15 Apr 2025 12:39:28 +0200
Subject: [PATCH 182/183] Hotfix gaudi2 with newer transformers.
 (#3176)
---
 .../utils/logits_process.py | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/backends/gaudi/server/text_generation_server/utils/logits_process.py b/backends/gaudi/server/text_generation_server/utils/logits_process.py
index 472f2dcb0..c0fd6cbae 100644
--- a/backends/gaudi/server/text_generation_server/utils/logits_process.py
+++ b/backends/gaudi/server/text_generation_server/utils/logits_process.py
@@ -3,7 +3,7 @@ import torch
 import habana_frameworks.torch.core as htcore

 from loguru import logger
-from typing import Dict, Union
+from typing import Dict
 from text_generation_server.pb.generate_pb2 import GrammarType

 from outlines.fsm.fsm import RegexFSM
@@ -13,7 +13,6 @@ from typing import List, Optional, DefaultDict
 import time

 from transformers import (
-    LogitsWarper,
     LogitsProcessor,
     TemperatureLogitsWarper,
     TopKLogitsWarper,
@@ -191,7 +190,7 @@ class HeterogeneousFrequencyPenaltyLogitsProcessor(LogitsProcessor):

 class HeterogeneousTemperatureLogitsWarper:
     r"""
-    [`LogitsWarper`] for temperature (exponential scaling output probability distribution).
+    [`LogitsProcessor`] for temperature (exponential scaling output probability distribution).
     This version allows for a separate value for each sample and runs inplace when possible.
     It doesn't validate inputs.

@@ -220,7 +219,7 @@ class HeterogeneousTemperatureLogitsWarper:
         return None


-class HeterogeneousTopPLogitsWarper(LogitsWarper):
+class HeterogeneousTopPLogitsWarper(LogitsProcessor):
     """
     [`LogitsWarper`] that performs top-p, i.e. restricting to top tokens summing to prob_cut_off <= prob_cut_off.
     This version allows for a separate value for each sample and runs inplace when possible.
@@ -279,9 +278,9 @@ class HeterogeneousTopPLogitsWarper(LogitsWarper):
         return None


-class HeterogeneousTopKLogitsWarper(LogitsWarper):
+class HeterogeneousTopKLogitsWarper(LogitsProcessor):
     r"""
-    [`LogitsWarper`] that performs top-k, i.e. restricting to the k highest probability elements.
+    [`LogitsProcessor`] that performs top-k, i.e. restricting to the k highest probability elements.
     This version allows for a separate value for each sample and runs inplace when possible.
     It doesn't validate inputs.

@@ -360,9 +359,9 @@ class HeterogeneousTopKLogitsWarper(LogitsWarper):
         return None


-class HeterogeneousTypicalLogitsWarper(LogitsWarper):
+class HeterogeneousTypicalLogitsWarper(LogitsProcessor):
     r"""
-    [`LogitsWarper`] that performs typical decoding. See [Typical Decoding for Natural Language
+    [`LogitsProcessor`] that performs typical decoding. See [Typical Decoding for Natural Language
     Generation](https://arxiv.org/abs/2202.00666) for more information.
     This version allows for a separate value for each sample and runs inplace when possible.
     It doesn't validate inputs.
@@ -454,13 +453,13 @@ class HeterogeneousProcessorWrapper(LogitsProcessor):
     r"""
     A wrapper for logit warpers or processors without heterogeneous parameter support.
     Args:
-        processors (`Dict[int, Union[LogitsProcessor, LogitsWarper]]`):
+        processors (`Dict[int, LogitsProcessor]`):
             A mapping of sample indices to logit warpers or processors, to be run sequentially.
""" def __init__( self, - processors: Dict[int, Union[LogitsProcessor, LogitsWarper]], + processors: Dict[int, LogitsProcessor], ): self.processors = processors From 84ab88d84343fabef2baa89885e9efb55907f75e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Thu, 17 Apr 2025 18:07:41 +0200 Subject: [PATCH 183/183] Support flashinfer for Gemma3 prefill (#3167) * launcher: ensure correct detection of Gemma 3 head size * Support flashinfer for Gemma3 prefill Gemma3 uses bidirectional attention for images. Flashinfer supports custom masks. Hook up the mask with flashinfer, so that we do not have to use the slower SDPA implementation for prefills with images. * Update Gemma3 test outputs * Fixed unused import --- .../test_flash_gemma3/test_flash_gemma3.json | 164 +++++++++--------- ...est_flash_gemma3_image_base64_rgb_png.json | 12 +- .../test_flash_gemma3_image_base64_rgba.json | 10 +- .../test_flash_gemma3_image_cow.json | 4 +- .../test_flash_gemma3_image_cow_dog.json | 4 +- launcher/src/main.rs | 21 ++- .../layers/attention/flashinfer.py | 2 + .../custom_modeling/flash_gemma3_modeling.py | 22 ++- .../models/flash_causal_lm.py | 2 + .../models/vlm_causal_lm.py | 9 + 10 files changed, 141 insertions(+), 109 deletions(-) diff --git a/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3.json b/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3.json index 859544c89..be8b3882f 100644 --- a/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3.json +++ b/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3.json @@ -8,61 +8,61 @@ "tokens": [ { "id": 1331, - "logprob": -0.34960938, + "logprob": -0.31835938, "special": false, "text": " people" }, { "id": 8390, - "logprob": -0.14746094, + "logprob": -0.1484375, "special": false, "text": " died" }, { "id": 528, - "logprob": -1.2265625, + "logprob": -1.1171875, "special": false, "text": " in" }, { "id": 506, - "logprob": -0.47070312, + "logprob": -0.45898438, "special": false, "text": " the" }, { "id": 3640, - "logprob": -0.5859375, + "logprob": -0.55859375, "special": false, "text": " United" }, { "id": 4184, - "logprob": -0.0027770996, + "logprob": -0.0026397705, "special": false, "text": " States" }, { "id": 236761, - "logprob": -0.34765625, + "logprob": -0.38085938, "special": false, "text": "." 
}, { "id": 108, - "logprob": -0.0859375, + "logprob": -0.07421875, "special": false, "text": "\n\n" }, { "id": 818, - "logprob": -1.1640625, + "logprob": -1.0859375, "special": false, "text": "The" }, { "id": 6816, - "logprob": -1.890625, + "logprob": -1.75, "special": false, "text": " generally" }, @@ -74,7 +74,7 @@ }, { "id": 10967, - "logprob": -0.90625, + "logprob": -0.9609375, "special": false, "text": " estimate" }, @@ -86,43 +86,43 @@ }, { "id": 600, - "logprob": -0.65234375, + "logprob": -0.703125, "special": false, "text": " that" }, { "id": 236743, - "logprob": -1.2109375, + "logprob": -1.171875, "special": false, "text": " " }, { "id": 236825, - "logprob": -0.00088119507, + "logprob": -0.0009918213, "special": false, "text": "6" }, { "id": 236832, - "logprob": -6.580353e-05, + "logprob": -6.389618e-05, "special": false, "text": "7" }, { "id": 236810, - "logprob": -5.2690506e-05, + "logprob": -4.7445297e-05, "special": false, "text": "5" }, { "id": 236764, - "logprob": -0.0001745224, + "logprob": -0.00017929077, "special": false, "text": "," }, { "id": 236771, - "logprob": -1.180172e-05, + "logprob": -1.4901161e-05, "special": false, "text": "0" }, @@ -140,7 +140,7 @@ }, { "id": 1331, - "logprob": -0.44921875, + "logprob": -0.45898438, "special": false, "text": " people" }, @@ -158,49 +158,49 @@ }, { "id": 506, - "logprob": -0.00034713745, + "logprob": -0.00032615662, "special": false, "text": " the" }, { "id": 3640, - "logprob": -0.028564453, + "logprob": -0.029785156, "special": false, "text": " United" }, { "id": 4184, - "logprob": -0.00012207031, + "logprob": -0.00012302399, "special": false, "text": " States" }, { "id": 236761, - "logprob": -1.15625, + "logprob": -1.1796875, "special": false, "text": "." }, { "id": 3153, - "logprob": -0.103027344, + "logprob": -0.09667969, "special": false, "text": " However" }, { "id": 236764, - "logprob": -0.009155273, + "logprob": -0.009094238, "special": false, "text": "," }, { "id": 1070, - "logprob": -0.92578125, + "logprob": -0.91015625, "special": false, "text": " some" }, { "id": 61806, - "logprob": -0.91796875, + "logprob": -0.859375, "special": false, "text": " historians" }, @@ -218,79 +218,79 @@ }, { "id": 5396, - "logprob": -0.8046875, + "logprob": -0.765625, "special": false, "text": " actual" }, { "id": 1548, - "logprob": -0.04321289, + "logprob": -0.048339844, "special": false, "text": " number" }, { "id": 1451, - "logprob": -0.66015625, + "logprob": -0.65625, "special": false, "text": " could" }, { "id": 577, - "logprob": -0.091308594, + "logprob": -0.09082031, "special": false, "text": " be" }, { "id": 618, - "logprob": -0.57421875, + "logprob": -0.625, "special": false, "text": " as" }, { "id": 1494, - "logprob": -0.00036239624, + "logprob": -0.00037193298, "special": false, "text": " high" }, { "id": 618, - "logprob": -0.0001335144, + "logprob": -0.0001296997, "special": false, "text": " as" }, { "id": 236743, - "logprob": -0.0009689331, + "logprob": -0.00093460083, "special": false, "text": " " }, { "id": 236770, - "logprob": -0.26367188, + "logprob": -0.21289062, "special": false, "text": "1" }, { "id": 236771, - "logprob": -0.17773438, + "logprob": -0.16796875, "special": false, "text": "0" }, { "id": 3625, - "logprob": -0.012084961, + "logprob": -0.0126953125, "special": false, "text": " million" }, { "id": 236761, - "logprob": -0.21289062, + "logprob": -0.22460938, "special": false, "text": "." 
}, { "id": 108, - "logprob": -0.37304688, + "logprob": -0.3984375, "special": false, "text": "\n\n" }, @@ -302,13 +302,13 @@ }, { "id": 1006, - "logprob": -1.3203125, + "logprob": -1.359375, "special": false, "text": " am" }, { "id": 3182, - "logprob": -1.078125, + "logprob": -1.0859375, "special": false, "text": " looking" }, @@ -320,85 +320,85 @@ }, { "id": 919, - "logprob": -1.25, + "logprob": -1.2578125, "special": false, "text": " more" }, { "id": 1938, - "logprob": -1.2421875, + "logprob": -1.3046875, "special": false, "text": " information" }, { "id": 580, - "logprob": -0.7734375, + "logprob": -0.7421875, "special": false, "text": " on" }, { "id": 672, - "logprob": -0.73046875, + "logprob": -0.78125, "special": false, "text": " this" }, { "id": 59725, - "logprob": -0.75, + "logprob": -0.7109375, "special": false, "text": " discrepancy" }, { "id": 532, - "logprob": -0.83984375, + "logprob": -0.8046875, "special": false, "text": " and" }, { "id": 506, - "logprob": -0.7109375, + "logprob": -0.71484375, "special": false, "text": " the" }, { "id": 5872, - "logprob": -1.2734375, + "logprob": -1.1640625, "special": false, "text": " factors" }, { "id": 600, - "logprob": -0.22851562, + "logprob": -0.20410156, "special": false, "text": " that" }, { "id": 19263, - "logprob": -1.1640625, + "logprob": -1.1484375, "special": false, "text": " contributed" }, { "id": 531, - "logprob": -0.0010757446, + "logprob": -0.000957489, "special": false, "text": " to" }, { "id": 506, - "logprob": -0.18945312, + "logprob": -0.19921875, "special": false, "text": " the" }, { "id": 5777, - "logprob": -1.2734375, + "logprob": -1.171875, "special": false, "text": " wide" }, { "id": 2644, - "logprob": -0.01940918, + "logprob": -0.020141602, "special": false, "text": " range" }, @@ -410,31 +410,31 @@ }, { "id": 14287, - "logprob": -0.032470703, + "logprob": -0.03564453, "special": false, "text": " estimates" }, { "id": 236761, - "logprob": -0.010375977, + "logprob": -0.010620117, "special": false, "text": "." 
}, { "id": 108, - "logprob": -0.06591797, + "logprob": -0.060302734, "special": false, "text": "\n\n" }, { "id": 8291, - "logprob": -0.8046875, + "logprob": -0.7421875, "special": false, "text": "Here" }, { "id": 236789, - "logprob": -0.23828125, + "logprob": -0.24023438, "special": false, "text": "'" }, @@ -446,55 +446,55 @@ }, { "id": 496, - "logprob": -0.17480469, + "logprob": -0.16992188, "special": false, "text": " a" }, { "id": 25890, - "logprob": -0.087402344, + "logprob": -0.06933594, "special": false, "text": " breakdown" }, { "id": 529, - "logprob": -0.0021209717, + "logprob": -0.002243042, "special": false, "text": " of" }, { "id": 506, - "logprob": -0.19140625, + "logprob": -0.18554688, "special": false, "text": " the" }, { "id": 5872, - "logprob": -1.0078125, + "logprob": -0.9921875, "special": false, "text": " factors" }, { "id": 20894, - "logprob": -0.26367188, + "logprob": -0.25976562, "special": false, "text": " contributing" }, { "id": 531, - "logprob": -9.250641e-05, + "logprob": -8.440018e-05, "special": false, "text": " to" }, { "id": 506, - "logprob": -0.008666992, + "logprob": -0.009765625, "special": false, "text": " the" }, { "id": 5777, - "logprob": -0.6171875, + "logprob": -0.67578125, "special": false, "text": " wide" }, @@ -506,31 +506,31 @@ }, { "id": 529, - "logprob": -0.016723633, + "logprob": -0.014831543, "special": false, "text": " of" }, { "id": 14287, - "logprob": -0.011352539, + "logprob": -0.012329102, "special": false, "text": " estimates" }, { "id": 573, - "logprob": -0.30664062, + "logprob": -0.3125, "special": false, "text": " for" }, { "id": 506, - "logprob": -0.21386719, + "logprob": -0.21484375, "special": false, "text": " the" }, { "id": 236743, - "logprob": -0.35351562, + "logprob": -0.43359375, "special": false, "text": " " }, @@ -560,43 +560,43 @@ }, { "id": 7745, - "logprob": -0.70703125, + "logprob": -0.703125, "special": false, "text": " flu" }, { "id": 10248, - "logprob": -0.015258789, + "logprob": -0.013427734, "special": false, "text": " pandemic" }, { "id": 4355, - "logprob": -0.83203125, + "logprob": -0.6953125, "special": false, "text": " death" }, { "id": 25363, - "logprob": -7.43866e-05, + "logprob": -6.771088e-05, "special": false, "text": " toll" }, { "id": 528, - "logprob": -0.08496094, + "logprob": -0.076171875, "special": false, "text": " in" }, { "id": 506, - "logprob": -6.67572e-06, + "logprob": -7.2717667e-06, "special": false, "text": " the" }, { "id": 3640, - "logprob": -0.0059509277, + "logprob": -0.0052490234, "special": false, "text": " United" }, diff --git a/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_base64_rgb_png.json b/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_base64_rgb_png.json index afbfba30a..cd1a598e6 100644 --- a/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_base64_rgb_png.json +++ b/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_base64_rgb_png.json @@ -1,11 +1,11 @@ { "choices": [ { - "finish_reason": "stop", + "finish_reason": "length", "index": 0, "logprobs": null, "message": { - "content": "Okay, let's analyze the image. \n\nThe image is entirely white, with a very subtle, faint outline of a stylized, cartoonish figure. It appears to be a simplified depiction of a person, likely a child, with a wide-eyed expression and a small, rounded body. \n\nIt's almost like a minimalist, iconic representation. 
\n\nDo you want me to try and describe it in more detail or perhaps speculate about the context of the image?", + "content": "Okay, let's analyze the image. \n\nThe image is entirely white, with a very subtle, faint outline of a stylized, cartoonish figure. It appears to be a simplified depiction of a person, likely a child, with a wide-eyed expression and a small, rounded body. \n\nIt's almost like a minimalist, iconic representation. \n\nDo you want me to try and describe it in more detail, or perhaps suggest what this image might represent (e.g", "name": null, "role": "assistant", "tool_calls": null @@ -13,14 +13,14 @@ "usage": null } ], - "created": 1741965892, + "created": 1744396706, "id": "", "model": "google/gemma-3-4b-it", "object": "chat.completion", - "system_fingerprint": "3.2.1-dev0-native", + "system_fingerprint": "3.2.3-dev0-native", "usage": { - "completion_tokens": 98, + "completion_tokens": 100, "prompt_tokens": 277, - "total_tokens": 375 + "total_tokens": 377 } } diff --git a/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_base64_rgba.json b/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_base64_rgba.json index 1b97d2615..a1d3ae782 100644 --- a/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_base64_rgba.json +++ b/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_base64_rgba.json @@ -5,7 +5,7 @@ "index": 0, "logprobs": null, "message": { - "content": "Okay, let's analyze the image. \n\nThe transparent image reveals a stylized depiction of **a human head**. It's a minimalist, geometric representation, showing the basic shapes of the skull, eye sockets, and head outline. \n\nDo you want me to describe any specific element of the image in more detail?", + "content": "Okay, let's analyze the image. \n\nThe transparent image reveals a stylized depiction of **a human head**. It's a minimalist, geometric representation, showing the basic shapes of the skull, eye sockets, and head outline. 
\n\nIf you'd like, you can give me more details about the image or ask me to focus on a specific aspect of it.", "name": null, "role": "assistant", "tool_calls": null @@ -13,14 +13,14 @@ "usage": null } ], - "created": 1741966313, + "created": 1744396703, "id": "", "model": "google/gemma-3-4b-it", "object": "chat.completion", - "system_fingerprint": "3.2.1-dev0-native", + "system_fingerprint": "3.2.3-dev0-native", "usage": { - "completion_tokens": 67, + "completion_tokens": 78, "prompt_tokens": 277, - "total_tokens": 344 + "total_tokens": 355 } } diff --git a/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_cow.json b/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_cow.json index cd786b3ce..a839d7aac 100644 --- a/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_cow.json +++ b/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_cow.json @@ -13,11 +13,11 @@ "usage": null } ], - "created": 1741964480, + "created": 1744396699, "id": "", "model": "google/gemma-3-4b-it", "object": "chat.completion", - "system_fingerprint": "3.2.1-dev0-native", + "system_fingerprint": "3.2.3-dev0-native", "usage": { "completion_tokens": 74, "prompt_tokens": 275, diff --git a/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_cow_dog.json b/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_cow_dog.json index 5ed2c4507..c7215c930 100644 --- a/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_cow_dog.json +++ b/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_cow_dog.json @@ -13,11 +13,11 @@ "usage": null } ], - "created": 1741964477, + "created": 1744396697, "id": "", "model": "google/gemma-3-4b-it", "object": "chat.completion", - "system_fingerprint": "3.2.1-dev0-native", + "system_fingerprint": "3.2.3-dev0-native", "usage": { "completion_tokens": 75, "prompt_tokens": 279, diff --git a/launcher/src/main.rs b/launcher/src/main.rs index c169a78ce..2fbb9c12a 100644 --- a/launcher/src/main.rs +++ b/launcher/src/main.rs @@ -260,11 +260,22 @@ struct Config { impl Config { fn get_head_dim(&self) -> Option { - self.head_dim.or_else(|| { - self.text_config - .as_ref() - .and_then(|text_config| text_config.head_dim) - }) + if let Some(head_dim) = self.head_dim { + return Some(head_dim); + } + + let text_config = self.text_config.as_ref()?; + if let Some(head_size) = text_config.head_dim { + return Some(head_size); + } + + match self.model_type.as_deref() { + // We special-case gemma3 here, since we need flashinfer for + // handling bidirectional masks. And flashinfer can only be + // used when the head size is known. 
+            Some("gemma3") => Some(256),
+            _ => None,
+        }
     }

     fn flop(&self) -> Option {
diff --git a/server/text_generation_server/layers/attention/flashinfer.py b/server/text_generation_server/layers/attention/flashinfer.py
index 9479b6067..f78475d51 100644
--- a/server/text_generation_server/layers/attention/flashinfer.py
+++ b/server/text_generation_server/layers/attention/flashinfer.py
@@ -45,6 +45,7 @@ def use_prefill_with_paged_kv_state(
     state: flashinfer.BatchPrefillWithPagedKVCacheWrapper,
     block_tables: torch.Tensor,
     cu_seqlens: torch.Tensor,
+    custom_mask: Optional[torch.Tensor],
     input_lengths: torch.Tensor,
     num_heads: int,
     num_kv_heads: int,
@@ -88,6 +89,7 @@ def use_prefill_with_paged_kv_state(
             paged_kv_indptr=indptr,
             paged_kv_indices=block_tables,
             paged_kv_last_page_len=last_page_len,
+            custom_mask=custom_mask,
             num_qo_heads=num_heads,
             num_kv_heads=num_kv_heads,
             head_dim=head_size,
diff --git a/server/text_generation_server/models/custom_modeling/flash_gemma3_modeling.py b/server/text_generation_server/models/custom_modeling/flash_gemma3_modeling.py
index 70fe9a3db..58afd6430 100644
--- a/server/text_generation_server/models/custom_modeling/flash_gemma3_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_gemma3_modeling.py
@@ -45,6 +45,7 @@ from text_generation_server.layers.rotary import PositionRotaryEmbedding
 from text_generation_server.layers.layernorm import (
     FastRMSNorm,
 )
+from text_generation_server.models.globals import ATTENTION
 from text_generation_server.utils.weights import UnquantizedWeight
 from transformers.activations import ACT2FN
 from text_generation_server.layers.attention import (
@@ -248,7 +249,7 @@ class FlashGemma3Attention(torch.nn.Module):

         # Prefill
         if cu_seqlen_prefill is not None:
-            if attention_mask is None:
+            if attention_mask is None or ATTENTION == "flashinfer":
                 # flash attention
                 attn_output = attention(
                     query=query,
@@ -701,8 +702,16 @@ class Gemma3ForConditionalGeneration(nn.Module):
         )

     def get_attention_mask(
-        self, input_ids, max_s, cu_seqlen_prefill, dtype, image_token_mask
+        self,
+        input_ids: torch.Tensor,
+        cu_seqlen_prefill: Optional[torch.Tensor],
+        dtype: torch.dtype,
+        bool_mask: bool = False,
     ):
+        image_token_mask = (input_ids == self.config.image_token_index).to(
+            input_ids.device
+        )
+
         device = input_ids.device
         min_dtype = torch.finfo(dtype).min

@@ -748,9 +757,10 @@ class Gemma3ForConditionalGeneration(nn.Module):
             )

             full_attention_mask[:, :, :, :sequence_length] = combined_mask
-        final_attention_mask = torch.where(full_attention_mask, 0, min_dtype).to(device)
-
-        return final_attention_mask
+        if bool_mask:
+            return full_attention_mask
+        else:
+            return torch.where(full_attention_mask, 0, min_dtype).to(device)

     def forward(
         self,
@@ -793,10 +803,8 @@ class Gemma3ForConditionalGeneration(nn.Module):
             )
             attention_mask = self.get_attention_mask(
                 input_ids,
-                max_s,
                 cu_seqlen_prefill,
                 inputs_embeds.dtype,
-                image_token_mask,
             )
             # Use flash attention for text-only input
             # else:
diff --git a/server/text_generation_server/models/flash_causal_lm.py b/server/text_generation_server/models/flash_causal_lm.py
index c7c5a374b..a28ef3810 100644
--- a/server/text_generation_server/models/flash_causal_lm.py
+++ b/server/text_generation_server/models/flash_causal_lm.py
@@ -2434,6 +2434,7 @@ class FlashCausalLM(Model):
         input_lengths_tensor: torch.Tensor,
         cache_lengths_tensor: torch.Tensor,
         state: Optional[Any] = None,
+        attention_mask: Optional[torch.Tensor] = None,
     ) -> ContextManager:
         if ATTENTION != "flashinfer":
             return nullcontext()
@@ -2450,6 +2451,7 @@ class FlashCausalLM(Model):
             ),
             block_tables=block_tables,
             cu_seqlens=cu_seqlen_prefill,
+            custom_mask=attention_mask,
             input_lengths=input_lengths_tensor + cache_lengths_tensor,
             num_heads=self.num_heads,
             num_kv_heads=self.num_kv_heads,
diff --git a/server/text_generation_server/models/vlm_causal_lm.py b/server/text_generation_server/models/vlm_causal_lm.py
index 5f8eb9060..2b1e01dfa 100644
--- a/server/text_generation_server/models/vlm_causal_lm.py
+++ b/server/text_generation_server/models/vlm_causal_lm.py
@@ -485,6 +485,14 @@ class VlmCausalLM(FlashCausalLM):
         )
         batch.position_ids = position_ids

+        if self.model.config.model_type == "gemma3" and cu_seqlen_prefill is not None:
+            # Get the mask, needed for flashinfer.
+            attention_mask = self.model.get_attention_mask(
+                input_ids, cu_seqlen_prefill, self.dtype, bool_mask=True
+            ).reshape(-1)
+        else:
+            attention_mask = None
+
         # Try to find an associated cuda graph
         bs = input_ids.shape[0]
         sorted_padded_bs = sorted([k for k in self.cuda_graphs.keys() if k >= bs])
@@ -508,6 +516,7 @@ class VlmCausalLM(FlashCausalLM):
             cu_seqlen_prefill=cu_seqlen_prefill,
             input_lengths_tensor=input_lengths,
             cache_lengths_tensor=cache_lengths_tensor,
+            attention_mask=attention_mask,
         ):
             seqlen = Seqlen(
                 input_lengths=input_lengths,